Doxygen bump and release notes for v1.0.11

Windows: fix examples build to look for ispc.exe in ../.. as well
Windows: fix some compiler warnings during build
2011-10-07 09:57:55 -07:00 · 2011-10-09 07:40:18 -07:00 · 2011-10-09 07:40:17 -07:00 · 2011-10-07 09:20:48 -07:00 · 2011-10-08 17:17:05 -07:00 · 2011-10-06 17:10:30 -07:00
305 changed files with 21736 additions and 5659 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@ ispc
 ispc_test
 objs
 docs/doxygen
+docs/ispc.html
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -114,3 +114,30 @@ CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
 SOFTWARE.
+
+---------------------------------------------------------------------------
+
+ispc's code to convert to and from half-precision floats is based on James
+Tursa's code, which is covered by the following license:
+
+Redistribution and use in source and binary forms, with or without 
+modification, are permitted provided that the following conditions are 
+met:
+
+   * Redistributions of source code must retain the above copyright 
+     notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above copyright 
+     notice, this list of conditions and the following disclaimer in 
+     the documentation and/or other materials provided with the distribution
+      
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+POSSIBILITY OF SUCH DAMAGE.
--- a/54
+++ b/54
@@ -10,12 +10,18 @@ CLANG_LIBS = -lclangFrontend -lclangDriver \
             -lclangSerialization -lclangParse -lclangSema \
             -lclangAnalysis -lclangAST -lclangLex -lclangBasic

-LLVM_LIBS=$(shell llvm-config --ldflags --libs) -lpthread -ldl
+ISPC_LIBS=$(CLANG_LIBS) \
+	$(shell llvm-config --ldflags --libs) \
+	-lpthread -ldl
+ISPC_TEST_LIBS=$(shell llvm-config --ldflags --libs) \
+	-lpthread -ldl
+
 LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
-LLVM_VERSION_DEF=-DLLVM_$(shell llvm-config --version | sed s/\\./_/)
+LLVM_VERSION=$(shell llvm-config --version | sed s/\\./_/)
+LLVM_VERSION_DEF=-DLLVM_$(LLVM_VERSION)

 BUILD_DATE=$(shell date +%Y%m%d)
-BUILD_VERSION=$(shell git log | head -1)
+BUILD_VERSION=$(shell git log --abbrev-commit --abbrev=16 | head -1)

 CXX=g++
 CPP=cpp
@@ -38,22 +44,24 @@ YACC=bison -d -v -t

 ###########################################################################

-CXX_SRC=builtins.cpp ctx.cpp decl.cpp expr.cpp ispc.cpp \
+CXX_SRC=ast.cpp builtins.cpp ctx.cpp decl.cpp expr.cpp func.cpp ispc.cpp \
 	llvmutil.cpp main.cpp module.cpp opt.cpp stmt.cpp sym.cpp type.cpp \
 	util.cpp
-HEADERS=builtins.h ctx.h decl.h expr.h ispc.h llvmutil.h module.h \
+HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
-STDLIB_SRC=stdlib-avx.ll stdlib-sse2.ll stdlib-sse4.ll stdlib-sse4x2.ll
+BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll \
+	builtins-sse4.ll builtins-sse4-x2.ll builtins-dispatch.ll
 BISON_SRC=parse.yy
 FLEX_SRC=lex.ll

-OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(STDLIB_SRC:.ll=.o) stdlib-c.o stdlib_ispc.o \
-	$(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o))
+OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_SRC:.ll=.o) \
+	builtins-c-32.o builtins-c-64.o stdlib_ispc.o $(BISON_SRC:.yy=.o) \
+	$(FLEX_SRC:.ll=.o))

 default: ispc ispc_test

 .PHONY: dirs clean depend doxygen print_llvm_src
-.PRECIOUS: objs/stdlib-%.cpp
+.PRECIOUS: objs/builtins-%.cpp

 depend: $(CXX_SRC) $(HEADERS)
 	@echo Updating dependencies
@@ -77,11 +85,11 @@ doxygen:

 ispc: print_llvm_src dirs $(OBJS)
 	@echo Creating ispc executable
-	@$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(CLANG_LIBS) $(LLVM_LIBS)
+	@$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)

 ispc_test: dirs ispc_test.cpp
 	@echo Creating ispc_test executable
-	@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $@ ispc_test.cpp $(LLVM_LIBS)
+	@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $@ ispc_test.cpp $(ISPC_TEST_LIBS)

 objs/%.o: %.cpp
 	@echo Compiling $<
@@ -103,19 +111,27 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
 	@echo Compiling $<
 	@$(CXX) $(CXXFLAGS) -o $@ -c $<

-objs/stdlib-%.cpp: stdlib-%.ll stdlib.m4 stdlib-sse.ll
-	@echo Creating C++ source from stdlib file $<
-	@m4 stdlib.m4 $< | ./bitcode2cpp.py $< > $@
+objs/builtins-%.cpp: builtins-%.ll builtins.m4 builtins-sse.ll builtins-avx-common.ll
+	@echo Creating C++ source from builtin definitions file $<
+	@m4 -DLLVM_VERSION=$(LLVM_VERSION) builtins.m4 $< | ./bitcode2cpp.py $< > $@

-objs/stdlib-%.o: objs/stdlib-%.cpp
+objs/builtins-%.o: objs/builtins-%.cpp
 	@echo Compiling $<
 	@$(CXX) $(CXXFLAGS) -o $@ -c $<

-objs/stdlib-c.cpp: stdlib-c.c
-	@echo Creating C++ source from stdlib file $<
-	@$(CLANG) -I /opt/l1om/usr/include/ -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py $< > $@
+objs/builtins-c-32.cpp: builtins-c.c
+	@echo Creating C++ source from builtins definition file $<
+	@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-32.c > $@

-objs/stdlib-c.o: objs/stdlib-c.cpp
+objs/builtins-c-32.o: objs/builtins-c-32.cpp
+	@echo Compiling $<
+	@$(CXX) $(CXXFLAGS) -o $@ -c $<
+
+objs/builtins-c-64.cpp: builtins-c.c
+	@echo Creating C++ source from builtins definition file $<
+	@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-64.c > $@
+
+objs/builtins-c-64.o: objs/builtins-c-64.cpp
 	@echo Compiling $<
 	@$(CXX) $(CXXFLAGS) -o $@ -c $<

--- a/README.txt
+++ b/README.txt
@@ -15,8 +15,8 @@ code.

 ispc is an open source compiler under the BSD license; see the file
 LICENSE.txt.  ispc supports Windows, Mac, and Linux, with both x86 and
-x86-64 targets. It currently supports the SSE2 and SSE4 instruction sets,
-though support for AVX should be available soon.
+x86-64 targets.  It currently supports the SSE2, SSE4, and AVX instruction
+sets.

 For more information and examples, as well as a wiki and the bug database,
 see the ispc distribution site, http://ispc.github.com.
--- a/ast.cpp
+++ b/ast.cpp
@@ -0,0 +1,65 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file ast.cpp
+    @brief Out-of-line definitions for the ASTNode and AST classes declared in ast.h.
+*/
+
+#include "ast.h"
+#include "decl.h"
+#include "func.h"
+#include "type.h"
+#include "sym.h"
+
+///////////////////////////////////////////////////////////////////////////
+// ASTNode
+
+ASTNode::~ASTNode() {
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// AST
+
+void
+AST::AddFunction(DeclSpecs *ds, Declarator *decl, Stmt *code) {
+    functions.push_back(new Function(ds, decl, code));
+}
+
+
+void
+AST::GenerateIR() {
+    for (unsigned int i = 0; i < functions.size(); ++i)
+        functions[i]->GenerateIR();
+}
+
--- a/ast.h
+++ b/ast.h
@@ -0,0 +1,93 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file ast.h
+    @brief Declares the ASTNode base class and the AST container of per-function syntax trees.
+*/
+
+#ifndef ISPC_AST_H
+#define ISPC_AST_H 1
+
+#include "ispc.h"
+#include <vector>
+
+/** @brief Abstract base class for nodes in the abstract syntax tree (AST).
+
+    This class defines a basic interface that all abstract syntax tree
+    (AST) nodes must implement.  The base classes for both expressions
+    (Expr) and statements (Stmt) inherit from this class.
+*/
+class ASTNode {
+public:
+    ASTNode(SourcePos p) : pos(p) { }
+    virtual ~ASTNode();
+
+    /** The Optimize() method should perform any appropriate early-stage
+        optimizations on the node (e.g. constant folding).  The caller
+        should use the returned ASTNode * in place of the original node.
+        This method may return NULL if an error is encountered during
+        optimization. */
+    virtual ASTNode *Optimize() = 0;
+
+    /** Type checking should be performed by the node when this method is
+        called.  In the event of an error, a NULL value may be returned.
+        As with ASTNode::Optimize(), the caller should store the returned
+        pointer in place of the original ASTNode *. */
+    virtual ASTNode *TypeCheck() = 0;
+
+    virtual int EstimateCost() const = 0;
+
+    /** All AST nodes must track the file position where they are
+        defined. */
+    const SourcePos pos;
+};
+
+
+/** Simple representation of the abstract syntax trees for all of the
+    functions declared in a compilation unit.
+ */
+class AST {
+public:
+    /** Add the AST for a function described by the given declaration
+        information and source code. */
+    void AddFunction(DeclSpecs *ds, Declarator *decl, Stmt *code);
+
+    /** Generate LLVM IR for all of the functions into the current
+        module. */
+    void GenerateIR();
+
+private:
+    std::vector<Function *> functions;
+};
+
+#endif // ISPC_AST_H
--- a/bitcode2cpp.py
+++ b/bitcode2cpp.py
@@ -4,30 +4,36 @@ import sys
 import string
 import re
 import subprocess
+import platform
+import os

 length=0

 src=str(sys.argv[1])

-target = re.sub(".*stdlib-", "", src)
+target = re.sub(".*builtins-", "", src)
 target = re.sub("\.ll$", "", target)
 target = re.sub("\.c$", "", target)
 target = re.sub("-", "_", target)

+llvm_as="llvm-as"
+if platform.system() == 'Windows' or string.find(platform.system(), "CYGWIN_NT") != -1:
+    llvm_as = os.getenv("LLVM_INSTALL_DIR").replace("\\", "/") + "/bin/" + llvm_as
+
 try:
-    as_out=subprocess.Popen([ "llvm-as", "-", "-o", "-"], stdout=subprocess.PIPE)
+    as_out=subprocess.Popen([llvm_as, "-", "-o", "-"], stdout=subprocess.PIPE)
 except IOError:
    print >> sys.stderr, "Couldn't open " + src
    sys.exit(1)

-print "unsigned char stdlib_bitcode_" + target + "[] = {"
+print "unsigned char builtins_bitcode_" + target + "[] = {"
 for line in as_out.stdout.readlines():
    length = length + len(line)
    for c in line:
        print ord(c)
        print ", "
 print " 0 };\n\n"
-print "int stdlib_bitcode_" + target + "_length = " + str(length) + ";\n"
+print "int builtins_bitcode_" + target + "_length = " + str(length) + ";\n"

 as_out.wait()

--- a/buildall.bat
+++ b/buildall.bat
@@ -0,0 +1,16 @@
+@echo off
+
+REM If LLVM_INSTALL_DIR isn't set globally in your environment,
+REM it can be set here; a value that is already set is left untouched.
+if not defined LLVM_INSTALL_DIR set LLVM_INSTALL_DIR=c:\users\mmp\llvm-dev
+
+REM Both the LLVM binaries and python need to be in the path
+set path=%LLVM_INSTALL_DIR%\bin;%PATH%;c:\cygwin\bin
+REM Build the compiler and its test driver (Win32, Release).
+msbuild ispc.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release
+msbuild ispc_test.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release
+REM Rebuild the examples for each platform/configuration pair.
+msbuild examples\examples.sln /V:m /p:Platform=x64 /p:Configuration=Release /t:rebuild
+msbuild examples\examples.sln /V:m /p:Platform=x64 /p:Configuration=Debug /t:rebuild
+msbuild examples\examples.sln /V:m /p:Platform=Win32 /p:Configuration=Release /t:rebuild
+msbuild examples\examples.sln /V:m /p:Platform=Win32 /p:Configuration=Debug /t:rebuild
--- a/builtins-avx-common.ll
+++ b/builtins-avx-common.ll
@@ -0,0 +1,278 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; *** Untested *** AVX target implementation.
+;;
+;; The LLVM AVX code generator is incomplete, so the ispc AVX target
+;; hasn't yet been tested.  There is therefore a higher-than-normal
+;; chance that there are bugs in the code in this file.
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
+
+define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
+;    uniform float iv = extract(__rcp_u(v), 0);
+;    return iv * (2. - v * iv);
+  %vecval = insertelement <4 x float> undef, float %0, i32 0
+  %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
+  %scall = extractelement <4 x float> %call, i32 0
+
+  ; do one N-R iteration
+  %v_iv = fmul float %0, %scall
+  %two_minus = fsub float 2., %v_iv  
+  %iv_mul = fmul float %scall, %two_minus
+  ret float %iv_mul
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding floats
+
+declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
+
+define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
+  ; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
+  ; the roundss intrinsic is a total mess--docs say:
+  ;
+  ;  __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
+  ;       
+  ;  b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
+  ;  on b0. The higher order 96 bits are copied directly from input parameter a. The
+  ;  return value is described by the following equations:
+  ;
+  ;  r0 = RND(b0)
+  ;  r1 = a1
+  ;  r2 = a2
+  ;  r3 = a3
+  ;
+  ;  It doesn't matter what we pass as a, since we only need the r0 value
+  ;  here.  So we pass the same register for both.
+  %xi = insertelement <4 x float> undef, float %0, i32 0
+  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
+  %rs = extractelement <4 x float> %xr, i32 0
+  ret float %rs
+}
+
+define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
+  ; see above for round_ss intrinsic discussion...
+  %xi = insertelement <4 x float> undef, float %0, i32 0
+  ; roundss, round down 0b01 | don't signal precision exceptions 0b1000 -> 0b1001 = 9
+  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
+  %rs = extractelement <4 x float> %xr, i32 0
+  ret float %rs
+}
+
+define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
+  ; see above for round_ss intrinsic discussion...
+  %xi = insertelement <4 x float> undef, float %0, i32 0
+  ; roundss, round up 0b10 | don't signal precision exceptions 0b1000 -> 0b1010 = 10
+  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
+  %rs = extractelement <4 x float> %xr, i32 0
+  ret float %rs
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
+  %xi = insertelement <2 x double> undef, double %0, i32 0
+  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
+  %rs = extractelement <2 x double> %xr, i32 0
+  ret double %rs
+}
+
+define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
+  ; see above for round_ss intrinsic discussion...
+  %xi = insertelement <2 x double> undef, double %0, i32 0
+  ; roundsd, round down 0b01 | don't signal precision exceptions 0b1000 -> 0b1001 = 9
+  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
+  %rs = extractelement <2 x double> %xr, i32 0
+  ret double %rs
+}
+
+define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
+  ; see above for round_ss intrinsic discussion...
+  %xi = insertelement <2 x double> undef, double %0, i32 0
+  ; roundsd, round up 0b10 | don't signal precision exceptions 0b1000 -> 0b1010 = 10
+  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
+  %rs = extractelement <2 x double> %xr, i32 0
+  ret double %rs
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
+
+define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
+  ;  uniform float is = extract(__rsqrt_u(v), 0);
+  %v = insertelement <4 x float> undef, float %0, i32 0
+  %vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
+  %is = extractelement <4 x float> %vis, i32 0
+
+  ;  return 0.5 * is * (3. - (v * is) * is);
+  %v_is = fmul float %0, %is
+  %v_is_is = fmul float %v_is, %is
+  %three_sub = fsub float 3., %v_is_is
+  %is_mul = fmul float %is, %three_sub
+  %half_scale = fmul float 0.5, %is_mul
+  ret float %half_scale
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; sqrt
+
+declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
+
+define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
+  sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
+  ret float %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; fastmath
+
+declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
+declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
+
+define internal void @__fastmath() nounwind alwaysinline {
+  %ptr = alloca i32
+  %ptr8 = bitcast i32 * %ptr to i8 *
+  call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
+  %oldval = load i32 *%ptr
+
+  ; turn on DAZ (64)/FTZ (32768) -> 32832
+  %update = or i32 %oldval, 32832
+  store i32 %update, i32 *%ptr
+  call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
+  ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
+  ret float %ret
+}
+
+define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
+  ret float %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
+  ret i32 %ret
+}
+
+define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
+  ret i32 %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
+  ret i32 %ret
+}
+
+define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
+  ret i32 %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops
+
+declare i32 @llvm.ctpop.i32(i32) nounwind readnone
+
+define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
+  %call = call i32 @llvm.ctpop.i32(i32 %0)
+  ret i32 %call
+}
+
+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
+
+define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
+  %call = call i64 @llvm.ctpop.i64(i64 %0)
+  ret i64 %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+; Scalar double sqrt: the sqrtsd intrinsic carries the sse2 prefix, like the min.sd/max.sd declarations below.
+define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
+  sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
+  ret double %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
+  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
+  ret double %ret
+}
+
+define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
+  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
+  ret double %ret
+}
--- a/builtins-avx-x2.ll
+++ b/builtins-avx-x2.ll
@@ -0,0 +1,665 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; *** Untested *** AVX target implementation.
+;;
+;; The LLVM AVX code generator is incomplete, so the ispc AVX target
+;; hasn't yet been tested.  There is therefore a higher-than-normal
+;; chance that there are bugs in the code in this file.
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Basic 16-wide definitions
+
+stdlib_core(16)
+packed_load_and_store(16)
+scans(16)
+int64minmax(16)
+
+include(`builtins-avx-common.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
+
+; Reciprocal of each of the 16 float lanes: the result of the 8-wide AVX
+; rcp intrinsic (applied to the 16-wide input by the helper macro, which
+; names its result %call) followed by one Newton-Raphson iteration.
+define internal <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
+  ;  float iv = __rcp_v(v);
+  ;  return iv * (2. - v * iv);
+
+  unary8to16(call, float, @llvm.x86.avx.rcp.ps.256, %0)
+  ; do one N-R iteration
+  ; v * iv
+  %v_iv = fmul <16 x float> %0, %call
+  ; 2 - v * iv
+  %two_minus = fsub <16 x float> <float 2., float 2., float 2., float 2.,
+                                  float 2., float 2., float 2., float 2.,
+                                  float 2., float 2., float 2., float 2.,
+                                  float 2., float 2., float 2., float 2.>, %v_iv  
+  ; iv * (2 - v * iv)
+  %iv_mul = fmul <16 x float> %call, %two_minus
+  ret <16 x float> %iv_mul
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding floats
+
+declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
+
+; Round each of the 16 float lanes with rounding immediate 8.  No return
+; instruction appears here, so the helper macro presumably emits it along
+; with the 8-wide round calls -- TODO confirm against the macro definition.
+define internal <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
+  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
+  round8to16(%0, 8)
+}
+
+; Floor of each of the 16 float lanes, using rounding immediate 9.  No
+; return instruction appears here, so the helper macro presumably emits it
+; -- TODO confirm against the macro definition.
+define internal <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 -> 0b1001 = 9
+  round8to16(%0, 9)
+}
+
+; Ceiling of each of the 16 float lanes, using rounding immediate 10.  No
+; return instruction appears here, so the helper macro presumably emits it
+; -- TODO confirm against the macro definition.
+define internal <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 -> 0b1010 = 10
+  round8to16(%0, 10)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
+
+; Round each of the 16 double lanes.  Immediate 8 is the same value the
+; float version uses for round-to-nearest; the helper macro presumably
+; emits the 4-wide round calls and the return -- TODO confirm.
+define internal <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline {
+  round4to16double(%0, 8)
+}
+
+; Floor of each of the 16 double lanes.  Immediate 9 is the same value the
+; float version uses for round-down; the helper macro presumably emits the
+; 4-wide round calls and the return -- TODO confirm.
+define internal <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline {
+  round4to16double(%0, 9)
+}
+
+; Ceiling of each of the 16 double lanes.  Immediate 10 is the same value
+; the float version uses for round-up; the helper macro presumably emits
+; the 4-wide round calls and the return -- TODO confirm.
+define internal <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline {
+  round4to16double(%0, 10)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
+declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
+
+; Reciprocal square root of each of the 16 float lanes: the result of the
+; 8-wide AVX rsqrt intrinsic (applied to the 16-wide input by the helper
+; macro, which names its result %is) followed by the refinement step
+; 0.5 * is * (3 - v * is * is).
+define internal <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
+  ;  float is = __rsqrt_v(v);
+  unary8to16(is, float, @llvm.x86.avx.rsqrt.ps.256, %v)
+  ;  return 0.5 * is * (3. - (v * is) * is);
+  %v_is = fmul <16 x float> %v, %is
+  %v_is_is = fmul <16 x float> %v_is, %is
+  ; 3 - (v * is) * is
+  %three_sub = fsub <16 x float> <float 3., float 3., float 3., float 3.,
+                                  float 3., float 3., float 3., float 3.,
+                                  float 3., float 3., float 3., float 3.,
+                                  float 3., float 3., float 3., float 3.>, %v_is_is
+  %is_mul = fmul <16 x float> %is, %three_sub
+  ; final scale by 0.5
+  %half_scale = fmul <16 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
+                                   float 0.5, float 0.5, float 0.5, float 0.5,
+                                   float 0.5, float 0.5, float 0.5, float 0.5,
+                                   float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
+  ret <16 x float> %half_scale
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; sqrt
+
+declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
+
+; Square root of each of the 16 float lanes, using the 8-wide AVX sqrt
+; intrinsic via the helper macro, which names its 16-wide result %call.
+define internal <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline {
+  unary8to16(call, float, @llvm.x86.avx.sqrt.ps.256, %0)
+  ret <16 x float> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; svml
+
+; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
+; or, use the macro to call the 4-wide ones 4x with our 16-wide
+; vectors...
+
+declare <16 x float> @__svml_sin(<16 x float>)
+declare <16 x float> @__svml_cos(<16 x float>)
+declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *)
+declare <16 x float> @__svml_tan(<16 x float>)
+declare <16 x float> @__svml_atan(<16 x float>)
+declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>)
+declare <16 x float> @__svml_exp(<16 x float>)
+declare <16 x float> @__svml_log(<16 x float>)
+declare <16 x float> @__svml_pow(<16 x float>, <16 x float>)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
+declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+; Lane-wise maximum of two 16-wide float vectors, using the 8-wide AVX
+; max.ps intrinsic via the helper macro, which names its result %call.
+define internal <16 x float> @__max_varying_float(<16 x float>,
+                                                  <16 x float>) nounwind readonly alwaysinline {
+  binary8to16(call, float, @llvm.x86.avx.max.ps.256, %0, %1)
+  ret <16 x float> %call
+}
+
+; Lane-wise minimum of two 16-wide float vectors, using the 8-wide AVX
+; min.ps intrinsic via the helper macro, which names its result %call.
+define internal <16 x float> @__min_varying_float(<16 x float>,
+                                                  <16 x float>) nounwind readonly alwaysinline {
+  binary8to16(call, float, @llvm.x86.avx.min.ps.256, %0, %1)
+  ret <16 x float> %call
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+; Lane-wise signed 32-bit minimum of two 16-wide vectors.  The helper macro
+; is handed the 4-wide SSE4.1 pminsd intrinsic; presumably it applies it to
+; four 4-lane pieces -- TODO confirm against the macro definition.
+define internal <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
+  ret <16 x i32> %ret
+}
+
+; Lane-wise signed 32-bit maximum of two 16-wide vectors.  The helper macro
+; is handed the 4-wide SSE4.1 pmaxsd intrinsic; presumably it applies it to
+; four 4-lane pieces -- TODO confirm against the macro definition.
+define internal <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
+  ret <16 x i32> %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+; Lane-wise unsigned 32-bit minimum of two 16-wide vectors.  The helper
+; macro is handed the 4-wide SSE4.1 pminud intrinsic; presumably it applies
+; it to four 4-lane pieces -- TODO confirm against the macro definition.
+define internal <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
+  ret <16 x i32> %ret
+}
+
+; Lane-wise unsigned 32-bit maximum of two 16-wide vectors.  The helper
+; macro is handed the 4-wide SSE4.1 pmaxud intrinsic; presumably it applies
+; it to four 4-lane pieces -- TODO confirm against the macro definition.
+define internal <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
+  ret <16 x i32> %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops
+
+declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
+
+; Collapse a 16-lane mask vector into one i32.  Each 8-lane half is passed
+; to the AVX movmsk.ps.256 intrinsic; the value for lanes 8-15 is shifted
+; left by 8 and OR-ed with the value for lanes 0-7.
+define internal i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
+  %as_float = bitcast <16 x i32> %0 to <16 x float>
+
+  ; split into lanes 0-7 and lanes 8-15
+  %lanes_lo = shufflevector <16 x float> %as_float, <16 x float> undef,
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %lanes_hi = shufflevector <16 x float> %as_float, <16 x float> undef,
+          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  %bits_lo = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %lanes_lo) nounwind readnone
+  %bits_hi = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %lanes_hi) nounwind readnone
+
+  ; upper half occupies bits 8 and up
+  %bits_hi_shifted = shl i32 %bits_hi, 8
+  %bits_all = or i32 %bits_hi_shifted, %bits_lo
+  ret i32 %bits_all
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal float ops
+
+declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+; Sum of all 16 float lanes.  The two 8-lane halves go through three rounds
+; of the AVX hadd.ps.256 intrinsic; elements 0 and 4 of the last round are
+; then added to give the returned total.
+define internal float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
+  %half_lo = shufflevector <16 x float> %0, <16 x float> undef,
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %half_hi = shufflevector <16 x float> %0, <16 x float> undef,
+          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  ; three successive horizontal adds
+  %pass1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %half_lo, <8 x float> %half_hi)
+  %pass2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %pass1, <8 x float> %pass1)
+  %pass3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %pass2, <8 x float> %pass2)
+
+  ; combine the two remaining partial sums
+  %part_a = extractelement <8 x float> %pass3, i32 0
+  %part_b = extractelement <8 x float> %pass3, i32 4
+  %total = fadd float %part_a, %part_b
+  ret float %total
+}
+
+
+; Minimum over all 16 float lanes.  The whole body, presumably including
+; the return, comes from the 16-wide reduction macro (defined elsewhere),
+; which is handed the varying and uniform float min functions.
+define internal float @__reduce_min_float(<16 x float>) nounwind readnone alwaysinline {
+  reduce16(float, @__min_varying_float, @__min_uniform_float)
+}
+
+
+; Maximum over all 16 float lanes.  The whole body, presumably including
+; the return, comes from the 16-wide reduction macro (defined elsewhere),
+; which is handed the varying and uniform float max functions.
+define internal float @__reduce_max_float(<16 x float>) nounwind readnone alwaysinline {
+  reduce16(float, @__max_varying_float, @__max_uniform_float)
+}
+
+reduce_equal(16)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int32 ops
+
+; Lane-wise 32-bit integer add of two 16-wide vectors; passed as the vector
+; combining function to the int32 add reduction.
+define internal <16 x i32> @__add_varying_int32(<16 x i32>, <16 x i32>) nounwind readnone alwaysinline {
+  %lanewise_sum = add <16 x i32> %0, %1
+  ret <16 x i32> %lanewise_sum
+}
+
+; Scalar 32-bit integer add; passed as the scalar combining function to the
+; int32 add reduction.
+define internal i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
+  %scalar_sum = add i32 %0, %1
+  ret i32 %scalar_sum
+}
+
+; Sum of all 16 int32 lanes.  The whole body, presumably including the
+; return, comes from the 16-wide reduction macro (defined elsewhere), which
+; is handed the varying and uniform int32 add functions.
+define internal i32 @__reduce_add_int32(<16 x i32>) nounwind readnone alwaysinline {
+  reduce16(i32, @__add_varying_int32, @__add_uniform_int32)
+}
+
+
+; Signed minimum over all 16 int32 lanes.  The whole body, presumably
+; including the return, comes from the 16-wide reduction macro, which is
+; handed the varying and uniform signed min functions.
+define internal i32 @__reduce_min_int32(<16 x i32>) nounwind readnone alwaysinline {
+  reduce16(i32, @__min_varying_int32, @__min_uniform_int32)
+}
+
+
+; Signed maximum over all 16 int32 lanes.  The whole body, presumably
+; including the return, comes from the 16-wide reduction macro, which is
+; handed the varying and uniform signed max functions.
+define internal i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
+  reduce16(i32, @__max_varying_int32, @__max_uniform_int32)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; horizontal uint32 ops
+
+; Sum of all 16 uint32 lanes.  Forwards to the int32 reduction, since the
+; integer add instruction does not distinguish signed from unsigned.
+define internal i32 @__reduce_add_uint32(<16 x i32> %v) nounwind readnone alwaysinline {
+  %total = call i32 @__reduce_add_int32(<16 x i32> %v)
+  ret i32 %total
+}
+
+; Unsigned minimum over all 16 uint32 lanes.  The whole body, presumably
+; including the return, comes from the 16-wide reduction macro, which is
+; handed the varying and uniform unsigned min functions.
+define internal i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone alwaysinline {
+  reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32)
+}
+
+
+; Unsigned maximum over all 16 uint32 lanes.  The whole body, presumably
+; including the return, comes from the 16-wide reduction macro, which is
+; handed the varying and uniform unsigned max functions.
+define internal i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinline {
+  reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal double ops
+
+declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+; Sum of all 16 double lanes.  The input is cut into four 4-lane quarters,
+; which are added pairwise; two rounds of the AVX hadd.pd.256 intrinsic
+; follow, and elements 0 and 2 of the last round are added for the total.
+define internal double @__reduce_add_double(<16 x double>) nounwind readonly alwaysinline {
+  ; the four quarters of the input
+  %q0 = shufflevector <16 x double> %0, <16 x double> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %q1 = shufflevector <16 x double> %0, <16 x double> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %q2 = shufflevector <16 x double> %0, <16 x double> undef,
+         <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %q3 = shufflevector <16 x double> %0, <16 x double> undef,
+         <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+
+  ; vertical adds: 16 lanes down to 8
+  %q01 = fadd <4 x double> %q0, %q1
+  %q23 = fadd <4 x double> %q2, %q3
+
+  ; two horizontal adds
+  %pass1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %q01, <4 x double> %q23)
+  %pass2 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %pass1, <4 x double> %pass1)
+
+  ; combine the two remaining partial sums
+  %part_a = extractelement <4 x double> %pass2, i32 0
+  %part_b = extractelement <4 x double> %pass2, i32 2
+  %total = fadd double %part_a, %part_b
+  ret double %total
+}
+
+; Minimum over all 16 double lanes.  The whole body, presumably including
+; the return, comes from the 16-wide reduction macro, which is handed the
+; varying and uniform double min functions.
+define internal double @__reduce_min_double(<16 x double>) nounwind readnone alwaysinline {
+  reduce16(double, @__min_varying_double, @__min_uniform_double)
+}
+
+
+; Maximum over all 16 double lanes.  The whole body, presumably including
+; the return, comes from the 16-wide reduction macro, which is handed the
+; varying and uniform double max functions.
+define internal double @__reduce_max_double(<16 x double>) nounwind readnone alwaysinline {
+  reduce16(double, @__max_varying_double, @__max_uniform_double)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int64 ops
+
+; Lane-wise 64-bit integer add of two 16-wide vectors; passed as the vector
+; combining function to the int64 add reduction.
+define internal <16 x i64> @__add_varying_int64(<16 x i64>, <16 x i64>) nounwind readnone alwaysinline {
+  %lanewise_sum = add <16 x i64> %0, %1
+  ret <16 x i64> %lanewise_sum
+}
+
+; Scalar 64-bit integer add; passed as the scalar combining function to the
+; int64 add reduction.
+define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
+  %scalar_sum = add i64 %0, %1
+  ret i64 %scalar_sum
+}
+
+; Sum of all 16 int64 lanes.  The whole body, presumably including the
+; return, comes from the 16-wide reduction macro (defined elsewhere), which
+; is handed the varying and uniform int64 add functions.
+define internal i64 @__reduce_add_int64(<16 x i64>) nounwind readnone alwaysinline {
+  reduce16(i64, @__add_varying_int64, @__add_uniform_int64)
+}
+
+
+; Signed minimum over all 16 int64 lanes, via the 16-wide reduction macro.
+; The int64 min functions it is handed are not defined in this part of the
+; file; presumably another macro instantiation provides them -- TODO confirm.
+define internal i64 @__reduce_min_int64(<16 x i64>) nounwind readnone alwaysinline {
+  reduce16(i64, @__min_varying_int64, @__min_uniform_int64)
+}
+
+
+; Signed maximum over all 16 int64 lanes, via the 16-wide reduction macro.
+; The int64 max functions it is handed are not defined in this part of the
+; file; presumably another macro instantiation provides them -- TODO confirm.
+define internal i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
+  reduce16(i64, @__max_varying_int64, @__max_uniform_int64)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; horizontal uint64 ops
+
+; Sum of all 16 uint64 lanes.  Forwards to the int64 reduction, since the
+; integer add instruction does not distinguish signed from unsigned.
+define internal i64 @__reduce_add_uint64(<16 x i64> %v) nounwind readnone alwaysinline {
+  %total = call i64 @__reduce_add_int64(<16 x i64> %v)
+  ret i64 %total
+}
+
+; Unsigned minimum over all 16 uint64 lanes, via the 16-wide reduction
+; macro.  The uint64 min functions it is handed are not defined in this
+; part of the file; presumably another macro provides them -- TODO confirm.
+define internal i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone alwaysinline {
+  reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
+}
+
+
+; Unsigned maximum over all 16 uint64 lanes, via the 16-wide reduction
+; macro.  The uint64 max functions it is handed are not defined in this
+; part of the file; presumably another macro provides them -- TODO confirm.
+define internal i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
+  reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+load_and_broadcast(16, i8, 8)
+load_and_broadcast(16, i16, 16)
+load_and_broadcast(16, i32, 32)
+load_and_broadcast(16, i64, 64)
+
+; no masked load instruction for i8 and i16 types??
+load_masked(16, i8,  8,  1)
+load_masked(16, i16, 16, 2)
+
+declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
+declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
+ 
+; Masked load of 16 32-bit values from the given byte pointer, done as two
+; 8-wide AVX maskload.ps.256 calls: lanes 0-7 at the pointer itself and
+; lanes 8-15 at byte offset 32.  The two float results are joined and
+; bitcast back to i32.
+define <16 x i32> @__load_masked_32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
+  %mask_f = bitcast <16 x i32> %mask to <16 x float>
+
+  ; mask for each 8-lane half
+  %mask_lo = shufflevector <16 x float> %mask_f, <16 x float> undef,
+     <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %mask_hi = shufflevector <16 x float> %mask_f, <16 x float> undef,
+     <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  ; lanes 0-7
+  %data_lo = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %mask_lo)
+
+  ; lanes 8-15 begin 8 x 4 = 32 bytes further on
+  %addr_hi = getelementptr i8 * %0, i32 32
+  %data_hi = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %addr_hi, <8 x float> %mask_hi)
+
+  ; join the halves and return them as i32
+  %joined = shufflevector <8 x float> %data_lo, <8 x float> %data_hi,
+     <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                 i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %joined_i32 = bitcast <16 x float> %joined to <16 x i32>
+  ret <16 x i32> %joined_i32
+}
+
+
+; Masked load of 16 64-bit values from the given byte pointer, done as four
+; 4-wide AVX maskload.pd.256 calls at byte offsets 0, 32, 64 and 96.  Each
+; 32-bit mask lane is duplicated so that it covers a full 64-bit element
+; before the mask is bitcast to doubles.
+define <16 x i64> @__load_masked_64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
+  ; quarter 0: lanes 0-3
+  %m_q0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %m_q0_pd = bitcast <8 x i32> %m_q0 to <4 x double>
+  %d_q0 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %m_q0_pd)
+
+  ; quarter 1: lanes 4-7
+  %m_q1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  %m_q1_pd = bitcast <8 x i32> %m_q1 to <4 x double>
+  %addr_q1 = getelementptr i8 * %0, i32 32
+  %d_q1 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %addr_q1, <4 x double> %m_q1_pd)
+
+  ; quarter 2: lanes 8-11
+  %m_q2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
+  %m_q2_pd = bitcast <8 x i32> %m_q2 to <4 x double>
+  %addr_q2 = getelementptr i8 * %0, i32 64
+  %d_q2 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %addr_q2, <4 x double> %m_q2_pd)
+
+  ; quarter 3: lanes 12-15
+  %m_q3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+  %m_q3_pd = bitcast <8 x i32> %m_q3 to <4 x double>
+  %addr_q3 = getelementptr i8 * %0, i32 96
+  %d_q3 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %addr_q3, <4 x double> %m_q3_pd)
+
+  ; join 4 + 4 -> 8, then 8 + 8 -> 16
+  %d_lo = shufflevector <4 x double> %d_q0, <4 x double> %d_q1,
+      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %d_hi = shufflevector <4 x double> %d_q2, <4 x double> %d_q3,
+      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %d_all = shufflevector <8 x double> %d_lo, <8 x double> %d_hi,
+      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %d_all_i64 = bitcast <16 x double> %d_all to <16 x i64>
+  ret <16 x i64> %d_all_i64
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+; FIXME: there is no AVX instruction for these, but we could be clever
+; by packing the bits down and setting the last 3/4 or half, respectively,
+; of the mask to zero...  Not sure if this would be a win in the end
+gen_masked_store(16, i8, 8)
+gen_masked_store(16, i16, 16)
+
+; note that mask is the 2nd parameter, not the 3rd one!!
+declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
+declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
+
+; Masked store of 16 32-bit values: argument 0 is the destination, argument
+; 1 the values and argument 2 the mask.  Done as two 8-wide AVX
+; maskstore.ps.256 calls, for lanes 0-7 at the destination and lanes 8-15
+; at byte offset 32.  The intrinsic takes the mask before the value.
+define void @__masked_store_32(<16 x i32>* nocapture, <16 x i32>,
+                               <16 x i32>) nounwind alwaysinline {
+  %dst = bitcast <16 x i32> * %0 to i8 *
+  %data_f = bitcast <16 x i32> %1 to <16 x float>
+  %mask_f = bitcast <16 x i32> %2 to <16 x float>
+
+  ; lanes 0-7
+  %mask_lo = shufflevector <16 x float> %mask_f, <16 x float> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %data_lo = shufflevector <16 x float> %data_f, <16 x float> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+  ; lanes 8-15
+  %mask_hi = shufflevector <16 x float> %mask_f, <16 x float> undef,
+        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %data_hi = shufflevector <16 x float> %data_f, <16 x float> undef,
+        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  call void @llvm.x86.avx.maskstore.ps.256(i8 * %dst, <8 x float> %mask_lo, <8 x float> %data_lo)
+  ; second half begins 8 x 4 = 32 bytes further on
+  %dst_hi = getelementptr i8 * %dst, i32 32
+  call void @llvm.x86.avx.maskstore.ps.256(i8 * %dst_hi, <8 x float> %mask_hi, <8 x float> %data_hi)
+
+  ret void
+}
+
+; Masked store of 16 64-bit values: argument 0 is the destination and
+; argument 1 the values.  Done as four 4-wide AVX maskstore.pd.256 calls at
+; byte offsets 0, 32, 64 and 96.  Each 32-bit mask lane is duplicated so
+; that it covers a full 64-bit element before the bitcast to doubles.
+define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>,
+                               <16 x i32> %mask) nounwind alwaysinline {
+  %dst = bitcast <16 x i64> * %0 to i8 *
+  %data_pd = bitcast <16 x i64> %1 to <16 x double>
+
+  ; quarter 0: lanes 0-3
+  %m_q0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %m_q0_pd = bitcast <8 x i32> %m_q0 to <4 x double>
+  %d_q0 = shufflevector <16 x double> %data_pd, <16 x double> undef,
+     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+
+  ; quarter 1: lanes 4-7
+  %m_q1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  %m_q1_pd = bitcast <8 x i32> %m_q1 to <4 x double>
+  %d_q1 = shufflevector <16 x double> %data_pd, <16 x double> undef,
+     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+  ; quarter 2: lanes 8-11
+  %m_q2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
+  %m_q2_pd = bitcast <8 x i32> %m_q2 to <4 x double>
+  %d_q2 = shufflevector <16 x double> %data_pd, <16 x double> undef,
+     <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+
+  ; quarter 3: lanes 12-15
+  %m_q3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+  %m_q3_pd = bitcast <8 x i32> %m_q3 to <4 x double>
+  %d_q3 = shufflevector <16 x double> %data_pd, <16 x double> undef,
+     <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+
+  ; stores in ascending address order; each quarter is 4 x 8 = 32 bytes
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %dst, <4 x double> %m_q0_pd, <4 x double> %d_q0)
+  %dst_q1 = getelementptr i8 * %dst, i32 32
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %dst_q1, <4 x double> %m_q1_pd, <4 x double> %d_q1)
+  %dst_q2 = getelementptr i8 * %dst, i32 64
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %dst_q2, <4 x double> %m_q2_pd, <4 x double> %d_q2)
+  %dst_q3 = getelementptr i8 * %dst, i32 96
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %dst_q3, <4 x double> %m_q3_pd, <4 x double> %d_q3)
+
+  ret void
+}
+
+
+masked_store_blend_8_16_by_16()
+
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
+                                                <8 x float>) nounwind readnone
+
+; Blend-style masked store of 16 32-bit values: argument 0 is the
+; destination, argument 1 the new values and argument 2 the mask.  The
+; current contents are loaded, combined with the new values by two 8-wide
+; AVX blendv.ps.256 calls (old, new, mask), and the full 16-wide result is
+; written back.  Both memory accesses use 4-byte alignment.
+define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>,
+                                     <16 x i32>) nounwind alwaysinline {
+  %cur_i32 = load <16 x i32>* %0, align 4
+  %cur_f = bitcast <16 x i32> %cur_i32 to <16 x float>
+  %repl_f = bitcast <16 x i32> %1 to <16 x float>
+  %mask_f = bitcast <16 x i32> %2 to <16 x float>
+
+  ; lanes 0-7
+  %cur_lo = shufflevector <16 x float> %cur_f, <16 x float> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %repl_lo = shufflevector <16 x float> %repl_f, <16 x float> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %mask_lo = shufflevector <16 x float> %mask_f, <16 x float> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %mix_lo = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %cur_lo,
+                                                         <8 x float> %repl_lo,
+                                                         <8 x float> %mask_lo)
+
+  ; lanes 8-15
+  %cur_hi = shufflevector <16 x float> %cur_f, <16 x float> undef,
+        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %repl_hi = shufflevector <16 x float> %repl_f, <16 x float> undef,
+        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %mask_hi = shufflevector <16 x float> %mask_f, <16 x float> undef,
+        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %mix_hi = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %cur_hi,
+                                                         <8 x float> %repl_hi,
+                                                         <8 x float> %mask_hi)
+
+  ; join the halves and write everything back
+  %mix = shufflevector <8 x float> %mix_lo, <8 x float> %mix_hi,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %mix_i32 = bitcast <16 x float> %mix to <16 x i32>
+  store <16 x i32> %mix_i32, <16 x i32>* %0, align 4
+  ret void
+}
+
+
+declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
+                                                 <4 x double>) nounwind readnone
+
+; Blend-style masked store of 16 64-bit values.  The current contents of
+; %ptr are loaded, combined with %newi64 by four 4-wide AVX blendv.pd.256
+; calls (old, new, mask), and the full 16-wide result is written back.
+; Each 32-bit mask lane is duplicated so that it covers a full 64-bit
+; element before the mask is bitcast to doubles.
+;
+; Both the load and the store claim only 8-byte alignment.  A store with
+; no align attribute would assume the ABI alignment of <16 x i64>, which
+; the load on the same pointer does not assume.
+define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64, 
+                                     <16 x i32> %mask) nounwind alwaysinline {
+  ; current memory contents, as four 4-wide double vectors
+  %oldValue = load <16 x i64>* %ptr, align 8
+  %old = bitcast <16 x i64> %oldValue to <16 x double>
+  %old0d = shufflevector <16 x double> %old, <16 x double> undef,
+     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %old1d = shufflevector <16 x double> %old, <16 x double> undef,
+     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %old2d = shufflevector <16 x double> %old, <16 x double> undef,
+     <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %old3d = shufflevector <16 x double> %old, <16 x double> undef,
+     <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+
+  ; new values, split the same way
+  %new = bitcast <16 x i64> %newi64 to <16 x double>
+  %new0d = shufflevector <16 x double> %new, <16 x double> undef,
+     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %new1d = shufflevector <16 x double> %new, <16 x double> undef,
+     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %new2d = shufflevector <16 x double> %new, <16 x double> undef,
+     <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %new3d = shufflevector <16 x double> %new, <16 x double> undef,
+     <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+
+  ; double up masks, bitcast to doubles
+  %mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %mask1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  %mask2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
+  %mask3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
+  %mask1d = bitcast <8 x i32> %mask1 to <4 x double>
+  %mask2d = bitcast <8 x i32> %mask2 to <4 x double>
+  %mask3d = bitcast <8 x i32> %mask3 to <4 x double>
+
+  ; blend each quarter
+  %result0d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old0d,
+                                 <4 x double> %new0d, <4 x double> %mask0d)
+  %result1d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old1d,
+                                 <4 x double> %new1d, <4 x double> %mask1d)
+  %result2d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old2d,
+                                 <4 x double> %new2d, <4 x double> %mask2d)
+  %result3d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old3d,
+                                 <4 x double> %new3d, <4 x double> %mask3d)
+
+  ; join 4 + 4 -> 8, then 8 + 8 -> 16
+  %result01 = shufflevector <4 x double> %result0d, <4 x double> %result1d,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %result23 = shufflevector <4 x double> %result2d, <4 x double> %result3d,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+  %result = shufflevector <8 x double> %result01, <8 x double> %result23,
+           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %result64 = bitcast <16 x double> %result to <16 x i64>
+  ; same 8-byte alignment as the load above
+  store <16 x i64> %result64, <16 x i64> * %ptr, align 8
+  ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather/scatter
+
+gen_gather(16, i8)
+gen_gather(16, i16)
+gen_gather(16, i32)
+gen_gather(16, i64)
+
+gen_scatter(16, i8)
+gen_scatter(16, i16)
+gen_scatter(16, i32)
+gen_scatter(16, i64)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
+
+; Square root of each of the 16 double lanes, using the 4-wide AVX
+; sqrt.pd.256 intrinsic via the helper macro, which names its result %ret.
+define internal <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alwaysinline {
+  unary4to16(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
+  ret <16 x double> %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
+declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+; Lane-wise minimum of two 16-wide double vectors, using the 4-wide AVX
+; min.pd.256 intrinsic via the helper macro, which names its result %ret.
+define internal <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
+  binary4to16(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
+  ret <16 x double> %ret
+}
+
+; Lane-wise maximum of two 16-wide double vectors, using the 4-wide AVX
+; max.pd.256 intrinsic via the helper macro, which names its result %ret.
+define internal <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
+  binary4to16(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
+  ret <16 x double> %ret
+}
--- a/builtins-avx.ll
+++ b/builtins-avx.ll
@@ -41,13 +41,15 @@

 stdlib_core(8)
 packed_load_and_store(8)
-int8_16(8)
+scans(8)
+int64minmax(8)
+
+include(`builtins-avx-common.ll')

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp

 declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
-declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone

 define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ;  float iv = __rcp_v(v);
@@ -62,25 +64,10 @@ define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly
  ret <8 x float> %iv_mul
 }

-define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
-;    uniform float iv = extract(__rcp_u(v), 0);
-;    return iv * (2. - v * iv);
-  %vecval = insertelement <4 x float> undef, float %0, i32 0
-  %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
-  %scall = extractelement <4 x float> %call, i32 0
-
-  ; do one N-R iteration
-  %v_iv = fmul float %0, %scall
-  %two_minus = fsub float 2., %v_iv  
-  %iv_mul = fmul float %scall, %two_minus
-  ret float %iv_mul
-}
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; rounding
+;; rounding floats

 declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
-declare <4 x float> @llvm.x86.sse.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone

 define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
@@ -88,61 +75,43 @@ define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonl
  ret <8 x float> %call
 }

-define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
-  ; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
-  ; the roundss intrinsic is a total mess--docs say:
-  ;
-  ;  __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
-  ;       
-  ;  b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
-  ;  on b0. The higher order 96 bits are copied directly from input parameter a. The
-  ;  return value is described by the following equations:
-  ;
-  ;  r0 = RND(b0)
-  ;  r1 = a1
-  ;  r2 = a2
-  ;  r3 = a3
-  ;
-  ;  It doesn't matter what we pass as a, since we only need the r0 value
-  ;  here.  So we pass the same register for both.
-  %xi = insertelement <4 x float> undef, float %0, i32 0
-  %xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
-  %rs = extractelement <4 x float> %xr, i32 0
-  ret float %rs
-}
-
 define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
-  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 0b1001 = 9
  %call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 9)
  ret <8 x float> %call
 }

-define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
-  ; see above for round_ss instrinsic discussion...
-  %xi = insertelement <4 x float> undef, float %0, i32 0
-  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
-  %xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
-  %rs = extractelement <4 x float> %xr, i32 0
-  ret float %rs
-}
-
 define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 0b1010 = 10
  %call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 10)
  ret <8 x float> %call
 }

-define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
-  ; see above for round_ss instrinsic discussion...
-  %xi = insertelement <4 x float> undef, float %0, i32 0
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
-  %xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
-  %rs = extractelement <4 x float> %xr, i32 0
-  ret float %rs
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
+
+define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
+  round4to8double(%0, 8)  ; roundpd, round mode nearest 0b00 | do not signal precision exceptions 0b1000 = 8
+}

+define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  round4to8double(%0, 9)
+}
+
+
+define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  round4to8double(%0, 10)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
 declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
-declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone

 define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
  ;  float is = __rsqrt_v(v);
@@ -150,64 +119,24 @@ define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind read
  ;  return 0.5 * is * (3. - (v * is) * is);
  %v_is = fmul <8 x float> %v, %is
  %v_is_is = fmul <8 x float> %v_is, %is
-  %three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3., float 3., float 3., float 3., float 3.>, %v_is_is
+  %three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3.,
+                                 float 3., float 3., float 3., float 3.>, %v_is_is
  %is_mul = fmul <8 x float> %is, %three_sub
-  %half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
+  %half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
+                                  float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
  ret <8 x float> %half_scale
 }

-define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
-  ;  uniform float is = extract(__rsqrt_u(v), 0);
-  %v = insertelement <4 x float> undef, float %0, i32 0
-  %vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
-  %is = extractelement <4 x float> %vis, i32 0
-
-  ;  return 0.5 * is * (3. - (v * is) * is);
-  %v_is = fmul float %0, %is
-  %v_is_is = fmul float %v_is, %is
-  %three_sub = fsub float 3., %v_is_is
-  %is_mul = fmul float %is, %three_sub
-  %half_scale = fmul float 0.5, %is_mul
-  ret float %half_scale
-}
-
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; sqrt

 declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
-declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone

 define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
  %call = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %0)
  ret <8 x float> %call
 }

-define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
-  sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
-  ret float %ret
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; fastmath
-
-declare void @llvm.x86.sse.stmxcsr(i32 *) nounwind
-declare void @llvm.x86.sse.ldmxcsr(i32 *) nounwind
-
-define internal void @__fastmath() nounwind alwaysinline {
-  %ptr = alloca i32
-  call void @llvm.x86.sse.stmxcsr(i32 * %ptr)
-  %oldval = load i32 *%ptr
-
-  ; turn on DAZ (64)/FTZ (32768) -> 32832
-  %update = or i32 %oldval, 32832
-  store i32 %update, i32 *%ptr
-  call void @llvm.x86.sse.ldmxcsr(i32 * %ptr)
-  ret void
-}
-
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; svml

@@ -229,9 +158,7 @@ declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
 ;; float min/max

 declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
-declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
 declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
-declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone

 define internal <8 x float> @__max_varying_float(<8 x float>,
                                                 <8 x float>) nounwind readonly alwaysinline {
@@ -239,90 +166,43 @@ define internal <8 x float> @__max_varying_float(<8 x float>,
  ret <8 x float> %call
 }

-define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
-  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
-  ret float %ret
-}
-
 define internal <8 x float> @__min_varying_float(<8 x float>,
                                                 <8 x float>) nounwind readonly alwaysinline {
  %call = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %0, <8 x float> %1)
  ret <8 x float> %call
 }

-define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
-  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
-  ret float %ret
-}
-

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int min/max

-declare <8 x i32> @llvm.x86.avx.min.sd.256(<8 x i32>, <8 x i32>) nounwind readnone
-declare <8 x i32> @llvm.x86.avx.max.sd.256(<8 x i32>, <8 x i32>) nounwind readnone
-
 define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-  %call = call <8 x i32> @llvm.x86.avx.min.sd.256(<8 x i32> %0, <8 x i32> %1)
-  ret <8 x i32> %call
-}
-
-define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
-  sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.min.sd.256, %0, %1)
-  ret i32 %ret
+  binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
+  ret <8 x i32> %ret
 }

 define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-  %call = call <8 x i32> @llvm.x86.avx.max.sd.256(<8 x i32> %0, <8 x i32> %1)
-  ret <8 x i32> %call
-}
-
-define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
-  sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.max.sd.256, %0, %1)
-  ret i32 %ret
+  binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
+  ret <8 x i32> %ret
 }


 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unsigned int min/max

-; FIXME: looks like these aren't available in LLVM?
-declare <8 x i32> @llvm.x86.avx.min.ud.256(<8 x i32>, <8 x i32>) nounwind readnone
-declare <8 x i32> @llvm.x86.avx.max.ud.256(<8 x i32>, <8 x i32>) nounwind readnone
-
-define internal <8 x i32> @__min_varying_uint32(<8 x i32>,
-                                                <8 x i32>) nounwind readonly alwaysinline {
-  %call = call <8 x i32> @llvm.x86.avx.min.ud.256(<8 x i32> %0, <8 x i32> %1)
-  ret <8 x i32> %call
+define internal <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
+  ret <8 x i32> %ret
 }

-define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
-  sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.min.ud.256, %0, %1)
-  ret i32 %ret
+define internal <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
+  ret <8 x i32> %ret
 }

-define internal <8 x i32> @__max_varying_uint32(<8 x i32>,
-                                                <8 x i32>) nounwind readonly alwaysinline {
-  %call = call <8 x i32> @llvm.x86.avx.max.ud.256(<8 x i32> %0, <8 x i32> %1)
-  ret <8 x i32> %call
-}
-
-define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
-  sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.max.ud.256, %0, %1)
-  ret i32 %ret
-}
-
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops

-declare i32 @llvm.ctpop.i32(i32) nounwind readnone
-
-define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
-  %call = call i32 @llvm.ctpop.i32(i32 %0)
-  ret i32 %call
-}
-
 declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone

 define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
@@ -355,6 +235,7 @@ define internal float @__reduce_max_float(<8 x float>) nounwind readnone alwaysi
  reduce8(float, @__max_varying_float, @__max_uniform_float)
 }

+reduce_equal(8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal int32 ops
@@ -403,58 +284,93 @@ define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinli
 }


+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal double ops
+
+declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline {  ; sums all 8 lanes
+  %lo = shufflevector <8 x double> %0, <8 x double> undef,
+                      <4 x i32> <i32 0, i32 1, i32 2, i32 3>    ; input lanes 0-3
+  %hi = shufflevector <8 x double> %0, <8 x double> undef,
+                      <4 x i32> <i32 4, i32 5, i32 6, i32 7>    ; input lanes 4-7
+  %pairs = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %lo, <4 x double> %hi)  ; NOTE(review): relies on vhaddpd giving <a0+a1, b0+b1, a2+a3, b2+b3> -- confirm
+  %quads = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %pairs, <4 x double> %pairs)
+  %part_a = extractelement <4 x double> %quads, i32 0           ; partial sum taken from element 0
+  %part_b = extractelement <4 x double> %quads, i32 2           ; partial sum taken from element 2
+  %total = fadd double %part_a, %part_b
+
+  ret double %total
+}
+
+define internal double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline {
+  reduce8(double, @__min_varying_double, @__min_uniform_double)
+}
+
+
+define internal double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline {
+  reduce8(double, @__max_varying_double, @__max_uniform_double)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int64 ops
+
+define internal <8 x i64> @__add_varying_int64(<8 x i64>,
+                                               <8 x i64>) nounwind readnone alwaysinline {
+  %lanewise_sum = add <8 x i64> %0, %1    ; per-lane sum of the two operands
+  ret <8 x i64> %lanewise_sum
+}
+
+define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
+  %scalar_sum = add i64 %0, %1    ; plain add, no nsw/nuw flags
+  ret i64 %scalar_sum
+}
+
+define internal i64 @__reduce_add_int64(<8 x i64>) nounwind readnone alwaysinline {
+  reduce8(i64, @__add_varying_int64, @__add_uniform_int64)
+}
+
+
+define internal i64 @__reduce_min_int64(<8 x i64>) nounwind readnone alwaysinline {
+  reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
+}
+
+
+define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
+  reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; horizontal uint64 ops
+
+define internal i64 @__reduce_add_uint64(<8 x i64> %v) nounwind readnone alwaysinline {
+  %total = call i64 @__reduce_add_int64(<8 x i64> %v)    ; forwards to the int64 reduction unchanged
+  ret i64 %total
+}
+
+define internal i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
+  reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
+}
+
+
+define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
+  reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
+}
+
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts

-define <8 x i32> @__load_and_broadcast_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
-  %mm = call i32 @__movmsk(<8 x i32> %mask)
-  %any_on = icmp ne i32 %mm, 0
-  br i1 %any_on, label %load, label %skip
-
-load:
-  ; TODO: make sure this becomes a vbroadcast...
-  %ptr = bitcast i8 * %0 to i32 *
-  %val = load i32 * %ptr
-
-  %ret0 = insertelement <8 x i32> undef, i32 %val, i32 0
-  %ret1 = insertelement <8 x i32> %ret0, i32 %val, i32 1
-  %ret2 = insertelement <8 x i32> %ret1, i32 %val, i32 2
-  %ret3 = insertelement <8 x i32> %ret2, i32 %val, i32 3
-  %ret4 = insertelement <8 x i32> %ret3, i32 %val, i32 4
-  %ret5 = insertelement <8 x i32> %ret4, i32 %val, i32 5
-  %ret6 = insertelement <8 x i32> %ret5, i32 %val, i32 6
-  %ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7
-  ret <8 x i32> %ret7
-
-skip:
-  ret <8 x i32> undef
-}
-
-
-define <8 x i64> @__load_and_broadcast_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
-  %mm = call i32 @__movmsk(<8 x i32> %mask)
-  %any_on = icmp ne i32 %mm, 0
-  br i1 %any_on, label %load, label %skip
-
-load:
-  ; TODO: make sure this becomes a vbroadcast...
-  %ptr = bitcast i8 * %0 to i64 *
-  %val = load i64 * %ptr
-
-  %ret0 = insertelement <8 x i64> undef, i64 %val, i32 0
-  %ret1 = insertelement <8 x i64> %ret0, i64 %val, i32 1
-  %ret2 = insertelement <8 x i64> %ret1, i64 %val, i32 2
-  %ret3 = insertelement <8 x i64> %ret2, i64 %val, i32 3
-  %ret4 = insertelement <8 x i64> %ret3, i64 %val, i32 4
-  %ret5 = insertelement <8 x i64> %ret4, i64 %val, i32 5
-  %ret6 = insertelement <8 x i64> %ret5, i64 %val, i32 6
-  %ret7 = insertelement <8 x i64> %ret6, i64 %val, i32 7
-  ret <8 x i64> %ret3
-
-skip:
-  ret <8 x i64> undef
-}
+load_and_broadcast(8, i8, 8)
+load_and_broadcast(8, i16, 16)
+load_and_broadcast(8, i32, 32)
+load_and_broadcast(8, i64, 64)

+; no masked load instruction for i8 and i16 types??
+load_masked(8, i8,  8,  1)
+load_masked(8, i16, 16, 2)

 declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
 declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
@@ -490,6 +406,12 @@ define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store

+; FIXME: there is no AVX instruction for these, but we could be clever
+; by packing the bits down and setting the last 3/4 or half, respectively,
+; of the mask to zero...  Not sure if this would be a win in the end
+gen_masked_store(8, i8, 8)
+gen_masked_store(8, i16, 16)
+
 ; note that mask is the 2nd parameter, not the 3rd one!!
 declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
 declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
@@ -528,12 +450,14 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
 }


+
+masked_store_blend_8_16_by_8()
+
 declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
                                                <8 x float>) nounwind readnone

-
-define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
-                                           <8 x i32>) nounwind alwaysinline {
+define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>, 
+                                     <8 x i32>) nounwind alwaysinline {
  %mask_as_float = bitcast <8 x i32> %2 to <8 x float>
  %oldValue = load <8 x i32>* %0, align 4
  %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
@@ -547,7 +471,7 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
 }


-define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
+define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new, 
                                     <8 x i32> %i32mask) nounwind alwaysinline {
  %oldValue = load <8 x i64>* %ptr, align 8
  %mask = bitcast <8 x i32> %i32mask to <8 x float>
@@ -597,56 +521,44 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
  ret void
 }

+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter

+gen_gather(8, i8)
+gen_gather(8, i16)
 gen_gather(8, i32)
 gen_gather(8, i64)
+
+gen_scatter(8, i8)
+gen_scatter(8, i16)
 gen_scatter(8, i32)
 gen_scatter(8, i64)

-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; double precision sqrt

 declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse.sqrt.sd(<2 x double>) nounwind readnone

 define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
  unary4to8(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
  ret <8 x double> %ret
 }

-define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
-  sse_unary_scalar(ret, 2, double, @llvm.x86.sse.sqrt.sd, %0)
-  ret double %ret
-}
-

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; double precision min/max

 declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse.max.sd(<2 x double>, <2 x double>) nounwind readnone
 declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse.min.sd(<2 x double>, <2 x double>) nounwind readnone

 define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
  binary4to8(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
  ret <8 x double> %ret
 }

-define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
-  sse_binary_scalar(ret, 2, double, @llvm.x86.sse.min.sd, %0, %1)
-  ret double %ret
-}
-
 define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
  binary4to8(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
  ret <8 x double> %ret
 }

-define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
-  sse_binary_scalar(ret, 2, double, @llvm.x86.sse.max.sd, %0, %1)
-  ret double %ret
-}
--- a/builtins-c.c
+++ b/builtins-c.c
@@ -31,7 +31,7 @@
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */

-/** @file stdlib-c.c
+/** @file builtins-c.c
    @brief Standard library function implementations written in C.

    This file provides C implementations of various functions that can be
@@ -51,8 +51,13 @@
  */


+#ifndef _MSC_VER
+#include <unistd.h>
+#endif // !_MSC_VER
+
 #include <stdint.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <stdarg.h>

 typedef int Bool;
@@ -139,3 +144,28 @@ void __do_print(const char *format, const char *types, int width, int mask,
    }
    fflush(stdout);
 }
+
+/** Returns the number of processors on the system (at least 1 on non-Windows builds). */
+int __num_cores() {
+#ifdef _MSC_VER
+    // This is quite a hack.  Including all of windows.h to get this definition
+    // pulls in a bunch of stuff that leads to undefined symbols at link time.
+    // So we don't #include <windows.h> but instead have the equivalent declarations
+    // here.  Presumably this struct declaration won't be changing in the future...
+    struct SYSTEM_INFO {
+        int pad0[2];               // dwOemId, dwPageSize
+        void *pad1[2];             // lpMinimumApplicationAddress, lpMaximumApplicationAddress
+        int *pad2;                 // dwActiveProcessorMask (pointer-sized)
+        int dwNumberOfProcessors;
+        int pad3[3];               // dwProcessorType, dwAllocationGranularity, wProcessorLevel/Revision
+    };
+    struct SYSTEM_INFO sysInfo;
+    extern void __stdcall GetSystemInfo(struct SYSTEM_INFO *);
+    GetSystemInfo(&sysInfo);
+    return sysInfo.dwNumberOfProcessors;
+#else
+    // sysconf() returns -1 on failure; never report fewer than one core.
+    long nCores = sysconf(_SC_NPROCESSORS_ONLN);
+    return (nCores < 1) ? 1 : (int)nCores;
+#endif // !_MSC_VER
+}
--- a/builtins-dispatch.ll
+++ b/builtins-dispatch.ll
@@ -0,0 +1,123 @@
+;;  Copyright (c) 2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;; This file defines various functions that are used when generating the
+;; "dispatch" object/assembly file that has entrypoints for each
+;; exported function in a module that dispatch to the best available
+;; variant of that function that will run on the system's CPU.
+
+;; Stores the best target ISA that the system on which we're actually
+;; running supports.  -1 represents "uninitialized", otherwise this value
+;; should correspond to one of the enumerant values of Target::ISA from
+;; ispc.h.
+
+@__system_best_isa = internal global i32 -1
+
+declare void @abort() noreturn
+
+;; The below is the result of running "clang -O2 -emit-llvm -c -o -" on the
+;; following code...  Specifically, __get_system_isa should return a value
+;; corresponding to one of the Target::ISA enumerant values that gives the
+;; most capable ISA that the current system can run.
+;;
+;; #ifdef _MSC_VER
+;; extern void __stdcall __cpuid(int info[4], int infoType);
+;; #else
+;; static void __cpuid(int info[4], int infoType) {
+;;     __asm__ __volatile__ ("cpuid"
+;;                           : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
+;;                           : "0" (infoType));
+;; }
+;; #endif
+;; 
+;; int32_t __get_system_isa() {
+;;     int info[4];
+;;     __cpuid(info, 1);
+;;     /* NOTE: the values returned below must be the same as the
+;;        corresponding enumerant values in Target::ISA. */
+;;     if ((info[2] & (1 << 28)) != 0)
+;;         return 2; // AVX
+;;     else if ((info[2] & (1 << 19)) != 0)
+;;         return 1; // SSE4
+;;     else if ((info[3] & (1 << 26)) != 0)
+;;         return 0; // SSE2
+;;     else
+;;         abort();
+;; }
+
+%0 = type { i32, i32, i32, i32 }
+
+define internal i32 @__get_system_isa() nounwind ssp {  ; returns 2 for AVX, 1 for SSE4, 0 for SSE2; calls abort otherwise
+  %1 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
+  %2 = extractvalue %0 %1, 2                       ; third asm output, the ={cx} register: info[2] in the C source
+  %3 = extractvalue %0 %1, 3                       ; fourth asm output, the ={dx} register: info[3] in the C source
+  %4 = and i32 %2, 268435456                       ; 268435456 == 1 << 28, the AVX test
+  %5 = icmp eq i32 %4, 0
+  br i1 %5, label %6, label %13                    ; bit set -> return block, phi picks 2
+
+; <label>:6                                       ; preds = %0
+  %7 = and i32 %2, 524288                          ; 524288 == 1 << 19, the SSE4 test
+  %8 = icmp eq i32 %7, 0
+  br i1 %8, label %9, label %13                    ; bit set -> return block, phi picks 1
+
+; <label>:9                                       ; preds = %6
+  %10 = and i32 %3, 67108864                       ; 67108864 == 1 << 26, the SSE2 test on info[3]
+  %11 = icmp eq i32 %10, 0
+  br i1 %11, label %12, label %13                  ; bit set -> return block, phi picks 0
+
+; <label>:12                                      ; preds = %9
+  tail call void @abort() noreturn nounwind         ; none of the three feature bits was set
+  unreachable
+
+; <label>:13                                      ; preds = %9, %6, %0
+  %.0 = phi i32 [ 2, %0 ], [ 1, %6 ], [ 0, %9 ]   ; value depends on which test branched here
+  ret i32 %.0
+}
+
+
+;; This function is called by each of the dispatch functions we generate;
+;; it sets @__system_best_isa if it is unset.
+
+define internal void @__set_system_isa() {
+start:
+  %cached = load i32* @__system_best_isa            ; value currently stored in the global
+  %is_uninit = icmp eq i32 %cached, -1               ; -1 is the "uninitialized" marker
+  br i1 %is_uninit, label %detect, label %already_set
+
+detect:
+  %detected = call i32 @__get_system_isa()           ; query the CPU and record the result
+  store i32 %detected, i32* @__system_best_isa
+  ret void
+
+already_set:
+  ret void
+}
+
--- a/builtins-sse.ll
+++ b/builtins-sse.ll
@@ -31,12 +31,12 @@

 ;; This file declares implementations of various stdlib builtins that
 ;; only require SSE version 1 and 2 functionality; this file, in turn
-;; is then included by stdlib-sse2.ll and stdlib-sse4.ll to provide
+;; is then included by builtins-sse2.ll and builtins-sse4.ll to provide
 ;; those definitions for them.

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

-int8_16(4)
+int64minmax(4)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
@@ -124,18 +124,19 @@ define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinlin
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; fast math mode

-declare void @llvm.x86.sse.stmxcsr(i32 *) nounwind
-declare void @llvm.x86.sse.ldmxcsr(i32 *) nounwind
+declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
+declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind

 define internal void @__fastmath() nounwind alwaysinline {
  %ptr = alloca i32
-  call void @llvm.x86.sse.stmxcsr(i32 * %ptr)
+  %ptr8 = bitcast i32 * %ptr to i8 *
+  call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
  %oldval = load i32 *%ptr

  ; turn on DAZ (64)/FTZ (32768) -> 32832
  %update = or i32 %oldval, 32832
  store i32 %update, i32 *%ptr
-  call void @llvm.x86.sse.ldmxcsr(i32 * %ptr)
+  call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
  ret void
 }

@@ -227,6 +228,54 @@ define internal float @__min_uniform_float(float, float) nounwind readonly alway
  ret float %ret
 }

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
+define internal <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
+  unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
+  ret <4 x double> %ret
+}
+
+
+define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
+  sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
+  ret double %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define internal <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
+  binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
+  ret <4 x double> %ret
+}
+
+
+define internal double @__min_uniform_double(double, double) nounwind readnone {
+  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
+  ret double %ret
+}
+
+
+define internal <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
+  binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
+  ret <4 x double> %ret
+}
+
+
+define internal double @__max_uniform_double(double, double) nounwind readnone {
+  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
+  ret double %ret
+}

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions
@@ -279,163 +328,90 @@ define internal i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
 }


+define internal double @__reduce_add_double(<4 x double>) nounwind readnone {
+  %v0 = shufflevector <4 x double> %0, <4 x double> undef,
+                      <2 x i32> <i32 0, i32 1>
+  %v1 = shufflevector <4 x double> %0, <4 x double> undef,
+                      <2 x i32> <i32 2, i32 3>
+  %sum = fadd <2 x double> %v0, %v1
+  %e0 = extractelement <2 x double> %sum, i32 0
+  %e1 = extractelement <2 x double> %sum, i32 1
+  %m = fadd double %e0, %e1
+  ret double %m
+}
+
+define internal double @__reduce_min_double(<4 x double>) nounwind readnone {
+  reduce4(double, @__min_varying_double, @__min_uniform_double)
+}
+
+define internal double @__reduce_max_double(<4 x double>) nounwind readnone {
+  reduce4(double, @__max_varying_double, @__max_uniform_double)
+}
+
+define internal i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
+  %v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
+                      <2 x i32> <i32 0, i32 1>
+  %v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
+                      <2 x i32> <i32 2, i32 3>
+  %sum = add <2 x i64> %v0, %v1
+  %e0 = extractelement <2 x i64> %sum, i32 0
+  %e1 = extractelement <2 x i64> %sum, i32 1
+  %m = add i64 %e0, %e1
+  ret i64 %m
+}
+
+define internal i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
+  reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
+}
+
+define internal i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
+  reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
+}
+
+define internal i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
+  reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
+}
+
+define internal i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
+  reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
+}
+
+reduce_equal(4)
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store

-define void @__masked_store_32(<4 x i32>* nocapture, <4 x i32>, <4 x i32>) nounwind alwaysinline {
-  per_lane(4, <4 x i32> %2, `
-      ; compute address for this one
-      %ptr_ID = getelementptr <4 x i32> * %0, i32 0, i32 LANE
-      %storeval_ID = extractelement <4 x i32> %1, i32 LANE
-      store i32 %storeval_ID, i32 * %ptr_ID')
-  ret void
-}
-
-define void @__masked_store_64(<4 x i64>* nocapture, <4 x i64>, <4 x i32>) nounwind alwaysinline {
-  per_lane(4, <4 x i32> %2, `
-      %ptr_ID = getelementptr <4 x i64> * %0, i32 0, i32 LANE
-      %storeval_ID = extractelement <4 x i64> %1, i32 LANE
-      store i64 %storeval_ID, i64 * %ptr_ID')
-  ret void
-}
+masked_store_blend_8_16_by_4()

+gen_masked_store(4, i8, 8)
+gen_masked_store(4, i16, 16)
+gen_masked_store(4, i32, 32)
+gen_masked_store(4, i64, 64)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts

-define <4 x i32> @__load_and_broadcast_32(i8 *, <4 x i32> %mask) nounwind alwaysinline {
-  ; must not load if the mask is all off; the address may be invalid
-  %mm = call i32 @__movmsk(<4 x i32> %mask)
-  %any_on = icmp ne i32 %mm, 0
-  br i1 %any_on, label %load, label %skip
-
-load:
-  %ptr = bitcast i8 * %0 to i32 *
-  %val = load i32 * %ptr
-
-  %ret0 = insertelement <4 x i32> undef, i32 %val, i32 0
-  %ret1 = insertelement <4 x i32> %ret0, i32 %val, i32 1
-  %ret2 = insertelement <4 x i32> %ret1, i32 %val, i32 2
-  %ret3 = insertelement <4 x i32> %ret2, i32 %val, i32 3
-  ret <4 x i32> %ret3
-
-skip:
-  ret <4 x i32> undef
-}
-
-define <4 x i64> @__load_and_broadcast_64(i8 *, <4 x i32> %mask) nounwind alwaysinline {
-  ; must not load if the mask is all off; the address may be invalid
-  %mm = call i32 @__movmsk(<4 x i32> %mask)
-  %any_on = icmp ne i32 %mm, 0
-  br i1 %any_on, label %load, label %skip
-
-load:
-  %ptr = bitcast i8 * %0 to i64 *
-  %val = load i64 * %ptr
-
-  %ret0 = insertelement <4 x i64> undef, i64 %val, i32 0
-  %ret1 = insertelement <4 x i64> %ret0, i64 %val, i32 1
-  %ret2 = insertelement <4 x i64> %ret1, i64 %val, i32 2
-  %ret3 = insertelement <4 x i64> %ret2, i64 %val, i32 3
-  ret <4 x i64> %ret3
-
-skip:
-  ret <4 x i64> undef
-}
-
-define <4 x i32> @__load_masked_32(i8 *, <4 x i32> %mask) nounwind alwaysinline {
-  %mm = call i32 @__movmsk(<4 x i32> %mask)
-  %any_on = icmp ne i32 %mm, 0
-  br i1 %any_on, label %load, label %skip
-
-load: 
-  ; if any mask lane is on, just load all of the values
-  ; FIXME: there is a lurking bug here if we straddle a page boundary, the
-  ; next page is invalid to read, but the mask bits are set so that we
-  ; aren't supposed to be reading those elements...
-  %ptr = bitcast i8 * %0 to <4 x i32> *
-  %val = load <4 x i32> * %ptr, align 4
-  ret <4 x i32> %val
-
-skip:
-  ret <4 x i32> undef
-}
-
-define <4 x i64> @__load_masked_64(i8 *, <4 x i32> %mask) nounwind alwaysinline {
-  %mm = call i32 @__movmsk(<4 x i32> %mask)
-  %any_on = icmp ne i32 %mm, 0
-  br i1 %any_on, label %load, label %skip
-
-load:
-  ; if any mask lane is on, just load all of the values
-  ; FIXME: there is a lurking bug here if we straddle a page boundary, the
-  ; next page is invalid to read, but the mask bits are set so that we
-  ; aren't supposed to be reading those elements...
-  %ptr = bitcast i8 * %0 to <4 x i64> *
-  %val = load <4 x i64> * %ptr, align 8
-  ret <4 x i64> %val
-
-skip:
-  ret <4 x i64> undef
-}
+load_and_broadcast(4, i8, 8)
+load_and_broadcast(4, i16, 16)
+load_and_broadcast(4, i32, 32)
+load_and_broadcast(4, i64, 64)

+load_masked(4, i8,  8,  1)
+load_masked(4, i16, 16, 2)
+load_masked(4, i32, 32, 4)
+load_masked(4, i64, 64, 8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter

 ; define these with the macros from stdlib.m4

+gen_gather(4, i8)
+gen_gather(4, i16)
 gen_gather(4, i32)
 gen_gather(4, i64)
+
+gen_scatter(4, i8)
+gen_scatter(4, i16)
 gen_scatter(4, i32)
 gen_scatter(4, i64)
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; double precision sqrt
-
-declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
-
-define internal <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
-  unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
-  ret <4 x double> %ret
-}
-
-
-define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
-  sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
-  ret double %ret
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; double precision min/max
-
-declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
-
-define internal <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
-  binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
-  ret <4 x double> %ret
-}
-
-
-define internal double @__min_uniform_double(double, double) nounwind readnone {
-  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
-  ret double %ret
-}
-
-
-define internal <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
-  binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
-  ret <4 x double> %ret
-}
-
-
-define internal double @__max_uniform_double(double, double) nounwind readnone {
-  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
-  ret double %ret
-}
--- a/builtins-sse2.ll
+++ b/builtins-sse2.ll
@@ -35,9 +35,10 @@
 ; Define some basics for a 4-wide target
 stdlib_core(4)
 packed_load_and_store(4)
+scans(4)

 ; Include the various definitions of things that only require SSE1 and SSE2
-include(`stdlib-sse.ll')
+include(`builtins-sse.ll')

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding
@@ -152,6 +153,40 @@ define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinlin
  ret float %binop.i
 }

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+declare double @round(double)
+declare double @floor(double)
+declare double @ceil(double)
+
+define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
+  unary1to4(double, @round)
+}
+
+define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
+  %r = call double @round(double %0)
+  ret double %r
+}
+
+define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
+  unary1to4(double, @floor)
+}
+
+define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
+  %r = call double @floor(double %0)
+  ret double %r
+}
+
+define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
+  unary1to4(double, @ceil)
+}
+
+define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
+  %r = call double @ceil(double %0)
+  ret double %r
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; min/max

@@ -242,23 +277,17 @@ define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinli
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions

-; FIXME: this is very inefficient, loops over all 32 bits...
+declare i32 @llvm.ctpop.i32(i32)
+declare i64 @llvm.ctpop.i64(i64)

-define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
-entry:
-  br label %loop
+define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
+  %val = call i32 @llvm.ctpop.i32(i32 %0)
+  ret i32 %val
+}

-loop:
-  %count = phi i32 [ 0, %entry ], [ %newcount, %loop ]
-  %val = phi i32 [ %0, %entry ], [ %newval, %loop ]
-  %delta = and i32 %val, 1
-  %newcount = add i32 %count, %delta
-  %newval = lshr i32 %val, 1
-  %done = icmp eq i32 %newval, 0
-  br i1 %done, label %exit, label %loop
-
-exit:
-  ret i32 %newcount
+define internal i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
+  %val = call i64 @llvm.ctpop.i64(i64 %0)
+  ret i64 %val
 }


--- a/builtins-sse4-x2.ll
+++ b/builtins-sse4-x2.ll
@@ -38,7 +38,8 @@

 stdlib_core(8)
 packed_load_and_store(8)
-int8_16(8)
+scans(8)
+int64minmax(8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
@@ -127,22 +128,22 @@ define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinlin
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; fast math

-declare void @llvm.x86.sse.stmxcsr(i32 *) nounwind
-declare void @llvm.x86.sse.ldmxcsr(i32 *) nounwind
+declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
+declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind

 define internal void @__fastmath() nounwind alwaysinline {
  %ptr = alloca i32
-  call void @llvm.x86.sse.stmxcsr(i32 * %ptr)
+  %ptr8 = bitcast i32 * %ptr to i8 *
+  call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
  %oldval = load i32 *%ptr

  ; turn on DAZ (64)/FTZ (32768) -> 32832
  %update = or i32 %oldval, 32832
  store i32 %update, i32 *%ptr
-  call void @llvm.x86.sse.ldmxcsr(i32 * %ptr)
+  call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
  ret void
 }

-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; svml stuff

@@ -258,7 +259,7 @@ define internal float @__min_uniform_float(float, float) nounwind readonly alway
 }

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; int min/max
+;; int32 min/max

 declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
 declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
@@ -380,92 +381,90 @@ define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinli
  reduce8by4(i32, @llvm.x86.sse41.pmaxud, @__max_uniform_uint32)
 }

-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; masked store
-
-define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
-                               <8 x i32>) nounwind alwaysinline {
-  per_lane(8, <8 x i32> %2, `
-      ; compute address for this one
-      %ptr_ID = getelementptr <8 x i32> * %0, i32 0, i32 LANE
-      %storeval_ID = extractelement <8 x i32> %1, i32 LANE
-      store i32 %storeval_ID, i32 * %ptr_ID')
-  ret void
+define internal <4 x double> @__add_varying_double(<4 x double>,
+                                     <4 x double>) nounwind readnone alwaysinline {
+  %r = fadd <4 x double> %0, %1
+  ret <4 x double> %r
 }

-
-define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
-                               <8 x i32>) nounwind alwaysinline {
-  per_lane(8, <8 x i32> %2, `
-      ; compute address for this one
-      %ptr_ID = getelementptr <8 x i64> * %0, i32 0, i32 LANE
-      %storeval_ID = extractelement <8 x i64> %1, i32 LANE
-      store i64 %storeval_ID, i64 * %ptr_ID')
-  ret void
+define internal double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
+  %r = fadd double %0, %1
+  ret double %r
 }

+define internal double @__reduce_add_double(<8 x double>) nounwind readnone {
+  reduce8by4(double, @__add_varying_double, @__add_uniform_double)
+}
+
+define internal double @__reduce_min_double(<8 x double>) nounwind readnone {
+  reduce8(double, @__min_varying_double, @__min_uniform_double)
+}
+
+define internal double @__reduce_max_double(<8 x double>) nounwind readnone {
+  reduce8(double, @__max_varying_double, @__max_uniform_double)
+}
+
+define internal <4 x i64> @__add_varying_int64(<4 x i64>,
+                                               <4 x i64>) nounwind readnone alwaysinline {
+  %r = add <4 x i64> %0, %1
+  ret <4 x i64> %r
+}
+
+define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
+  %r = add i64 %0, %1
+  ret i64 %r
+}
+
+define internal i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
+  reduce8by4(i64, @__add_varying_int64, @__add_uniform_int64)
+}
+
+define internal i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
+  reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
+}
+
+define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
+  reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
+}
+
+define internal i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
+  reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
+}
+
+define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
+  reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
+}
+
+reduce_equal(8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts

-; FIXME: I think this and the next one need to verify that the mask isn't
-; all off before doing the load!!!  (See e.g. stdlib-sse.ll)
+load_and_broadcast(8, i8, 8)
+load_and_broadcast(8, i16, 16)
+load_and_broadcast(8, i32, 32)
+load_and_broadcast(8, i64, 64)

-define <8 x i32> @__load_and_broadcast_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
-  %ptr = bitcast i8 * %0 to i32 *
-  %val = load i32 * %ptr
-
-  %ret0 = insertelement <8 x i32> undef, i32 %val, i32 0
-  %ret1 = insertelement <8 x i32> %ret0, i32 %val, i32 1
-  %ret2 = insertelement <8 x i32> %ret1, i32 %val, i32 2
-  %ret3 = insertelement <8 x i32> %ret2, i32 %val, i32 3
-  %ret4 = insertelement <8 x i32> %ret3, i32 %val, i32 4
-  %ret5 = insertelement <8 x i32> %ret4, i32 %val, i32 5
-  %ret6 = insertelement <8 x i32> %ret5, i32 %val, i32 6
-  %ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7
-  ret <8 x i32> %ret7
-}
-
-
-define <8 x i64> @__load_and_broadcast_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
-  %ptr = bitcast i8 * %0 to i64 *
-  %val = load i64 * %ptr
-
-  %ret0 = insertelement <8 x i64> undef, i64 %val, i32 0
-  %ret1 = insertelement <8 x i64> %ret0, i64 %val, i32 1
-  %ret2 = insertelement <8 x i64> %ret1, i64 %val, i32 2
-  %ret3 = insertelement <8 x i64> %ret2, i64 %val, i32 3
-  %ret4 = insertelement <8 x i64> %ret3, i64 %val, i32 4
-  %ret5 = insertelement <8 x i64> %ret4, i64 %val, i32 5
-  %ret6 = insertelement <8 x i64> %ret5, i64 %val, i32 6
-  %ret7 = insertelement <8 x i64> %ret6, i64 %val, i32 7
-  ret <8 x i64> %ret7
-}
-
-
-define <8 x i32> @__load_masked_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
-  %ptr = bitcast i8 * %0 to <8 x i32> *
-  %val = load <8 x i32> * %ptr, align 4
-  ret <8 x i32> %val
-}
-
-
-define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
-  %ptr = bitcast i8 * %0 to <8 x i64> *
-  %val = load <8 x i64> * %ptr, align 8
-  ret <8 x i64> %val
-}
+load_masked(8, i8,  8,  1)
+load_masked(8, i16, 16, 2)
+load_masked(8, i32, 32, 4)
+load_masked(8, i64, 64, 8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter

+gen_gather(8, i8)
+gen_gather(8, i16)
 gen_gather(8, i32)
 gen_gather(8, i64)
+
+gen_scatter(8, i8)
+gen_scatter(8, i16)
 gen_scatter(8, i32)
 gen_scatter(8, i64)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; math
+;; float rounding

 declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
 declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
@@ -499,43 +498,95 @@ define internal float @__round_uniform_float(float) nounwind readonly alwaysinli
 }

 define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
-  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  round4to8(%0, 9)
 }

 define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
  ; see above for round_ss instrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
-  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
 }

 define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  round4to8(%0, 10)
 }

 define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
  ; see above for round_ss instrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
 }

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
+  round2to8double(%0, 8)
+}
+
+define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
+  %xi = insertelement <2 x double> undef, double %0, i32 0
+  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
+  %rs = extractelement <2 x double> %xr, i32 0
+  ret double %rs
+}
+
+define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  round2to8double(%0, 9)
+}
+
+define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
+  ; see above for round_ss intrinsic discussion...
+  %xi = insertelement <2 x double> undef, double %0, i32 0
+  ; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
+  %rs = extractelement <2 x double> %xr, i32 0
+  ret double %rs
+}
+
+define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  round2to8double(%0, 10)
+}
+
+define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
+  ; see above for round_ss intrinsic discussion...
+  %xi = insertelement <2 x double> undef, double %0, i32 0
+  ; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
+  %rs = extractelement <2 x double> %xr, i32 0
+  ret double %rs
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions

 declare i32 @llvm.ctpop.i32(i32) nounwind readnone

-define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
+define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
  %call = call i32 @llvm.ctpop.i32(i32 %0)
  ret i32 %call
 }

+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
+
+define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
+  %call = call i64 @llvm.ctpop.i64(i64 %0)
+  ret i64 %call
+}
+
 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone

 define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
@@ -555,6 +606,13 @@ define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysi
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store

+gen_masked_store(8, i8, 8)
+gen_masked_store(8, i16, 16)
+gen_masked_store(8, i32, 32)
+gen_masked_store(8, i64, 64)
+
+masked_store_blend_8_16_by_8()
+
 declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
                                             <4 x float>) nounwind readnone

--- a/builtins-sse4.ll
+++ b/builtins-sse4.ll
@@ -35,12 +35,13 @@
 ; Define common 4-wide stuff
 stdlib_core(4)
 packed_load_and_store(4)
+scans(4)

 ; Define the stuff that can be done with base SSE1/SSE2 instructions
-include(`stdlib-sse.ll')
+include(`builtins-sse.ll')

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; math
+;; rounding floats

 declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
 declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
@@ -76,7 +77,7 @@ define internal float @__round_uniform_float(float) nounwind readonly alwaysinli
 }

 define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
-  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
  ret <4 x float> %call
 }
@@ -84,14 +85,14 @@ define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonl
 define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
  ; see above for round_ss instrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
-  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
 }

 define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
  ret <4 x float> %call
 }
@@ -99,14 +100,59 @@ define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly
 define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
  ; see above for round_ss instrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
 }

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; integer min/max
+;; rounding doubles
+
+declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
+  round2to4double(%0, 8)
+}
+
+define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
+  %xi = insertelement <2 x double> undef, double %0, i32 0
+  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
+  %rs = extractelement <2 x double> %xr, i32 0
+  ret double %rs
+}
+
+define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  round2to4double(%0, 9)
+}
+
+define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
+  ; see above for round_ss intrinsic discussion...
+  %xi = insertelement <2 x double> undef, double %0, i32 0
+  ; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
+  %rs = extractelement <2 x double> %xr, i32 0
+  ret double %rs
+}
+
+define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  round2to4double(%0, 10)
+}
+
+define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
+  ; see above for round_ss intrinsic discussion...
+  %xi = insertelement <2 x double> undef, double %0, i32 0
+  ; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
+  %rs = extractelement <2 x double> %xr, i32 0
+  ret double %rs
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int32 min/max

 declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
 declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
@@ -163,11 +209,18 @@ define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinli

 declare i32 @llvm.ctpop.i32(i32) nounwind readnone

-define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
+define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
  %call = call i32 @llvm.ctpop.i32(i32 %0)
  ret i32 %call
 }

+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
+
+define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
+  %call = call i64 @llvm.ctpop.i64(i64 %0)
+  ret i64 %call
+}
+
 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone

 define internal float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
@@ -177,7 +230,6 @@ define internal float @__reduce_add_float(<4 x float>) nounwind readonly alwaysi
  ret float %scalar
 }

-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store

--- a/builtins.cpp
+++ b/builtins.cpp
@@ -52,7 +52,10 @@
 #include <llvm/Type.h>
 #include <llvm/DerivedTypes.h>
 #include <llvm/Instructions.h>
+#include <llvm/Intrinsics.h>
 #include <llvm/Linker.h>
+#include <llvm/Target/TargetMachine.h>
+#include <llvm/ADT/Triple.h>
 #include <llvm/Support/MemoryBuffer.h>
 #include <llvm/Bitcode/ReaderWriter.h>

@@ -67,7 +70,7 @@ extern yy_buffer_state *yy_scan_string(const char *);
    distinguish between signed and unsigned integers in its types.)

    Because this function is only used for generating ispc declarations of
-    functions defined in LLVM bitcode in the stdlib-*.ll files, in practice
+    functions defined in LLVM bitcode in the builtins-*.ll files, in practice
    we can get enough of what we need for the relevant cases to make things
    work, partially with the help of the intAsUnsigned parameter, which
    indicates whether LLVM integer types should be treated as being signed
@@ -78,8 +81,14 @@ static const Type *
 lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
    if (t == LLVMTypes::VoidType)
        return AtomicType::Void;
+
+    // uniform
    else if (t == LLVMTypes::BoolType)
        return AtomicType::UniformBool;
+    else if (t == LLVMTypes::Int8Type)
+        return intAsUnsigned ? AtomicType::UniformUInt8 : AtomicType::UniformInt8;
+    else if (t == LLVMTypes::Int16Type)
+        return intAsUnsigned ? AtomicType::UniformUInt16 : AtomicType::UniformInt16;
    else if (t == LLVMTypes::Int32Type)
        return intAsUnsigned ? AtomicType::UniformUInt32 : AtomicType::UniformInt32;
    else if (t == LLVMTypes::FloatType)
@@ -88,6 +97,12 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
        return AtomicType::UniformDouble;
    else if (t == LLVMTypes::Int64Type)
        return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64;
+
+    // varying
+    else if (t == LLVMTypes::Int8VectorType)
+        return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8;
+    else if (t == LLVMTypes::Int16VectorType)
+        return intAsUnsigned ? AtomicType::VaryingUInt16 : AtomicType::VaryingInt16;
    else if (t == LLVMTypes::Int32VectorType)
        return intAsUnsigned ? AtomicType::VaryingUInt32 : AtomicType::VaryingInt32;
    else if (t == LLVMTypes::FloatVectorType)
@@ -96,6 +111,14 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
        return AtomicType::VaryingDouble;
    else if (t == LLVMTypes::Int64VectorType)
        return intAsUnsigned ? AtomicType::VaryingUInt64 : AtomicType::VaryingInt64;
+
+    // pointers to uniform
+    else if (t == LLVMTypes::Int8PointerType)
+        return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt8 :
+                                                 AtomicType::UniformInt8, false);
+    else if (t == LLVMTypes::Int16PointerType)
+        return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt16 :
+                                                 AtomicType::UniformInt16, false);
    else if (t == LLVMTypes::Int32PointerType)
        return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt32 :
                                                 AtomicType::UniformInt32, false);
@@ -104,6 +127,16 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
                                                 AtomicType::UniformInt64, false);
    else if (t == LLVMTypes::FloatPointerType)
        return new ReferenceType(AtomicType::UniformFloat, false);
+    else if (t == LLVMTypes::DoublePointerType)
+        return new ReferenceType(AtomicType::UniformDouble, false);
+
+    // pointers to varying
+    else if (t == LLVMTypes::Int8VectorPointerType)
+        return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt8 :
+                                                 AtomicType::VaryingInt8, false);
+    else if (t == LLVMTypes::Int16VectorPointerType)
+        return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt16 :
+                                                 AtomicType::VaryingInt16, false);
    else if (t == LLVMTypes::Int32VectorPointerType)
        return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt32 :
                                                 AtomicType::VaryingInt32, false);
@@ -112,6 +145,10 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
                                                 AtomicType::VaryingInt64, false);
    else if (t == LLVMTypes::FloatVectorPointerType)
        return new ReferenceType(AtomicType::VaryingFloat, false);
+    else if (t == LLVMTypes::DoubleVectorPointerType)
+        return new ReferenceType(AtomicType::VaryingDouble, false);
+
+    // arrays
    else if (llvm::isa<const llvm::PointerType>(t)) {
        const llvm::PointerType *pt = llvm::dyn_cast<const llvm::PointerType>(t);

@@ -126,7 +163,9 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
                                                      intAsUnsigned);
            if (eltType == NULL)
                return NULL;
-            return new ReferenceType(new ArrayType(eltType, at->getNumElements()),
+            // FIXME: this needs to be fixed when arrays can have 
+            // over 4G elements...
+            return new ReferenceType(new ArrayType(eltType, (int)at->getNumElements()),
                                     false);
        }
    }
@@ -135,6 +174,27 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
 }


+/** Builds an ispc FunctionType from the given return and argument types,
+    wraps it in a new Symbol with the given name that points at the LLVM
+    function, and adds that symbol to the symbol table as a function.
+
+    @param name         Name to give the new ispc symbol
+    @param returnType   ispc return type of the function
+    @param argTypes     ispc types of the function's arguments
+    @param ftype        LLVM function type; only its parameter count is used
+    @param func         LLVM function the symbol refers to
+    @param symbolTable  Symbol table that receives the new symbol
+ */
+static void
+lCreateSymbol(const std::string &name, const Type *returnType,
+              const std::vector<const Type *> &argTypes,
+              const llvm::FunctionType *ftype, llvm::Function *func,
+              SymbolTable *symbolTable) {
+    // The position recorded for these symbols is labeled "__stdlib"
+    SourcePos stdlibPos;
+    stdlibPos.name = "__stdlib";
+
+    FunctionType *ispcFuncType = new FunctionType(returnType, argTypes, stdlibPos);
+
+    // No argument has a default value: one NULL entry per LLVM parameter
+    std::vector<ConstExpr *> nullDefaults;
+    nullDefaults.resize(ftype->getNumParams(), NULL);
+    ispcFuncType->SetArgumentDefaults(nullDefaults);
+
+    Symbol *funcSym = new Symbol(name, stdlibPos, ispcFuncType);
+    funcSym->function = func;
+    symbolTable->AddFunction(funcSym);
+}
+
+
 /** Given an LLVM function declaration, synthesize the equivalent ispc
    symbol for the function (if possible).  Returns true on success, false
    on failure.
@@ -147,6 +207,30 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
    const llvm::FunctionType *ftype = func->getFunctionType();
    std::string name = func->getName();

+    if (name.size() < 3 || name[0] != '_' || name[1] != '_')
+        return false;
+
+    // An unfortunate hack: we want this builtin function to have the
+    // signature "int __sext_varying_bool(bool)", but the ispc function
+    // symbol creation code below assumes that any LLVM vector of i32s is a
+    // varying int32.  Here, we need that to be interpreted as a varying
+    // bool, so just have a one-off override for that one...
+    if (name == "__sext_varying_bool") {
+        const Type *returnType = AtomicType::VaryingInt32;
+        std::vector<const Type *> argTypes;
+        argTypes.push_back(AtomicType::VaryingBool);
+        std::vector<ConstExpr *> defaults;
+        defaults.push_back(NULL);
+
+        FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
+        funcType->SetArgumentDefaults(defaults);
+
+        Symbol *sym = new Symbol(name, noPos, funcType);
+        sym->function = func;
+        symbolTable->AddFunction(sym);
+        return true;
+    }
+
    // If the function has any parameters with integer types, we'll make
    // two Symbols for two overloaded versions of the function, one with
    // all of the integer types treated as signed integers and one with all
@@ -162,7 +246,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {

        // Iterate over the arguments and try to find their equivalent ispc
        // types.  Track if any of the arguments has an integer type.
-        bool anyIntArgs = false;
+        bool anyIntArgs = false, anyReferenceArgs = false;
        std::vector<const Type *> argTypes;
        for (unsigned int j = 0; j < ftype->getNumParams(); ++j) {
            const llvm::Type *llvmArgType = ftype->getParamType(j);
@@ -171,16 +255,26 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
                return false;
            anyIntArgs |= 
                (Type::Equal(type, lLLVMTypeToISPCType(llvmArgType, !intAsUnsigned)) == false);
+            anyReferenceArgs |= (dynamic_cast<const ReferenceType *>(type) != NULL);
            argTypes.push_back(type);
        }

        // Always create the symbol the first time through, in particular
        // so that we get symbols for things with no integer types!
-        if (i == 0 || anyIntArgs == true) {
-            FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
-            Symbol *sym = new Symbol(name, noPos, funcType);
-            sym->function = func;
-            symbolTable->AddFunction(sym);
+        if (i == 0 || anyIntArgs == true)
+            lCreateSymbol(name, returnType, argTypes, ftype, func, symbolTable);
+
+        // If there are any reference types, also make a variant of the
+        // symbol that has them as const references.  This obviously
+        // doesn't make sense for many builtins, but we'll give the stdlib
+        // the option to call one if it needs one.
+        if (anyReferenceArgs == true) {
+            // Const-qualify every reference argument first, and only then
+            // register the "_refsconst" variant, so that exactly one
+            // variant (with all references const) is created rather than
+            // one per argument with a partially-converted argument list.
+            for (unsigned int j = 0; j < argTypes.size(); ++j) {
+                if (dynamic_cast<const ReferenceType *>(argTypes[j]) != NULL)
+                    argTypes[j] = argTypes[j]->GetAsConstType();
+            }
+            lCreateSymbol(name + "_refsconst", returnType, argTypes, 
+                          ftype, func, symbolTable);
        }
    }

@@ -205,203 +299,32 @@ lAddModuleSymbols(llvm::Module *module, SymbolTable *symbolTable) {
    }
 }

-/** Declare the 'pseudo-gather' functions.  When the ispc front-end needs
-    to perform a gather, it generates a call to one of these functions,
-    which have signatures:
-    
-    varying int32 __pseudo_gather(varying int32 *, mask)
-    varying int64 __pseudo_gather(varying int64 *, mask)

-    These functions are never actually implemented; the
-    GatherScatterFlattenOpt optimization pass finds them and then converts
-    them to make calls to the following functions, which represent gathers
-    from a common base pointer with offsets.  This approach allows the
-    front-end to be relatively simple in how it emits address calculation
-    for gathers.
-
-    varying int32 __pseudo_gather_base_offsets_32(uniform int32 *base, 
-                                                  int32 offsets, mask)
-    varying int64 __pseudo_gather_base_offsets_64(uniform int64 *base, 
-                                                  int64 offsets, mask)
-
-    Then, the GSImprovementsPass optimizations finds these and either
-    converts them to native gather functions or converts them to vector
-    loads, if equivalent.
- */
-static void
-lDeclarePseudoGathers(llvm::Module *module) {
-    SourcePos noPos;
-    noPos.name = "__stdlib";
-
-    {
-        std::vector<const llvm::Type *> argTypes;
-        argTypes.push_back(LLVMTypes::VoidPointerVectorType);
-        argTypes.push_back(LLVMTypes::MaskType);
-
-        llvm::FunctionType *fType = 
-            llvm::FunctionType::get(LLVMTypes::Int32VectorType, argTypes, false);
-        llvm::Function *func =
-            llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
-                                   "__pseudo_gather_32", module);
-        func->setOnlyReadsMemory(true);
-        func->setDoesNotThrow(true);
-
-        fType = llvm::FunctionType::get(LLVMTypes::Int64VectorType, argTypes, false);
-        func = llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
-                                      "__pseudo_gather_64", module);
-        func->setOnlyReadsMemory(true);
-        func->setDoesNotThrow(true);
-    }
-
-    {
-        std::vector<const llvm::Type *> argTypes;
-        argTypes.push_back(LLVMTypes::VoidPointerType);
-        argTypes.push_back(LLVMTypes::Int32VectorType);
-        argTypes.push_back(LLVMTypes::MaskType);
-
-        llvm::FunctionType *fType = 
-            llvm::FunctionType::get(LLVMTypes::Int32VectorType, argTypes, false);
-        llvm::Function *func =
-            llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
-                                   "__pseudo_gather_base_offsets_32", module);
-        func->setOnlyReadsMemory(true);
-        func->setDoesNotThrow(true);
-
-        fType = llvm::FunctionType::get(LLVMTypes::Int64VectorType, argTypes, false);
-        func = llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
-                                      "__pseudo_gather_base_offsets_64", module);
-        func->setOnlyReadsMemory(true);
-        func->setDoesNotThrow(true);
-    }
-}
-
-
-/** Similarly to the 'pseudo-gathers' defined by lDeclarePseudoGathers(),
-    we also declare (but never define) pseudo-scatter instructions with
-    signatures:
-
-    void __pseudo_scatter_32(varying int32 *, varying int32 values, mask)
-    void __pseudo_scatter_64(varying int64 *, varying int64 values, mask)
-
-    The GatherScatterFlattenOpt optimization pass also finds these and
-    transforms them to scatters like:
-
-    void __pseudo_scatter_base_offsets_32(uniform int32 *base, 
-                    varying int32 offsets, varying int32 values, mask)
-    void __pseudo_scatter_base_offsets_64(uniform int64 *base, 
-                    varying int62 offsets, varying int64 values, mask)
-
-    And the GSImprovementsPass in turn converts these to actual native
-    scatters or masked stores.  
+/** In many of the builtins-*.ll files, we have declarations of various LLVM
+    intrinsics that are then used in the implementation of various target-
+    specific functions.  This function loops over all of the intrinsic 
+    declarations and makes sure that the signature we have in our .ll file
+    matches the signature of the actual intrinsic.
 */
 static void
-lDeclarePseudoScatters(llvm::Module *module) {
-    SourcePos noPos;
-    noPos.name = "__stdlib";
+lCheckModuleIntrinsics(llvm::Module *module) {
+    llvm::Module::iterator iter;
+    for (iter = module->begin(); iter != module->end(); ++iter) {
+        llvm::Function *func = iter;
+        if (!func->isIntrinsic())
+            continue;

-    {
-        std::vector<const llvm::Type *> argTypes;
-        argTypes.push_back(LLVMTypes::VoidPointerVectorType);
-        argTypes.push_back(LLVMTypes::Int32VectorType);
-        argTypes.push_back(LLVMTypes::MaskType);
-
-        llvm::FunctionType *fType = 
-            llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
-        llvm::Function *func =
-            llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
-                                   "__pseudo_scatter_32", module);
-        func->setDoesNotThrow(true);
-    }
-    {
-        std::vector<const llvm::Type *> argTypes;
-        argTypes.push_back(LLVMTypes::VoidPointerVectorType);
-        argTypes.push_back(LLVMTypes::Int64VectorType);
-        argTypes.push_back(LLVMTypes::MaskType);
-
-        llvm::FunctionType *fType = 
-            llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
-        llvm::Function *func =
-            llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
-                                   "__pseudo_scatter_64", module);
-        func->setDoesNotThrow(true);
-    }
-
-    {
-        std::vector<const llvm::Type *> argTypes;
-        argTypes.push_back(LLVMTypes::VoidPointerType);
-        argTypes.push_back(LLVMTypes::Int32VectorType);
-        argTypes.push_back(LLVMTypes::Int32VectorType);
-        argTypes.push_back(LLVMTypes::MaskType);
-
-        llvm::FunctionType *fType = 
-            llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
-        llvm::Function *func =
-            llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
-                                   "__pseudo_scatter_base_offsets_32", module);
-        func->setDoesNotThrow(true);
-    }
-    {
-        std::vector<const llvm::Type *> argTypes;
-        argTypes.push_back(LLVMTypes::VoidPointerType);
-        argTypes.push_back(LLVMTypes::Int32VectorType);
-        argTypes.push_back(LLVMTypes::Int64VectorType);
-        argTypes.push_back(LLVMTypes::MaskType);
-
-        llvm::FunctionType *fType = 
-            llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
-        llvm::Function *func =
-            llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
-                                   "__pseudo_scatter_base_offsets_64", module);
-        func->setDoesNotThrow(true);
-    }
-}
-
-
-/** This function declares placeholder masked store functions for the
-    front-end to use.
-
-    void __pseudo_masked_store_32(uniform int32 *ptr, varying int32 values, mask)
-    void __pseudo_masked_store_64(uniform int64 *ptr, varying int64 values, mask)
-
-    These in turn are converted to native masked stores or to regular
-    stores (if the mask is all on) by the MaskedStoreOptPass optimization
-    pass.
- */
-static void
-lDeclarePseudoMaskedStore(llvm::Module *module) {
-    SourcePos noPos;
-    noPos.name = "__stdlib";
-
-    {
-    std::vector<const llvm::Type *> argTypes;
-    argTypes.push_back(LLVMTypes::Int32VectorPointerType);
-    argTypes.push_back(LLVMTypes::Int32VectorType);
-    argTypes.push_back(LLVMTypes::MaskType);
-
-    llvm::FunctionType *fType = 
-        llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
-    llvm::Function *func = 
-        llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
-                               "__pseudo_masked_store_32", module);
-    func->setDoesNotThrow(true);
-    func->addFnAttr(llvm::Attribute::AlwaysInline);
-    func->setDoesNotCapture(1, true);
-    }
-
-    {
-    std::vector<const llvm::Type *> argTypes;
-    argTypes.push_back(LLVMTypes::Int64VectorPointerType);
-    argTypes.push_back(LLVMTypes::Int64VectorType);
-    argTypes.push_back(LLVMTypes::MaskType);
-
-    llvm::FunctionType *fType = 
-        llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
-    llvm::Function *func = 
-        llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
-                               "__pseudo_masked_store_64", module);
-    func->setDoesNotThrow(true);
-    func->addFnAttr(llvm::Attribute::AlwaysInline);
-    func->setDoesNotCapture(1, true);
+        const std::string funcName = func->getName().str();
+        // Work around http://llvm.org/bugs/show_bug.cgi?id=10438; only
+        // check the llvm.x86.* intrinsics for now...
+        if (!strncmp(funcName.c_str(), "llvm.x86.", 9)) {
+            llvm::Intrinsic::ID id = (llvm::Intrinsic::ID)func->getIntrinsicID();
+            assert(id != 0);
+            LLVM_TYPE_CONST llvm::Type *intrinsicType = 
+                llvm::Intrinsic::getType(*g->ctx, id);
+            intrinsicType = llvm::PointerType::get(intrinsicType, 0);
+            assert(func->getType() == intrinsicType);
+        }
    }
 }

@@ -415,9 +338,9 @@ lDeclarePseudoMaskedStore(llvm::Module *module) {
    @param module      Module to link the bitcode into
    @param symbolTable Symbol table to add definitions to
 */
-static void
-lAddBitcode(const unsigned char *bitcode, int length,
-            llvm::Module *module, SymbolTable *symbolTable) {
+void
+AddBitcodeToModule(const unsigned char *bitcode, int length,
+                   llvm::Module *module, SymbolTable *symbolTable) {
    std::string bcErr;
    llvm::StringRef sb = llvm::StringRef((char *)bitcode, length);
    llvm::MemoryBuffer *bcBuf = llvm::MemoryBuffer::getMemBuffer(sb);
@@ -425,10 +348,28 @@ lAddBitcode(const unsigned char *bitcode, int length,
    if (!bcModule)
        Error(SourcePos(), "Error parsing stdlib bitcode: %s", bcErr.c_str());
    else {
+        // FIXME: this feels like a bad idea, but the issue is that when we
+        // set the llvm::Module's target triple in the ispc Module::Module
+        // constructor, we start by calling llvm::sys::getHostTriple() (and
+        // then change the arch if needed).  Somehow that ends up giving us
+        // strings like 'x86_64-apple-darwin11.0.0', while the stuff we
+        // compile to bitcode with clang has module triples like
+        // 'i386-apple-macosx10.7.0'.  And then LLVM issues a warning about
+        // linking together modules with incompatible target triples..
+        llvm::Triple mTriple(m->module->getTargetTriple());
+        llvm::Triple bcTriple(bcModule->getTargetTriple());
+        assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
+               mTriple.getArch() == bcTriple.getArch());
+        assert(bcTriple.getVendor() == llvm::Triple::UnknownVendor ||
+               mTriple.getVendor() == bcTriple.getVendor());
+        bcModule->setTargetTriple(mTriple.str());
+
        std::string(linkError);
        if (llvm::Linker::LinkModules(module, bcModule, &linkError))
            Error(SourcePos(), "Error linking stdlib bitcode: %s", linkError.c_str());
-        lAddModuleSymbols(module, symbolTable);
+        if (symbolTable != NULL)
+            lAddModuleSymbols(module, symbolTable);
+        lCheckModuleIntrinsics(module);
    }
 }

@@ -439,10 +380,10 @@ lAddBitcode(const unsigned char *bitcode, int length,
 static void
 lDefineConstantInt(const char *name, int val, llvm::Module *module,
                   SymbolTable *symbolTable) {
-    Symbol *pw = new Symbol(name, SourcePos(), AtomicType::UniformConstInt32);
-    pw->isStatic = true;
+    Symbol *pw = new Symbol(name, SourcePos(), AtomicType::UniformConstInt32,
+                            SC_STATIC);
    pw->constValue = new ConstExpr(pw->type, val, SourcePos());
-    const llvm::Type *ltype = LLVMTypes::Int32Type;
+    LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32Type;
    llvm::Constant *linit = LLVMInt32(val);
    pw->storagePtr = new llvm::GlobalVariable(*module, ltype, true, 
                                              llvm::GlobalValue::InternalLinkage,
@@ -451,18 +392,37 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
 }


+
+/** Gives a body to the function called `name` in the module: a single
+    "entry" basic block that returns the 32-bit integer constant `val`.
+    The function is marked always-inline, and an ispc symbol for it (a
+    function taking no arguments and returning a uniform int32, with
+    static storage class) is recorded in the symbol table.
+
+    @param name         Name of the function; it must already exist in
+                        the module
+    @param val          Constant value the function returns
+    @param module       Module holding the function
+    @param symbolTable  Symbol table that receives the new symbol
+ */
+static void
+lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
+                       SymbolTable *symbolTable) {
+    // ispc-side description: "uniform int32 name()"
+    std::vector<const Type *> noArgs;
+    FunctionType *ispcFuncType =
+        new FunctionType(AtomicType::UniformInt32, noArgs, SourcePos());
+    Symbol *funcSym = new Symbol(name, SourcePos(), ispcFuncType, SC_STATIC);
+
+    // The function is expected to have been declared in the module
+    // already; here it only gets its body.
+    llvm::Function *llvmFunc = module->getFunction(name);
+    assert(llvmFunc != NULL);
+    llvmFunc->addFnAttr(llvm::Attribute::AlwaysInline);
+
+    llvm::BasicBlock *entryBlock =
+        llvm::BasicBlock::Create(*g->ctx, "entry", llvmFunc, 0);
+    llvm::ReturnInst::Create(*g->ctx, LLVMInt32(val), entryBlock);
+
+    funcSym->function = llvmFunc;
+    // NOTE(review): a function-typed symbol is registered through
+    // AddVariable() rather than AddFunction() -- confirm this is intended.
+    symbolTable->AddVariable(funcSym);
+}
+
+
+
 static void
 lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
    Symbol *pidx = new Symbol("programIndex", SourcePos(), 
-                              AtomicType::VaryingConstInt32);
-    pidx->isStatic = true;
+                              AtomicType::VaryingConstInt32, SC_STATIC);

    int pi[ISPC_MAX_NVEC];
    for (int i = 0; i < g->target.vectorWidth; ++i)
        pi[i] = i;
    pidx->constValue = new ConstExpr(pidx->type, pi, SourcePos());

-    const llvm::Type *ltype = LLVMTypes::Int32VectorType;
+    LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32VectorType;
    llvm::Constant *linit = LLVMInt32Vector(pi);
    pidx->storagePtr = new llvm::GlobalVariable(*module, ltype, true, 
                                                llvm::GlobalValue::InternalLinkage, linit, 
@@ -474,124 +434,69 @@ lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
 void
 DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
             bool includeStdlibISPC) {
-    // Add the definitions from the compiled stdlib-c.c file
-    extern unsigned char stdlib_bitcode_c[];
-    extern int stdlib_bitcode_c_length;
-    lAddBitcode(stdlib_bitcode_c, stdlib_bitcode_c_length, module, symbolTable);
+    // Add the definitions from the compiled builtins-c.c file
+    if (g->target.is32bit) {
+        extern unsigned char builtins_bitcode_c_32[];
+        extern int builtins_bitcode_c_32_length;
+        AddBitcodeToModule(builtins_bitcode_c_32, builtins_bitcode_c_32_length, 
+                           module, symbolTable);
+    }
+    else {
+        extern unsigned char builtins_bitcode_c_64[];
+        extern int builtins_bitcode_c_64_length;
+        AddBitcodeToModule(builtins_bitcode_c_64, builtins_bitcode_c_64_length, 
+                           module, symbolTable);
+    }

    // Next, add the target's custom implementations of the various needed
    // builtin functions (e.g. __masked_store_32(), etc).
    switch (g->target.isa) {
    case Target::SSE2:
-        extern unsigned char stdlib_bitcode_sse2[];
-        extern int stdlib_bitcode_sse2_length;
-        lAddBitcode(stdlib_bitcode_sse2, stdlib_bitcode_sse2_length, module,
-                    symbolTable);
+        extern unsigned char builtins_bitcode_sse2[];
+        extern int builtins_bitcode_sse2_length;
+        AddBitcodeToModule(builtins_bitcode_sse2, builtins_bitcode_sse2_length, 
+                           module, symbolTable);
        break;
    case Target::SSE4:
-        extern unsigned char stdlib_bitcode_sse4[];
-        extern int stdlib_bitcode_sse4_length;
-        extern unsigned char stdlib_bitcode_sse4x2[];
-        extern int stdlib_bitcode_sse4x2_length;
+        extern unsigned char builtins_bitcode_sse4[];
+        extern int builtins_bitcode_sse4_length;
+        extern unsigned char builtins_bitcode_sse4_x2[];
+        extern int builtins_bitcode_sse4_x2_length;
        switch (g->target.vectorWidth) {
        case 4: 
-            lAddBitcode(stdlib_bitcode_sse4, stdlib_bitcode_sse4_length, 
-                        module, symbolTable);
+            AddBitcodeToModule(builtins_bitcode_sse4, builtins_bitcode_sse4_length, 
+                               module, symbolTable);
            break;
        case 8:
-            lAddBitcode(stdlib_bitcode_sse4x2, stdlib_bitcode_sse4x2_length, 
-                        module, symbolTable);
+            AddBitcodeToModule(builtins_bitcode_sse4_x2, builtins_bitcode_sse4_x2_length, 
+                               module, symbolTable);
            break;
        default:
            FATAL("logic error in DefineStdlib");
        }
        break;
    case Target::AVX:
-        extern unsigned char stdlib_bitcode_avx[];
-        extern int stdlib_bitcode_avx_length;
-        lAddBitcode(stdlib_bitcode_avx, stdlib_bitcode_avx_length, module, 
-                    symbolTable);
+        switch (g->target.vectorWidth) {
+        case 8:
+            extern unsigned char builtins_bitcode_avx[];
+            extern int builtins_bitcode_avx_length;
+            AddBitcodeToModule(builtins_bitcode_avx, builtins_bitcode_avx_length, 
+                               module, symbolTable);
+            break;
+        case 16:
+            extern unsigned char builtins_bitcode_avx_x2[];
+            extern int builtins_bitcode_avx_x2_length;
+            AddBitcodeToModule(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
+                               module,  symbolTable);
+            break;
+        default:
+            FATAL("logic error in DefineStdlib");
+        }
        break;
    default:
        FATAL("logic error");
    }

-    // Add a declaration of void *ISPCMalloc(int64_t).  The user is
-    // responsible for linking in a definition of this if it's needed by
-    // the compiled program.
-    { std::vector<const llvm::Type *> argTypes;
-        argTypes.push_back(llvm::Type::getInt64Ty(*ctx));
-        llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidPointerType, 
-                                                            argTypes, false);
-        llvm::Function *func = 
-            llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage,
-                                   "ISPCMalloc", module);
-        func->setDoesNotThrow(true);
-    }
-
-    // Add a declaration of void ISPCFree(void *).  The user is
-    // responsible for linking in a definition of this if it's needed by
-    // the compiled program.
-    { std::vector<const llvm::Type *> argTypes;
-        argTypes.push_back(LLVMTypes::VoidPointerType);
-        llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidPointerType, 
-                                                            argTypes, false);
-        llvm::Function *func = 
-            llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage,
-                                   "ISPCFree", module);
-        func->setDoesNotThrow(true);
-    }
-
-    // Add a declaration of void ISPCLaunch(void *funcPtr, void *data).
-    // The user is responsible for linking in a definition of this if it's
-    // needed by the compiled program.
-    { std::vector<const llvm::Type *> argTypes;
-        argTypes.push_back(LLVMTypes::VoidPointerType);
-        argTypes.push_back(LLVMTypes::VoidPointerType);
-        llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidType, 
-                                                            argTypes, false);
-        llvm::Function *func = 
-            llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage,
-                                   "ISPCLaunch", module);
-        func->setDoesNotThrow(true);
-    }
-
-    // Add a declaration of void ISPCSync().  The user is responsible for
-    // linking in a definition of this if it's needed by the compiled
-    // program.
-    { 
-        std::vector<const llvm::Type *> argTypes;
-        llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidType, 
-                                                            argTypes, false);
-        llvm::Function *func = 
-            llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage,
-                                   "ISPCSync", module);
-        func->setDoesNotThrow(true);
-    }
-
-    // Add a declaration of void ISPCInstrument(void *, void *, int, int).
-    // The user is responsible for linking in a definition of this if it's
-    // needed by the compiled program.
-    { 
-        std::vector<const llvm::Type *> argTypes;
-        argTypes.push_back(llvm::PointerType::get(llvm::Type::getInt8Ty(*g->ctx), 0));
-        argTypes.push_back(llvm::PointerType::get(llvm::Type::getInt8Ty(*g->ctx), 0));
-        argTypes.push_back(LLVMTypes::Int32Type);
-        argTypes.push_back(LLVMTypes::Int32Type);
-        llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidType, 
-                                                            argTypes, false);
-        llvm::Function *func = 
-            llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage,
-                                   "ISPCInstrument", module);
-        func->setDoesNotThrow(true);
-    }
-
-    // Declare various placeholder functions that the optimizer will later
-    // find and replace with something more useful.
-    lDeclarePseudoGathers(module);
-    lDeclarePseudoScatters(module);
-    lDeclarePseudoMaskedStore(module);
-
    // define the 'programCount' builtin variable
    lDefineConstantInt("programCount", g->target.vectorWidth, module, symbolTable);

@@ -609,12 +514,16 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
                       symbolTable);
    lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module,
                       symbolTable);
+    lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
+                           symbolTable);

    if (includeStdlibISPC) {
        // If the user wants the standard library to be included, parse the
-        // serialized version of the stdlib.ispc file to get its definitions
-        // added.
-        extern const char *stdlib_code;
+        // serialized version of the stdlib.ispc file to get its
+        // definitions added.  Disable emission of performance warnings for
+        // now, since the user doesn't care about any of that in the stdlib
+        // implementation...
+        extern char stdlib_code[];
        yy_scan_string(stdlib_code);
        yyparse();
    }
--- a/builtins.h
+++ b/builtins.h
@@ -55,4 +55,7 @@
 void DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
                  bool includeStdlib);

+void AddBitcodeToModule(const unsigned char *bitcode, int length,
+                        llvm::Module *module, SymbolTable *symbolTable = NULL);
+
 #endif // ISPC_STDLIB_H
--- a/builtins.m4
+++ b/builtins.m4
--- a/contrib/ispc.vim
+++ b/contrib/ispc.vim
@@ -0,0 +1,32 @@
+" Vim syntax file
+" Language:	ISPC
+" Maintainer:	Andreas Wendleder <andreas.wendleder@gmail.com>
+" Last Change:	2011 Aug 3
+
+" Quit when a syntax file was already loaded
+if exists("b:current_syntax")
+  finish
+endif
+
+" Read the C syntax to start with; ispc-specific keywords are layered on
+" top of it below.  b:current_syntax is cleared afterwards so that it can
+" be set to "ispc" at the end of this file.
+runtime! syntax/c.vim
+unlet b:current_syntax
+
+" New keywords
+syn keyword	ispcStatement	cbreak ccontinue creturn launch print reference soa sync task
+syn keyword	ispcConditional	cif
+syn keyword	ispcRepeat	cdo cfor cwhile
+syn keyword	ispcBuiltin	programCount programIndex
+" 'uniform' and 'varying' are ispc's rate qualifiers; highlight them along
+" with the sized integer types.
+syn keyword	ispcType	export uniform varying int8 int16 int32 int64
+
+" Default highlighting: link each ispc group to a standard highlight group
+command -nargs=+ HiLink hi def link <args>
+HiLink ispcStatement	Statement
+HiLink ispcConditional	Conditional
+HiLink ispcRepeat	Repeat
+HiLink ispcBuiltin	Statement
+HiLink ispcType		Type
+delcommand HiLink
+
+let b:current_syntax = "ispc"
+
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -37,6 +37,7 @@

 #include "ctx.h"
 #include "util.h"
+#include "func.h"
 #include "llvmutil.h"
 #include "type.h"
 #include "stmt.h"
@@ -123,19 +124,20 @@ CFInfo::GetLoop(bool isUniform, llvm::BasicBlock *breakTarget,

 ///////////////////////////////////////////////////////////////////////////

-FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *function,
-                                         Symbol *funSym, SourcePos firstStmtPos) {
+FunctionEmitContext::FunctionEmitContext(Function *function, Symbol *funSym,
+                                         llvm::Function *llvmFunction,
+                                         SourcePos firstStmtPos) {
+    const Type *rt = function->GetReturnType();
+
    /* Create a new basic block to store all of the allocas */
-    allocaBlock = llvm::BasicBlock::Create(*g->ctx, "allocas", function, 0);
-    bblock = llvm::BasicBlock::Create(*g->ctx, "entry", function, 0);
+    allocaBlock = llvm::BasicBlock::Create(*g->ctx, "allocas", llvmFunction, 0);
+    bblock = llvm::BasicBlock::Create(*g->ctx, "entry", llvmFunction, 0);
    /* But jump from it immediately into the real entry block */
    llvm::BranchInst::Create(bblock, allocaBlock);

-    maskPtr = AllocaInst(LLVMTypes::MaskType, "mask_memory");
-    StoreInst(LLVMMaskAllOn, maskPtr);
-
    funcStartPos = funSym->pos;
    returnType = rt;
+    maskPtr = NULL;
    entryMask = NULL;
    loopMask = NULL;
    breakLanesPtr = continueLanesPtr = NULL;
@@ -144,16 +146,20 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
    returnedLanesPtr = AllocaInst(LLVMTypes::MaskType, "returned_lanes_memory");
    StoreInst(LLVMMaskAllOff, returnedLanesPtr);

+    launchedTasks = false;
+    launchGroupHandlePtr = AllocaInst(LLVMTypes::VoidPointerType, "launch_group_handle");
+    StoreInst(llvm::Constant::getNullValue(LLVMTypes::VoidPointerType), 
+              launchGroupHandlePtr);
+
    if (!returnType || returnType == AtomicType::Void)
        returnValuePtr = NULL;
    else {
-        const llvm::Type *ftype = returnType->LLVMType(g->ctx);
+        LLVM_TYPE_CONST llvm::Type *ftype = returnType->LLVMType(g->ctx);
        returnValuePtr = AllocaInst(ftype, "return_value_memory");
        // FIXME: don't do this store???
        StoreInst(llvm::Constant::getNullValue(ftype), returnValuePtr);
    }

-#ifndef LLVM_2_8
    if (m->diBuilder) {
        /* If debugging is enabled, tell the debug information emission
           code about this new function */
@@ -161,33 +167,18 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
        llvm::DIType retType = rt->GetDIType(diFile);
        int flags = llvm::DIDescriptor::FlagPrototyped; // ??
        diFunction = m->diBuilder->createFunction(diFile, /* scope */
-                                                  function->getName(), // mangled
+                                                  llvmFunction->getName(), // mangled
                                                  funSym->name,
                                                  diFile,
                                                  funcStartPos.first_line,
                                                  retType,
-                                                  funSym->isStatic,
+                                                  funSym->storageClass == SC_STATIC,
                                                  true, /* is definition */
                                                  flags,
                                                  g->opt.level > 0,
-                                                  function);
+                                                  llvmFunction);
        /* And start a scope representing the initial function scope */
        StartScope();
-    }
-#endif // LLVM_2_8
-
-    launchedTasks = false;
-
-    // connect the funciton's mask memory to the __mask symbol
-    Symbol *maskSymbol = m->symbolTable->LookupVariable("__mask");
-    assert(maskSymbol != NULL);
-    maskSymbol->storagePtr = maskPtr;
-
-#ifndef LLVM_2_8
-    // add debugging info for __mask, programIndex, ...
-    if (m->diBuilder) {
-        maskSymbol->pos = funcStartPos;
-        EmitVariableDebugInfo(maskSymbol);

        llvm::DIFile file = funcStartPos.GetDIFile();
        Symbol *programIndexSymbol = m->symbolTable->LookupVariable("programIndex");
@@ -208,15 +199,12 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
                                           true /* static */,
                                           programCountSymbol->storagePtr);
    }
-#endif
 }


 FunctionEmitContext::~FunctionEmitContext() {
    assert(controlFlowInfo.size() == 0);
-#ifndef LLVM_2_8
    assert(debugScopes.size() == (m->diBuilder ? 1 : 0));
-#endif
 }


@@ -238,6 +226,12 @@ FunctionEmitContext::GetMask() {
 }


+void
+FunctionEmitContext::SetMaskPointer(llvm::Value *p) {
+    maskPtr = p;
+}
+
+
 void
 FunctionEmitContext::SetEntryMask(llvm::Value *value) {
    entryMask = value;
@@ -704,6 +698,7 @@ FunctionEmitContext::LaneMask(llvm::Value *v) {

 llvm::Value *
 FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
+#if 0
    // Compare the two masks to get a vector of i1s
    llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
                               v1, v2, "v1==v2");
@@ -711,6 +706,12 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
    cmp = I1VecToBoolVec(cmp);
    // And see if it's all on
    return All(cmp);
+#else
+    llvm::Value *mm1 = LaneMask(v1);
+    llvm::Value *mm2 = LaneMask(v2);
+    return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mm1, mm2,
+                   "v1==v2");
+#endif
 }


@@ -735,11 +736,12 @@ FunctionEmitContext::CreateBasicBlock(const char *name) {

 llvm::Value *
 FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) {
-    const llvm::ArrayType *at = llvm::dyn_cast<const llvm::ArrayType>(b->getType());
+    LLVM_TYPE_CONST llvm::ArrayType *at = 
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(b->getType());
    if (at) {
        // If we're given an array of vectors of i1s, then do the
        // conversion for each of the elements
-        const llvm::Type *boolArrayType = 
+        LLVM_TYPE_CONST llvm::Type *boolArrayType = 
            llvm::ArrayType::get(LLVMTypes::BoolVectorType, at->getNumElements());
        llvm::Value *ret = llvm::UndefValue::get(boolArrayType);

@@ -757,34 +759,24 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) {


 llvm::Value *
-FunctionEmitContext::EmitMalloc(const llvm::Type *ty) {
+FunctionEmitContext::SizeOf(LLVM_TYPE_CONST llvm::Type *ty) {
    // Emit code to compute the size of the given type using a GEP with a
    // NULL base pointer, indexing one element of the given type, and
    // casting the resulting 'pointer' to an int giving its size.
-    const llvm::Type *ptrType = llvm::PointerType::get(ty, 0);
+    LLVM_TYPE_CONST llvm::Type *ptrType = llvm::PointerType::get(ty, 0);
    llvm::Value *nullPtr = llvm::Constant::getNullValue(ptrType);
    llvm::Value *index[1] = { LLVMInt32(1) };
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+    llvm::ArrayRef<llvm::Value *> arrayRef(&index[0], &index[1]);
+    llvm::Value *poffset = llvm::GetElementPtrInst::Create(nullPtr, arrayRef,
+                                                           "offset_ptr", bblock);
+#else
    llvm::Value *poffset = llvm::GetElementPtrInst::Create(nullPtr, &index[0], &index[1],
                                                           "offset_ptr", bblock);
+#endif
    AddDebugPos(poffset);
-    llvm::Value *sizeOf =  PtrToIntInst(poffset, LLVMTypes::Int64Type, "offset_int");
-
-    // And given the size, call the malloc function
-    llvm::Function *fmalloc = m->module->getFunction("ISPCMalloc");
-    assert(fmalloc != NULL);
-    llvm::Value *mem = CallInst(fmalloc, sizeOf, "raw_argmem");
-    // Cast the void * back to the result pointer type
-    return BitCastInst(mem, ptrType, "mem_bitcast");
-}
-
-
-void
-FunctionEmitContext::EmitFree(llvm::Value *ptr) {
-    llvm::Value *freeArg = BitCastInst(ptr, LLVMTypes::VoidPointerType,
-                                       "argmemfree");
-    llvm::Function *ffree = m->module->getFunction("ISPCFree");
-    assert(ffree != NULL);
-    CallInst(ffree, freeArg);
+    llvm::Value *sizeOf = PtrToIntInst(poffset, LLVMTypes::Int64Type, "offset_int");
+    return sizeOf;
 }


@@ -796,8 +788,13 @@ lGetStringAsValue(llvm::BasicBlock *bblock, const char *s) {
                                                 llvm::GlobalValue::InternalLinkage,
                                                 sConstant, s);
    llvm::Value *indices[2] = { LLVMInt32(0), LLVMInt32(0) };
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+    llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[2]);
+    return llvm::GetElementPtrInst::Create(sPtr, arrayRef, "sptr", bblock);
+#else
    return llvm::GetElementPtrInst::Create(sPtr, &indices[0], &indices[2],
                                           "sptr", bblock);
+#endif
 }


@@ -837,7 +834,6 @@ FunctionEmitContext::GetDebugPos() const {
 void
 FunctionEmitContext::AddDebugPos(llvm::Value *value, const SourcePos *pos, 
                                 llvm::DIScope *scope) {
-#ifndef LLVM_2_8
    llvm::Instruction *inst = llvm::dyn_cast<llvm::Instruction>(value);
    if (inst != NULL && m->diBuilder) {
        SourcePos p = pos ? *pos : currentPos;
@@ -848,13 +844,11 @@ FunctionEmitContext::AddDebugPos(llvm::Value *value, const SourcePos *pos,
            inst->setDebugLoc(llvm::DebugLoc::get(p.first_line, p.first_column, 
                                                  scope ? *scope : GetDIScope()));
    }
-#endif
 }


 void
 FunctionEmitContext::StartScope() {
-#ifndef LLVM_2_8
    if (m->diBuilder != NULL) {
        llvm::DIScope parentScope;
        if (debugScopes.size() > 0)
@@ -868,18 +862,15 @@ FunctionEmitContext::StartScope() {
                                             currentPos.first_column);
        debugScopes.push_back(lexicalBlock);
    }
-#endif
 }


 void
 FunctionEmitContext::EndScope() {
-#ifndef LLVM_2_8
    if (m->diBuilder != NULL) {
        assert(debugScopes.size() > 0);
        debugScopes.pop_back();
    }
-#endif
 }


@@ -892,7 +883,6 @@ FunctionEmitContext::GetDIScope() const {

 void
 FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) {
-#ifndef LLVM_2_8
    if (m->diBuilder == NULL)
        return;

@@ -908,13 +898,11 @@ FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) {
    llvm::Instruction *declareInst = 
        m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
    AddDebugPos(declareInst, &sym->pos, &scope);
-#endif
 }


 void
 FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) {
-#ifndef LLVM_2_8
    if (m->diBuilder == NULL)
        return;

@@ -930,7 +918,6 @@ FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) {
    llvm::Instruction *declareInst = 
        m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
    AddDebugPos(declareInst, &sym->pos, &scope);
-#endif
 }


@@ -940,15 +927,16 @@ FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) {
    Otherwise return zero.
 */
 static int
-lArrayVectorWidth(const llvm::Type *t) {
-    const llvm::ArrayType *arrayType = llvm::dyn_cast<const llvm::ArrayType>(t);
+lArrayVectorWidth(LLVM_TYPE_CONST llvm::Type *t) {
+    LLVM_TYPE_CONST llvm::ArrayType *arrayType = 
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(t);
    if (arrayType == NULL)
        return 0;

    // We shouldn't be seeing arrays of anything but vectors being passed
    // to things like FunctionEmitContext::BinaryOperator() as operands
-    const llvm::VectorType *vectorElementType = 
-        llvm::dyn_cast<const llvm::VectorType>(arrayType->getElementType());
+    LLVM_TYPE_CONST llvm::VectorType *vectorElementType = 
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(arrayType->getElementType());
    assert(vectorElementType != NULL &&
           (int)vectorElementType->getNumElements() == g->target.vectorWidth);
    return (int)arrayType->getNumElements();
@@ -965,7 +953,7 @@ FunctionEmitContext::BinaryOperator(llvm::Instruction::BinaryOps inst,
    }

    assert(v0->getType() == v1->getType());
-    const llvm::Type *type = v0->getType();
+    LLVM_TYPE_CONST llvm::Type *type = v0->getType();
    int arraySize = lArrayVectorWidth(type);
    if (arraySize == 0) {
        llvm::Instruction *bop = 
@@ -999,7 +987,7 @@ FunctionEmitContext::NotOperator(llvm::Value *v, const char *name) {
    // Similarly to BinaryOperator, do the operation on all the elements of
    // the array if we're given an array type; otherwise just do the
    // regular llvm operation.
-    const llvm::Type *type = v->getType();
+    LLVM_TYPE_CONST llvm::Type *type = v->getType();
    int arraySize = lArrayVectorWidth(type);
    if (arraySize == 0) {
        llvm::Instruction *binst = 
@@ -1024,20 +1012,20 @@ FunctionEmitContext::NotOperator(llvm::Value *v, const char *name) {
 // Given the llvm Type that represents an ispc VectorType, return an
 // equally-shaped type with boolean elements.  (This is the type that will
 // be returned from CmpInst with ispc VectorTypes).
-static const llvm::Type *
-lGetMatchingBoolVectorType(const llvm::Type *type) {
-    const llvm::ArrayType *arrayType = 
-        llvm::dyn_cast<const llvm::ArrayType>(type);
+static LLVM_TYPE_CONST llvm::Type *
+lGetMatchingBoolVectorType(LLVM_TYPE_CONST llvm::Type *type) {
+    LLVM_TYPE_CONST llvm::ArrayType *arrayType = 
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(type);
    // should only be called for vector typed stuff...
    assert(arrayType != NULL);

-    const llvm::VectorType *vectorElementType =
-        llvm::dyn_cast<const llvm::VectorType>(arrayType->getElementType());
+    LLVM_TYPE_CONST llvm::VectorType *vectorElementType =
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(arrayType->getElementType());
    assert(vectorElementType != NULL &&
           (int)vectorElementType->getNumElements() == g->target.vectorWidth);

-    const llvm::Type *base = llvm::VectorType::get(LLVMTypes::BoolType, 
-                                                   g->target.vectorWidth);
+    LLVM_TYPE_CONST llvm::Type *base = 
+        llvm::VectorType::get(LLVMTypes::BoolType, g->target.vectorWidth);
    return llvm::ArrayType::get(base, arrayType->getNumElements());
 }

@@ -1053,7 +1041,7 @@ FunctionEmitContext::CmpInst(llvm::Instruction::OtherOps inst,
    }

    assert(v0->getType() == v1->getType());
-    const llvm::Type *type = v0->getType();
+    LLVM_TYPE_CONST llvm::Type *type = v0->getType();
    int arraySize = lArrayVectorWidth(type);
    if (arraySize == 0) {
        llvm::Instruction *ci = 
@@ -1063,7 +1051,7 @@ FunctionEmitContext::CmpInst(llvm::Instruction::OtherOps inst,
        return ci;
    }
    else {
-        const llvm::Type *boolType = lGetMatchingBoolVectorType(type);
+        LLVM_TYPE_CONST llvm::Type *boolType = lGetMatchingBoolVectorType(type);
        llvm::Value *ret = llvm::UndefValue::get(boolType);
        for (int i = 0; i < arraySize; ++i) {
            llvm::Value *a = ExtractInst(v0, i);
@@ -1077,16 +1065,17 @@ FunctionEmitContext::CmpInst(llvm::Instruction::OtherOps inst,


 llvm::Value *
-FunctionEmitContext::BitCastInst(llvm::Value *value, const llvm::Type *type, 
+FunctionEmitContext::BitCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type, 
                                 const char *name) {
    if (value == NULL) {
        assert(m->errorCount > 0);
        return NULL;
    }

-    const llvm::Type *valType = value->getType();
-    const llvm::ArrayType *at = llvm::dyn_cast<const llvm::ArrayType>(valType);
-    if (at && llvm::isa<const llvm::PointerType>(at->getElementType())) {
+    LLVM_TYPE_CONST llvm::Type *valType = value->getType();
+    LLVM_TYPE_CONST llvm::ArrayType *at = 
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(valType);
+    if (at && llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(at->getElementType())) {
        // If we're bitcasting an array of pointers, we have a varying
        // lvalue; apply the corresponding bitcast to each of the
        // individual pointers and return the result array.
@@ -1111,16 +1100,17 @@ FunctionEmitContext::BitCastInst(llvm::Value *value, const llvm::Type *type,


 llvm::Value *
-FunctionEmitContext::PtrToIntInst(llvm::Value *value, const llvm::Type *type,
+FunctionEmitContext::PtrToIntInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
                                  const char *name) {
    if (value == NULL) {
        assert(m->errorCount > 0);
        return NULL;
    }

-    const llvm::Type *valType = value->getType();
-    const llvm::ArrayType *at = llvm::dyn_cast<const llvm::ArrayType>(valType);
-    if (at && llvm::isa<const llvm::PointerType>(at->getElementType())) {
+    LLVM_TYPE_CONST llvm::Type *valType = value->getType();
+    LLVM_TYPE_CONST llvm::ArrayType *at = 
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(valType);
+    if (at && llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(at->getElementType())) {
        // varying lvalue -> apply ptr to int to the individual pointers
        assert((int)at->getNumElements() == g->target.vectorWidth);

@@ -1143,16 +1133,17 @@ FunctionEmitContext::PtrToIntInst(llvm::Value *value, const llvm::Type *type,


 llvm::Value *
-FunctionEmitContext::IntToPtrInst(llvm::Value *value, const llvm::Type *type,
+FunctionEmitContext::IntToPtrInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
                                  const char *name) {
    if (value == NULL) {
        assert(m->errorCount > 0);
        return NULL;
    }

-    const llvm::Type *valType = value->getType();
-    const llvm::ArrayType *at = llvm::dyn_cast<const llvm::ArrayType>(valType);
-    if (at && llvm::isa<const llvm::PointerType>(at->getElementType())) {
+    LLVM_TYPE_CONST llvm::Type *valType = value->getType();
+    LLVM_TYPE_CONST llvm::ArrayType *at = 
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(valType);
+    if (at && llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(at->getElementType())) {
        // varying lvalue -> apply int to ptr to the individual pointers
        assert((int)at->getNumElements() == g->target.vectorWidth);

@@ -1175,7 +1166,7 @@ FunctionEmitContext::IntToPtrInst(llvm::Value *value, const llvm::Type *type,


 llvm::Instruction *
-FunctionEmitContext::TruncInst(llvm::Value *value, const llvm::Type *type,
+FunctionEmitContext::TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
                               const char *name) {
    if (value == NULL) {
        assert(m->errorCount > 0);
@@ -1193,7 +1184,7 @@ FunctionEmitContext::TruncInst(llvm::Value *value, const llvm::Type *type,

 llvm::Instruction *
 FunctionEmitContext::CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
-                              const llvm::Type *type, const char *name) {
+                              LLVM_TYPE_CONST llvm::Type *type, const char *name) {
    if (value == NULL) {
        assert(m->errorCount > 0);
        return NULL;
@@ -1209,7 +1200,7 @@ FunctionEmitContext::CastInst(llvm::Instruction::CastOps op, llvm::Value *value,


 llvm::Instruction *
-FunctionEmitContext::FPCastInst(llvm::Value *value, const llvm::Type *type, 
+FunctionEmitContext::FPCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type, 
                                const char *name) {
    if (value == NULL) {
        assert(m->errorCount > 0);
@@ -1226,7 +1217,7 @@ FunctionEmitContext::FPCastInst(llvm::Value *value, const llvm::Type *type,


 llvm::Instruction *
-FunctionEmitContext::SExtInst(llvm::Value *value, const llvm::Type *type, 
+FunctionEmitContext::SExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type, 
                              const char *name) {
    if (value == NULL) {
        assert(m->errorCount > 0);
@@ -1243,7 +1234,7 @@ FunctionEmitContext::SExtInst(llvm::Value *value, const llvm::Type *type,


 llvm::Instruction *
-FunctionEmitContext::ZExtInst(llvm::Value *value, const llvm::Type *type, 
+FunctionEmitContext::ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type, 
                              const char *name) {
    if (value == NULL) {
        assert(m->errorCount > 0);
@@ -1269,22 +1260,30 @@ FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0

    // FIXME: do we need need to handle the case of the first index being
    // varying?  It's currently needed...
-    assert(!llvm::isa<const llvm::VectorType>(index0->getType()));
+    assert(!llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(index0->getType()));

-    const llvm::Type *basePtrType = basePtr->getType();
-    const llvm::ArrayType *baseArrayType = 
-        llvm::dyn_cast<const llvm::ArrayType>(basePtrType);
+    LLVM_TYPE_CONST llvm::Type *basePtrType = basePtr->getType();
+    LLVM_TYPE_CONST llvm::ArrayType *baseArrayType = 
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(basePtrType);
    bool baseIsVaryingTypePointer = (baseArrayType != NULL) && 
-        llvm::isa<const llvm::PointerType>(baseArrayType->getElementType());
-    bool indexIsVaryingType = llvm::isa<const llvm::VectorType>(index1->getType());
+        llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(baseArrayType->getElementType());
+    bool indexIsVaryingType = 
+        llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(index1->getType());

    if (!indexIsVaryingType && !baseIsVaryingTypePointer) {
        // The easy case: both the base pointer and the indices are
        // uniform, so just emit the regular LLVM GEP instruction
        llvm::Value *indices[2] = { index0, index1 };
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+        llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[2]);
+        llvm::Instruction *inst = 
+            llvm::GetElementPtrInst::Create(basePtr, arrayRef,
+                                            name ? name : "gep", bblock);
+#else
        llvm::Instruction *inst = 
            llvm::GetElementPtrInst::Create(basePtr, &indices[0], &indices[2], 
                                            name ? name : "gep", bblock);
+#endif
        AddDebugPos(inst);
        return inst;
    }
@@ -1315,9 +1314,10 @@ FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0
                // This is kind of a hack: use the type from the GEP to
                // figure out the return type and the first time through,
                // create an undef value of that type here
-                const llvm::PointerType *elementPtrType = 
-                    llvm::dyn_cast<const llvm::PointerType>(eltPtr->getType());
-                const llvm::Type *elementType = elementPtrType->getElementType();
+                LLVM_TYPE_CONST llvm::PointerType *elementPtrType = 
+                    llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(eltPtr->getType());
+                LLVM_TYPE_CONST llvm::Type *elementType = 
+                    elementPtrType->getElementType();
                lret = llvm::UndefValue::get(LLVMPointerVectorType(elementType));
            }

@@ -1344,7 +1344,7 @@ FunctionEmitContext::LoadInst(llvm::Value *lvalue, const Type *type,
        return NULL;
    }

-    if (llvm::isa<const llvm::PointerType>(lvalue->getType())) {
+    if (llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(lvalue->getType())) {
        // If the lvalue is a straight up regular pointer, then just issue
        // a regular load.  First figure out the alignment; in general we
        // can just assume the natural alignment (0 here), but for varying
@@ -1371,7 +1371,7 @@ FunctionEmitContext::LoadInst(llvm::Value *lvalue, const Type *type,
        // information we need from the LLVM::Type, so have to carry the
        // ispc type in through this path..
        assert(type != NULL);
-        assert(llvm::isa<const llvm::ArrayType>(lvalue->getType()));
+        assert(llvm::isa<LLVM_TYPE_CONST llvm::ArrayType>(lvalue->getType()));
        return gather(lvalue, type, name);
    }
 }
@@ -1381,9 +1381,9 @@ llvm::Value *
 FunctionEmitContext::gather(llvm::Value *lvalue, const Type *type, 
                            const char *name) {
    // We should have a varying lvalue if we get here...
-    assert(llvm::dyn_cast<const llvm::ArrayType>(lvalue->getType()));
+    assert(llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(lvalue->getType()));

-    const llvm::Type *retType = type->LLVMType(g->ctx);
+    LLVM_TYPE_CONST llvm::Type *retType = type->LLVMType(g->ctx);

    const StructType *st = dynamic_cast<const StructType *>(type);
    if (st) {
@@ -1409,7 +1409,7 @@ FunctionEmitContext::gather(llvm::Value *lvalue, const Type *type,
        // the GEP stuff in the loop below ends up computing pointers based
        // on elements in the vectors rather than incorrectly advancing to
        // the next vector...
-        const llvm::Type *eltType = 
+        LLVM_TYPE_CONST llvm::Type *eltType = 
            vt->GetBaseType()->GetAsUniformType()->LLVMType(g->ctx);
        lvalue = BitCastInst(lvalue, llvm::PointerType::get(llvm::ArrayType::get(eltType, 0), 0));

@@ -1440,17 +1440,20 @@ FunctionEmitContext::gather(llvm::Value *lvalue, const Type *type,
    llvm::Value *mask = GetMask();
    llvm::Function *gather = NULL;
    // Figure out which gather function to call based on the size of
-    // the elements; will need to generalize this for 8 and 16-bit
-    // types.
+    // the elements.
    if (retType == LLVMTypes::DoubleVectorType || 
        retType == LLVMTypes::Int64VectorType)
        gather = m->module->getFunction("__pseudo_gather_64");
-    else {
-        assert(retType == LLVMTypes::FloatVectorType || 
-               retType == LLVMTypes::Int32VectorType);
+    else if (retType == LLVMTypes::FloatVectorType || 
+             retType == LLVMTypes::Int32VectorType)
        gather = m->module->getFunction("__pseudo_gather_32");
+    else if (retType == LLVMTypes::Int16VectorType)
+        gather = m->module->getFunction("__pseudo_gather_16");
+    else {
+        assert(retType == LLVMTypes::Int8VectorType);
+        gather = m->module->getFunction("__pseudo_gather_8");
    }
-    assert(gather);
+    assert(gather != NULL);

    llvm::Value *voidlvalue = BitCastInst(lvalue, LLVMTypes::VoidPointerType);
    llvm::Instruction *call = CallInst(gather, voidlvalue, mask, name);
@@ -1472,33 +1475,21 @@ FunctionEmitContext::gather(llvm::Value *lvalue, const Type *type,
 void
 FunctionEmitContext::addGSMetadata(llvm::Instruction *inst, SourcePos pos) {
    llvm::Value *str = llvm::MDString::get(*g->ctx, pos.name);
-#ifdef LLVM_2_8
-    llvm::MDNode *md = llvm::MDNode::get(*g->ctx, &str, 1);
-#else
    llvm::MDNode *md = llvm::MDNode::get(*g->ctx, str);
-#endif
    inst->setMetadata("filename", md);

    llvm::Value *line = LLVMInt32(pos.first_line);
-#ifdef LLVM_2_8
-    md = llvm::MDNode::get(*g->ctx, &line, 1);
-#else
    md = llvm::MDNode::get(*g->ctx, line);
-#endif
    inst->setMetadata("line", md);

    llvm::Value *column = LLVMInt32(pos.first_column);
-#ifdef LLVM_2_8
-    md = llvm::MDNode::get(*g->ctx, &column, 1);
-#else
    md = llvm::MDNode::get(*g->ctx, column);
-#endif
    inst->setMetadata("column", md);
 }


 llvm::Value *
-FunctionEmitContext::AllocaInst(const llvm::Type *llvmType, const char *name,
+FunctionEmitContext::AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType, const char *name,
                                int align, bool atEntryBlock) {
    llvm::AllocaInst *inst = NULL;
    if (atEntryBlock) {
@@ -1518,9 +1509,10 @@ FunctionEmitContext::AllocaInst(const llvm::Type *llvmType, const char *name,
    // unlikely that this array will be loaded into varying variables with
    // what will be aligned accesses if the uniform -> varying load is done
    // in regular chunks.
-    const llvm::ArrayType *arrayType = llvm::dyn_cast<const llvm::ArrayType>(llvmType);
+    LLVM_TYPE_CONST llvm::ArrayType *arrayType = 
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(llvmType);
    if (align == 0 && arrayType != NULL && 
-        !llvm::isa<const llvm::VectorType>(arrayType->getElementType()))
+        !llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(arrayType->getElementType()))
        align = 4 * g->target.nativeVectorWidth;

    if (align != 0)
@@ -1545,7 +1537,7 @@ FunctionEmitContext::maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
        return;
    }

-    assert(llvm::isa<const llvm::PointerType>(lvalue->getType()));
+    assert(llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(lvalue->getType()));
    
    const CollectionType *collectionType = 
        dynamic_cast<const CollectionType *>(rvalueType);
@@ -1563,14 +1555,13 @@ FunctionEmitContext::maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
        return;
    }

-    // We must have a regular atomic type at this point
-    assert(dynamic_cast<const AtomicType *>(rvalueType) != NULL);
+    // We must have a regular atomic or enum type at this point
+    assert(dynamic_cast<const AtomicType *>(rvalueType) != NULL ||
+           dynamic_cast<const EnumType *>(rvalueType) != NULL);
    rvalueType = rvalueType->GetAsNonConstType();

    llvm::Function *maskedStoreFunc = NULL;
-    // Figure out if we need a 32-bit or 64-bit masked store.  This
-    // will need to be generalized when/if 8 and 16-bit data types are
-    // added.
+    // Figure out if we need an 8, 16, 32 or 64-bit masked store.
    if (rvalueType == AtomicType::VaryingDouble || 
        rvalueType == AtomicType::VaryingInt64 ||
        rvalueType == AtomicType::VaryingUInt64) {
@@ -1580,12 +1571,11 @@ FunctionEmitContext::maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
        rvalue = BitCastInst(rvalue, LLVMTypes::Int64VectorType, 
                             "rvalue_to_int64");
    }
-    else {
-        assert(rvalueType == AtomicType::VaryingFloat ||
-               rvalueType == AtomicType::VaryingBool ||
-               rvalueType == AtomicType::VaryingInt32 ||
-               rvalueType == AtomicType::VaryingUInt32);
-
+    else if (rvalueType == AtomicType::VaryingFloat ||
+             rvalueType == AtomicType::VaryingBool ||
+             rvalueType == AtomicType::VaryingInt32 ||
+             rvalueType == AtomicType::VaryingUInt32 ||
+             dynamic_cast<const EnumType *>(rvalueType) != NULL) {
        maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_32");
        lvalue = BitCastInst(lvalue, LLVMTypes::Int32VectorPointerType, 
                             "lvalue_to_int32vecptr");
@@ -1593,6 +1583,18 @@ FunctionEmitContext::maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
            rvalue = BitCastInst(rvalue, LLVMTypes::Int32VectorType, 
                                 "rvalue_to_int32");
    }
+    else if (rvalueType == AtomicType::VaryingInt16 ||
+             rvalueType == AtomicType::VaryingUInt16) {
+        maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_16");
+        lvalue = BitCastInst(lvalue, LLVMTypes::Int16VectorPointerType, 
+                             "lvalue_to_int16vecptr");
+    }
+    else if (rvalueType == AtomicType::VaryingInt8 ||
+             rvalueType == AtomicType::VaryingUInt8) {
+        maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_8");
+        lvalue = BitCastInst(lvalue, LLVMTypes::Int8VectorPointerType, 
+                             "lvalue_to_int8vecptr");
+    }

    std::vector<llvm::Value *> args;
    args.push_back(lvalue);
@@ -1613,7 +1615,7 @@ void
 FunctionEmitContext::scatter(llvm::Value *rvalue, llvm::Value *lvalue, 
                             llvm::Value *storeMask, const Type *rvalueType) {
    assert(rvalueType->IsVaryingType());
-    assert(llvm::isa<const llvm::ArrayType>(lvalue->getType()));
+    assert(llvm::isa<LLVM_TYPE_CONST llvm::ArrayType>(lvalue->getType()));

    const StructType *structType = dynamic_cast<const StructType *>(rvalueType);
    if (structType) {
@@ -1632,7 +1634,8 @@ FunctionEmitContext::scatter(llvm::Value *rvalue, llvm::Value *lvalue,
        // the GEP stuff in the loop below ends up computing pointers based
        // on elements in the vectors rather than incorrectly advancing to
        // the next vector...
-        const llvm::Type *eltType = vt->GetBaseType()->GetAsUniformType()->LLVMType(g->ctx);
+        LLVM_TYPE_CONST llvm::Type *eltType = 
+            vt->GetBaseType()->GetAsUniformType()->LLVMType(g->ctx);
        lvalue = BitCastInst(lvalue, llvm::PointerType::get(llvm::ArrayType::get(eltType, 0), 0));

        for (int i = 0; i < vt->GetElementCount(); ++i) {
@@ -1650,20 +1653,21 @@ FunctionEmitContext::scatter(llvm::Value *rvalue, llvm::Value *lvalue,
    assert(dynamic_cast<const AtomicType *>(rvalueType) != NULL);

    llvm::Function *func = NULL;
-    const llvm::Type *type = rvalue->getType();
+    LLVM_TYPE_CONST llvm::Type *type = rvalue->getType();
    if (type == LLVMTypes::DoubleVectorType || 
        type == LLVMTypes::Int64VectorType) {
        func = m->module->getFunction("__pseudo_scatter_64");
        rvalue = BitCastInst(rvalue, LLVMTypes::Int64VectorType, "rvalue2int");
    }
-    else {
-        // FIXME: if this hits, presumably it's due to needing int8 and/or
-        // int16 versions of scatter...
-        assert(type == LLVMTypes::FloatVectorType || 
-               type == LLVMTypes::Int32VectorType);
+    else if (type == LLVMTypes::FloatVectorType || 
+             type == LLVMTypes::Int32VectorType) {
        func = m->module->getFunction("__pseudo_scatter_32");
        rvalue = BitCastInst(rvalue, LLVMTypes::Int32VectorType, "rvalue2int");
    }
+    else if (type == LLVMTypes::Int16VectorType)
+        func = m->module->getFunction("__pseudo_scatter_16");
+    else if (type == LLVMTypes::Int8VectorType)
+        func = m->module->getFunction("__pseudo_scatter_8");
    assert(func != NULL);
    
    AddInstrumentationPoint("scatter");
@@ -1717,7 +1721,7 @@ FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
        llvm::Instruction *si = new llvm::StoreInst(rvalue, lvalue, bblock);
        AddDebugPos(si);
    }
-    else if (llvm::isa<const llvm::ArrayType>(lvalue->getType()))
+    else if (llvm::isa<LLVM_TYPE_CONST llvm::ArrayType>(lvalue->getType()))
        // We have a varying lvalue (an array of pointers), so it's time to
        // scatter
        scatter(rvalue, lvalue, storeMask, rvalueType);
@@ -1761,7 +1765,7 @@ FunctionEmitContext::ExtractInst(llvm::Value *v, int elt, const char *name) {
    }

    llvm::Instruction *ei = NULL;
-    if (llvm::isa<const llvm::VectorType>(v->getType()))
+    if (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(v->getType()))
        ei = llvm::ExtractElementInst::Create(v, LLVMInt32(elt), 
                                              name ? name : "extract", bblock);
    else
@@ -1781,7 +1785,7 @@ FunctionEmitContext::InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt,
    }

    llvm::Instruction *ii = NULL;
-    if (llvm::isa<const llvm::VectorType>(v->getType()))
+    if (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(v->getType()))
        ii = llvm::InsertElementInst::Create(v, eltVal, LLVMInt32(elt), 
                                             name ? name : "insert", bblock);
    else
@@ -1793,12 +1797,12 @@ FunctionEmitContext::InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt,


 llvm::PHINode *
-FunctionEmitContext::PhiNode(const llvm::Type *type, int count, 
+FunctionEmitContext::PhiNode(LLVM_TYPE_CONST llvm::Type *type, int count, 
                             const char *name) {
    llvm::PHINode *pn = llvm::PHINode::Create(type, 
-#if !defined(LLVM_2_8) && !defined(LLVM_2_9)
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
                                              count, 
-#endif // !LLVM_2_8 && !LLVM_2_9
+#endif // LLVM_3_0
                                              name ? name : "phi", bblock);
    AddDebugPos(pn);
    return pn;
@@ -1830,9 +1834,14 @@ FunctionEmitContext::CallInst(llvm::Function *func,
        return NULL;
    }

+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+    llvm::Instruction *ci = 
+        llvm::CallInst::Create(func, args, name ? name : "", bblock);
+#else
    llvm::Instruction *ci = 
        llvm::CallInst::Create(func, args.begin(), args.end(), 
                               name ? name : "", bblock);
+#endif
    AddDebugPos(ci);
    return ci;
 }
@@ -1846,10 +1855,15 @@ FunctionEmitContext::CallInst(llvm::Function *func, llvm::Value *arg,
        return NULL;
    }

+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+    llvm::Instruction *ci = 
+        llvm::CallInst::Create(func, arg, name ? name : "", bblock);
+#else
    llvm::Value *args[] = { arg };
    llvm::Instruction *ci = 
        llvm::CallInst::Create(func, &args[0], &args[1], name ? name : "",
                               bblock);
+#endif
    AddDebugPos(ci);
    return ci;
 }
@@ -1864,9 +1878,16 @@ FunctionEmitContext::CallInst(llvm::Function *func, llvm::Value *arg0,
    }

    llvm::Value *args[] = { arg0, arg1 };
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+    llvm::ArrayRef<llvm::Value *> argArrayRef(&args[0], &args[2]);
+    llvm::Instruction *ci = 
+        llvm::CallInst::Create(func, argArrayRef, name ? name : "", 
+                               bblock);
+#else
    llvm::Instruction *ci = 
        llvm::CallInst::Create(func, &args[0], &args[2], name ? name : "", 
                               bblock);
+#endif
    AddDebugPos(ci);
    return ci;
 }
@@ -1874,15 +1895,9 @@ FunctionEmitContext::CallInst(llvm::Function *func, llvm::Value *arg0,

 llvm::Instruction *
 FunctionEmitContext::ReturnInst() {
-    if (launchedTasks) {
-        // Automatically add a sync call at the end of any function that
-        // launched tasks
-        SourcePos noPos;
-        noPos.name = "__auto_sync";
-        ExprStmt *es = new ExprStmt(new SyncExpr(noPos), noPos);
-        es->EmitCode(this); 
-        delete es;
-    }
+    if (launchedTasks)
+        // Add a sync call at the end of any function that launched tasks
+        SyncInst();

    llvm::Instruction *rinst = NULL;
    if (returnValuePtr != NULL) {
@@ -1905,7 +1920,8 @@ FunctionEmitContext::ReturnInst() {

 llvm::Instruction *
 FunctionEmitContext::LaunchInst(llvm::Function *callee, 
-                                std::vector<llvm::Value *> &argVals) {
+                                std::vector<llvm::Value *> &argVals,
+                                llvm::Value *launchCount) {
    if (callee == NULL) {
        assert(m->errorCount > 0);
        return NULL;
@@ -1913,21 +1929,24 @@ FunctionEmitContext::LaunchInst(llvm::Function *callee,

    launchedTasks = true;

-    const llvm::Type *argType = callee->arg_begin()->getType();
+    LLVM_TYPE_CONST llvm::Type *argType = callee->arg_begin()->getType();
    assert(llvm::PointerType::classof(argType));
-    const llvm::PointerType *pt = static_cast<const llvm::PointerType *>(argType);
+    LLVM_TYPE_CONST llvm::PointerType *pt = 
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(argType);
    assert(llvm::StructType::classof(pt->getElementType()));
-    const llvm::StructType *argStructType = 
-        static_cast<const llvm::StructType *>(pt->getElementType());
+    LLVM_TYPE_CONST llvm::StructType *argStructType = 
+        static_cast<LLVM_TYPE_CONST llvm::StructType *>(pt->getElementType());
    assert(argStructType->getNumElements() == argVals.size() + 1);

-    // Use alloca for space for the task args.  KEY DETAIL: pass false
-    // to the call of FunctionEmitContext::AllocaInst so that the alloca
-    // doesn't happen just once at the top of the function, but happens
-    // each time the enclosing basic block executes.
+    llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
+    assert(falloc != NULL);
    int align = 4 * RoundUpPow2(g->target.nativeVectorWidth);
-    llvm::Value *argmem = AllocaInst(argStructType, "argmem", align, false);
-    llvm::Value *voidmem = BitCastInst(argmem, LLVMTypes::VoidPointerType);
+    std::vector<llvm::Value *> allocArgs;
+    allocArgs.push_back(launchGroupHandlePtr);
+    allocArgs.push_back(SizeOf(argStructType));
+    allocArgs.push_back(LLVMInt32(align));
+    llvm::Value *voidmem = CallInst(falloc, allocArgs, "args_ptr");
+    llvm::Value *argmem = BitCastInst(voidmem, pt);

    // Copy the values of the parameters into the appropriate place in
    // the argument block
@@ -1949,5 +1968,32 @@ FunctionEmitContext::LaunchInst(llvm::Function *callee,
    llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
    llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
    assert(flaunch != NULL);
-    return CallInst(flaunch, fptr, voidmem, "");
+    std::vector<llvm::Value *> args;
+    args.push_back(launchGroupHandlePtr);
+    args.push_back(fptr);
+    args.push_back(voidmem);
+    args.push_back(launchCount);
+    return CallInst(flaunch, args, "");
+}
+
+
+void
+FunctionEmitContext::SyncInst() {
+    llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr, NULL);
+    llvm::Value *nullPtrValue = llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);
+    llvm::Value *nonNull = CmpInst(llvm::Instruction::ICmp,
+                                   llvm::CmpInst::ICMP_NE,
+                                   launchGroupHandle, nullPtrValue);
+    llvm::BasicBlock *bSync = CreateBasicBlock("call_sync");
+    llvm::BasicBlock *bPostSync = CreateBasicBlock("post_sync");
+    BranchInst(bSync, bPostSync, nonNull);
+
+    SetCurrentBasicBlock(bSync);
+    llvm::Function *fsync = m->module->getFunction("ISPCSync");
+    if (fsync == NULL)
+        FATAL("Couldn't find ISPCSync declaration?!");
+    CallInst(fsync, launchGroupHandle, "");
+    BranchInst(bPostSync);
+
+    SetCurrentBasicBlock(bPostSync);
 }
--- a/ctx.h
+++ b/ctx.h
@@ -59,14 +59,15 @@ struct CFInfo;
 class FunctionEmitContext {
 public:
    /** Create a new FunctionEmitContext.
-        @param returnType   The return type of the function
-        @param function     LLVM function in the current module that corresponds
+        @param function     The Function object representing the function
+        @param sym          Symbol that corresponds to the function
+        @param llvmFunction LLVM function in the current module that corresponds
                            to the function
-        @param funSym       Symbol that corresponds to the function
        @param firstStmtPos Source file position of the first statement in the
                            function
     */
-    FunctionEmitContext(const Type *returnType, llvm::Function *function, Symbol *funSym,
+    FunctionEmitContext(Function *function, Symbol *funSym, 
+                        llvm::Function *llvmFunction,
                        SourcePos firstStmtPos);
    ~FunctionEmitContext();

@@ -86,6 +87,8 @@ public:
    /** Returns the current mask value */ 
    llvm::Value *GetMask();

+    void SetMaskPointer(llvm::Value *p);
+
    /** Provides the value of the mask at function entry */
    void SetEntryMask(llvm::Value *val);

@@ -210,15 +213,8 @@ public:
        i32. */
    llvm::Value *I1VecToBoolVec(llvm::Value *b);

-    /** Emit code to call the user-supplied ISPCMalloc function to
-        allocate space for an object of thee given type.  Returns the
-        pointer value returned by the ISPCMalloc call. */
-    llvm::Value *EmitMalloc(const llvm::Type *ty);
-
-    /** Emit code to call the user-supplied ISPCFree function, passing it
-        the given pointer to storage previously allocated by an
-        EmitMalloc() call. */
-    void EmitFree(llvm::Value *ptr);
+    /** Returns the size of the given type. */
+    llvm::Value *SizeOf(LLVM_TYPE_CONST llvm::Type *ty);

    /** If the user has asked to compile the program with instrumentation,
        this inserts a callback to the user-supplied instrumentation
@@ -303,21 +299,21 @@ public:
                         llvm::CmpInst::Predicate pred,
                         llvm::Value *v0, llvm::Value *v1, const char *name = NULL);

-    llvm::Value *BitCastInst(llvm::Value *value, const llvm::Type *type,
+    llvm::Value *BitCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
                             const char *name = NULL);
-    llvm::Value *PtrToIntInst(llvm::Value *value, const llvm::Type *type,
+    llvm::Value *PtrToIntInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
                              const char *name = NULL);
-    llvm::Value *IntToPtrInst(llvm::Value *value, const llvm::Type *type,
+    llvm::Value *IntToPtrInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
                              const char *name = NULL);
-    llvm::Instruction *TruncInst(llvm::Value *value, const llvm::Type *type,
+    llvm::Instruction *TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
                                 const char *name = NULL);
    llvm::Instruction *CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
-                                const llvm::Type *type, const char *name = NULL);
-    llvm::Instruction *FPCastInst(llvm::Value *value, const llvm::Type *type, 
+                                LLVM_TYPE_CONST llvm::Type *type, const char *name = NULL);
+    llvm::Instruction *FPCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type, 
                                  const char *name = NULL);
-    llvm::Instruction *SExtInst(llvm::Value *value, const llvm::Type *type, 
+    llvm::Instruction *SExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type, 
                                const char *name = NULL);
-    llvm::Instruction *ZExtInst(llvm::Value *value, const llvm::Type *type, 
+    llvm::Instruction *ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type, 
                                const char *name = NULL);

    /** This GEP method is a generalization of the standard one in LLVM; it
@@ -347,7 +343,7 @@ public:
        instruction is added at the start of the function in the entry
        basic block; if it should be added to the current basic block, then
        the atEntryBlock parameter should be false. */ 
-    llvm::Value *AllocaInst(const llvm::Type *llvmType, const char *name = NULL,
+    llvm::Value *AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType, const char *name = NULL,
                            int align = 0, bool atEntryBlock = true);

    /** Standard store instruction; for this variant, the lvalue must be a
@@ -378,7 +374,8 @@ public:
    llvm::Value *InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt, 
                            const char *name = NULL);

-    llvm::PHINode *PhiNode(const llvm::Type *type, int count, const char *name = NULL);
+    llvm::PHINode *PhiNode(LLVM_TYPE_CONST llvm::Type *type, int count, 
+                           const char *name = NULL);
    llvm::Instruction *SelectInst(llvm::Value *test, llvm::Value *val0,
                                  llvm::Value *val1, const char *name = NULL);

@@ -398,7 +395,10 @@ public:
    /** Launch an asynchronous task to run the given function, passing it
        he given argument values. */
    llvm::Instruction *LaunchInst(llvm::Function *callee, 
-                                  std::vector<llvm::Value *> &argVals);
+                                  std::vector<llvm::Value *> &argVals,
+                                  llvm::Value *launchCount);
+
+    void SyncInst();

    llvm::Instruction *ReturnInst();
    /** @} */
@@ -488,6 +488,11 @@ private:
    /** True if a 'launch' statement has been encountered in the function. */
    bool launchedTasks;

+    /** This is a pointer to a void * that is passed to the ISPCLaunch(),
+        ISPCAlloc(), and ISPCSync() routines as a handle to the group of
+        tasks launched from the current function. */
+    llvm::Value *launchGroupHandlePtr;
+
    llvm::Value *pointerVectorToVoidPointers(llvm::Value *value);
    static void addGSMetadata(llvm::Instruction *inst, SourcePos pos);
    bool ifsInLoopAllUniform() const;
--- a/decl.cpp
+++ b/decl.cpp
@@ -101,9 +101,7 @@ Declarator::AddArrayDimension(int size) {
 void
 Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
    sym->type = GetType(ds);
-
-    if (ds->storageClass == SC_STATIC)
-        sym->isStatic = true;
+    sym->storageClass = ds->storageClass;
 }


@@ -237,7 +235,7 @@ Declarator::GetType(DeclSpecs *ds) const {
                    sprintf(buf, "__anon_parameter_%d", i);
                    sym = new Symbol(buf, pos);
                    Declarator *declarator = new Declarator(sym, sym->pos);
-                    sym->type = declarator->GetType(ds);
+                    sym->type = declarator->GetType(d->declSpecs);
                    d->declarators.push_back(declarator);
                }
                else {
--- a/docs/ReleaseNotes.txt
+++ b/docs/ReleaseNotes.txt
@@ -1,3 +1,181 @@
+=== v1.0.11 === (6 October 2011)
+
+The main new feature in this release is support for generating code for
+multiple targets (e.g., SSE2, SSE4, and AVX) and having the compiled code
+select the best variant at execution time.  For more information, see
+http://ispc.github.com/ispc.html#compiling-with-support-for-multiple-instruction-sets.
+
+All of the examples now take advantage of the support for multiple
+compilation targets; thus, if one has an AVX system, it's not necessary to
+recompile the examples to use the AVX target.
+
+Performance of the built-in task system that is used in the examples has
+been improved.
+
+Finally, the print() statement now works on OSX; it had been broken for the
+last few releases.
+
+=== v1.0.10 === (30 September 2011)
+
+This release features an extensive new example showing the application of
+ispc to a deferred shading algorithm for scenes with thousands of lights
+(examples/deferred).  This is an implementation of the algorithm that Johan
+Andersson described at SIGGRAPH 2009 and was implemented by Andrew
+Lauritzen and Jefferson Montgomery.  The basic idea is that a pre-rendered
+G-buffer is partitioned into tiles, and in each tile, the set of lights
+that contribute to the tile is computed.  Then, the pixels in the tile are
+shaded using those light sources. (See slides 19-29 of
+http://s09.idav.ucdavis.edu/talks/04-JAndersson-ParallelFrostbite-Siggraph09.pdf
+for more details on the algorithm.)
+
+The mechanism for launching tasks from ispc code has been generalized to
+allow multiple tasks to be launched with a single launch call (see
+http://ispc.github.com/ispc.html#task-parallelism-language-syntax for more
+information.)
+
+A few new functions have been added to the standard library: num_cores()
+returns the number of cores in the system's CPU, and variants of all of the
+atomic operators that take 'uniform' values as parameters have been added.
+
+=== v1.0.9 === (26 September 2011)
+
+The binary release of v1.0.9 is the first that supports AVX code
+generation.  Two targets are provided: "avx", which runs with a
+programCount of 8, and "avx-x2" which runs 16 program instances
+simultaneously.  (This binary is also built using the in-progress LLVM 3.0
+development libraries, while previous ones have been built with the
+released 2.9 version of LLVM.)
+
+This release has no other significant changes beyond a number of small
+bugfixes (https://github.com/ispc/ispc/issues/100,
+https://github.com/ispc/ispc/issues/101, https://github.com/ispc/ispc/issues/103.)
+ 
+=== v1.0.8 === (19 September 2011)
+
+A number of improvements have been made to handling of 'if' statements in
+the language:
+  - A bug was fixed where invalid memory could be incorrectly accessed even
+    if none of the running program instances wanted to execute the
+    corresponding instructions (https://github.com/ispc/ispc/issues/74).
+  - The code generated for 'if' statements is a bit simpler and thus more
+    efficient.
+
+There is now a '--pic' command-line argument that causes position-independent
+code to be generated (Linux and OSX only).
+
+A number of additional performance improvements:
+  - Loops are now unrolled by default; the --opt=disable-loop-unroll
+    command-line argument can be used to disable this behavior.
+    (https://github.com/ispc/ispc/issues/78)
+  - A few more cases where gathers/scatters could be determined at compile
+    time to actually access contiguous locations have been added.
+    (https://github.com/ispc/ispc/issues/79)
+
+Finally, warnings are now issued (if possible) when it can be determined
+at compile-time that an out-of-bounds array index is being used.
+(https://github.com/ispc/ispc/issues/98).
+
+
+=== v1.0.7 === (3 September 2011)
+
+The various atomic_*_global() standard library functions are generally
+substantially more efficient.  They all previously issued one hardware
+atomic instruction for each running program instance but now locally
+compute a reduction over the operands and issue a single hardware atomic,
+giving the same effect and results in the end (issue #57).
+
+CPU/ISA target handling has been substantially improved.  If no CPU is
+specified, the host CPU type is used, not just a default of "nehalem".  A
+number of bugs were fixed that ensure that LLVM doesn't generate SSE>2
+instructions when using the SSE2 target (fixes issue #82).
+
+Right shifts of unsigned integer types use a logical shift right
+instruction now, not an arithmetic shift right (fixes issue #88).
+
+When emitting header files, 'extern' declarations of globals used in ispc
+code are now outside of the ispc namespace.  Fixes issue #64.
+
+The stencil example has been modified to do runs with and without
+parallelism.
+
+Many other small bugfixes and improvements.
+
+=== v1.0.6 === (17 August 2011)
+
+Some additional cross-program instance operations have been added to the
+standard library.  reduce_equal() checks to see if the given value is the
+same across all running program instances, and exclusive_scan_{add,and,or}()
+computes a scan over the given value in the running program instances.
+See the documentation of these new routines for more information:
+http://ispc.github.com/ispc.html#cross-program-instance-operations.
+
+The simple task system implementations used in the examples have been
+improved.  The Windows version no longer has a hard limit on the number of
+tasks that can be launched, and all versions have less dynamic memory
+allocation and less locking.  More of the examples now have paths that also
+measure performance using tasks along with SPMD vectorization.
+
+Two new examples have been added: one that shows the implementation of a
+ray-marching volume rendering algorithm, and one that shows a 3D stencil
+computation, as might be done for PDE solutions.
+
+Standard library routines to issue prefetches have been added.  See the
+documentation for more details: http://ispc.github.com/ispc.html#prefetches.
+
+Fast versions of the float to half-precision float conversion routines have
+been added.  For more details, see:
+http://ispc.github.com/ispc.html#conversions-to-and-from-half-precision-floats.
+
+There is the usual set of small bug fixes.  Notably, a number of details
+related to handling 32 versus 64 bit targets have been fixed, which in turn
+has fixed a bug related to tasks having incorrect values for pointers
+passed to them.
+
+=== v1.0.5 === (1 August 2011)
+
+Multi-element vector swizzles are supported; for example, given a 3-wide
+vector "foo", then expressions like "foo.zyx" and "foo.yz" can be used to
+construct other short vectors.  See
+http://ispc.github.com/ispc.html#short-vector-types
+for more details.  (Thanks to Pete Couperus for implementing this code!).
+
+int8 and int16 datatypes are now supported.  It is still generally more
+efficient to use int32 for intermediate computations, even if the in-memory
+format is int8 or int16.
+
+There are now standard library routines to convert to and from 'half'-format
+floating-point values (half_to_float() and float_to_half()).
+
+There is a new example with an implementation of Perlin's Noise function
+(examples/noise).  It shows a speedup of approximately 4.2x versus a C
+implementation on OSX and a 2.9x speedup versus C on Windows.
+
+=== v1.0.4 === (18 July 2011)
+
+enums are now supported in ispc; see the section on enumeration types in
+the documentation (http://ispc.github.com/ispc.html#enumeration-types) for
+more information.
+
+bools are converted to integers with zero extension, not sign extension as
+before (i.e. a 'true' bool converts to the value one, not 'all bits on'.)
+For cases where sign extension is still desired, there is a
+sign_extend(bool) function in the standard library.
+
+Support for 64-bit types in the standard library is much more complete than
+before.
+
+64-bit integer constants are now supported by the parser.
+
+Storage for parameters to tasks is now allocated dynamically on Windows,
+rather than on the stack; with this fix, all tests now run correctly on
+Windows.
+
+There is now support for atomic swap and compare/exchange with float and
+double types.
+
+A number of additional small bugs have been fixed and a number of cases
+where the compiler would crash given a malformed program have been fixed.
+
 === v1.0.3 === (4 July 2011)

 ispc now has a bulit-in pre-processor (from LLVM's clang compiler).
--- a/docs/ispc.txt
+++ b/docs/ispc.txt
--- a/doxygen.cfg
+++ b/doxygen.cfg
@@ -31,7 +31,7 @@ PROJECT_NAME           = "Intel SPMD Program Compiler"
 # This could be handy for archiving the generated documentation or
 # if some version control system is used.

-PROJECT_NUMBER         = 1.0.3
+PROJECT_NUMBER         = 1.0.11

 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
 # base path where the generated documentation will be put.
@@ -585,7 +585,6 @@ INPUT                  = builtins.h \
                         ctx.h \
                         decl.h \
                         expr.h \
-                         gatherbuf.h \
                         ispc.h \
                         llvmutil.h \
                         module.h \
@@ -598,7 +597,6 @@ INPUT                  = builtins.h \
                         ctx.cpp \
                         decl.cpp \
                         expr.cpp \
-                         gatherbuf.cpp \
                         ispc.cpp \
                         llvmutil.cpp \
                         main.cpp \
@@ -610,7 +608,7 @@ INPUT                  = builtins.h \
                         util.cpp \
                         parse.yy \
                         lex.ll \
-                         stdlib-c.c
+                         builtins-c.c

 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
--- a/examples/README.txt
+++ b/examples/README.txt
@@ -13,6 +13,7 @@ against regular serial C++ implementations, printing out a comparison of
 the runtimes and the speedup delivered by ispc.  It may be instructive to
 do a side-by-side diff of the C++ and ispc implementations of these
 algorithms to learn more about wirting ispc code.
+
 
 AOBench
 =======
@@ -27,6 +28,7 @@ It executes the program for the given number of iterations, rendering an
 (xres x yres) image each time and measuring the computation time with both
 serial and ispc implementations.

+
 AOBench_Instrumented
 ====================

@@ -40,12 +42,47 @@ is provided in the instrument.cpp file.
 *** Note: on Linux, this example currently hits an assertion in LLVM during
 *** compilation

+
+Deferred
+========
+
+This is an extensive example of using ispc for efficient
+deferred shading of scenes with thousands of lights; it's an implementation
+of the algorithm that Johan Andersson described at SIGGRAPH 2009,
+implemented by Andrew Lauritzen and Jefferson Montgomery.  The basic idea
+is that a pre-rendered G-buffer is partitioned into tiles, and in each
+tile, the set of lights that contribute to the tile is first computed.
+Then, the pixels in the tile are shaded using just those light
+sources. (See slides 19-29 of
+http://s09.idav.ucdavis.edu/talks/04-JAndersson-ParallelFrostbite-Siggraph09.pdf
+for more details on the algorithm.)
+
+This directory includes three implementations of the algorithm:
+
+- An ispc implementation that first does a static partitioning of the
+  screen into tiles to parallelize across the CPU cores.  Within each tile
+  ispc kernels provide highly efficient implementations of the light
+  culling and shading calculations.
+- A "best practices" serial C++ implementation.  This implementation does a
+  dynamic partitioning of the screen, refining tiles with significant Z
+  depth complexity (these tiles often have a large number of lights that
+  affect them).  Within each final tile, the pixels are shaded using
+  regular C++ code.
+- If the Cilk extensions are available in your compiler, an ispc
+  implementation that uses Cilk will also be built.
+  (See http://software.intel.com/en-us/articles/intel-cilk-plus/).  Like 
+  the "best practices" serial implementation, this version does dynamic
+  tile partitioning for better load balancing and then uses ispc for the
+  light culling and shading.
+
+
 Mandelbrot
 ==========

 Mandelbrot set generation.  This example is extensively documented at the
 http://ispc.github.com/example.html page.

+
 Mandelbrot_tasks
 ================

@@ -57,6 +94,14 @@ Linux, a pthreads-based task system is used (tasks_pthreads.cpp).  When
 using tasks with ispc, no task system is mandated; the user is free to plug
 in any task system they want, for ease of interoperating with existing task
 systems.
+
+
+Noise
+=====
+
+This example has an implementation of Ken Perlin's procedural "noise"
+function, as described in his 2002 "Improving Noise" SIGGRAPH paper.
+
 
 Options
 =======
@@ -64,6 +109,7 @@ Options
 This program implements both the Black-Scholes and Binomial options pricing
 models in both ispc and regular serial C++ code.

+
 RT
 ==

@@ -80,9 +126,25 @@ and triangle intersection code from pbrt; see the pbrt source code and/or
 "Physically Based Rendering" book for more about the basic algorithmic
 details.

+
 Simple
 ======

 This is a simple "hello world" type program that shows a ~10 line
 application program calling out to a ~5 line ispc program to do a simple
 computation.
+
+
+Volume
+======
+
+Ray-marching volume rendering, with single scattering lighting model.  To
+run it, specify a camera parameter file and a volume density file, e.g.:
+
+volume camera.dat density_highres.vol
+
+(See, e.g. Chapters 11 and 16 of "Physically Based Rendering" for
+information about the algorithm implemented here.)  The volume data set
+included here was generated by the example implementation of the "Wavelet
+Turbulence for Fluid Simulation" SIGGRAPH 2008 paper by Kim et
+al. (http://www.cs.cornell.edu/~tedkim/WTURB/)
--- a/examples/aobench/.gitignore
+++ b/examples/aobench/.gitignore
@@ -0,0 +1,2 @@
+ao
+*.ppm
--- a/examples/aobench/Makefile
+++ b/examples/aobench/Makefile
@@ -1,8 +1,18 @@

-CXX=g++ -m64
-CXXFLAGS=-Iobjs/ -O3 -Wall
+ARCH = $(shell uname)
+
+TASK_CXX=../tasksys.cpp
+TASK_LIB=-lpthread
+TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
+
+CXX=g++
+CXXFLAGS=-Iobjs/ -O3 -Wall -m64
 ISPC=ispc
-ISPCFLAGS=-O2 --fast-math --arch=x86-64
+ISPCFLAGS=-O2 --target=sse2,sse4,avx --arch=x86-64
+
+ISPC_OBJS=objs/ao_ispc.o objs/ao_ispc_sse2.o objs/ao_ispc_sse4.o \
+	objs/ao_ispc_avx.o
+OBJS=objs/ao.o objs/ao_serial.o $(ISPC_OBJS) $(TASK_OBJ)

 default: ao

@@ -14,13 +24,16 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ ao

-ao: dirs objs/ao.o objs/ao_serial.o objs/ao_ispc.o
-	$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/ao_serial.o -lm -lpthread
+ao: dirs $(OBJS) $(TASK_OBJ)
+	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)

 objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@

+objs/%.o: ../%.cpp
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
 objs/ao.o: objs/ao_ispc.h 

-objs/%_ispc.h objs/%_ispc.o: %.ispc
+objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
 	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/aobench/ao.cpp
+++ b/examples/aobench/ao.cpp
@@ -100,6 +100,7 @@ savePPM(const char *fname, int w, int h)
    fprintf(fp, "255\n");
    fwrite(img, w * h * 3, 1, fp);
    fclose(fp);
+    printf("Wrote image file %s\n", fname);
 }


@@ -137,10 +138,30 @@ int main(int argc, char **argv)
    }

    // Report results and save image
-    printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n", minTimeISPC, 
-           width, height);
+    printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n", 
+           minTimeISPC, width, height);
    savePPM("ao-ispc.ppm", width, height); 

+    //
+    // Run the ispc + tasks path, test_iterations times, and report the
+    // minimum time for any of them.
+    //
+    double minTimeISPCTasks = 1e30;
+    for (unsigned int i = 0; i < test_iterations; i++) {
+        memset((void *)fimg, 0, sizeof(float) * width * height * 3);
+        assert(NSUBSAMPLES == 2);
+
+        reset_and_start_timer();
+        ao_ispc_tasks(width, height, NSUBSAMPLES, fimg);
+        double t = get_elapsed_mcycles();
+        minTimeISPCTasks = std::min(minTimeISPCTasks, t);
+    }
+
+    // Report results and save image
+    printf("[aobench ispc + tasks]:\t\t[%.3f] M cycles (%d x %d image)\n", 
+           minTimeISPCTasks, width, height);
+    savePPM("ao-ispc-tasks.ppm", width, height); 
+
    //
    // Run the serial path, again test_iteration times, and report the
    // minimum time.
@@ -157,7 +178,8 @@ int main(int argc, char **argv)
    // Report more results, save another image...
    printf("[aobench serial]:\t\t[%.3f] M cycles (%d x %d image)\n", minTimeSerial, 
           width, height);
-    printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minTimeSerial / minTimeISPC);
+    printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", 
+           minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
    savePPM("ao-serial.ppm", width, height); 
        
    return 0;
--- a/examples/aobench/ao.ispc
+++ b/examples/aobench/ao.ispc
@@ -203,8 +203,9 @@ ambient_occlusion(reference Isect isect, reference Plane plane,
 /* Compute the image for the scanlines from [y0,y1), for an overall image
   of width w and height h.
 */
-void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h, 
-                  uniform int nsubsamples, reference uniform float image[]) {
+static void ao_scanlines(uniform int y0, uniform int y1, uniform int w, 
+                         uniform int h,  uniform int nsubsamples, 
+                         reference uniform float image[]) {
    static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
    static Sphere spheres[3] = {
        { { -2.0f, 0.0f, -3.5f }, 0.5f },
@@ -231,6 +232,9 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
    // direction we do per iteration and ny the number in y.
    uniform int nx = 1, ny = 1;

+    // FIXME: We actually need ny to be 1 regardless of the decomposition,
+    // since the task decomposition is one scanline high.
+
    if (programCount == 8) {
        // Do two pixels at once in the x direction
        nx = 2;
@@ -239,19 +243,21 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
            ++du;
    }
    else if (programCount == 16) {
-        // Two at once in both x and y
-        nx = ny = 2;
-        if ((programIndex >= 4 && programIndex < 8) || programIndex >= 12)
+        nx = 4;
+        ny = 1;
+        if (programIndex >= 4 && programIndex < 8)
            ++du;
-        if (programIndex >= 8)  
-            ++dv;
+        if (programIndex >= 8 && programIndex < 12)
+            du += 2;
+        if (programIndex >= 12)
+            du += 3;
    }

    // Now loop over all of the pixels, stepping in x and y as calculated
    // above.  (Assumes that ny divides y and nx divides x...)
    for (uniform int y = y0; y < y1; y += ny) {
        for (uniform int x = 0; x < w; x += nx)  {
-            // Figur out x,y pixel in NDC
+            // Figure out x,y pixel in NDC
            float px =  (x + du - (w / 2.0f)) / (w / 2.0f);
            float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
            float ret = 0.f;
@@ -293,7 +299,7 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,

            // offset to the first pixel in the image
            uniform int offset = 3 * (y * w + x);
-            for (uniform int p = 0; p < programCount; p += 4, ++offset) {
+            for (uniform int p = 0; p < programCount; p += 4, offset += 3) {
                // Get the four sample values for this pixel
                uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
                    retArray[p+3];
@@ -315,3 +321,15 @@ export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
                    uniform float image[]) {
    ao_scanlines(0, h, w, h, nsubsamples, image);
 }
+
+
+// Task body for the task-parallel path: each task instance renders exactly
+// one scanline, the one whose row index equals its taskIndex, by handing
+// the half-open row range [taskIndex, taskIndex+1) to ao_scanlines().
+static void task ao_task(uniform int width, uniform int height, 
+                         uniform int nsubsamples, uniform float image[]) {
+    uniform int firstRow = taskIndex;
+    uniform int endRow = firstRow + 1;
+    ao_scanlines(firstRow, endRow, width, height, nsubsamples, image);
+}
+
+
+// Exported entry point for the task-parallel version of the renderer.
+// Launches h instances of ao_task -- one per scanline of the w x h image --
+// each of which renders the row selected by its taskIndex into image.
+export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, 
+                          uniform float image[]) {
+    launch[h] < ao_task(w, h, nsubsamples, image) >;
+}
--- a/examples/aobench/ao_serial.cpp
+++ b/examples/aobench/ao_serial.cpp
@@ -140,7 +140,7 @@ ray_plane_intersect(Isect &isect, Ray &ray,
    float d = -dot(plane.p, plane.n);
    float v = dot(ray.dir, plane.n);

-    if (fabsf(v) < 1.0e-17) 
+    if (fabsf(v) < 1.0e-17f) 
        return;
    else {
        float t = -(dot(ray.org, plane.n) + d) / v;
@@ -183,11 +183,11 @@ orthoBasis(vec basis[3], const vec &n) {
    basis[2] = n;
    basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;

-    if ((n.x < 0.6) && (n.x > -0.6)) {
+    if ((n.x < 0.6f) && (n.x > -0.6f)) {
        basis[1].x = 1.0;
-    } else if ((n.y < 0.6) && (n.y > -0.6)) {
+    } else if ((n.y < 0.6f) && (n.y > -0.6f)) {
        basis[1].y = 1.0;
-    } else if ((n.z < 0.6) && (n.z > -0.6)) {
+    } else if ((n.z < 0.6f) && (n.z > -0.6f)) {
        basis[1].z = 1.0;
    } else {
        basis[1].x = 1.0;
@@ -224,7 +224,7 @@ ambient_occlusion(Isect &isect, Plane &plane,
            float phi   = 2.0f * M_PI * drand48();
            float x = cosf(phi) * theta;
            float y = sinf(phi) * theta;
-            float z = sqrtf(1.0 - theta * theta);
+            float z = sqrtf(1.0f - theta * theta);

            // local . global
            float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
@@ -236,14 +236,14 @@ ambient_occlusion(Isect &isect, Plane &plane,
            ray.dir.y = ry;
            ray.dir.z = rz;

-            occIsect.t   = 1.0e+17;
+            occIsect.t   = 1.0e+17f;
            occIsect.hit = 0;

            for (int snum = 0; snum < 3; ++snum)
                ray_sphere_intersect(occIsect, ray, spheres[snum]); 
            ray_plane_intersect (occIsect, ray, plane); 

-            if (occIsect.hit) occlusion += 1.0;
+            if (occIsect.hit) occlusion += 1.f;
        }
    }

@@ -280,10 +280,10 @@ static void ao_scanlines(int y0, int y1, int w, int h, int nsubsamples,

                    ray.dir.x = px;
                    ray.dir.y = py;
-                    ray.dir.z = -1.0;
+                    ray.dir.z = -1.0f;
                    vnormalize(ray.dir);

-                    isect.t   = 1.0e+17;
+                    isect.t   = 1.0e+17f;
                    isect.hit = 0;

                    for (int snum = 0; snum < 3; ++snum)
--- a/examples/aobench/aobench.vcxproj
+++ b/examples/aobench/aobench.vcxproj
@@ -21,22 +21,23 @@
  <ItemGroup>
    <ClCompile Include="ao.cpp" />
    <ClCompile Include="ao_serial.cpp" />
+    <ClCompile Include="../tasksys.cpp" />
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="ao.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
    </CustomBuild>
  </ItemGroup>
  <PropertyGroup Label="Globals">
@@ -85,15 +86,19 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ExecutablePath);$(ProjectDir)..\..</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -102,6 +107,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -117,6 +123,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -134,6 +141,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -152,6 +160,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -164,4 +173,4 @@
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
-</Project>
+</Project>
--- a/examples/aobench_instrumented/.gitignore
+++ b/examples/aobench_instrumented/.gitignore
@@ -0,0 +1,2 @@
+ao
+*.ppm
--- a/examples/aobench_instrumented/Makefile
+++ b/examples/aobench_instrumented/Makefile
@@ -2,7 +2,7 @@
 CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -g3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --fast-math --instrument --arch=x86-64
+ISPCFLAGS=-O2 --instrument --arch=x86-64 --target=sse2

 default: ao

--- a/examples/aobench_instrumented/ao.cpp
+++ b/examples/aobench_instrumented/ao.cpp
@@ -32,7 +32,6 @@
 */

 #ifdef _MSC_VER
-#define _CRT_SECURE_NO_WARNINGS
 #define NOMINMAX
 #pragma warning (disable: 4244)
 #pragma warning (disable: 4305)
@@ -99,9 +98,11 @@ savePPM(const char *fname, int w, int h)
    fprintf(fp, "255\n");
    fwrite(img, w * h * 3, 1, fp);
    fclose(fp);
+    printf("Wrote image file %s\n", fname);
 }


+
 int main(int argc, char **argv)
 {
    if (argc != 4) {
--- a/examples/aobench_instrumented/aobench_instrumented.vcxproj
+++ b/examples/aobench_instrumented/aobench_instrumented.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -25,18 +25,18 @@
  <ItemGroup>
    <CustomBuild Include="ao.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --instrument --target=sse2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --instrument
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --instrument --target=sse2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --instrument --target=sse2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --instrument
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --instrument --target=sse2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
    </CustomBuild>
  </ItemGroup>
  <PropertyGroup Label="Globals">
@@ -85,15 +85,19 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -101,7 +105,8 @@
      </PrecompiledHeader>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -114,7 +119,8 @@
      </PrecompiledHeader>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -129,7 +135,8 @@
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -146,7 +153,8 @@
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
--- a/examples/deferred/Makefile
+++ b/examples/deferred/Makefile
@@ -0,0 +1,38 @@
+
+ARCH = $(shell uname)
+
+TASK_CXX=../tasksys.cpp
+TASK_LIB=-lpthread
+TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
+
+CXX=g++
+CXXFLAGS=-Iobjs/ -O3 -Wall -m64
+ISPC=ispc
+ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64 --math-lib=fast
+
+OBJS=objs/main.o objs/common.o objs/kernels_ispc.o objs/kernels_ispc_sse2.o \
+	objs/kernels_ispc_sse4.o objs/kernels_ispc_avx.o \
+	objs/dynamic_c.o objs/dynamic_cilk.o
+
+default: deferred_shading
+
+.PHONY: dirs clean
+.PRECIOUS: objs/kernels_ispc.h
+
+dirs:
+	/bin/mkdir -p objs/
+
+clean:
+	/bin/rm -rf objs *~ deferred_shading
+
+deferred_shading: dirs $(OBJS) $(TASK_OBJ)
+	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) $(TASK_OBJ) -lm $(TASK_LIB)
+
+objs/%.o: %.cpp objs/kernels_ispc.h deferred.h
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+objs/%.o: ../%.cpp
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
+	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/deferred/common.cpp
+++ b/examples/deferred/common.cpp
@@ -0,0 +1,209 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#include <fcntl.h>
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <algorithm>
+#include <assert.h>
+#include <vector>
+#ifdef ISPC_IS_WINDOWS
+  #define WIN32_LEAN_AND_MEAN
+  #include <windows.h>
+#endif
+#ifdef ISPC_IS_LINUX
+  #include <malloc.h>
+#endif
+#include "deferred.h"
+#include "../timing.h"
+
+///////////////////////////////////////////////////////////////////////////
+
+// Allocates `size` bytes at an address that is a multiple of `alignment`.
+// `alignment` is assumed to be a power of two (the Apple path relies on bit
+// masking).  The result must be released with lAlignedFree(), not free().
+// Returns NULL if the underlying allocation fails.
+// NOTE(review): if none of the ISPC_IS_* platform macros is defined the
+// function has no return statement -- confirm all build targets define one.
+static void *
+lAlignedMalloc(size_t size, int32_t alignment) {
+#ifdef ISPC_IS_WINDOWS
+    return _aligned_malloc(size, alignment);
+#endif
+#ifdef ISPC_IS_LINUX
+    return memalign(alignment, size);
+#endif
+#ifdef ISPC_IS_APPLE
+    // Over-allocate: up to alignment-1 bytes of padding to reach an aligned
+    // address, plus one pointer slot directly in front of the aligned block
+    // where the pointer returned by malloc() is stashed for lAlignedFree().
+    void *mem = malloc(size + (alignment-1) + sizeof(void*));
+    if (mem == NULL)
+        return NULL;
+    // Skip the pointer slot, then round up to the next aligned address.
+    // Rounding up (rather than always adding a non-zero offset) keeps the
+    // padding at most alignment-1 bytes, so the caller's `size` bytes always
+    // fit inside the allocation even when the address is already aligned.
+    uintptr_t addr = reinterpret_cast<uintptr_t>(mem) + sizeof(void*);
+    addr = (addr + uintptr_t(alignment - 1)) & ~uintptr_t(alignment - 1);
+    char *amem = reinterpret_cast<char *>(addr);
+    ((void**)amem)[-1] = mem;
+    return amem;
+#endif
+}
+
+
+// Releases a block obtained from lAlignedMalloc().  A NULL pointer is
+// ignored on every platform; without the guard the Apple path would read
+// the stashed pointer from in front of a NULL address.
+static void
+lAlignedFree(void *ptr) {
+    if (ptr == NULL)
+        return;
+#ifdef ISPC_IS_WINDOWS
+    _aligned_free(ptr);
+#endif
+#ifdef ISPC_IS_LINUX
+    free(ptr);
+#endif
+#ifdef ISPC_IS_APPLE
+    // The pointer originally returned by malloc() was stored in the slot
+    // just before the aligned block by lAlignedMalloc().
+    free(((void**)ptr)[-1]);
+#endif
+}
+
+
+// Sets nPixels to width*height and allocates three separate aligned byte
+// planes (r, g, b) of nPixels bytes each.  The planes are not zeroed here;
+// see clear().
+Framebuffer::Framebuffer(int width, int height) {
+    nPixels = width*height;
+
+    uint8_t **planes[3] = { &r, &g, &b };
+    for (int c = 0; c < 3; ++c)
+        *planes[c] = (uint8_t *)lAlignedMalloc(nPixels, ALIGNMENT_BYTES);
+}
+
+
+// Releases the three color planes allocated by the constructor.
+Framebuffer::~Framebuffer() {
+    uint8_t *planes[3] = { r, g, b };
+    for (int c = 0; c < 3; ++c)
+        lAlignedFree(planes[c]);
+}
+
+
+// Zeroes all nPixels bytes of each of the r, g and b planes.
+void
+Framebuffer::clear() {
+    uint8_t *planes[3] = { r, g, b };
+    for (int c = 0; c < 3; ++c)
+        memset(planes[c], 0, nPixels);
+}
+
+
+// Loads a deferred-shading input file from `path`.  The file consists of an
+// ispc::InputHeader followed by a data chunk of header.inputDataChunkSize
+// bytes; the chunk is read into aligned memory and every pointer in
+// input->arrays is set to point into it at the offset recorded in
+// header.inputDataArrayOffsets.
+//
+// Returns the newly allocated InputData on success.  Returns NULL if the
+// file can't be opened, is shorter than the header says, or the chunk can't
+// be allocated; in those cases the file handle and any partially built
+// state are released before returning.
+InputData *
+CreateInputDataFromFile(const char *path) {
+    FILE *in = fopen(path, "rb");
+    if (!in) return 0;
+
+    InputData *input = new InputData;
+
+    // Load header
+    if (fread(&input->header, sizeof(ispc::InputHeader), 1, in) != 1) {
+        fprintf(stderr, "Premature EOF reading file \"%s\"\n", path);
+        delete input;
+        fclose(in);
+        return NULL;
+    }
+
+    // Load data chunk and update pointers
+    input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize, 
+                                             ALIGNMENT_BYTES);
+    if (input->chunk == NULL) {
+        fprintf(stderr, "Unable to allocate memory for data in file \"%s\"\n", 
+                path);
+        delete input;
+        fclose(in);
+        return NULL;
+    }
+    if (fread(input->chunk, input->header.inputDataChunkSize, 1, in) != 1) {
+        fprintf(stderr, "Premature EOF reading file \"%s\"\n", path);
+        lAlignedFree(input->chunk);
+        delete input;
+        fclose(in);
+        return NULL;
+    }
+
+    // The whole file has been read; the handle is no longer needed.
+    fclose(in);
+
+    // Each array lives inside the chunk at the byte offset stored in the
+    // header for the corresponding InputDataArraysEnum entry.
+    input->arrays.zBuffer =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaZBuffer]];
+    input->arrays.normalEncoded_x =
+        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_x]];
+    input->arrays.normalEncoded_y =
+        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_y]];
+    input->arrays.specularAmount =
+        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularAmount]];
+    input->arrays.specularPower =
+        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularPower]];
+    input->arrays.albedo_x =
+        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_x]];
+    input->arrays.albedo_y =
+        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_y]];
+    input->arrays.albedo_z =
+        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_z]];
+    input->arrays.lightPositionView_x =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_x]];
+    input->arrays.lightPositionView_y =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_y]];
+    input->arrays.lightPositionView_z =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_z]];
+    input->arrays.lightAttenuationBegin =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationBegin]];
+    input->arrays.lightColor_x =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_x]];
+    input->arrays.lightColor_y =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_y]];
+    input->arrays.lightColor_z =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_z]];
+    input->arrays.lightAttenuationEnd =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationEnd]];
+
+    return input;
+}
+
+
+// Frees the data chunk owned by `input`.  The pointers in input->arrays
+// refer into that chunk and must not be used afterwards.  A NULL `input`
+// (which is what CreateInputDataFromFile() returns on failure) is ignored.
+// NOTE(review): the InputData object itself is allocated with `new` in
+// CreateInputDataFromFile() and is not released here -- confirm whether
+// callers are expected to delete it.
+void DeleteInputData(InputData *input) {
+    if (input == NULL)
+        return;
+    lAlignedFree(input->chunk);
+    // Clear the pointer so a repeated call doesn't free the chunk twice.
+    input->chunk = NULL;
+}
+
+
+// Writes `framebuffer` to `filename` as a binary PPM ("P6") image whose
+// width and height are taken from input->header.  The framebuffer stores
+// red, green and blue in separate planes, so they are first interleaved
+// into a temporary RGB buffer.  If the buffer can't be allocated or the
+// file can't be opened or written, a message is printed to stderr and the
+// function returns; the temporary buffer and file handle are always
+// released.
+void WriteFrame(const char *filename, const InputData *input,
+                const Framebuffer &framebuffer) {
+    // Deswizzle and copy to RGB output
+    // Doesn't need to be fast... only happens once
+    const int nPixels = input->header.framebufferWidth * 
+        input->header.framebufferHeight;
+    const size_t imageBytes = 3 * (size_t)nPixels;
+    uint8_t* framebufferAOS = (uint8_t *)lAlignedMalloc(imageBytes, ALIGNMENT_BYTES);
+    if (framebufferAOS == NULL) {
+        fprintf(stderr, "Unable to allocate memory to write \"%s\"\n", filename);
+        return;
+    }
+
+    // Every byte of the buffer is written by this loop, so it doesn't need
+    // to be cleared beforehand.
+    for (int i = 0; i < nPixels; ++i) {
+        framebufferAOS[3 * i + 0] = framebuffer.r[i];
+        framebufferAOS[3 * i + 1] = framebuffer.g[i];
+        framebufferAOS[3 * i + 2] = framebuffer.b[i];
+    }
+
+    // Write out simple PPM file
+    FILE *out = fopen(filename, "wb");
+    if (out == NULL) {
+        fprintf(stderr, "Unable to open file \"%s\" for writing\n", filename);
+        lAlignedFree(framebufferAOS);
+        return;
+    }
+    fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth, 
+            input->header.framebufferHeight);
+    // fwrite() reports 0 items for a zero-byte request, so only treat a
+    // short count as an error when there was something to write.
+    if (imageBytes > 0 && fwrite(framebufferAOS, imageBytes, 1, out) != 1)
+        fprintf(stderr, "Error writing image data to \"%s\"\n", filename);
+    // Close the file so buffered data is flushed and the handle isn't leaked.
+    fclose(out);
+
+    lAlignedFree(framebufferAOS);
+}
--- a/examples/deferred/data/pp1280x720.bin
+++ b/examples/deferred/data/pp1280x720.bin
--- a/examples/deferred/data/pp1920x1200.bin
+++ b/examples/deferred/data/pp1920x1200.bin
--- a/examples/deferred/deferred.h
+++ b/examples/deferred/deferred.h
@@ -0,0 +1,108 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifndef DEFERRED_H
+#define DEFERRED_H
+
+// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
+#define MIN_TILE_WIDTH 16
+#define MIN_TILE_HEIGHT 16
+#define MAX_LIGHTS 1024
+
+// Identifies each data array stored in the input chunk.  The enumerator is
+// used as an index into InputHeader::inputDataArrayOffsets, whose entry gives
+// the offset of that array from the start of the chunk.  This enum sits
+// outside the "#ifndef ISPC" section of this header.
+enum InputDataArraysEnum {
+    // Per-pixel G-buffer arrays
+    idaZBuffer = 0,
+    idaNormalEncoded_x,
+    idaNormalEncoded_y,
+    idaSpecularAmount,
+    idaSpecularPower,
+    idaAlbedo_x,
+    idaAlbedo_y,
+    idaAlbedo_z,
+    // Per-light arrays
+    idaLightPositionView_x,
+    idaLightPositionView_y,
+    idaLightPositionView_z,
+    idaLightAttenuationBegin,
+    idaLightColor_x,
+    idaLightColor_y,
+    idaLightColor_z,
+    idaLightAttenuationEnd,
+
+    // Number of arrays above; must remain the last enumerator
+    idaNum
+};
+
+#ifndef ISPC
+
+#include <stdint.h>
+#include "kernels_ispc.h"
+
+#define ALIGNMENT_BYTES 64
+
+#define MAX_LIGHTS 1024
+
+#define VISUALIZE_LIGHT_COUNT 0
+
+// Everything loaded for one input scene.
+struct InputData
+{
+    // Header data; provides the framebuffer dimensions and the per-array
+    // offsets (inputDataArrayOffsets) into `chunk`
+    ispc::InputHeader header;
+    // Typed pointers to the individual arrays; these point into `chunk`
+    ispc::InputDataArrays arrays;
+    // Aligned block holding all array data; released by DeleteInputData()
+    uint8_t *chunk;
+};
+
+
+// 8-bit RGB framebuffer with one separate array per color channel; each
+// array is indexed by pixel.  The object is non-copyable.
+// NOTE(review): the constructor, destructor and clear() are defined outside
+// this header; presumably they allocate, free and zero the channel arrays --
+// confirm against their definitions.
+struct Framebuffer {
+    Framebuffer(int width, int height);
+    ~Framebuffer();
+
+    void clear();
+
+    // Red, green and blue channel arrays
+    uint8_t *r, *g, *b;
+
+private:
+    int nPixels;
+    // Declared private and left undefined to disable copying.  The assignment
+    // operator must take a const reference: a pointer parameter does not
+    // count as a copy assignment operator, so the compiler would still
+    // generate a public one.
+    Framebuffer(const Framebuffer &);
+    Framebuffer &operator=(const Framebuffer &);
+};
+
+
+// Builds an InputData from the file at `path`; the returned object's `arrays`
+// pointers reference its `chunk`.
+InputData *CreateInputDataFromFile(const char *path);
+// Releases the data chunk owned by `input`.
+void DeleteInputData(InputData *input);
+// Writes `framebuffer` to `filename` as a binary PPM image, using the
+// framebuffer dimensions stored in input->header.
+void WriteFrame(const char *filename, const InputData *input,
+                const Framebuffer &framebuffer);
+// One-time setup for the dynamic C implementation; must be given the input
+// whose framebuffer dimensions will be rendered.
+void InitDynamicC(InputData *input);
+// NOTE(review): presumably the Cilk counterpart of InitDynamicC -- confirm.
+void InitDynamicCilk(InputData *input);
+// NOTE(review): presumably render `input` into `framebuffer` with the C and
+// Cilk implementations respectively -- confirm against their definitions.
+void DispatchDynamicC(InputData *input, Framebuffer *framebuffer);
+void DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer);
+
+#endif // !ISPC
+
+#endif // DEFERRED_H
--- a/examples/deferred/deferred_shading.vcxproj
+++ b/examples/deferred/deferred_shading.vcxproj
@@ -0,0 +1,178 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{87f53c53-957e-4e91-878a-bc27828fb9eb}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>mandelbrot</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="common.cpp" />
+    <ClCompile Include="dynamic_c.cpp" />
+    <ClCompile Include="dynamic_cilk.cpp" />
+    <ClCompile Include="main.cpp" />
+    <ClCompile Include="../tasksys.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="kernels.ispc">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/examples/deferred/dynamic_c.cpp
+++ b/examples/deferred/dynamic_c.cpp
@@ -0,0 +1,870 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#include "deferred.h"
+#include "kernels_ispc.h"
+#include <assert.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <algorithm>
+
+#ifdef _MSC_VER
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#ifdef ISPC_IS_LINUX
+#include <malloc.h>
+#endif // ISPC_IS_LINUX
+
+// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
+#define MIN_TILE_WIDTH 16
+#define MIN_TILE_HEIGHT 16
+
+
+#define DYNAMIC_TREE_LEVELS 5
+// If this is set to 1 then the result will be identical to the static version
+#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
+
+// Allocates `size` bytes at an address that is a multiple of `alignment`.
+//
+// `alignment` must be a power of two (the Apple path masks with
+// alignment - 1) and at least sizeof(void*) there, since a pointer is stored
+// immediately below the returned block.  Returns a null pointer if the
+// underlying allocation fails or the platform is not one of the three
+// handled below.  Release the memory with lAlignedFree().
+static void *
+lAlignedMalloc(size_t size, int32_t alignment) {
+#ifdef ISPC_IS_WINDOWS
+    return _aligned_malloc(size, alignment);
+#endif
+#ifdef ISPC_IS_LINUX
+    return memalign(alignment, size);
+#endif
+#ifdef ISPC_IS_APPLE
+    // Over-allocate: sizeof(void*) bytes to remember the pointer malloc()
+    // returned, plus up to alignment-1 bytes of slack for rounding up.
+    void *mem = malloc(size + (alignment-1) + sizeof(void*));
+    if (mem == 0)
+        return 0;
+    // Round up to the next multiple of alignment.  This advances by at most
+    // alignment-1 bytes, matching the slack reserved above; unconditionally
+    // adding "alignment - remainder" would skip a whole `alignment` bytes for
+    // an already-aligned address and overrun the allocation by one byte.
+    uintptr_t addr = reinterpret_cast<uintptr_t>(mem) + sizeof(void*);
+    addr = (addr + (alignment - 1)) & ~static_cast<uintptr_t>(alignment - 1);
+    char *amem = reinterpret_cast<char *>(addr);
+    // Stash the original pointer where lAlignedFree() expects to find it
+    ((void**)amem)[-1] = mem;
+    return amem;
+#endif
+#if !defined(ISPC_IS_WINDOWS) && !defined(ISPC_IS_LINUX) && !defined(ISPC_IS_APPLE)
+    // Unsupported platform: fail the allocation rather than fall off the end
+    // of a non-void function.
+    (void)size;
+    (void)alignment;
+    return 0;
+#endif
+}
+
+
+// Releases memory obtained from lAlignedMalloc().  A null `ptr` is accepted
+// on every platform handled here.
+static void
+lAlignedFree(void *ptr) {
+#ifdef ISPC_IS_WINDOWS
+    _aligned_free(ptr);
+#endif
+#ifdef ISPC_IS_LINUX
+    free(ptr);
+#endif
+#ifdef ISPC_IS_APPLE
+    // The pointer returned by malloc() was stored just below the aligned
+    // block; a null ptr has no such slot, so it must not be dereferenced.
+    if (ptr != 0)
+        free(((void**)ptr)[-1]);
+#endif
+}
+
+
+// Finds the view-space depth range of the pixels in
+// [tileStartX, tileEndX) x [tileStartY, tileEndY).
+//
+// Each depth-buffer value is unprojected with cameraProj_43 / (z -
+// cameraProj_33).  Depths outside [cameraNear, cameraFar) are ignored.  If no
+// pixel qualifies, *minZ is cameraFar and *maxZ is cameraNear.
+static void
+ComputeZBounds(int tileStartX, int tileEndX,
+               int tileStartY, int tileEndY,
+               // G-buffer data
+               float zBuffer[],
+               int gBufferWidth,
+               // Camera data
+               float cameraProj_33, float cameraProj_43,
+               float cameraNear, float cameraFar,
+               // Output
+               float *minZ, float *maxZ)
+{
+    // Start from an inverted range so the first valid sample sets both ends
+    float nearest = cameraFar;
+    float farthest = cameraNear;
+
+    for (int row = tileStartY; row < tileEndY; ++row) {
+        const int rowBase = row * gBufferWidth;
+        for (int col = tileStartX; col < tileEndX; ++col) {
+            // Unproject depth buffer Z value into view space
+            const float depth = cameraProj_43 / (zBuffer[rowBase + col] - cameraProj_33);
+
+            // Skip skybox/background or otherwise invalid pixels
+            const bool valid = (depth < cameraFar) && (depth >= cameraNear);
+            if (!valid)
+                continue;
+
+            if (depth < nearest)
+                nearest = depth;
+            if (farthest < depth)
+                farthest = depth;
+        }
+    }
+
+    *minZ = nearest;
+    *maxZ = farthest;
+}
+
+
+// Computes the depth bounds of every tile in tile row `tileY`, storing the
+// result for tile column i in minZArray[i] and maxZArray[i].  numTilesY is
+// accepted but not used here.
+static void
+ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight,
+                  int numTilesX, int numTilesY,
+                  // G-buffer data
+                  float zBuffer[],
+                  int gBufferWidth,
+                  // Camera data
+                  float cameraProj_33, float cameraProj_43,
+                  float cameraNear, float cameraFar,
+                  // Output
+                  float minZArray[],
+                  float maxZArray[])
+{
+    // Every tile in this row covers the same range of pixel rows
+    const int pixelTop = tileY * tileHeight;
+    const int pixelBottom = pixelTop + tileHeight;
+
+    for (int column = 0; column < numTilesX; ++column) {
+        const int pixelLeft = column * tileWidth;
+        float tileMin, tileMax;
+        ComputeZBounds(pixelLeft, pixelLeft + tileWidth,
+                       pixelTop, pixelBottom,
+                       zBuffer, gBufferWidth, cameraProj_33, cameraProj_43,
+                       cameraNear, cameraFar, &tileMin, &tileMax);
+        minZArray[column] = tileMin;
+        maxZArray[column] = tileMax;
+    }
+}
+
+
+// Hierarchy of per-tile minimum/maximum view-space depths.
+//
+// Level 0 holds one (min, max) pair per base tile of tileWidth x tileHeight
+// pixels.  Each higher level halves the tile count in both dimensions
+// (rounding up), so a level-i tile spans (tileWidth << i) x (tileHeight << i)
+// pixels and combines up to 2x2 tiles of the level below it.
+//
+// The object owns the arrays it allocates and is therefore non-copyable.
+class MinMaxZTree
+{
+public:
+    // Currently (min) tile dimensions must divide gBuffer dimensions evenly
+    // Levels must be small enough that neither dimension goes below one tile
+    MinMaxZTree(
+        int tileWidth, int tileHeight, int levels,
+        int gBufferWidth, int gBufferHeight)
+        : mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
+    {
+        mNumTilesX = gBufferWidth / mTileWidth;
+        mNumTilesY = gBufferHeight / mTileHeight;
+
+        // Allocate arrays
+        mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
+        mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
+        for (int i = 0; i < mLevels; ++i) {
+            int x = NumTilesX(i);
+            int y = NumTilesY(i);
+            assert(x > 0);
+            assert(y > 0);
+            // NOTE: If the following two asserts fire it probably means that
+            // the base tile dimensions do not evenly divide the G-buffer dimensions
+            assert(x * (mTileWidth << i) >= gBufferWidth);
+            assert(y * (mTileHeight << i) >= gBufferHeight);
+            mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
+            mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
+        }
+    }
+
+    // Recomputes every level from `zBuffer`.  Level 0 is computed directly
+    // from the depth buffer, one tile row at a time; each further level is
+    // the min/max over the (up to) 2x2 block of tiles in the level below.
+    void Update(float *zBuffer, int gBufferPitchInElements,
+        float cameraProj_33, float cameraProj_43,
+        float cameraNear, float cameraFar)
+    {
+        for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
+            ComputeZBoundsRow(tileY, mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
+                              zBuffer, gBufferPitchInElements,
+                              cameraProj_33, cameraProj_43, cameraNear, cameraFar,
+                              mMinZArrays[0] + (tileY * mNumTilesX),
+                              mMaxZArrays[0] + (tileY * mNumTilesX));
+        }
+
+        // Generate other levels
+        for (int level = 1; level < mLevels; ++level) {
+            int destTilesX = NumTilesX(level);
+            int destTilesY = NumTilesY(level);
+            int srcLevel = level - 1;
+            int srcTilesX = NumTilesX(srcLevel);
+            int srcTilesY = NumTilesY(srcLevel);
+            for (int y = 0; y < destTilesY; ++y) {
+                for (int x = 0; x < destTilesX; ++x) {
+                    int srcX = x << 1;
+                    int srcY = y << 1;
+                    // NOTE: Ugly branches to deal with non-multiple dimensions at some levels
+                    // TODO: SSE branchless min/max is probably better...
+                    float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
+                    float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
+                    if (srcX + 1 < srcTilesX) {
+                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
+                                                                    (srcX + 1)]);
+                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
+                                                                    (srcX + 1)]);
+                        if (srcY + 1 < srcTilesY) {
+                            minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                        (srcX + 1)]);
+                            maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                        (srcX + 1)]);
+                        }
+                    }
+                    if (srcY + 1 < srcTilesY) {
+                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                    (srcX    )]);
+                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                    (srcX    )]);
+                    }
+                    mMinZArrays[level][y * destTilesX + x] = minZ;
+                    mMaxZArrays[level][y * destTilesX + x] = maxZ;
+                }
+            }
+        }
+    }
+
+    ~MinMaxZTree() {
+        for (int i = 0; i < mLevels; ++i) {
+            lAlignedFree(mMinZArrays[i]);
+            lAlignedFree(mMaxZArrays[i]);
+        }
+        lAlignedFree(mMinZArrays);
+        lAlignedFree(mMaxZArrays);
+    }
+
+    int Levels() const { return mLevels; }
+
+    // These round UP, so beware that the last tile for a given level may not be completely full
+    // TODO: Verify this...
+    int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
+    int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
+    int TileWidth(int level = 0) const { return (mTileWidth << level); }
+    int TileHeight(int level = 0) const { return (mTileHeight << level); }
+
+    // Depth bounds of tile (tileX, tileY) at `level`, as of the last Update()
+    float MinZ(int level, int tileX, int tileY) const {
+        return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
+    }
+    float MaxZ(int level, int tileX, int tileY) const {
+        return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
+    }
+
+private:
+    // Declared private and left undefined: the destructor frees the arrays
+    // below, so a member-wise copy would free them twice.
+    MinMaxZTree(const MinMaxZTree &);
+    MinMaxZTree &operator=(const MinMaxZTree &);
+
+    int mTileWidth;
+    int mTileHeight;
+    int mLevels;
+    int mNumTilesX;
+    int mNumTilesY;
+
+    // One array for each "level" in the tree
+    float **mMinZArrays;
+    float **mMaxZArrays;
+};
+
+// Min/max Z tile hierarchy used by the dynamic C implementation; created by
+// InitDynamicC().
+static MinMaxZTree *gMinMaxZTree = 0;
+
+// Creates the global min/max Z tree for the framebuffer dimensions given in
+// input->header.  Calling this again replaces the existing tree.
+void InitDynamicC(InputData *input) {
+    // Release the tree from any earlier call so re-initialization doesn't
+    // leak it; clear the pointer first in case the allocation below throws.
+    delete gMinMaxZTree;
+    gMinMaxZTree = 0;
+    gMinMaxZTree =
+        new MinMaxZTree(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
+                        input->header.framebufferWidth,
+                        input->header.framebufferHeight);
+}
+
+
+/* Reclassifies a tile's lights with respect to the four sub-tiles obtained by
+   splitting the tile at (tileMidX, tileMidY).
+
+   Sub-tiles are numbered 00, 10, 01, 11: bit 0 selects the side of the
+   vertical split, bit 1 the side of the horizontal split.  A light is kept
+   for a sub-tile when its sphere of radius attenuationEnd overlaps the
+   sub-tile's [minZ, maxZ] depth range and is not entirely on the other side
+   of either splitting plane.
+
+   The indices of the lights kept for sub-tile s are written to
+   subtileIndices starting at s * subtileIndicesPitch, and their count to
+   subtileNumLights[s]. */
+static void
+SplitTileMinMax(
+    int tileMidX, int tileMidY,
+    // Subtile data (00, 10, 01, 11)
+    float subtileMinZ[],
+    float subtileMaxZ[],
+    // G-buffer data
+    int gBufferWidth, int gBufferHeight,
+    // Camera data
+    float cameraProj_11, float cameraProj_22,
+    // Light Data
+    int lightIndices[],
+    int numLights,
+    float light_positionView_x_array[],
+    float light_positionView_y_array[],
+    float light_positionView_z_array[],
+    float light_attenuationEnd_array[],
+    // Outputs
+    int subtileIndices[],
+    int subtileIndicesPitch,
+    int subtileNumLights[]
+    )
+{
+    const float halfWidth = 0.5f * (float)gBufferWidth;
+    const float halfHeight = 0.5f * (float)gBufferHeight;
+
+    // Splitting planes through the eye: entry 0 is the vertical (x) split,
+    // entry 1 the horizontal (y) split.
+    float planeXY[2] = { -(cameraProj_11 * halfWidth),
+                          (cameraProj_22 * halfHeight) };
+    float planeZ[2] = { tileMidX - halfWidth,
+                        tileMidY - halfHeight };
+
+    for (int axis = 0; axis < 2; ++axis) {
+        // Normalize
+        const float invLength = 1.f / sqrtf(planeXY[axis] * planeXY[axis] +
+                                            planeZ[axis] * planeZ[axis]);
+        planeXY[axis] *= invLength;
+        planeZ[axis] *= invLength;
+    }
+
+    // Next write position in each sub-tile's slice of subtileIndices
+    int writePos[4];
+    for (int s = 0; s < 4; ++s)
+        writePos[s] = s * subtileIndicesPitch;
+
+    for (int n = 0; n < numLights; ++n) {
+        const int lightIndex = lightIndices[n];
+
+        const float lightX = light_positionView_x_array[lightIndex];
+        const float lightY = light_positionView_y_array[lightIndex];
+        const float lightZ = light_positionView_z_array[lightIndex];
+        const float radius = light_attenuationEnd_array[lightIndex];
+        const float negRadius = -radius;
+
+        // Signed distances from the light to the two splitting planes
+        const float dx = lightZ * planeZ[0] +
+            lightX * planeXY[0];
+        const float dy = lightZ * planeZ[1] +
+            lightY * planeXY[1];
+
+        // A plane only rejects sub-tiles when the light lies wholly on one side
+        const bool oneSideX = fabsf(dx) > radius;
+        const bool oneSideY = fabsf(dy) > radius;
+        const bool positiveX = dx > 0.0f;
+        const bool positiveY = dy > 0.0f;
+
+        for (int s = 0; s < 4; ++s) {
+            // Test lights again against subtile z bounds
+            bool keep = (lightZ - subtileMinZ[s] >= negRadius) &&
+                (subtileMaxZ[s] - lightZ >= negRadius);
+
+            // Sub-tiles 00 and 01 are on the positive-x side, 10 and 11 on
+            // the negative-x side
+            if (oneSideX)
+                keep = keep && (((s & 1) == 0) == positiveX);
+            // Sub-tiles 00 and 10 are on the positive-y side, 01 and 11 on
+            // the negative-y side
+            if (oneSideY)
+                keep = keep && (((s & 2) == 0) == positiveY);
+
+            if (keep)
+                subtileIndices[writePos[s]++] = lightIndex;
+        }
+    }
+
+    for (int s = 0; s < 4; ++s)
+        subtileNumLights[s] = writePos[s] - s * subtileIndicesPitch;
+}
+
+
+// Dot product of the 3-vectors (x, y, z) and (a, b, c).
+static inline float
+dot3(float x, float y, float z, float a, float b, float c) {
+    float sum = x*a;
+    sum += y*b;
+    sum += z*c;
+    return sum;
+}
+
+
+// Scales (x, y, z) by the reciprocal of its length and stores the result in
+// (ox, oy, oz).
+static inline void
+normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
+    const float lengthSquared = x*x + y*y + z*z;
+    const float invLength = 1.f / sqrtf(lengthSquared);
+    ox = x * invLength;
+    oy = y * invLength;
+    oz = z * invLength;
+}
+
+
+// Maps an 8-bit unsigned normalized value (0..255) to a float by multiplying
+// it by 1/255.
+static inline float
+Unorm8ToFloat32(uint8_t u) {
+    const float scale = 1.0f / 255.0f;
+    return static_cast<float>(u) * scale;
+}
+
+
+// Converts a float to an 8-bit unsigned normalized value by scaling by 255
+// and truncating toward zero.
+//
+// Results that would fall outside 0..255 (and NaN) are saturated instead of
+// being converted directly, because converting an out-of-range float to an
+// integer type is undefined behavior.  Inputs whose scaled value truncates to
+// 0..255 produce exactly the same result as a plain cast.
+static inline uint8_t
+Float32ToUnorm8(float f) {
+    const float scaled = f * 255.0f;
+    // Written as a negated comparison so that NaN also lands here
+    if (!(scaled > 0.0f))
+        return 0;
+    if (scaled >= 255.0f)
+        return 255;
+    return (uint8_t)scaled;
+}
+
+
+// Converts a half-precision float, given as its 16-bit pattern, to a float.
+//
+// This is the "fast" conversion: the sign, exponent and mantissa fields are
+// moved into place and the exponent is re-biased, with no special handling of
+// zero, denormal, infinite or NaN inputs (an all-zero input, for instance,
+// yields 2^-15 rather than 0).
+static inline float
+half_to_float_fast(uint16_t h) {
+    uint32_t hs = h & 0x8000u;  // Pick off sign bit
+    uint32_t he = h & 0x7C00u;  // Pick off exponent bits
+    uint32_t hm = h & 0x03FFu;  // Pick off mantissa bits
+
+    // sign
+    uint32_t xs = hs << 16;
+    // Exponent: unbias the halfp, then bias the single
+    int32_t xes = ((int32_t) (he >> 10)) - 15 + 127;
+    // Exponent
+    uint32_t xe = (uint32_t) (xes << 23);
+    // Mantissa
+    uint32_t xm = hm << 13;
+
+    uint32_t bits = (xs | xe | xm);
+    // Copy the bit pattern rather than reading it through a float pointer,
+    // which would violate the strict aliasing rules.
+    float result;
+    memcpy(&result, &bits, sizeof(result));
+    return result;
+}
+
+
+static void  // Scalar C++ shading of one screen tile; stores 8-bit RGB per pixel.
+ShadeTileC(
+    int32_t tileStartX, int32_t tileEndX,  // pixel columns [tileStartX, tileEndX)
+    int32_t tileStartY, int32_t tileEndY,  // pixel rows [tileStartY, tileEndY)
+    int32_t gBufferWidth, int32_t gBufferHeight,  // width is also the row pitch used for indexing
+    const ispc::InputDataArrays &inputData,  // per-pixel G-buffer channels and per-light arrays
+    // Camera data: projection terms used below to unproject depth into view space
+    float cameraProj_11, float cameraProj_22,
+    float cameraProj_33, float cameraProj_43,
+    // Light list: indices into the inputData light arrays that affect this tile
+    int32_t tileLightIndices[],
+    int32_t tileNumLights,
+    // UI: when true, write a gray level derived from the light count instead of shading
+    bool visualizeLightCount,
+    // Output: one array per channel, indexed with y * gBufferWidth + x
+    uint8_t framebuffer_r[],
+    uint8_t framebuffer_g[],
+    uint8_t framebuffer_b[]
+    )
+{
+    if (tileNumLights == 0 || visualizeLightCount) {  // nothing to light, or debug view requested
+        uint8_t c = (uint8_t)(std::min(tileNumLights << 2, 255));  // 4 gray steps per light, capped at 255
+        for (int32_t y = tileStartY; y < tileEndY; ++y) {
+            for (int32_t x = tileStartX; x < tileEndX; ++x) {
+                int32_t framebufferIndex = (y * gBufferWidth + x);
+                framebuffer_r[framebufferIndex] = c;
+                framebuffer_g[framebufferIndex] = c;
+                framebuffer_b[framebufferIndex] = c;
+            }
+        }
+    } else {
+        float twoOverGBufferWidth = 2.0f / gBufferWidth;  // maps pixel columns onto a span of 2 units
+        float twoOverGBufferHeight = 2.0f / gBufferHeight;  // maps pixel rows onto a span of 2 units
+        
+        for (int32_t y = tileStartY; y < tileEndY; ++y) {
+            float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);  // pixel-center row in [-1, 1], sign flipped
+
+            for (int32_t x = tileStartX; x < tileEndX; ++x) {
+                int32_t gBufferOffset = y * gBufferWidth + x;  // shared index for all per-pixel arrays
+                
+                // Reconstruct position and (negative) view vector from G-buffer
+                float surface_positionView_x, surface_positionView_y, surface_positionView_z;
+                float Vneg_x, Vneg_y, Vneg_z;
+
+                float z = inputData.zBuffer[gBufferOffset];
+
+                // Compute screen/clip-space position
+                // NOTE: Mind DX11 viewport transform and pixel center!
+                float positionScreen_x = (0.5f + (float)(x)) * 
+                    twoOverGBufferWidth - 1.0f;
+
+                // Unproject depth buffer Z value into view space
+                surface_positionView_z = cameraProj_43 / (z - cameraProj_33);  // NOTE(review): unguarded divide if z == cameraProj_33
+                surface_positionView_x = positionScreen_x * surface_positionView_z / 
+                    cameraProj_11;
+                surface_positionView_y = positionScreen_y * surface_positionView_z / 
+                    cameraProj_22;
+                
+                // We actually end up with a vector pointing *at* the
+                // surface (i.e. the negative view vector)
+                normalize3(surface_positionView_x, surface_positionView_y, 
+                           surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
+
+                // Reconstruct normal from G-buffer (two half-float components expand to three)
+                float surface_normal_x, surface_normal_y, surface_normal_z;
+                float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
+                float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
+                    
+                float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
+                float m = sqrtf(4.0f * f - 1.0f);  // NOTE(review): NaN when f < 0.25; presumably the encoder rules that out - confirm
+                    
+                surface_normal_x = m * (4.0f * normal_x - 2.0f);
+                surface_normal_y = m * (4.0f * normal_y - 2.0f);
+                surface_normal_z = 3.0f - 8.0f * f;
+
+                // Load other G-buffer parameters
+                float surface_specularAmount = 
+                    half_to_float_fast(inputData.specularAmount[gBufferOffset]);
+                float surface_specularPower  = 
+                    half_to_float_fast(inputData.specularPower[gBufferOffset]);
+                float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
+                float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
+                float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
+                
+                float lit_x = 0.0f;  // running sum of all light contributions for this pixel
+                float lit_y = 0.0f;
+                float lit_z = 0.0f;
+                for (int32_t tileLightIndex = 0; tileLightIndex < tileNumLights; 
+                     ++tileLightIndex) {
+                    int32_t lightIndex = tileLightIndices[tileLightIndex];  // indirection into the global light arrays
+                                        
+                    // Gather light data relevant to initial culling
+                    float light_positionView_x = 
+                        inputData.lightPositionView_x[lightIndex];
+                    float light_positionView_y = 
+                        inputData.lightPositionView_y[lightIndex];
+                    float light_positionView_z = 
+                        inputData.lightPositionView_z[lightIndex];
+                    float light_attenuationEnd = 
+                        inputData.lightAttenuationEnd[lightIndex];
+                    
+                    // Compute light vector
+                    float L_x = light_positionView_x - surface_positionView_x;
+                    float L_y = light_positionView_y - surface_positionView_y;
+                    float L_z = light_positionView_z - surface_positionView_z;
+
+                    float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
+                    
+                    // Clip at end of attenuation (squared distances, so culled lights skip the sqrt)
+                    float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
+
+                    if (distanceToLight2 < light_attenutaionEnd2) {                    
+                        float distanceToLight = sqrtf(distanceToLight2);
+
+                        float distanceToLightRcp = 1.f / distanceToLight;  // NOTE(review): infinite if the light sits exactly on the surface point
+                        L_x *= distanceToLightRcp;
+                        L_y *= distanceToLightRcp;
+                        L_z *= distanceToLightRcp;
+
+                        // Start computing brdf
+                        float NdotL = dot3(surface_normal_x, surface_normal_y, 
+                                           surface_normal_z, L_x, L_y, L_z);
+                    
+                        // Clip back facing
+                        if (NdotL > 0.0f) {
+                            float light_attenuationBegin = 
+                                inputData.lightAttenuationBegin[lightIndex];
+
+                            // Light distance attenuation (linstep)
+                            float lightRange = (light_attenuationEnd - light_attenuationBegin);  // NOTE(review): zero when begin == end; the divide below is unguarded
+                            float falloffPosition = (light_attenuationEnd - distanceToLight);
+                            float attenuation = std::min(falloffPosition / lightRange, 1.0f);  // only the upper bound is clamped here
+
+                            float H_x = (L_x - Vneg_x);  // L minus the negative view vector, normalized below
+                            float H_y = (L_y - Vneg_y);
+                            float H_z = (L_z - Vneg_z);
+                            normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
+                    
+                            float NdotH = dot3(surface_normal_x, surface_normal_y, 
+                                               surface_normal_z, H_x, H_y, H_z);
+                            NdotH = std::max(NdotH, 0.0f);
+
+                            float specular = powf(NdotH, surface_specularPower);
+                            float specularNorm = (surface_specularPower + 2.0f) * 
+                                (1.0f / 8.0f);
+                            float specularContrib = surface_specularAmount * 
+                                specularNorm * specular;
+
+                            float k = attenuation * NdotL * (1.0f + specularContrib);  // scalar weight shared by all three channels
+                    
+                            float light_color_x = inputData.lightColor_x[lightIndex];
+                            float light_color_y = inputData.lightColor_y[lightIndex];
+                            float light_color_z = inputData.lightColor_z[lightIndex];
+
+                            float lightContrib_x = surface_albedo_x * light_color_x;
+                            float lightContrib_y = surface_albedo_y * light_color_y;
+                            float lightContrib_z = surface_albedo_z * light_color_z;
+
+                            lit_x += lightContrib_x * k;
+                            lit_y += lightContrib_y * k;
+                            lit_z += lightContrib_z * k;
+                        }
+                    }
+                }
+
+                // Gamma correct (each channel is clamped to [0, 1] before the power)
+                float gamma = 1.0 / 2.2f;  // 1.0 is a double literal: the divide happens in double, then narrows to float
+                lit_x = powf(std::min(std::max(lit_x, 0.0f), 1.0f), gamma);
+                lit_y = powf(std::min(std::max(lit_y, 0.0f), 1.0f), gamma);
+                lit_z = powf(std::min(std::max(lit_z, 0.0f), 1.0f), gamma);
+                
+                framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
+                framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
+                framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
+            }
+        }
+    }
+}
+
+
+/*
+ * Shades the tile (tileX, tileY) of the given tree level with the serial
+ * C++ path.  A tile is shaded directly when it is at level 0 or when it has
+ * fewer than DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE lights; otherwise its light
+ * list is split among the four child tiles one level down and each child
+ * is handled recursively.
+ *
+ * lightIndices/numLights describe the lights that reach this tile.
+ */
+void
+ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY, 
+                        int *lightIndices, int numLights, 
+                        Framebuffer *framebuffer) {
+    const MinMaxZTree *tree = gMinMaxZTree;
+
+    const bool shadeHere =
+        (level == 0) || (numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE);
+    if (shadeHere) {
+        // Pixel rectangle of this tile, clipped to the framebuffer
+        const int tileW = tree->TileWidth(level);
+        const int tileH = tree->TileHeight(level);
+        const int x0 = tileX * tileW;
+        const int y0 = tileY * tileH;
+        const int x1 = std::min(input->header.framebufferWidth, x0 + tileW);
+        const int y1 = std::min(input->header.framebufferHeight, y0 + tileH);
+
+        // A tile that lies completely off screen has nothing to shade
+        if (x1 <= x0 || y1 <= y0)
+            return;
+
+        ShadeTileC(x0, x1, y0, y1,
+                   input->header.framebufferWidth, input->header.framebufferHeight,
+                   input->arrays,
+                   input->header.cameraProj[0][0], input->header.cameraProj[1][1],
+                   input->header.cameraProj[2][2], input->header.cameraProj[3][2],
+                   lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
+                   framebuffer->r, framebuffer->g, framebuffer->b);
+        return;
+    }
+
+    // Step one level down: the four children start at (2 * tileX, 2 * tileY)
+    const int childLevel = level - 1;
+    const int childX = tileX << 1;
+    const int childY = tileY << 1;
+
+    // Pixel coordinates of the vertical and horizontal splitting planes
+    const int splitX = (childX + 1) * tree->TileWidth(childLevel);
+    const int splitY = (childY + 1) * tree->TileHeight(childLevel);
+
+    // With non-power-of-two framebuffers the right column and/or bottom row
+    // of children may fall outside the tree, so they must not be read.
+    const bool hasRight = (childX + 1 < tree->NumTilesX(childLevel));
+    const bool hasBottom = (childY + 1 < tree->NumTilesY(childLevel));
+
+    // Quadrant order is 00, 10, 01, 11.  A missing quadrant gets
+    // min = far and max = near so that every light is culled from it.
+    float minZ[4];
+    float maxZ[4];
+    for (int q = 0; q < 4; ++q) {
+        const int dx = q & 1;
+        const int dy = q >> 1;
+        const bool present = (dx == 0 || hasRight) && (dy == 0 || hasBottom);
+        if (present) {
+            minZ[q] = tree->MinZ(childLevel, childX + dx, childY + dy);
+            maxZ[q] = tree->MaxZ(childLevel, childX + dx, childY + dy);
+        } else {
+            minZ[q] = input->header.cameraFar;
+            maxZ[q] = input->header.cameraNear;
+        }
+    }
+
+    // One aligned light list per quadrant
+#ifdef ISPC_IS_WINDOWS
+    __declspec(align(ALIGNMENT_BYTES)) int subtileLightIndices[4][MAX_LIGHTS];
+#else
+    int subtileLightIndices[4][MAX_LIGHTS] __attribute__ ((aligned(ALIGNMENT_BYTES)));
+#endif
+    int subtileNumLights[4];
+    SplitTileMinMax(splitX, splitY, minZ, maxZ,
+                    input->header.framebufferWidth, input->header.framebufferHeight,
+                    input->header.cameraProj[0][0], input->header.cameraProj[1][1],
+                    lightIndices, numLights,
+                    input->arrays.lightPositionView_x,
+                    input->arrays.lightPositionView_y,
+                    input->arrays.lightPositionView_z,
+                    input->arrays.lightAttenuationEnd,
+                    subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
+
+    // Visit the children in quadrant order
+    for (int q = 0; q < 4; ++q) {
+        ShadeDynamicTileRecurse(input, childLevel,
+                                childX + (q & 1), childY + (q >> 1),
+                                subtileLightIndices[q], subtileNumLights[q],
+                                framebuffer);
+    }
+}
+
+
+/*
+ * Culls lights 0 .. numLights-1 against one screen tile.
+ *
+ * A light is kept when its sphere (centre = view-space position, radius =
+ * attenuation end) reaches into the slab [minZ, maxZ] and is not entirely
+ * outside any of the four side planes built from the tile's pixel bounds.
+ * The indices of the kept lights are written, in ascending order, to
+ * tileLightIndices; the number written is returned.
+ */
+static int
+IntersectLightsWithTileMinMax(
+    int tileStartX, int tileEndX,
+    int tileStartY, int tileEndY,
+    // Tile data
+    float minZ,
+    float maxZ,
+    // G-buffer data
+    int gBufferWidth, int gBufferHeight,
+    // Camera data
+    float cameraProj_11, float cameraProj_22,
+    // Light Data
+    int numLights,
+    float light_positionView_x_array[],
+    float light_positionView_y_array[],
+    float light_positionView_z_array[],
+    float light_attenuationEnd_array[],
+    // Output
+    int tileLightIndices[]
+    )
+{
+    const float halfWidth = 0.5f * (float)gBufferWidth;
+    const float halfHeight = 0.5f * (float)gBufferHeight;
+    const float scaledProjX = cameraProj_11 * halfWidth;
+    const float scaledProjY = cameraProj_22 * halfHeight;
+
+    // Each side plane has one lateral coefficient (applied to x for planes
+    // 0 and 1, to y for planes 2 and 3) and one depth coefficient.
+    float planeLateral[4] = { -scaledProjX, scaledProjX,
+                              scaledProjY, -scaledProjY };
+    float planeDepth[4] = { tileEndX - halfWidth,
+                            -tileStartX + halfWidth,
+                            tileEndY - halfHeight,
+                            -tileStartY + halfHeight };
+
+    // Scale every plane to unit length so plane distances are comparable
+    // with the light radius.
+    for (int p = 0; p < 4; ++p) {
+        const float invLength = 1.f / sqrtf(planeLateral[p] * planeLateral[p] +
+                                            planeDepth[p] * planeDepth[p]);
+        planeLateral[p] *= invLength;
+        planeDepth[p] *= invLength;
+    }
+
+    int kept = 0;
+    for (int lightIndex = 0; lightIndex < numLights; ++lightIndex) {
+        const float lightZ = light_positionView_z_array[lightIndex];
+        const float negRadius = -light_attenuationEnd_array[lightIndex];
+
+        // Depth slab first: it needs only z and rejects without touching x/y
+        if (!(lightZ - minZ >= negRadius))
+            continue;
+        if (!(maxZ - lightZ >= negRadius))
+            continue;
+
+        const float lightX = light_positionView_x_array[lightIndex];
+        const float lightY = light_positionView_y_array[lightIndex];
+
+        // Side planes: stop at the first plane the sphere is fully behind
+        bool inside = true;
+        for (int p = 0; p < 4 && inside; ++p) {
+            const float lateral = (p < 2) ? lightX : lightY;
+            const float dist = lightZ * planeDepth[p] +
+                lateral * planeLateral[p];
+            inside = (dist >= negRadius);
+        }
+
+        // Pack and store intersecting lights
+        if (inside) {
+            tileLightIndices[kept] = lightIndex;
+            ++kept;
+        }
+    }
+
+    return kept;
+}
+
+
+/*
+ * Entry point for one root tile of the serial C++ path: builds the tile's
+ * light list with a full cull against its depth range and pixel bounds,
+ * then hands the tile to ShadeDynamicTileRecurse.
+ */
+void
+ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
+                 Framebuffer *framebuffer) {
+    const MinMaxZTree *tree = gMinMaxZTree;
+
+    // Pixel rectangle covered by the tile, clipped to the framebuffer
+    const int tileW = tree->TileWidth(level);
+    const int tileH = tree->TileHeight(level);
+    const int x0 = tileX * tileW;
+    const int y0 = tileY * tileH;
+    const int x1 = std::min(input->header.framebufferWidth, x0 + tileW);
+    const int y1 = std::min(input->header.framebufferHeight, y0 + tileH);
+
+    // Depth range recorded for the tile
+    const float nearZ = tree->MinZ(level, tileX, tileY);
+    const float farZ = tree->MaxZ(level, tileX, tileY);
+
+    // Root tiles get the full cull (depth slab plus the four side planes)
+#ifdef ISPC_IS_WINDOWS
+    __declspec(align(ALIGNMENT_BYTES)) int rootLightIndices[MAX_LIGHTS];
+#else
+    int rootLightIndices[MAX_LIGHTS] __attribute__ ((aligned(ALIGNMENT_BYTES)));
+#endif
+    const int rootLightCount = IntersectLightsWithTileMinMax(
+        x0, x1, y0, y1, nearZ, farZ,
+        input->header.framebufferWidth, input->header.framebufferHeight,
+        input->header.cameraProj[0][0], input->header.cameraProj[1][1],
+        MAX_LIGHTS,
+        input->arrays.lightPositionView_x,
+        input->arrays.lightPositionView_y,
+        input->arrays.lightPositionView_z,
+        input->arrays.lightAttenuationEnd,
+        rootLightIndices);
+
+    // Descend from this tile
+    ShadeDynamicTileRecurse(input, level, tileX, tileY,
+                            rootLightIndices, rootLightCount, framebuffer);
+}
+
+
+/*
+ * Serial C++ driver: refreshes the min/max depth tree from the current
+ * z-buffer and then shades every tile of the tree's top level, row by row.
+ */
+void
+DispatchDynamicC(InputData *input, Framebuffer *framebuffer)
+{
+    MinMaxZTree *tree = gMinMaxZTree;
+
+    // Rebuild the depth bounds for this frame
+    tree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
+                 input->header.cameraProj[2][2], input->header.cameraProj[3][2],
+                 input->header.cameraNear, input->header.cameraFar);
+
+    const int topLevel = tree->Levels() - 1;
+    const int tilesAcross = tree->NumTilesX(topLevel);
+    const int tilesDown = tree->NumTilesY(topLevel);
+
+    // Row-major walk over the root tiles
+    for (int row = 0; row < tilesDown; ++row) {
+        for (int column = 0; column < tilesAcross; ++column) {
+            ShadeDynamicTile(input, topLevel, column, row, framebuffer);
+        }
+    }
+}
--- a/examples/deferred/dynamic_cilk.cpp
+++ b/examples/deferred/dynamic_cilk.cpp
@@ -0,0 +1,398 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef __cilk
+
+#include "deferred.h"
+#include "kernels_ispc.h"
+#include <algorithm>
+#include <assert.h>
+
+#ifdef _MSC_VER
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#ifdef ISPC_IS_LINUX
+#include <malloc.h>
+#endif // ISPC_IS_LINUX
+
+// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
+#define MIN_TILE_WIDTH 16
+#define MIN_TILE_HEIGHT 16
+
+
+#define DYNAMIC_TREE_LEVELS 5
+// If this is set to 1 then the result will be identical to the static version
+#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
+
+/*
+ * Allocates 'size' bytes whose address is a multiple of 'alignment'
+ * (a power of two).  The result must be released with lAlignedFree().
+ *
+ * Windows and Linux use the platform allocator.  The Apple branch carves
+ * an aligned pointer out of a larger malloc() block and stores the original
+ * malloc() pointer in the slot just before the returned address so that
+ * lAlignedFree() can recover it.  That branch returns NULL when malloc()
+ * fails.
+ */
+static void *
+lAlignedMalloc(size_t size, int32_t alignment) {
+#ifdef ISPC_IS_WINDOWS
+    return _aligned_malloc(size, alignment);
+#endif
+#ifdef ISPC_IS_LINUX
+    return memalign(alignment, size);
+#endif
+#ifdef ISPC_IS_APPLE
+    // The adjustment below moves the pointer forward by 1..alignment bytes
+    // (a full 'alignment' when it is already aligned), so a whole
+    // 'alignment' bytes of slack are required; alignment-1 is one byte short.
+    void *mem = malloc(size + alignment + sizeof(void*));
+    if (mem == NULL)
+        return NULL;
+    // Leave room for the back-pointer, then round up to the alignment
+    char *amem = ((char*)mem) + sizeof(void*);
+    amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
+                                        (alignment - 1)));
+    // Remember the real block start for lAlignedFree()
+    ((void**)amem)[-1] = mem;
+    return amem;
+#endif
+}
+
+
+/*
+ * Releases memory obtained from lAlignedMalloc(), using the deallocation
+ * routine that matches the platform's allocation branch.
+ */
+static void
+lAlignedFree(void *ptr) {
+#ifdef ISPC_IS_WINDOWS
+    _aligned_free(ptr);
+#endif
+#ifdef ISPC_IS_LINUX
+    free(ptr);
+#endif
+#ifdef ISPC_IS_APPLE
+    // The slot just before the aligned address holds the malloc() pointer
+    void **backPointer = ((void**)ptr) - 1;
+    free(*backPointer);
+#endif
+}
+
+
+/*
+ * Multi-level grid of per-tile minimum and maximum depth values.
+ *
+ * Level 0 has one entry per base tile (tileWidth x tileHeight pixels).
+ * Each higher level halves the tile count in both directions (rounding up)
+ * and stores the min/max over the up-to-four tiles it covers one level
+ * below.  Update() refills every level from a z-buffer.
+ *
+ * The object owns its arrays and releases them in the destructor, so it is
+ * deliberately not copyable.
+ */
+class MinMaxZTreeCilk
+{
+public:
+    // Currently (min) tile dimensions must divide gBuffer dimensions evenly
+    // Levels must be small enough that neither dimension goes below one tile
+    MinMaxZTreeCilk(
+        int tileWidth, int tileHeight, int levels,
+        int gBufferWidth, int gBufferHeight)
+        : mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
+    {
+        mNumTilesX = gBufferWidth / mTileWidth;
+        mNumTilesY = gBufferHeight / mTileHeight;
+        
+        // Allocate arrays: one table of per-level pointers for min and one
+        // for max, then one x*y grid per level for each
+        mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
+        mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
+        for (int i = 0; i < mLevels; ++i) {
+            int x = NumTilesX(i);
+            int y = NumTilesY(i);
+            assert(x > 0);
+            assert(y > 0);
+            // NOTE: If the following two asserts fire it probably means that
+            // the base tile dimensions do not evenly divide the G-buffer dimensions
+            assert(x * (mTileWidth << i) >= gBufferWidth);
+            assert(y * (mTileHeight << i) >= gBufferHeight);
+            mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
+            mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
+        }
+    }
+
+    // Refills every level from zBuffer.  Level 0 is produced one tile row at
+    // a time by ispc::ComputeZBoundsRow; each higher level is reduced from
+    // the level below it.
+    void Update(float *zBuffer, int gBufferPitchInElements,
+        float cameraProj_33, float cameraProj_43,
+        float cameraNear, float cameraFar)
+    {
+        // Compute level 0 in parallel. Outer loop is here since we use Cilk
+        _Cilk_for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
+            ispc::ComputeZBoundsRow(tileY,
+                mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
+                zBuffer, gBufferPitchInElements,
+                cameraProj_33, cameraProj_43, cameraNear, cameraFar,
+                mMinZArrays[0] + (tileY * mNumTilesX),
+                mMaxZArrays[0] + (tileY * mNumTilesX));
+        }
+
+        // Generate other levels
+        // NOTE: We currently don't use ispc here since it's sort of an
+        // awkward gather-based reduction. Using SSE odd pack/unpack
+        // instructions might actually work here when we need to optimize
+        for (int level = 1; level < mLevels; ++level) {
+            int destTilesX = NumTilesX(level);
+            int destTilesY = NumTilesY(level);
+            int srcLevel = level - 1;
+            int srcTilesX = NumTilesX(srcLevel);
+            int srcTilesY = NumTilesY(srcLevel);
+            _Cilk_for (int y = 0; y < destTilesY; ++y) {
+                for (int x = 0; x < destTilesX; ++x) {
+                    // Top-left of the 2x2 group of source tiles
+                    int srcX = x << 1;
+                    int srcY = y << 1;
+                    // NOTE: Ugly branches to deal with non-multiple dimensions at some levels
+                    // TODO: SSE branchless min/max is probably better...
+                    float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
+                    float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
+                    if (srcX + 1 < srcTilesX) {
+                        // right neighbour
+                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX + 
+                                                                    (srcX + 1)]);
+                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
+                                                                    (srcX + 1)]);
+                        if (srcY + 1 < srcTilesY) {
+                            // bottom-right neighbour
+                            minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                        (srcX + 1)]);
+                            maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                        (srcX + 1)]);
+                        }
+                    }
+                    if (srcY + 1 < srcTilesY) {
+                        // bottom neighbour
+                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                    (srcX    )]);
+                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                    (srcX    )]);
+                    }
+                    mMinZArrays[level][y * destTilesX + x] = minZ;
+                    mMaxZArrays[level][y * destTilesX + x] = maxZ;
+                }
+            }
+        }
+    }
+
+    // Frees the per-level grids and then the two pointer tables
+    ~MinMaxZTreeCilk() {
+        for (int i = 0; i < mLevels; ++i) {
+            lAlignedFree(mMinZArrays[i]);
+            lAlignedFree(mMaxZArrays[i]);
+        }
+        lAlignedFree(mMinZArrays);
+        lAlignedFree(mMaxZArrays); 
+    }
+
+    int Levels() const { return mLevels; }
+
+    // These round UP, so beware that the last tile for a given level may not be completely full
+    // TODO: Verify this...
+    int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
+    int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
+    // Tile size in pixels doubles with every level
+    int TileWidth(int level = 0) const { return (mTileWidth << level); }
+    int TileHeight(int level = 0) const { return (mTileHeight << level); }
+
+    // Stored bounds of one tile; no range checking is done on the arguments
+    float MinZ(int level, int tileX, int tileY) const {
+        return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
+    }
+    float MaxZ(int level, int tileX, int tileY) const {
+        return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
+    }
+
+private:
+    // Copying would leave two objects freeing the same arrays, so the copy
+    // operations are declared private and intentionally left undefined.
+    MinMaxZTreeCilk(const MinMaxZTreeCilk &);
+    MinMaxZTreeCilk &operator=(const MinMaxZTreeCilk &);
+
+    int mTileWidth;
+    int mTileHeight;
+    int mLevels;
+    int mNumTilesX;
+    int mNumTilesY;
+
+    // One array for each "level" in the tree
+    float **mMinZArrays;
+    float **mMaxZArrays;
+};
+
+static MinMaxZTreeCilk *gMinMaxZTreeCilk = 0;  // depth tree shared by the functions below; allocated in InitDynamicCilk()
+
+/*
+ * Creates the global min/max depth tree sized for the framebuffer described
+ * by 'input'.  Safe to call more than once: a tree left over from an
+ * earlier call is destroyed first instead of being leaked.
+ */
+void InitDynamicCilk(InputData *input) {
+    // Drop any previous tree; clear the pointer before allocating so it is
+    // never left dangling if the allocation below fails.
+    delete gMinMaxZTreeCilk;
+    gMinMaxZTreeCilk = 0;
+
+    gMinMaxZTreeCilk = 
+        new MinMaxZTreeCilk(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
+                            input->header.framebufferWidth, 
+                            input->header.framebufferHeight);
+}
+
+
+static void  // Cilk variant: shade a tile directly or split its lights over four child tiles.
+ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY, 
+                        int *lightIndices, int numLights,  // lights reaching this tile
+                        Framebuffer *framebuffer) {
+    const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
+    
+    // If we have few enough lights or this is the base case (last level), shade
+    // this full tile directly
+    if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
+        int width = minMaxZTree->TileWidth(level);
+        int height = minMaxZTree->TileHeight(level);
+        int startX = tileX * width;
+        int startY = tileY * height;
+        int endX = std::min(input->header.framebufferWidth, startX + width);  // clip to the framebuffer
+        int endY = std::min(input->header.framebufferHeight, startY + height);
+        
+        // Skip entirely offscreen tiles
+        if (endX > startX && endY > startY) {
+            ispc::ShadeTile(
+                startX, endX, startY, endY,
+                input->header.framebufferWidth, input->header.framebufferHeight,
+                &input->arrays,
+                input->header.cameraProj[0][0], input->header.cameraProj[1][1], 
+                input->header.cameraProj[2][2], input->header.cameraProj[3][2],
+                lightIndices, numLights, VISUALIZE_LIGHT_COUNT, 
+                framebuffer->r, framebuffer->g, framebuffer->b);
+        }
+    } 
+    else {
+        // Otherwise, subdivide and 4-way recurse using X and Y splitting planes
+        // Move down a level in the tree
+        --level;
+        tileX <<= 1;  // child tiles start at twice the parent coordinate
+        tileY <<= 1;
+        int width = minMaxZTree->TileWidth(level);
+        int height = minMaxZTree->TileHeight(level);
+
+        // Work out splitting coords (pixel position of the boundary between the children)
+        int midX = (tileX + 1) * width;
+        int midY = (tileY + 1) * height;
+
+        // Read subtile min/max data
+        // NOTE: We must be sure to handle out-of-bounds access here since
+        // sometimes we'll only have 1 or 2 subtiles for non-pow-2
+        // framebuffer sizes.
+        bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level));
+        bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level));
+
+        // NOTE: Order is 00, 10, 01, 11
+        // Set defaults up to cull all lights if the tile doesn't exist (offscreen)
+        float minZ[4] = {input->header.cameraFar, input->header.cameraFar, 
+                         input->header.cameraFar, input->header.cameraFar};
+        float maxZ[4] = {input->header.cameraNear, input->header.cameraNear, 
+                         input->header.cameraNear, input->header.cameraNear};
+
+        minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);  // the 00 child is read unconditionally
+        maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY);
+        if (rightTileExists) {
+            minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY);
+            maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY);
+            if (bottomTileExists) {
+                minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1);
+                maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1);
+            }
+        }
+        if (bottomTileExists) {
+            minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1);
+            maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1);
+        }
+
+        // Cull lights into subtile lists (one aligned list per child, on this function's stack)
+#ifdef ISPC_IS_WINDOWS
+        __declspec(align(ALIGNMENT_BYTES)) 
+#endif
+            int subtileLightIndices[4][MAX_LIGHTS]
+#ifndef ISPC_IS_WINDOWS
+            __attribute__ ((aligned(ALIGNMENT_BYTES)))
+#endif
+;
+        int subtileNumLights[4];
+        ispc::SplitTileMinMax(midX, midY, minZ, maxZ,
+            input->header.framebufferWidth, input->header.framebufferHeight, 
+            input->header.cameraProj[0][0], input->header.cameraProj[1][1],
+            lightIndices, numLights, input->arrays.lightPositionView_x, 
+            input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, 
+            input->arrays.lightAttenuationEnd,
+            subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
+        
+        /* Recurse into subtiles: three children are spawned and the fourth is
+         * called directly.  The children read the stack arrays above, so this
+         * function must not return before they finish.
+         * NOTE(review): relies on Cilk syncing spawned children at function
+         * exit; there is no explicit sync here - confirm against the Cilk spec.
+         */
+        _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX    , tileY, 
+                                            subtileLightIndices[0], subtileNumLights[0],
+                                            framebuffer);
+        _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
+                                            subtileLightIndices[1], subtileNumLights[1],
+                                            framebuffer);
+        _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX    , tileY + 1,
+                                            subtileLightIndices[2], subtileNumLights[2],
+                                            framebuffer);
+        ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1,
+                                subtileLightIndices[3], subtileNumLights[3],
+                                framebuffer);
+    }
+}
+
+
+/*
+ * Entry point for one root tile of the Cilk path: culls the lights against
+ * the tile's depth range and pixel bounds with the ispc kernel, then starts
+ * the recursive shading of the tile.
+ */
+static void
+ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
+                 Framebuffer *framebuffer) {
+    const MinMaxZTreeCilk *tree = gMinMaxZTreeCilk;
+
+    // Pixel rectangle covered by the tile, clipped to the framebuffer
+    const int tileW = tree->TileWidth(level);
+    const int tileH = tree->TileHeight(level);
+    const int x0 = tileX * tileW;
+    const int y0 = tileY * tileH;
+    const int x1 = std::min(input->header.framebufferWidth, x0 + tileW);
+    const int y1 = std::min(input->header.framebufferHeight, y0 + tileH);
+
+    // Depth range recorded for the tile
+    const float nearZ = tree->MinZ(level, tileX, tileY);
+    const float farZ = tree->MaxZ(level, tileX, tileY);
+
+    // Root tiles get the full cull (depth slab plus the four side planes)
+#ifdef ISPC_IS_WINDOWS
+    __declspec(align(ALIGNMENT_BYTES)) int rootLightIndices[MAX_LIGHTS];
+#else
+    int rootLightIndices[MAX_LIGHTS] __attribute__ ((aligned(ALIGNMENT_BYTES)));
+#endif
+    const int rootLightCount = ispc::IntersectLightsWithTileMinMax(
+        x0, x1, y0, y1, nearZ, farZ,
+        input->header.framebufferWidth, input->header.framebufferHeight,
+        input->header.cameraProj[0][0], input->header.cameraProj[1][1],
+        MAX_LIGHTS,
+        input->arrays.lightPositionView_x,
+        input->arrays.lightPositionView_y,
+        input->arrays.lightPositionView_z,
+        input->arrays.lightAttenuationEnd,
+        rootLightIndices);
+
+    // Descend from this tile
+    ShadeDynamicTileRecurse(input, level, tileX, tileY,
+                            rootLightIndices, rootLightCount, framebuffer);
+}
+
+
+/*
+ * Cilk driver: refreshes the min/max depth tree from the current z-buffer
+ * and then shades all tiles of the tree's top level in a parallel loop.
+ */
+void
+DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer)
+{
+    MinMaxZTreeCilk *tree = gMinMaxZTreeCilk;
+
+    // Rebuild the depth bounds for this frame
+    tree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
+                 input->header.cameraProj[2][2], input->header.cameraProj[3][2],
+                 input->header.cameraNear, input->header.cameraFar);
+
+    // The top-level ("root") tiles are the units of parallel work.  Ideally
+    // there are enough of them to fill the machine; the number of tree
+    // levels is fixed for now but could be derived from the machine width.
+    const int topLevel = tree->Levels() - 1;
+    const int tilesAcross = tree->NumTilesX(topLevel);
+    const int tileCount = tilesAcross * tree->NumTilesY(topLevel);
+
+    _Cilk_for (int tile = 0; tile < tileCount; ++tile) {
+        // Flat index -> (column, row), row-major
+        const uint32_t row = tile / tilesAcross;
+        const uint32_t column = tile % tilesAcross;
+        ShadeDynamicTile(input, topLevel, column, row, framebuffer);
+    }
+}
+
+#endif // __cilk
--- a/examples/deferred/kernels.ispc
+++ b/examples/deferred/kernels.ispc
@@ -0,0 +1,717 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#include "deferred.h"
+// Structure-of-arrays view of the kernel inputs: per-pixel G-buffer channels (indexed y * width + x) followed by per-light attributes (indexed by light index).
+struct InputDataArrays
+{
+    uniform float zBuffer[];  // depth-buffer value; the kernels unproject it to view space with cameraProj
+    uniform unsigned int16 normalEncoded_x[]; // half float
+    uniform unsigned int16 normalEncoded_y[]; // half float
+    uniform unsigned int16 specularAmount[]; // half float
+    uniform unsigned int16 specularPower[]; // half float
+    uniform unsigned int8 albedo_x[]; // unorm8
+    uniform unsigned int8 albedo_y[]; // unorm8
+    uniform unsigned int8 albedo_z[]; // unorm8
+    uniform float lightPositionView_x[];  // light position; compared directly against view-space surface positions
+    uniform float lightPositionView_y[];
+    uniform float lightPositionView_z[];
+    uniform float lightAttenuationBegin[];  // distance at which the linear falloff starts
+    uniform float lightColor_x[];
+    uniform float lightColor_y[];
+    uniform float lightColor_z[];
+    uniform float lightAttenuationEnd[];  // distance beyond which the light contributes nothing; also used as the culling radius
+};
+// Scalar camera and framebuffer parameters that accompany InputDataArrays.
+struct InputHeader
+{
+    uniform float cameraProj[4][4];  // projection matrix; the kernels read [0][0], [1][1], [2][2] and [3][2]
+    uniform float cameraNear;  // samples are treated as valid when cameraNear <= view-space z < cameraFar
+    uniform float cameraFar;
+
+    uniform int32 framebufferWidth;
+    uniform int32 framebufferHeight;
+    uniform int32 numLights;  // NOTE(review): not read by any kernel in this file; RenderTile passes MAX_LIGHTS instead
+    uniform int32 inputDataChunkSize;
+    uniform int32 inputDataArrayOffsets[idaNum];  // idaNum is not defined in this file; presumably it comes from deferred.h
+};
+
+
+///////////////////////////////////////////////////////////////////////////
+// Common utility routines
+// Dot product of the 3-vectors (x, y, z) and (a, b, c).
+static inline float
+dot3(float x, float y, float z, float a, float b, float c) {
+    return (x*a + y*b + z*c);
+}
+
+// Scales (x, y, z) by the reciprocal of its length (as computed by rsqrt()) and writes the result to (ox, oy, oz).
+static inline void
+normalize3(float x, float y, float z, reference float ox, 
+           reference float oy, reference float oz) {
+    float n = rsqrt(x*x + y*y + z*z);  // reciprocal length; a zero-length input is not guarded against
+    ox = x * n;  // x/y/z are by-value copies, so the outputs may alias the caller's inputs (ShadeTile normalizes H in place)
+    oy = y * n;
+    oz = z * n;
+}
+
+// Converts an 8-bit unsigned-normalized value to float by scaling with 1/255, so 0..255 maps onto [0, 1].
+static inline float
+Unorm8ToFloat32(unsigned int8 u) {
+    return (float)u * (1.0f / 255.0f);
+}
+
+// Scales f by 255 and casts to unsigned int8; no clamping or explicit rounding is done here (ShadeTile clamps to [0, 1] before calling).
+static inline unsigned int8
+Float32ToUnorm8(float f) {
+    return (unsigned int8)(f * 255.0f);
+}
+
+// Computes the view-space depth range [minZ, maxZ] covered by the z-buffer samples of the tile [tileStartX, tileEndX) x [tileStartY, tileEndY).
+// tile width must be a multiple of programCount (SIMD size): each inner step loads programCount samples with no tail mask
+static void
+ComputeZBounds(
+    uniform int32 tileStartX, uniform int32 tileEndX,
+    uniform int32 tileStartY, uniform int32 tileEndY,
+    // G-buffer data
+    uniform float zBuffer[],
+    uniform int32 gBufferWidth,
+    // Camera data
+    uniform float cameraProj_33, uniform float cameraProj_43,
+    uniform float cameraNear, uniform float cameraFar,
+    // Output
+    reference uniform float minZ,
+    reference uniform float maxZ
+    )
+{
+    // Per-lane running bounds, seeded inverted (min = far, max = near) so a tile with no valid sample yields minZ = cameraFar, maxZ = cameraNear
+    float laneMinZ = cameraFar;
+    float laneMaxZ = cameraNear;
+    for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
+        for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
+            // Unproject depth buffer Z value into view space
+            float z = zBuffer[(y * gBufferWidth + x) + programIndex];
+            float viewSpaceZ = cameraProj_43 / (z - cameraProj_33);
+
+            // Work out Z bounds for our samples
+            // Avoid considering skybox/background or otherwise invalid pixels
+            if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) {
+                laneMinZ = min(laneMinZ, viewSpaceZ);
+                laneMaxZ = max(laneMaxZ, viewSpaceZ);
+            }
+        }
+    }
+    minZ = reduce_min(laneMinZ);  // fold the per-lane bounds into the uniform outputs
+    maxZ = reduce_max(laneMaxZ);
+}
+
+// Culls the first numLights lights against a screen tile's frustum (depth range [minZ, maxZ] plus four side planes); writes the surviving light indices to tileLightIndices and returns their count.
+// tile width must be a multiple of programCount (SIMD size)
+// numLights must currently be a multiple of programCount (SIMD size): the light loop loads programCount entries per step with no tail mask
+export uniform int32
+IntersectLightsWithTileMinMax(
+    uniform int32 tileStartX, uniform int32 tileEndX,
+    uniform int32 tileStartY, uniform int32 tileEndY,
+    // Tile data
+    uniform float minZ,
+    uniform float maxZ,
+    // G-buffer data
+    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
+    // Camera data
+    uniform float cameraProj_11, uniform float cameraProj_22,
+    // Light Data
+    uniform int32 numLights,
+    uniform float light_positionView_x_array[],
+    uniform float light_positionView_y_array[],
+    uniform float light_positionView_z_array[],
+    uniform float light_attenuationEnd_array[],
+    // Output
+    reference uniform int32 tileLightIndices[]
+    )
+{
+    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
+    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
+        
+    // Parallelize across frustum planes.
+    // We really only have four side planes here, but write the code to
+    // handle programCount > 4 robustly
+    uniform float frustumPlanes_xy[programCount];
+    uniform float frustumPlanes_z[programCount];
+
+    // TODO: If programIndex < 4 here? Don't care about masking off the
+    // rest but if interleaving ("x2" modes) the other lanes should ideally
+    // not be emitted...
+    {
+        // This one is totally constant over the whole screen... worth pulling it up at all?
+        float frustumPlanes_xy_v;
+        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
+        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1,  (cameraProj_11 * gBufferScale_x));
+        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 2,  (cameraProj_22 * gBufferScale_y));
+        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 3, -(cameraProj_22 * gBufferScale_y));
+    
+        float frustumPlanes_z_v;
+        frustumPlanes_z_v = insert(frustumPlanes_z_v, 0,  tileEndX - gBufferScale_x);
+        frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, -tileStartX + gBufferScale_x);
+        frustumPlanes_z_v = insert(frustumPlanes_z_v, 2,  tileEndY - gBufferScale_y);
+        frustumPlanes_z_v = insert(frustumPlanes_z_v, 3, -tileStartY + gBufferScale_y);
+
+        // Normalize (lanes 4 and up were never written above; only entries 0..3 are read below)
+        float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v + 
+                           frustumPlanes_z_v * frustumPlanes_z_v);
+            frustumPlanes_xy_v *= norm;
+            frustumPlanes_z_v *= norm;
+
+        // Save out for uniform use later
+        frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
+        frustumPlanes_z[programIndex] = frustumPlanes_z_v;
+    }
+
+    uniform int32 tileNumLights = 0;
+
+    for (uniform int32 baseLightIndex = 0; baseLightIndex < numLights; 
+         baseLightIndex += programCount) {
+        int32 lightIndex = baseLightIndex + programIndex;
+        float light_positionView_z = light_positionView_z_array[lightIndex];
+        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
+        float light_attenuationEndNeg = -light_attenuationEnd;
+
+        float d = light_positionView_z - minZ;
+        bool inFrustum = (d >= light_attenuationEndNeg);  // kept while the signed plane distance is >= -light_attenuationEnd
+
+        d = maxZ - light_positionView_z;
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+        
+        // This seems better than cif(!inFrustum) ccontinue; here since we
+        // don't actually need to mask the rest of this function - this is
+        // just a greedy early-out.  Could also structure all of this as
+        // nested if() statements, but this is a bit easier to read
+        if (!any(inFrustum)) 
+            continue;
+
+        float light_positionView_x = light_positionView_x_array[lightIndex];
+        float light_positionView_y = light_positionView_y_array[lightIndex];
+
+        d = light_positionView_z * frustumPlanes_z[0] + 
+            light_positionView_x * frustumPlanes_xy[0];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+
+        d = light_positionView_z * frustumPlanes_z[1] + 
+            light_positionView_x * frustumPlanes_xy[1];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+
+        d = light_positionView_z * frustumPlanes_z[2] + 
+            light_positionView_y * frustumPlanes_xy[2];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+
+        d = light_positionView_z * frustumPlanes_z[3] + 
+            light_positionView_y * frustumPlanes_xy[3];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+        
+        // Pack and store intersecting lights (tileNumLights advances by the count packed_store_active returns)
+        cif (inFrustum) {
+            tileNumLights += packed_store_active(tileLightIndices, tileNumLights, 
+                                                 lightIndex);
+        }
+    }
+
+    return tileNumLights;
+}
+
+// Culls lights against one screen tile: derives the tile's view-space depth range from the z-buffer, then tests the first numLights lights against the tile frustum.  Returns the number of indices written to tileLightIndices.
+// tile width must be a multiple of programCount (SIMD size)
+// numLights must currently be a multiple of programCount (SIMD size)
+static uniform int32
+IntersectLightsWithTile(
+    uniform int32 tileStartX, uniform int32 tileEndX,
+    uniform int32 tileStartY, uniform int32 tileEndY,
+    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
+    // G-buffer data
+    uniform float zBuffer[],
+    // Camera data
+    uniform float cameraProj_11, uniform float cameraProj_22,
+    uniform float cameraProj_33, uniform float cameraProj_43,
+    uniform float cameraNear, uniform float cameraFar,
+    // Light Data
+    uniform int32 numLights,
+    uniform float light_positionView_x_array[],
+    uniform float light_positionView_y_array[],
+    uniform float light_positionView_z_array[],
+    uniform float light_attenuationEnd_array[],
+    // Output
+    reference uniform int32 tileLightIndices[]
+    )
+{
+    uniform float minZ, maxZ;
+    ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY,
+        zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, cameraNear, cameraFar,
+        minZ, maxZ);
+    // Forward the caller's light count rather than a fixed constant, so the numLights argument is actually honoured
+    uniform int32 tileNumLights = IntersectLightsWithTileMinMax(
+        tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
+        gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
+        numLights, light_positionView_x_array, light_positionView_y_array, 
+        light_positionView_z_array, light_attenuationEnd_array,
+        tileLightIndices);
+
+    return tileNumLights;
+}
+
+// Shades every pixel of the tile [tileStartX, tileEndX) x [tileStartY, tileEndY) into framebuffer_r/g/b, lighting it with the tileNumLights lights listed in tileLightIndices.
+// tile width must be a multiple of programCount (SIMD size)
+export void
+ShadeTile(
+    uniform int32 tileStartX, uniform int32 tileEndX,
+    uniform int32 tileStartY, uniform int32 tileEndY,
+    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
+    reference uniform InputDataArrays inputData,
+    // Camera data
+    uniform float cameraProj_11, uniform float cameraProj_22,
+    uniform float cameraProj_33, uniform float cameraProj_43,
+    // Light list
+    reference uniform int32 tileLightIndices[],
+    uniform int32 tileNumLights,
+    // UI
+    uniform bool visualizeLightCount,
+    // Output
+    reference uniform unsigned int8 framebuffer_r[],
+    reference uniform unsigned int8 framebuffer_g[],
+    reference uniform unsigned int8 framebuffer_b[]
+    )
+{
+    if (tileNumLights == 0 || visualizeLightCount) {  // flat fill: grey level = 4 * light count, capped at 255 (black when the tile has no lights)
+        uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
+        for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
+            for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
+                int32 framebufferIndex = (y * gBufferWidth + x) + programIndex;
+                framebuffer_r[framebufferIndex] = c;
+                framebuffer_g[framebufferIndex] = c;
+                framebuffer_b[framebufferIndex] = c;
+            }
+        }
+    } else {
+        uniform float twoOverGBufferWidth = 2.0f / gBufferWidth;
+        uniform float twoOverGBufferHeight = 2.0f / gBufferHeight;
+        
+        for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
+            uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);  // pixel-centre y mapped to [-1, 1], sign flipped
+
+            for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
+                uniform int32 gBufferOffsetBase = y * gBufferWidth + x;
+                int32 gBufferOffset = gBufferOffsetBase + programIndex;  // each program instance handles one pixel of the row
+                
+                // Reconstruct position and (negative) view vector from G-buffer
+                float surface_positionView_x, surface_positionView_y, surface_positionView_z;
+                float Vneg_x, Vneg_y, Vneg_z;
+
+                float z = inputData.zBuffer[gBufferOffset];
+
+                // Compute screen/clip-space position
+                // NOTE: Mind DX11 viewport transform and pixel center!
+                float positionScreen_x = (0.5f + (float)(x + programIndex)) * 
+                    twoOverGBufferWidth - 1.0f;
+
+                // Unproject depth buffer Z value into view space
+                surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
+                surface_positionView_x = positionScreen_x * surface_positionView_z / 
+                    cameraProj_11;
+                surface_positionView_y = positionScreen_y * surface_positionView_z / 
+                    cameraProj_22;
+                
+                // We actually end up with a vector pointing *at* the
+                // surface (i.e. the negative view vector)
+                normalize3(surface_positionView_x, surface_positionView_y, 
+                           surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
+
+                // Reconstruct normal from G-buffer (two half-float channels are expanded to a 3-component normal)
+                float surface_normal_x, surface_normal_y, surface_normal_z;
+                float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
+                float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
+                    
+                float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
+                float m = sqrt(4.0f * f - 1.0f);
+                    
+                surface_normal_x = m * (4.0f * normal_x - 2.0f);
+                surface_normal_y = m * (4.0f * normal_y - 2.0f);
+                surface_normal_z = 3.0f - 8.0f * f;
+
+                // Load other G-buffer parameters
+                float surface_specularAmount = 
+                    half_to_float_fast(inputData.specularAmount[gBufferOffset]);
+                float surface_specularPower  = 
+                    half_to_float_fast(inputData.specularPower[gBufferOffset]);
+                float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
+                float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
+                float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
+                
+                float lit_x = 0.0f;  // accumulated colour over all lights of the tile
+                float lit_y = 0.0f;
+                float lit_z = 0.0f;
+                for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights; 
+                     ++tileLightIndex) {
+                    uniform int32 lightIndex = tileLightIndices[tileLightIndex];
+                                        
+                    // Load light data relevant to initial culling (uniform loads: one light is shared by all lanes)
+                    uniform float light_positionView_x = 
+                        inputData.lightPositionView_x[lightIndex];
+                    uniform float light_positionView_y = 
+                        inputData.lightPositionView_y[lightIndex];
+                    uniform float light_positionView_z = 
+                        inputData.lightPositionView_z[lightIndex];
+                    uniform float light_attenuationEnd = 
+                        inputData.lightAttenuationEnd[lightIndex];
+                    
+                    // Compute light vector
+                    float L_x = light_positionView_x - surface_positionView_x;
+                    float L_y = light_positionView_y - surface_positionView_y;
+                    float L_z = light_positionView_z - surface_positionView_z;
+
+                    float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
+                    
+                    // Clip at end of attenuation (compare squared distances so the sqrt is only paid for lit pixels)
+                    float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
+
+                    cif (distanceToLight2 < light_attenutaionEnd2) {                    
+                        float distanceToLight = sqrt(distanceToLight2);
+
+                        // HLSL "rcp" is allowed to be fairly inaccurate
+                        float distanceToLightRcp = rcp(distanceToLight);
+                        L_x *= distanceToLightRcp;
+                        L_y *= distanceToLightRcp;
+                        L_z *= distanceToLightRcp;
+
+                        // Start computing brdf
+                        float NdotL = dot3(surface_normal_x, surface_normal_y, 
+                                           surface_normal_z, L_x, L_y, L_z);
+                    
+                        // Clip back facing
+                        cif (NdotL > 0.0f) {
+                            uniform float light_attenuationBegin = 
+                                inputData.lightAttenuationBegin[lightIndex];
+
+                            // Light distance attenuation (linstep)
+                            float lightRange = (light_attenuationEnd - light_attenuationBegin);
+                            float falloffPosition = (light_attenuationEnd - distanceToLight);
+                            float attenuation = min(falloffPosition / lightRange, 1.0f);
+
+                            float H_x = (L_x - Vneg_x);  // half vector: L + V, since Vneg is the negated view vector
+                            float H_y = (L_y - Vneg_y);
+                            float H_z = (L_z - Vneg_z);
+                            normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
+                    
+                            float NdotH = dot3(surface_normal_x, surface_normal_y, 
+                                               surface_normal_z, H_x, H_y, H_z);
+                            NdotH = max(NdotH, 0.0f);
+
+                            float specular = pow(NdotH, surface_specularPower);
+                            float specularNorm = (surface_specularPower + 2.0f) * 
+                                (1.0f / 8.0f);
+                            float specularContrib = surface_specularAmount * 
+                                specularNorm * specular;
+
+                            float k = attenuation * NdotL * (1.0f + specularContrib);
+                    
+                            uniform float light_color_x = inputData.lightColor_x[lightIndex];
+                            uniform float light_color_y = inputData.lightColor_y[lightIndex];
+                            uniform float light_color_z = inputData.lightColor_z[lightIndex];
+
+                            float lightContrib_x = surface_albedo_x * light_color_x;
+                            float lightContrib_y = surface_albedo_y * light_color_y;
+                            float lightContrib_z = surface_albedo_z * light_color_z;
+
+                            lit_x += lightContrib_x * k;
+                            lit_y += lightContrib_y * k;
+                            lit_z += lightContrib_z * k;
+                        }
+                    }
+                }
+
+                // Gamma correct
+                // These pows are pretty slow right now, but we can do
+                // something faster if really necessary to squeeze every
+                // last bit of performance out of it
+                float gamma = 1.0 / 2.2f;
+                lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);  // clamp first: Float32ToUnorm8 itself does not clamp
+                lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
+                lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
+                
+                framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
+                framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
+                framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
+            }
+        }
+    }
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// Static decomposition
+
+// Task body for the static path: culls the lights against one
+// MIN_TILE_WIDTH x MIN_TILE_HEIGHT tile and then shades that tile.  taskIndex
+// selects the tile in row-major order; num_groups_y is not read here.
+task void
+RenderTile(uniform int num_groups_x, uniform int num_groups_y,
+           reference uniform InputHeader inputHeader,
+           reference uniform InputDataArrays inputData,
+           uniform int visualizeLightCount,
+           // Output
+           reference uniform unsigned int8 framebuffer_r[],
+           reference uniform unsigned int8 framebuffer_g[],
+           reference uniform unsigned int8 framebuffer_b[]) {
+    // Tile grid position, then its pixel rectangle [x0, x1) x [y0, y1)
+    uniform int32 col = taskIndex % num_groups_x;
+    uniform int32 row = taskIndex / num_groups_x;
+    uniform int32 x0 = col * MIN_TILE_WIDTH;
+    uniform int32 x1 = x0 + MIN_TILE_WIDTH;
+    uniform int32 y0 = row * MIN_TILE_HEIGHT;
+    uniform int32 y1 = y0 + MIN_TILE_HEIGHT;
+
+    uniform int width = inputHeader.framebufferWidth;
+    uniform int height = inputHeader.framebufferHeight;
+
+    // Projection-matrix entries used for culling and for unprojecting depth
+    uniform float proj00 = inputHeader.cameraProj[0][0];
+    uniform float proj11 = inputHeader.cameraProj[1][1];
+    uniform float proj22 = inputHeader.cameraProj[2][2];
+    uniform float proj32 = inputHeader.cameraProj[3][2];
+
+    // Stage 1: build the list of lights that can reach this tile.  The
+    // list is sized for the worst case of every light surviving.
+    uniform int lightList[MAX_LIGHTS];
+    uniform int lightCount = IntersectLightsWithTile(
+        x0, x1, y0, y1, width, height,
+        inputData.zBuffer,
+        proj00, proj11, proj22, proj32,
+        inputHeader.cameraNear, inputHeader.cameraFar, MAX_LIGHTS,
+        inputData.lightPositionView_x, inputData.lightPositionView_y,
+        inputData.lightPositionView_z, inputData.lightAttenuationEnd,
+        lightList);
+
+    // Stage 2: shade every pixel of the tile using only those lights
+    ShadeTile(x0, x1, y0, y1, width, height, inputData,
+              proj00, proj11, proj22, proj32,
+              lightList, lightCount, visualizeLightCount,
+              framebuffer_r, framebuffer_g, framebuffer_b);
+}
+
+
+export void
+RenderStatic(reference uniform InputHeader inputHeader,
+             reference uniform InputDataArrays inputData,
+             uniform int visualizeLightCount,
+             // Output
+             reference uniform unsigned int8 framebuffer_r[],
+             reference uniform unsigned int8 framebuffer_g[],
+             reference uniform unsigned int8 framebuffer_b[]) {
+    // Static decomposition entry point: cover the framebuffer with a grid of
+    // MIN_TILE_WIDTH x MIN_TILE_HEIGHT tiles, rounding the grid size up.
+    uniform int width = inputHeader.framebufferWidth;
+    uniform int height = inputHeader.framebufferHeight;
+    uniform int tilesAcross = (width + MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
+    uniform int tilesDown = (height + MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
+    uniform int tileCount = tilesAcross * tilesDown;
+
+    launch[tileCount] < RenderTile(tilesAcross, tilesDown,   // one task per tile
+                                   inputHeader, inputData, visualizeLightCount,
+                                   framebuffer_r, framebuffer_g, framebuffer_b) >;
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// Routines for dynamic decomposition path
+
+// Computes the view-space z range of every tile in tile row tileY, storing
+// tile i's bounds in minZArray[i] / maxZArray[i].  tileWidth must be a multiple of programCount (SIMD size).
+export void
+ComputeZBoundsRow(
+    uniform int32 tileY,
+    uniform int32 tileWidth, uniform int32 tileHeight,
+    uniform int32 numTilesX, uniform int32 numTilesY,
+    // G-buffer data
+    uniform float zBuffer[],
+    uniform int32 gBufferWidth,
+    // Camera data
+    uniform float cameraProj_33, uniform float cameraProj_43,
+    uniform float cameraNear, uniform float cameraFar,
+    // Output
+    reference uniform float minZArray[],
+    reference uniform float maxZArray[]
+    )
+{
+    uniform int32 rowStartY = tileY * tileHeight;   // vertical extent shared by the whole row
+    uniform int32 rowEndY = rowStartY + tileHeight;
+    for (uniform int32 col = 0; col < numTilesX; ++col) {
+        uniform int32 colStartX = col * tileWidth;
+        uniform float tileMinZ, tileMaxZ;
+        ComputeZBounds(colStartX, colStartX + tileWidth, rowStartY, rowEndY,
+                       zBuffer, gBufferWidth, cameraProj_33, cameraProj_43,
+                       cameraNear, cameraFar, tileMinZ, tileMaxZ);
+        minZArray[col] = tileMinZ;
+        maxZArray[col] = tileMaxZ;
+    }
+}
+
+// Redistributes a tile's light list over its four subtiles: each listed light is retested against every subtile's depth range and against the two planes splitting the tile at (tileMidX, tileMidY).
+// Reclassifies the lights with respect to four sub-tiles when we refine a tile.
+// numLights need not be a multiple of programCount here, but the input and output arrays
+// should be able to handle programCount-sized load/stores.
+export void
+SplitTileMinMax(
+    uniform int32 tileMidX, uniform int32 tileMidY,
+    // Subtile data (00, 10, 01, 11)
+    uniform float subtileMinZ[],
+    uniform float subtileMaxZ[],
+    // G-buffer data
+    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
+    // Camera data
+    uniform float cameraProj_11, uniform float cameraProj_22,
+    // Light Data
+    reference uniform int32 lightIndices[],
+    uniform int32 numLights,
+    uniform float light_positionView_x_array[],
+    uniform float light_positionView_y_array[],
+    uniform float light_positionView_z_array[],
+    uniform float light_attenuationEnd_array[],
+    // Outputs
+    // TODO: ISPC doesn't currently like multidimensional arrays so we'll do the
+    // indexing math ourselves (subtile k's list starts at subtileIndices[k * subtileIndicesPitch])
+    reference uniform int32 subtileIndices[],
+    uniform int32 subtileIndicesPitch,
+    reference uniform int32 subtileNumLights[]
+    )
+{
+    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
+    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
+        
+    // Parallelize across frustum planes
+    // Only have 2 frustum split planes here so may not be worth it, but
+    // we'll do it for now for consistency
+    uniform float frustumPlanes_xy[programCount];
+    uniform float frustumPlanes_z[programCount];
+
+    // This one is totally constant over the whole screen... worth pulling it up at all?
+    float frustumPlanes_xy_v;
+    frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
+    frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1,  (cameraProj_22 * gBufferScale_y));
+    
+    float frustumPlanes_z_v;
+    frustumPlanes_z_v = insert(frustumPlanes_z_v, 0, tileMidX - gBufferScale_x);
+    frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, tileMidY - gBufferScale_y);
+
+    // Normalize (only lanes 0 and 1 were written above; only entries 0 and 1 are read below)
+    float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v + 
+                       frustumPlanes_z_v * frustumPlanes_z_v);
+    frustumPlanes_xy_v *= norm;
+    frustumPlanes_z_v *= norm;
+
+    // Save out for uniform use later
+    frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
+    frustumPlanes_z[programIndex] = frustumPlanes_z_v;
+
+    // Initialize the running write position of each subtile's list
+    uniform int32 subtileLightOffset[4];
+    subtileLightOffset[0] = 0 * subtileIndicesPitch;
+    subtileLightOffset[1] = 1 * subtileIndicesPitch;
+    subtileLightOffset[2] = 2 * subtileIndicesPitch;
+    subtileLightOffset[3] = 3 * subtileIndicesPitch;
+
+    for (int32 i = programIndex; i < numLights; i += programCount) {
+        // TODO: ISPC says gather required here when it actually
+        // isn't... this could be fixed by nesting an if() within a
+        // uniform loop, but I'm not totally sure if that's a win
+        // overall. For now we'll just eat the perf cost for cleanliness
+        // since the below are real gathers anyways.
+        int32 lightIndex = lightIndices[i];
+
+        float light_positionView_x = light_positionView_x_array[lightIndex];
+        float light_positionView_y = light_positionView_y_array[lightIndex];
+        float light_positionView_z = light_positionView_z_array[lightIndex];
+        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
+        float light_attenuationEndNeg = -light_attenuationEnd;
+        
+        // Test lights against subtile z bounds
+        bool inFrustum[4];
+        inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
+            (subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
+        inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) && 
+            (subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg);
+        inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) && 
+            (subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg);
+        inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) && 
+            (subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg);
+
+        float dx = light_positionView_z * frustumPlanes_z[0] + 
+            light_positionView_x * frustumPlanes_xy[0];
+        float dy = light_positionView_z * frustumPlanes_z[1] +
+            light_positionView_y * frustumPlanes_xy[1];
+        
+        cif (abs(dx) > light_attenuationEnd) {  // light lies wholly on one side of the X split plane: drop it from the two subtiles on the other side
+            bool positiveX = dx > 0.0f;
+            inFrustum[0] = inFrustum[0] &&  positiveX;    // 00 subtile
+            inFrustum[1] = inFrustum[1] && !positiveX;    // 10 subtile
+            inFrustum[2] = inFrustum[2] &&  positiveX;    // 01 subtile
+            inFrustum[3] = inFrustum[3] && !positiveX;    // 11 subtile
+        }
+        cif (abs(dy) > light_attenuationEnd) {  // same test against the Y split plane
+            bool positiveY = dy > 0.0f;
+            inFrustum[0] = inFrustum[0] &&  positiveY;    // 00 subtile
+            inFrustum[1] = inFrustum[1] &&  positiveY;    // 10 subtile
+            inFrustum[2] = inFrustum[2] && !positiveY;    // 01 subtile
+            inFrustum[3] = inFrustum[3] && !positiveY;    // 11 subtile
+        }
+
+        // Pack and store intersecting lights
+        // TODO: Experiment with a loop here instead
+        cif (inFrustum[0])
+            subtileLightOffset[0] += packed_store_active(subtileIndices, 
+                                                         subtileLightOffset[0], 
+                                                         lightIndex);
+        cif (inFrustum[1])
+            subtileLightOffset[1] += packed_store_active(subtileIndices, 
+                                                         subtileLightOffset[1], 
+                                                         lightIndex);
+        cif (inFrustum[2])
+            subtileLightOffset[2] += packed_store_active(subtileIndices, 
+                                                         subtileLightOffset[2], 
+                                                         lightIndex);
+        cif (inFrustum[3])
+            subtileLightOffset[3] += packed_store_active(subtileIndices, 
+                                                         subtileLightOffset[3], 
+                                                         lightIndex);
+    }
+
+    subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch;  // final write position minus list start = number of lights stored
+    subtileNumLights[1] = subtileLightOffset[1] - 1 * subtileIndicesPitch;
+    subtileNumLights[2] = subtileLightOffset[2] - 2 * subtileIndicesPitch;
+    subtileNumLights[3] = subtileLightOffset[3] - 3 * subtileIndicesPitch;
+}
--- a/examples/deferred/main.cpp
+++ b/examples/deferred/main.cpp
@@ -0,0 +1,139 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define ISPC_IS_WINDOWS
+#define NOMINMAX
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#include <fcntl.h>
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <algorithm>
+#include <assert.h>
+#include <vector>
+#ifdef ISPC_IS_WINDOWS
+  #define WIN32_LEAN_AND_MEAN
+  #include <windows.h>
+#endif
+#include "deferred.h"
+#include "kernels_ispc.h"
+#include "../timing.h"
+
+///////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char** argv) {   // benchmark driver; argv[1] names the input data file
+    if (argc != 2) {                // exactly one argument is required
+        printf("usage: deferred_shading <input_file (e.g. data/pp1280x720.bin)>\n");
+        return 1;                   // bad command line
+    }
+
+    InputData *input = CreateInputDataFromFile(argv[1]);
+    if (!input) {
+        printf("Failed to load input file \"%s\"!\n", argv[1]);
+        return 1;                   // input could not be loaded
+    }
+    // One framebuffer, sized from the input header, is cleared and reused by every renderer.
+    Framebuffer framebuffer(input->header.framebufferWidth,
+                            input->header.framebufferHeight);
+
+    InitDynamicC(input);
+#ifdef __cilk
+    InitDynamicCilk(input);
+#endif // __cilk
+    // Each renderer: 5 timed passes of nframes frames each; the best per-frame average is kept.
+    int nframes = 5;                // frames rendered per timed pass
+    double ispcCycles = 1e30;       // best time so far; huge sentinel so the first pass wins
+    for (int i = 0; i < 5; ++i) {
+        framebuffer.clear();
+        reset_and_start_timer();
+        for (int j = 0; j < nframes; ++j)
+            ispc::RenderStatic(&input->header, &input->arrays, 
+                               VISUALIZE_LIGHT_COUNT,
+                               framebuffer.r, framebuffer.g, framebuffer.b);
+        double mcycles = get_elapsed_mcycles() / nframes;   // average per frame for this pass
+        ispcCycles = std::min(ispcCycles, mcycles);
+    }
+    printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render "
+           "%d x %d image\n", ispcCycles,
+           input->header.framebufferWidth, input->header.framebufferHeight);
+    WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
+
+#ifdef __cilk
+    double dynamicCilkCycles = 1e30;    // same sentinel/minimum scheme as ispcCycles
+    for (int i = 0; i < 5; ++i) {
+        framebuffer.clear();
+        reset_and_start_timer();
+        for (int j = 0; j < nframes; ++j)
+            DispatchDynamicCilk(input, &framebuffer);
+        double mcycles = get_elapsed_mcycles() / nframes;
+        dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles);
+    }
+    printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles to render image\n", 
+           dynamicCilkCycles);
+    WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer);
+#endif // __cilk
+    // Baseline reported as "C++ serial dynamic, 1 core", measured the same way.
+    double serialCycles = 1e30;
+    for (int i = 0; i < 5; ++i) {
+        framebuffer.clear();
+        reset_and_start_timer();
+        for (int j = 0; j < nframes; ++j)
+            DispatchDynamicC(input, &framebuffer);
+        double mcycles = get_elapsed_mcycles() / nframes;
+        serialCycles = std::min(serialCycles, mcycles);
+    }
+    printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles to render image\n", 
+           serialCycles);
+    WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer);
+    // Speedups are ratios of the serial best time to each other renderer's best time.
+#ifdef __cilk
+    printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n", 
+           serialCycles/ispcCycles, serialCycles/dynamicCilkCycles);
+#else
+    printf("\t\t\t\t(%.2fx speedup from ISPC)\n", serialCycles/ispcCycles);
+#endif // __cilk
+
+    DeleteInputData(input);         // presumably releases what CreateInputDataFromFile returned
+
+    return 0;                       // all benchmarks ran
+}
--- a/examples/examples.sln
+++ b/examples/examples.sln
@@ -15,6 +15,14 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mandelbrot_tasks", "mandelb
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "aobench_instrumented", "aobench_instrumented\aobench_instrumented.vcxproj", "{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}"
 EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "noise", "noise\noise.vcxproj", "{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "volume", "volume_rendering\volume.vcxproj", "{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "stencil", "stencil\stencil.vcxproj", "{2EF070A1-F62F-4E6A-944B-88D140945C3C}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "deferred_shading", "deferred\deferred_shading.vcxproj", "{87F53C53-957E-4E91-878A-BC27828FB9EB}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Win32 = Debug|Win32
@@ -79,6 +87,38 @@ Global
 		{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|Win32.Build.0 = Release|Win32
 		{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|x64.ActiveCfg = Release|x64
 		{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|x64.Build.0 = Release|x64
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|Win32.ActiveCfg = Debug|Win32
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|Win32.Build.0 = Debug|Win32
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|x64.ActiveCfg = Debug|x64
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|x64.Build.0 = Debug|x64
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|Win32.ActiveCfg = Release|Win32
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|Win32.Build.0 = Release|Win32
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|x64.ActiveCfg = Release|x64
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|x64.Build.0 = Release|x64
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Debug|Win32.ActiveCfg = Debug|Win32
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Debug|Win32.Build.0 = Debug|Win32
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Debug|x64.ActiveCfg = Debug|x64
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Debug|x64.Build.0 = Debug|x64
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Release|Win32.ActiveCfg = Release|Win32
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Release|Win32.Build.0 = Release|Win32
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Release|x64.ActiveCfg = Release|x64
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Release|x64.Build.0 = Release|x64
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Debug|Win32.ActiveCfg = Debug|Win32
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Debug|Win32.Build.0 = Debug|Win32
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Debug|x64.ActiveCfg = Debug|x64
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Debug|x64.Build.0 = Debug|x64
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|Win32.ActiveCfg = Release|Win32
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|Win32.Build.0 = Release|Win32
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|x64.ActiveCfg = Release|x64
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|x64.Build.0 = Release|x64
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|Win32.ActiveCfg = Debug|Win32
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|Win32.Build.0 = Debug|Win32
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|x64.ActiveCfg = Debug|x64
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|x64.Build.0 = Debug|x64
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.ActiveCfg = Release|Win32
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.Build.0 = Release|Win32
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.ActiveCfg = Release|x64
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.Build.0 = Release|x64
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
--- a/examples/mandelbrot/Makefile
+++ b/examples/mandelbrot/Makefile
@@ -2,7 +2,7 @@
 CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -O3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
+ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64

 default: mandelbrot

@@ -14,13 +14,17 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ mandelbrot

-mandelbrot: dirs objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc.o
-	$(CXX) $(CXXFLAGS) -o $@ objs/mandelbrot.o objs/mandelbrot_ispc.o objs/mandelbrot_serial.o -lm
+OBJS=objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc_sse2.o \
+	objs/mandelbrot_ispc_sse4.o objs/mandelbrot_ispc_avx.o \
+	objs/mandelbrot_ispc.o
+
+mandelbrot: dirs $(OBJS)
+	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm

 objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@

 objs/mandelbrot.o: objs/mandelbrot_ispc.h 

-objs/%_ispc.h objs/%_ispc.o: %.ispc
+objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
 	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/mandelbrot/mandelbrot.cpp
+++ b/examples/mandelbrot/mandelbrot.cpp
@@ -63,6 +63,7 @@ writePPM(int *buf, int width, int height, const char *fn) {
            fputc(c, fp);
    }
    fclose(fp);
+    printf("Wrote image file %s\n", fn);
 }


--- a/examples/mandelbrot/mandelbrot.vcxproj
+++ b/examples/mandelbrot/mandelbrot.vcxproj
@@ -64,15 +64,19 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -81,6 +85,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -96,6 +101,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -113,6 +119,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -131,6 +138,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -147,18 +155,18 @@
  <ItemGroup>
    <CustomBuild Include="mandelbrot.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
    </CustomBuild>
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
--- a/examples/mandelbrot/mandelbrot_serial.cpp
+++ b/examples/mandelbrot/mandelbrot_serial.cpp
@@ -36,7 +36,7 @@ static int mandel(float c_re, float c_im, int count) {
    float z_re = c_re, z_im = c_im;
    int i;
    for (i = 0; i < count; ++i) {
-        if (z_re * z_re + z_im * z_im > 4.)
+        if (z_re * z_re + z_im * z_im > 4.f)
            break;

        float new_re = z_re*z_re - z_im*z_im;
--- a/examples/mandelbrot_tasks/.gitignore
+++ b/examples/mandelbrot_tasks/.gitignore
@@ -0,0 +1,2 @@
+mandelbrot
+*.ppm
--- a/examples/mandelbrot_tasks/Makefile
+++ b/examples/mandelbrot_tasks/Makefile
@@ -1,20 +1,18 @@

 ARCH = $(shell uname)

-TASK_CXX=tasks_pthreads.cpp
+TASK_CXX=../tasksys.cpp
 TASK_LIB=-lpthread
+TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))

-ifeq ($(ARCH), Darwin)
-  TASK_CXX=tasks_gcd.cpp
-  TASK_LIB=
-endif
-
-TASK_OBJ=$(addprefix objs/, $(TASK_CXX:.cpp=.o))
-
-CXX=g++ -m64
-CXXFLAGS=-Iobjs/ -O3 -Wall
+CXX=g++
+CXXFLAGS=-Iobjs/ -O3 -Wall -m64
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
+ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
+
+OBJS=objs/mandelbrot.o objs/mandelbrot_serial.o $(TASK_OBJ) \
+	objs/mandelbrot_ispc.o objs/mandelbrot_ispc_sse2.o \
+	objs/mandelbrot_ispc_sse4.o objs/mandelbrot_ispc_avx.o 

 default: mandelbrot

@@ -26,13 +24,16 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ mandelbrot

-mandelbrot: dirs objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc.o $(TASK_OBJ)
-	$(CXX) $(CXXFLAGS) -o $@ objs/mandelbrot.o objs/mandelbrot_ispc.o objs/mandelbrot_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
+mandelbrot: dirs $(OBJS)
+	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)

 objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@

+objs/%.o: ../%.cpp
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
 objs/mandelbrot.o: objs/mandelbrot_ispc.h 

-objs/%_ispc.h objs/%_ispc.o: %.ispc
+objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
 	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/mandelbrot_tasks/mandelbrot.cpp
+++ b/examples/mandelbrot_tasks/mandelbrot.cpp
@@ -40,6 +40,7 @@

 #include <stdio.h>
 #include <algorithm>
+#include <string.h>
 #include "../timing.h"
 #include "mandelbrot_ispc.h"
 using namespace ispc;
@@ -63,10 +64,16 @@ writePPM(int *buf, int width, int height, const char *fn) {
            fputc(c, fp);
    }
    fclose(fp);
+    printf("Wrote image file %s\n", fn);
 }


-int main() {
+static void usage() {   // print the command-line synopsis to stderr, then terminate
+    fputs("usage: mandelbrot [--scale=<factor>]\n", stderr);
+    exit(1);            // non-zero status: the arguments were rejected
+}
+
+int main(int argc, char *argv[]) {
    unsigned int width = 1536;
    unsigned int height = 1024;
    float x0 = -2;
@@ -74,8 +81,24 @@ int main() {
    float y0 = -1;
    float y1 = 1;

-    extern void TasksInit();
-    TasksInit();
+    if (argc == 1)
+        ;
+    else if (argc == 2) {
+        if (strncmp(argv[1], "--scale=", 8) == 0) {
+            float scale = atof(argv[1] + 8);
+            if (scale == 0.f)
+                usage();
+            width *= scale;
+            height *= scale;
+            // round up to multiples of 16
+            width = (width + 0xf) & ~0xf;
+            height = (height + 0xf) & ~0xf;
+        }
+        else 
+            usage();
+    }
+    else
+        usage();

    int maxIterations = 512;
    int *buf = new int[width*height];
@@ -86,6 +109,9 @@ int main() {
    //
    double minISPC = 1e30;
    for (int i = 0; i < 3; ++i) {
+        // Clear out the buffer
+        for (unsigned int i = 0; i < width * height; ++i)
+            buf[i] = 0;
        reset_and_start_timer();
        mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
        double dt = get_elapsed_mcycles();
@@ -95,9 +121,6 @@ int main() {
    printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC);
    writePPM(buf, width, height, "mandelbrot-ispc.ppm");

-    // Clear out the buffer
-    for (unsigned int i = 0; i < width * height; ++i)
-        buf[i] = 0;

    // 
    // And run the serial implementation 3 times, again reporting the
@@ -105,6 +128,9 @@ int main() {
    //
    double minSerial = 1e30;
    for (int i = 0; i < 3; ++i) {
+        // Clear out the buffer
+        for (unsigned int i = 0; i < width * height; ++i)
+            buf[i] = 0;
        reset_and_start_timer();
        mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf);
        double dt = get_elapsed_mcycles();
--- a/examples/mandelbrot_tasks/mandelbrot.ispc
+++ b/examples/mandelbrot_tasks/mandelbrot.ispc
@@ -53,11 +53,14 @@ mandel(float c_re, float c_im, int count) {
   [ystart,yend).
 */
 task void
-mandelbrot_scanlines(uniform int ystart, uniform int yend,
+mandelbrot_scanlines(uniform int ybase, uniform int span,
                     uniform float x0, uniform float dx, 
                     uniform float y0, uniform float dy,
                     uniform int width, uniform int maxIterations,
                     reference uniform int output[]) {
+    uniform int ystart = ybase + taskIndex * span;
+    uniform int yend = ystart + span;
+
    for (uniform int j = ystart; j < yend; ++j) {
        for (uniform int i = 0; i < width; i += programCount) {
            float x = x0 + (programIndex + i) * dx;
@@ -70,6 +73,20 @@ mandelbrot_scanlines(uniform int ystart, uniform int yend,
 }
                               

+// Launched as one of taskCount tasks: renders its slice of rows via one scanline task per row.
+task void mandelbrot_chunk(uniform float x0, uniform float dx,
+                           uniform float y0, uniform float dy,
+                           uniform int width, uniform int height,
+                           uniform int maxIterations, reference uniform int output[]) {
+    // Multiply before dividing so the slices tile [0,height) even if height % taskCount != 0.
+    uniform int ystart = (taskIndex * height) / taskCount;
+    uniform int yend = ((taskIndex+1) * height) / taskCount;
+    uniform int span = 1;   // rows per scanline task; 1 divides every slice exactly
+    launch[(yend-ystart)/span] < mandelbrot_scanlines(ystart, span, x0, dx, y0, dy,
+                                                      width, maxIterations, output) >;
+}
+
+
 export void
 mandelbrot_ispc(uniform float x0, uniform float y0, 
                uniform float x1, uniform float y1,
@@ -78,9 +95,6 @@ mandelbrot_ispc(uniform float x0, uniform float y0,
    uniform float dx = (x1 - x0) / width;
    uniform float dy = (y1 - y0) / height;

-    /* Launch task to compute results for spans of 'span' scanlines. */
-    uniform int span = 2;
-    for (uniform int j = 0; j < height; j += span)
-        launch < mandelbrot_scanlines(j, j+span, x0, dx, y0, dy, width,
-                                      maxIterations, output) >;
+    launch[32] < mandelbrot_chunk(x0, dx, y0, dy, width, height,
+                                  maxIterations, output) >;
 }
--- a/examples/mandelbrot_tasks/mandelbrot_serial.cpp
+++ b/examples/mandelbrot_tasks/mandelbrot_serial.cpp
@@ -36,7 +36,7 @@ static int mandel(float c_re, float c_im, int count) {
    float z_re = c_re, z_im = c_im;
    int i;
    for (i = 0; i < count; ++i) {
-        if (z_re * z_re + z_im * z_im > 4.)
+        if (z_re * z_re + z_im * z_im > 4.f)
            break;

        float new_re = z_re*z_re - z_im*z_im;
--- a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
+++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -64,15 +64,19 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -81,6 +85,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -96,6 +101,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -113,6 +119,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -131,6 +138,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -143,23 +151,23 @@
  <ItemGroup>
    <ClCompile Include="mandelbrot.cpp" />
    <ClCompile Include="mandelbrot_serial.cpp" />
-    <ClCompile Include="tasks_concrt.cpp" />
+    <ClCompile Include="../tasksys.cpp" />
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="mandelbrot.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
    </CustomBuild>
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
--- a/examples/mandelbrot_tasks/tasks_concrt.cpp
+++ b/examples/mandelbrot_tasks/tasks_concrt.cpp
@@ -1,128 +0,0 @@
-/*
-  Copyright (c) 2010-2011, Intel Corporation
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions are
-  met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived from
-      this software without specific prior written permission.
-
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
-*/
-
-/* Simple task system implementation for ispc based on Microsoft's
-   Concurrency Runtime. */
-
-#include <windows.h>
-#include <concrt.h>
-using namespace Concurrency;
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-// ispc expects these functions to have C linkage / not be mangled
-extern "C" { 
-    void ISPCLaunch(void *f, void *data);
-    void ISPCSync();
-}
-
-typedef void (*TaskFuncType)(void *, int, int);
-
-struct TaskInfo {
-    TaskFuncType ispcFunc;
-    void *ispcData;
-};
-
-// This is a simple implementation that just aborts if more than MAX_TASKS
-// are launched.  It could easily be extended to be more general...
-
-#define MAX_TASKS 4096
-static int taskOffset;
-static TaskInfo taskInfo[MAX_TASKS];
-static event *events[MAX_TASKS];
-static CRITICAL_SECTION criticalSection;
-static bool initialized = false;
-
-void
-TasksInit() {
-    InitializeCriticalSection(&criticalSection);
-    for (int i = 0; i < MAX_TASKS; ++i)
-        events[i] = new event;
-    initialized = true;
-}
-
-
-void __cdecl
-lRunTask(LPVOID param) {
-    TaskInfo *ti = (TaskInfo *)param;
-    
-    // Actually run the task. 
-    // FIXME: like the tasks_gcd.cpp implementation, this is passing bogus
-    // values for the threadIndex and threadCount builtins, which in turn
-    // will cause bugs in code that uses those.  FWIW this example doesn't
-    // use them...
-    int threadIndex = 0;
-    int threadCount = 1;
-    ti->ispcFunc(ti->ispcData, threadIndex, threadCount);
-
-    // Signal the event that this task is done
-    int taskNum = ti - &taskInfo[0];
-    events[taskNum]->set();
-}
-
-
-void
-ISPCLaunch(void *func, void *data) {
-    if (!initialized) {
-        fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
-        exit(1);
-    }
-
-    // Get a TaskInfo struct for this task
-    EnterCriticalSection(&criticalSection);
-    TaskInfo *ti = &taskInfo[taskOffset++];
-    assert(taskOffset < MAX_TASKS);
-    LeaveCriticalSection(&criticalSection);
-
-    // And pass it on to the Concurrency Runtime...
-    ti->ispcFunc = (TaskFuncType)func;
-    ti->ispcData = data;
-    CurrentScheduler::ScheduleTask(lRunTask, ti);
-}
-
-
-void ISPCSync() {
-    if (!initialized) {
-        fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
-        exit(1);
-    }
-
-    event::wait_for_multiple(&events[0], taskOffset, true, 
-                             COOPERATIVE_TIMEOUT_INFINITE);
-
-    for (int i = 0; i < taskOffset; ++i)
-        events[i]->reset();
-
-    taskOffset = 0;
-}
--- a/examples/mandelbrot_tasks/tasks_gcd.cpp
+++ b/examples/mandelbrot_tasks/tasks_gcd.cpp
@@ -1,103 +0,0 @@
-/*
-  Copyright (c) 2010-2011, Intel Corporation
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions are
-  met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived from
-      this software without specific prior written permission.
-
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
-*/
-
-/* A simple task system for ispc programs based on Apple's Grand Central
-   Dispatch. */
-
-#include <dispatch/dispatch.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-static bool initialized = false;
-static dispatch_queue_t gcdQueue;
-static dispatch_group_t gcdGroup;
-
-// ispc expects these functions to have C linkage / not be mangled
-extern "C" {
-    void ISPCLaunch(void *f, void *data);
-    void ISPCSync();
-}
-
-struct TaskInfo {
-    void *func;
-    void *data;
-};
-
-
-void
-TasksInit() {
-    gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
-    gcdGroup = dispatch_group_create();
-    initialized = true;
-}
-
-
-static void
-lRunTask(void *ti) {
-    typedef void (*TaskFuncType)(void *, int, int);
-    TaskInfo *taskInfo = (TaskInfo *)ti;
-
-    TaskFuncType func = (TaskFuncType)(taskInfo->func);
-
-    // FIXME: these are bogus values; may cause bugs in code that depends
-    // on them having unique values in different threads.
-    int threadIndex = 0;
-    int threadCount = 1;
-    // Actually run the task
-    func(taskInfo->data, threadIndex, threadCount);
-
-    // FIXME: taskInfo leaks...
-}
-
-
-void ISPCLaunch(void *func, void *data) {
-    if (!initialized) {
-        fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
-        exit(1);
-    }
-    TaskInfo *ti = new TaskInfo;
-    ti->func = func;
-    ti->data = data;
-    dispatch_group_async_f(gcdGroup, gcdQueue, ti, lRunTask);
-}
-
-
-void ISPCSync() {
-    if (!initialized) {
-        fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
-        exit(1);
-    }
-
-    // Wait for all of the tasks in the group to complete before returning
-    dispatch_group_wait(gcdGroup, DISPATCH_TIME_FOREVER);
-}
--- a/examples/mandelbrot_tasks/tasks_pthreads.cpp
+++ b/examples/mandelbrot_tasks/tasks_pthreads.cpp
@@ -1,295 +0,0 @@
-/*
-  Copyright (c) 2010-2011, Intel Corporation
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions are
-  met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived from
-      this software without specific prior written permission.
-
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
-*/
-
-#include <pthread.h>
-#include <semaphore.h>
-#include <string.h>
-#include <unistd.h>
-#include <assert.h>
-#include <stdio.h>
-#include <fcntl.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/param.h>
-#include <sys/sysctl.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <errno.h>
-#include <vector>
-
-// ispc expects these functions to have C linkage / not be mangled
-extern "C" { 
-    void ISPCLaunch(void *f, void *data);
-    void ISPCSync();
-}
-
-
-static int nThreads;
-static pthread_t *threads;
-static pthread_mutex_t taskQueueMutex;
-static std::vector<std::pair<void *, void *> > taskQueue;
-static sem_t *workerSemaphore;
-static uint32_t numUnfinishedTasks;
-static pthread_mutex_t tasksRunningConditionMutex;
-static pthread_cond_t tasksRunningCondition;
-
-static void *lTaskEntry(void *arg);
-
-/** Figure out how many CPU cores there are in the system
- */
-static int
-lNumCPUCores() {
-#if defined(__linux__)
-    return sysconf(_SC_NPROCESSORS_ONLN);
-#else
-    // Mac
-    int mib[2];
-    mib[0] = CTL_HW;
-    size_t length = 2;
-    if (sysctlnametomib("hw.logicalcpu", mib, &length) == -1) {
-        fprintf(stderr, "sysctlnametomib() filed.  Guessing 2 cores.");
-        return 2;
-    }
-    assert(length == 2);
-
-    int nCores = 0;
-    size_t size = sizeof(nCores);
-
-    if (sysctl(mib, 2, &nCores, &size, NULL, 0) == -1) {
-        fprintf(stderr, "sysctl() to find number of cores present failed.  Guessing 2.");
-        return 2;
-    }
-    return nCores;
-#endif
-}
-
-void
-TasksInit() {
-    nThreads = lNumCPUCores();
-
-    threads = new pthread_t[nThreads];
-
-    int err;
-    if ((err = pthread_mutex_init(&taskQueueMutex, NULL)) != 0) {
-        fprintf(stderr, "Error creating mutex: %s\n", strerror(err));
-        exit(1);
-    }
-
-    char name[32];
-    sprintf(name, "mandelbrot.%d", (int)getpid());
-    workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0);
-    if (!workerSemaphore) {
-        fprintf(stderr, "Error creating semaphore: %s\n", strerror(err));
-        exit(1);
-    }
-
-    if ((err = pthread_cond_init(&tasksRunningCondition, NULL)) != 0) {
-        fprintf(stderr, "Error creating condition variable: %s\n", strerror(err));
-        exit(1);
-    }
-
-    if ((err = pthread_mutex_init(&tasksRunningConditionMutex, NULL)) != 0) {
-        fprintf(stderr, "Error creating mutex: %s\n", strerror(err));
-        exit(1);
-    }
-
-    for (int i = 0; i < nThreads; ++i) {
-        err = pthread_create(&threads[i], NULL, &lTaskEntry, reinterpret_cast<void *>(i));
-        if (err != 0) {
-            fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err));
-            exit(1);
-        }
-    }
-}
-
-
-void
-ISPCLaunch(void *f, void *d) {
-    if (threads == NULL) {
-        fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
-        exit(1);
-    }
-
-    //
-    // Acquire mutex, add task
-    //
-    int err;
-    if ((err = pthread_mutex_lock(&taskQueueMutex)) != 0) {
-        fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
-        exit(1);
-    }
-
-    taskQueue.push_back(std::make_pair(f, d));
-
-    if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
-        fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
-        exit(1);
-    }
-
-    //
-    // Update count of number of tasks left to run
-    //
-    if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) {
-        fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
-        exit(1);
-    }
-
-    ++numUnfinishedTasks;
-
-    if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) {
-        fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
-        exit(1);
-    }
-
-    //
-    // Post to the worker semaphore to wake up worker threads that are
-    // sleeping waiting for tasks to show up
-    //
-    if ((err = sem_post(workerSemaphore)) != 0) {
-        fprintf(stderr, "Error from sem_post: %s\n", strerror(err));
-        exit(1);
-    }
-}
-
-
-static void *
-lTaskEntry(void *arg) {
-    int threadIndex = int(reinterpret_cast<int64_t>(arg));
-    int threadCount = nThreads;
-
-    while (true) {
-        int err;
-        if ((err = sem_wait(workerSemaphore)) != 0) {
-            fprintf(stderr, "Error from sem_wait: %s\n", strerror(err));
-            exit(1);
-        }
-
-        std::pair<void *, void *> myTask;
-        //
-        // Acquire mutex, get task
-        //
-        if ((err = pthread_mutex_lock(&taskQueueMutex)) != 0) {
-            fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
-            exit(1);
-        }
-        if (taskQueue.size() == 0) {
-            //
-            // Task queue is empty, go back and wait on the semaphore
-            //
-            if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
-                fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
-                exit(1);
-            }
-            continue;
-        }
-
-        myTask = taskQueue.back();
-        taskQueue.pop_back();
-
-        if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
-            fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
-            exit(1);
-        }
-
-        //
-        // Do work for _myTask_
-        //
-        typedef void (*TaskFunType)(void *, int, int);
-        TaskFunType func = (TaskFunType)myTask.first;
-        func(myTask.second, threadIndex, threadCount);
-
-        //
-        // Decrement the number of unfinished tasks counter
-        //
-        if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) {
-            fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
-            exit(1);
-        }
-
-        int unfinished = --numUnfinishedTasks;
-        if (unfinished == 0) {
-            //
-            // Signal the "no more tasks are running" condition if all of
-            // them are done.
-            //
-            int err;
-            if ((err = pthread_cond_signal(&tasksRunningCondition)) != 0) {
-                fprintf(stderr, "Error from pthread_cond_signal: %s\n", strerror(err));
-                exit(1);
-            }
-        }
-
-        if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) {
-            fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
-            exit(1);
-        }
-    }
-
-    pthread_exit(NULL);
-    return 0;
-}
-
-
-void ISPCSync() {
-    if (threads == NULL) {
-        fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
-        exit(1);
-    }
-
-    int err;
-    if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) {
-        fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
-        exit(1);
-    }
-
-    // As long as there are tasks running, wait on the condition variable;
-    // doing so causes this thread to go to sleep until someone signals on
-    // the tasksRunningCondition condition variable.
-    while (numUnfinishedTasks > 0) {
-        if ((err = pthread_cond_wait(&tasksRunningCondition, 
-                                     &tasksRunningConditionMutex)) != 0) {
-            fprintf(stderr, "Error from pthread_cond_wait: %s\n", strerror(err));
-            exit(1);
-        }
-    }
-    
-    // We acquire ownership of the condition variable mutex when the above
-    // pthread_cond_wait returns.
-    // FIXME: is there a lurking issue here if numUnfinishedTasks gets back
-    // to zero by the time we get to ISPCSync() and thence we're trying to
-    // unlock a mutex we don't have a lock on?
-    if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) {
-        fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
-        exit(1);
-    }
-}
--- a/examples/noise/.gitignore
+++ b/examples/noise/.gitignore
@@ -0,0 +1,3 @@
+noise
+*.ppm
+objs
--- a/examples/noise/Makefile
+++ b/examples/noise/Makefile
@@ -0,0 +1,29 @@
+# Builds the 'noise' example from noise.cpp, noise_serial.cpp and the objects ispc generates from noise.ispc.
+CXX=g++ -m64
+CXXFLAGS=-Iobjs/ -O3 -Wall
+ISPC=ispc
+ISPCFLAGS=-O2 --target=sse2,sse4,avx-x2 --arch=x86-64
+# Link list: the two C++ objects plus noise_ispc.o and one object per ispc target (sse2, sse4, avx).
+OBJS=objs/noise.o objs/noise_serial.o objs/noise_ispc.o objs/noise_ispc_sse2.o \
+	objs/noise_ispc_sse4.o objs/noise_ispc_avx.o 
+
+default: noise
+
+.PHONY: dirs clean
+
+dirs:
+	/bin/mkdir -p objs/
+
+clean:
+	/bin/rm -rf objs *~ noise
+
+noise: dirs $(OBJS)
+	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm
+
+objs/%.o: %.cpp
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+objs/noise.o: objs/noise_ispc.h 
+# One pattern rule with several targets: a single ispc run is declared to produce the header and all listed objects.
+objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
+	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/noise/noise.cpp
+++ b/examples/noise/noise.cpp
@@ -0,0 +1,115 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define NOMINMAX
+#pragma warning (disable: 4244)
+#pragma warning (disable: 4305)
+#endif
+
+#include <stdio.h>
+#include <algorithm>
+#include "../timing.h"
+#include "noise_ispc.h"
+using namespace ispc;
+
+extern void noise_serial(float x0, float y0, float x1, float y1,
+                         int width, int height, float output[]);
+
+/* Write buf (width*height floats) to fn as a binary P6 PPM.  Each value is
+   scaled by 255, clamped to [0,255] and replicated into R, G and B. */
+static void
+writePPM(float *buf, int width, int height, const char *fn) {
+    FILE *fp = fopen(fn, "wb");
+    if (!fp) { perror(fn); return; }  // never hand a NULL stream to fprintf
+    fprintf(fp, "P6\n%d %d\n255\n", width, height);
+    for (int i = 0; i < width*height; ++i) {
+        float v = buf[i] * 255.f;
+        if (v < 0) v = 0;
+        if (v > 255) v = 255;
+        for (int j = 0; j < 3; ++j)
+            fputc((char)v, fp);
+    }
+    fclose(fp);
+}
+
+
+// Benchmark driver: time noise_ispc and noise_serial on the same 768x768 grid,
+// save each result as a PPM, and print the serial/ispc cycle ratio.
+int main() {
+    unsigned int width = 768;
+    unsigned int height = 768;
+    float x0 = -10;
+    float x1 = 10;
+    float y0 = -10;
+    float y1 = 10;
+
+    float *buf = new float[width*height];
+
+    // Compute the image using the ispc implementation; report the minimum
+    // time of three runs.  1e30 just seeds the running minimum.
+    double minISPC = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        noise_ispc(x0, y0, x1, y1, width, height, buf);
+        double dt = get_elapsed_mcycles();
+        minISPC = std::min(minISPC, dt);
+    }
+
+    printf("[noise ispc]:\t\t\t[%.3f] million cycles\n", minISPC);
+    writePPM(buf, width, height, "noise-ispc.ppm");
+
+    // Clear out the buffer so the serial image cannot reuse ispc results
+    for (unsigned int i = 0; i < width * height; ++i)
+        buf[i] = 0;
+
+    // And run the serial implementation 3 times, again reporting the
+    // minimum time.
+    double minSerial = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        noise_serial(x0, y0, x1, y1, width, height, buf);
+        double dt = get_elapsed_mcycles();
+        minSerial = std::min(minSerial, dt);
+    }
+
+    printf("[noise serial]:\t\t\t[%.3f] million cycles\n", minSerial);
+    writePPM(buf, width, height, "noise-serial.ppm");
+
+    // A ratio above 1 means the ispc version needed fewer cycles.
+    printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC);
+
+    delete[] buf;  // matches the new[] above
+    return 0;
+}
--- a/examples/noise/noise.ispc
+++ b/examples/noise/noise.ispc
@@ -0,0 +1,164 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#define NOISE_PERM_SIZE 256
+// Hash table read by Grad(): 2*NOISE_PERM_SIZE entries (the 2nd half appears to repeat the 1st) so chained index sums need no wrap.
+static uniform int NoisePerm[2 * NOISE_PERM_SIZE] = {
+    151, 160, 137, 91, 90, 15, 131, 13, 201, 95, 96, 53, 194, 233, 7, 225, 140,
+    36, 103, 30, 69, 142, 8, 99, 37, 240, 21, 10, 23, 190, 6, 148, 247, 120,
+    234, 75, 0, 26, 197, 62, 94, 252, 219, 203, 117, 35, 11, 32, 57, 177, 33,
+    88, 237, 149, 56, 87, 174, 20, 125, 136, 171, 168,  68, 175, 74, 165, 71, 
+    134, 139, 48, 27, 166, 77, 146, 158, 231, 83, 111, 229, 122, 60, 211, 133, 
+    230, 220, 105, 92, 41, 55, 46, 245, 40, 244, 102, 143, 54, 65, 25, 63, 161,
+    1, 216, 80, 73, 209, 76, 132, 187, 208,  89, 18, 169, 200, 196, 135, 130, 
+    116, 188, 159, 86, 164, 100, 109, 198, 173, 186,  3, 64, 52, 217, 226, 250,
+    124, 123, 5, 202, 38, 147, 118, 126, 255, 82, 85, 212, 207, 206, 59, 227, 
+    47, 16, 58, 17, 182, 189, 28, 42, 223, 183, 170, 213, 119, 248, 152,  2, 44,
+    154, 163, 70, 221, 153, 101, 155, 167,  43, 172, 9, 129, 22, 39, 253,  19, 
+    98, 108, 110, 79, 113, 224, 232, 178, 185,  112, 104, 218, 246, 97, 228, 251,
+    34, 242, 193, 238, 210, 144, 12, 191, 179, 162, 241, 81, 51, 145, 235, 249,
+    14, 239, 107, 49, 192, 214,  31, 181, 199, 106, 157, 184, 84, 204, 176, 115,
+    121, 50, 45, 127,  4, 150, 254, 138, 236, 205, 93, 222, 114, 67, 29, 24, 72, 
+    243, 141, 128, 195, 78, 66, 215, 61, 156, 180, 151, 160, 137, 91, 90, 15,
+    131, 13, 201, 95, 96, 53, 194, 233, 7, 225, 140, 36, 103, 30, 69, 142, 8, 99,
+    37, 240, 21, 10, 23, 190, 6, 148, 247, 120, 234, 75, 0, 26, 197, 62, 94, 252,
+    219, 203, 117, 35, 11, 32, 57, 177, 33, 88, 237, 149, 56, 87, 174, 20, 125, 
+    136, 171, 168,  68, 175, 74, 165, 71, 134, 139, 48, 27, 166, 77, 146, 158,
+    231, 83, 111, 229, 122, 60, 211, 133, 230, 220, 105, 92, 41, 55, 46, 245,
+    40, 244, 102, 143, 54,  65, 25, 63, 161,  1, 216, 80, 73, 209, 76, 132, 187,
+    208,  89, 18, 169, 200, 196, 135, 130, 116, 188, 159, 86, 164, 100, 109, 
+    198, 173, 186,  3, 64, 52, 217, 226, 250, 124, 123, 5, 202, 38, 147, 118,
+    126, 255, 82, 85, 212, 207, 206, 59, 227, 47, 16, 58, 17, 182, 189, 28, 42,
+    223, 183, 170, 213, 119, 248, 152,  2, 44, 154, 163, 70, 221, 153, 101, 155, 
+    167,  43, 172, 9, 129, 22, 39, 253,  19, 98, 108, 110, 79, 113, 224, 232,
+    178, 185,  112, 104, 218, 246, 97, 228, 251, 34, 242, 193, 238, 210, 144,
+    12, 191, 179, 162, 241,  81, 51, 145, 235, 249, 14, 239, 107, 49, 192, 214,
+    31, 181, 199, 106, 157, 184,  84, 204, 176, 115, 121, 50, 45, 127,  4, 150,
+    254, 138, 236, 205, 93, 222, 114, 67, 29, 24, 72, 243, 141, 128, 195, 78, 
+    66, 215, 61, 156, 180
+};
+
+// Cubic ease: rescales value from [low, high] to t in [0, 1] (clamped) and returns 3t^2 - 2t^3.
+inline float SmoothStep(float low, float high, float value) {
+    float t = clamp((value - low) / (high - low), 0.f, 1.f);
+    return t * t * (3.f - 2.f * t);
+}
+
+
+// Floor val, then convert to int: the result is the largest integer <= val
+// (so -0.5 maps to -1).
+inline int Floor2Int(float val) { return (int)floor(val); }
+
+
+// Hash (x, y, z) through NoisePerm; the low 4 bits pick two of (dx, dy, dz) and their signs, and the signed sum is returned.
+inline float Grad(int x, int y, int z, float dx, float dy, float dz) {
+    int hash = NoisePerm[NoisePerm[NoisePerm[x]+y]+z] & 15;
+    float a = (hash < 8 || hash == 12 || hash == 13) ? dx : dy;
+    float b = (hash < 4 || hash == 12 || hash == 13) ? dy : dz;
+    return ((hash & 1) ? -a : a) + ((hash & 2) ? -b : b);
+}
+
+
+// Quintic fade 6t^5 - 15t^4 + 10t^3; Noise() uses it as the interpolation weight.
+inline float NoiseWeight(float t) {
+    float cube = t*t*t, quart = cube*t;
+    return 6.f*quart*t - 15.f*quart + 10.f*cube;
+}
+
+
+// Linear blend: returns low at t == 0 and high at t == 1.
+// Note that the interpolant t is the first argument, ahead of the endpoints.
+inline float Lerp(float t, float low, float high) { return (1. - t) * low + t * high; }
+
+// 3D gradient noise at (x, y, z): blends Grad() values from the 8 corners of the enclosing integer lattice cell.
+static float Noise(float x, float y, float z) {
+    // Compute noise cell coordinates (integer part) and offsets within the cell
+    int ix = Floor2Int(x), iy = Floor2Int(y), iz = Floor2Int(z);
+    float dx = x - ix, dy = y - iy, dz = z - iz;
+
+    // Wrap the cell coordinates to 0..NOISE_PERM_SIZE-1, then compute gradient weights at each corner
+    ix &= (NOISE_PERM_SIZE-1);
+    iy &= (NOISE_PERM_SIZE-1);
+    iz &= (NOISE_PERM_SIZE-1);
+    float w000 = Grad(ix,   iy,   iz,   dx,   dy,   dz);
+    float w100 = Grad(ix+1, iy,   iz,   dx-1, dy,   dz);
+    float w010 = Grad(ix,   iy+1, iz,   dx,   dy-1, dz);
+    float w110 = Grad(ix+1, iy+1, iz,   dx-1, dy-1, dz);
+    float w001 = Grad(ix,   iy,   iz+1, dx,   dy,   dz-1);
+    float w101 = Grad(ix+1, iy,   iz+1, dx-1, dy,   dz-1);
+    float w011 = Grad(ix,   iy+1, iz+1, dx,   dy-1, dz-1);
+    float w111 = Grad(ix+1, iy+1, iz+1, dx-1, dy-1, dz-1);
+
+    // Compute trilinear interpolation of weights: along x first, then y, then z
+    float wx = NoiseWeight(dx), wy = NoiseWeight(dy), wz = NoiseWeight(dz);
+    float x00 = Lerp(wx, w000, w100);
+    float x10 = Lerp(wx, w010, w110);
+    float x01 = Lerp(wx, w001, w101);
+    float x11 = Lerp(wx, w011, w111);
+    float y0 = Lerp(wy, x00, x10);
+    float y1 = Lerp(wy, x01, x11);
+    return Lerp(wz, y0, y1);
+}
+
+
+// Sums |amp * Noise| over `octaves` passes; each pass scales the coordinates
+// by 1.99 and the amplitude by 0.6.  Returns half of the accumulated sum.
+static float Turbulence(float x, float y, float z, uniform int octaves) {
+    float falloff = 0.6, total = 0., freq = 1., amp = 1.;
+    for (uniform int octave = 0; octave < octaves; ++octave) {
+        total += abs(amp * Noise(freq * x, freq * y, freq * z));
+        freq *= 1.99f;
+        amp *= falloff;
+    }
+    return total * 0.5;
+}
+
+// Fill output[] (row-major, width x height) with Turbulence() sampled on a grid over [x0,x1) x [y0,y1).
+export void noise_ispc(uniform float x0, uniform float y0, uniform float x1,
+                       uniform float y1, uniform int width, uniform int height,
+                       uniform float output[])
+{
+    uniform float dx = (x1 - x0) / width;
+    uniform float dy = (y1 - y0) / height;
+    for (uniform int j = 0; j < height; j++) {
+        for (uniform int i = 0; i < width; i += programCount) {
+            float x = x0 + (i + programIndex) * dx;
+            float y = y0 + j * dy;
+            int index = (j * width + i + programIndex);
+            // Skip lanes past the row end when width is not a multiple of programCount.
+            if (i + programIndex < width)
+                output[index] = Turbulence(x, y, 0.6, 8);
+        }
+    }
+}
+
--- a/examples/noise/noise.vcxproj
+++ b/examples/noise/noise.vcxproj
@@ -0,0 +1,175 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>noise</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="noise.cpp" />
+    <ClCompile Include="noise_serial.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="noise.ispc">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/examples/noise/noise_serial.cpp
+++ b/examples/noise/noise_serial.cpp
@@ -0,0 +1,170 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#include <math.h>
+
+#define NOISE_PERM_SIZE 256
+// Read-only permutation table indexed by Grad(); the 256-value sequence is stored twice (512 entries in total).
+static const int NoisePerm[2 * NOISE_PERM_SIZE] = {
+    151, 160, 137, 91, 90, 15, 131, 13, 201, 95, 96, 53, 194, 233, 7, 225, 140,
+    36, 103, 30, 69, 142, 8, 99, 37, 240, 21, 10, 23, 190, 6, 148, 247, 120,
+    234, 75, 0, 26, 197, 62, 94, 252, 219, 203, 117, 35, 11, 32, 57, 177, 33,
+    88, 237, 149, 56, 87, 174, 20, 125, 136, 171, 168,  68, 175, 74, 165, 71,
+    134, 139, 48, 27, 166, 77, 146, 158, 231, 83, 111, 229, 122, 60, 211, 133,
+    230, 220, 105, 92, 41, 55, 46, 245, 40, 244, 102, 143, 54, 65, 25, 63, 161,
+    1, 216, 80, 73, 209, 76, 132, 187, 208,  89, 18, 169, 200, 196, 135, 130,
+    116, 188, 159, 86, 164, 100, 109, 198, 173, 186,  3, 64, 52, 217, 226, 250,
+    124, 123, 5, 202, 38, 147, 118, 126, 255, 82, 85, 212, 207, 206, 59, 227,
+    47, 16, 58, 17, 182, 189, 28, 42, 223, 183, 170, 213, 119, 248, 152,  2, 44,
+    154, 163, 70, 221, 153, 101, 155, 167,  43, 172, 9, 129, 22, 39, 253,  19,
+    98, 108, 110, 79, 113, 224, 232, 178, 185,  112, 104, 218, 246, 97, 228, 251,
+    34, 242, 193, 238, 210, 144, 12, 191, 179, 162, 241, 81, 51, 145, 235, 249,
+    14, 239, 107, 49, 192, 214,  31, 181, 199, 106, 157, 184, 84, 204, 176, 115,
+    121, 50, 45, 127,  4, 150, 254, 138, 236, 205, 93, 222, 114, 67, 29, 24, 72,
+    243, 141, 128, 195, 78, 66, 215, 61, 156, 180, 151, 160, 137, 91, 90, 15,
+    131, 13, 201, 95, 96, 53, 194, 233, 7, 225, 140, 36, 103, 30, 69, 142, 8, 99,
+    37, 240, 21, 10, 23, 190, 6, 148, 247, 120, 234, 75, 0, 26, 197, 62, 94, 252,
+    219, 203, 117, 35, 11, 32, 57, 177, 33, 88, 237, 149, 56, 87, 174, 20, 125,
+    136, 171, 168,  68, 175, 74, 165, 71, 134, 139, 48, 27, 166, 77, 146, 158,
+    231, 83, 111, 229, 122, 60, 211, 133, 230, 220, 105, 92, 41, 55, 46, 245,
+    40, 244, 102, 143, 54,  65, 25, 63, 161,  1, 216, 80, 73, 209, 76, 132, 187,
+    208,  89, 18, 169, 200, 196, 135, 130, 116, 188, 159, 86, 164, 100, 109,
+    198, 173, 186,  3, 64, 52, 217, 226, 250, 124, 123, 5, 202, 38, 147, 118,
+    126, 255, 82, 85, 212, 207, 206, 59, 227, 47, 16, 58, 17, 182, 189, 28, 42,
+    223, 183, 170, 213, 119, 248, 152,  2, 44, 154, 163, 70, 221, 153, 101, 155,
+    167,  43, 172, 9, 129, 22, 39, 253,  19, 98, 108, 110, 79, 113, 224, 232,
+    178, 185,  112, 104, 218, 246, 97, 228, 251, 34, 242, 193, 238, 210, 144,
+    12, 191, 179, 162, 241,  81, 51, 145, 235, 249, 14, 239, 107, 49, 192, 214,
+    31, 181, 199, 106, 157, 184,  84, 204, 176, 115, 121, 50, 45, 127,  4, 150,
+    254, 138, 236, 205, 93, 222, 114, 67, 29, 24, 72, 243, 141, 128, 195, 78,
+    66, 215, 61, 156, 180
+};
+
+// Limits v to [low, high]; the lower bound wins if both comparisons hold.
+inline float Clamp(float v, float low, float high) {
+    const float capped = (v > high) ? high : v;
+    return (v < low) ? low : capped;
+}
+
+// Cubic 3t^2 - 2t^3 of t = (value - low) / (high - low) clamped to [0, 1].
+inline float SmoothStep(float low, float high, float value) {
+    const float t = Clamp((value - low) / (high - low), 0.f, 1.f);
+    return (t * t) * (3.f - 2.f * t);
+}
+
+// Rounds val down (toward negative infinity) with floorf and converts the result to int.
+inline int Floor2Int(float val) {
+    return static_cast<int>(floorf(val));
+}
+
+// Hashes lattice corner (x, y, z) via NoisePerm; the low 4 bits pick and sign two of (dx, dy, dz) to sum.
+inline float Grad(int x, int y, int z, float dx, float dy, float dz) {
+    const int hash = NoisePerm[NoisePerm[NoisePerm[x] + y] + z] & 15;
+    const bool isDiag = (hash == 12 || hash == 13);
+    const float first = (hash < 8 || isDiag) ? dx : dy;
+    const float second = (hash < 4 || isDiag) ? dy : dz;
+    return ((hash & 1) ? -first : first) + ((hash & 2) ? -second : second);
+}
+
+// Quintic weight 6t^5 - 15t^4 + 10t^3, built from shared t^3 and t^4 terms.
+inline float NoiseWeight(float t) {
+    const float cube = t * t * t;
+    const float quart = cube * t;
+    return 6.f * quart * t - 15.f * quart + 10.f * cube;
+}
+
+// Linear blend of low and high: yields low at t = 0 and high at t = 1.
+inline float Lerp(float t, float low, float high) {
+    return low * (1.f - t) + high * t;
+}
+
+// Lattice gradient noise: blends Grad() at the 8 corners of the unit cell containing (x, y, z).
+static float Noise(float x, float y, float z) {
+    // Split each coordinate into its integer lattice cell and the fractional offset inside it
+    const int cellX = Floor2Int(x), cellY = Floor2Int(y), cellZ = Floor2Int(z);
+    const float dx = x - cellX, dy = y - cellY, dz = z - cellZ;
+
+    // Wrap the cell indices into [0, NOISE_PERM_SIZE) before they are used as table indices
+    const int ix = cellX & (NOISE_PERM_SIZE-1);
+    const int iy = cellY & (NOISE_PERM_SIZE-1);
+    const int iz = cellZ & (NOISE_PERM_SIZE-1);
+
+    // Gradient contributions: first the four corners at iz, then the four at iz+1
+    const float w000 = Grad(ix,   iy,   iz,   dx,   dy,   dz);
+    const float w100 = Grad(ix+1, iy,   iz,   dx-1, dy,   dz);
+    const float w010 = Grad(ix,   iy+1, iz,   dx,   dy-1, dz);
+    const float w110 = Grad(ix+1, iy+1, iz,   dx-1, dy-1, dz);
+    const float w001 = Grad(ix,   iy,   iz+1, dx,   dy,   dz-1);
+    const float w101 = Grad(ix+1, iy,   iz+1, dx-1, dy,   dz-1);
+    const float w011 = Grad(ix,   iy+1, iz+1, dx,   dy-1, dz-1);
+    const float w111 = Grad(ix+1, iy+1, iz+1, dx-1, dy-1, dz-1);
+
+    // Fade weights per axis, then interpolate along x, then y, then z
+    const float wx = NoiseWeight(dx), wy = NoiseWeight(dy), wz = NoiseWeight(dz);
+    const float nearLo = Lerp(wx, w000, w100), nearHi = Lerp(wx, w010, w110);
+    const float farLo = Lerp(wx, w001, w101), farHi = Lerp(wx, w011, w111);
+    const float nearVal = Lerp(wy, nearLo, nearHi);
+    const float farVal = Lerp(wy, farLo, farHi);
+    return Lerp(wz, nearVal, farVal);
+}
+
+// Sums |o * Noise(lambda * p)| over `octaves` octaves (lambda x1.99, o x0.6 per octave); returns half the sum.
+static float Turbulence(float x, float y, float z, int octaves) {
+    // Float-suffixed literals avoid double->float truncation warnings (e.g. MSVC C4305).
+    const float omega = 0.6f;
+    float sum = 0.f, lambda = 1.f, o = 1.f;
+    for (int i = 0; i < octaves; ++i) {
+        sum += fabsf(o * Noise(lambda * x, lambda * y, lambda * z));
+        lambda *= 1.99f;
+        o *= omega;
+    }
+    return sum * 0.5f;
+}
+
+// Serial reference: stores Turbulence(x, y, 0.6f, 8) for each pixel of a width x height grid, row-major.
+void noise_serial(float x0, float y0, float x1, float y1,
+                  int width, int height, float output[])
+{
+    const float stepX = (x1 - x0) / width;
+    const float stepY = (y1 - y0) / height;
+
+    for (int row = 0; row < height; ++row) {
+        // y depends only on the row, so it is computed once per scanline
+        const float y = y0 + row * stepY;
+        float *scanline = output + row * width;
+        for (int col = 0; col < width; ++col) {
+            const float x = x0 + col * stepX;
+            scanline[col] = Turbulence(x, y, 0.6f, 8);
+        }
+    }
+}
+
--- a/examples/options/.gitignore
+++ b/examples/options/.gitignore
@@ -0,0 +1 @@
+options
--- a/examples/options/Makefile
+++ b/examples/options/Makefile
@@ -2,7 +2,11 @@
 CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -g -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
+ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
+
+OBJS=objs/options.o objs/options_serial.o objs/options_ispc.o \
+	objs/options_ispc_sse2.o objs/options_ispc_sse4.o \
+	objs/options_ispc_avx.o

 default: options

@@ -14,13 +18,13 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ options

-options: dirs objs/options.o objs/options_serial.o objs/options_ispc.o
-	$(CXX) $(CXXFLAGS) -o $@ objs/options.o objs/options_ispc.o objs/options_serial.o -lm
+options: dirs $(OBJS)
+	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm

 objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@

 objs/options.o: objs/options_ispc.h options_defs.h

-objs/%_ispc.h objs/%_ispc.o: %.ispc options_defs.h
+objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc options_defs.h
 	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/options/options.vcxproj
+++ b/examples/options/options.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -64,15 +64,19 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -81,6 +85,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <DisableSpecificWarnings>4305</DisableSpecificWarnings>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
@@ -97,6 +102,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <DisableSpecificWarnings>4305</DisableSpecificWarnings>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
@@ -115,6 +121,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <DisableSpecificWarnings>4305</DisableSpecificWarnings>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -134,6 +141,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <DisableSpecificWarnings>4305</DisableSpecificWarnings>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -151,18 +159,18 @@
  <ItemGroup>
    <CustomBuild Include="options.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
--- a/examples/options/options_serial.cpp
+++ b/examples/options/options_serial.cpp
@@ -47,7 +47,7 @@ static inline float
 CND(float X) {
    float L = fabsf(X);

-    float k = 1.0 / (1.0 + 0.2316419 * L);
+    float k = 1.f / (1.f + 0.2316419f * L);
    float k2 = k*k;
    float k3 = k2*k;
    float k4 = k2*k2;
@@ -59,7 +59,7 @@ CND(float X) {
    w *= invSqrt2Pi * expf(-L * L * .5f);

    if (X > 0.f)
-        w = 1.0 - w;
+        w = 1.f - w;
    return w;
 }

@@ -94,7 +94,7 @@ binomial_put_serial(float Sa[], float Xa[], float Ta[],

        float dt = T / BINOMIAL_NUM;
        float u = expf(v * sqrtf(dt));
-        float d = 1. / u;
+        float d = 1.f / u;
        float disc = expf(r * dt);
        float Pu = (disc - d) / (u - d);

--- a/examples/rt/.gitignore
+++ b/examples/rt/.gitignore
@@ -0,0 +1,2 @@
+rt
+*.ppm
--- a/examples/rt/Makefile
+++ b/examples/rt/Makefile
@@ -1,8 +1,17 @@

-CXX=g++ -m64
-CXXFLAGS=-Iobjs/ -O3 -Wall
+ARCH = $(shell uname)
+
+TASK_CXX=../tasksys.cpp
+TASK_LIB=-lpthread
+TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
+
+CXX=g++
+CXXFLAGS=-Iobjs/ -O3 -Wall -m64
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
+ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx --arch=x86-64
+
+OBJS=objs/rt.o objs/rt_serial.o $(TASK_OBJ) objs/rt_ispc.o objs/rt_ispc_sse2.o \
+	objs/rt_ispc_sse4.o objs/rt_ispc_avx.o

 default: rt

@@ -14,11 +23,16 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ rt

-rt: dirs objs/rt.o objs/rt_serial.o objs/rt_ispc.o
-	$(CXX) $(CXXFLAGS) -o $@ objs/rt.o objs/rt_ispc.o objs/rt_serial.o -lm
+rt: dirs $(OBJS)
+	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)

-objs/%.o: %.cpp objs/rt_ispc.h
+objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@

-objs/%_ispc.h objs/%_ispc.o: %.ispc
+objs/%.o: ../%.cpp
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+objs/rt.o: objs/rt_ispc.h 
+
+objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
 	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/rt/rt.cpp
+++ b/examples/rt/rt.cpp
@@ -42,6 +42,7 @@
 #include <math.h>
 #include <algorithm>
 #include <assert.h>
+#include <string.h>
 #include <sys/types.h>
 #include "../timing.h"
 #include "rt_ispc.h"
@@ -50,7 +51,8 @@ using namespace ispc;

 typedef unsigned int uint;

-extern void raytrace_serial(int width, int height, const float raster2camera[4][4], 
+extern void raytrace_serial(int width, int height, int baseWidth, int baseHeight,
+                            const float raster2camera[4][4], 
                            const float camera2world[4][4], float image[],
                            int id[], const LinearBVHNode nodes[],
                            const Triangle triangles[]);
@@ -89,14 +91,32 @@ static void writeImage(int *idImage, float *depthImage, int width, int height,
        }
    }            
    fclose(f);
+    printf("Wrote image file %s\n", filename);
+}
+
+// Prints the command-line synopsis to stderr and terminates the process with exit status 1.
+static void usage() {
+    fprintf(stderr, "rt [--scale=<factor>] <scene name base>\n");
+    exit(1);
 }


 int main(int argc, char *argv[]) {
-    if (argc != 2) {
-        fprintf(stderr, "usage: rt <filename base>\n");
-        exit(1);
+    float scale = 1.f;
+    const char *filename = NULL;
+    for (int i = 1; i < argc; ++i) {
+        if (strncmp(argv[i], "--scale=", 8) == 0) {
+            scale = atof(argv[i] + 8);
+            if (scale == 0.f)
+                usage();
+        }
+        else if (filename != NULL)
+            usage();
+        else
+            filename = argv[i];
    }
+    if (filename == NULL)
+        usage();

 #define READ(var, n)                                            \
    if (fread(&(var), sizeof(var), n, f) != (unsigned int)n) {  \
@@ -108,10 +128,10 @@ int main(int argc, char *argv[]) {
    // Read the camera specification information from the camera file
    //
    char fnbuf[1024];
-    sprintf(fnbuf, "%s.camera", argv[1]);
+    sprintf(fnbuf, "%s.camera", filename);
    FILE *f = fopen(fnbuf, "rb");
    if (!f) {
-        perror(argv[1]);
+        perror(fnbuf);
        return 1;
    }

@@ -119,20 +139,20 @@ int main(int argc, char *argv[]) {
    // Nothing fancy, and trouble if we run on a big-endian system, just
    // fread in the bits
    //
-    int width, height;
+    int baseWidth, baseHeight;
    float camera2world[4][4], raster2camera[4][4];
-    READ(width, 1);
-    READ(height, 1);
+    READ(baseWidth, 1);
+    READ(baseHeight, 1);
    READ(camera2world[0][0], 16);
    READ(raster2camera[0][0], 16);

    //
    // Read in the serialized BVH 
    //
-    sprintf(fnbuf, "%s.bvh", argv[1]);
+    sprintf(fnbuf, "%s.bvh", filename);
    f = fopen(fnbuf, "rb");
    if (!f) {
-        perror(argv[2]);
+        perror(fnbuf);
        return 1;
    }

@@ -148,14 +168,16 @@ int main(int argc, char *argv[]) {
        // of node, the total number of int it if a leaf node, etc.
        float b[6];
        READ(b[0], 6);
-        nodes[i].bounds[0].v[0] = b[0];
-        nodes[i].bounds[0].v[1] = b[1];
-        nodes[i].bounds[0].v[2] = b[2];
-        nodes[i].bounds[1].v[0] = b[3];
-        nodes[i].bounds[1].v[1] = b[4];
-        nodes[i].bounds[1].v[2] = b[5];
+        nodes[i].bounds[0][0] = b[0];
+        nodes[i].bounds[0][1] = b[1];
+        nodes[i].bounds[0][2] = b[2];
+        nodes[i].bounds[1][0] = b[3];
+        nodes[i].bounds[1][1] = b[4];
+        nodes[i].bounds[1][2] = b[5];
        READ(nodes[i].offset, 1);
-        READ(nodes[i].primsAxis, 1);
+        READ(nodes[i].nPrimitives, 1);
+        READ(nodes[i].splitAxis, 1);
+        READ(nodes[i].pad, 1);
    }

    // And then read the triangles 
@@ -168,19 +190,19 @@ int main(int argc, char *argv[]) {
        READ(v[0], 9);
        float *vp = v;
        for (int j = 0; j < 3; ++j) {
-            triangles[i].p[j].v[0] = *vp++;
-            triangles[i].p[j].v[1] = *vp++;
-            triangles[i].p[j].v[2] = *vp++;
+            triangles[i].p[j][0] = *vp++;
+            triangles[i].p[j][1] = *vp++;
+            triangles[i].p[j][2] = *vp++;
        }
        // And create an object id
        triangles[i].id = i+1;
    }
    fclose(f);

-    // round image resolution up to multiple of 4 to makethings easy for
+    // round image resolution up to multiple of 16 to make things easy for
    // the code that assigns pixels to ispc program instances
-    height = (height + 3) & ~3;
-    width = (width + 3) & ~3;
+    int height = (int(baseHeight * scale) + 0xf) & ~0xf;
+    int width = (int(baseWidth * scale) + 0xf) & ~0xf;

    // allocate images; one to hold hit object ids, one to hold depth to
    // the first interseciton
@@ -188,19 +210,42 @@ int main(int argc, char *argv[]) {
    float *image = new float[width*height];

    //
-    // Run 3 iterations with ispc, record the minimum time
+    // Run 3 iterations with ispc + 1 core, record the minimum time
    //
    double minTimeISPC = 1e30;
    for (int i = 0; i < 3; ++i) {
        reset_and_start_timer();
-        raytrace(width, height, raster2camera, camera2world, 
-                 image, id, nodes, triangles);
+        raytrace_ispc(width, height, baseWidth, baseHeight, raster2camera, 
+                      camera2world, image, id, nodes, triangles);
        double dt = get_elapsed_mcycles();
        minTimeISPC = std::min(dt, minTimeISPC);
    }
-    printf("[rt ispc]:\t\t\t[%.3f] million cycles for %d x %d image\n", minTimeISPC, width, height);
+    printf("[rt ispc, 1 core]:\t\t[%.3f] million cycles for %d x %d image\n", 
+           minTimeISPC, width, height);

-    writeImage(id, image, width, height, "rt-ispc.ppm");
+    writeImage(id, image, width, height, "rt-ispc-1core.ppm");
+
+    memset(id, 0, width*height*sizeof(int));
+    memset(image, 0, width*height*sizeof(float));
+
+    //
+    // Run 3 iterations with ispc + tasks, record the minimum time
+    //
+    double minTimeISPCtasks = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        raytrace_ispc_tasks(width, height, baseWidth, baseHeight, raster2camera,
+                            camera2world, image, id, nodes, triangles);
+        double dt = get_elapsed_mcycles();
+        minTimeISPCtasks = std::min(dt, minTimeISPCtasks);
+    }
+    printf("[rt ispc + tasks]:\t\t[%.3f] million cycles for %d x %d image\n", 
+           minTimeISPCtasks, width, height);
+
+    writeImage(id, image, width, height, "rt-ispc-tasks.ppm");
+
+    memset(id, 0, width*height*sizeof(int));
+    memset(image, 0, width*height*sizeof(float));

    //
    // And 3 iterations with the serial implementation, reporting the
@@ -209,14 +254,15 @@ int main(int argc, char *argv[]) {
    double minTimeSerial = 1e30;
    for (int i = 0; i < 3; ++i) {
        reset_and_start_timer();
-        raytrace_serial(width, height, raster2camera, camera2world, 
-                        image, id, nodes, triangles);
+        raytrace_serial(width, height, baseWidth, baseHeight, raster2camera, 
+                        camera2world, image, id, nodes, triangles);
        double dt = get_elapsed_mcycles();
        minTimeSerial = std::min(dt, minTimeSerial);
    }
    printf("[rt serial]:\t\t\t[%.3f] million cycles for %d x %d image\n", 
           minTimeSerial, width, height);
-    printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minTimeSerial / minTimeISPC);
+    printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx from ISPC + tasks)\n", 
+           minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCtasks); // both ratios: serial time / ispc time

    writeImage(id, image, width, height, "rt-serial.ppm");

--- a/examples/rt/rt.ispc
+++ b/examples/rt/rt.ispc
@@ -43,28 +43,19 @@ struct Ray {
 };

 struct Triangle {
-    uniform float3 p[3];
+    uniform float p[3][4]; // x, y, z per vertex plus one float of padding
    uniform int id;
+    uniform int pad[3];    // pads the struct out to 16 32-bit values
 };

 struct LinearBVHNode {
-    uniform float3 bounds[2];
+    uniform float bounds[2][3];        // two corners of the bounding box, x/y/z each
    uniform unsigned int offset;     // num primitives for leaf, second child for interior
-    uniform unsigned int primsAxis;  // 0:7 nPrimitives, 8:15 split axis, 16:31 padding
+    uniform unsigned int8 nPrimitives; // > 0 marks a leaf node
+    uniform unsigned int8 splitAxis;   // indexes the ray's dirIsNeg[] during traversal
+    uniform unsigned int16 pad;        // padding; rounds the two int8 fields up to 32 bits
 };

-static inline uniform int nPrims(const reference LinearBVHNode node) {
-    return (node.primsAxis & 0xff);
-}
-
-static inline uniform int axis(const reference LinearBVHNode node) {
-    return ((node.primsAxis >> 8) & 0xff);
-}
-
-static inline uniform bool isInterior(const reference LinearBVHNode node) {
-    return nPrims(node) == 0;
-}
-
 static inline float3 Cross(const float3 v1, const float3 v2) {
    float v1x = v1.x, v1y = v1.y, v1z = v1.z;
    float v2x = v2.x, v2y = v2.y, v2z = v2.z;
@@ -113,14 +104,16 @@ static void generateRay(uniform const float raster2camera[4][4],
 }


-static inline bool BBoxIntersect(const reference uniform float3 bounds[2], 
+static inline bool BBoxIntersect(const uniform float bounds[2][3], 
                                 const reference Ray ray) {
+    uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
+    uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
    float t0 = ray.mint, t1 = ray.maxt;

    // Check all three axis-aligned slabs.  Don't try to early out; it's
    // not worth the trouble
-    float3 tNear = (bounds[0] - ray.origin) * ray.invDir;
-    float3 tFar  = (bounds[1] - ray.origin) * ray.invDir;
+    float3 tNear = (bounds0 - ray.origin) * ray.invDir;
+    float3 tFar  = (bounds1 - ray.origin) * ray.invDir;
    if (tNear.x > tFar.x) {
        float tmp = tNear.x;
        tNear.x = tFar.x;
@@ -151,8 +144,11 @@ static inline bool BBoxIntersect(const reference uniform float3 bounds[2],


 static inline bool TriIntersect(const reference Triangle tri, reference Ray ray) {
-    uniform float3 e1 = tri.p[1] - tri.p[0];
-    uniform float3 e2 = tri.p[2] - tri.p[0];
+    uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
+    uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
+    uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
+    uniform float3 e1 = p1 - p0;
+    uniform float3 e2 = p2 - p0;

    float3 s1 = Cross(ray.dir, e2);
    float divisor = Dot(s1, e1);
@@ -163,7 +159,7 @@ static inline bool TriIntersect(const reference Triangle tri, reference Ray ray)
    float invDivisor = 1.f / divisor;

    // Compute first barycentric coordinate
-    float3 d = ray.origin - tri.p[0];
+    float3 d = ray.origin - p0;
    float b1 = Dot(d, s1) * invDivisor;
    if (b1 < 0. || b1 > 1.)
        hit = false;
@@ -199,7 +195,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
        // Check ray against BVH node
        LinearBVHNode node = nodes[nodeNum];
        if (any(BBoxIntersect(node.bounds, ray))) {
-            uniform unsigned int nPrimitives = nPrims(node);
+            uniform unsigned int nPrimitives = node.nPrimitives;
            if (nPrimitives > 0) {
                // Intersect ray with primitives in leaf BVH node
                uniform unsigned int primitivesOffset = node.offset;
@@ -213,7 +209,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
            }
            else {
                // Put far BVH node on _todo_ stack, advance to near node
-                if (r.dirIsNeg[axis(node)]) {
+                if (r.dirIsNeg[node.splitAxis]) {
                   todo[todoOffset++] = nodeNum + 1;
                   nodeNum = node.offset;
                }
@@ -236,20 +232,26 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
 }


-export void raytrace(uniform int width, uniform int height,
-                     const uniform float raster2camera[4][4], 
-                     const uniform float camera2world[4][4],
-                     uniform float image[], uniform int id[],
-                     const LinearBVHNode nodes[],
-                     const Triangle triangles[]) {
+static void raytrace_tile(uniform int x0, uniform int x1,
+                          uniform int y0, uniform int y1, 
+                          uniform int width, uniform int height,
+                          uniform int baseWidth, uniform int baseHeight,
+                          const uniform float raster2camera[4][4], 
+                          const uniform float camera2world[4][4],
+                          uniform float image[], uniform int id[],
+                          const LinearBVHNode nodes[],
+                          const Triangle triangles[]) {
+    uniform float widthScale = (float)(baseWidth) / (float)(width);
+    uniform float heightScale = (float)(baseHeight) / (float)(height);
+
    static const uniform float udx[16] = { 0, 1, 0, 1, 2, 3, 2, 3, 
                                           0, 1, 0, 1, 2, 3, 2, 3 };
    static const uniform float udy[16] = { 0, 0, 1, 1, 0, 0, 1, 1, 
                                           2, 2, 3, 3, 2, 2, 3, 3 };

    // The outer loops are always over blocks of 4x4 pixels
-    for (uniform int y = 0; y < height; y += 4) {
-        for (uniform int x = 0; x < width; x += 4) {
+    for (uniform int y = y0; y < y1; y += 4) {
+        for (uniform int x = x0; x < x1; x += 4) {
            // Now we have a block of 4x4=16 pixels to process; it will
            // take 16/programCount iterations of this loop to process
            // them.
@@ -261,7 +263,8 @@ export void raytrace(uniform int width, uniform int height,
                const float dy = udy[o * programCount + programIndex];

                Ray ray;
-                generateRay(raster2camera, camera2world, x+dx, y+dy, ray);
+                generateRay(raster2camera, camera2world, (x+dx)*widthScale,
+                            (y+dy)*heightScale, ray);
                BVHIntersect(nodes, triangles, ray);

                int offset = (y + (int)dy) * width + (x + (int)dx);
@@ -271,3 +274,54 @@ export void raytrace(uniform int width, uniform int height,
        }
    }
 }
+
+
+export void raytrace_ispc(uniform int width, uniform int height,
+                          uniform int baseWidth, uniform int baseHeight,
+                          const uniform float raster2camera[4][4], 
+                          const uniform float camera2world[4][4],
+                          uniform float image[], uniform int id[],
+                          const LinearBVHNode nodes[],
+                          const Triangle triangles[]) {
+    raytrace_tile(0, width, 0, height, width, height, baseWidth, baseHeight,
+                  raster2camera, camera2world, image,
+                  id, nodes, triangles); // one tile spanning the whole image, no tasks launched
+}
+
+
+task void raytrace_tile_task(uniform int width, uniform int height,
+                             uniform int baseWidth, uniform int baseHeight,
+                             const uniform float raster2camera[4][4], 
+                             const uniform float camera2world[4][4],
+                             uniform float image[], uniform int id[],
+                             const LinearBVHNode nodes[],
+                             const Triangle triangles[]) {
+    uniform int dx = 16, dy = 16; // tile size in pixels; must match dx, dy in raytrace_ispc_tasks below
+    uniform int xBuckets = (width + (dx-1)) / dx; // number of tiles per row, rounded up
+    uniform int x0 = (taskIndex % xBuckets) * dx; // taskIndex walks the tiles row by row
+    uniform int x1 = min(x0 + dx, width);         // clamp the last tile to the image edge
+    uniform int y0 = (taskIndex / xBuckets) * dy;
+    uniform int y1 = min(y0 + dy, height);
+                             
+    raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight, 
+                  raster2camera, camera2world, image,
+                  id, nodes, triangles);
+}
+
+
+export void raytrace_ispc_tasks(uniform int width, uniform int height,
+                                uniform int baseWidth, uniform int baseHeight,
+                                const uniform float raster2camera[4][4], 
+                                const uniform float camera2world[4][4],
+                                uniform float image[], uniform int id[],
+                                const LinearBVHNode nodes[],
+                                const Triangle triangles[]) {
+    uniform int dx = 16, dy = 16; // tile size in pixels; must match dx, dy in raytrace_tile_task
+    uniform int xBuckets = (width + (dx-1)) / dx;  // tiles per row, rounded up
+    uniform int yBuckets = (height + (dy-1)) / dy; // tiles per column, rounded up
+    uniform int nTasks = xBuckets * yBuckets;      // one task per tile
+    launch[nTasks] < raytrace_tile_task(width, height, baseWidth, baseHeight, 
+                                        raster2camera, camera2world, 
+                                        image, id, nodes, triangles) >;
+}
+
--- a/examples/rt/rt.vcxproj
+++ b/examples/rt/rt.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -64,15 +64,19 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -81,6 +85,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -96,6 +101,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -113,6 +119,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -131,6 +138,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -144,26 +152,27 @@
    <CustomBuild Include="rt.ispc">
      <FileType>Document</FileType>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
 </Command>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
 </Command>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="rt.cpp" />
    <ClCompile Include="rt_serial.cpp" />
+    <ClCompile Include="../tasksys.cpp" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
--- a/examples/rt/rt_serial.cpp
+++ b/examples/rt/rt_serial.cpp
@@ -39,6 +39,7 @@
 #endif

 #include <algorithm>
+#include <stdint.h>

 // Just enough of a float3 class to do what we need in this file.
 #ifdef _MSC_VER
@@ -74,31 +75,22 @@ struct Ray {
 // Declare these in a namespace so the mangling matches
 namespace ispc {
    struct Triangle {
-        float3 p[3];
-        int id;
+        float p[3][4]; // extra float pad after each vertex
+        int32_t id;
+        int32_t pad[3]; // make 16 x 32-bits
    };

    struct LinearBVHNode {
-        float3 bounds[2];
-        unsigned int offset;     // primitives for leaf, second child for interior
-        unsigned int primsAxis;  // 0:7 nPrimitives, 8:15 split axis, 16:31 padding
+        float bounds[2][3];  // two corners of the bounding box, x/y/z each
+        uint32_t offset;     // first primitive index for leaf, second child index for interior; unsigned to match rt.ispc
+        uint8_t nPrimitives; // > 0 marks a leaf node
+        uint8_t splitAxis;   // indexes the ray's dirIsNeg[] during traversal
+        uint16_t pad;        // padding; rounds the two uint8_t fields up to 32 bits
    };
 }

 using namespace ispc;

-inline int nPrims(const LinearBVHNode &node) {
-    return (node.primsAxis & 0xff);
-}
-
-inline int axis(const LinearBVHNode &node) {
-    return ((node.primsAxis >> 8) & 0xff);
-}
-
-inline bool isInterior(const LinearBVHNode &node) {
-    return nPrims(node) == 0;
-}
-
 inline float3 Cross(const float3 &v1, const float3 &v2) {
    float v1x = v1.x, v1y = v1.y, v1z = v1.z;
    float v2x = v2.x, v2y = v2.y, v2z = v2.z;
@@ -149,12 +141,14 @@ static void generateRay(const float raster2camera[4][4],
 }


-static inline bool BBoxIntersect(const float3 bounds[2], 
+static inline bool BBoxIntersect(const float bounds[2][3], 
                                 const Ray &ray) {
+    float3 bounds0(bounds[0][0], bounds[0][1], bounds[0][2]);
+    float3 bounds1(bounds[1][0], bounds[1][1], bounds[1][2]);
    float t0 = ray.mint, t1 = ray.maxt;

-    float3 tNear = (bounds[0] - ray.origin) * ray.invDir;
-    float3 tFar  = (bounds[1] - ray.origin) * ray.invDir;
+    float3 tNear = (bounds0 - ray.origin) * ray.invDir;
+    float3 tFar  = (bounds1 - ray.origin) * ray.invDir;
    if (tNear.x > tFar.x) {
        float tmp = tNear.x;
        tNear.x = tFar.x;
@@ -185,8 +179,11 @@ static inline bool BBoxIntersect(const float3 bounds[2],


 inline bool TriIntersect(const Triangle &tri, Ray &ray) {
-    float3 e1 = tri.p[1] - tri.p[0];
-    float3 e2 = tri.p[2] - tri.p[0];
+    float3 p0(tri.p[0][0], tri.p[0][1], tri.p[0][2]);
+    float3 p1(tri.p[1][0], tri.p[1][1], tri.p[1][2]);
+    float3 p2(tri.p[2][0], tri.p[2][1], tri.p[2][2]);
+    float3 e1 = p1 - p0;
+    float3 e2 = p2 - p0;

    float3 s1 = Cross(ray.dir, e2);
    float divisor = Dot(s1, e1);
@@ -196,7 +193,7 @@ inline bool TriIntersect(const Triangle &tri, Ray &ray) {
    float invDivisor = 1.f / divisor;

    // Compute first barycentric coordinate
-    float3 d = ray.origin - tri.p[0];
+    float3 d = ray.origin - p0;
    float b1 = Dot(d, s1) * invDivisor;
    if (b1 < 0. || b1 > 1.)
        return false;
@@ -230,7 +227,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
        // Check ray against BVH node
        const LinearBVHNode &node = nodes[nodeNum];
        if (BBoxIntersect(node.bounds, ray)) {
-            unsigned int nPrimitives = nPrims(node);
+            unsigned int nPrimitives = node.nPrimitives;
            if (nPrimitives > 0) {
                // Intersect ray with primitives in leaf BVH node
                unsigned int primitivesOffset = node.offset;
@@ -244,7 +241,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
            }
            else {
                // Put far BVH node on _todo_ stack, advance to near node
-                if (r.dirIsNeg[axis(node)]) {
+                if (r.dirIsNeg[node.splitAxis]) {
                   todo[todoOffset++] = nodeNum + 1;
                   nodeNum = node.offset;
                }
@@ -267,17 +264,21 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
 }


-void raytrace_serial(int width, int height,
+void raytrace_serial(int width, int height, int baseWidth, int baseHeight,
                     const float raster2camera[4][4], 
                     const float camera2world[4][4],
                     float image[],
                     int id[],
                     const LinearBVHNode nodes[],
                     const Triangle triangles[]) {
+    float widthScale = float(baseWidth) / float(width);
+    float heightScale = float(baseHeight) / float(height);
+
    for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x) {
                Ray ray;
-                generateRay(raster2camera, camera2world, x, y, ray);
+                generateRay(raster2camera, camera2world, x * widthScale,
+                            y * heightScale, ray);
                BVHIntersect(nodes, triangles, ray);

                int offset = y * width + x;
--- a/examples/simple/Makefile
+++ b/examples/simple/Makefile
@@ -2,7 +2,7 @@
 CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -O3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --arch=x86-64
+ISPCFLAGS=-O2 --arch=x86-64 --target=sse2

 default: simple

--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -32,6 +32,7 @@
 */

 #include <stdio.h>
+#include <stdlib.h>

 // Include the header file that the ispc compiler generates
 #include "simple_ispc.h"
--- a/examples/simple/simple.vcxproj
+++ b/examples/simple/simple.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -25,21 +25,21 @@
    <CustomBuild Include="simple.ispc">
      <FileType>Document</FileType>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2
 </Command>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-ispc -O2 %(Filename).ispco %(Filename).obj -h %(Filename)_ispc.h
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2
 </Command>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
    </CustomBuild>
  </ItemGroup>
  <PropertyGroup Label="Globals">
@@ -88,15 +88,19 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -105,6 +109,7 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -118,6 +123,7 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -133,6 +139,7 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -150,6 +157,7 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
--- a/examples/stencil/.gitignore
+++ b/examples/stencil/.gitignore
@@ -0,0 +1,2 @@
+stencil
+objs
--- a/examples/stencil/Makefile
+++ b/examples/stencil/Makefile
@@ -0,0 +1,39 @@
+
+ARCH = $(shell uname)
+
+TASK_CXX=../tasksys.cpp
+TASK_LIB=-lpthread
+TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
+
+CXX=g++
+CXXFLAGS=-Iobjs/ -O3 -Wall -m64
+ISPC=ispc
+ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx --arch=x86-64
+
+OBJS=objs/stencil.o objs/stencil_serial.o $(TASK_OBJ) objs/stencil_ispc.o \
+	objs/stencil_ispc_sse2.o objs/stencil_ispc_sse4.o \
+	objs/stencil_ispc_avx.o
+
+default: stencil
+
+.PHONY: dirs clean
+
+dirs:
+	/bin/mkdir -p objs/
+
+clean:
+	/bin/rm -rf objs *~ stencil
+
+stencil: dirs $(OBJS)
+	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
+
+objs/%.o: %.cpp
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+objs/%.o: ../%.cpp
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+objs/stencil.o: objs/stencil_ispc.h 
+
+objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
+	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/stencil/stencil.cpp
+++ b/examples/stencil/stencil.cpp
@@ -0,0 +1,151 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define NOMINMAX
+#pragma warning (disable: 4244)
+#pragma warning (disable: 4305)
+#endif
+
+#include <stdio.h>
+#include <algorithm>
+#include <math.h>
+#include "../timing.h"
+#include "stencil_ispc.h"
+using namespace ispc;
+
+
+// Serial reference implementation (defined in stencil_serial.cpp).  The
+// coef bound is 4 to match that definition and the array main() passes.
+extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
+                                int y0, int y1, int z0, int z1,
+                                int Nx, int Ny, int Nz, const float coef[4],
+                                const float vsq[], float Aeven[], float Aodd[]);
+
+
+void InitData(int Nx, int Ny, int Nz, float *A[2], float *vsq) {
+    // Fill both grids and vsq in x-fastest order; cell is the flattened index.
+    for (int cell = 0, z = 0; z < Nz; ++z)
+        for (int y = 0; y < Ny; ++y)
+            for (int x = 0; x < Nx; ++x, ++cell) {
+                A[0][cell] = (x >= Nx / 2) ? y / float(Ny) : x / float(Nx);
+                A[1][cell] = 0;
+                vsq[cell] = x*y*z / float(Nx * Ny * Nz);
+            }
+}
+
+
+int main() {
+    // Benchmark driver: times the single-core ispc, ispc+tasks and serial
+    // stencils on a 256^3 grid, then cross-checks the serial and ispc output.
+    int Nx = 256, Ny = 256, Nz = 256;
+    int width = 4;  // border left untouched; the kernels read up to 3 cells away
+    float *Aserial[2], *Aispc[2];
+    Aserial[0] = new float [Nx * Ny * Nz];
+    Aserial[1] = new float [Nx * Ny * Nz];
+    Aispc[0] = new float [Nx * Ny * Nz];
+    Aispc[1] = new float [Nx * Ny * Nz];
+    float *vsq = new float [Nx * Ny * Nz];
+
+    float coeff[4] = { 0.5, -.25, .125, -.0625 };
+
+    InitData(Nx, Ny, Nz, Aispc, vsq);
+
+    // Run the ispc implementation on one core; report the minimum time of
+    // three runs.
+    double minTimeISPC = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
+                          width, Nz - width, Nx, Ny, Nz, coeff, vsq,
+                          Aispc[0], Aispc[1]);
+        double dt = get_elapsed_mcycles();
+        minTimeISPC = std::min(minTimeISPC, dt);
+    }
+
+    printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC);
+
+    InitData(Nx, Ny, Nz, Aispc, vsq);
+
+    // Run the ispc implementation with tasks; again report the minimum time
+    // of three runs.
+    double minTimeISPCTasks = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
+                                width, Nz - width, Nx, Ny, Nz, coeff, vsq,
+                                Aispc[0], Aispc[1]);
+        double dt = get_elapsed_mcycles();
+        minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
+    }
+
+    printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks);
+
+    InitData(Nx, Ny, Nz, Aserial, vsq);
+
+    // And run the serial implementation 3 times, again reporting the
+    // minimum time.
+    double minTimeSerial = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width,
+                            width, Nz - width, Nx, Ny, Nz, coeff, vsq,
+                            Aserial[0], Aserial[1]);
+        double dt = get_elapsed_mcycles();
+        minTimeSerial = std::min(minTimeSerial, dt);
+    }
+
+    printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial);
+
+    printf("\t\t\t\t(%.2fx speedup from ISPC, %.2f from ISPC + tasks)\n", 
+           minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
+
+    // Check for agreement.  The relative error is tested in the form
+    // |serial - ispc| > 1e-4 * |serial| so that cells whose serial value is
+    // zero (e.g. the untouched border) never cause a division by zero.
+    int offset = 0;
+    for (int z = 0; z < Nz; ++z)
+        for (int y = 0; y < Ny; ++y)
+            for (int x = 0; x < Nx; ++x, ++offset) {
+                float serialVal = Aserial[1][offset], ispcVal = Aispc[1][offset];
+                if (fabsf(serialVal - ispcVal) > 1e-4f * fabsf(serialVal))
+                    printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n",
+                           x, y, z, ispcVal, serialVal);
+            }
+
+    // Release the grids allocated at the top of main().
+    delete[] Aserial[0]; delete[] Aserial[1];
+    delete[] Aispc[0]; delete[] Aispc[1]; delete[] vsq;
+    return 0;
+}
--- a/examples/stencil/stencil.ispc
+++ b/examples/stencil/stencil.ispc
@@ -0,0 +1,129 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+
+static void
+stencil_step(uniform int x0, uniform int x1,
+             uniform int y0, uniform int y1,
+             uniform int z0, uniform int z1,
+             uniform int Nx, uniform int Ny, uniform int Nz,
+             uniform const float coef[4], uniform const float vsq[],
+             uniform const float Ain[], uniform float Aout[]) {
+    const uniform int Nxy = Nx * Ny;  // elements in one z plane
+    // Each program instance handles one x position per iteration of the x loop.
+    for (uniform int z = z0; z < z1; ++z) {
+        for (uniform int y = y0; y < y1; ++y) {
+            // Assumes that (x1-x0) % programCount == 0
+            for (uniform int x = x0; x < x1; x += programCount) {
+                int index = (z * Nxy) + (y * Nx) + x + programIndex;
+#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
+#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
+                float div = coef[0] * A_cur(0, 0, 0) +
+                            coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
+                                       A_cur(0, +1, 0) + A_cur(0, -1, 0) +
+                                       A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
+                            coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
+                                       A_cur(0, +2, 0) + A_cur(0, -2, 0) +
+                                       A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
+                            coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
+                                       A_cur(0, +3, 0) + A_cur(0, -3, 0) +
+                                       A_cur(0, 0, +3) + A_cur(0, 0, -3));
+                // Aout is both read (its old value) and overwritten here.
+                A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) + 
+                    vsq[index] * div;
+            }
+        }
+    }
+}
+
+
+static task void  // launchable wrapper: forwards every argument to stencil_step()
+stencil_step_task(uniform int x0, uniform int x1,
+                  uniform int y0, uniform int y1,
+                  uniform int z0, uniform int z1,
+                  uniform int Nx, uniform int Ny, uniform int Nz,
+                  uniform const float coef[4], uniform const float vsq[],
+                  uniform const float Ain[], uniform float Aout[]) {
+    stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, Ain, Aout);
+}
+
+
+export void  // time-step loop that launches one task per dz-thick slice in z
+loop_stencil_ispc_tasks(uniform int t0, uniform int t1, 
+                        uniform int x0, uniform int x1,
+                        uniform int y0, uniform int y1,
+                        uniform int z0, uniform int z1,
+                        uniform int Nx, uniform int Ny, uniform int Nz,
+                        uniform const float coef[4], 
+                        uniform const float vsq[],
+                        uniform float Aeven[], uniform float Aodd[])
+{
+    for (uniform int t = t0; t < t1; ++t) {
+        // Parallelize across cores as well: each task will work on a slice
+        // of "dz" in the z extent of the volume.  (dz=1 seems to work
+        // better than any larger values.)
+        uniform int dz = 1;
+        for (uniform int z = z0; z < z1; z += dz) {
+            if ((t & 1) == 0)  // even step: read Aeven, update Aodd
+                launch < stencil_step_task(x0, x1, y0, y1, z, z+dz, Nx, Ny, Nz, 
+                                           coef, vsq, Aeven, Aodd) >;
+            else  // odd step: the two grids swap roles
+                launch < stencil_step_task(x0, x1, y0, y1, z, z+dz, Nx, Ny, Nz, 
+                                           coef, vsq, Aodd, Aeven) >;
+        }
+        // We need to wait for all of the launched tasks to finish before
+        // starting the next iteration.
+        sync;
+    }
+}
+
+
+export void  // time-step loop without tasks: calls stencil_step() directly
+loop_stencil_ispc(uniform int t0, uniform int t1, 
+                  uniform int x0, uniform int x1,
+                  uniform int y0, uniform int y1,
+                  uniform int z0, uniform int z1,
+                  uniform int Nx, uniform int Ny, uniform int Nz,
+                  uniform const float coef[4], 
+                  uniform const float vsq[],
+                  uniform float Aeven[], uniform float Aodd[])
+{
+    for (uniform int t = t0; t < t1; ++t) {
+        if ((t & 1) == 0)  // even step: read Aeven, update Aodd
+            stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, 
+                         Aeven, Aodd);
+        else  // odd step: the two grids swap roles
+            stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, 
+                         Aodd, Aeven);
+    }
+}
--- a/examples/stencil/stencil.vcxproj
+++ b/examples/stencil/stencil.vcxproj
@@ -0,0 +1,180 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{2ef070a1-f62f-4e6a-944b-88d140945c3c}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>rt</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CustomBuild Include="stencil.ispc">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="stencil.cpp" />
+    <ClCompile Include="stencil_serial.cpp" />
+    <ClCompile Include="../tasksys.cpp" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/examples/stencil/stencil_serial.cpp
+++ b/examples/stencil/stencil_serial.cpp
@@ -0,0 +1,86 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+
+static void
+stencil_step(int x0, int x1,
+             int y0, int y1,
+             int z0, int z1,
+             int Nx, int Ny, int Nz,
+             const float coef[4], const float vsq[],
+             const float Ain[], float Aout[]) {
+    // Serial stencil pass over the box [x0,x1) x [y0,y1) x [z0,z1).  Each cell
+    // combines Ain at distances 0..3 along every axis, weighted by coef[0..3],
+    // and then updates Aout in place from its old value, Ain and vsq.
+    const int dy = Nx, dz = Nx * Ny;   // element strides of one row / one plane
+
+    for (int k = z0; k < z1; ++k) {
+        for (int j = y0; j < y1; ++j) {
+            for (int i = x0; i < x1; ++i) {
+                const int cell = (k * dz) + (j * dy) + i;
+                const float *in = Ain + cell;
+                float div = coef[0] * in[0] +
+                            coef[1] * (in[+1] + in[-1] +
+                                       in[+dy] + in[-dy] +
+                                       in[+dz] + in[-dz]) +
+                            coef[2] * (in[+2] + in[-2] +
+                                       in[+2 * dy] + in[-2 * dy] +
+                                       in[+2 * dz] + in[-2 * dz]) +
+                            coef[3] * (in[+3] + in[-3] +
+                                       in[+3 * dy] + in[-3 * dy] +
+                                       in[+3 * dz] + in[-3 * dz]);
+                Aout[cell] = 2 * in[0] - Aout[cell] + vsq[cell] * div;
+            }
+        }
+    }
+}
+
+
+void loop_stencil_serial(int t0, int t1,
+                         int x0, int x1,
+                         int y0, int y1,
+                         int z0, int z1,
+                         int Nx, int Ny, int Nz,
+                         const float coef[4],
+                         const float vsq[],
+                         float Aeven[], float Aodd[])
+{
+    // Run time steps t0..t1-1, alternating between the two grids: even steps
+    // read Aeven and update Aodd, odd steps read Aodd and update Aeven.
+    for (int step = t0; step < t1; ++step) {
+        const bool evenStep = ((step & 1) == 0);
+        const float *src = evenStep ? Aeven : Aodd;
+        float *dst = evenStep ? Aodd : Aeven;
+        stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, src, dst);
+    }
+}
--- a/examples/tasksys.cpp
+++ b/examples/tasksys.cpp
@@ -0,0 +1,865 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/*
+  This file implements simple task systems that provide the three
+  entrypoints used by ispc-generated code to handle 'launch' and 'sync'
+  statements in ispc programs.  See the section "Task Parallelism: Language
+  Syntax" in the ispc documentation for information about using task
+  parallelism in ispc programs, and see the section "Task Parallelism:
+  Runtime Requirements" for information about the task-related entrypoints
+  that are implemented here.
+
+  There are three task systems in this file: one built using Microsoft's
+  Concurrency Runtime, one built with Apple's Grand Central Dispatch, and
+  one built on top of bare pthreads.
+*/
+
+#if defined(_WIN32) || defined(_WIN64)  // Windows: Concurrency Runtime task system
+  #define ISPC_IS_WINDOWS
+  #define ISPC_USE_CONCRT
+#elif defined(__linux__)  // Linux: pthreads task system
+  #define ISPC_IS_LINUX
+  #define ISPC_USE_PTHREADS
+#elif defined(__APPLE__)  // Apple: Grand Central Dispatch task system
+  #define ISPC_IS_APPLE
+  #define ISPC_USE_GCD
+#endif
+
+#define DBG(x) // expands to nothing, so any DBG(...) use is compiled out
+
+#ifdef ISPC_IS_WINDOWS
+  #define NOMINMAX
+  #include <windows.h>
+#endif // ISPC_IS_WINDOWS
+#ifdef ISPC_USE_CONCRT
+  #include <concrt.h>
+  using namespace Concurrency;
+#endif // ISPC_USE_CONCRT
+#ifdef ISPC_USE_GCD
+  #include <dispatch/dispatch.h>
+  #include <pthread.h>
+#endif // ISPC_USE_GCD
+#ifdef ISPC_USE_PTHREADS
+  #include <pthread.h>
+  #include <semaphore.h>
+  #include <unistd.h>
+  #include <fcntl.h>
+  #include <errno.h>
+  #include <sys/types.h>
+  #include <sys/stat.h>
+  #include <sys/param.h>
+  #include <sys/sysctl.h>
+  #include <vector>
+  #include <algorithm>
+#endif // ISPC_USE_PTHREADS
+#ifdef ISPC_IS_LINUX
+  #include <malloc.h>
+#endif // ISPC_IS_LINUX
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <algorithm>
+
+// Signature of ispc-generated 'task' functions
+typedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount,
+                             int taskIndex, int taskCount);
+
+// Small structure used to hold the data for each task
+struct TaskInfo {
+    TaskFuncType func;         // entry point of the task
+    void *data;                // presumably handed to func as its data argument -- confirm at the call site
+    int taskIndex, taskCount;  // presumably this task's index and the size of its launch -- confirm
+#if defined(ISPC_IS_WINDOWS)
+    event taskEvent;           // Windows-only member; how it is used lies outside this excerpt
+#endif
+};
+
+///////////////////////////////////////////////////////////////////////////
+// TaskGroupBase
+
+#define LOG_TASK_QUEUE_CHUNK_SIZE 14  // log2 of the TaskInfo entries per chunk
+#define MAX_TASK_QUEUE_CHUNKS 8  // length of TaskGroupBase::taskInfo[]
+#define TASK_QUEUE_CHUNK_SIZE (1<<LOG_TASK_QUEUE_CHUNK_SIZE)
+
+#define MAX_LAUNCHED_TASKS (MAX_TASK_QUEUE_CHUNKS * TASK_QUEUE_CHUNK_SIZE)
+
+#define NUM_MEM_BUFFERS 16  // length of TaskGroupBase::memBuffers[]
+
+class TaskGroup;
+
+/** The TaskGroupBase structure provides common functionality for "task
+    groups"; a task group is the set of tasks launched from within a single
+    ispc function.  When the function is ready to return, it waits for all
+    of the tasks in its task group to finish before it actually returns.
+ */
+class TaskGroupBase {
+public:
+    void Reset();  // rewinds the task index and memory cursors to zero
+
+    int AllocTaskInfo(int count);  // reserves count task indices, returns the first
+    TaskInfo *GetTaskInfo(int index);  // slot for index; allocates its chunk on demand
+
+    void *AllocMemory(int64_t size, int32_t alignment);  // aligned block from memBuffers[]
+
+protected:
+    TaskGroupBase();
+    ~TaskGroupBase();
+
+    int nextTaskInfoIndex;  // first task index not yet handed out by AllocTaskInfo()
+
+private:
+    /* We allocate blocks of TASK_QUEUE_CHUNK_SIZE TaskInfo structures as
+       needed by the calling function.  We hold up to MAX_TASK_QUEUE_CHUNKS
+       of these (and then exit at runtime if more than this many tasks are
+       launched.)
+     */
+    TaskInfo *taskInfo[MAX_TASK_QUEUE_CHUNKS];
+
+    /* We also allocate chunks of memory to service ISPCAlloc() calls.  The
+       memBuffers[] array holds pointers to this memory.  The first element
+       of this array is initialized to point to mem and then any subsequent
+       elements required are initialized with dynamic allocation.
+     */
+    int curMemBuffer, curMemBufferOffset;  // buffer in use and the next free byte in it
+    int memBufferSize[NUM_MEM_BUFFERS];  // size in bytes of each memBuffers[] entry
+    char *memBuffers[NUM_MEM_BUFFERS];
+    char mem[256];  // inline storage that backs memBuffers[0]
+};
+
+
+inline TaskGroupBase::TaskGroupBase() {
+    // Start with no task indices handed out and the memory cursor at the
+    // beginning of the inline "mem" array, which serves as buffer 0.
+    nextTaskInfoIndex = curMemBuffer = curMemBufferOffset = 0;
+    memBuffers[0] = mem;
+    memBufferSize[0] = sizeof(mem) / sizeof(mem[0]);
+
+    // Every other buffer and every TaskInfo chunk is allocated on demand.
+    for (int buf = 1; buf < NUM_MEM_BUFFERS; ++buf) {
+        memBufferSize[buf] = 0;
+        memBuffers[buf] = NULL;
+    }
+    for (int chunk = 0; chunk < MAX_TASK_QUEUE_CHUNKS; ++chunk)
+        taskInfo[chunk] = NULL;
+}
+
+
+inline TaskGroupBase::~TaskGroupBase() {
+    // memBuffers[0] points at the inline "mem" member, so start freeing at 1.
+    for (int i = 1; i < NUM_MEM_BUFFERS; ++i) delete[] memBuffers[i];
+    // Also release the TaskInfo chunks that GetTaskInfo() allocates on demand.
+    for (int i = 0; i < MAX_TASK_QUEUE_CHUNKS; ++i) delete[] taskInfo[i];
+}
+
+
+inline void
+TaskGroupBase::Reset() {
+    // Rewind the task index and memory cursors; nothing is freed here, so
+    // allocated TaskInfo chunks and memory buffers stay attached to the group.
+    nextTaskInfoIndex = curMemBuffer = curMemBufferOffset = 0;
+}
+
+
+inline int
+TaskGroupBase::AllocTaskInfo(int count) {
+    // Reserve `count` consecutive task indices; return the first of them.
+    nextTaskInfoIndex += count;
+    return nextTaskInfoIndex - count;
+}
+
+
+// Maps a task slot index to its TaskInfo structure.  Slots live in chunks
+// of TASK_QUEUE_CHUNK_SIZE entries that are allocated the first time one
+// of their slots is requested.  Exits the process if the index lies
+// outside the range the chunk table can address.
+inline TaskInfo *
+TaskGroupBase::GetTaskInfo(int index) {
+    int chunk = (index >> LOG_TASK_QUEUE_CHUNK_SIZE);
+    int offset = index & (TASK_QUEUE_CHUNK_SIZE-1);
+
+    // Reject every out-of-range chunk, not just the first one past the
+    // end: a caller that skips ahead (or passes a negative index) must not
+    // be allowed to index outside of taskInfo[].
+    if (chunk < 0 || chunk >= MAX_TASK_QUEUE_CHUNKS) {
+        fprintf(stderr, "A total of %d tasks have been launched from the "
+                "current function--the simple built-in task system can handle "
+                "no more. You can increase the values of TASK_QUEUE_CHUNK_SIZE "
+                "and LOG_TASK_QUEUE_CHUNK_SIZE to work around this limitation.  "
+                "Sorry!  Exiting.\n", index);
+        exit(1);
+    }
+
+    if (taskInfo[chunk] == NULL)
+        taskInfo[chunk] = new TaskInfo[TASK_QUEUE_CHUNK_SIZE];
+    return &taskInfo[chunk][offset];
+}
+
+
+// Bump allocator backing ISPCAlloc().  Tries to carve 'size' bytes, aligned
+// to 'alignment' (a power of two), out of the current buffer; if the
+// request doesn't fit, moves on to the next buffer slot and retries.
+// Memory handed out here is only reclaimed as a whole, by Reset().
+inline void *
+TaskGroupBase::AllocMemory(int64_t size, int32_t alignment) {
+    char *basePtr = memBuffers[curMemBuffer];
+    int64_t iptr = (int64_t)(basePtr + curMemBufferOffset);
+    iptr = (iptr + (alignment-1)) & ~(alignment-1);
+
+    // Keep the new offset in 64 bits until we know it fits, so that a
+    // very large request can't wrap around and appear to fit.
+    int64_t newOffset = iptr + size - (int64_t)basePtr;
+    if (newOffset < memBufferSize[curMemBuffer]) {
+        curMemBufferOffset = int(newOffset);
+        return (char *)iptr;
+    }
+
+    ++curMemBuffer;
+    curMemBufferOffset = 0;
+    assert(curMemBuffer < NUM_MEM_BUFFERS);
+
+    int allocSize = 1 << (12 + curMemBuffer);
+    allocSize = std::max(int(size+alignment), allocSize);
+
+    // After a Reset() this slot may still hold a buffer from an earlier
+    // use of the task group.  Reuse it if it is big enough; otherwise
+    // free it before allocating a replacement, so it isn't leaked.  (No
+    // allocation made since the last Reset() can live in this slot, since
+    // only lower-numbered slots have been used so far.)
+    if (memBuffers[curMemBuffer] == NULL ||
+        memBufferSize[curMemBuffer] < allocSize) {
+        delete[] memBuffers[curMemBuffer];
+        memBuffers[curMemBuffer] = NULL;
+        memBufferSize[curMemBuffer] = 0;
+        memBuffers[curMemBuffer] = new char[allocSize];
+        memBufferSize[curMemBuffer] = allocSize;
+    }
+    return AllocMemory(size, alignment);
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// Atomics and the like
+
+#ifndef ISPC_IS_WINDOWS
+// Full memory fence: executes the x86 "mfence" instruction.  The "memory"
+// clobber additionally tells the compiler not to move memory accesses
+// across this point.
+static inline void
+lMemFence() {
+    __asm__ __volatile__("mfence":::"memory");
+}
+#endif // !ISPC_IS_WINDOWS
+
+
+// Size of a pointer in bytes.  The 64-bit checks must come first: _WIN32
+// is defined for 64-bit Windows targets as well, so testing it before
+// _WIN64 would wrongly select 4 there.
+#if (__SIZEOF_POINTER__ == 8) || defined(__x86_64__) || defined(__amd64__) || defined(_WIN64)
+#define ISPC_POINTER_BYTES 8
+#elif (__SIZEOF_POINTER__ == 4) || defined(__i386__) || defined(_WIN32)
+#define ISPC_POINTER_BYTES 4
+#else
+#error "Pointer size unknown!"
+#endif // __SIZEOF_POINTER__
+
+
+// Atomic compare-and-swap on a pointer: if *v equals oldValue, stores
+// newValue into *v.  Always returns the value *v held before the
+// operation, so the swap happened iff the return value equals oldValue.
+static void *
+lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
+#ifdef ISPC_IS_WINDOWS
+    return InterlockedCompareExchangePointer(v, newValue, oldValue);
+#else
+    // Use the compiler builtin rather than hand-written assembly: the
+    // previous 32-bit variant used a mnemonic ("cmpxchgd") that the GNU
+    // assembler doesn't accept, and both variants declared *v as an
+    // output-only operand although the instruction also reads it.  The
+    // builtin picks the right instruction for the pointer size and acts
+    // as a full memory barrier, so no separate fence is needed.
+    return __sync_val_compare_and_swap(v, oldValue, newValue);
+#endif // ISPC_IS_WINDOWS
+}
+
+
+
+#ifndef ISPC_IS_WINDOWS
+// Atomic compare-and-swap on a 32-bit integer: if *v equals oldValue,
+// stores newValue into *v.  Returns the value *v held before the
+// operation, so the swap happened iff the return value equals oldValue.
+static int32_t 
+lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) {
+    // The compiler builtin replaces inline assembly that declared *v as
+    // an output-only ("=m") operand even though cmpxchg reads it too.  The
+    // builtin is a full memory barrier, so no separate fence is needed.
+    return __sync_val_compare_and_swap(v, oldValue, newValue);
+}
+#endif // !ISPC_IS_WINDOWS
+
+
+///////////////////////////////////////////////////////////////////////////
+
+#ifdef ISPC_USE_CONCRT
+// The ConcRT flavor of TaskGroup adds no state of its own to
+// TaskGroupBase; it only supplies the two scheduling entry points.
+class TaskGroup : public TaskGroupBase {
+public:
+    // Blocks until the tasks launched from this group have finished.
+    void Sync();
+    // Schedules the 'count' task slots starting at 'baseIndex'.
+    void Launch(int baseIndex, int count);
+};
+#endif // ISPC_USE_CONCRT
+
+#ifdef ISPC_USE_GCD
+/* With Grand Central Dispatch, we associate a GCD dispatch group with each
+   task group.  (We'll later wait on this dispatch group when we need to
+   wait on all of the tasks in the group to finish.)
+ */
+class TaskGroup : public TaskGroupBase {
+public:
+    TaskGroup() {
+        gcdGroup = dispatch_group_create();
+    }
+
+    // dispatch_group_create() returns an owned reference; give it back
+    // when the task group goes away so the dispatch group isn't leaked.
+    ~TaskGroup() {
+        if (gcdGroup != NULL)
+            dispatch_release(gcdGroup);
+    }
+
+    // Submits the 'count' task slots starting at 'baseIndex' to GCD.
+    void Launch(int baseIndex, int count);
+    // Blocks until every task submitted through this group has finished.
+    void Sync();
+
+private:
+    dispatch_group_t gcdGroup;
+};
+#endif // ISPC_USE_GCD
+
+#ifdef ISPC_USE_PTHREADS
+static void *lTaskEntry(void *arg);
+
+// pthreads flavor of TaskGroup.  On top of TaskGroupBase it tracks how
+// many launched tasks haven't finished yet, which task slots are still
+// waiting to be picked up, and whether the group currently sits in the
+// global list of active task groups.
+class TaskGroup : public TaskGroupBase {
+public:
+    TaskGroup() : numUnfinishedTasks(0), inActiveList(false) {
+        waitingTasks.reserve(128);
+    }
+
+    // Rewinds the base-class state and clears the unfinished-task counter;
+    // the group must no longer be in the active list at this point.
+    void Reset() {
+        TaskGroupBase::Reset();
+        numUnfinishedTasks = 0;
+        assert(inActiveList == false);
+        lMemFence();
+    }
+
+    void Launch(int baseIndex, int count);
+    void Sync();
+
+private:
+    // The worker thread entry point manipulates the members below directly.
+    friend void *lTaskEntry(void *arg);
+
+    int32_t numUnfinishedTasks;
+    int32_t pad[3];
+    std::vector<int> waitingTasks;
+    bool inActiveList;
+};
+
+#endif // ISPC_USE_PTHREADS
+
+
+///////////////////////////////////////////////////////////////////////////
+// Grand Central Dispatch
+
+#ifdef ISPC_USE_GCD
+
+/* A simple task system for ispc programs based on Apple's Grand Central
+   Dispatch. */
+
+// The dispatch queue that all tasks are submitted to, and the spin-lock
+// word that serializes its one-time initialization.
+static dispatch_queue_t gcdQueue;
+static volatile int32_t lock = 0;
+
+// Makes sure gcdQueue has been set up.  Safe to call repeatedly; only the
+// first caller to get hold of the lock actually fetches the queue.
+static void
+InitTaskSystem() {
+    if (gcdQueue == NULL) {
+        // Spin until we have flipped the lock word from 0 to 1.
+        while (lAtomicCompareAndSwap32(&lock, 1, 0) != 0)
+            ;
+
+        // Someone else may have finished the initialization while we were
+        // waiting for the lock, so check again.
+        if (gcdQueue == NULL) {
+            gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
+            assert(gcdQueue != NULL);
+            lMemFence();
+        }
+        lock = 0;
+    }
+}
+
+
+// GCD work function: unpacks the TaskInfo passed as context and invokes
+// the task function it describes.
+static void
+lRunTask(void *ti) {
+    // FIXME: these are bogus values; may cause bugs in code that depends
+    // on them having unique values in different threads.
+    const int threadIndex = 0;
+    const int threadCount = 1;
+
+    TaskInfo *info = static_cast<TaskInfo *>(ti);
+    info->func(info->data, threadIndex, threadCount, 
+               info->taskIndex, info->taskCount);
+}
+
+
+// Submits the task slots [baseIndex, baseIndex+count) to the global queue
+// as members of this group's dispatch group.
+inline void
+TaskGroup::Launch(int baseIndex, int count) {
+    const int endIndex = baseIndex + count;
+    for (int slot = baseIndex; slot < endIndex; ++slot)
+        dispatch_group_async_f(gcdGroup, gcdQueue, GetTaskInfo(slot), lRunTask);
+}
+
+
+// Waits, without a timeout, for everything submitted through this group's
+// dispatch group to finish.
+inline void
+TaskGroup::Sync() {
+    (void)dispatch_group_wait(gcdGroup, DISPATCH_TIME_FOREVER);
+}
+
+#endif // ISPC_USE_GCD
+
+///////////////////////////////////////////////////////////////////////////
+// Concurrency Runtime
+
+#ifdef ISPC_USE_CONCRT
+
+// ConcRT needs no set-up on our side; this exists so that the common
+// code can call InitTaskSystem() for every task system.
+static void
+InitTaskSystem() {
+}
+
+
+// ConcRT task body: runs the task described by the TaskInfo passed in
+// 'param', then sets the task's event so that Sync() can see it is done.
+static void __cdecl
+lRunTask(LPVOID param) {
+    TaskInfo *info = static_cast<TaskInfo *>(param);
+
+    // FIXME: like the GCD implementation for OS X, this is passing bogus
+    // values for the threadIndex and threadCount builtins, which in turn
+    // will cause bugs in code that uses those.
+    const int threadIndex = 0;
+    const int threadCount = 1;
+    info->func(info->data, threadIndex, threadCount, info->taskIndex,
+               info->taskCount);
+
+    // Tell whoever is waiting in Sync() that this task has finished.
+    info->taskEvent.set();
+}
+
+
+// Hands the task slots [baseIndex, baseIndex+count) to the current ConcRT
+// scheduler, one scheduled task per slot.
+inline void
+TaskGroup::Launch(int baseIndex, int count) {
+    const int endIndex = baseIndex + count;
+    for (int slot = baseIndex; slot < endIndex; ++slot) {
+        TaskInfo *info = GetTaskInfo(slot);
+        CurrentScheduler::ScheduleTask(lRunTask, info);
+    }
+}
+
+
+// Waits on the completion event of every task slot handed out so far and
+// re-arms each event afterwards so the slot can be used again.
+inline void
+TaskGroup::Sync() {
+    int slot = 0;
+    while (slot < nextTaskInfoIndex) {
+        TaskInfo &info = *GetTaskInfo(slot);
+        info.taskEvent.wait();
+        info.taskEvent.reset();
+        ++slot;
+    }
+}
+
+#endif // ISPC_USE_CONCRT
+
+///////////////////////////////////////////////////////////////////////////
+// pthreads
+
+#ifdef ISPC_USE_PTHREADS
+
+// Spin-lock word used by InitTaskSystem() to serialize the one-time
+// initialization of the state below.
+static volatile int32_t lock = 0;
+
+// Number of worker threads created by InitTaskSystem(), and their handles.
+static int nThreads;
+static pthread_t *threads = NULL;
+
+// taskSysMutex is held whenever activeTaskGroups, or the waitingTasks list
+// of one of the task groups, is read or modified.
+static pthread_mutex_t taskSysMutex;
+// Task groups that still have tasks waiting to be picked up.
+static std::vector<TaskGroup *> activeTaskGroups;
+// Posted once per launched task; the worker threads sleep on it.
+static sem_t *workerSemaphore;
+
+
+// Atomically adds 'delta' to *v and returns the value *v held before the
+// addition.
+static inline int32_t 
+lAtomicAdd(int32_t *v, int32_t delta) {
+    // The compiler builtin replaces inline assembly that declared *v as
+    // an output-only ("=m") operand even though xadd reads it as well.
+    return __sync_fetch_and_add(v, delta);
+}
+
+
+// Entry point of the worker threads.  'arg' carries the thread's index.
+// Each worker loops forever: it sleeps on the worker semaphore, takes one
+// waiting task from the most recently activated task group, runs it, and
+// then decrements that group's count of unfinished tasks.
+static void *
+lTaskEntry(void *arg) {
+    int threadIndex = (int)((int64_t)arg);
+    int threadCount = nThreads;
+
+    while (1) {
+        int err;
+        //
+        // Wait on the semaphore until we're woken up due to the arrival of
+        // more work.
+        //
+        // Note: sem_wait() reports failure by returning -1 and setting
+        // errno (unlike the pthread_* calls below, which return the error
+        // number), so use perror() to report it.
+        //
+        if (sem_wait(workerSemaphore) != 0) {
+            perror("Error from sem_wait");
+            exit(1);
+        }
+
+        //
+        // Acquire the mutex
+        //
+        if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) {
+            fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
+            exit(1);
+        }
+
+        if (activeTaskGroups.size() == 0) {
+            //
+            // Task queue is empty, go back and wait on the semaphore.
+            // (This happens when a thread in TaskGroup::Sync() has already
+            // taken the task that the semaphore post was for.)
+            //
+            if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
+                fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
+                exit(1);
+            }
+            continue;
+        }
+
+        //
+        // Get the last task group on the active list and the last task
+        // from its waiting tasks list.
+        //
+        TaskGroup *tg = activeTaskGroups.back();
+        assert(tg->waitingTasks.size() > 0);
+        int taskNumber = tg->waitingTasks.back();
+        tg->waitingTasks.pop_back();
+
+        if (tg->waitingTasks.size() == 0) {
+            // We just took the last task from this task group, so remove
+            // it from the active list.
+            activeTaskGroups.pop_back();
+            tg->inActiveList = false;
+        }
+    
+        if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
+            fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
+            exit(1);
+        }
+
+        //
+        // And now actually run the task
+        //
+        DBG(fprintf(stderr, "running task %d from group %p\n", taskNumber, tg));
+        TaskInfo *myTask = tg->GetTaskInfo(taskNumber);
+        myTask->func(myTask->data, threadIndex, threadCount, myTask->taskIndex,
+                     myTask->taskCount);
+
+        //
+        // Decrement the "number of unfinished tasks" counter in the task
+        // group.
+        //
+        lMemFence();
+        lAtomicAdd(&tg->numUnfinishedTasks, -1);
+    }
+
+    pthread_exit(NULL);
+    return 0;
+}
+
+
+// One-time initialization of the pthreads task system: creates the global
+// mutex and the worker semaphore and starts the worker threads.  May be
+// called any number of times, from any thread; a non-NULL 'threads'
+// pointer marks the initialization as complete.
+static void
+InitTaskSystem() {
+    if (threads != NULL)
+        return;
+
+    while (1) {
+        if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {
+            if (threads == NULL) {
+                // We launch one fewer thread than there are cores,
+                // since the main thread here will also grab jobs from
+                // the task queue itself.  sysconf() may fail (-1) or
+                // report a single core, so never go below zero workers.
+                long nCores = sysconf(_SC_NPROCESSORS_ONLN);
+                nThreads = (nCores > 1) ? int(nCores - 1) : 0;
+
+                int err;
+                if ((err = pthread_mutex_init(&taskSysMutex, NULL)) != 0) {
+                    fprintf(stderr, "Error creating mutex: %s\n", strerror(err));
+                    exit(1);
+                }
+
+                char name[32];
+                snprintf(name, sizeof(name), "ispc_task.%d", (int)getpid());
+                workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0);
+                // sem_open() signals failure with SEM_FAILED (which is
+                // not a null pointer on every platform) and sets errno.
+                if (workerSemaphore == SEM_FAILED) {
+                    perror("Error creating semaphore");
+                    exit(1);
+                }
+                // We hold the semaphore open for the life of the process;
+                // remove its name right away so that it doesn't linger in
+                // the system after the process has exited.
+                sem_unlink(name);
+
+                // Allocate at least one element so that the pointer is
+                // non-NULL even when there are no worker threads;
+                // otherwise we'd redo the initialization on every call.
+                size_t nAlloc = (nThreads > 0) ? (size_t)nThreads : 1;
+                pthread_t *newThreads =
+                    (pthread_t *)malloc(nAlloc * sizeof(pthread_t));
+                if (newThreads == NULL) {
+                    fprintf(stderr, "Error allocating memory for thread handles\n");
+                    exit(1);
+                }
+                for (int i = 0; i < nThreads; ++i) {
+                    err = pthread_create(&newThreads[i], NULL, &lTaskEntry,
+                                         (void *)(intptr_t)i);
+                    if (err != 0) {
+                        fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err));
+                        exit(1);
+                    }
+                }
+
+                activeTaskGroups.reserve(64);
+
+                // Publish 'threads' only now that everything else is set
+                // up: other threads use it (without taking the lock) to
+                // decide whether the task system is ready for use.
+                lMemFence();
+                threads = newThreads;
+            }
+
+            // Make sure all of the above goes to memory before we
+            // clear the lock.
+            lMemFence();
+            lock = 0;
+            break;
+        }
+    }
+}
+
+
+// Makes the task slots [baseIndex, baseIndex+count) available for
+// execution: queues them on this group's waiting list, puts the group on
+// the global active list if necessary, bumps the unfinished-task counter
+// and wakes up the worker threads.
+inline void
+TaskGroup::Launch(int baseIndex, int count) {
+    //
+    // Acquire mutex, add task
+    //
+    int err;
+    if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) {
+        fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
+        exit(1);
+    }
+
+    // Add the corresponding set of tasks to the waiting-to-be-run list for
+    // this task group.
+    //
+    // FIXME: it's a little ugly to hold a global mutex for this when we
+    // only need to make sure no one else is accessing this task group's
+    // waitingTasks list.  (But a small experiment in switching to a
+    // per-TaskGroup mutex showed worse performance!)
+    for (int i = 0; i < count; ++i)
+        waitingTasks.push_back(baseIndex + i);
+
+    // Add the task group to the global active list if it isn't there
+    // already.
+    if (inActiveList == false) {
+        activeTaskGroups.push_back(this);
+        inActiveList = true;
+    }
+
+    if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
+        fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
+        exit(1);
+    }
+
+    //
+    // Update the count of the number of tasks left to run in this task
+    // group.
+    //
+    lMemFence();
+    lAtomicAdd(&numUnfinishedTasks, count);
+
+    //
+    // Post to the worker semaphore to wake up worker threads that are
+    // sleeping waiting for tasks to show up
+    //
+    // Note: sem_post() reports failure by returning -1 and setting errno
+    // (unlike the pthread_* calls above), so use perror() to report it.
+    //
+    for (int i = 0; i < count; ++i)
+        if (sem_post(workerSemaphore) != 0) {
+            perror("Error from sem_post");
+            exit(1);
+        }
+}
+
+
+// Returns once all tasks launched from this group have finished.  While
+// waiting, the calling thread makes itself useful by running waiting
+// tasks itself--preferably from this group, otherwise from whichever
+// group was activated most recently.
+inline void
+TaskGroup::Sync() {
+    DBG(fprintf(stderr, "syncing %p - %d unfinished\n", (void *)this, 
+                numUnfinishedTasks));
+
+    while (numUnfinishedTasks > 0) {
+        // All of the tasks in this group aren't finished yet.  We'll try
+        // to help out here since we don't have anything else to do...
+
+        DBG(fprintf(stderr, "while syncing %p - %d unfinished\n", (void *)this, 
+                    numUnfinishedTasks));
+
+        //
+        // Acquire the global task system mutex to grab a task to work on
+        //
+        int err;
+        if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) {
+            fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
+            exit(1);
+        }
+
+        TaskInfo *myTask = NULL;
+        TaskGroup *runtg = this;
+        if (waitingTasks.size() > 0) {
+            int taskNumber = waitingTasks.back();
+            waitingTasks.pop_back();
+
+            if (waitingTasks.size() == 0) {
+                // There's nothing left to start running from this group,
+                // so remove it from the active task list.
+                activeTaskGroups.erase(std::find(activeTaskGroups.begin(),
+                                                 activeTaskGroups.end(), this));
+                inActiveList = false;
+            }
+            myTask = GetTaskInfo(taskNumber);
+            DBG(fprintf(stderr, "running task %d from group %p in sync\n", 
+                        taskNumber, (void *)this));
+        }
+        else {
+            // Other threads are already working on all of the tasks in
+            // this group, so we can't help out by running one ourself.
+            // We'll try to run one from another group to make ourselves
+            // useful here.
+            if (activeTaskGroups.size() == 0) {
+                // No active task groups left--there's nothing for us to do.
+                if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
+                    fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
+                    exit(1);
+                }
+                // FIXME: We basically end up busy-waiting here, which is
+                // extra wasteful in a world with hyperthreading.  It would
+                // be much better to put this thread to sleep on a
+                // condition variable that was signaled when the last task
+                // in this group was finished.
+                sleep(0);
+                continue;
+            }
+
+            // Get a task to run from another task group.
+            runtg = activeTaskGroups.back();
+            assert(runtg->waitingTasks.size() > 0);
+
+            int taskNumber = runtg->waitingTasks.back();
+            runtg->waitingTasks.pop_back();
+            if (runtg->waitingTasks.size() == 0) {
+                // There's nothing left to start running from this group,
+                // so remove it from the active task list.
+                activeTaskGroups.pop_back();
+                runtg->inActiveList = false;
+            }
+            myTask = runtg->GetTaskInfo(taskNumber);
+            DBG(fprintf(stderr, "running task %d from other group %p in sync\n", 
+                        taskNumber, (void *)runtg));
+        }
+
+        if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
+            fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
+            exit(1);
+        }
+    
+        //
+        // Do work for _myTask_
+        //
+        // FIXME: bogus values for thread index/thread count here as well..
+        myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount);
+
+        //
+        // Decrement the number of unfinished tasks counter of the group
+        // that the task came from (which isn't necessarily this one).
+        //
+        lMemFence();
+        lAtomicAdd(&runtg->numUnfinishedTasks, -1);
+    }
+    DBG(fprintf(stderr, "sync for %p done!\n", (void *)this));
+}
+
+#endif // ISPC_USE_PTHREADS
+
+///////////////////////////////////////////////////////////////////////////
+
+// Small pool of task groups that are currently not in use, so that they
+// (and the buffers they own) can be recycled instead of being allocated
+// anew for each function that launches tasks.  Slots are claimed and
+// filled with atomic compare-and-swap operations; a NULL slot is empty.
+#define MAX_FREE_TASK_GROUPS 64
+static TaskGroup *freeTaskGroups[MAX_FREE_TASK_GROUPS];
+
+// Returns a task group that is ready for use: one taken from the pool if
+// possible, a newly allocated one otherwise.
+static inline TaskGroup *
+AllocTaskGroup() {
+    for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) {
+        TaskGroup *tg = freeTaskGroups[i];
+        if (tg != NULL) {
+            void *ptr = lAtomicCompareAndSwapPointer((void **)(&freeTaskGroups[i]), NULL, tg);
+            // The compare-and-swap returns the slot's previous value, so
+            // we only own 'tg' if that value is 'tg' itself.  Any other
+            // result--NULL or a different task group that was put into
+            // the slot in the meantime--means that the swap did not
+            // happen and the slot's contents aren't ours to take.
+            if (ptr == tg)
+                return tg;
+        }
+    }
+
+    return new TaskGroup;
+}
+
+
+// Gives a task group back once its tasks are done.  The group is reset
+// and parked in the first pool slot that can be claimed; if every slot is
+// occupied, it is deleted instead.
+static inline void
+FreeTaskGroup(TaskGroup *tg) {
+    tg->Reset();
+
+    int slot = 0;
+    while (slot < MAX_FREE_TASK_GROUPS) {
+        // Only attempt the swap on slots that look empty; the swap itself
+        // succeeded iff the previous value it returns is NULL.
+        if (freeTaskGroups[slot] == NULL &&
+            lAtomicCompareAndSwapPointer((void **)&freeTaskGroups[slot], tg, NULL) == NULL)
+            return;
+        ++slot;
+    }
+
+    delete tg;
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+// Entry points called from ispc-generated code.  They are declared with C
+// linkage since ispc expects their names not to be mangled.
+extern "C" void ISPCLaunch(void **handlePtr, void *f, void *data, int count);
+extern "C" void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
+extern "C" void ISPCSync(void *handle);
+
+// Launches 'count' instances of the task function 'func', all receiving
+// the same 'data' pointer.  *taskGroupPtr identifies the task group of
+// the calling function; if it is still NULL, a task group is set up here
+// and stored through the pointer.
+void
+ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count) {
+    TaskGroup *taskGroup = (TaskGroup *)(*taskGroupPtr);
+    if (taskGroup == NULL) {
+        InitTaskSystem();
+        taskGroup = AllocTaskGroup();
+        *taskGroupPtr = taskGroup;
+    }
+
+    // Fill in one TaskInfo per task instance...
+    const int baseIndex = taskGroup->AllocTaskInfo(count);
+    for (int taskIndex = 0; taskIndex < count; ++taskIndex) {
+        TaskInfo &info = *taskGroup->GetTaskInfo(baseIndex + taskIndex);
+        info.func = (TaskFuncType)func;
+        info.data = data;
+        info.taskIndex = taskIndex;
+        info.taskCount = count;
+    }
+
+    // ...and pass them on to the task system for execution.
+    taskGroup->Launch(baseIndex, count);
+}
+
+
+// Waits for all of the tasks of the given task group to finish and then
+// gives the group back for reuse.  A NULL handle is ignored.
+void
+ISPCSync(void *h) {
+    if (h == NULL)
+        return;
+
+    TaskGroup *taskGroup = (TaskGroup *)h;
+    taskGroup->Sync();
+    FreeTaskGroup(taskGroup);
+}
+
+
+// Allocates 'size' bytes with the given alignment from the task group
+// identified by *taskGroupPtr.  If *taskGroupPtr is still NULL, a task
+// group is set up here and stored through the pointer.
+void *
+ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment) {
+    TaskGroup *taskGroup = (TaskGroup *)(*taskGroupPtr);
+    if (taskGroup == NULL) {
+        InitTaskSystem();
+        taskGroup = AllocTaskGroup();
+        *taskGroupPtr = taskGroup;
+    }
+
+    void *memory = taskGroup->AllocMemory(size, alignment);
+    return memory;
+}
--- a/examples/timing.h
+++ b/examples/timing.h
@@ -38,7 +38,9 @@
 #include <windows.h>
 #define rdtsc __rdtsc
 #else
+#ifdef __cplusplus
 extern "C" {
+#endif /* __cplusplus */
    __inline__ uint64_t rdtsc() {
        uint32_t low, high;
        __asm__ __volatile__ (
@@ -48,7 +50,9 @@ extern "C" {
                              "rdtsc" : "=a" (low), "=d" (high));
        return (uint64_t)high << 32 | low;
    }
+#ifdef __cplusplus
 }
+#endif /* __cplusplus */
 #endif            
            
 static uint64_t start, end;
--- a/examples/volume_rendering/.gitignore
+++ b/examples/volume_rendering/.gitignore
@@ -0,0 +1,2 @@
+volume
+*.ppm
--- a/examples/volume_rendering/Makefile
+++ b/examples/volume_rendering/Makefile
@@ -0,0 +1,38 @@
+
+# Name of the host operating system as reported by uname.  (Not referenced
+# by any of the rules below.)
+ARCH = $(shell uname)
+
+# The task system implementation shared by the examples, the library it
+# needs at link time, and the object file it is compiled to under objs/.
+TASK_CXX=../tasksys.cpp
+TASK_LIB=-lpthread
+TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
+
+# C++ and ispc compilers and their flags.  -Iobjs/ lets the C++ code find
+# the header that ispc generates into objs/.
+CXX=g++
+CXXFLAGS=-Iobjs/ -O3 -Wall -m64
+ISPC=ispc
+ISPCFLAGS=-O2 --target=sse2,sse4-x2 --arch=x86-64
+
+# All object files that are linked into the "volume" executable.
+OBJS=objs/volume.o objs/volume_serial.o $(TASK_OBJ) objs/volume_ispc.o \
+	objs/volume_ispc_sse2.o objs/volume_ispc_sse4.o
+
+default: volume
+
+.PHONY: dirs clean
+
+# Create the directory that holds all generated files.
+dirs:
+	/bin/mkdir -p objs/
+
+clean:
+	/bin/rm -rf objs *~ volume
+
+# Link the executable.  NOTE(review): "dirs" is a phony prerequisite, so
+# this rule presumably re-links on every invocation of make.
+volume: dirs $(OBJS)
+	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
+
+# Compile C++ sources from this directory...
+objs/%.o: %.cpp
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+# ...and from the parent directory (e.g. the task system).
+objs/%.o: ../%.cpp
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+# volume.cpp includes the ispc-generated header, so that has to be
+# generated first.
+objs/volume.o: objs/volume_ispc.h 
+
+# A single ispc invocation is listed as producing the header, the main
+# object file and the per-target (_sse2/_sse4) object files; presumably
+# ispc derives the latter names itself when several targets are given.
+objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o: %.ispc
+	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/volume_rendering/camera.dat
+++ b/examples/volume_rendering/camera.dat
@@ -0,0 +1,11 @@
+896 1184
+
+0.000155 0.000000 0.000000 -0.069927
+0.000000 -0.000155 0.000000 0.093236
+0.000000 0.000000 0.000000 1.000000
+0.000000 0.000000 -99.999001 100.000000
+
+1.000000 0.000000 0.000000 1.000000
+0.000000 0.980129 -0.198360 2.900000
+0.000000 0.198360 0.980129 -10.500000
+0.000000 0.000000 0.000000 1.000000
--- a/examples/volume_rendering/density_highres.vol
+++ b/examples/volume_rendering/density_highres.vol
--- a/examples/volume_rendering/density_lowres.vol
+++ b/examples/volume_rendering/density_lowres.vol
--- a/examples/volume_rendering/volume.cpp
+++ b/examples/volume_rendering/volume.cpp
@@ -0,0 +1,214 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define NOMINMAX
+#pragma warning (disable: 4244)
+#pragma warning (disable: 4305)
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <algorithm>
+#include "../timing.h"
+#include "volume_ispc.h"
+using namespace ispc;
+
+/* Serial reference implementation of the volume renderer (defined in a
+   separate source file).  It takes the same arguments as the ispc entry
+   points called from main(): the density grid and its resolution, the two
+   4x4 camera matrices, the image resolution and the output image buffer. */
+extern void volume_serial(float density[], int nVoxels[3], 
+                          const float raster2camera[4][4],
+                          const float camera2world[4][4], 
+                          int width, int height, float image[]);
+
+/* Write a PPM image file with the image.  'buf' holds width*height
+   grayscale values; each one is scaled by 255, clamped to [0,255] and
+   written out three times (as the red, green and blue components of the
+   pixel).  If the file can't be opened, an error is reported and nothing
+   is written. */
+static void
+writePPM(float *buf, int width, int height, const char *fn) {
+    FILE *fp = fopen(fn, "wb");
+    if (!fp) {
+        // Don't pass a NULL FILE pointer on to the stdio calls below.
+        perror(fn);
+        return;
+    }
+    fprintf(fp, "P6\n");
+    fprintf(fp, "%d %d\n", width, height);
+    fprintf(fp, "255\n");
+    for (int i = 0; i < width*height; ++i) {
+        float v = buf[i] * 255.f;
+        if (v < 0.f) v = 0.f;
+        else if (v > 255.f) v = 255.f;
+        unsigned char c = (unsigned char)v;
+        for (int j = 0; j < 3; ++j)
+            fputc(c, fp);
+    }
+    fclose(fp);
+    printf("Wrote image file %s\n", fn);
+}
+
+
+/* Read the image resolution and the viewing transformations from a camera
+   data file: two integers (width and height), followed by the sixteen
+   entries of raster2camera and then the sixteen entries of camera2world,
+   each matrix given row by row.  Exits if the file can't be opened or
+   ends early.
+   FIXME: we should add support to be able to specify viewing parameters
+   in the program here directly. */
+static void
+loadCamera(const char *fn, int *width, int *height, float raster2camera[4][4],
+           float camera2world[4][4]) {
+    FILE *f = fopen(fn, "r");
+    if (f == NULL) {
+        perror(fn);
+        exit(1);
+    }
+
+    const int nRead = fscanf(f, "%d %d", width, height);
+    if (nRead != 2) {
+        fprintf(stderr, "Unexpected end of file in camera file\n");
+        exit(1);
+    }
+
+    // The two matrices are stored back to back, in this order.
+    float (*matrices[2])[4] = { raster2camera, camera2world };
+    for (int m = 0; m < 2; ++m) {
+        for (int entry = 0; entry < 16; ++entry) {
+            float *dst = &matrices[m][entry / 4][entry % 4];
+            if (fscanf(f, "%f", dst) != 1) {
+                fprintf(stderr, "Unexpected end of file in camera file\n");
+                exit(1);
+            }
+        }
+    }
+    fclose(f);
+}
+
+
+/* Load a volume density file.  Expects the number of x, y, and z samples
+   as the first three values (as integer strings), then x*y*z
+   floating-point values (also as strings) to give the densities.  The
+   resolution is returned in n[]; the return value is a new[]-allocated
+   array with the densities, which is owned by the caller.  Exits if the
+   file can't be opened or its contents are malformed. */
+static float *
+loadVolume(const char *fn, int n[3]) {
+    FILE *f = fopen(fn, "r");
+    if (!f) {
+        perror(fn);
+        exit(1);
+    }
+
+    if (fscanf(f, "%d %d %d", &n[0], &n[1], &n[2]) != 3) {
+        fprintf(stderr, "Couldn't find resolution at start of density file\n");
+        exit(1);
+    }
+    // A zero or negative resolution would give a bogus array size below.
+    if (n[0] <= 0 || n[1] <= 0 || n[2] <= 0) {
+        fprintf(stderr, "Invalid resolution %d x %d x %d in density file\n",
+                n[0], n[1], n[2]);
+        exit(1);
+    }
+
+    int count = n[0] * n[1] * n[2];
+    float *v = new float[count];
+    for (int i = 0; i < count; ++i) {
+        if (fscanf(f, "%f", &v[i]) != 1) {
+            fprintf(stderr, "Unexpected end of file at %d'th density value\n", i);
+            exit(1);
+        }
+    }
+
+    // We're done with the file; don't leave it open.
+    fclose(f);
+    return v;
+}
+
+
+/* Renders the volume given on the command line with the ispc, ispc+tasks
+   and serial implementations, in that order.  For each implementation the
+   minimum time of three runs is reported and the resulting image is
+   written to a PPM file; finally, the speedups relative to the serial
+   implementation are printed. */
+int main(int argc, char *argv[]) {
+    if (argc != 3) {
+        fprintf(stderr, "usage: volume <camera.dat> <volume_density.vol>\n");
+        return 1;
+    }
+
+    //
+    // Load viewing data and the volume density data
+    //
+    int width, height;
+    float raster2camera[4][4], camera2world[4][4];
+    loadCamera(argv[1], &width, &height, raster2camera, camera2world);
+    float *image = new float[width*height];
+
+    int n[3];
+    float *density = loadVolume(argv[2], n);
+
+    //
+    // Compute the image using the ispc implementation; report the minimum
+    // time of three runs.
+    //
+    double minISPC = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        volume_ispc(density, n, raster2camera, camera2world,
+                    width, height, image);
+        double dt = get_elapsed_mcycles();
+        minISPC = std::min(minISPC, dt);
+    }
+
+    printf("[volume ispc 1 core]:\t\t[%.3f] million cycles\n", minISPC);
+    writePPM(image, width, height, "volume-ispc-1core.ppm");
+
+    // Clear out the buffer
+    for (int i = 0; i < width * height; ++i)
+        image[i] = 0.;
+
+    //
+    // Compute the image using the ispc implementation that also uses
+    // tasks; report the minimum time of three runs.
+    //
+    double minISPCtasks = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        volume_ispc_tasks(density, n, raster2camera, camera2world,
+                          width, height, image);
+        double dt = get_elapsed_mcycles();
+        minISPCtasks = std::min(minISPCtasks, dt);
+    }
+
+    printf("[volume ispc + tasks]:\t\t[%.3f] million cycles\n", minISPCtasks);
+    writePPM(image, width, height, "volume-ispc-tasks.ppm");
+
+    // Clear out the buffer
+    for (int i = 0; i < width * height; ++i)
+        image[i] = 0.;
+
+    // 
+    // And run the serial implementation 3 times, again reporting the
+    // minimum time.
+    //
+    double minSerial = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        volume_serial(density, n, raster2camera, camera2world,
+                      width, height, image);
+        double dt = get_elapsed_mcycles();
+        minSerial = std::min(minSerial, dt);
+    }
+
+    printf("[volume serial]:\t\t[%.3f] million cycles\n", minSerial);
+    writePPM(image, width, height, "volume-serial.ppm");
+
+    printf("\t\t\t\t(%.2fx speedup from ISPC serial, %.2fx from ISPC+tasks)\n", 
+           minSerial/minISPC, minSerial / minISPCtasks);
+
+    // Release the buffers allocated above.
+    delete[] density;
+    delete[] image;
+
+    return 0;
+}
--- a/examples/volume_rendering/volume.ispc
+++ b/examples/volume_rendering/volume.ispc
@@ -0,0 +1,385 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+typedef float<3> float3;  // 3-component float vector; components are read as .x/.y/.z below
+
+struct Ray {  // points along the ray are origin + t * dir; dir is not normalized
+    float3 origin, dir;
+};
+
+/* Builds the camera ray through raster position (x, y) and stores it in
+   'ray'.  The raster point is taken at z = 0, so only columns 0, 1 and 3
+   of raster2camera are read; the result is divided by raster2camera[3][3].
+   The camera-space direction is then multiplied by the upper-left 3x3 of
+   camera2world, and the ray origin is the last column of camera2world
+   divided by camera2world[3][3].  ray.dir is not normalized here. */
+static void
+generateRay(const uniform float raster2camera[4][4], 
+            const uniform float camera2world[4][4],
+            float x, float y, reference Ray ray) {
+    // transform raster coordinate (x, y, 0) to camera space
+    float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3];
+    float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3];
+    float camz = raster2camera[2][3];
+    float camw = raster2camera[3][3];
+    camx /= camw;
+    camy /= camw;
+    camz /= camw;
+
+    ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz;
+    ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz;
+    ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz;
+
+    ray.origin.x = camera2world[0][3] / camera2world[3][3];
+    ray.origin.y = camera2world[1][3] / camera2world[3][3];
+    ray.origin.z = camera2world[2][3] / camera2world[3][3];
+}
+
+/* Returns true when p lies within the axis-aligned box [pMin, pMax];
+   points exactly on a face of the box count as inside. */
+static inline bool
+Inside(float3 p, float3 pMin, float3 pMax) {
+    return (p.x >= pMin.x && p.x <= pMax.x &&
+            p.y >= pMin.y && p.y <= pMax.y &&
+            p.z >= pMin.z && p.z <= pMax.z);
+}
+
+/* Intersects 'ray' with the axis-aligned box [pMin, pMax], one axis at a
+   time, narrowing the parametric interval [t0, t1] after each axis.  When
+   the final interval is non-empty (t0 <= t1) the bounds are stored in
+   hit0/hit1 and true is returned; otherwise false is returned and
+   hit0/hit1 are left unmodified.  hit0 can be negative (box entry behind
+   the ray origin); both callers in this file clamp it to 0.
+   NOTE(review): a zero component in ray.dir makes the divisions below
+   divide by zero; presumably this relies on IEEE infinities -- confirm. */
+static bool
+IntersectP(Ray ray, float3 pMin, float3 pMax, reference float hit0, reference float hit1) {
+    float t0 = -1e30, t1 = 1e30;
+
+    float3 tNear = (pMin - ray.origin) / ray.dir;
+    float3 tFar  = (pMax - ray.origin) / ray.dir;
+    if (tNear.x > tFar.x) {  // order the two x-plane distances so tNear <= tFar
+        float tmp = tNear.x;
+        tNear.x = tFar.x;
+        tFar.x = tmp;
+    }
+    t0 = max(tNear.x, t0);
+    t1 = min(tFar.x, t1);
+
+    if (tNear.y > tFar.y) {  // same for the y planes
+        float tmp = tNear.y;
+        tNear.y = tFar.y;
+        tFar.y = tmp;
+    }
+    t0 = max(tNear.y, t0);
+    t1 = min(tFar.y, t1);
+
+    if (tNear.z > tFar.z) {  // same for the z planes
+        float tmp = tNear.z;
+        tNear.z = tFar.z;
+        tFar.z = tmp;
+    }
+    t0 = max(tNear.z, t0);
+    t1 = min(tFar.z, t1);
+    
+    if (t0 <= t1) {
+        hit0 = t0;
+        hit1 = t1;
+        return true;
+    }
+    else
+        return false;
+}
+
+/* Linear interpolation between a and b: yields a at t == 0 and b at t == 1. */
+static inline float Lerp(float t, float a, float b) {
+    return (1.f - t) * a + t * b;
+}
+
+/* Returns the density stored at integer voxel coordinates (x, y, z).  Each
+   coordinate is first clamped to [0, nVoxels[i]-1], so out-of-range
+   neighbors read the nearest edge voxel.  density[] is indexed with x
+   varying fastest, then y, then z.  This is the varying-coordinate
+   version; Du() below is the uniform counterpart. */
+static inline float D(int x, int y, int z, uniform int nVoxels[3], 
+                      uniform float density[]) {
+    x = clamp(x, 0, nVoxels[0]-1);
+    y = clamp(y, 0, nVoxels[1]-1);
+    z = clamp(z, 0, nVoxels[2]-1);
+
+    return density[z*nVoxels[0]*nVoxels[1] + y*nVoxels[0] + x];
+}
+
+/* Uniform-coordinate variant of D(): the same clamped lookup into
+   density[], but for a single (x, y, z) shared by all program instances. */
+static inline float Du(uniform int x, uniform int y, uniform int z, 
+                       uniform int nVoxels[3], uniform float density[]) {
+    x = clamp(x, 0, nVoxels[0]-1);
+    y = clamp(y, 0, nVoxels[1]-1);
+    z = clamp(z, 0, nVoxels[2]-1);
+
+    return density[z*nVoxels[0]*nVoxels[1] + y*nVoxels[0] + x];
+}
+
+/* Maps p to its relative position inside the box [pMin, pMax]: each
+   component is 0 at pMin and 1 at pMax. */
+static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
+    return (p - pMin) / (pMax - pMin);
+}
+
+/* Returns the trilinearly interpolated density at point Pobj, or 0 when
+   Pobj lies outside the volume bounds [pMin, pMax].
+   checkForSameVoxel is an in/out flag carried across calls by the caller:
+   while it is true, each call tests whether the program instances all
+   landed in the same voxel and, if so, uses the uniform lookup Du().  The
+   first time that test fails the flag is cleared, so later calls go
+   straight to the general D() path. */
+static inline float Density(float3 Pobj, float3 pMin, float3 pMax, 
+                            uniform float density[], uniform int nVoxels[3],
+                            reference uniform bool checkForSameVoxel) {
+    if (!Inside(Pobj, pMin, pMax)) 
+        return 0;
+    // Compute voxel coordinates and offsets for _Pobj_
+    float3 vox = Offset(Pobj, pMin, pMax);
+    vox.x = vox.x * nVoxels[0] - .5f;  // the -.5 puts sample positions at voxel centers
+    vox.y = vox.y * nVoxels[1] - .5f;
+    vox.z = vox.z * nVoxels[2] - .5f;
+    int vx = (int)(vox.x), vy = (int)(vox.y), vz = (int)(vox.z);
+    float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz;  // interpolation weights per axis
+
+    // Trilinearly interpolate density values to compute local density
+    float d00, d10, d01, d11;
+    uniform int uvx, uvy, uvz;  // presumably filled in by reduce_equal() when it succeeds -- confirm
+    if (checkForSameVoxel && reduce_equal(vx, uvx) && reduce_equal(vy, uvy) &&
+        reduce_equal(vz, uvz)) {
+        // If all of the program instances are inside the same voxel, then
+        // we'll call the 'uniform' variant of the voxel density lookup
+        // function, thus doing a single load for each value rather than a
+        // gather.
+        d00 = Lerp(dx, Du(uvx, uvy, uvz, nVoxels, density),     
+                       Du(uvx+1, uvy, uvz, nVoxels, density));
+        d10 = Lerp(dx, Du(uvx, uvy+1, uvz, nVoxels, density),   
+                       Du(uvx+1, uvy+1, uvz, nVoxels, density));
+        d01 = Lerp(dx, Du(uvx, uvy, uvz+1, nVoxels, density),   
+                       Du(uvx+1, uvy, uvz+1, nVoxels, density));
+        d11 = Lerp(dx, Du(uvx, uvy+1, uvz+1, nVoxels, density), 
+                       Du(uvx+1, uvy+1, uvz+1, nVoxels, density));
+    }
+    else {
+        // Otherwise, we have to do an actual gather in the more general
+        // D() function.  Once the reduce_equal tests above fail, we stop
+        // checking in subsequent steps, since it's unlikely that this will
+        // be true in the future once they've diverged into different
+        // voxels.
+        checkForSameVoxel = false;
+        d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),     
+                       D(vx+1, vy, vz, nVoxels, density));
+        d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),   
+                       D(vx+1, vy+1, vz, nVoxels, density));
+        d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),   
+                       D(vx+1, vy, vz+1, nVoxels, density));
+        d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density), 
+                       D(vx+1, vy+1, vz+1, nVoxels, density));
+    }
+    float d0 = Lerp(dy, d00, d10);  // then interpolate along y ...
+    float d1 = Lerp(dy, d01, d11);
+    return Lerp(dz, d0, d1);        // ... and finally along z
+}
+
+
+/* Returns the transmittance between two points p0 and p1, in a volume
+   with extent (pMin,pMax) and attenuation coefficient sigma_t, whose
+   density is given by the density[] array of nVoxels[0] x nVoxels[1] x
+   nVoxels[2] voxels.
+   The ray starts at p1 and points toward p0.  It is marched in steps of
+   stepDist (0.2) over the part of the ray that is inside the box, summing
+   stepDist * sigma_t * density into tau; the result is exp(-tau).
+   Returns 1 when the ray misses the box.
+   Note that rayT1 is not clamped to 1 (the parametric position of p0), so
+   the marched range equals the p1..p0 segment only when p0 lies outside
+   the box -- which holds for the light position passed by raymarch(). */
+static float
+transmittance(uniform float3 p0, float3 p1, uniform float3 pMin,
+              uniform float3 pMax, uniform float sigma_t, 
+              uniform float density[], uniform int nVoxels[3]) {
+    float rayT0, rayT1;
+    Ray ray;
+    ray.origin = p1;
+    ray.dir = p0 - p1;
+
+    // Find the parametric t range along the ray that is inside the volume.
+    if (!IntersectP(ray, pMin, pMax, rayT0, rayT1))
+        return 1.;
+
+    rayT0 = max(rayT0, 0.f);  // never start behind the ray origin
+
+    // Accumulate optical thickness in tau; the transmittance is exp(-tau)
+    float tau = 0;
+    float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
+                           ray.dir.z * ray.dir.z);
+    uniform float stepDist = 0.2;
+    float stepT = stepDist / rayLength;  // step size expressed in the ray's t parameter
+
+    float t = rayT0;
+    float3 pos = ray.origin + ray.dir * rayT0;
+    float3 dirStep = ray.dir * stepT;
+    uniform bool checkForSameVoxel = true;
+    while (t < rayT1) {
+        tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels,
+                                            checkForSameVoxel);
+        pos = pos + dirStep;
+        t += stepT;
+    }
+
+    return exp(-tau);
+}
+
+/* Returns the squared Euclidean distance between points a and b. */
+static inline float
+distanceSquared(float3 a, float3 b) {
+    float3 d = a-b;
+    return d.x*d.x + d.y*d.y + d.z*d.z;
+}
+
+/* Marches 'ray' through the density volume and returns the resulting
+   radiance after gamma correction (exponent 1/2.2).  The volume bounds,
+   light position and scattering parameters are hard-coded below.  Returns
+   0 when the ray misses the volume's bounding box.  At every step the
+   light's contribution is attenuated by a transmittance() march toward
+   the light. */
+static float 
+raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
+    float rayT0, rayT1;
+    uniform float3 pMin = {.3, -.2, .3}, pMax = {1.8, 2.3, 1.8};
+    uniform float3 lightPos = { -1, 4, 1.5 };
+
+    // NOTE(review): cif/cwhile/cbreak are presumably ispc's "coherent"
+    // control-flow forms -- confirm against the ispc documentation.
+    cif (!IntersectP(ray, pMin, pMax, rayT0, rayT1))
+        return 0.;
+
+    rayT0 = max(rayT0, 0.f);  // never start behind the ray origin
+
+    // Parameters that define the volume scattering characteristics and
+    // sampling rate for raymarching
+    uniform float Le = .25;            // Emission coefficient
+    uniform float sigma_a = 10;        // Absorption coefficient
+    uniform float sigma_s = 10;        // Scattering coefficient
+    uniform float stepDist = 0.025;    // Ray step amount
+    uniform float lightIntensity = 40; // Light source intensity
+
+    float tau = 0.f;  // accumulated optical thickness; attenuation is exp(-tau)
+    float L = 0;      // radiance along the ray
+    float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
+                           ray.dir.z * ray.dir.z);
+    float stepT = stepDist / rayLength;  // step size expressed in the ray's t parameter
+
+    float t = rayT0;
+    float3 pos = ray.origin + ray.dir * rayT0;
+    float3 dirStep = ray.dir * stepT;
+    uniform bool checkForSameVoxel = true;
+    cwhile (t < rayT1) {
+        float d = Density(pos, pMin, pMax, density, nVoxels, checkForSameVoxel);
+
+        // terminate once attenuation is high
+        float atten = exp(-tau);
+        if (atten < .005)
+            cbreak;
+
+        // direct lighting
+        float Li = lightIntensity / distanceSquared(lightPos, pos) * 
+            transmittance(lightPos, pos, pMin, pMax, sigma_a + sigma_s,
+                          density, nVoxels);
+        L += stepDist * atten * d * sigma_s * (Li + Le);
+
+        // update beam transmittance
+        tau += stepDist * (sigma_a + sigma_s) * d;
+
+        pos = pos + dirStep;
+        t += stepT;
+    }
+
+    // Gamma correction
+    return pow(L, 1.f / 2.2f);
+}
+
+
+/* Utility routine used by both the task-based and the single-core entrypoints.
+   Renders a tile of the image, covering [x0,x1) * [y0, y1), storing the
+   result into the image[] array.  image[] is indexed as y * width + x.
+   The 'height' parameter is not used by this function.
+ */
+static void
+volume_tile(uniform int x0, uniform int y0, uniform int x1,
+            uniform int y1, uniform float density[], uniform int nVoxels[3], 
+            const uniform float raster2camera[4][4],
+            const uniform float camera2world[4][4], 
+            uniform int width, uniform int height, uniform float image[]) {
+    // Work on 4x4=16 pixel big tiles of the image.  This function thus
+    // implicitly assumes that both (x1-x0) and (y1-y0) are evenly divisible
+    // by 4.
+    for (uniform int y = y0; y < y1; y += 4) {
+        for (uniform int x = x0; x < x1; x += 4) {
+            // For each such tile, process programCount pixels at a time,
+            // until we've done all 16 of them.  Thus, we're also assuming
+            // that programCount <= 16 and that 16 is evenly divisible by
+            // programCount.
+            for (uniform int o = 0; o < 16; o += programCount) {
+                // These two arrays encode the mapping from [0,15] to
+                // offsets within the 4x4 pixel block so that we render
+                // each pixel inside the block
+                const uniform int xoffsets[16] = { 0, 1, 0, 1, 2, 3, 2, 3,
+                                                   0, 1, 0, 1, 2, 3, 2, 3 };
+                const uniform int yoffsets[16] = { 0, 0, 1, 1, 0, 0, 1, 1,
+                                                   2, 2, 3, 3, 2, 2, 3, 3 };
+
+                // Figure out the pixel to render for this program instance
+                int xo = x + xoffsets[o + programIndex];
+                int yo = y + yoffsets[o + programIndex];
+
+                // Use viewing parameters to compute the corresponding ray
+                // for the pixel
+                Ray ray;
+                generateRay(raster2camera, camera2world, xo, yo, ray);
+
+                // And raymarch through the volume to compute the pixel's
+                // value
+                int offset = yo * width + xo;
+                image[offset] = raymarch(density, nVoxels, ray);
+            }
+        }
+    }
+}
+
+/* Task entry point: renders the (dx x dy)-pixel tile selected by taskIndex.
+   Tiles are numbered row by row, xbuckets tiles per row.  The tile extent
+   is clamped to the image size before calling volume_tile().
+   NOTE(review): volume_tile() always writes whole 4x4 pixel blocks, so a
+   clamped x1/y1 that is not a multiple of 4 past x0/y0 is still rounded up
+   by it -- this appears to require width and height to be multiples of 4. */
+task void
+volume_task(uniform float density[], uniform int nVoxels[3], 
+            const uniform float raster2camera[4][4],
+            const uniform float camera2world[4][4], 
+            uniform int width, uniform int height, uniform float image[]) {
+    uniform int dx = 8, dy = 8; // must match value in volume_ispc_tasks
+    uniform int xbuckets = (width + (dx-1)) / dx;   // number of tiles per row, rounded up
+    uniform int ybuckets = (height + (dy-1)) / dy;  // number of tile rows, rounded up
+
+    uniform int x0 = (taskIndex % xbuckets) * dx;
+    uniform int y0 = (taskIndex / xbuckets) * dy;
+    uniform int x1 = x0 + dx, y1 = y0 + dy;
+    x1 = min(x1, width);
+    y1 = min(y1, height);
+
+    volume_tile(x0, y0, x1, y1, density, nVoxels, raster2camera,
+                 camera2world, width, height, image);
+}
+
+/* Exported entry point that renders the whole width x height image with a
+   single volume_tile() call, i.e. without launching any tasks. */
+export void
+volume_ispc(uniform float density[], uniform int nVoxels[3], 
+            const uniform float raster2camera[4][4],
+            const uniform float camera2world[4][4], 
+            uniform int width, uniform int height, uniform float image[]) {
+    volume_tile(0, 0, width, height, density, nVoxels, raster2camera, 
+                camera2world, width, height,  image);
+}
+
+/* Exported entry point that renders the image by launching one
+   volume_task per (dx x dy)-pixel tile.  The tile size here must stay in
+   sync with the dx/dy values hard-coded in volume_task(). */
+export void
+volume_ispc_tasks(uniform float density[], uniform int nVoxels[3], 
+                  const uniform float raster2camera[4][4],
+                  const uniform float camera2world[4][4], 
+                  uniform int width, uniform int height, uniform float image[]) {
+    // Launch tasks to work on (dx,dy)-sized tiles of the image
+    uniform int dx = 8, dy = 8;
+    uniform int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy);  // tile counts rounded up on both axes
+    launch[nTasks] < volume_task(density, nVoxels, raster2camera, camera2world, 
+                                 width, height, image) >;
+}
--- a/examples/volume_rendering/volume.vcxproj
+++ b/examples/volume_rendering/volume.vcxproj
@@ -0,0 +1,176 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{dee5733a-e93e-449d-9114-9bffcaeb4df9}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>volume</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="volume.cpp" />
+    <ClCompile Include="volume_serial.cpp" />
+    <ClCompile Include="../tasksys.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="volume.ispc">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/examples/volume_rendering/volume_serial.cpp
+++ b/examples/volume_rendering/volume_serial.cpp
@@ -0,0 +1,302 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#include <assert.h>
+#include <math.h>
+#include <algorithm>
+
+/* Just enough of a float3 class to do what we need in this file:
+   component-wise +, -, * and / between two float3s, scaling by a float,
+   and indexed component access. */
+struct float3 {
+    float3() { }  // leaves x, y, z and pad uninitialized
+    float3(float xx, float yy, float zz) { x = xx; y = yy; z = zz; }
+
+    float3 operator*(float f) const { return float3(x*f, y*f, z*f); }
+    float3 operator-(const float3 &f2) const { 
+        return float3(x-f2.x, y-f2.y, z-f2.z); 
+    }
+    float3 operator*(const float3 &f2) const { 
+        return float3(x*f2.x, y*f2.y, z*f2.z); 
+    }
+    float3 operator+(const float3 &f2) const { 
+        return float3(x+f2.x, y+f2.y, z+f2.z); 
+    }
+    float3 operator/(const float3 &f2) const { 
+        return float3(x/f2.x, y/f2.y, z/f2.z); 
+    }
+    float operator[](int i) const { return (&x)[i]; }  // indexes from &x: relies on x, y, z being contiguous
+    float &operator[](int i) { return (&x)[i]; }
+
+    float x, y, z;
+    float pad;  // match padding/alignment of ispc version 
+}
+#ifndef _MSC_VER
+__attribute__ ((aligned(16)))  // 16-byte alignment is only requested on non-MSVC compilers
+#endif
+;
+
+struct Ray {  // points along the ray are origin + dir * t; dir is not normalized
+    float3 origin, dir;
+};
+
+/* Builds the camera ray through raster position (x, y) and stores it in
+   'ray'.  The raster point is taken at z = 0, so only columns 0, 1 and 3
+   of raster2camera are read; the result is divided by raster2camera[3][3].
+   The camera-space direction is then multiplied by the upper-left 3x3 of
+   camera2world, and the ray origin is the last column of camera2world
+   divided by camera2world[3][3].  ray.dir is not normalized here. */
+static void
+generateRay(const float raster2camera[4][4], const float camera2world[4][4],
+            float x, float y, Ray &ray) {
+    // transform raster coordinate (x, y, 0) to camera space
+    float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3];
+    float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3];
+    float camz = raster2camera[2][3];
+    float camw = raster2camera[3][3];
+    camx /= camw;
+    camy /= camw;
+    camz /= camw;
+
+    ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz;
+    ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz;
+    ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz;
+
+    ray.origin.x = camera2world[0][3] / camera2world[3][3];
+    ray.origin.y = camera2world[1][3] / camera2world[3][3];
+    ray.origin.z = camera2world[2][3] / camera2world[3][3];
+}
+
+/* Returns true when p lies within the axis-aligned box [pMin, pMax];
+   points exactly on a face of the box count as inside. */
+static bool
+Inside(float3 p, float3 pMin, float3 pMax) {
+    return (p.x >= pMin.x && p.x <= pMax.x &&
+            p.y >= pMin.y && p.y <= pMax.y &&
+            p.z >= pMin.z && p.z <= pMax.z);
+}
+
+/* Intersects 'ray' with the axis-aligned box [pMin, pMax], one axis at a
+   time, narrowing the parametric interval [t0, t1] after each axis.  When
+   the final interval is non-empty (t0 <= t1) the bounds are stored through
+   hit0/hit1 and true is returned; otherwise false is returned and neither
+   pointer is written.  *hit0 can be negative (box entry behind the ray
+   origin); both callers in this file clamp it to 0.
+   NOTE(review): a zero component in ray.dir makes the divisions below
+   divide by zero; presumably this relies on IEEE infinities -- confirm. */
+static bool
+IntersectP(const Ray &ray, float3 pMin, float3 pMax, float *hit0, float *hit1) {
+    float t0 = -1e30f, t1 = 1e30f;
+
+    float3 tNear = (pMin - ray.origin) / ray.dir;
+    float3 tFar  = (pMax - ray.origin) / ray.dir;
+    if (tNear.x > tFar.x) {  // order the two x-plane distances so tNear <= tFar
+        float tmp = tNear.x;
+        tNear.x = tFar.x;
+        tFar.x = tmp;
+    }
+    t0 = std::max(tNear.x, t0);
+    t1 = std::min(tFar.x, t1);
+
+    if (tNear.y > tFar.y) {  // same for the y planes
+        float tmp = tNear.y;
+        tNear.y = tFar.y;
+        tFar.y = tmp;
+    }
+    t0 = std::max(tNear.y, t0);
+    t1 = std::min(tFar.y, t1);
+
+    if (tNear.z > tFar.z) {  // same for the z planes
+        float tmp = tNear.z;
+        tNear.z = tFar.z;
+        tFar.z = tmp;
+    }
+    t0 = std::max(tNear.z, t0);
+    t1 = std::min(tFar.z, t1);
+    
+    if (t0 <= t1) {
+        *hit0 = t0;
+        *hit1 = t1;
+        return true;
+    }
+    else
+        return false;
+}
+
+/* Linear interpolation between a and b: yields a at t == 0 and b at t == 1. */
+static inline float Lerp(float t, float a, float b) {
+    return (1.f - t) * a + t * b;
+}
+
+/* Restricts v to the range [low, high]. */
+static inline int Clamp(int v, int low, int high) {
+    return std::min(std::max(v, low), high);
+}
+
+/* Returns the density stored at integer voxel coordinates (x, y, z).  Each
+   coordinate is first clamped to [0, nVoxels[i]-1], so out-of-range
+   neighbors read the nearest edge voxel.  density[] is indexed with x
+   varying fastest, then y, then z. */
+static inline float D(int x, int y, int z, int nVoxels[3], float density[]) {
+    x = Clamp(x, 0, nVoxels[0]-1);
+    y = Clamp(y, 0, nVoxels[1]-1);
+    z = Clamp(z, 0, nVoxels[2]-1);
+    return density[z*nVoxels[0]*nVoxels[1] + y*nVoxels[0] + x];
+}
+
+/* Maps p to its relative position inside the box [pMin, pMax]: each
+   component is 0 at pMin and 1 at pMax. */
+static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
+    return float3((p.x - pMin.x) / (pMax.x - pMin.x),
+                  (p.y - pMin.y) / (pMax.y - pMin.y),
+                  (p.z - pMin.z) / (pMax.z - pMin.z));
+}
+
+/* Returns the trilinearly interpolated density at point Pobj, or 0 when
+   Pobj lies outside the volume bounds [pMin, pMax].  The eight
+   surrounding voxel values are fetched with D(), which clamps indices at
+   the edges of the grid. */
+static inline float Density(float3 Pobj, float3 pMin, float3 pMax, 
+                            float density[], int nVoxels[3]) {
+    if (!Inside(Pobj, pMin, pMax)) 
+        return 0;
+    // Compute voxel coordinates and offsets for _Pobj_
+    float3 vox = Offset(Pobj, pMin, pMax);
+    vox.x = vox.x * nVoxels[0] - .5f;  // the -.5 puts sample positions at voxel centers
+    vox.y = vox.y * nVoxels[1] - .5f;
+    vox.z = vox.z * nVoxels[2] - .5f;
+    int vx = (int)(vox.x), vy = (int)(vox.y), vz = (int)(vox.z);
+    float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz;  // interpolation weights per axis
+
+    // Trilinearly interpolate density values to compute local density
+    float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),     
+                         D(vx+1, vy, vz, nVoxels, density));
+    float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),   
+                         D(vx+1, vy+1, vz, nVoxels, density));
+    float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),   
+                         D(vx+1, vy, vz+1, nVoxels, density));
+    float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density), 
+                         D(vx+1, vy+1, vz+1, nVoxels, density));
+    float d0 = Lerp(dy, d00, d10);  // then interpolate along y ...
+    float d1 = Lerp(dy, d01, d11);
+    return Lerp(dz, d0, d1);        // ... and finally along z
+}
+
+
+/* Returns the transmittance between points p0 and p1 through the density
+   volume with bounds [pMin, pMax] and attenuation coefficient sigma_t.
+   The ray starts at p1 and points toward p0.  It is marched in steps of
+   stepDist (0.2) over the part of the ray that is inside the box, summing
+   stepDist * sigma_t * density into tau; the result is expf(-tau).
+   Returns 1 when the ray misses the box.
+   Note that rayT1 is not clamped to 1 (the parametric position of p0), so
+   the marched range equals the p1..p0 segment only when p0 lies outside
+   the box -- which holds for the light position passed by raymarch(). */
+static float
+transmittance(float3 p0, float3 p1, float3 pMin,
+              float3 pMax, float sigma_t, float density[], int nVoxels[3]) {
+    float rayT0, rayT1;
+    Ray ray;
+    ray.origin = p1;
+    ray.dir = p0 - p1;
+
+    // Find the parametric t range along the ray that is inside the volume.
+    if (!IntersectP(ray, pMin, pMax, &rayT0, &rayT1))
+        return 1.;
+
+    rayT0 = std::max(rayT0, 0.f);  // never start behind the ray origin
+
+    // Accumulate optical thickness in tau; the transmittance is exp(-tau)
+    float tau = 0;
+    float rayLength = sqrtf(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
+                            ray.dir.z * ray.dir.z);
+    float stepDist = 0.2f;
+    float stepT = stepDist / rayLength;  // step size expressed in the ray's t parameter
+
+    float t = rayT0;
+    float3 pos = ray.origin + ray.dir * rayT0;
+    float3 dirStep = ray.dir * stepT;
+    while (t < rayT1) {
+        tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels);
+        pos = pos + dirStep;
+        t += stepT;
+    }
+
+    return expf(-tau);
+}
+
+/* Returns the squared Euclidean distance between points a and b. */
+static float
+distanceSquared(float3 a, float3 b) {
+    float3 d = a-b;
+    return d.x*d.x + d.y*d.y + d.z*d.z;
+}
+
+/* Marches 'ray' through the density volume and returns the resulting
+   radiance after gamma correction (exponent 1/2.2).  The volume bounds,
+   light position and scattering parameters are hard-coded below.  Returns
+   0 when the ray misses the volume's bounding box.  At every step the
+   light's contribution is attenuated by a transmittance() march toward
+   the light. */
+static float 
+raymarch(float density[], int nVoxels[3], const Ray &ray) {
+    float rayT0, rayT1;
+    float3 pMin(.3f, -.2f, .3f), pMax(1.8f, 2.3f, 1.8f);
+    float3 lightPos(-1.f, 4.f, 1.5f);
+
+    if (!IntersectP(ray, pMin, pMax, &rayT0, &rayT1))
+        return 0.;
+
+    rayT0 = std::max(rayT0, 0.f);  // never start behind the ray origin
+
+    // Parameters that define the volume scattering characteristics and
+    // sampling rate for raymarching
+    float Le = .25f;           // Emission coefficient
+    float sigma_a = 10;        // Absorption coefficient
+    float sigma_s = 10;        // Scattering coefficient
+    float stepDist = 0.025f;   // Ray step amount
+    float lightIntensity = 40; // Light source intensity
+
+    float tau = 0.f;  // accumulated optical thickness; attenuation is exp(-tau)
+    float L = 0;      // radiance along the ray
+    float rayLength = sqrtf(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
+                            ray.dir.z * ray.dir.z);
+    float stepT = stepDist / rayLength;  // step size expressed in the ray's t parameter
+
+    float t = rayT0;
+    float3 pos = ray.origin + ray.dir * rayT0;
+    float3 dirStep = ray.dir * stepT;
+    while (t < rayT1) {
+        float d = Density(pos, pMin, pMax, density, nVoxels);
+
+        // terminate once attenuation is high
+        float atten = expf(-tau);
+        if (atten < .005f)
+            break;
+
+        // direct lighting
+        float Li = lightIntensity / distanceSquared(lightPos, pos) * 
+            transmittance(lightPos, pos, pMin, pMax, sigma_a + sigma_s,
+                          density, nVoxels);
+        L += stepDist * atten * d * sigma_s * (Li + Le);
+
+        // update beam transmittance
+        tau += stepDist * (sigma_a + sigma_s) * d;
+
+        pos = pos + dirStep;
+        t += stepT;
+    }
+
+    // Gamma correction
+    return powf(L, 1.f / 2.2f);
+}
+
+/* Serial implementation of the renderer: for every pixel, row by row,
+   generates the camera ray and stores the raymarch() result in image[],
+   so pixel (x, y) ends up at image[y * width + x]. */
+void
+volume_serial(float density[], int nVoxels[3], const float raster2camera[4][4],
+              const float camera2world[4][4], 
+              int width, int height, float image[]) {
+    int offset = 0;  // running index into image[], advanced once per pixel
+    for (int y = 0; y < height; ++y) {
+        for (int x = 0; x < width; ++x, ++offset) {
+            Ray ray;
+            generateRay(raster2camera, camera2world, (float)x, (float)y, ray);
+            image[offset] = raymarch(density, nVoxels, ray);
+        }
+    }
+}
--- a/expr.cpp
+++ b/expr.cpp
--- a/expr.h
+++ b/expr.h
@@ -39,6 +39,8 @@
 #define ISPC_EXPR_H 1

 #include "ispc.h"
+#include "ast.h"
+#include "type.h"

 class FunctionSymbolExpr;

@@ -96,7 +98,7 @@ public:
        that incorporates the given error message string.  In either
        failure case, NULL is returned.  */
    Expr *TypeConv(const Type *type, const char *errorMsgBase = NULL, 
-                   bool failureOk = false);
+                   bool failureOk = false, bool issuePrecisionWarnings = true);
 };


@@ -120,8 +122,8 @@ public:
    void Print() const;
    Expr *Optimize();
    Expr *TypeCheck();
+    int EstimateCost() const;

-private:
    const Op op;
    Expr *expr;
 };
@@ -163,8 +165,8 @@ public:

    Expr *Optimize();
    Expr *TypeCheck();
+    int EstimateCost() const;

-private:
    const Op op;
    Expr *arg0, *arg1;
 };
@@ -195,8 +197,8 @@ public:

    Expr *Optimize();
    Expr *TypeCheck();
+    int EstimateCost() const;

-private:
    const Op op;
    Expr *lvalue, *rvalue;
 };
@@ -216,8 +218,8 @@ public:

    Expr *Optimize();
    Expr *TypeCheck();
+    int EstimateCost() const;

-private:
    Expr *test, *expr1, *expr2;
 };

@@ -239,6 +241,7 @@ public:
    llvm::Constant *GetConstant(const Type *type) const;
    ExprList *Optimize();
    ExprList *TypeCheck();
+    int EstimateCost() const;

    std::vector<Expr *> exprs;
 };
@@ -248,7 +251,8 @@ public:
 */
 class FunctionCallExpr : public Expr {
 public:
-    FunctionCallExpr(Expr *func, ExprList *args, SourcePos p, bool isLaunch);
+    FunctionCallExpr(Expr *func, ExprList *args, SourcePos p, 
+                     bool isLaunch = false, Expr *launchCountExpr = NULL);

    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
    const Type *GetType() const;
@@ -256,13 +260,15 @@ public:

    Expr *Optimize();
    Expr *TypeCheck();
+    int EstimateCost() const;

-private:
    Expr *func;
    ExprList *args;
    bool isLaunch;
+    Expr *launchCountExpr;

-    void resolveFunctionOverloads();
+private:
+    void resolveFunctionOverloads(bool exactMatchOnly);
    bool tryResolve(bool (*matchFunc)(Expr *, const Type *));
 };

@@ -284,16 +290,21 @@ public:

    Expr *Optimize();
    Expr *TypeCheck();
+    int EstimateCost() const;

-private:
    Expr *arrayOrVector, *index;
 };


 /** @brief Expression representing member selection ("foo.bar").
+ *
+ *  This will also be overloaded to deal with swizzles.
 */
 class MemberExpr : public Expr {
 public:
+    static MemberExpr* create(Expr *expr, const char *identifier,
+                              SourcePos pos, SourcePos identifierPos);
+
    MemberExpr(Expr *expr, const char *identifier, SourcePos pos, 
               SourcePos identifierPos);

@@ -304,10 +315,11 @@ public:
    void Print() const;
    Expr *Optimize();
    Expr *TypeCheck();
+    int EstimateCost() const;
+
+    virtual int getElementNumber() const;

-private:
    std::string getCandidateNearMatches() const;
-    int getElementNumber() const;

    Expr *expr;
    std::string identifier;
@@ -318,12 +330,30 @@ private:
 /** @brief Expression representing a compile-time constant value.  

    This class can currently represent compile-time constants of anything
-    that is an AtomicType; for anything more complex, we don't currently
-    have a representation of a compile-time constant that can be further
-    reasoned about.
+    that is an AtomicType or an EnumType; for anything more complex, we
+    don't currently have a representation of a compile-time constant that
+    can be further reasoned about.
 */
 class ConstExpr : public Expr {
 public:
+    /** Create a ConstExpr from a uniform int8 value */
+    ConstExpr(const Type *t, int8_t i, SourcePos p);
+    /** Create a ConstExpr from a varying int8 value */
+    ConstExpr(const Type *t, int8_t *i, SourcePos p);
+    /** Create a ConstExpr from a uniform uint8 value */
+    ConstExpr(const Type *t, uint8_t u, SourcePos p);
+    /** Create a ConstExpr from a varying uint8 value */
+    ConstExpr(const Type *t, uint8_t *u, SourcePos p);
+
+    /** Create a ConstExpr from a uniform int16 value */
+    ConstExpr(const Type *t, int16_t i, SourcePos p);
+    /** Create a ConstExpr from a varying int16 value */
+    ConstExpr(const Type *t, int16_t *i, SourcePos p);
+    /** Create a ConstExpr from a uniform uint16 value */
+    ConstExpr(const Type *t, uint16_t u, SourcePos p);
+    /** Create a ConstExpr from a varying uint16 value */
+    ConstExpr(const Type *t, uint16_t *u, SourcePos p);
+
    /** Create a ConstExpr from a uniform int32 value */
    ConstExpr(const Type *t, int32_t i, SourcePos p);
    /** Create a ConstExpr from a varying int32 value */
@@ -332,14 +362,17 @@ public:
    ConstExpr(const Type *t, uint32_t u, SourcePos p);
    /** Create a ConstExpr from a varying uint32 value */
    ConstExpr(const Type *t, uint32_t *u, SourcePos p);
+
    /** Create a ConstExpr from a uniform float value */
    ConstExpr(const Type *t, float f, SourcePos p);
    /** Create a ConstExpr from a varying float value */
    ConstExpr(const Type *t, float *f, SourcePos p);
+
    /** Create a ConstExpr from a uniform double value */
    ConstExpr(const Type *t, double d, SourcePos p);
    /** Create a ConstExpr from a varying double value */
    ConstExpr(const Type *t, double *d, SourcePos p);
+
    /** Create a ConstExpr from a uniform int64 value */
    ConstExpr(const Type *t, int64_t i, SourcePos p);
    /** Create a ConstExpr from a varying int64 value */
@@ -348,10 +381,12 @@ public:
    ConstExpr(const Type *t, uint64_t i, SourcePos p);
    /** Create a ConstExpr from a varying uint64 value */
    ConstExpr(const Type *t, uint64_t *i, SourcePos p);
+
    /** Create a ConstExpr from a uniform bool value */
    ConstExpr(const Type *t, bool b, SourcePos p);
    /** Create a ConstExpr from a varying bool value */
    ConstExpr(const Type *t, bool *b, SourcePos p);
+
    /** Create a ConstExpr of the same type as the given old ConstExpr,
        with values given by the "values" parameter. */
    ConstExpr(ConstExpr *old, double *values);
@@ -363,6 +398,7 @@ public:

    Expr *TypeCheck();
    Expr *Optimize();
+    int EstimateCost() const;

    /** Return the ConstExpr's values as booleans, doing type conversion
        from the actual type if needed.  If forceVarying is true, then type
@@ -370,6 +406,30 @@ public:
        equal to the target vector width into the given pointer. */
    int AsBool(bool *, bool forceVarying = false) const;

+    /** Return the ConstExpr's values as int8s, doing type conversion
+        from the actual type if needed.  If forceVarying is true, then type
+        convert to 'varying' so as to always return a number of values
+        equal to the target vector width into the given pointer. */
+    int AsInt8(int8_t *, bool forceVarying = false) const;
+
+    /** Return the ConstExpr's values as uint8s, doing type conversion
+        from the actual type if needed.  If forceVarying is true, then type
+        convert to 'varying' so as to always return a number of values
+        equal to the target vector width into the given pointer. */
+    int AsUInt8(uint8_t *, bool forceVarying = false) const;
+
+    /** Return the ConstExpr's values as int16s, doing type conversion
+        from the actual type if needed.  If forceVarying is true, then type
+        convert to 'varying' so as to always return a number of values
+        equal to the target vector width into the given pointer. */
+    int AsInt16(int16_t *, bool forceVarying = false) const;
+
+    /** Return the ConstExpr's values as uint16s, doing type conversion
+        from the actual type if needed.  If forceVarying is true, then type
+        convert to 'varying' so as to always return a number of values
+        equal to the target vector width into the given pointer. */
+    int AsUInt16(uint16_t *, bool forceVarying = false) const;
+
    /** Return the ConstExpr's values as int32s, doing type conversion
        from the actual type if needed.  If forceVarying is true, then type
        convert to 'varying' so as to always return a number of values
@@ -412,8 +472,14 @@ public:
    int Count() const;

 private:
-    const AtomicType *type;
+    AtomicType::BasicType getBasicType() const;
+
+    const Type *type;
    union {
+        int8_t int8Val[ISPC_MAX_NVEC];
+        uint8_t uint8Val[ISPC_MAX_NVEC];
+        int16_t int16Val[ISPC_MAX_NVEC];
+        uint16_t uint16Val[ISPC_MAX_NVEC];
        int32_t int32Val[ISPC_MAX_NVEC];
        uint32_t uint32Val[ISPC_MAX_NVEC];
        bool boolVal[ISPC_MAX_NVEC];
@@ -436,8 +502,8 @@ public:
    void Print() const;
    Expr *TypeCheck();
    Expr *Optimize();
+    int EstimateCost() const;

-private:
    const Type *type;
    Expr *expr;
 };
@@ -455,8 +521,8 @@ public:
    void Print() const;
    Expr *TypeCheck();
    Expr *Optimize();
+    int EstimateCost() const;

-private:
    Expr *expr;
 };

@@ -474,8 +540,8 @@ public:
    void Print() const;
    Expr *TypeCheck();
    Expr *Optimize();
+    int EstimateCost() const;

-private:
    Expr *expr;
 };

@@ -492,6 +558,7 @@ public:
    Expr *TypeCheck();
    Expr *Optimize();
    void Print() const;
+    int EstimateCost() const;

 private:
    Symbol *symbol;
@@ -503,7 +570,7 @@ private:
 */    
 class FunctionSymbolExpr : public Expr {
 public:
-    FunctionSymbolExpr(std::vector<Symbol *> *candidateFunctions, 
+    FunctionSymbolExpr(const char *name, std::vector<Symbol *> *candidateFunctions,
                       SourcePos pos);

    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
@@ -512,10 +579,14 @@ public:
    Expr *TypeCheck();
    Expr *Optimize();
    void Print() const;
+    int EstimateCost() const;

 private:
    friend class FunctionCallExpr;

+    /** Name of the function that is being called. */
+    std::string name;
+
    /** All of the functions with the name given in the function call;
        there may be more than one, in which case we need to resolve which
        overload is the best match. */
@@ -538,6 +609,7 @@ public:
    Expr *TypeCheck();
    Expr *Optimize();
    void Print() const;
+    int EstimateCost() const;
 };

 #endif // ISPC_EXPR_H
--- a/Show More
+++ b/Show More