Compare commits
128 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | 2f35bc1a0f | | |
| | 1620e0508d | | |
| | cb7976bbf6 | | |
| | 5ee4d7fce8 | | |
| | 8f3e46f67e | | |
| | 9ed07ff2b5 | | |
| | 32a0a30cf5 | | |
| | 6d39d5fc3e | | |
| | c999c8a237 | | |
| | aad269fdf4 | | |
| | d45c536c47 | | |
| | f1b8e5b1bf | | |
| | e7a70b05af | | |
| | cf73286938 | | |
| | e6f80c0adc | | |
| | 5e31d7b6d0 | | |
| | 649f2ad7b7 | | |
| | fade1cdf1d | | |
| | d261105a86 | | |
| | b3d3e8987b | | |
| | 4e91f3777a | | |
| | 5584240c7f | | |
| | 7126a39092 | | |
| | 8ad28a3f6f | | |
| | 9921b8e530 | | |
| | 9052d4b10b | | |
| | 2405dae8e6 | | |
| | 3607f3e045 | | |
| | de84acfa5d | | |
| | a501ab1aa6 | | |
| | cdc850f98c | | |
| | ca87579f23 | | |
| | 38fc13d1ab | | |
| | cf9d9f717e | | |
| | 173632f446 | | |
| | 1dedd88132 | | |
| | 0848c2cc19 | | |
| | e2a88d491f | | |
| | 30f9dcd4f5 | | |
| | 0c344b6755 | | |
| | 6734021520 | | |
| | dd153d3c5c | | |
| | 9ca7541d52 | | |
| | 0c20483853 | | |
| | 9d4ff1bc06 | | |
| | 83f22f1939 | | |
| | 6375ed9224 | | |
| | cf23cf9ef4 | | |
| | 1147b53dcd | | |
| | 4cf831a651 | | |
| | 785d8a29d3 | | |
| | 46d2bad231 | | |
| | 32da8e11b4 | | |
| | 5dedb6f836 | | |
| | 2ea6d249d5 | | |
| | c86128e8ee | | |
| | 375f1cb8e8 | | |
| | 3ca7b6b078 | | |
| | effe901890 | | |
| | 4f451bd041 | | |
| | c76ef7b174 | | |
| | 743d82e935 | | |
| | 18546e9c6d | | |
| | f24ab16b91 | | |
| | 766b34683c | | |
| | b5bfa43e92 | | |
| | 99221f7d17 | | |
| | eb7913f1dd | | |
| | 08cad7a665 | | |
| | 9cd92facbd | | |
| | 85063f493c | | |
| | f65a20c700 | | |
| | e144724979 | | |
| | 96a297c747 | | |
| | 67e00b97c6 | | |
| | a94cabc692 | | |
| | ad9e66650d | | |
| | 6de494cfdb | | |
| | 58e34ba4ae | | |
| | 33feeffe5d | | |
| | d0db46aac5 | | |
| | da76396c75 | | |
| | bbf3fb6307 | | |
| | 4ab982bc16 | | |
| | 34301e09f5 | | |
| | 84e586e767 | | |
| | 72a2f5d2f4 | | |
| | 606cbab0d4 | | |
| | 54ec56c81d | | |
| | a322398c62 | | |
| | f22b3a25bd | | |
| | b67498766e | | |
| | c340ff3893 | | |
| | b0f59777d4 | | |
| | e14208f489 | | |
| | 7756265503 | | |
| | f841b775c3 | | |
| | 8c921544a0 | | |
| | fe54f1ad8e | | |
| | 74c2c8ae07 | | |
| | 87ec7aa10d | | |
| | 206c851146 | | |
| | 60bdf1ef8a | | |
| | d7662b3eb9 | | |
| | ecaa57c7c6 | | |
| | fce183c244 | | |
| | 7a92f8b3f9 | | |
| | 96af08e789 | | |
| | cb29c10660 | | |
| | 04c93043d6 | | |
| | 46037c7a11 | | |
| | c570108026 | | |
| | 230a0fadea | | |
| | 87cf05e0d2 | | |
| | ff608eef71 | | |
| | f868a63064 | | |
| | c74116aa24 | | |
| | 8c534d4d74 | | |
| | d821a11c7c | | |
| | 8a138eeb5a | | |
| | 137ea7bde6 | | |
| | e05b3981d9 | | |
| | a5a133ccce | | |
| | 0ac4f7b620 | | |
| | 467f1e71d7 | | |
| | a2996ed5d9 | | |
| | 7d7dd2b204 | | |
| | 9ee6f86c73 | | |
Makefile (40 lines changed)
@@ -10,9 +10,15 @@ CLANG_LIBS = -lclangFrontend -lclangDriver \
    -lclangSerialization -lclangParse -lclangSema \
    -lclangAnalysis -lclangAST -lclangLex -lclangBasic

LLVM_LIBS=$(shell llvm-config --ldflags --libs) -lpthread -ldl
ISPC_LIBS=$(CLANG_LIBS) \
    $(shell llvm-config --ldflags --libs) \
    -lpthread -ldl
ISPC_TEST_LIBS=$(shell llvm-config --ldflags --libs) \
    -lpthread -ldl

LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
LLVM_VERSION_DEF=-DLLVM_$(shell llvm-config --version | sed s/\\./_/)
LLVM_VERSION=$(shell llvm-config --version | sed s/\\./_/)
LLVM_VERSION_DEF=-DLLVM_$(LLVM_VERSION)

BUILD_DATE=$(shell date +%Y%m%d)
BUILD_VERSION=$(shell git log --abbrev-commit --abbrev=16 | head -1)
@@ -43,12 +49,14 @@ CXX_SRC=builtins.cpp ctx.cpp decl.cpp expr.cpp ispc.cpp \
    util.cpp
HEADERS=builtins.h ctx.h decl.h expr.h ispc.h llvmutil.h module.h \
    opt.h stmt.h sym.h type.h util.h
BUILTINS_SRC=builtins-avx.ll builtins-sse2.ll builtins-sse4.ll builtins-sse4x2.ll
BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll \
    builtins-sse4.ll builtins-sse4x2.ll
BISON_SRC=parse.yy
FLEX_SRC=lex.ll

OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_SRC:.ll=.o) builtins-c.o stdlib_ispc.o \
    $(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o))
OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_SRC:.ll=.o) \
    builtins-c-32.o builtins-c-64.o stdlib_ispc.o $(BISON_SRC:.yy=.o) \
    $(FLEX_SRC:.ll=.o))

default: ispc ispc_test

@@ -77,11 +85,11 @@ doxygen:

ispc: print_llvm_src dirs $(OBJS)
    @echo Creating ispc executable
    @$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(CLANG_LIBS) $(LLVM_LIBS)
    @$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)

ispc_test: dirs ispc_test.cpp
    @echo Creating ispc_test executable
    @$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $@ ispc_test.cpp $(LLVM_LIBS)
    @$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $@ ispc_test.cpp $(ISPC_TEST_LIBS)

objs/%.o: %.cpp
    @echo Compiling $<
@@ -103,19 +111,27 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
    @echo Compiling $<
    @$(CXX) $(CXXFLAGS) -o $@ -c $<

objs/builtins-%.cpp: builtins-%.ll builtins.m4 builtins-sse.ll
objs/builtins-%.cpp: builtins-%.ll builtins.m4 builtins-sse.ll builtins-avx-common.ll
    @echo Creating C++ source from builtin definitions file $<
    @m4 builtins.m4 $< | ./bitcode2cpp.py $< > $@
    @m4 -DLLVM_VERSION=$(LLVM_VERSION) builtins.m4 $< | ./bitcode2cpp.py $< > $@

objs/builtins-%.o: objs/builtins-%.cpp
    @echo Compiling $<
    @$(CXX) $(CXXFLAGS) -o $@ -c $<

objs/builtins-c.cpp: builtins-c.c
objs/builtins-c-32.cpp: builtins-c.c
    @echo Creating C++ source from builtins definition file $<
    @$(CLANG) -I /opt/l1om/usr/include/ -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py $< > $@
    @$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-32.c > $@

objs/builtins-c.o: objs/builtins-c.cpp
objs/builtins-c-32.o: objs/builtins-c-32.cpp
    @echo Compiling $<
    @$(CXX) $(CXXFLAGS) -o $@ -c $<

objs/builtins-c-64.cpp: builtins-c.c
    @echo Creating C++ source from builtins definition file $<
    @$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-64.c > $@

objs/builtins-c-64.o: objs/builtins-c-64.cpp
    @echo Compiling $<
    @$(CXX) $(CXXFLAGS) -o $@ -c $<

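The new LLVM_VERSION variable is just llvm-config's version string with the dot replaced by an underscore, so the same token can be passed both to the C++ compiler (as -DLLVM_2_9 and the like) and to m4 via -DLLVM_VERSION. A quick illustration of the mangling in Python (the Makefile itself uses sed; llvm-config on PATH is assumed here):

```python
import re
import subprocess

def llvm_version_token(version: str) -> str:
    # Matches the Makefile's `sed s/\\./_/`: only the first dot is replaced,
    # which is enough for two-component versions such as "2.9" -> "2_9".
    return re.sub(r"\.", "_", version, count=1)

version = subprocess.check_output(["llvm-config", "--version"]).decode().strip()
print("-DLLVM_" + llvm_version_token(version))  # e.g. -DLLVM_2_9
```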
@@ -15,8 +15,8 @@ code.

ispc is an open source compiler under the BSD license; see the file
LICENSE.txt. ispc supports Windows, Mac, and Linux, with both x86 and
x86-64 targets. It currently supports the SSE2 and SSE4 instruction sets,
though support for AVX should be available soon.
x86-64 targets. It currently supports the SSE2, SSE4, and AVX instruction
sets.

For more information and examples, as well as a wiki and the bug database,
see the ispc distribution site, http://ispc.github.com.

@@ -4,6 +4,8 @@ import sys
import string
import re
import subprocess
import platform
import os

length=0

@@ -14,8 +16,12 @@ target = re.sub("\.ll$", "", target)
target = re.sub("\.c$", "", target)
target = re.sub("-", "_", target)

llvm_as="llvm-as"
if platform.system() == 'Windows' or string.find(platform.system(), "CYGWIN_NT") != -1:
    llvm_as = os.getenv("LLVM_INSTALL_DIR").replace("\\", "/") + "/bin/" + llvm_as

try:
    as_out=subprocess.Popen([ "llvm-as", "-", "-o", "-"], stdout=subprocess.PIPE)
    as_out=subprocess.Popen([llvm_as, "-", "-o", "-"], stdout=subprocess.PIPE)
except IOError:
    print >> sys.stderr, "Couldn't open " + src
    sys.exit(1)

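The bitcode2cpp.py change resolves llvm-as relative to LLVM_INSTALL_DIR on Windows and Cygwin before piping the assembled output through it. A sketch of that lookup as a hypothetical find_llvm_as helper (the committed script inlines this logic and assumes LLVM_INSTALL_DIR is set; the explicit error is an addition here):

```python
import os
import platform

def find_llvm_as() -> str:
    # On Windows/Cygwin, resolve the tool under LLVM_INSTALL_DIR;
    # elsewhere rely on PATH lookup, as the script does.
    llvm_as = "llvm-as"
    system = platform.system()
    if system == "Windows" or "CYGWIN_NT" in system:
        install_dir = os.getenv("LLVM_INSTALL_DIR")
        if install_dir is None:
            # The script as committed would hit an AttributeError here
            # (.replace() on None) if the variable were unset.
            raise RuntimeError("LLVM_INSTALL_DIR must be set on Windows/Cygwin")
        llvm_as = install_dir.replace("\\", "/") + "/bin/" + llvm_as
    return llvm_as
```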
builtins-avx-common.ll (new file, 278 lines)
@@ -0,0 +1,278 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; *** Untested *** AVX target implementation.
|
||||
;;
|
||||
;; The LLVM AVX code generator is incomplete, so the ispc AVX target
|
||||
;; hasn't yet been tested. There is therefore a higher-than-normal
|
||||
;; chance that there are bugs in the code in this file.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; uniform float iv = extract(__rcp_u(v), 0);
|
||||
; return iv * (2. - v * iv);
|
||||
%vecval = insertelement <4 x float> undef, float %0, i32 0
|
||||
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
|
||||
%scall = extractelement <4 x float> %call, i32 0
|
||||
|
||||
; do one N-R iteration
|
||||
%v_iv = fmul float %0, %scall
|
||||
%two_minus = fsub float 2., %v_iv
|
||||
%iv_mul = fmul float %scall, %two_minus
|
||||
ret float %iv_mul
|
||||
}
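The two comment lines above are the whole algorithm: rcpss yields an estimate with roughly 12 bits of accuracy, and one Newton-Raphson step, iv * (2 - v * iv), roughly doubles the number of correct bits. A small numeric sketch of that update (the starting value is an illustrative stand-in for the hardware estimate):

```python
def refine_rcp(v: float, iv: float) -> float:
    # One Newton-Raphson iteration for 1/v, as in __rcp_uniform_float.
    return iv * (2.0 - v * iv)

v = 3.0
iv = 0.3333                                  # stand-in for the ~12-bit rcpss estimate
print(abs(1.0 / v - iv))                     # ~3.3e-5 error before refinement
print(abs(1.0 / v - refine_rcp(v, iv)))      # ~3e-9 error after one step
```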
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding floats
|
||||
|
||||
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
|
||||
|
||||
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||
; the roundss intrinsic is a total mess--docs say:
|
||||
;
|
||||
; __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
|
||||
;
|
||||
; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
|
||||
; on b0. The higher order 96 bits are copied directly from input parameter a. The
|
||||
; return value is described by the following equations:
|
||||
;
|
||||
; r0 = RND(b0)
|
||||
; r1 = a1
|
||||
; r2 = a2
|
||||
; r3 = a3
|
||||
;
|
||||
; It doesn't matter what we pass as a, since we only need the r0 value
|
||||
; here. So we pass the same register for both.
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
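The immediates used here and in the floor/ceil variants below follow the SSE4.1 rounding-control encoding: bits 1:0 select the mode (00 nearest, 01 down, 10 up) and bit 3 suppresses precision exceptions, which is where 8, 9, and 10 come from. In Python:

```python
SUPPRESS_PRECISION_EXC = 0b1000  # bit 3: do not signal inexact
ROUND_NEAREST = 0b00
ROUND_DOWN    = 0b01             # floor
ROUND_UP      = 0b10             # ceil

print(SUPPRESS_PRECISION_EXC | ROUND_NEAREST)  # 8  -> __round_*
print(SUPPRESS_PRECISION_EXC | ROUND_DOWN)     # 9  -> __floor_*
print(SUPPRESS_PRECISION_EXC | ROUND_UP)       # 10 -> __ceil_*
```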
|
||||
|
||||
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss instrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss instrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
|
||||
|
||||
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss instrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss instrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rsqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; uniform float is = extract(__rsqrt_u(v), 0);
|
||||
%v = insertelement <4 x float> undef, float %0, i32 0
|
||||
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
|
||||
%is = extractelement <4 x float> %vis, i32 0
|
||||
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
%v_is = fmul float %0, %is
|
||||
%v_is_is = fmul float %v_is, %is
|
||||
%three_sub = fsub float 3., %v_is_is
|
||||
%is_mul = fmul float %is, %three_sub
|
||||
%half_scale = fmul float 0.5, %is_mul
|
||||
ret float %half_scale
|
||||
}
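As with rcp, rsqrtss only provides an estimate, and 0.5 * is * (3 - (v * is) * is) is one Newton-Raphson step for 1/sqrt(v). A numeric illustration of how much that single step tightens the result (the estimate below is a stand-in for the hardware value):

```python
def refine_rsqrt(v: float, est: float) -> float:
    # One Newton-Raphson iteration for 1/sqrt(v), as in __rsqrt_uniform_float.
    return 0.5 * est * (3.0 - (v * est) * est)

v = 2.0
est = 0.7071                                      # stand-in for the rsqrtss estimate
print(abs(2.0 ** -0.5 - est))                     # ~7e-6 error before
print(abs(2.0 ** -0.5 - refine_rsqrt(v, est)))    # ~1e-10 error after one step
```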
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; sqrt
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
|
||||
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; fastmath
|
||||
|
||||
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
|
||||
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
|
||||
|
||||
define internal void @__fastmath() nounwind alwaysinline {
|
||||
%ptr = alloca i32
|
||||
%ptr8 = bitcast i32 * %ptr to i8 *
|
||||
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
|
||||
%oldval = load i32 *%ptr
|
||||
|
||||
; turn on DAZ (64)/FTZ (32768) -> 32832
|
||||
%update = or i32 %oldval, 32832
|
||||
store i32 %update, i32 *%ptr
|
||||
call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
|
||||
ret void
|
||||
}
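The constant 32832 in __fastmath is simply the two MXCSR bits that make the FPU treat denormals as zero, OR'd together: DAZ is bit 6 and FTZ is bit 15. A one-line check:

```python
DAZ = 1 << 6    # 64: denormals-are-zero (inputs)
FTZ = 1 << 15   # 32768: flush-to-zero (results)
assert DAZ | FTZ == 32832  # the value OR'd into MXCSR by __fastmath
```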
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float min/max
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
|
||||
define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
|
||||
|
||||
define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops
|
||||
|
||||
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
|
||||
|
||||
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
||||
%call = call i32 @llvm.ctpop.i32(i32 %0)
|
||||
ret i32 %call
|
||||
}
|
||||
|
||||
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
|
||||
|
||||
define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||
%call = call i64 @llvm.ctpop.i64(i64 %0)
|
||||
ret i64 %call
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
declare <2 x double> @llvm.x86.sse.sqrt.sd(<2 x double>) nounwind readnone
|
||||
|
||||
define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
||||
sse_unary_scalar(ret, 2, double, @llvm.x86.sse.sqrt.sd, %0)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision min/max
|
||||
|
||||
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
|
||||
define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
builtins-avx-x2.ll (new file, 665 lines)
@@ -0,0 +1,665 @@
|
||||
;; Copyright (c) 2010-2011, Intel Corporation
|
||||
;; All rights reserved.
|
||||
;;
|
||||
;; Redistribution and use in source and binary forms, with or without
|
||||
;; modification, are permitted provided that the following conditions are
|
||||
;; met:
|
||||
;;
|
||||
;; * Redistributions of source code must retain the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer.
|
||||
;;
|
||||
;; * Redistributions in binary form must reproduce the above copyright
|
||||
;; notice, this list of conditions and the following disclaimer in the
|
||||
;; documentation and/or other materials provided with the distribution.
|
||||
;;
|
||||
;; * Neither the name of Intel Corporation nor the names of its
|
||||
;; contributors may be used to endorse or promote products derived from
|
||||
;; this software without specific prior written permission.
|
||||
;;
|
||||
;;
|
||||
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; *** Untested *** AVX target implementation.
|
||||
;;
|
||||
;; The LLVM AVX code generator is incomplete, so the ispc AVX target
|
||||
;; hasn't yet been tested. There is therefore a higher-than-normal
|
||||
;; chance that there are bugs in the code in this file.
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; Basic 16-wide definitions
|
||||
|
||||
stdlib_core(16)
|
||||
packed_load_and_store(16)
|
||||
scans(16)
|
||||
int64minmax(16)
|
||||
|
||||
include(`builtins-avx-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define internal <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
; float iv = __rcp_v(v);
|
||||
; return iv * (2. - v * iv);
|
||||
|
||||
unary8to16(call, float, @llvm.x86.avx.rcp.ps.256, %0)
|
||||
; do one N-R iteration
|
||||
%v_iv = fmul <16 x float> %0, %call
|
||||
%two_minus = fsub <16 x float> <float 2., float 2., float 2., float 2.,
|
||||
float 2., float 2., float 2., float 2.,
|
||||
float 2., float 2., float 2., float 2.,
|
||||
float 2., float 2., float 2., float 2.>, %v_iv
|
||||
%iv_mul = fmul <16 x float> %call, %two_minus
|
||||
ret <16 x float> %iv_mul
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding floats
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
|
||||
|
||||
define internal <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||
round8to16(%0, 8)
|
||||
}
|
||||
|
||||
define internal <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
round8to16(%0, 9)
|
||||
}
|
||||
|
||||
define internal <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
round8to16(%0, 10)
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding doubles
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
|
||||
|
||||
define internal <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
||||
round4to16double(%0, 8)
|
||||
}
|
||||
|
||||
define internal <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
||||
round4to16double(%0, 9)
|
||||
}
|
||||
|
||||
define internal <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline {
|
||||
round4to16double(%0, 10)
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rsqrt
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define internal <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
|
||||
; float is = __rsqrt_v(v);
|
||||
unary8to16(is, float, @llvm.x86.avx.rsqrt.ps.256, %v)
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
%v_is = fmul <16 x float> %v, %is
|
||||
%v_is_is = fmul <16 x float> %v_is, %is
|
||||
%three_sub = fsub <16 x float> <float 3., float 3., float 3., float 3.,
|
||||
float 3., float 3., float 3., float 3.,
|
||||
float 3., float 3., float 3., float 3.,
|
||||
float 3., float 3., float 3., float 3.>, %v_is_is
|
||||
%is_mul = fmul <16 x float> %is, %three_sub
|
||||
%half_scale = fmul <16 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
|
||||
float 0.5, float 0.5, float 0.5, float 0.5,
|
||||
float 0.5, float 0.5, float 0.5, float 0.5,
|
||||
float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
|
||||
ret <16 x float> %half_scale
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; sqrt
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define internal <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
unary8to16(call, float, @llvm.x86.avx.sqrt.ps.256, %0)
|
||||
ret <16 x float> %call
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; svml
|
||||
|
||||
; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
|
||||
; or, use the macro to call the 4-wide ones 4x with our 16-wide
|
||||
; vectors...
|
||||
|
||||
declare <16 x float> @__svml_sin(<16 x float>)
|
||||
declare <16 x float> @__svml_cos(<16 x float>)
|
||||
declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *)
|
||||
declare <16 x float> @__svml_tan(<16 x float>)
|
||||
declare <16 x float> @__svml_atan(<16 x float>)
|
||||
declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>)
|
||||
declare <16 x float> @__svml_exp(<16 x float>)
|
||||
declare <16 x float> @__svml_log(<16 x float>)
|
||||
declare <16 x float> @__svml_pow(<16 x float>, <16 x float>)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; float min/max
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||
|
||||
define internal <16 x float> @__max_varying_float(<16 x float>,
|
||||
<16 x float>) nounwind readonly alwaysinline {
|
||||
binary8to16(call, float, @llvm.x86.avx.max.ps.256, %0, %1)
|
||||
ret <16 x float> %call
|
||||
}
|
||||
|
||||
define internal <16 x float> @__min_varying_float(<16 x float>,
|
||||
<16 x float>) nounwind readonly alwaysinline {
|
||||
binary8to16(call, float, @llvm.x86.avx.min.ps.256, %0, %1)
|
||||
ret <16 x float> %call
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
define internal <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
define internal <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
define internal <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
define internal <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret <16 x i32> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops
|
||||
|
||||
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define internal i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
|
||||
%floatmask = bitcast <16 x i32> %0 to <16 x float>
|
||||
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
|
||||
%mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
|
||||
|
||||
%v1shift = shl i32 %v1, 8
|
||||
%v = or i32 %v1shift, %v0
|
||||
ret i32 %v
|
||||
}
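Since AVX's movmskps only covers 8 lanes, the 16-wide mask is assembled from two 8-bit results, with the upper half shifted into bits 8 through 15. The packing step, numerically:

```python
def combine_movmsk(lo8: int, hi8: int) -> int:
    # Mirrors __movmsk: high lanes land in bits 8..15, low lanes in bits 0..7.
    return (hi8 << 8) | lo8

# e.g. lanes 0, 3 and 9 active:
lo = 0b00001001
hi = 0b00000010
print(bin(combine_movmsk(lo, hi)))  # 0b1000001001
```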
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; horizontal float ops
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||
|
||||
define internal float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
%va = shufflevector <16 x float> %0, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%vb = shufflevector <16 x float> %0, <16 x float> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%v1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %va, <8 x float> %vb)
|
||||
%v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1)
|
||||
%v3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v2, <8 x float> %v2)
|
||||
%scalar1 = extractelement <8 x float> %v3, i32 0
|
||||
%scalar2 = extractelement <8 x float> %v3, i32 4
|
||||
%sum = fadd float %scalar1, %scalar2
|
||||
ret float %sum
|
||||
}
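vhaddps adds adjacent pairs separately within each 128-bit half, which is why three hadds still leave two partial sums, in elements 0 and 4, that must be added at the end. A small simulation of the reduction, assuming the per-128-bit-lane pairwise-add semantics of vhaddps:

```python
def hadd_ps_256(a, b):
    # Per-128-bit-lane pairwise add, modeling llvm.x86.avx.hadd.ps.256.
    return [a[0] + a[1], a[2] + a[3], b[0] + b[1], b[2] + b[3],
            a[4] + a[5], a[6] + a[7], b[4] + b[5], b[6] + b[7]]

x = list(range(16))            # the 16-wide input
va, vb = x[:8], x[8:]
v1 = hadd_ps_256(va, vb)
v2 = hadd_ps_256(v1, v1)
v3 = hadd_ps_256(v2, v2)
print(v3[0] + v3[4], sum(x))   # both 120
```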
|
||||
|
||||
|
||||
define internal float @__reduce_min_float(<16 x float>) nounwind readnone alwaysinline {
|
||||
reduce16(float, @__min_varying_float, @__min_uniform_float)
|
||||
}
|
||||
|
||||
|
||||
define internal float @__reduce_max_float(<16 x float>) nounwind readnone alwaysinline {
|
||||
reduce16(float, @__max_varying_float, @__max_uniform_float)
|
||||
}
|
||||
|
||||
reduce_equal(16)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; horizontal int32 ops
|
||||
|
||||
define internal <16 x i32> @__add_varying_int32(<16 x i32>,
|
||||
<16 x i32>) nounwind readnone alwaysinline {
|
||||
%s = add <16 x i32> %0, %1
|
||||
ret <16 x i32> %s
|
||||
}
|
||||
|
||||
define internal i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
|
||||
%s = add i32 %0, %1
|
||||
ret i32 %s
|
||||
}
|
||||
|
||||
define internal i32 @__reduce_add_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
reduce16(i32, @__add_varying_int32, @__add_uniform_int32)
|
||||
}
|
||||
|
||||
|
||||
define internal i32 @__reduce_min_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
reduce16(i32, @__min_varying_int32, @__min_uniform_int32)
|
||||
}
|
||||
|
||||
|
||||
define internal i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
reduce16(i32, @__max_varying_int32, @__max_uniform_int32)
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;; horizontal uint32 ops
|
||||
|
||||
define internal i32 @__reduce_add_uint32(<16 x i32> %v) nounwind readnone alwaysinline {
|
||||
%r = call i32 @__reduce_add_int32(<16 x i32> %v)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define internal i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32)
|
||||
}
|
||||
|
||||
|
||||
define internal i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinline {
|
||||
reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32)
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; horizontal double ops
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||
|
||||
define internal double @__reduce_add_double(<16 x double>) nounwind readonly alwaysinline {
|
||||
%va = shufflevector <16 x double> %0, <16 x double> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%vb = shufflevector <16 x double> %0, <16 x double> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%vc = shufflevector <16 x double> %0, <16 x double> undef,
|
||||
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
|
||||
%vd = shufflevector <16 x double> %0, <16 x double> undef,
|
||||
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
|
||||
%vab = fadd <4 x double> %va, %vb
|
||||
%vcd = fadd <4 x double> %vc, %vd
|
||||
|
||||
%sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %vab, <4 x double> %vcd)
|
||||
%sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
|
||||
%final0 = extractelement <4 x double> %sum1, i32 0
|
||||
%final1 = extractelement <4 x double> %sum1, i32 2
|
||||
%sum = fadd double %final0, %final1
|
||||
ret double %sum
|
||||
}
|
||||
|
||||
define internal double @__reduce_min_double(<16 x double>) nounwind readnone alwaysinline {
|
||||
reduce16(double, @__min_varying_double, @__min_uniform_double)
|
||||
}
|
||||
|
||||
|
||||
define internal double @__reduce_max_double(<16 x double>) nounwind readnone alwaysinline {
|
||||
reduce16(double, @__max_varying_double, @__max_uniform_double)
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; horizontal int64 ops
|
||||
|
||||
define internal <16 x i64> @__add_varying_int64(<16 x i64>,
|
||||
<16 x i64>) nounwind readnone alwaysinline {
|
||||
%s = add <16 x i64> %0, %1
|
||||
ret <16 x i64> %s
|
||||
}
|
||||
|
||||
define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
|
||||
%s = add i64 %0, %1
|
||||
ret i64 %s
|
||||
}
|
||||
|
||||
define internal i64 @__reduce_add_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
reduce16(i64, @__add_varying_int64, @__add_uniform_int64)
|
||||
}
|
||||
|
||||
|
||||
define internal i64 @__reduce_min_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
reduce16(i64, @__min_varying_int64, @__min_uniform_int64)
|
||||
}
|
||||
|
||||
|
||||
define internal i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
reduce16(i64, @__max_varying_int64, @__max_uniform_int64)
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;; horizontal uint64 ops
|
||||
|
||||
define internal i64 @__reduce_add_uint64(<16 x i64> %v) nounwind readnone alwaysinline {
|
||||
%r = call i64 @__reduce_add_int64(<16 x i64> %v)
|
||||
ret i64 %r
|
||||
}
|
||||
|
||||
define internal i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
|
||||
}
|
||||
|
||||
|
||||
define internal i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
|
||||
reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
load_and_broadcast(16, i8, 8)
|
||||
load_and_broadcast(16, i16, 16)
|
||||
load_and_broadcast(16, i32, 32)
|
||||
load_and_broadcast(16, i64, 64)
|
||||
|
||||
; no masked load instruction for i8 and i16 types??
|
||||
load_masked(16, i8, 8, 1)
|
||||
load_masked(16, i16, 16, 2)
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
|
||||
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
|
||||
|
||||
define <16 x i32> @__load_masked_32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
|
||||
%floatmask = bitcast <16 x i32> %mask to <16 x float>
|
||||
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%val0 = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %mask0)
|
||||
%mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%ptr1 = getelementptr i8 * %0, i32 32 ;; 8x4 bytes = 32
|
||||
%val1 = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %ptr1, <8 x float> %mask1)
|
||||
|
||||
%retval = shufflevector <8 x float> %val0, <8 x float> %val1,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%reti32 = bitcast <16 x float> %retval to <16 x i32>
|
||||
ret <16 x i32> %reti32
|
||||
}
|
||||
|
||||
|
||||
define <16 x i64> @__load_masked_64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
|
||||
; double up masks, bitcast to doubles
|
||||
%mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
|
||||
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
|
||||
%mask1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
|
||||
<8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
|
||||
%mask2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
|
||||
<8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
|
||||
%mask3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
|
||||
<8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
|
||||
%mask0d = bitcast <8 x i32> %mask0 to <4 x double>
|
||||
%mask1d = bitcast <8 x i32> %mask1 to <4 x double>
|
||||
%mask2d = bitcast <8 x i32> %mask2 to <4 x double>
|
||||
%mask3d = bitcast <8 x i32> %mask3 to <4 x double>
|
||||
|
||||
%val0d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %mask0d)
|
||||
%ptr1 = getelementptr i8 * %0, i32 32
|
||||
%val1d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %ptr1, <4 x double> %mask1d)
|
||||
%ptr2 = getelementptr i8 * %0, i32 64
|
||||
%val2d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %ptr2, <4 x double> %mask2d)
|
||||
%ptr3 = getelementptr i8 * %0, i32 96
|
||||
%val3d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %ptr3, <4 x double> %mask3d)
|
||||
|
||||
%val01 = shufflevector <4 x double> %val0d, <4 x double> %val1d,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%val23 = shufflevector <4 x double> %val2d, <4 x double> %val3d,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%val0123 = shufflevector <8 x double> %val01, <8 x double> %val23,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%val = bitcast <16 x double> %val0123 to <16 x i64>
|
||||
ret <16 x i64> %val
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
; FIXME: there is no AVX instruction for these, but we could be clever
|
||||
; by packing the bits down and setting the last 3/4 or half, respectively,
|
||||
; of the mask to zero... Not sure if this would be a win in the end
|
||||
gen_masked_store(16, i8, 8)
|
||||
gen_masked_store(16, i16, 16)
|
||||
|
||||
; note that mask is the 2nd parameter, not the 3rd one!!
|
||||
declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
|
||||
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
|
||||
|
||||
define void @__masked_store_32(<16 x i32>* nocapture, <16 x i32>,
|
||||
<16 x i32>) nounwind alwaysinline {
|
||||
%ptr = bitcast <16 x i32> * %0 to i8 *
|
||||
%val = bitcast <16 x i32> %1 to <16 x float>
|
||||
%mask = bitcast <16 x i32> %2 to <16 x float>
|
||||
|
||||
%val0 = shufflevector <16 x float> %val, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%val1 = shufflevector <16 x float> %val, <16 x float> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
|
||||
%mask0 = shufflevector <16 x float> %mask, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%mask1 = shufflevector <16 x float> %mask, <16 x float> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
|
||||
call void @llvm.x86.avx.maskstore.ps.256(i8 * %ptr, <8 x float> %mask0, <8 x float> %val0)
|
||||
%ptr1 = getelementptr i8 * %ptr, i32 32
|
||||
call void @llvm.x86.avx.maskstore.ps.256(i8 * %ptr1, <8 x float> %mask1, <8 x float> %val1)
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>,
|
||||
<16 x i32> %mask) nounwind alwaysinline {
|
||||
%ptr = bitcast <16 x i64> * %0 to i8 *
|
||||
%val = bitcast <16 x i64> %1 to <16 x double>
|
||||
|
||||
; double up masks, bitcast to doubles
|
||||
%mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
|
||||
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
|
||||
%mask1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
|
||||
<8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
|
||||
%mask2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
|
||||
<8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
|
||||
%mask3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
|
||||
<8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
|
||||
%mask0d = bitcast <8 x i32> %mask0 to <4 x double>
|
||||
%mask1d = bitcast <8 x i32> %mask1 to <4 x double>
|
||||
%mask2d = bitcast <8 x i32> %mask2 to <4 x double>
|
||||
%mask3d = bitcast <8 x i32> %mask3 to <4 x double>
|
||||
|
||||
%val0 = shufflevector <16 x double> %val, <16 x double> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%val1 = shufflevector <16 x double> %val, <16 x double> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%val2 = shufflevector <16 x double> %val, <16 x double> undef,
|
||||
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
|
||||
%val3 = shufflevector <16 x double> %val, <16 x double> undef,
|
||||
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
|
||||
|
||||
call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask0d, <4 x double> %val0)
|
||||
%ptr1 = getelementptr i8 * %ptr, i32 32
|
||||
call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr1, <4 x double> %mask1d, <4 x double> %val1)
|
||||
%ptr2 = getelementptr i8 * %ptr, i32 64
|
||||
call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr2, <4 x double> %mask2d, <4 x double> %val2)
|
||||
%ptr3 = getelementptr i8 * %ptr, i32 96
|
||||
call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr3, <4 x double> %mask3d, <4 x double> %val3)
|
||||
|
||||
ret void
|
||||
}
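For the 64-bit elements, each 32-bit mask lane has to cover a full double, so the shuffles duplicate every mask element before the bitcast to <4 x double>. A sketch of that index pattern (the group argument is a hypothetical name for which quarter of the 16-wide mask is being expanded):

```python
def double_up(mask16, group: int):
    # Mirrors the shufflevector indices <g*4+0, g*4+0, g*4+1, g*4+1, ...>:
    # each i32 mask lane is repeated so two i32s line up with one double.
    base = group * 4
    return [mask16[base + i // 2] for i in range(8)]

mask = list(range(16))
print(double_up(mask, 0))  # [0, 0, 1, 1, 2, 2, 3, 3]
print(double_up(mask, 3))  # [12, 12, 13, 13, 14, 14, 15, 15]
```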
|
||||
|
||||
|
||||
masked_store_blend_8_16_by_16()
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
|
||||
<8 x float>) nounwind readnone
|
||||
|
||||
define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>,
|
||||
<16 x i32>) nounwind alwaysinline {
|
||||
%maskAsFloat = bitcast <16 x i32> %2 to <16 x float>
|
||||
%oldValue = load <16 x i32>* %0, align 4
|
||||
%oldAsFloat = bitcast <16 x i32> %oldValue to <16 x float>
|
||||
%newAsFloat = bitcast <16 x i32> %1 to <16 x float>
|
||||
|
||||
%old0 = shufflevector <16 x float> %oldAsFloat, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%old1 = shufflevector <16 x float> %oldAsFloat, <16 x float> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%new0 = shufflevector <16 x float> %newAsFloat, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%new1 = shufflevector <16 x float> %newAsFloat, <16 x float> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%mask0 = shufflevector <16 x float> %maskAsFloat, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%mask1 = shufflevector <16 x float> %maskAsFloat, <16 x float> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
|
||||
%blend0 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old0,
|
||||
<8 x float> %new0,
|
||||
<8 x float> %mask0)
|
||||
%blend1 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old1,
|
||||
<8 x float> %new1,
|
||||
<8 x float> %mask1)
|
||||
%blend = shufflevector <8 x float> %blend0, <8 x float> %blend1,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%blendAsInt = bitcast <16 x float> %blend to <16 x i32>
|
||||
store <16 x i32> %blendAsInt, <16 x i32>* %0, align 4
|
||||
ret void
|
||||
}
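blendvps picks each result element from the "new" vector when the corresponding mask element's sign bit is set and from the "old" vector otherwise; with ispc's all-ones/all-zeros per-lane masks that is exactly a per-lane select between the stored value and the freshly computed one. A scalar sketch of that select:

```python
def blendv(old, new, mask):
    # Per-lane select on the mask sign bit, as vblendvps does for
    # __masked_store_blend_32 (lanes are assumed all-ones or all-zeros).
    return [n if m & 0x80000000 else o for o, n, m in zip(old, new, mask)]

old  = [10, 20, 30, 40]
new  = [1, 2, 3, 4]
mask = [0xFFFFFFFF, 0, 0xFFFFFFFF, 0]
print(blendv(old, new, mask))  # [1, 20, 3, 40]
```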
|
||||
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
|
||||
<4 x double>) nounwind readnone
|
||||
|
||||
define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64,
|
||||
<16 x i32> %mask) nounwind alwaysinline {
|
||||
%oldValue = load <16 x i64>* %ptr, align 8
|
||||
%old = bitcast <16 x i64> %oldValue to <16 x double>
|
||||
%old0d = shufflevector <16 x double> %old, <16 x double> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%old1d = shufflevector <16 x double> %old, <16 x double> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%old2d = shufflevector <16 x double> %old, <16 x double> undef,
|
||||
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
|
||||
%old3d = shufflevector <16 x double> %old, <16 x double> undef,
|
||||
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
|
||||
|
||||
%new = bitcast <16 x i64> %newi64 to <16 x double>
|
||||
%new0d = shufflevector <16 x double> %new, <16 x double> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%new1d = shufflevector <16 x double> %new, <16 x double> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%new2d = shufflevector <16 x double> %new, <16 x double> undef,
|
||||
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
|
||||
%new3d = shufflevector <16 x double> %new, <16 x double> undef,
|
||||
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
|
||||
|
||||
%mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
|
||||
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
|
||||
%mask1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
|
||||
<8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
|
||||
%mask2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
|
||||
<8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
|
||||
%mask3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
|
||||
<8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
|
||||
%mask0d = bitcast <8 x i32> %mask0 to <4 x double>
|
||||
%mask1d = bitcast <8 x i32> %mask1 to <4 x double>
|
||||
%mask2d = bitcast <8 x i32> %mask2 to <4 x double>
|
||||
%mask3d = bitcast <8 x i32> %mask3 to <4 x double>
|
||||
|
||||
%result0d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old0d,
|
||||
<4 x double> %new0d, <4 x double> %mask0d)
|
||||
%result1d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old1d,
|
||||
<4 x double> %new1d, <4 x double> %mask1d)
|
||||
%result2d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old2d,
|
||||
<4 x double> %new2d, <4 x double> %mask2d)
|
||||
%result3d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old3d,
|
||||
<4 x double> %new3d, <4 x double> %mask3d)
|
||||
|
||||
%result01 = shufflevector <4 x double> %result0d, <4 x double> %result1d,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%result23 = shufflevector <4 x double> %result2d, <4 x double> %result3d,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
|
||||
%result = shufflevector <8 x double> %result01, <8 x double> %result23,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%result64 = bitcast <16 x double> %result to <16 x i64>
|
||||
store <16 x i64> %result64, <16 x i64> * %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
gen_gather(16, i8)
|
||||
gen_gather(16, i16)
|
||||
gen_gather(16, i32)
|
||||
gen_gather(16, i64)
|
||||
|
||||
gen_scatter(16, i8)
|
||||
gen_scatter(16, i16)
|
||||
gen_scatter(16, i32)
|
||||
gen_scatter(16, i64)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision sqrt
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
|
||||
|
||||
define internal <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alwaysinline {
|
||||
unary4to16(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
|
||||
ret <16 x double> %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision min/max
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||
|
||||
define internal <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
|
||||
binary4to16(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
|
||||
ret <16 x double> %ret
|
||||
}
|
||||
|
||||
define internal <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
|
||||
binary4to16(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
|
||||
ret <16 x double> %ret
|
||||
}
|
||||
builtins-avx.ll (248 lines changed)
@@ -41,13 +41,15 @@
|
||||
|
||||
stdlib_core(8)
|
||||
packed_load_and_store(8)
|
||||
scans(8)
|
||||
int64minmax(8)
|
||||
|
||||
include(`builtins-avx-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
|
||||
|
||||
define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; float iv = __rcp_v(v);
|
||||
@@ -62,25 +64,10 @@ define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly
|
||||
ret <8 x float> %iv_mul
|
||||
}
|
||||
|
||||
define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; uniform float iv = extract(__rcp_u(v), 0);
|
||||
; return iv * (2. - v * iv);
|
||||
%vecval = insertelement <4 x float> undef, float %0, i32 0
|
||||
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
|
||||
%scall = extractelement <4 x float> %call, i32 0
|
||||
|
||||
; do one N-R iteration
|
||||
%v_iv = fmul float %0, %scall
|
||||
%two_minus = fsub float 2., %v_iv
|
||||
%iv_mul = fmul float %scall, %two_minus
|
||||
ret float %iv_mul
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding floats
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
|
||||
|
||||
define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||
@@ -88,111 +75,43 @@ define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonl
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||
; the roundss intrinsic is a total mess--docs say:
|
||||
;
|
||||
; __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
|
||||
;
|
||||
; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
|
||||
; on b0. The higher order 96 bits are copied directly from input parameter a. The
|
||||
; return value is described by the following equations:
|
||||
;
|
||||
; r0 = RND(b0)
|
||||
; r1 = a1
|
||||
; r2 = a2
|
||||
; r3 = a3
|
||||
;
|
||||
; It doesn't matter what we pass as a, since we only need the r0 value
|
||||
; here. So we pass the same register for both.
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
%xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 9)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss instrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
%xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
%call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 10)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss instrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
%xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles

declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone

define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
round4to8double(%0, 8)
}

define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
%xi = insertelement <2 x double> undef, double %0, i32 0
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
%rs = extractelement <2 x double> %xr, i32 0
ret double %rs
}

define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
round4to8double(%0, 9)
}

define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
; see above for round_ss intrinsic discussion...
%xi = insertelement <2 x double> undef, double %0, i32 0
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
%rs = extractelement <2 x double> %xr, i32 0
ret double %rs
}

define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
round4to8double(%0, 10)
}

define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
; see above for round_ss intrinsic discussion...
%xi = insertelement <2 x double> undef, double %0, i32 0
; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
%rs = extractelement <2 x double> %xr, i32 0
ret double %rs
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt

declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone

define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
; float is = __rsqrt_v(v);
@@ -200,64 +119,24 @@ define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind read
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul <8 x float> %v, %is
%v_is_is = fmul <8 x float> %v_is, %is
%three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3., float 3., float 3., float 3., float 3.>, %v_is_is
%three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3.,
float 3., float 3., float 3., float 3.>, %v_is_is
%is_mul = fmul <8 x float> %is, %three_sub
%half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
%half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
ret <8 x float> %half_scale
}

define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
; uniform float is = extract(__rsqrt_u(v), 0);
%v = insertelement <4 x float> undef, float %0, i32 0
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
%is = extractelement <4 x float> %vis, i32 0

; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul float %0, %is
%v_is_is = fmul float %v_is, %is
%three_sub = fsub float 3., %v_is_is
%is_mul = fmul float %is, %three_sub
%half_scale = fmul float 0.5, %is_mul
ret float %half_scale
}
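Both functions above apply one Newton-Raphson refinement step to the hardware rsqrt estimate: is' = 0.5 * is * (3 - v * is * is). A hedged C sketch of the same scheme (illustration only, not part of the diff):

#include <immintrin.h>

static inline float rsqrt_refined(float v) {
    /* hardware estimate (~12 bits of precision) */
    float is = _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(v)));
    /* one Newton-Raphson step, same formula as the IR above */
    return 0.5f * is * (3.0f - v * is * is);
}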

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; sqrt

declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone

define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
%call = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %0)
ret <8 x float> %call
}

define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
ret float %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; fastmath

declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind

define internal void @__fastmath() nounwind alwaysinline {
%ptr = alloca i32
%ptr8 = bitcast i32 * %ptr to i8 *
call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
%oldval = load i32 *%ptr

; turn on DAZ (64)/FTZ (32768) -> 32832
%update = or i32 %oldval, 32832
store i32 %update, i32 *%ptr
call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
ret void
}
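__fastmath() above ORs the DAZ (bit 6, value 64) and FTZ (bit 15, value 32768) flags into MXCSR, hence 64 + 32768 = 32832. A minimal C equivalent using the standard MXCSR intrinsics (illustrative sketch only):

#include <immintrin.h>

static inline void fastmath(void) {
    unsigned int csr = _mm_getcsr();   /* stmxcsr */
    csr |= 0x8040;                     /* FTZ (0x8000) | DAZ (0x0040) = 32832 */
    _mm_setcsr(csr);                   /* ldmxcsr */
}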
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; svml
|
||||
|
||||
@@ -279,9 +158,7 @@ declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
|
||||
;; float min/max
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
|
||||
|
||||
define internal <8 x float> @__max_varying_float(<8 x float>,
|
||||
<8 x float>) nounwind readonly alwaysinline {
|
||||
@@ -289,97 +166,43 @@ define internal <8 x float> @__max_varying_float(<8 x float>,
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define internal <8 x float> @__min_varying_float(<8 x float>,
|
||||
<8 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %0, <8 x float> %1)
|
||||
ret <8 x float> %call
|
||||
}
|
||||
|
||||
define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int min/max
|
||||
|
||||
declare <8 x i32> @llvm.x86.avx.min.sd.256(<8 x i32>, <8 x i32>) nounwind readnone
|
||||
declare <8 x i32> @llvm.x86.avx.max.sd.256(<8 x i32>, <8 x i32>) nounwind readnone
|
||||
|
||||
define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x i32> @llvm.x86.avx.min.sd.256(<8 x i32> %0, <8 x i32> %1)
|
||||
ret <8 x i32> %call
|
||||
}
|
||||
|
||||
define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.min.sd.256, %0, %1)
|
||||
ret i32 %ret
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x i32> @llvm.x86.avx.max.sd.256(<8 x i32> %0, <8 x i32> %1)
|
||||
ret <8 x i32> %call
|
||||
}
|
||||
|
||||
define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.max.sd.256, %0, %1)
|
||||
ret i32 %ret
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unsigned int min/max
|
||||
|
||||
; FIXME: looks like these aren't available in LLVM?
|
||||
declare <8 x i32> @llvm.x86.avx.min.ud.256(<8 x i32>, <8 x i32>) nounwind readnone
|
||||
declare <8 x i32> @llvm.x86.avx.max.ud.256(<8 x i32>, <8 x i32>) nounwind readnone
|
||||
|
||||
define internal <8 x i32> @__min_varying_uint32(<8 x i32>,
|
||||
<8 x i32>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x i32> @llvm.x86.avx.min.ud.256(<8 x i32> %0, <8 x i32> %1)
|
||||
ret <8 x i32> %call
|
||||
define internal <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.min.ud.256, %0, %1)
|
||||
ret i32 %ret
|
||||
define internal <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
|
||||
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
|
||||
ret <8 x i32> %ret
|
||||
}
|
||||
|
||||
define internal <8 x i32> @__max_varying_uint32(<8 x i32>,
|
||||
<8 x i32>) nounwind readonly alwaysinline {
|
||||
%call = call <8 x i32> @llvm.x86.avx.max.ud.256(<8 x i32> %0, <8 x i32> %1)
|
||||
ret <8 x i32> %call
|
||||
}
|
||||
|
||||
define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
|
||||
sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.max.ud.256, %0, %1)
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; horizontal ops
|
||||
|
||||
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
|
||||
|
||||
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
|
||||
%call = call i32 @llvm.ctpop.i32(i32 %0)
|
||||
ret i32 %call
|
||||
}
|
||||
|
||||
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
|
||||
|
||||
define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||
%call = call i64 @llvm.ctpop.i64(i64 %0)
|
||||
ret i64 %call
|
||||
}
|
||||
|
||||
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
|
||||
|
||||
define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
|
||||
@@ -412,6 +235,7 @@ define internal float @__reduce_max_float(<8 x float>) nounwind readnone alwaysi
|
||||
reduce8(float, @__max_varying_float, @__max_uniform_float)
|
||||
}
|
||||
|
||||
reduce_equal(8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; horizontal int32 ops
|
||||
@@ -472,9 +296,10 @@ define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwa
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
%sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
%scalar1 = extractelement <4 x double> %sum0, i32 0
%scalar2 = extractelement <4 x double> %sum1, i32 1
%sum = fadd double %scalar1, %scalar2
%final0 = extractelement <4 x double> %sum1, i32 0
%final1 = extractelement <4 x double> %sum1, i32 2
%sum = fadd double %final0, %final1

ret double %sum
}
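The fix in this hunk reflects how vhaddpd works within 128-bit lanes: after two horizontal adds of the 4-wide partial sums, the two remaining partial sums sit in elements 0 and 2, not 0 and 1. A small C illustration of that lane behaviour (a sketch assuming AVX, not part of the diff):

#include <immintrin.h>

/* Reduce-add of 8 doubles, mirroring the fixed IR: two vhaddpd steps,
   then add elements 0 and 2 of the result. */
static inline double reduce_add8(__m256d v0, __m256d v1) {
    __m256d sum0 = _mm256_hadd_pd(v0, v1);      /* [v00+v01, v10+v11, v02+v03, v12+v13] */
    __m256d sum1 = _mm256_hadd_pd(sum0, sum0);  /* [s0+s1, s0+s1, s2+s3, s2+s3] */
    double lane[4];
    _mm256_storeu_pd(lane, sum1);
    return lane[0] + lane[2];
}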
|
||||
|
||||
@@ -624,13 +449,14 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
|
||||
masked_store_blend_8_16_by_8()
|
||||
|
||||
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
|
||||
<8 x float>) nounwind readnone
|
||||
|
||||
|
||||
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
<8 x i32>) nounwind alwaysinline {
|
||||
%mask_as_float = bitcast <8 x i32> %2 to <8 x float>
|
||||
%oldValue = load <8 x i32>* %0, align 4
|
||||
@@ -645,7 +471,7 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
|
||||
}
|
||||
|
||||
|
||||
define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
<8 x i32> %i32mask) nounwind alwaysinline {
|
||||
%oldValue = load <8 x i64>* %ptr, align 8
|
||||
%mask = bitcast <8 x i32> %i32mask to <8 x float>
|
||||
@@ -695,6 +521,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; gather/scatter
|
||||
|
||||
@@ -712,43 +539,26 @@ gen_scatter(8, i64)
|
||||
;; double precision sqrt
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse.sqrt.sd(<2 x double>) nounwind readnone
|
||||
|
||||
define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
|
||||
unary4to8(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
|
||||
sse_unary_scalar(ret, 2, double, @llvm.x86.sse.sqrt.sd, %0)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; double precision min/max
|
||||
|
||||
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse.max.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
|
||||
declare <2 x double> @llvm.x86.sse.min.sd(<2 x double>, <2 x double>) nounwind readnone
|
||||
|
||||
define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
binary4to8(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse.min.sd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
|
||||
binary4to8(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse.max.sd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
29 builtins-c.c
@@ -51,6 +51,10 @@
*/

#ifndef _MSC_VER
#include <unistd.h>
#endif // !_MSC_VER

#include <stdint.h>
#include <stdio.h>
#include <stdarg.h>
@@ -139,3 +143,28 @@ void __do_print(const char *format, const char *types, int width, int mask,
}
fflush(stdout);
}

int __num_cores() {
#ifdef _MSC_VER
// This is quite a hack. Including all of windows.h to get this definition
// pulls in a bunch of stuff that leads to undefined symbols at link time.
// So we don't #include <windows.h> but instead have the equivalent declarations
// here. Presumably this struct declaration won't be changing in the future
// anyway...
struct SYSTEM_INFO {
int pad0[2];
void *pad1[2];
int *pad2;
int dwNumberOfProcessors;
int pad3[3];
};

struct SYSTEM_INFO sysInfo;
extern void __stdcall GetSystemInfo(struct SYSTEM_INFO *);
GetSystemInfo(&sysInfo);
return sysInfo.dwNumberOfProcessors;
#else
return sysconf(_SC_NPROCESSORS_ONLN);
#endif // !_MSC_VER
}
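For comparison, the same query written against the public headers, which the diff deliberately avoids to keep windows.h out of the build. A hedged sketch only; num_cores here is a hypothetical standalone helper, not the function from the diff:

#ifdef _WIN32
#include <windows.h>
static int num_cores(void) {
    SYSTEM_INFO si;
    GetSystemInfo(&si);
    return (int)si.dwNumberOfProcessors;   /* a DWORD in the real struct */
}
#else
#include <unistd.h>
static int num_cores(void) {
    return (int)sysconf(_SC_NPROCESSORS_ONLN);
}
#endif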
|
||||
|
||||
@@ -376,6 +376,7 @@ define internal i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}
|
||||
|
||||
reduce_equal(4)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
@@ -35,6 +35,7 @@
|
||||
; Define some basics for a 4-wide target
|
||||
stdlib_core(4)
|
||||
packed_load_and_store(4)
|
||||
scans(4)
|
||||
|
||||
; Include the various definitions of things that only require SSE1 and SSE2
|
||||
include(`builtins-sse.ll')
|
||||
@@ -276,41 +277,17 @@ define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinli
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions

; FIXME: this is very inefficient, loops over all 32 bits...

; we could use the LLVM intrinsic declare i32 @llvm.ctpop.i32(i32),
; although that currently ends up generating a POPCNT instruction even
; if we give --target=sse2 on the command line. We probably need to
; pipe through the 'sse2' request to LLVM via the 'features' string
; at codegen time... (If e.g. --cpu=penryn is also passed along, then
; it does generate non-POPCNT code and in particular better code than
; the below does.)
declare i32 @llvm.ctpop.i32(i32)
declare i64 @llvm.ctpop.i64(i64)

define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
entry:
br label %loop

loop:
%count = phi i32 [ 0, %entry ], [ %newcount, %loop ]
%val = phi i32 [ %0, %entry ], [ %newval, %loop ]
%delta = and i32 %val, 1
%newcount = add i32 %count, %delta
%newval = lshr i32 %val, 1
%done = icmp eq i32 %newval, 0
br i1 %done, label %exit, label %loop

exit:
ret i32 %newcount
%val = call i32 @llvm.ctpop.i32(i32 %0)
ret i32 %val
}

define internal i32 @__popcnt_int64(i64) nounwind readnone alwaysinline {
%vec = bitcast i64 %0 to <2 x i32>
%v0 = extractelement <2 x i32> %vec, i32 0
%v1 = extractelement <2 x i32> %vec, i32 1
%c0 = call i32 @__popcnt_int32(i32 %v0)
%c1 = call i32 @__popcnt_int32(i32 %v1)
%sum = add i32 %c0, %c1
ret i32 %sum
define internal i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
%val = call i64 @llvm.ctpop.i64(i64 %0)
ret i64 %val
}
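The change above drops the bit-at-a-time loop in favour of llvm.ctpop. In C the before/after looks roughly like this (illustration only; the compiler builtin lowers to POPCNT or a fallback sequence depending on the target):

#include <stdint.h>

/* old approach: count the set bits one at a time */
static int popcnt_loop(uint32_t v) {
    int count = 0;
    while (v) {
        count += v & 1;
        v >>= 1;
    }
    return count;
}

/* new approach: let the compiler pick the best lowering */
static int popcnt_builtin(uint32_t v) {
    return __builtin_popcount(v);   /* GCC/Clang builtin, analogous to llvm.ctpop.i32 */
}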
|
||||
|
||||
|
||||
|
||||
@@ -35,6 +35,7 @@
|
||||
; Define common 4-wide stuff
|
||||
stdlib_core(4)
|
||||
packed_load_and_store(4)
|
||||
scans(4)
|
||||
|
||||
; Define the stuff that can be done with base SSE1/SSE2 instructions
|
||||
include(`builtins-sse.ll')
|
||||
@@ -76,7 +77,7 @@ define internal float @__round_uniform_float(float) nounwind readonly alwaysinli
|
||||
}
|
||||
|
||||
define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
|
||||
ret <4 x float> %call
|
||||
}
|
||||
@@ -84,14 +85,14 @@ define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonl
|
||||
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
%call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
|
||||
ret <4 x float> %call
|
||||
}
|
||||
@@ -99,7 +100,7 @@ define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly
|
||||
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
@@ -123,28 +124,28 @@ define internal double @__round_uniform_double(double) nounwind readonly alwaysi
|
||||
}
|
||||
|
||||
define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
round2to4double(%0, 9)
|
||||
}
|
||||
|
||||
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
round2to4double(%0, 10)
|
||||
}
|
||||
|
||||
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
@@ -229,7 +230,6 @@ define internal float @__reduce_add_float(<4 x float>) nounwind readonly alwaysi
|
||||
ret float %scalar
|
||||
}
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
|
||||
@@ -38,6 +38,7 @@
|
||||
|
||||
stdlib_core(8)
|
||||
packed_load_and_store(8)
|
||||
scans(8)
|
||||
int64minmax(8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
@@ -434,6 +435,8 @@ define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
|
||||
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}
|
||||
|
||||
reduce_equal(8)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; unaligned loads/loads+broadcasts
|
||||
|
||||
@@ -495,28 +498,28 @@ define internal float @__round_uniform_float(float) nounwind readonly alwaysinli
|
||||
}
|
||||
|
||||
define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
round4to8(%0, 9)
|
||||
}
|
||||
|
||||
define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
round4to8(%0, 10)
|
||||
}
|
||||
|
||||
define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
@@ -540,28 +543,28 @@ define internal double @__round_uniform_double(double) nounwind readonly alwaysi
|
||||
}
|
||||
|
||||
define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
round2to8double(%0, 9)
|
||||
}
|
||||
|
||||
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
|
||||
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
}
|
||||
|
||||
define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
|
||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
round2to8double(%0, 10)
|
||||
}
|
||||
|
||||
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <2 x double> undef, double %0, i32 0
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
|
||||
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
|
||||
%rs = extractelement <2 x double> %xr, i32 0
|
||||
ret double %rs
|
||||
|
||||
123 builtins.cpp
@@ -54,6 +54,8 @@
|
||||
#include <llvm/Instructions.h>
|
||||
#include <llvm/Intrinsics.h>
|
||||
#include <llvm/Linker.h>
|
||||
#include <llvm/Target/TargetMachine.h>
|
||||
#include <llvm/ADT/Triple.h>
|
||||
#include <llvm/Support/MemoryBuffer.h>
|
||||
#include <llvm/Bitcode/ReaderWriter.h>
|
||||
|
||||
@@ -170,6 +172,27 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
lCreateSymbol(const std::string &name, const Type *returnType,
|
||||
const std::vector<const Type *> &argTypes,
|
||||
const llvm::FunctionType *ftype, llvm::Function *func,
|
||||
SymbolTable *symbolTable) {
|
||||
SourcePos noPos;
|
||||
noPos.name = "__stdlib";
|
||||
|
||||
FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
|
||||
// set NULL default arguments
|
||||
std::vector<ConstExpr *> defaults;
|
||||
for (unsigned int j = 0; j < ftype->getNumParams(); ++j)
|
||||
defaults.push_back(NULL);
|
||||
funcType->SetArgumentDefaults(defaults);
|
||||
|
||||
Symbol *sym = new Symbol(name, noPos, funcType);
|
||||
sym->function = func;
|
||||
symbolTable->AddFunction(sym);
|
||||
}
|
||||
|
||||
|
||||
/** Given an LLVM function declaration, synthesize the equivalent ispc
|
||||
symbol for the function (if possible). Returns true on success, false
|
||||
on failure.
|
||||
@@ -221,7 +244,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
|
||||
|
||||
// Iterate over the arguments and try to find their equivalent ispc
|
||||
// types. Track if any of the arguments has an integer type.
|
||||
bool anyIntArgs = false;
|
||||
bool anyIntArgs = false, anyReferenceArgs = false;
|
||||
std::vector<const Type *> argTypes;
|
||||
for (unsigned int j = 0; j < ftype->getNumParams(); ++j) {
|
||||
const llvm::Type *llvmArgType = ftype->getParamType(j);
|
||||
@@ -230,22 +253,26 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
|
||||
return false;
|
||||
anyIntArgs |=
|
||||
(Type::Equal(type, lLLVMTypeToISPCType(llvmArgType, !intAsUnsigned)) == false);
|
||||
anyReferenceArgs |= (dynamic_cast<const ReferenceType *>(type) != NULL);
|
||||
argTypes.push_back(type);
|
||||
}
|
||||
|
||||
// Always create the symbol the first time through, in particular
|
||||
// so that we get symbols for things with no integer types!
|
||||
if (i == 0 || anyIntArgs == true) {
|
||||
FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
|
||||
// set NULL default arguments
|
||||
std::vector<ConstExpr *> defaults;
|
||||
for (unsigned int j = 0; j < ftype->getNumParams(); ++j)
|
||||
defaults.push_back(NULL);
|
||||
funcType->SetArgumentDefaults(defaults);
|
||||
if (i == 0 || anyIntArgs == true)
|
||||
lCreateSymbol(name, returnType, argTypes, ftype, func, symbolTable);
|
||||
|
||||
Symbol *sym = new Symbol(name, noPos, funcType);
|
||||
sym->function = func;
|
||||
symbolTable->AddFunction(sym);
|
||||
// If there are any reference types, also make a variant of the
|
||||
// symbol that has them as const references. This obviously
|
||||
// doesn't make sense for many builtins, but we'll give the stdlib
|
||||
// the option to call one if it needs one.
|
||||
if (anyReferenceArgs == true) {
|
||||
for (unsigned int j = 0; j < argTypes.size(); ++j) {
|
||||
if (dynamic_cast<const ReferenceType *>(argTypes[j]) != NULL)
|
||||
argTypes[j] = argTypes[j]->GetAsConstType();
|
||||
lCreateSymbol(name + "_refsconst", returnType, argTypes,
|
||||
ftype, func, symbolTable);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -319,6 +346,22 @@ lAddBitcode(const unsigned char *bitcode, int length,
|
||||
if (!bcModule)
|
||||
Error(SourcePos(), "Error parsing stdlib bitcode: %s", bcErr.c_str());
|
||||
else {
|
||||
// FIXME: this feels like a bad idea, but the issue is that when we
|
||||
// set the llvm::Module's target triple in the ispc Module::Module
|
||||
// constructor, we start by calling llvm::sys::getHostTriple() (and
|
||||
// then change the arch if needed). Somehow that ends up giving us
|
||||
// strings like 'x86_64-apple-darwin11.0.0', while the stuff we
|
||||
// compile to bitcode with clang has module triples like
|
||||
// 'i386-apple-macosx10.7.0'. And then LLVM issues a warning about
|
||||
// linking together modules with incompatible target triples..
|
||||
llvm::Triple mTriple(m->module->getTargetTriple());
|
||||
llvm::Triple bcTriple(bcModule->getTargetTriple());
|
||||
assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
|
||||
mTriple.getArch() == bcTriple.getArch());
|
||||
assert(bcTriple.getVendor() == llvm::Triple::UnknownVendor ||
|
||||
mTriple.getVendor() == bcTriple.getVendor());
|
||||
bcModule->setTargetTriple(mTriple.str());
|
||||
|
||||
std::string(linkError);
|
||||
if (llvm::Linker::LinkModules(module, bcModule, &linkError))
|
||||
Error(SourcePos(), "Error linking stdlib bitcode: %s", linkError.c_str());
|
||||
@@ -346,6 +389,27 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void
|
||||
lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
|
||||
SymbolTable *symbolTable) {
|
||||
std::vector<const Type *> args;
|
||||
FunctionType *ft = new FunctionType(AtomicType::UniformInt32, args, SourcePos());
|
||||
Symbol *sym = new Symbol(name, SourcePos(), ft);
|
||||
sym->isStatic = true;
|
||||
|
||||
llvm::Function *func = module->getFunction(name);
|
||||
assert(func != NULL); // it should be declared already...
|
||||
func->addFnAttr(llvm::Attribute::AlwaysInline);
|
||||
llvm::BasicBlock *bblock = llvm::BasicBlock::Create(*g->ctx, "entry", func, 0);
|
||||
llvm::ReturnInst::Create(*g->ctx, LLVMInt32(val), bblock);
|
||||
|
||||
sym->function = func;
|
||||
symbolTable->AddVariable(sym);
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void
|
||||
lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
|
||||
Symbol *pidx = new Symbol("programIndex", SourcePos(),
|
||||
@@ -370,9 +434,18 @@ void
|
||||
DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
|
||||
bool includeStdlibISPC) {
|
||||
// Add the definitions from the compiled builtins-c.c file
|
||||
extern unsigned char builtins_bitcode_c[];
|
||||
extern int builtins_bitcode_c_length;
|
||||
lAddBitcode(builtins_bitcode_c, builtins_bitcode_c_length, module, symbolTable);
|
||||
if (g->target.is32bit) {
|
||||
extern unsigned char builtins_bitcode_c_32[];
|
||||
extern int builtins_bitcode_c_32_length;
|
||||
lAddBitcode(builtins_bitcode_c_32, builtins_bitcode_c_32_length,
|
||||
module, symbolTable);
|
||||
}
|
||||
else {
|
||||
extern unsigned char builtins_bitcode_c_64[];
|
||||
extern int builtins_bitcode_c_64_length;
|
||||
lAddBitcode(builtins_bitcode_c_64, builtins_bitcode_c_64_length,
|
||||
module, symbolTable);
|
||||
}
|
||||
|
||||
// Next, add the target's custom implementations of the various needed
|
||||
// builtin functions (e.g. __masked_store_32(), etc).
|
||||
@@ -402,10 +475,22 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
}
|
||||
break;
|
||||
case Target::AVX:
|
||||
extern unsigned char builtins_bitcode_avx[];
|
||||
extern int builtins_bitcode_avx_length;
|
||||
lAddBitcode(builtins_bitcode_avx, builtins_bitcode_avx_length, module,
|
||||
symbolTable);
|
||||
switch (g->target.vectorWidth) {
|
||||
case 8:
|
||||
extern unsigned char builtins_bitcode_avx[];
|
||||
extern int builtins_bitcode_avx_length;
|
||||
lAddBitcode(builtins_bitcode_avx, builtins_bitcode_avx_length, module,
|
||||
symbolTable);
|
||||
break;
|
||||
case 16:
|
||||
extern unsigned char builtins_bitcode_avx_x2[];
|
||||
extern int builtins_bitcode_avx_x2_length;
|
||||
lAddBitcode(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
|
||||
module, symbolTable);
|
||||
break;
|
||||
default:
|
||||
FATAL("logic error in DefineStdlib");
|
||||
}
|
||||
break;
|
||||
default:
|
||||
FATAL("logic error");
|
||||
@@ -428,6 +513,8 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
symbolTable);
|
||||
lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module,
|
||||
symbolTable);
|
||||
lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
|
||||
symbolTable);
|
||||
|
||||
if (includeStdlibISPC) {
|
||||
// If the user wants the standard library to be included, parse the
|
||||
|
||||
672 builtins.m4
@@ -111,6 +111,32 @@ define(`reduce8', `
|
||||
'
|
||||
)
|
||||
|
||||
define(`reduce16', `
|
||||
%v1 = shufflevector <16 x $1> %0, <16 x $1> undef,
|
||||
<16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%m1 = call <16 x $1> $2(<16 x $1> %v1, <16 x $1> %0)
|
||||
%v2 = shufflevector <16 x $1> %m1, <16 x $1> undef,
|
||||
<16 x i32> <i32 4, i32 5, i32 6, i32 7,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%m2 = call <16 x $1> $2(<16 x $1> %v2, <16 x $1> %m1)
|
||||
%v3 = shufflevector <16 x $1> %m2, <16 x $1> undef,
|
||||
<16 x i32> <i32 2, i32 3, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%m3 = call <16 x $1> $2(<16 x $1> %v3, <16 x $1> %m2)
|
||||
|
||||
%m3a = extractelement <16 x $1> %m3, i32 0
|
||||
%m3b = extractelement <16 x $1> %m3, i32 1
|
||||
%m = call $1 $3($1 %m3a, $1 %m3b)
|
||||
ret $1 %m
|
||||
'
|
||||
)
|
||||
|
||||
;; Do a reduction over an 8-wide vector, using a vector reduction function
|
||||
;; that only takes 4-wide vectors
|
||||
;; $1: type of final scalar result
|
||||
@@ -211,6 +237,45 @@ define(`unary4to8', `
|
||||
'
|
||||
)
|
||||
|
||||
define(`unary4to16', `
|
||||
%$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0)
|
||||
%$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1)
|
||||
%$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
|
||||
%v$1_2 = call <4 x $2> $3(<4 x $2> %$1_2)
|
||||
%$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
|
||||
%v$1_3 = call <4 x $2> $3(<4 x $2> %$1_3)
|
||||
|
||||
%$1a = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%$1b = shufflevector <4 x $2> %v$1_2, <4 x $2> %v$1_3,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%$1 = shufflevector <8 x $2> %$1a, <8 x $2> %$1b,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
'
|
||||
)
|
||||
|
||||
;; And so forth...
|
||||
;; $1: name of variable into which the final result should go
|
||||
;; $2: scalar type of the vector elements
|
||||
;; $3: 8-wide unary vector function to apply
|
||||
;; $4: 16-wide operand value
|
||||
|
||||
define(`unary8to16', `
|
||||
%$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%v$1_0 = call <8 x $2> $3(<8 x $2> %$1_0)
|
||||
%$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%v$1_1 = call <8 x $2> $3(<8 x $2> %$1_1)
|
||||
%$1 = shufflevector <8 x $2> %v$1_0, <8 x $2> %v$1_1,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
'
|
||||
)
|
||||
|
||||
;; And along the lines of `binary2to4', this maps a 4-wide binary function to
|
||||
;; two 8-wide vector operands
|
||||
;; $1: name of variable into which the final result should go
|
||||
@@ -231,6 +296,57 @@ define(`binary4to8', `
|
||||
'
|
||||
)
|
||||
|
||||
define(`binary8to16', `
|
||||
%$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%v$1_0 = call <8 x $2> $3(<8 x $2> %$1_0a, <8 x $2> %$1_0b)
|
||||
%$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%v$1_1 = call <8 x $2> $3(<8 x $2> %$1_1a, <8 x $2> %$1_1b)
|
||||
%$1 = shufflevector <8 x $2> %v$1_0, <8 x $2> %v$1_1,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
'
|
||||
)
|
||||
|
||||
define(`binary4to16', `
|
||||
%$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%r$1_0 = call <4 x $2> $3(<4 x $2> %$1_0a, <4 x $2> %$1_0b)
|
||||
|
||||
%$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%r$1_1 = call <4 x $2> $3(<4 x $2> %$1_1a, <4 x $2> %$1_1b)
|
||||
|
||||
%$1_2a = shufflevector <16 x $2> $4, <16 x $2> undef,
|
||||
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
|
||||
%$1_2b = shufflevector <16 x $2> $5, <16 x $2> undef,
|
||||
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
|
||||
%r$1_2 = call <4 x $2> $3(<4 x $2> %$1_2a, <4 x $2> %$1_2b)
|
||||
|
||||
%$1_3a = shufflevector <16 x $2> $4, <16 x $2> undef,
|
||||
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
|
||||
%$1_3b = shufflevector <16 x $2> $5, <16 x $2> undef,
|
||||
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
|
||||
%r$1_3 = call <4 x $2> $3(<4 x $2> %$1_3a, <4 x $2> %$1_3b)
|
||||
|
||||
%r$1_01 = shufflevector <4 x $2> %r$1_0, <4 x $2> %r$1_1,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%r$1_23 = shufflevector <4 x $2> %r$1_2, <4 x $2> %r$1_3,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
|
||||
%$1 = shufflevector <8 x $2> %r$1_01, <8 x $2> %r$1_23,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
')
|
||||
|
||||
;; Maps a 2-wide unary function to an 8-wide vector operand, returning an
|
||||
;; 8-wide vector result
|
||||
@@ -306,6 +422,20 @@ ret <8 x float> %ret
|
||||
'
|
||||
)
|
||||
|
||||
define(`round8to16', `
|
||||
%v0 = shufflevector <16 x float> $1, <16 x float> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%v1 = shufflevector <16 x float> $1, <16 x float> undef,
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
%r0 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %v0, i32 $2)
|
||||
%r1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %v1, i32 $2)
|
||||
%ret = shufflevector <8 x float> %r0, <8 x float> %r1,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
ret <16 x float> %ret
|
||||
'
|
||||
)
|
||||
|
||||
define(`round4to8double', `
|
||||
%v0 = shufflevector <8 x double> $1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v1 = shufflevector <8 x double> $1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
@@ -349,6 +479,30 @@ ret <8 x double> %ret
|
||||
'
|
||||
)
|
||||
|
||||
define(`round4to16double', `
|
||||
%v0 = shufflevector <16 x double> $1, <16 x double> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v1 = shufflevector <16 x double> $1, <16 x double> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%v2 = shufflevector <16 x double> $1, <16 x double> undef,
|
||||
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
|
||||
%v3 = shufflevector <16 x double> $1, <16 x double> undef,
|
||||
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
|
||||
%r0 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v0, i32 $2)
|
||||
%r1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v1, i32 $2)
|
||||
%r2 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v2, i32 $2)
|
||||
%r3 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v3, i32 $2)
|
||||
%ret0 = shufflevector <4 x double> %r0, <4 x double> %r1,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%ret1 = shufflevector <4 x double> %r2, <4 x double> %r3,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
%ret = shufflevector <8 x double> %ret0, <8 x double> %ret1,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
ret <16 x double> %ret
|
||||
'
|
||||
)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; forloop macro
|
||||
|
||||
@@ -468,12 +622,91 @@ forloop(i, 1, eval($1-1), `
|
||||
}
|
||||
')
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; global_atomic
;; global_atomic_associative
;; More efficient implementation for atomics that are associative (e.g.,
;; add, and, ...). If a basic implementation would do something like:
;; result0 = atomic_op(ptr, val0)
;; result1 = atomic_op(ptr, val1)
;; ..
;; Then instead we can do:
;; tmp = (val0 op val1 op ...)
;; result0 = atomic_op(ptr, tmp)
;; result1 = (result0 op val0)
;; ..
;; And more efficiently compute the same result
;; (see the short C sketch after this macro for the same idea)
;;
;; Takes five parameters:
;; $1: vector width of the target
;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
;; (add, sub...)
;; $3: return type of the LLVM atomic (e.g. i32)
;; $4: return type of the LLVM atomic type, in ispc naming parlance (e.g. int32)
;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...)
|
||||
|
||||
define(`global_atomic_associative', `
|
||||
|
||||
define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
|
||||
<$1 x i32> %m) nounwind alwaysinline {
|
||||
; first, for any lanes where the mask is off, compute a vector where those lanes
|
||||
; hold the identity value..
|
||||
|
||||
; for the bit tricks below, we need the mask to be sign extended to be
|
||||
; the size of the element type.
|
||||
ifelse($3, `i64', `%mask = sext <$1 x i32> %m to <$1 x i64>')
|
||||
ifelse($3, `i32', `
|
||||
; silly workaround to do %mask = %m, which is not possible directly..
|
||||
%maskmem = alloca <$1 x i32>
|
||||
store <$1 x i32> %m, <$1 x i32> * %maskmem
|
||||
%mask = load <$1 x i32> * %maskmem'
|
||||
)
|
||||
; zero out any lanes that are off
|
||||
%valoff = and <$1 x $3> %val, %mask
|
||||
|
||||
; compute an identity vector that is zero in the on lanes and has the identity value
|
||||
; in the off lanes
|
||||
%idv1 = bitcast $3 $5 to <1 x $3>
|
||||
%idvec = shufflevector <1 x $3> %idv1, <1 x $3> undef,
|
||||
<$1 x i32> < forloop(i, 1, eval($1-1), `i32 0, ') i32 0 >
|
||||
%notmask = xor <$1 x $3> %mask, < forloop(i, 1, eval($1-1), `$3 -1, ') $3 -1 >
|
||||
%idoff = and <$1 x $3> %idvec, %notmask
|
||||
|
||||
; and compute the merged vector that holds the identity in the off lanes
|
||||
%valp = or <$1 x $3> %valoff, %idoff
|
||||
|
||||
; now compute the local reduction (val0 op val1 op ... )--initialize
|
||||
; %eltvec so that the 0th element is the identity, the first is val0,
|
||||
; the second is (val0 op val1), ..
|
||||
%red0 = extractelement <$1 x $3> %valp, i32 0
|
||||
%eltvec0 = insertelement <$1 x $3> undef, $3 $5, i32 0
|
||||
|
||||
forloop(i, 1, eval($1-1), `
|
||||
%elt`'i = extractelement <$1 x $3> %valp, i32 i
|
||||
%red`'i = $2 $3 %red`'eval(i-1), %elt`'i
|
||||
%eltvec`'i = insertelement <$1 x $3> %eltvec`'eval(i-1), $3 %red`'eval(i-1), i32 i')
|
||||
|
||||
; make the atomic call, passing it the final reduced value
|
||||
%final0 = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %red`'eval($1-1))
|
||||
|
||||
; now go back and compute the values to be returned for each program
|
||||
; instance--this just involves smearing the old value returned from the
|
||||
; actual atomic call across the vector and applying the vector op to the
|
||||
; %eltvec vector computed above..
|
||||
%finalv1 = bitcast $3 %final0 to <1 x $3>
|
||||
%final_base = shufflevector <1 x $3> %finalv1, <1 x $3> undef,
|
||||
<$1 x i32> < forloop(i, 1, eval($1-1), `i32 0, ') i32 0 >
|
||||
%r = $2 <$1 x $3> %final_base, %eltvec`'eval($1-1)
|
||||
|
||||
ret <$1 x $3> %r
|
||||
}
|
||||
')
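As described in the comment at the top of this macro, the associative variant performs a single atomic RMW for the whole gang and reconstructs per-lane return values from a local prefix reduction. A scalar C sketch of the idea for an add over 4 lanes (illustrative only, masking omitted; it uses the GCC/Clang __atomic builtins rather than the LLVM intrinsics the macro targets):

#include <stdint.h>

/* One atomic for all lanes: prefix[i] holds (val0 op ... op val{i-1}),
   so result[i] = old + prefix[i], exactly as if each lane had issued its
   own atomic add in lane order. */
static void gang_atomic_add4(int32_t *ptr, const int32_t val[4], int32_t result[4]) {
    int32_t prefix[4];
    int32_t total = 0;
    for (int i = 0; i < 4; ++i) {
        prefix[i] = total;      /* exclusive prefix sum of the lane values */
        total += val[i];
    }
    int32_t old = __atomic_fetch_add(ptr, total, __ATOMIC_SEQ_CST);
    for (int i = 0; i < 4; ++i)
        result[i] = old + prefix[i];
}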
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; global_atomic_uniform
|
||||
;; Defines the implementation of a function that handles the mapping from
|
||||
;; an ispc atomic function to the underlying LLVM intrinsics. Specifically,
|
||||
;; the function handles looping over the active lanes, calling the underlying
|
||||
;; scalar atomic intrinsic for each one, and assembling the vector result.
|
||||
;; an ispc atomic function to the underlying LLVM intrinsics. This variant
|
||||
;; just calls the atomic once, for the given uniform value
|
||||
;;
|
||||
;; Takes four parameters:
|
||||
;; $1: vector width of the target
|
||||
@@ -482,23 +715,14 @@ forloop(i, 1, eval($1-1), `
|
||||
;; $3: return type of the LLVM atomic (e.g. i32)
|
||||
;; $4: return type of the LLVM atomic type, in ispc naming paralance (e.g. int32)
|
||||
|
||||
define(`global_atomic', `
|
||||
define(`global_atomic_uniform', `
|
||||
|
||||
declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta)
|
||||
|
||||
define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
|
||||
<$1 x i32> %mask) nounwind alwaysinline {
|
||||
%rptr = alloca <$1 x $3>
|
||||
%rptr32 = bitcast <$1 x $3> * %rptr to $3 *
|
||||
|
||||
per_lane($1, <$1 x i32> %mask, `
|
||||
%v_LANE_ID = extractelement <$1 x $3> %val, i32 LANE
|
||||
%r_LANE_ID = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %v_LANE_ID)
|
||||
%rp_LANE_ID = getelementptr $3 * %rptr32, i32 LANE
|
||||
store $3 %r_LANE_ID, $3 * %rp_LANE_ID')
|
||||
|
||||
%r = load <$1 x $3> * %rptr
|
||||
ret <$1 x $3> %r
|
||||
define internal $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val,
|
||||
<$1 x i32> %mask) nounwind alwaysinline {
|
||||
%r = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %val)
|
||||
ret $3 %r
|
||||
}
|
||||
')
|
||||
|
||||
@@ -508,9 +732,10 @@ define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
|
||||
;; $2: llvm type of the vector elements (e.g. i32)
|
||||
;; $3: ispc type of the elements (e.g. int32)
|
||||
|
||||
define(`global_swap', `
|
||||
declare i32 @llvm.atomic.swap.i32.p0i32(i32 * %ptr, i32 %val)
|
||||
declare i64 @llvm.atomic.swap.i64.p0i64(i64 * %ptr, i64 %val)
|
||||
|
||||
declare $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val)
|
||||
define(`global_swap', `
|
||||
|
||||
define internal <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val,
|
||||
<$1 x i32> %mask) nounwind alwaysinline {
|
||||
@@ -526,6 +751,12 @@ define internal <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val,
|
||||
%r = load <$1 x $2> * %rptr
|
||||
ret <$1 x $2> %r
|
||||
}
|
||||
|
||||
define internal $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val,
|
||||
<$1 x i32> %mask) nounwind alwaysinline {
|
||||
%r = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val)
|
||||
ret $2 %r
|
||||
}
|
||||
')
|
||||
|
||||
|
||||
@@ -555,15 +786,57 @@ define internal <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $
|
||||
%r = load <$1 x $2> * %rptr
|
||||
ret <$1 x $2> %r
|
||||
}
|
||||
|
||||
define internal $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
|
||||
$2 %val, <$1 x i32> %mask) nounwind alwaysinline {
|
||||
%r = call $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp, $2 %val)
|
||||
ret $2 %r
|
||||
}
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; prefetch definitions

; prefetch has a new parameter in LLVM3.0, to distinguish between instruction
; and data caches--the declaration is now:
; declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality,
; i32 %cachetype) (cachetype 1 == data cache)
; however, the version below seems to still work...

declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality)
|
||||
|
||||
define(`prefetch_read', `
|
||||
define internal void @__prefetch_read_1_$1($2 *) alwaysinline {
|
||||
%ptr8 = bitcast $2 * %0 to i8 *
|
||||
call void @llvm.prefetch(i8 * %ptr8, i32 0, i32 3)
|
||||
ret void
|
||||
}
|
||||
define internal void @__prefetch_read_2_$1($2 *) alwaysinline {
|
||||
%ptr8 = bitcast $2 * %0 to i8 *
|
||||
call void @llvm.prefetch(i8 * %ptr8, i32 0, i32 2)
|
||||
ret void
|
||||
}
|
||||
define internal void @__prefetch_read_3_$1($2 *) alwaysinline {
|
||||
%ptr8 = bitcast $2 * %0 to i8 *
|
||||
call void @llvm.prefetch(i8 * %ptr8, i32 0, i32 1)
|
||||
ret void
|
||||
}
|
||||
define internal void @__prefetch_read_nt_$1($2 *) alwaysinline {
|
||||
%ptr8 = bitcast $2 * %0 to i8 *
|
||||
call void @llvm.prefetch(i8 * %ptr8, i32 0, i32 0)
|
||||
ret void
|
||||
}
|
||||
')
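The four read-prefetch variants emitted by this macro differ only in the locality hint (3 = keep in all cache levels, down to 0 = non-temporal). The portable C analogue is __builtin_prefetch; a sketch using that GCC/Clang builtin, not part of the diff:

/* rw = 0 (read); the second argument is the locality hint used above */
static inline void prefetch_levels(const void *p) {
    __builtin_prefetch(p, 0, 3);   /* __prefetch_read_1: highest locality */
    __builtin_prefetch(p, 0, 2);   /* __prefetch_read_2 */
    __builtin_prefetch(p, 0, 1);   /* __prefetch_read_3 */
    __builtin_prefetch(p, 0, 0);   /* __prefetch_read_nt: non-temporal */
}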
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
define(`stdlib_core', `
|
||||
|
||||
declare i8* @ISPCMalloc(i64, i32) nounwind
|
||||
declare i8* @ISPCFree(i8*) nounwind
|
||||
declare void @ISPCLaunch(i8*, i8*) nounwind
|
||||
declare void @ISPCSync() nounwind
|
||||
declare i32 @__fast_masked_vload()
|
||||
|
||||
declare i8* @ISPCAlloc(i8**, i64, i32) nounwind
|
||||
declare void @ISPCLaunch(i8**, i8*, i8*, i32) nounwind
|
||||
declare void @ISPCSync(i8*) nounwind
|
||||
declare void @ISPCInstrument(i8*, i8*, i32, i32) nounwind
|
||||
|
||||
declare i1 @__is_compile_time_constant_mask(<$1 x i32> %mask)
|
||||
@@ -779,6 +1052,25 @@ define internal <$1 x i32> @__sext_varying_bool(<$1 x i32>) nounwind readnone al
|
||||
ret <$1 x i32> %0
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; prefetching
|
||||
|
||||
prefetch_read(uniform_bool, i1)
|
||||
prefetch_read(uniform_int8, i8)
|
||||
prefetch_read(uniform_int16, i16)
|
||||
prefetch_read(uniform_int32, i32)
|
||||
prefetch_read(uniform_int64, i64)
|
||||
prefetch_read(uniform_float, float)
|
||||
prefetch_read(uniform_double, double)
|
||||
|
||||
prefetch_read(varying_bool, <$1 x i32>)
|
||||
prefetch_read(varying_int8, <$1 x i8>)
|
||||
prefetch_read(varying_int16, <$1 x i16>)
|
||||
prefetch_read(varying_int32, <$1 x i32>)
|
||||
prefetch_read(varying_int64, <$1 x i64>)
|
||||
prefetch_read(varying_float, <$1 x float>)
|
||||
prefetch_read(varying_double, <$1 x double>)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; stdlib transcendentals
|
||||
;;
|
||||
@@ -911,25 +1203,35 @@ define internal void @__memory_barrier() nounwind readnone alwaysinline {
|
||||
ret void
|
||||
}
|
||||
|
||||
global_atomic($1, add, i32, int32)
|
||||
global_atomic($1, sub, i32, int32)
|
||||
global_atomic($1, and, i32, int32)
|
||||
global_atomic($1, or, i32, int32)
|
||||
global_atomic($1, xor, i32, int32)
|
||||
global_atomic($1, min, i32, int32)
|
||||
global_atomic($1, max, i32, int32)
|
||||
global_atomic($1, umin, i32, uint32)
|
||||
global_atomic($1, umax, i32, uint32)
|
||||
global_atomic_associative($1, add, i32, int32, 0)
|
||||
global_atomic_associative($1, sub, i32, int32, 0)
|
||||
global_atomic_associative($1, and, i32, int32, -1)
|
||||
global_atomic_associative($1, or, i32, int32, 0)
|
||||
global_atomic_associative($1, xor, i32, int32, 0)
|
||||
global_atomic_uniform($1, add, i32, int32)
|
||||
global_atomic_uniform($1, sub, i32, int32)
|
||||
global_atomic_uniform($1, and, i32, int32)
|
||||
global_atomic_uniform($1, or, i32, int32)
|
||||
global_atomic_uniform($1, xor, i32, int32)
|
||||
global_atomic_uniform($1, min, i32, int32)
|
||||
global_atomic_uniform($1, max, i32, int32)
|
||||
global_atomic_uniform($1, umin, i32, uint32)
|
||||
global_atomic_uniform($1, umax, i32, uint32)
|
||||
|
||||
global_atomic($1, add, i64, int64)
|
||||
global_atomic($1, sub, i64, int64)
|
||||
global_atomic($1, and, i64, int64)
|
||||
global_atomic($1, or, i64, int64)
|
||||
global_atomic($1, xor, i64, int64)
|
||||
global_atomic($1, min, i64, int64)
|
||||
global_atomic($1, max, i64, int64)
|
||||
global_atomic($1, umin, i64, uint64)
|
||||
global_atomic($1, umax, i64, uint64)
|
||||
global_atomic_associative($1, add, i64, int64, 0)
|
||||
global_atomic_associative($1, sub, i64, int64, 0)
|
||||
global_atomic_associative($1, and, i64, int64, -1)
|
||||
global_atomic_associative($1, or, i64, int64, 0)
|
||||
global_atomic_associative($1, xor, i64, int64, 0)
|
||||
global_atomic_uniform($1, add, i64, int64)
|
||||
global_atomic_uniform($1, sub, i64, int64)
|
||||
global_atomic_uniform($1, and, i64, int64)
|
||||
global_atomic_uniform($1, or, i64, int64)
|
||||
global_atomic_uniform($1, xor, i64, int64)
|
||||
global_atomic_uniform($1, min, i64, int64)
|
||||
global_atomic_uniform($1, max, i64, int64)
|
||||
global_atomic_uniform($1, umin, i64, uint64)
|
||||
global_atomic_uniform($1, umax, i64, uint64)
|
||||
|
||||
global_swap($1, i32, int32)
|
||||
global_swap($1, i64, int64)
|
||||
@@ -952,6 +1254,24 @@ define internal <$1 x double> @__atomic_swap_double_global(double * %ptr, <$1 x
|
||||
ret <$1 x double> %ret
|
||||
}
|
||||
|
||||
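;; The float/double atomic swaps below reuse the integer versions: the value
;; is bitcast to a same-width integer, exchanged atomically, and the result
;; is bitcast back to the original floating-point type.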
define internal float @__atomic_swap_uniform_float_global(float * %ptr, float %val,
|
||||
<$1 x i32> %mask) nounwind alwaysinline {
|
||||
%iptr = bitcast float * %ptr to i32 *
|
||||
%ival = bitcast float %val to i32
|
||||
%iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival, <$1 x i32> %mask)
|
||||
%ret = bitcast i32 %iret to float
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define internal double @__atomic_swap_uniform_double_global(double * %ptr, double %val,
|
||||
<$1 x i32> %mask) nounwind alwaysinline {
|
||||
%iptr = bitcast double * %ptr to i64 *
|
||||
%ival = bitcast double %val to i64
|
||||
%iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival, <$1 x i32> %mask)
|
||||
%ret = bitcast i64 %iret to double
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
global_atomic_exchange($1, i32, int32)
|
||||
global_atomic_exchange($1, i64, int64)
|
||||
|
||||
@@ -976,6 +1296,29 @@ define internal <$1 x double> @__atomic_compare_exchange_double_global(double *
|
||||
%ret = bitcast <$1 x i64> %iret to <$1 x double>
|
||||
ret <$1 x double> %ret
|
||||
}
|
||||
|
||||
define internal float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp, float %val,
|
||||
<$1 x i32> %mask) nounwind alwaysinline {
|
||||
%iptr = bitcast float * %ptr to i32 *
|
||||
%icmp = bitcast float %cmp to i32
|
||||
%ival = bitcast float %val to i32
|
||||
%iret = call i32 @__atomic_compare_exchange_uniform_int32_global(i32 * %iptr, i32 %icmp,
|
||||
i32 %ival, <$1 x i32> %mask)
|
||||
%ret = bitcast i32 %iret to float
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define internal double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp,
|
||||
double %val, <$1 x i32> %mask) nounwind alwaysinline {
|
||||
%iptr = bitcast double * %ptr to i64 *
|
||||
%icmp = bitcast double %cmp to i64
|
||||
%ival = bitcast double %val to i64
|
||||
%iret = call i64 @__atomic_compare_exchange_uniform_int64_global(i64 * %iptr, i64 %icmp,
|
||||
i64 %ival, <$1 x i32> %mask)
|
||||
%ret = bitcast i64 %iret to double
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
')
|
||||
|
||||
|
||||
@@ -1034,12 +1377,6 @@ i64minmax($1,max,uint64,ugt)
|
||||
|
||||
define(`load_and_broadcast', `
|
||||
define <$1 x $2> @__load_and_broadcast_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
|
||||
; must not load if the mask is all off; the address may be invalid
|
||||
%mm = call i32 @__movmsk(<$1 x i32> %mask)
|
||||
%any_on = icmp ne i32 %mm, 0
|
||||
br i1 %any_on, label %load, label %skip
|
||||
|
||||
load:
|
||||
%ptr = bitcast i8 * %0 to $2 *
|
||||
%val = load $2 * %ptr
|
||||
|
||||
@@ -1047,9 +1384,6 @@ load:
|
||||
forloop(i, 1, eval($1-1), `
|
||||
%ret`'i = insertelement <$1 x $2> %ret`'eval(i-1), $2 %val, i32 i')
|
||||
ret <$1 x $2> %ret`'eval($1-1)
|
||||
|
||||
skip:
|
||||
ret <$1 x $2> undef
|
||||
}
|
||||
')
|
||||
|
||||
@@ -1065,14 +1399,20 @@ define(`load_masked', `
|
||||
define <$1 x $2> @__load_masked_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
|
||||
entry:
|
||||
%mm = call i32 @__movmsk(<$1 x i32> %mask)
|
||||
|
||||
; if the first lane and the last lane are on, then it is safe to do a vector load
|
||||
; of the whole thing--what the lanes in the middle want turns out to not matter...
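; (For example, on a 4-wide target eval(1 | (1<<($1-1))) is 9 = 0b1001, so
; the test below requires that lanes 0 and 3 are both on.)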
%mm_and = and i32 %mm, eval(1 | (1<<($1-1)))
|
||||
%can_vload = icmp eq i32 %mm_and, eval(1 | (1<<($1-1)))
|
||||
|
||||
%fast32 = call i32 @__fast_masked_vload()
|
||||
%fast_i1 = trunc i32 %fast32 to i1
|
||||
%can_vload_maybe_fast = or i1 %fast_i1, %can_vload
|
||||
|
||||
; if we are not able to do a single vload, we will accumulate lanes in this memory..
|
||||
%retptr = alloca <$1 x $2>
|
||||
%retptr32 = bitcast <$1 x $2> * %retptr to $2 *
|
||||
br i1 %can_vload, label %load, label %loop
|
||||
br i1 %can_vload_maybe_fast, label %load, label %loop
|
||||
|
||||
load:
|
||||
%ptr = bitcast i8 * %0 to <$1 x $2> *
|
||||
@@ -1207,6 +1547,46 @@ define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>,
|
||||
')
|
||||
|
||||
|
||||
define(`masked_store_blend_8_16_by_16', `
|
||||
define void @__masked_store_blend_8(<16 x i8>* nocapture, <16 x i8>,
|
||||
<16 x i32>) nounwind alwaysinline {
|
||||
%old = load <16 x i8> * %0
|
||||
%old128 = bitcast <16 x i8> %old to i128
|
||||
%new128 = bitcast <16 x i8> %1 to i128
|
||||
|
||||
%mask8 = trunc <16 x i32> %2 to <16 x i8>
|
||||
%mask128 = bitcast <16 x i8> %mask8 to i128
|
||||
%notmask128 = xor i128 %mask128, -1
|
||||
|
||||
%newmasked = and i128 %new128, %mask128
|
||||
%oldmasked = and i128 %old128, %notmask128
|
||||
%result = or i128 %newmasked, %oldmasked
|
||||
|
||||
%resultvec = bitcast i128 %result to <16 x i8>
|
||||
store <16 x i8> %resultvec, <16 x i8> * %0
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @__masked_store_blend_16(<16 x i16>* nocapture, <16 x i16>,
|
||||
<16 x i32>) nounwind alwaysinline {
|
||||
%old = load <16 x i16> * %0
|
||||
%old256 = bitcast <16 x i16> %old to i256
|
||||
%new256 = bitcast <16 x i16> %1 to i256
|
||||
|
||||
%mask16 = trunc <16 x i32> %2 to <16 x i16>
|
||||
%mask256 = bitcast <16 x i16> %mask16 to i256
|
||||
%notmask256 = xor i256 %mask256, -1
|
||||
|
||||
%newmasked = and i256 %new256, %mask256
|
||||
%oldmasked = and i256 %old256, %notmask256
|
||||
%result = or i256 %newmasked, %oldmasked
|
||||
|
||||
%resultvec = bitcast i256 %result to <16 x i16>
|
||||
store <16 x i16> %resultvec, <16 x i16> * %0
|
||||
ret void
|
||||
}
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; packed load and store functions
|
||||
;;
|
||||
@@ -1234,7 +1614,7 @@ entry:
|
||||
|
||||
known_mask:
|
||||
%allon = icmp eq i32 %mask, eval((1 << $1) -1)
|
||||
br i1 %allon, label %all_on, label %not_all_on
|
||||
br i1 %allon, label %all_on, label %unknown_mask
|
||||
|
||||
all_on:
|
||||
;; everyone wants to load, so just load an entire vector width in a single
|
||||
@@ -1244,14 +1624,6 @@ all_on:
|
||||
store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr, align 4
|
||||
ret i32 $1
|
||||
|
||||
not_all_on:
|
||||
%alloff = icmp eq i32 %mask, 0
|
||||
br i1 %alloff, label %all_off, label %unknown_mask
|
||||
|
||||
all_off:
|
||||
;; no one wants to load
|
||||
ret i32 0
|
||||
|
||||
unknown_mask:
|
||||
br label %loop
|
||||
|
||||
@@ -1298,20 +1670,13 @@ entry:
|
||||
|
||||
known_mask:
|
||||
%allon = icmp eq i32 %mask, eval((1 << $1) -1)
|
||||
br i1 %allon, label %all_on, label %not_all_on
|
||||
br i1 %allon, label %all_on, label %unknown_mask
|
||||
|
||||
all_on:
|
||||
%vecptr = bitcast i32 *%startptr to <$1 x i32> *
|
||||
store <$1 x i32> %vals, <$1 x i32> * %vecptr, align 4
|
||||
ret i32 $1
|
||||
|
||||
not_all_on:
|
||||
%alloff = icmp eq i32 %mask, 0
|
||||
br i1 %alloff, label %all_off, label %unknown_mask
|
||||
|
||||
all_off:
|
||||
ret i32 0
|
||||
|
||||
unknown_mask:
|
||||
br label %loop
|
||||
|
||||
@@ -1346,6 +1711,150 @@ done:
|
||||
}
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; reduce_equal
|
||||
|
||||
; count leading zeros
|
||||
declare i32 @llvm.cttz.i32(i32)
|
||||
|
||||
define(`reduce_equal_aux', `
|
||||
define internal i1 @__reduce_equal_$3(<$1 x $2> %v, $2 * %samevalue,
|
||||
<$1 x i32> %mask) nounwind alwaysinline {
|
||||
entry:
|
||||
%mm = call i32 @__movmsk(<$1 x i32> %mask)
|
||||
%allon = icmp eq i32 %mm, eval((1<<$1)-1)
|
||||
br i1 %allon, label %check_neighbors, label %domixed
|
||||
|
||||
domixed:
|
||||
; First, figure out which lane is the first active one
|
||||
%first = call i32 @llvm.cttz.i32(i32 %mm)
|
||||
%baseval = extractelement <$1 x $2> %v, i32 %first
|
||||
%basev1 = bitcast $2 %baseval to <1 x $2>
|
||||
; get a vector that is that value smeared across all elements
|
||||
%basesmear = shufflevector <1 x $2> %basev1, <1 x $2> undef,
|
||||
<$1 x i32> < forloop(i, 0, eval($1-2), `i32 0, ') i32 0 >
|
||||
|
||||
; now do a blend of that vector with the original vector, such that the
|
||||
; result will be the original value for the active lanes, and the value
|
||||
; from the first active lane for the inactive lanes. Given that, we can
|
||||
; just unconditionally check if the lanes are all equal in check_neighbors
|
||||
; below without worrying about inactive lanes...
|
||||
%ptr = alloca <$1 x $2>
|
||||
store <$1 x $2> %basesmear, <$1 x $2> * %ptr
|
||||
%castptr = bitcast <$1 x $2> * %ptr to <$1 x $4> *
|
||||
%castv = bitcast <$1 x $2> %v to <$1 x $4>
|
||||
call void @__masked_store_blend_$6(<$1 x $4> * %castptr, <$1 x $4> %castv, <$1 x i32> %mask)
|
||||
%blendvec = load <$1 x $2> * %ptr
|
||||
br label %check_neighbors
|
||||
|
||||
check_neighbors:
|
||||
%vec = phi <$1 x $2> [ %blendvec, %domixed ], [ %v, %entry ]
|
||||
ifelse($6, `32', `
|
||||
; For 32-bit elements, we rotate once and compare with the vector, which ends
|
||||
; up comparing each element to its neighbor on the right. Then see if
|
||||
; all of those values are true; if so, then all of the elements are equal..
|
||||
%castvec = bitcast <$1 x $2> %vec to <$1 x $4>
|
||||
%castvr = call <$1 x $4> @__rotate_int$6(<$1 x $4> %castvec, i32 1)
|
||||
%vr = bitcast <$1 x $4> %castvr to <$1 x $2>
|
||||
%eq = $5 eq <$1 x $2> %vec, %vr
|
||||
%eq32 = sext <$1 x i1> %eq to <$1 x i32>
|
||||
%eqmm = call i32 @__movmsk(<$1 x i32> %eq32)
|
||||
%alleq = icmp eq i32 %eqmm, eval((1<<$1)-1)
|
||||
br i1 %alleq, label %all_equal, label %not_all_equal
|
||||
', `
|
||||
; But for 64-bit elements, it turns out to be more efficient to just
|
||||
; scalarize and do individual pairwise comparisons and AND those
|
||||
; all together..
|
||||
forloop(i, 0, eval($1-1), `
|
||||
%v`'i = extractelement <$1 x $2> %vec, i32 i')
|
||||
|
||||
forloop(i, 0, eval($1-2), `
|
||||
%eq`'i = $5 eq $2 %v`'i, %v`'eval(i+1)')
|
||||
|
||||
%and0 = and i1 %eq0, %eq1
|
||||
forloop(i, 1, eval($1-3), `
|
||||
%and`'i = and i1 %and`'eval(i-1), %eq`'eval(i+1)')
|
||||
|
||||
br i1 %and`'eval($1-3), label %all_equal, label %not_all_equal
|
||||
')
|
||||
|
||||
all_equal:
|
||||
%the_value = extractelement <$1 x $2> %vec, i32 0
|
||||
store $2 %the_value, $2 * %samevalue
|
||||
ret i1 true
|
||||
|
||||
not_all_equal:
|
||||
ret i1 false
|
||||
}
|
||||
')
|
||||
|
||||
define(`reduce_equal', `
|
||||
reduce_equal_aux($1, i32, int32, i32, icmp, 32)
|
||||
reduce_equal_aux($1, float, float, i32, fcmp, 32)
|
||||
reduce_equal_aux($1, i64, int64, i64, icmp, 64)
|
||||
reduce_equal_aux($1, double, double, i64, fcmp, 64)
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; prefix sum stuff
|
||||
|
||||
; $1: vector width (e.g. 4)
|
||||
; $2: vector element type (e.g. float)
|
||||
; $3: bit width of vector element type (e.g. 32)
|
||||
; $4: operator to apply (e.g. fadd)
|
||||
; $5: identity element value (e.g. 0)
|
||||
; $6: suffix for function (e.g. add_float)
|
||||
|
||||
define(`exclusive_scan', `
|
||||
define internal <$1 x $2> @__exclusive_scan_$6(<$1 x $2> %v,
|
||||
<$1 x i32> %mask) nounwind alwaysinline {
|
||||
; first, set the value of any off lanes to the identity value
|
||||
%ptr = alloca <$1 x $2>
|
||||
%idvec1 = bitcast $2 $5 to <1 x $2>
|
||||
%idvec = shufflevector <1 x $2> %idvec1, <1 x $2> undef,
|
||||
<$1 x i32> < forloop(i, 0, eval($1-2), `i32 0, ') i32 0 >
|
||||
store <$1 x $2> %idvec, <$1 x $2> * %ptr
|
||||
%ptr`'$3 = bitcast <$1 x $2> * %ptr to <$1 x i`'$3> *
|
||||
%vi = bitcast <$1 x $2> %v to <$1 x i`'$3>
|
||||
call void @__masked_store_blend_$3(<$1 x i`'$3> * %ptr`'$3, <$1 x i`'$3> %vi,
|
||||
<$1 x i32> %mask)
|
||||
%v_id = load <$1 x $2> * %ptr
|
||||
|
||||
; extract elements of the vector to use in computing the scan
|
||||
forloop(i, 0, eval($1-1), `
|
||||
%v`'i = extractelement <$1 x $2> %v_id, i32 i')
|
||||
|
||||
; and just compute the scan directly.
|
||||
; 0th element is the identity (so nothing to do here),
|
||||
; 1st element is identity (op) the 0th element of the original vector,
|
||||
; each successive element is the previous element (op) the previous element
|
||||
; of the original vector
|
||||
%s1 = $4 $2 $5, %v0
|
||||
forloop(i, 2, eval($1-1), `
|
||||
%s`'i = $4 $2 %s`'eval(i-1), %v`'eval(i-1)')
|
||||
|
||||
; and fill in the result vector
|
||||
%r0 = insertelement <$1 x $2> undef, $2 $5, i32 0 ; 0th element gets identity
|
||||
forloop(i, 1, eval($1-1), `
|
||||
%r`'i = insertelement <$1 x $2> %r`'eval(i-1), $2 %s`'i, i32 i')
|
||||
|
||||
ret <$1 x $2> %r`'eval($1-1)
|
||||
}
|
||||
')
|
||||
|
||||
define(`scans', `
|
||||
exclusive_scan($1, i32, 32, add, 0, add_i32)
|
||||
exclusive_scan($1, float, 32, fadd, zeroinitializer, add_float)
|
||||
exclusive_scan($1, i64, 64, add, 0, add_i64)
|
||||
exclusive_scan($1, double, 64, fadd, zeroinitializer, add_double)
|
||||
|
||||
exclusive_scan($1, i32, 32, and, -1, and_i32)
|
||||
exclusive_scan($1, i64, 64, and, -1, and_i64)
|
||||
|
||||
exclusive_scan($1, i32, 32, or, 0, or_i32)
|
||||
exclusive_scan($1, i64, 64, or, 0, or_i64)
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; per_lane
|
||||
;;
|
||||
@@ -1371,7 +1880,7 @@ pl_known_mask:
|
||||
;; the mask is known at compile time; see if it is something we can
|
||||
;; handle more efficiently
|
||||
%pl_is_allon = icmp eq i32 %pl_mask, eval((1<<$1)-1)
|
||||
br i1 %pl_is_allon, label %pl_all_on, label %pl_not_all_on
|
||||
br i1 %pl_is_allon, label %pl_all_on, label %pl_unknown_mask
|
||||
|
||||
pl_all_on:
|
||||
;; the mask is all on--just expand the code for each lane sequentially
|
||||
@@ -1379,19 +1888,14 @@ pl_all_on:
|
||||
`patsubst(`$3', `ID\|LANE', i)')
|
||||
br label %pl_done
|
||||
|
||||
pl_not_all_on:
|
||||
;; not all on--see if it is all off or mixed
|
||||
;; for the mixed case, we just run the general case, though we could
|
||||
pl_unknown_mask:
|
||||
;; we just run the general case, though we could
|
||||
;; try to be smart and just emit the code based on what it actually is,
|
||||
;; for example by emitting the code straight-line without a loop and doing
|
||||
;; the lane tests explicitly, leaving later optimization passes to eliminate
|
||||
;; the stuff that is definitely not needed. Not clear if we will frequently
|
||||
;; encounter a mask that is known at compile-time but is not either all on or
|
||||
;; all off...
|
||||
%pl_alloff = icmp eq i32 %pl_mask, 0
|
||||
br i1 %pl_alloff, label %pl_done, label %pl_unknown_mask
|
||||
|
||||
pl_unknown_mask:
|
||||
br label %pl_loop
|
||||
|
||||
pl_loop:
|
||||
@@ -1447,20 +1951,6 @@ define internal <$1 x $2> @__gather_elt_$2(i8 * %ptr, <$1 x i32> %offsets, <$1 x
|
||||
|
||||
define <$1 x $2> @__gather_base_offsets_$2(i8 * %ptr, <$1 x i32> %offsets,
|
||||
<$1 x i32> %vecmask) nounwind readonly alwaysinline {
|
||||
entry:
|
||||
%mask = call i32 @__movmsk(<$1 x i32> %vecmask)
|
||||
|
||||
%maskKnown = call i1 @__is_compile_time_constant_mask(<$1 x i32> %vecmask)
|
||||
br i1 %maskKnown, label %known_mask, label %unknown_mask
|
||||
|
||||
known_mask:
|
||||
%alloff = icmp eq i32 %mask, 0
|
||||
br i1 %alloff, label %gather_all_off, label %unknown_mask
|
||||
|
||||
gather_all_off:
|
||||
ret <$1 x $2> undef
|
||||
|
||||
unknown_mask:
|
||||
; We can be clever and avoid the per-lane stuff for gathers if we are willing
|
||||
; to require that the 0th element of the array being gathered from is always
|
||||
; legal to read from (and we do indeed require that, given the benefits!)
|
||||
|
||||
32  contrib/ispc.vim (new file)
@@ -0,0 +1,32 @@
" Vim syntax file
" Language:     ISPC
" Maintainer:   Andreas Wendleder <andreas.wendleder@gmail.com>
" Last Change:  2011 Aug 3

" Quit when a syntax file was already loaded
if exists("b:current_syntax")
  finish
endif

" Read the C syntax to start with
runtime! syntax/c.vim
unlet b:current_syntax

" New keywords
syn keyword ispcStatement   cbreak ccontinue creturn launch print reference soa sync task
syn keyword ispcConditional cif
syn keyword ispcRepeat      cdo cfor cwhile
syn keyword ispcBuiltin     programCount programIndex
syn keyword ispcType        export int8 int16 int32 int64

" Default highlighting
command -nargs=+ HiLink hi def link <args>
HiLink ispcStatement    Statement
HiLink ispcConditional  Conditional
HiLink ispcRepeat       Repeat
HiLink ispcBuiltin      Statement
HiLink ispcType         Type
delcommand HiLink

let b:current_syntax = "ispc"
132  ctx.cpp
@@ -144,6 +144,11 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
|
||||
returnedLanesPtr = AllocaInst(LLVMTypes::MaskType, "returned_lanes_memory");
|
||||
StoreInst(LLVMMaskAllOff, returnedLanesPtr);
|
||||
|
||||
launchedTasks = false;
|
||||
launchGroupHandlePtr = AllocaInst(LLVMTypes::VoidPointerType, "launch_group_handle");
|
||||
StoreInst(llvm::Constant::getNullValue(LLVMTypes::VoidPointerType),
|
||||
launchGroupHandlePtr);
|
||||
|
||||
if (!returnType || returnType == AtomicType::Void)
|
||||
returnValuePtr = NULL;
|
||||
else {
|
||||
@@ -153,7 +158,6 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
|
||||
StoreInst(llvm::Constant::getNullValue(ftype), returnValuePtr);
|
||||
}
|
||||
|
||||
#ifndef LLVM_2_8
|
||||
if (m->diBuilder) {
|
||||
/* If debugging is enabled, tell the debug information emission
|
||||
code about this new function */
|
||||
@@ -174,16 +178,12 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
|
||||
/* And start a scope representing the initial function scope */
|
||||
StartScope();
|
||||
}
|
||||
#endif // LLVM_2_8
|
||||
|
||||
launchedTasks = false;
|
||||
|
||||
// connect the function's mask memory to the __mask symbol
|
||||
Symbol *maskSymbol = m->symbolTable->LookupVariable("__mask");
|
||||
assert(maskSymbol != NULL);
|
||||
maskSymbol->storagePtr = maskPtr;
|
||||
|
||||
#ifndef LLVM_2_8
|
||||
// add debugging info for __mask, programIndex, ...
|
||||
if (m->diBuilder) {
|
||||
maskSymbol->pos = funcStartPos;
|
||||
@@ -208,15 +208,12 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
|
||||
true /* static */,
|
||||
programCountSymbol->storagePtr);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
FunctionEmitContext::~FunctionEmitContext() {
|
||||
assert(controlFlowInfo.size() == 0);
|
||||
#ifndef LLVM_2_8
|
||||
assert(debugScopes.size() == (m->diBuilder ? 1 : 0));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -704,6 +701,7 @@ FunctionEmitContext::LaneMask(llvm::Value *v) {
|
||||
|
||||
llvm::Value *
|
||||
FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
|
||||
#if 0
|
||||
// Compare the two masks to get a vector of i1s
|
||||
llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
|
||||
v1, v2, "v1==v2");
|
||||
@@ -711,6 +709,12 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
|
||||
cmp = I1VecToBoolVec(cmp);
|
||||
// And see if it's all on
|
||||
return All(cmp);
|
||||
#else
|
||||
llvm::Value *mm1 = LaneMask(v1);
|
||||
llvm::Value *mm2 = LaneMask(v2);
|
||||
return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mm1, mm2,
|
||||
"v1==v2");
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -758,7 +762,7 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) {
|
||||
|
||||
|
||||
llvm::Value *
|
||||
FunctionEmitContext::EmitMalloc(LLVM_TYPE_CONST llvm::Type *ty, int align) {
|
||||
FunctionEmitContext::SizeOf(LLVM_TYPE_CONST llvm::Type *ty) {
|
||||
// Emit code to compute the size of the given type using a GEP with a
|
||||
// NULL base pointer, indexing one element of the given type, and
|
||||
// casting the resulting 'pointer' to an int giving its size.
|
||||
@@ -775,24 +779,7 @@ FunctionEmitContext::EmitMalloc(LLVM_TYPE_CONST llvm::Type *ty, int align) {
|
||||
#endif
|
||||
AddDebugPos(poffset);
|
||||
llvm::Value *sizeOf = PtrToIntInst(poffset, LLVMTypes::Int64Type, "offset_int");
|
||||
|
||||
// And given the size, call the malloc function
|
||||
llvm::Function *fmalloc = m->module->getFunction("ISPCMalloc");
|
||||
assert(fmalloc != NULL);
|
||||
llvm::Value *mem = CallInst(fmalloc, sizeOf, LLVMInt32(align),
|
||||
"raw_argmem");
|
||||
// Cast the void * back to the result pointer type
|
||||
return BitCastInst(mem, ptrType, "mem_bitcast");
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
FunctionEmitContext::EmitFree(llvm::Value *ptr) {
|
||||
llvm::Value *freeArg = BitCastInst(ptr, LLVMTypes::VoidPointerType,
|
||||
"argmemfree");
|
||||
llvm::Function *ffree = m->module->getFunction("ISPCFree");
|
||||
assert(ffree != NULL);
|
||||
CallInst(ffree, freeArg);
|
||||
return sizeOf;
|
||||
}
|
||||
|
||||
|
||||
@@ -850,7 +837,6 @@ FunctionEmitContext::GetDebugPos() const {
|
||||
void
|
||||
FunctionEmitContext::AddDebugPos(llvm::Value *value, const SourcePos *pos,
|
||||
llvm::DIScope *scope) {
|
||||
#ifndef LLVM_2_8
|
||||
llvm::Instruction *inst = llvm::dyn_cast<llvm::Instruction>(value);
|
||||
if (inst != NULL && m->diBuilder) {
|
||||
SourcePos p = pos ? *pos : currentPos;
|
||||
@@ -861,13 +847,11 @@ FunctionEmitContext::AddDebugPos(llvm::Value *value, const SourcePos *pos,
|
||||
inst->setDebugLoc(llvm::DebugLoc::get(p.first_line, p.first_column,
|
||||
scope ? *scope : GetDIScope()));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
FunctionEmitContext::StartScope() {
|
||||
#ifndef LLVM_2_8
|
||||
if (m->diBuilder != NULL) {
|
||||
llvm::DIScope parentScope;
|
||||
if (debugScopes.size() > 0)
|
||||
@@ -881,18 +865,15 @@ FunctionEmitContext::StartScope() {
|
||||
currentPos.first_column);
|
||||
debugScopes.push_back(lexicalBlock);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
FunctionEmitContext::EndScope() {
|
||||
#ifndef LLVM_2_8
|
||||
if (m->diBuilder != NULL) {
|
||||
assert(debugScopes.size() > 0);
|
||||
debugScopes.pop_back();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -905,7 +886,6 @@ FunctionEmitContext::GetDIScope() const {
|
||||
|
||||
void
|
||||
FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) {
|
||||
#ifndef LLVM_2_8
|
||||
if (m->diBuilder == NULL)
|
||||
return;
|
||||
|
||||
@@ -921,13 +901,11 @@ FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) {
|
||||
llvm::Instruction *declareInst =
|
||||
m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
|
||||
AddDebugPos(declareInst, &sym->pos, &scope);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) {
|
||||
#ifndef LLVM_2_8
|
||||
if (m->diBuilder == NULL)
|
||||
return;
|
||||
|
||||
@@ -943,7 +921,6 @@ FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) {
|
||||
llvm::Instruction *declareInst =
|
||||
m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
|
||||
AddDebugPos(declareInst, &sym->pos, &scope);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -1501,27 +1478,15 @@ FunctionEmitContext::gather(llvm::Value *lvalue, const Type *type,
|
||||
void
|
||||
FunctionEmitContext::addGSMetadata(llvm::Instruction *inst, SourcePos pos) {
|
||||
llvm::Value *str = llvm::MDString::get(*g->ctx, pos.name);
|
||||
#ifdef LLVM_2_8
|
||||
llvm::MDNode *md = llvm::MDNode::get(*g->ctx, &str, 1);
|
||||
#else
|
||||
llvm::MDNode *md = llvm::MDNode::get(*g->ctx, str);
|
||||
#endif
|
||||
inst->setMetadata("filename", md);
|
||||
|
||||
llvm::Value *line = LLVMInt32(pos.first_line);
|
||||
#ifdef LLVM_2_8
|
||||
md = llvm::MDNode::get(*g->ctx, &line, 1);
|
||||
#else
|
||||
md = llvm::MDNode::get(*g->ctx, line);
|
||||
#endif
|
||||
inst->setMetadata("line", md);
|
||||
|
||||
llvm::Value *column = LLVMInt32(pos.first_column);
|
||||
#ifdef LLVM_2_8
|
||||
md = llvm::MDNode::get(*g->ctx, &column, 1);
|
||||
#else
|
||||
md = llvm::MDNode::get(*g->ctx, column);
|
||||
#endif
|
||||
inst->setMetadata("column", md);
|
||||
}
|
||||
|
||||
@@ -1838,9 +1803,9 @@ llvm::PHINode *
|
||||
FunctionEmitContext::PhiNode(LLVM_TYPE_CONST llvm::Type *type, int count,
|
||||
const char *name) {
|
||||
llvm::PHINode *pn = llvm::PHINode::Create(type,
|
||||
#if !defined(LLVM_2_8) && !defined(LLVM_2_9)
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
count,
|
||||
#endif // !LLVM_2_8 && !LLVM_2_9
|
||||
#endif // LLVM_3_0
|
||||
name ? name : "phi", bblock);
|
||||
AddDebugPos(pn);
|
||||
return pn;
|
||||
@@ -1933,15 +1898,9 @@ FunctionEmitContext::CallInst(llvm::Function *func, llvm::Value *arg0,
|
||||
|
||||
llvm::Instruction *
|
||||
FunctionEmitContext::ReturnInst() {
|
||||
if (launchedTasks) {
|
||||
// Automatically add a sync call at the end of any function that
|
||||
// launched tasks
|
||||
SourcePos noPos;
|
||||
noPos.name = "__auto_sync";
|
||||
ExprStmt *es = new ExprStmt(new SyncExpr(noPos), noPos);
|
||||
es->EmitCode(this);
|
||||
delete es;
|
||||
}
|
||||
if (launchedTasks)
|
||||
// Add a sync call at the end of any function that launched tasks
|
||||
SyncInst();
|
||||
|
||||
llvm::Instruction *rinst = NULL;
|
||||
if (returnValuePtr != NULL) {
|
||||
@@ -1964,7 +1923,8 @@ FunctionEmitContext::ReturnInst() {
|
||||
|
||||
llvm::Instruction *
|
||||
FunctionEmitContext::LaunchInst(llvm::Function *callee,
|
||||
std::vector<llvm::Value *> &argVals) {
|
||||
std::vector<llvm::Value *> &argVals,
|
||||
llvm::Value *launchCount) {
|
||||
if (callee == NULL) {
|
||||
assert(m->errorCount > 0);
|
||||
return NULL;
|
||||
@@ -1981,20 +1941,15 @@ FunctionEmitContext::LaunchInst(llvm::Function *callee,
|
||||
static_cast<LLVM_TYPE_CONST llvm::StructType *>(pt->getElementType());
|
||||
assert(argStructType->getNumElements() == argVals.size() + 1);
|
||||
|
||||
llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
|
||||
assert(falloc != NULL);
|
||||
int align = 4 * RoundUpPow2(g->target.nativeVectorWidth);
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
// Use malloc() to allocate storage on Windows, since the stack is
|
||||
// generally not big enough there to do enough allocations for lots of
|
||||
// tasks and then things crash horribly...
|
||||
llvm::Value *argmem = EmitMalloc(argStructType, align);
|
||||
#else
|
||||
// Use alloca for space for the task args on OSX And Linux. KEY
|
||||
// DETAIL: pass false to the call of FunctionEmitContext::AllocaInst so
|
||||
// that the alloca doesn't happen just once at the top of the function,
|
||||
// but happens each time the enclosing basic block executes.
|
||||
llvm::Value *argmem = AllocaInst(argStructType, "argmem", align, false);
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
llvm::Value *voidmem = BitCastInst(argmem, LLVMTypes::VoidPointerType);
|
||||
std::vector<llvm::Value *> allocArgs;
|
||||
allocArgs.push_back(launchGroupHandlePtr);
|
||||
allocArgs.push_back(SizeOf(argStructType));
|
||||
allocArgs.push_back(LLVMInt32(align));
|
||||
llvm::Value *voidmem = CallInst(falloc, allocArgs, "args_ptr");
|
||||
llvm::Value *argmem = BitCastInst(voidmem, pt);
|
||||
|
||||
// Copy the values of the parameters into the appropriate place in
|
||||
// the argument block
|
||||
@@ -2016,5 +1971,32 @@ FunctionEmitContext::LaunchInst(llvm::Function *callee,
|
||||
llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
|
||||
llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
|
||||
assert(flaunch != NULL);
|
||||
return CallInst(flaunch, fptr, voidmem, "");
|
||||
std::vector<llvm::Value *> args;
|
||||
args.push_back(launchGroupHandlePtr);
|
||||
args.push_back(fptr);
|
||||
args.push_back(voidmem);
|
||||
args.push_back(launchCount);
|
||||
return CallInst(flaunch, args, "");
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
FunctionEmitContext::SyncInst() {
|
||||
llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr, NULL);
|
||||
llvm::Value *nullPtrValue = llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);
|
||||
llvm::Value *nonNull = CmpInst(llvm::Instruction::ICmp,
|
||||
llvm::CmpInst::ICMP_NE,
|
||||
launchGroupHandle, nullPtrValue);
|
||||
llvm::BasicBlock *bSync = CreateBasicBlock("call_sync");
|
||||
llvm::BasicBlock *bPostSync = CreateBasicBlock("post_sync");
|
||||
BranchInst(bSync, bPostSync, nonNull);
|
||||
|
||||
SetCurrentBasicBlock(bSync);
|
||||
llvm::Function *fsync = m->module->getFunction("ISPCSync");
|
||||
if (fsync == NULL)
|
||||
FATAL("Couldn't find ISPCSync declaration?!");
|
||||
CallInst(fsync, launchGroupHandle, "");
|
||||
BranchInst(bPostSync);
|
||||
|
||||
SetCurrentBasicBlock(bPostSync);
|
||||
}
|
||||
|
||||
21  ctx.h
@@ -210,15 +210,8 @@ public:
|
||||
i32. */
|
||||
llvm::Value *I1VecToBoolVec(llvm::Value *b);
|
||||
|
||||
/** Emit code to call the user-supplied ISPCMalloc function to
|
||||
allocate space for an object of the given type.  Returns the
|
||||
pointer value returned by the ISPCMalloc call. */
|
||||
llvm::Value *EmitMalloc(LLVM_TYPE_CONST llvm::Type *ty, int align = 0);
|
||||
|
||||
/** Emit code to call the user-supplied ISPCFree function, passing it
|
||||
the given pointer to storage previously allocated by an
|
||||
EmitMalloc() call. */
|
||||
void EmitFree(llvm::Value *ptr);
|
||||
/** Returns the size of the given type. */
|
||||
llvm::Value *SizeOf(LLVM_TYPE_CONST llvm::Type *ty);
|
||||
|
||||
/** If the user has asked to compile the program with instrumentation,
|
||||
this inserts a callback to the user-supplied instrumentation
|
||||
@@ -399,7 +392,10 @@ public:
|
||||
/** Launch an asynchronous task to run the given function, passing it
|
||||
the given argument values. */
|
||||
llvm::Instruction *LaunchInst(llvm::Function *callee,
|
||||
std::vector<llvm::Value *> &argVals);
|
||||
std::vector<llvm::Value *> &argVals,
|
||||
llvm::Value *launchCount);
|
||||
|
||||
void SyncInst();
|
||||
|
||||
llvm::Instruction *ReturnInst();
|
||||
/** @} */
|
||||
@@ -489,6 +485,11 @@ private:
|
||||
/** True if a 'launch' statement has been encountered in the function. */
|
||||
bool launchedTasks;
|
||||
|
||||
/** This is a pointer to a void * that is passed to the ISPCLaunch(),
|
||||
ISPCAlloc(), and ISPCSync() routines as a handle to the group ot
|
||||
tasks launched from the current function. */
|
||||
llvm::Value *launchGroupHandlePtr;
|
||||
|
||||
llvm::Value *pointerVectorToVoidPointers(llvm::Value *value);
|
||||
static void addGSMetadata(llvm::Instruction *inst, SourcePos pos);
|
||||
bool ifsInLoopAllUniform() const;
|
||||
|
||||
2  decl.cpp
@@ -237,7 +237,7 @@ Declarator::GetType(DeclSpecs *ds) const {
|
||||
sprintf(buf, "__anon_parameter_%d", i);
|
||||
sym = new Symbol(buf, pos);
|
||||
Declarator *declarator = new Declarator(sym, sym->pos);
|
||||
sym->type = declarator->GetType(ds);
|
||||
sym->type = declarator->GetType(d->declSpecs);
|
||||
d->declarators.push_back(declarator);
|
||||
}
|
||||
else {
|
||||
|
||||
@@ -1,3 +1,119 @@
|
||||
=== v1.0.10 === (30 September 2011)
|
||||
|
||||
This release features an extensive new example showing the application of
|
||||
ispc to a deferred shading algorithm for scenes with thousands of lights
|
||||
(examples/deferred). This is an implementation of the algorithm that Johan
|
||||
Andersson described at SIGGRAPH 2009 and was implemented by Andrew
|
||||
Lauritzen and Jefferson Montgomery. The basic idea is that a pre-rendered
|
||||
G-buffer is partitioned into tiles, and in each tile, the set of lights
|
||||
that contribute to the tile is computed.  The pixels in the tile are
then shaded using those light sources.  (See slides 19-29 of
|
||||
http://s09.idav.ucdavis.edu/talks/04-JAndersson-ParallelFrostbite-Siggraph09.pdf
|
||||
for more details on the algorithm.)
|
||||
|
||||
The mechanism for launching tasks from ispc code has been generalized to
|
||||
allow multiple tasks to be launched with a single launch call (see
|
||||
http://ispc.github.com/ispc.html#task-parallelism-language-syntax for more
|
||||
information.)
|
||||
|
||||
A few new functions have been added to the standard library: num_cores()
|
||||
returns the number of cores in the system's CPU, and variants of all of the
|
||||
atomic operators that take 'uniform' values as parameters have been added.
|
||||
|
||||
=== v1.0.9 === (26 September 2011)
|
||||
|
||||
The binary release of v1.0.9 is the first that supports AVX code
|
||||
generation. Two targets are provided: "avx", which runs with a
|
||||
programCount of 8, and "avx-x2" which runs 16 program instances
|
||||
simultaneously. (This binary is also built using the in-progress LLVM 3.0
|
||||
development libraries, while previous ones have been built with the
|
||||
released 2.9 version of LLVM.)
|
||||
|
||||
This release has no other significant changes beyond a number of small
|
||||
bugfixes (https://github.com/ispc/ispc/issues/100,
|
||||
https://github.com/ispc/ispc/issues/101, https://github.com/ispc/ispc/issues/103.)
|
||||
|
||||
=== v1.0.8 === (19 September 2011)
|
||||
|
||||
A number of improvements have been made to handling of 'if' statements in
|
||||
the language:
|
||||
- A bug was fixed where invalid memory could be incorrectly accessed even
|
||||
if none of the running program instances wanted to execute the
|
||||
corresponding instructions (https://github.com/ispc/ispc/issues/74).
|
||||
- The code generated for 'if' statements is a bit simpler and thus more
|
||||
efficient.
|
||||
|
||||
There is now a '--pic' command-line argument that causes position-independent
|
||||
code to be generated (Linux and OSX only).
|
||||
|
||||
A number of additional performance improvements:
|
||||
- Loops are now unrolled by default; the --opt=disable-loop-unroll
|
||||
command-line argument can be used to disable this behavior.
|
||||
(https://github.com/ispc/ispc/issues/78)
|
||||
- A few more cases where gathers/scatters could be determined at compile
|
||||
time to actually access contiguous locations have been added.
|
||||
(https://github.com/ispc/ispc/issues/79)
|
||||
|
||||
Finally, warnings are now issued (if possible) when it can be determined
|
||||
at compile-time that an out-of-bounds array index is being used.
|
||||
(https://github.com/ispc/ispc/issues/98).
|
||||
|
||||
|
||||
=== v1.0.7 === (3 September 2011)
|
||||
|
||||
The various atomic_*_global() standard library functions are generally
|
||||
substantially more efficient. They all previously issued one hardware
|
||||
atomic instruction for each running program instance but now locally
|
||||
compute a reduction over the operands and issue a single hardware atomic,
|
||||
giving the same effect and results in the end (issue #57).
|
||||
|
||||
CPU/ISA target handling has been substantially improved. If no CPU is
|
||||
specified, the host CPU type is used, not just a default of "nehalem". A
|
||||
number of bugs were fixed that ensure that LLVM doesn't generate SSE>2
|
||||
instructions when using the SSE2 target (fixes issue #82).
|
||||
|
||||
Shift rights of unsigned integer types use a logical shift right
|
||||
instruction now, not an arithmetic shift right (fixed issue #88).
|
||||
|
||||
When emitting header files, 'extern' declarations of globals used in ispc
|
||||
code are now outside of the ispc namespace. Fixes issue #64.
|
||||
|
||||
The stencil example has been modified to do runs with and without
|
||||
parallelism.
|
||||
|
||||
Many other small bugfixes and improvements.
|
||||
|
||||
=== v1.0.6 === (17 August 2011)
|
||||
|
||||
Some additional cross-program instance operations have been added to the
|
||||
standard library. reduce_equal() checks to see if the given value is the
|
||||
same across all running program instances, and exclusive_scan_{add,and,or}()
|
||||
computes a scan over the given value in the running program instances.
|
||||
See the documentation of these new routines for more information:
|
||||
http://ispc.github.com/ispc.html#cross-program-instance-operations.
|
||||
|
||||
The simple task system implementations used in the examples have been
|
||||
improved.  The Windows version no longer has a hard limit on the number of
|
||||
tasks that can be launched, and all versions have less dynamic memory
|
||||
allocation and less locking. More of the examples now have paths that also
|
||||
measure performance using tasks along with SPMD vectorization.
|
||||
|
||||
Two new examples have been added: one that shows the implementation of a
|
||||
ray-marching volume rendering algorithm, and one that shows a 3D stencil
|
||||
computation, as might be done for PDE solutions.
|
||||
|
||||
Standard library routines to issue prefetches have been added. See the
|
||||
documentation for more details: http://ispc.github.com/ispc.html#prefetches.
|
||||
|
||||
Fast versions of the float to half-precision float conversion routines have
|
||||
been added. For more details, see:
|
||||
http://ispc.github.com/ispc.html#conversions-to-and-from-half-precision-floats.
|
||||
|
||||
There is the usual set of small bug fixes. Notably, a number of details
|
||||
related to handling 32 versus 64 bit targets have been fixed, which in turn
|
||||
has fixed a bug related to tasks having incorrect values for pointers
|
||||
passed to them.
|
||||
|
||||
=== v1.0.5 === (1 August 2011)
|
||||
|
||||
Multi-element vector swizzles are supported; for example, given a 3-wide
|
||||
|
||||
534  docs/ispc.txt
@@ -33,6 +33,17 @@ The main goals behind ``ispc`` are to:
|
||||
number of non-trivial workloads that aren't handled well by other
|
||||
compilation approaches (e.g. loop auto-vectorization.)
|
||||
|
||||
**We are very interested in your feedback and comments about ispc and
|
||||
in hearing your experiences using the system. We are especially interested
|
||||
in hearing if you try using ispc but see results that are not as you
|
||||
were expecting or hoping for.** We encourage you to send a note with your
|
||||
experiences or comments to the `ispc-users`_ mailing list or to file bug or
|
||||
feature requests with the ``ispc`` `bug tracker`_. (Thanks!)
|
||||
|
||||
.. _ispc-users: http://groups.google.com/group/ispc-users
|
||||
.. _bug tracker: https://github.com/ispc/ispc/issues?state=open
|
||||
|
||||
|
||||
Contents:
|
||||
|
||||
* `Recent Changes to ISPC`_
|
||||
@@ -69,7 +80,8 @@ Contents:
|
||||
+ `Program Instance Convergence`_
|
||||
+ `Data Races`_
|
||||
+ `Uniform Variables and Varying Control Flow`_
|
||||
+ `Task Parallelism in ISPC`_
|
||||
+ `Task Parallelism: Language Syntax`_
|
||||
+ `Task Parallelism: Runtime Requirements`_
|
||||
|
||||
* `The ISPC Standard Library`_
|
||||
|
||||
@@ -79,6 +91,8 @@ Contents:
|
||||
+ `Packed Load and Store Operations`_
|
||||
+ `Conversions To and From Half-Precision Floats`_
|
||||
+ `Atomic Operations and Memory Fences`_
|
||||
+ `Prefetches`_
|
||||
+ `System Information`_
|
||||
+ `Low-Level Bits`_
|
||||
|
||||
* `Interoperability with the Application`_
|
||||
@@ -100,6 +114,9 @@ Contents:
|
||||
+ `"Inline" Aggressively`_
|
||||
+ `Small Performance Tricks`_
|
||||
+ `Instrumenting Your ISPC Programs`_
|
||||
+ `Using Scan Operations For Variable Output`_
|
||||
+ `Application-Supplied Execution Masks`_
|
||||
+ `Explicit Vector Programming With Uniform Short Vector Types`_
|
||||
|
||||
* `Disclaimer and Legal Information`_
|
||||
|
||||
@@ -822,8 +839,8 @@ by default. If a function is declared with a ``static`` qualifier, then it
|
||||
is only visible in the file in which it was declared.
|
||||
|
||||
Any function that can be launched with the ``launch`` construct in ``ispc``
|
||||
must have a ``task`` qualifier; see `Task Parallelism in ISPC`_ for more
|
||||
discussion of launching tasks in ``ispc``.
|
||||
must have a ``task`` qualifier; see `Task Parallelism: Language Syntax`_
|
||||
for more discussion of launching tasks in ``ispc``.
|
||||
|
||||
Functions that are intended to be called from C/C++ application code must
|
||||
have the ``export`` qualifier. This causes them to have regular C linkage
|
||||
@@ -924,8 +941,9 @@ execution model is critical for writing efficient and correct programs in
|
||||
|
||||
``ispc`` supports both task parallelism to parallelize across multiple
|
||||
cores and SPMD parallelism to parallelize across the SIMD vector lanes on a
|
||||
single core. This section focuses on SPMD parallelism. See the section
|
||||
`Task Parallelism in ISPC`_ for discussion of task parallelism in ``ispc``.
|
||||
single core. This section focuses on SPMD parallelism. See the sections
|
||||
`Task Parallelism: Language Syntax`_ and `Task Parallelism: Runtime
|
||||
Requirements`_ for discussion of task parallelism in ``ispc``.
|
||||
|
||||
The SPMD-on-SIMD Execution Model
|
||||
--------------------------------
|
||||
@@ -1172,7 +1190,7 @@ This code implicitly assumes that ``programCount`` evenly divides
|
||||
::
|
||||
|
||||
for (uniform int i = 0; i < count; i += programCount) {
|
||||
if (i + programIndex < programCount) {
|
||||
if (i + programIndex < count) {
|
||||
float d = data[i + programIndex];
|
||||
...
|
||||
|
||||
@@ -1368,112 +1386,190 @@ be modified in the above code even if *none* of the program instances
|
||||
evaluated a true value for the test, given the ``ispc`` execution model.
|
||||
|
||||
|
||||
Task Parallelism in ISPC
|
||||
------------------------
|
||||
Task Parallelism: Language Syntax
|
||||
---------------------------------
|
||||
|
||||
One option for combining task-parallelism with ``ispc`` is to just use
|
||||
regular task parallelism in the C/C++ application code (be it through
|
||||
Intel® Cilk(tm), Intel® Thread Building Blocks or another task system,
|
||||
etc.), and for tasks to use ``ispc`` for SPMD parallelism across the vector
|
||||
lanes as appropriate. Alternatively, ``ispc`` also has some support for
|
||||
launching tasks from ``ispc`` code. The approach is similar to Intel®
|
||||
Cilk's task launch feature. (See the ``examples/mandelbrot_tasks`` example
|
||||
to see it used in a non-trivial example.)
|
||||
Intel® Cilk(tm), Intel® Thread Building Blocks or another task system), and
|
||||
for tasks to use ``ispc`` for SPMD parallelism across the vector lanes as
|
||||
appropriate. Alternatively, ``ispc`` also has support for launching tasks
|
||||
from ``ispc`` code. The approach is similar to Intel® Cilk's task launch
|
||||
feature. (See the ``examples/mandelbrot_tasks`` example to see it used in
|
||||
a small example.)
|
||||
|
||||
Any function that is launched as a task must be declared with the ``task``
|
||||
qualifier:
|
||||
First, any function that is launched as a task must be declared with the
|
||||
``task`` qualifier:
|
||||
|
||||
::
|
||||
|
||||
task void func(uniform float a[], uniform int start) {
|
||||
....
|
||||
task void func(uniform float a[], uniform int index) {
|
||||
...
|
||||
a[index] = ....
|
||||
}
|
||||
|
||||
Tasks must return ``void``; a compile time error is issued if a
|
||||
non-``void`` task is defined.
|
||||
|
||||
Given a task, one can then write code that launches tasks as follows:
|
||||
Given a task definition, there are two ways to write code that launches
|
||||
tasks, using the ``launch`` construct. First, one task can be launched at
|
||||
a time, with parameters passed to the task to help it determine what part
|
||||
of the overall computation it's responsible for:
|
||||
|
||||
::
|
||||
|
||||
for (uniform int i = 0; i < 100; ++i)
|
||||
launch < func(a, i); >
|
||||
launch < func(a, i) >;
|
||||
|
||||
Note the ``launch`` keyword and the brackets around the function call.
|
||||
This code launches 100 tasks, each of which presumably does some
|
||||
computation keyed off of the given value ``i``.  In general, one should
|
||||
launch many more tasks than there are processors in the system to
|
||||
computation that is keyed off of the given value ``i``.  In general, one
|
||||
should launch many more tasks than there are processors in the system to
|
||||
ensure good load-balancing, but not so many that the overhead of scheduling
|
||||
and running tasks dominates the computation.
|
||||
|
||||
Program execution continues asynchronously after task launch; thus, the
|
||||
function shouldn't access values being generated by the tasks without
|
||||
synchronization. A function uses a ``sync`` statement to wait for all
|
||||
launched tasks to finish:
|
||||
Alternatively, a number of tasks may be launched from a single ``launch``
|
||||
statement. We might instead write the above example with a single
|
||||
``launch`` like this:
|
||||
|
||||
::
|
||||
|
||||
for (uniform int i = 0; i < 100; ++i)
|
||||
launch < func(a, i); >
|
||||
launch[100] < func2(a) >;
|
||||
|
||||
Where an integer value (not necessarily a compile-time constant) is
|
||||
provided to the ``launch`` keyword in square brackets; this number of tasks
|
||||
will be enqueued to be run asynchronously. Within each of the tasks, two
|
||||
special built-in variables are available--``taskIndex``, and ``taskCount``.
|
||||
The first, ``taskIndex``, ranges from zero to one minus the number of tasks
|
||||
provided to ``launch``, and ``taskCount`` equals the number of launched
|
||||
tasks.  Thus, we might use ``taskIndex`` in the implementation of ``func2``
|
||||
to determine which array element to process.
|
||||
|
||||
::
|
||||
|
||||
task void func2(uniform float a[]) {
|
||||
...
|
||||
a[taskIndex] = ...
|
||||
}
|
||||
|
||||
Program execution continues asynchronously after a ``launch`` statement;
|
||||
thus, a function shouldn't access values being generated by the tasks it
|
||||
has launched within the function without synchronization. If results are
|
||||
needed before function return, a function can use a ``sync`` statement to
|
||||
wait for all launched tasks to finish:
|
||||
|
||||
::
|
||||
|
||||
launch[100] < func2(a) >;
|
||||
sync;
|
||||
// now safe to use computed values in a[]...
|
||||
|
||||
Alternatively, any function that launches tasks has an implicit ``sync``
|
||||
before it returns, so that functions that call a function that launches
|
||||
tasks don't have to worry about outstanding asynchronous computation.
|
||||
Alternatively, any function that launches tasks has an automatically-added
|
||||
``sync`` statement before it returns, so that functions that call a
|
||||
function that launches tasks don't have to worry about outstanding
|
||||
asynchronous computation from that function.
|
||||
|
||||
Inside functions with the ``task`` qualifier, two additional built-in
|
||||
variables are provided: ``threadIndex`` and ``threadCount``.
|
||||
``threadCount`` gives the total number of hardware threads that have been
|
||||
launched by the task system. ``threadIndex`` provides an index between
|
||||
zero and ``threadCount-1`` that gives a unique index that corresponds to
|
||||
the hardware thread that is executing the current task. The
|
||||
``threadIndex`` can be used for accessing data that is private to the
|
||||
current thread and thus doesn't require synchronization to access under
|
||||
parallel execution.
|
||||
variables are provided in addition to ``taskIndex`` and ``taskCount``:
|
||||
``threadIndex`` and ``threadCount``. ``threadCount`` gives the total
|
||||
number of hardware threads that have been launched by the task system.
|
||||
``threadIndex`` provides an index between zero and ``threadCount-1`` that
|
||||
gives a unique index that corresponds to the hardware thread that is
|
||||
executing the current task. The ``threadIndex`` can be used for accessing
|
||||
data that is private to the current thread and thus doesn't require
|
||||
synchronization to access under parallel execution.
|
||||
|
||||
Task Parallelism: Runtime Requirements
|
||||
--------------------------------------
|
||||
|
||||
If you use the task launch feature in ``ispc``, you must provide C/C++
|
||||
implementations of two functions and link them into your final executable
|
||||
file. Although these functions may be implemented in either language, they
|
||||
must have "C" linkage (i.e. their prototypes must be declared inside an
|
||||
``extern "C"`` block if they are defined in C++.)
|
||||
implementations of three specific functions that manage launching and
|
||||
synchronizing parallel tasks; these functions must be linked into your
|
||||
executable. Although these functions may be implemented in any
|
||||
language, they must have "C" linkage (i.e. their prototypes must be
|
||||
declared inside an ``extern "C"`` block if they are defined in C++.)
|
||||
|
||||
By using user-supplied versions of these functions, ``ispc`` programs can
|
||||
easily interoperate with software systems that have existing task systems
|
||||
for managing parallelism. If you're using ``ispc`` with a system that
|
||||
isn't otherwise multi-threaded and don't want to write custom
|
||||
implementations of them, you can use the implementations of these functions
|
||||
provided in the ``examples/tasksys.cpp`` file in the ``ispc``
|
||||
distributions.
|
||||
|
||||
If you are implementing your own task system, the remainder of this section
|
||||
discusses the requirements for these calls. You will also likely want to
|
||||
review the example task systems in ``examples/tasksys.cpp`` for reference.
|
||||
If you are not implementing your own task system, you can skip reading the
|
||||
remainder of this section.
|
||||
|
||||
Here are the declarations of the three functions that must be provided to
|
||||
manage tasks in ``ispc``:
|
||||
|
||||
::
|
||||
|
||||
void ISPCLaunch(void *funcptr, void *data);
|
||||
void ISPCSync();
|
||||
void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
|
||||
void ISPCLaunch(void **handlePtr, void *f, void *data, int count);
|
||||
void ISPCSync(void *handle);
|
||||
|
||||
On Windows, two additional functions must be provided to dynamically
|
||||
allocate and free memory to store the arguments passed to tasks. (On OSX
|
||||
and Linux, the stack provides memory for task arguments; on Windows, the
|
||||
stack is generally not large enough to do this for large numbers of tasks.)
|
||||
All three of these functions take an opaque handle (or a pointer to an
|
||||
opaque handle) as their first parameter. This handle allows the task
|
||||
system runtime to distinguish between calls to these functions from
|
||||
different functions in ``ispc`` code. In this way, the task system
|
||||
implementation can efficiently wait for completion on just the tasks
|
||||
launched from a single function.
|
||||
|
||||
The first time one of ``ISPCLaunch()`` or ``ISPCAlloc()`` is called in an
|
||||
``ispc`` function, the ``void *`` pointed to by the ``handlePtr`` parameter
|
||||
will be ``NULL``. The implementations of these function should then
|
||||
initialize ``*handlePtr`` to a unique handle value of some sort. (For
|
||||
example, it might allocate a small structure to record which tasks were
|
||||
launched by the current function.) In subsequent calls to these functions
|
||||
in the emitted ``ispc`` code, the same value for ``handlePtr`` will be
|
||||
passed in, such that loading from ``*handlePtr`` will retrieve the value
|
||||
stored in the first call.
|
||||
|
||||
At function exit (or at an explicit ``sync`` statement), a call to
|
||||
``ISPCSync()`` will be generated if ``*handlePtr`` is non-``NULL``.
|
||||
Therefore, the handle value is passed directly to ``ISPCSync()``, rather
|
||||
than a pointer to it, as in the other functions.
|
||||
|
||||
The ``ISPCAlloc()`` function is used to allocate small blocks of memory to
|
||||
store parameters passed to tasks. It should return a pointer to memory
|
||||
with the given size and alignment.  Note that there is no explicit
|
||||
``ISPCFree()`` call; instead, all memory allocated within an ``ispc``
|
||||
function should be freed when ``ISPCSync()`` is called.
|
||||
|
||||
``ISPCLaunch()`` is called to launch one or more asynchronous
|
||||
tasks. Each ``launch`` statement in ``ispc`` code causes a call to
|
||||
``ISPCLaunch()`` to be emitted in the generated code. The three parameters
|
||||
after the handle pointer to this function are relatively straightforward;
|
||||
the ``void *f`` parameter holds a pointer to a function to call to run the
|
||||
work for this task, ``data`` holds a pointer to data to pass to this
|
||||
function, and ``count`` is the number of instances of this function to
|
||||
enqueue for asynchronous execution. (In other words, ``count`` corresponds
|
||||
to the value ``n`` in a multiple-task launch statement like ``launch[n]``.)
|
||||
|
||||
The signature of the provided function pointer ``f`` is
|
||||
|
||||
::
|
||||
|
||||
void *ISPCMalloc(int64_t size, int32_t alignment);
|
||||
void ISPCFree(void *ptr);
|
||||
void (*TaskFuncPtr)(void *data, int threadIndex, int threadCount,
|
||||
int taskIndex, int taskCount)
|
||||
|
||||
These are called by the task launch code generated by the ``ispc``
|
||||
compiler; the first is called to launch a task and the second is
called to wait for launched tasks to finish.  (Factoring them out in this way
|
||||
allows ``ispc`` to inter-operate with the application's task system, if
|
||||
any, rather than having a separate one of its own.) To run a particular
|
||||
task, the task system should cast the function pointer to a ``void (*)(void
|
||||
*, int, int)`` function pointer and then call it with the provided ``void
|
||||
*`` data and then an index for the current hardware thread and the total
|
||||
number of hardware threads the task system has launched--in other words:
|
||||
|
||||
::
|
||||
|
||||
typedef void (*TaskFuncType)(void *, int, int);
|
||||
TaskFuncType tft = (TaskFuncType)(funcptr);
|
||||
tft(data, threadIndex, threadCount);
|
||||
|
||||
A number of sample task system implementations are provided with ``ispc``;
|
||||
see the files ``tasks_concrt.cpp``, ``tasks_gcd.cpp`` and
|
||||
``tasks_pthreads.cpp`` in the ``examples/mandelbrot_tasks`` directory of
|
||||
the ``ispc`` distribution.
|
||||
When this function pointer is called by one of the hardware threads managed
|
||||
by the task system, the ``data`` pointer passed to ``ISPCLaunch()`` should
|
||||
be passed to it for its first parameter; ``threadCount`` gives the total
|
||||
number of hardware threads that have been spawned to run tasks and
|
||||
``threadIndex`` should be an integer index between zero and ``threadCount``
|
||||
uniquely identifying the hardware thread that is running the task. (These
|
||||
values can be used to index into thread-local storage.)
|
||||
|
||||
The value of ``taskCount`` should be the number of tasks launched in the
|
||||
``launch`` statement that caused the call to ``ISPCLaunch()`` and each of
|
||||
the calls to this function should be given a unique value of ``taskIndex``
|
||||
between zero and ``taskCount``-1, to distinguish which of the instances
|
||||
of the set of launched tasks is running.
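
The following is a minimal sketch of these three entry points, written here
only for illustration; it is not the implementation shipped with ``ispc``
(see ``examples/tasksys.cpp`` for real, multi-threaded versions).  This
sketch runs every "launched" task immediately and serially, and the
``TaskGroup`` structure is a hypothetical helper used as the per-function
handle:

::

    #include <stdint.h>
    #include <stdlib.h>
    #include <vector>

    typedef void (*TaskFuncPtr)(void *data, int threadIndex, int threadCount,
                                int taskIndex, int taskCount);

    // Hypothetical per-function handle: remembers the argument blocks handed
    // out by ISPCAlloc() so they can be freed at sync time.
    struct TaskGroup {
        std::vector<void *> allocations;
    };

    extern "C" void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment) {
        if (*handlePtr == NULL)
            *handlePtr = new TaskGroup;      // first call from this function
        TaskGroup *group = (TaskGroup *)*handlePtr;
        // Over-allocate and round up; assumes a power-of-two alignment value.
        void *mem = malloc(size + alignment);
        group->allocations.push_back(mem);
        uintptr_t p = ((uintptr_t)mem + alignment - 1) & ~(uintptr_t)(alignment - 1);
        return (void *)p;
    }

    extern "C" void ISPCLaunch(void **handlePtr, void *f, void *data, int count) {
        if (*handlePtr == NULL)
            *handlePtr = new TaskGroup;
        // A real task system would enqueue these; here they just run in place.
        TaskFuncPtr func = (TaskFuncPtr)f;
        for (int i = 0; i < count; ++i)
            func(data, 0 /* threadIndex */, 1 /* threadCount */, i, count);
    }

    extern "C" void ISPCSync(void *handle) {
        // Nothing is outstanding in this serial sketch; just release the
        // memory handed out by ISPCAlloc() and the handle itself.
        TaskGroup *group = (TaskGroup *)handle;
        for (size_t i = 0; i < group->allocations.size(); ++i)
            free(group->allocations[i]);
        delete group;
    }
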
The ISPC Standard Library
|
||||
=========================
|
||||
@@ -1822,6 +1918,71 @@ given value across all of the currently-executing vector lanes.
|
||||
uniform int reduce_max(int a, int b)
|
||||
uniform unsigned int reduce_max(unsigned int a, unsigned int b)
|
||||
|
||||
Finally, you can check to see if a particular value has the same value in
|
||||
all of the currently-running program instances:
|
||||
|
||||
::
|
||||
|
||||
uniform bool reduce_equal(int32 v)
|
||||
uniform bool reduce_equal(unsigned int32 v)
|
||||
uniform bool reduce_equal(float v)
|
||||
uniform bool reduce_equal(int64 v)
|
||||
uniform bool reduce_equal(unsigned int64 v)
|
||||
uniform bool reduce_equal(double)
|
||||
|
||||
There are also variants of these functions that return the value as a
|
||||
``uniform`` in the case where the values are all the same.
|
||||
|
||||
::
|
||||
|
||||
uniform bool reduce_equal(int32 v, reference uniform int32 sameval)
|
||||
uniform bool reduce_equal(unsigned int32 v,
|
||||
reference uniform unsigned int32 sameval)
|
||||
uniform bool reduce_equal(float v, reference uniform float sameval)
|
||||
uniform bool reduce_equal(int64 v, reference uniform int64 sameval)
|
||||
uniform bool reduce_equal(unsigned int64 v,
|
||||
reference uniform unsigned int64 sameval)
|
||||
uniform bool reduce_equal(double, reference uniform double sameval)
|
||||
|
||||
If called when none of the program instances are running,
|
||||
``reduce_equal()`` will return ``false``.
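
To illustrate the semantics (not the implementation), the following C++
sketch models the gang of program instances as an array of per-lane
values plus an execution mask; the ``gang_reduce_equal()`` helper is a
hypothetical name introduced here for the illustration.

::

    #include <stdint.h>

    // Model of reduce_equal() semantics: 'values' holds one value per
    // program instance and 'active' is the execution mask.  Returns true
    // only if every *active* lane holds the same value, and false if no
    // lanes are active at all.
    static bool gang_reduce_equal(const int32_t *values, const bool *active,
                                  int laneCount, int32_t *sameval) {
        bool seenAny = false;
        int32_t first = 0;
        for (int i = 0; i < laneCount; ++i) {
            if (!active[i])
                continue;
            if (!seenAny) {
                first = values[i];
                seenAny = true;
            }
            else if (values[i] != first)
                return false;
        }
        if (!seenAny)
            return false;      // no running program instances
        *sameval = first;      // the common value, as a "uniform" result
        return true;
    }
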
|
||||
|
||||
There are also a number of functions to compute "scan"s of values across
|
||||
the program instances. For example, the ``exclusive_scan_add()`` function
computes, for each program instance, the sum of the given value over all of
the preceding program instances. (The scans currently available in
|
||||
``ispc`` are all so-called "exclusive" scans, meaning that the value
|
||||
computed for a given element does not include the value provided for that
|
||||
element.) In C code, an exclusive add scan over an array might be
|
||||
implemented as:
|
||||
|
||||
::
|
||||
|
||||
void scan_add(int *in_array, int *result_array, int count) {
    result_array[0] = 0;
    for (int i = 1; i < count; ++i)
        result_array[i] = result_array[i-1] + in_array[i-1];
}
|
||||
|
||||
``ispc`` provides the following scan functions--addition, bitwise-and, and
|
||||
bitwise-or are available:
|
||||
|
||||
::
|
||||
|
||||
int32 exclusive_scan_add(int32 v)
|
||||
unsigned int32 exclusive_scan_add(unsigned int32 v)
|
||||
float exclusive_scan_add(float v)
|
||||
int64 exclusive_scan_add(int64 v)
|
||||
unsigned int64 exclusive_scan_add(unsigned int64 v)
|
||||
double exclusive_scan_add(double v)
|
||||
int32 exclusive_scan_and(int32 v)
|
||||
unsigned int32 exclusive_scan_and(unsigned int32 v)
|
||||
int64 exclusive_scan_and(int64 v)
|
||||
unsigned int64 exclusive_scan_and(unsigned int64 v)
|
||||
int32 exclusive_scan_or(int32 v)
|
||||
unsigned int32 exclusive_scan_or(unsigned int32 v)
|
||||
int64 exclusive_scan_or(int64 v)
|
||||
unsigned int64 exclusive_scan_or(unsigned int64 v)
|
||||
|
||||
|
||||
Packed Load and Store Operations
|
||||
@@ -1921,6 +2082,18 @@ function returns the 16 bits that are the closest match to the given
|
||||
int16 float_to_half(float f)
|
||||
uniform int16 float_to_half(uniform float f)
|
||||
|
||||
There are also faster versions of these functions that don't worry about
handling floating point infinity, "not a number", and denormalized numbers
correctly; they are faster than the functions above, but less precise.
|
||||
|
||||
::
|
||||
|
||||
float half_to_float_fast(unsigned int16 h)
|
||||
uniform float half_to_float_fast(uniform unsigned int16 h)
|
||||
int16 float_to_half_fast(float f)
|
||||
uniform int16 float_to_half_fast(uniform float f)
|
||||
|
||||
|
||||
Atomic Operations and Memory Fences
|
||||
-----------------------------------
|
||||
@@ -1941,12 +2114,12 @@ end.)
|
||||
|
||||
One thing to note is that the value being added to here is a
|
||||
``uniform`` integer, while the increment amount and the return value are
|
||||
``varying``. In other words, the semantics are that each running program
|
||||
instance individually issues the atomic operation with its own ``delta``
|
||||
value and gets the previous value of ``val`` back in return. The atomics
|
||||
for the running program instances may be issued in arbitrary order; it's
|
||||
not guaranteed that they will be issued in ``programIndex`` order, for
|
||||
example.
|
||||
``varying``. In other words, the semantics of this call are that each
|
||||
running program instance individually issues the atomic operation with its
|
||||
own ``delta`` value and gets the previous value of ``val`` back in return.
|
||||
The atomics for the running program instances may be issued in arbitrary
|
||||
order; it's not guaranteed that they will be issued in ``programIndex``
|
||||
order, for example.
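
The per-instance behavior can be modeled in C++ as follows; this is only
a sketch of what the atomic add does logically for each running lane
(using ``std::atomic`` to stand in for the hardware atomic), not how the
compiler actually implements it.

::

    #include <atomic>
    #include <stdint.h>

    // Logical model: each active program instance issues its own atomic
    // add with its own delta and gets the previous value of 'val' back.
    // The order in which the lanes issue their operations is unspecified.
    static void gang_atomic_add(std::atomic<int32_t> &val,
                                const int32_t *delta, const bool *active,
                                int laneCount, int32_t *oldValues) {
        for (int i = 0; i < laneCount; ++i)
            if (active[i])
                oldValues[i] = val.fetch_add(delta[i]);
    }
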
|
||||
|
||||
Here are the declarations of the ``int32`` variants of these functions.
|
||||
There are also ``int64`` equivalents as well as variants that take
|
||||
@@ -1964,17 +2137,44 @@ function can be used with ``float`` and ``double`` types as well.)
|
||||
int32 atomic_xor_global(reference uniform int32 val, int32 value)
|
||||
int32 atomic_swap_global(reference uniform int32 val, int32 newval)
|
||||
|
||||
There is also an atomic "compare and exchange" function; it atomically
|
||||
compares the value in "val" to "compare"--if they match, it assigns
|
||||
"newval" to "val". In either case, the old value of "val" is returned.
|
||||
(As with the other atomic operations, there are also ``unsigned`` and
|
||||
64-bit variants of this function. Furthermore, there are ``float`` and
|
||||
``double`` variants as well.)
|
||||
There are also variants of these functions that take ``uniform`` values for
|
||||
the operand and return a ``uniform`` result:
|
||||
|
||||
::
|
||||
|
||||
uniform int32 atomic_add_global(reference uniform int32 val,
|
||||
uniform int32 value)
|
||||
uniform int32 atomic_subtract_global(reference uniform int32 val,
|
||||
uniform int32 value)
|
||||
uniform int32 atomic_min_global(reference uniform int32 val,
|
||||
uniform int32 value)
|
||||
uniform int32 atomic_max_global(reference uniform int32 val,
|
||||
uniform int32 value)
|
||||
uniform int32 atomic_and_global(reference uniform int32 val,
|
||||
uniform int32 value)
|
||||
uniform int32 atomic_or_global(reference uniform int32 val,
|
||||
uniform int32 value)
|
||||
uniform int32 atomic_xor_global(reference uniform int32 val,
|
||||
uniform int32 value)
|
||||
uniform int32 atomic_swap_global(reference uniform int32 val,
|
||||
uniform int32 newval)
|
||||
|
||||
There are also atomic swap and "compare and exchange" functions.
|
||||
Compare and exchange atomically compares the value in "val" to
|
||||
"compare"--if they match, it assigns "newval" to "val". In either case,
|
||||
the old value of "val" is returned. (As with the other atomic operations,
|
||||
there are also ``unsigned`` and 64-bit variants of this function.
|
||||
Furthermore, there are ``float`` and ``double`` variants as well.)
|
||||
|
||||
::
|
||||
|
||||
int32 atomic_swap_global(reference uniform int32 val, int32 new)
|
||||
uniform int32 atomic_swap_global(reference uniform int32 val,
|
||||
uniform int32 new)
|
||||
int32 atomic_compare_exchange_global(reference uniform int32 val,
|
||||
int32 compare, int32 newval)
|
||||
uniform int32 atomic_compare_exchange_global(reference uniform int32 val,
|
||||
uniform int32 compare, uniform int32 newval)
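
For reference, the compare-and-exchange behavior described above matches
the usual CPU primitive; a C++ model of the single-value (``uniform``)
variant might look like the sketch below--an illustration of the
semantics, not of the actual implementation.

::

    #include <atomic>
    #include <stdint.h>

    // Atomically: if 'val' equals 'compare', store 'newval'; in either
    // case, return the value that was in 'val' before the operation.
    static int32_t model_compare_exchange(std::atomic<int32_t> &val,
                                          int32_t compare, int32_t newval) {
        int32_t expected = compare;
        // On failure, compare_exchange_strong() writes the observed value
        // into 'expected', which is exactly the old value we want.
        val.compare_exchange_strong(expected, newval);
        return expected;
    }
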
|
||||
|
||||
``ispc`` also has a standard library routine that inserts a memory barrier
|
||||
into the code; it ensures that all memory reads and writes prior to be
|
||||
@@ -1990,6 +2190,53 @@ code.
|
||||
void memory_barrier();
|
||||
|
||||
|
||||
Prefetches
|
||||
----------
|
||||
|
||||
The standard library has a variety of functions to prefetch data into the
|
||||
processor's cache. While modern CPUs have automatic prefetchers that do a
|
||||
reasonable job of prefetching data to the cache before it's needed, high
|
||||
performance applications may find it helpful to prefetch data before it's
|
||||
needed.
|
||||
|
||||
For example, this code shows how to prefetch data to the processor's L1
|
||||
cache while iterating over the items in an array.
|
||||
|
||||
::
|
||||
|
||||
uniform int32 array[...];
|
||||
for (uniform int i = 0; i < count; ++i) {
|
||||
// do computation with array[i]
|
||||
prefetch_l1(array[i+32]);
|
||||
}
|
||||
|
||||
The standard library has routines to prefetch to the L1, L2, and L3
|
||||
caches. It also has a variant, ``prefetch_nt()``, that indicates that the
|
||||
value being prefetched isn't expected to be used more than once (so should
|
||||
be high priority to be evicted from the cache).
|
||||
|
||||
::
|
||||
|
||||
void prefetch_{l1,l2,l3,nt}(reference TYPE)
|
||||
|
||||
These functions are available for all of the basic types in the
|
||||
language--``int8``, ``int16``, ``int32``, ``float``, and so forth.
|
||||
|
||||
|
||||
System Information
|
||||
------------------
|
||||
|
||||
A routine is available to find the number of CPU cores available in the
|
||||
system:
|
||||
|
||||
::
|
||||
|
||||
int num_cores()
|
||||
|
||||
This value can be useful for adapting the granularity of parallel task
|
||||
decomposition depending on the number of processors in the system.
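
For example, on the application side one might choose the number of
launched tasks based on the core count; the C++ fragment below uses
``std::thread::hardware_concurrency()`` as an analogous query in
application code (this is an assumption for illustration, not part of
the ``ispc`` standard library).

::

    #include <thread>
    #include <algorithm>

    // Pick a launch granularity: a few tasks per core gives the task
    // system room to load-balance without excessive scheduling overhead.
    static int chooseTaskCount(int workItems) {
        int cores = (int)std::thread::hardware_concurrency();
        if (cores <= 0)
            cores = 1;                      // the query may fail; fall back
        int tasks = 4 * cores;              // ~4 tasks per core (heuristic)
        return std::min(tasks, workItems);  // never more tasks than work
    }
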
|
||||
|
||||
|
||||
Low-Level Bits
|
||||
--------------
|
||||
|
||||
@@ -2097,14 +2344,14 @@ Both the ``foo`` and ``bar`` global variables can be accessed on each
|
||||
side.
|
||||
|
||||
``ispc`` code can also call back to C/C++. On the ``ispc`` side, any
|
||||
application functions to be called must be declared with the ``export "C"``
|
||||
application functions to be called must be declared with the ``extern "C"``
|
||||
qualifier.
|
||||
|
||||
::
|
||||
|
||||
extern "C" void foo(uniform float f, uniform float g);
|
||||
|
||||
Unlike in C++, ``export "C"`` doesn't take braces to delineate
|
||||
Unlike in C++, ``extern "C"`` doesn't take braces to delineate
|
||||
multiple functions to be declared; thus, multiple C functions to be called
|
||||
from ``ispc`` must be declared as follows:
|
||||
|
||||
@@ -2699,6 +2946,123 @@ active upon function entry.
|
||||
ao.ispc(0088) - function entry: 36928 calls (0 / 0.00% all off!), 97.40% active lanes
|
||||
...
|
||||
|
||||
|
||||
Using Scan Operations For Variable Output
|
||||
-----------------------------------------
|
||||
|
||||
One important application of the ``exclusive_scan_add()`` function in the
|
||||
standard library is when program instances want to generate a variable amount
|
||||
of output and when one would like that output to be densely packed in a
|
||||
single array. For example, consider the code fragment below:
|
||||
|
||||
::
|
||||
|
||||
uniform int func(uniform float outArray[], ...) {
|
||||
int numOut = ...; // figure out how many to be output
|
||||
float outLocal[MAX_OUT]; // staging area
|
||||
// put results in outLocal[0], ..., outLocal[numOut-1]
|
||||
int startOffset = exclusive_scan_add(numOut);
|
||||
for (int i = 0; i < numOut; ++i)
|
||||
outArray[startOffset + i] = outLocal[i];
|
||||
return reduce_add(numOut);
|
||||
}
|
||||
|
||||
Here, each program instance has computed a number, ``numOut``, of values to
|
||||
output, and has stored them in the ``outLocal`` array. Assume that four
|
||||
program instances are running and that the first one wants to output one
|
||||
value, the second two values, and the third and fourth three values each.
|
||||
In this case, ``exclusive_scan_add()`` will return the values (0, 1, 3, 6)
|
||||
to the four program instances, respectively. The first program instance
|
||||
will write its one result to ``outArray[0]``, the second will write its two
|
||||
values to ``outArray[1]`` and ``outArray[2]``, and so forth. The
|
||||
``reduce_add`` call at the end returns the total number of values that the
|
||||
program instances have written to the array.
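
The same pattern, written serially in C++, may make the indexing
clearer: an exclusive prefix sum of the per-element output counts gives
each element its starting offset in the densely packed result.  (This is
a sketch with hypothetical names such as ``packOutputs()``, not code
from the standard library.)

::

    #include <vector>

    // Densely pack a variable number of outputs per input element.
    // 'counts[i]' is how many values element i produces; 'produce(i, j)'
    // stands in for whatever computes the j-th output of element i.
    template <typename ProduceFn>
    std::vector<float> packOutputs(const std::vector<int> &counts,
                                   ProduceFn produce) {
        // Exclusive scan: offsets[i] = counts[0] + ... + counts[i-1].
        std::vector<int> offsets(counts.size());
        int total = 0;
        for (size_t i = 0; i < counts.size(); ++i) {
            offsets[i] = total;
            total += counts[i];
        }
        std::vector<float> out(total);
        for (size_t i = 0; i < counts.size(); ++i)
            for (int j = 0; j < counts[i]; ++j)
                out[offsets[i] + j] = produce((int)i, j);
        return out;
    }
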
|
||||
|
||||
Application-Supplied Execution Masks
|
||||
------------------------------------
|
||||
|
||||
Recall that when execution transitions from the application code to an
|
||||
``ispc`` function, all of the program instances are initially executing.
|
||||
In some cases, it may be desired that only some of them are running, based on
|
||||
a data-dependent condition computed in the application program. This
|
||||
situation can easily be handled via an additional parameter from the
|
||||
application.
|
||||
|
||||
As a simple example, consider a case where the application code has an
|
||||
array of ``float`` values and we'd like the ``ispc`` code to update
|
||||
just specific values in that array, where which of those values to be
|
||||
updated has been determined by the application. In C++ code, we might
|
||||
have:
|
||||
|
||||
::
|
||||
|
||||
int count = ...;
|
||||
float *array = new float[count];
|
||||
bool *shouldUpdate = new bool[count];
|
||||
// initialize array and shouldUpdate
|
||||
ispc_func(array, shouldUpdate, count);
|
||||
|
||||
Then, the ``ispc`` code could process this update as:
|
||||
|
||||
::
|
||||
|
||||
export void ispc_func(uniform float array[], uniform bool update[],
|
||||
uniform int count) {
|
||||
for (uniform int i = 0; i < count; i += programCount) {
|
||||
cif (update[i+programIndex] == true)
|
||||
// update array[i+programIndex]...
|
||||
}
|
||||
}
|
||||
|
||||
(In this case a "coherent" if statement is likely to be worthwhile if the
|
||||
``update`` array will tend to have sections that are either all-true or
|
||||
all-false.)
|
||||
|
||||
Explicit Vector Programming With Uniform Short Vector Types
|
||||
-----------------------------------------------------------
|
||||
|
||||
The typical model for programming in ``ispc`` is an *implicit* parallel
|
||||
model, where one writes a program that is apparently doing scalar
|
||||
computation on values and the program is then vectorized to run in parallel
|
||||
across the SIMD lanes of a processor. However, ``ispc`` also has some
|
||||
support for explicit vector unit programming, where the vectorization is
|
||||
explicit. Some computations may be more effectively described in the
|
||||
explicit model rather than the implicit model.
|
||||
|
||||
This support is provided via ``uniform`` instances of short vectors
|
||||
(as were introduced in the `Short Vector Types`_ section). Specifically,
|
||||
if this short program
|
||||
|
||||
::
|
||||
|
||||
export uniform float<8> madd(uniform float<8> a,
|
||||
uniform float<8> b, uniform float<8> c) {
|
||||
return a + b * c;
|
||||
}
|
||||
|
||||
is compiled with the AVX target, ``ispc`` generates the following assembly:
|
||||
|
||||
::
|
||||
_madd:
|
||||
vmulps %ymm2, %ymm1, %ymm1
|
||||
vaddps %ymm0, %ymm1, %ymm0
|
||||
ret
|
||||
|
||||
(And similarly, if compiled with a 4-wide SSE target, two ``mulps`` and two
|
||||
``addps`` instructions are generated, and so forth.)
|
||||
|
||||
Note that ``ispc`` doesn't currently support control-flow based on
|
||||
``uniform`` short vector types; it is thus not possible to write code like:
|
||||
|
||||
::
|
||||
|
||||
export uniform int<8> count(uniform float<8> a, uniform float<8> b) {
|
||||
uniform int<8> sum = 0;
|
||||
while (a++ < b)
|
||||
++sum;
|
||||
}
|
||||
|
||||
|
||||
Disclaimer and Legal Information
|
||||
================================
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler"
|
||||
# This could be handy for archiving the generated documentation or
|
||||
# if some version control system is used.
|
||||
|
||||
PROJECT_NUMBER = 1.0.5
|
||||
PROJECT_NUMBER = 1.0.10
|
||||
|
||||
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
|
||||
# base path where the generated documentation will be put.
|
||||
|
||||
@@ -13,6 +13,7 @@ against regular serial C++ implementations, printing out a comparison of
|
||||
the runtimes and the speedup delivered by ispc. It may be instructive to
|
||||
do a side-by-side diff of the C++ and ispc implementations of these
|
||||
algorithms to learn more about writing ispc code.
|
||||
|
||||
|
||||
AOBench
|
||||
=======
|
||||
@@ -27,6 +28,7 @@ It executes the program for the given number of iterations, rendering an
|
||||
(xres x yres) image each time and measuring the computation time with both
|
||||
serial and ispc implementations.
|
||||
|
||||
|
||||
AOBench_Instrumented
|
||||
====================
|
||||
|
||||
@@ -40,12 +42,47 @@ is provided in the instrument.cpp file.
|
||||
*** Note: on Linux, this example currently hits an assertion in LLVM during
|
||||
*** compilation
|
||||
|
||||
|
||||
Deferred
|
||||
========
|
||||
|
||||
This example shows an extensive example of using ispc for efficient
|
||||
deferred shading of scenes with thousands of lights; it's an implementation
|
||||
of the algorithm that Johan Andersson described at SIGGRAPH 2009,
|
||||
implemented by Andrew Lauritzen and Jefferson Montgomery. The basic idea
|
||||
is that a pre-rendered G-buffer is partitioned into tiles, and in each
|
||||
tile, the set of lights that contribute to the tile is first computed.
|
||||
Then, the pixels in the tile are shaded using just those light
|
||||
sources. (See slides 19-29 of
|
||||
http://s09.idav.ucdavis.edu/talks/04-JAndersson-ParallelFrostbite-Siggraph09.pdf
|
||||
for more details on the algorithm.)
|
||||
|
||||
This directory includes three implementations of the algorithm:
|
||||
|
||||
- An ispc implementation that first does a static partitioning of the
|
||||
screen into tiles to parallelize across the CPU cores. Within each tile
|
||||
ispc kernels provide highly efficient implementations of the light
|
||||
culling and shading calculations.
|
||||
- A "best practices" serial C++ implementation. This implementation does a
|
||||
dynamic partitioning of the screen, refining tiles with significant Z
|
||||
depth complexity (these tiles often have a large number of lights that
|
||||
affect them). Within each final tile, the pixels are shaded using
|
||||
regular C++ code.
|
||||
- If the Cilk extensions are available in your compiler, an ispc
|
||||
implementation that uses Cilk will also be built.
|
||||
(See http://software.intel.com/en-us/articles/intel-cilk-plus/). Like
|
||||
the "best practices" serial implementation, this version does dynamic
|
||||
tile partitioning for better load balancing and then uses ispc for the
|
||||
light culling and shading.
|
||||
|
||||
|
||||
Mandelbrot
|
||||
==========
|
||||
|
||||
Mandelbrot set generation. This example is extensively documented at the
|
||||
http://ispc.github.com/example.html page.
|
||||
|
||||
|
||||
Mandelbrot_tasks
|
||||
================
|
||||
|
||||
@@ -58,6 +95,7 @@ using tasks with ispc, no task system is mandated; the user is free to plug
|
||||
in any task system they want, for ease of interoperating with existing task
|
||||
systems.
|
||||
|
||||
|
||||
Noise
|
||||
=====
|
||||
|
||||
@@ -71,6 +109,7 @@ Options
|
||||
This program implements both the Black-Scholes and Binomial options pricing
|
||||
models in both ispc and regular serial C++ code.
|
||||
|
||||
|
||||
RT
|
||||
==
|
||||
|
||||
@@ -87,9 +126,25 @@ and triangle intersection code from pbrt; see the pbrt source code and/or
|
||||
"Physically Based Rendering" book for more about the basic algorithmic
|
||||
details.
|
||||
|
||||
|
||||
Simple
|
||||
======
|
||||
|
||||
This is a simple "hello world" type program that shows a ~10 line
|
||||
application program calling out to a ~5 line ispc program to do a simple
|
||||
computation.
|
||||
|
||||
|
||||
Volume
|
||||
======
|
||||
|
||||
Ray-marching volume rendering, with single scattering lighting model. To
|
||||
run it, specify a camera parameter file and a volume density file, e.g.:
|
||||
|
||||
volume camera.dat density_highres.vol
|
||||
|
||||
(See, e.g. Chapters 11 and 16 of "Physically Based Rendering" for
|
||||
information about the algorithm implemented here.) The volume data set
|
||||
included here was generated by the example implementation of the "Wavelet
|
||||
Turbulence for Fluid Simulation" SIGGRAPH 2008 paper by Kim et
|
||||
al. (http://www.cs.cornell.edu/~tedkim/WTURB/)
|
||||
|
||||
@@ -1,8 +1,14 @@
|
||||
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall
|
||||
ARCH = $(shell uname)
|
||||
|
||||
TASK_CXX=../tasksys.cpp
|
||||
TASK_LIB=-lpthread
|
||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --fast-math --arch=x86-64
|
||||
ISPCFLAGS=-O2 --target=sse4 --arch=x86-64
|
||||
|
||||
default: ao
|
||||
|
||||
@@ -14,12 +20,15 @@ dirs:
|
||||
clean:
|
||||
/bin/rm -rf objs *~ ao
|
||||
|
||||
ao: dirs objs/ao.o objs/ao_serial.o objs/ao_ispc.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/ao_serial.o -lm -lpthread
|
||||
ao: dirs objs/ao.o objs/ao_serial.o objs/ao_ispc.o $(TASK_OBJ)
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/ao_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/%.o: ../%.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/ao.o: objs/ao_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
|
||||
@@ -101,6 +101,7 @@ savePPM(const char *fname, int w, int h)
|
||||
fprintf(fp, "255\n");
|
||||
fwrite(img, w * h * 3, 1, fp);
|
||||
fclose(fp);
|
||||
printf("Wrote image file %s\n", fname);
|
||||
}
|
||||
|
||||
|
||||
@@ -172,10 +173,30 @@ int main(int argc, char **argv)
|
||||
}
|
||||
|
||||
// Report results and save image
|
||||
printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n", minTimeISPC,
|
||||
width, height);
|
||||
printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n",
|
||||
minTimeISPC, width, height);
|
||||
savePPM("ao-ispc.ppm", width, height);
|
||||
|
||||
//
|
||||
// Run the ispc + tasks path, test_iterations times, and report the
|
||||
// minimum time for any of them.
|
||||
//
|
||||
double minTimeISPCTasks = 1e30;
|
||||
for (unsigned int i = 0; i < test_iterations; i++) {
|
||||
memset((void *)fimg, 0, sizeof(float) * width * height * 3);
|
||||
assert(NSUBSAMPLES == 2);
|
||||
|
||||
reset_and_start_timer();
|
||||
ao_ispc_tasks(width, height, NSUBSAMPLES, fimg);
|
||||
double t = get_elapsed_mcycles();
|
||||
minTimeISPCTasks = std::min(minTimeISPCTasks, t);
|
||||
}
|
||||
|
||||
// Report results and save image
|
||||
printf("[aobench ispc + tasks]:\t\t[%.3f] M cycles (%d x %d image)\n",
|
||||
minTimeISPCTasks, width, height);
|
||||
savePPM("ao-ispc-tasks.ppm", width, height);
|
||||
|
||||
//
|
||||
// Run the serial path, again test_iteration times, and report the
|
||||
// minimum time.
|
||||
@@ -192,7 +213,8 @@ int main(int argc, char **argv)
|
||||
// Report more results, save another image...
|
||||
printf("[aobench serial]:\t\t[%.3f] M cycles (%d x %d image)\n", minTimeSerial,
|
||||
width, height);
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minTimeSerial / minTimeISPC);
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
|
||||
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
|
||||
savePPM("ao-serial.ppm", width, height);
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -203,8 +203,9 @@ ambient_occlusion(reference Isect isect, reference Plane plane,
|
||||
/* Compute the image for the scanlines from [y0,y1), for an overall image
|
||||
of width w and height h.
|
||||
*/
|
||||
void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
|
||||
uniform int nsubsamples, reference uniform float image[]) {
|
||||
static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
|
||||
uniform int h, uniform int nsubsamples,
|
||||
reference uniform float image[]) {
|
||||
static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
|
||||
static Sphere spheres[3] = {
|
||||
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
|
||||
@@ -231,6 +232,9 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
|
||||
// direction we do per iteration and ny the number in y.
|
||||
uniform int nx = 1, ny = 1;
|
||||
|
||||
// FIXME: We actually need ny to be 1 regardless of the decomposition,
|
||||
// since the task decomposition is one scanline high.
|
||||
|
||||
if (programCount == 8) {
|
||||
// Do two pixels at once in the x direction
|
||||
nx = 2;
|
||||
@@ -239,19 +243,21 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
|
||||
++du;
|
||||
}
|
||||
else if (programCount == 16) {
|
||||
// Two at once in both x and y
|
||||
nx = ny = 2;
|
||||
if ((programIndex >= 4 && programIndex < 8) || programIndex >= 12)
|
||||
nx = 4;
|
||||
ny = 1;
|
||||
if (programIndex >= 4 && programIndex < 8)
|
||||
++du;
|
||||
if (programIndex >= 8)
|
||||
++dv;
|
||||
if (programIndex >= 8 && programIndex < 12)
|
||||
du += 2;
|
||||
if (programIndex >= 12)
|
||||
du += 3;
|
||||
}
|
||||
|
||||
// Now loop over all of the pixels, stepping in x and y as calculated
|
||||
// above. (Assumes that ny divides y and nx divides x...)
|
||||
for (uniform int y = y0; y < y1; y += ny) {
|
||||
for (uniform int x = 0; x < w; x += nx) {
|
||||
// Figur out x,y pixel in NDC
|
||||
// Figure out x,y pixel in NDC
|
||||
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
|
||||
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
|
||||
float ret = 0.f;
|
||||
@@ -293,7 +299,7 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
|
||||
|
||||
// offset to the first pixel in the image
|
||||
uniform int offset = 3 * (y * w + x);
|
||||
for (uniform int p = 0; p < programCount; p += 4, ++offset) {
|
||||
for (uniform int p = 0; p < programCount; p += 4, offset += 3) {
|
||||
// Get the four sample values for this pixel
|
||||
uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
|
||||
retArray[p+3];
|
||||
@@ -315,3 +321,15 @@ export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
|
||||
uniform float image[]) {
|
||||
ao_scanlines(0, h, w, h, nsubsamples, image);
|
||||
}
|
||||
|
||||
|
||||
static void task ao_task(uniform int width, uniform int height,
|
||||
uniform int nsubsamples, uniform float image[]) {
|
||||
ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image);
|
||||
}
|
||||
|
||||
|
||||
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
|
||||
uniform float image[]) {
|
||||
launch[h] < ao_task(w, h, nsubsamples, image) >;
|
||||
}
|
||||
|
||||
@@ -140,7 +140,7 @@ ray_plane_intersect(Isect &isect, Ray &ray,
|
||||
float d = -dot(plane.p, plane.n);
|
||||
float v = dot(ray.dir, plane.n);
|
||||
|
||||
if (fabsf(v) < 1.0e-17)
|
||||
if (fabsf(v) < 1.0e-17f)
|
||||
return;
|
||||
else {
|
||||
float t = -(dot(ray.org, plane.n) + d) / v;
|
||||
@@ -183,11 +183,11 @@ orthoBasis(vec basis[3], const vec &n) {
|
||||
basis[2] = n;
|
||||
basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
|
||||
|
||||
if ((n.x < 0.6) && (n.x > -0.6)) {
|
||||
if ((n.x < 0.6f) && (n.x > -0.6f)) {
|
||||
basis[1].x = 1.0;
|
||||
} else if ((n.y < 0.6) && (n.y > -0.6)) {
|
||||
} else if ((n.y < 0.6f) && (n.y > -0.6f)) {
|
||||
basis[1].y = 1.0;
|
||||
} else if ((n.z < 0.6) && (n.z > -0.6)) {
|
||||
} else if ((n.z < 0.6f) && (n.z > -0.6f)) {
|
||||
basis[1].z = 1.0;
|
||||
} else {
|
||||
basis[1].x = 1.0;
|
||||
@@ -224,7 +224,7 @@ ambient_occlusion(Isect &isect, Plane &plane,
|
||||
float phi = 2.0f * M_PI * drand48();
|
||||
float x = cosf(phi) * theta;
|
||||
float y = sinf(phi) * theta;
|
||||
float z = sqrtf(1.0 - theta * theta);
|
||||
float z = sqrtf(1.0f - theta * theta);
|
||||
|
||||
// local . global
|
||||
float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
|
||||
@@ -236,14 +236,14 @@ ambient_occlusion(Isect &isect, Plane &plane,
|
||||
ray.dir.y = ry;
|
||||
ray.dir.z = rz;
|
||||
|
||||
occIsect.t = 1.0e+17;
|
||||
occIsect.t = 1.0e+17f;
|
||||
occIsect.hit = 0;
|
||||
|
||||
for (int snum = 0; snum < 3; ++snum)
|
||||
ray_sphere_intersect(occIsect, ray, spheres[snum]);
|
||||
ray_plane_intersect (occIsect, ray, plane);
|
||||
|
||||
if (occIsect.hit) occlusion += 1.0;
|
||||
if (occIsect.hit) occlusion += 1.f;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -280,10 +280,10 @@ static void ao_scanlines(int y0, int y1, int w, int h, int nsubsamples,
|
||||
|
||||
ray.dir.x = px;
|
||||
ray.dir.y = py;
|
||||
ray.dir.z = -1.0;
|
||||
ray.dir.z = -1.0f;
|
||||
vnormalize(ray.dir);
|
||||
|
||||
isect.t = 1.0e+17;
|
||||
isect.t = 1.0e+17f;
|
||||
isect.hit = 0;
|
||||
|
||||
for (int snum = 0; snum < 3; ++snum)
|
||||
|
||||
3
examples/aobench/aobench.vcxproj
Executable file → Normal file
3
examples/aobench/aobench.vcxproj
Executable file → Normal file
@@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
@@ -21,6 +21,7 @@
|
||||
<ItemGroup>
|
||||
<ClCompile Include="ao.cpp" />
|
||||
<ClCompile Include="ao_serial.cpp" />
|
||||
<ClCompile Include="../tasksys.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="ao.ispc">
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -g3 -Wall
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --fast-math --instrument --arch=x86-64
|
||||
ISPCFLAGS=-O2 --instrument --arch=x86-64
|
||||
|
||||
default: ao
|
||||
|
||||
|
||||
@@ -100,6 +100,7 @@ savePPM(const char *fname, int w, int h)
|
||||
fprintf(fp, "255\n");
|
||||
fwrite(img, w * h * 3, 1, fp);
|
||||
fclose(fp);
|
||||
printf("Wrote image file %s\n", fname);
|
||||
}
|
||||
|
||||
|
||||
|
||||
0
examples/aobench_instrumented/aobench_instrumented.vcxproj
Executable file → Normal file
0
examples/aobench_instrumented/aobench_instrumented.vcxproj
Executable file → Normal file
42
examples/deferred/Makefile
Normal file
42
examples/deferred/Makefile
Normal file
@@ -0,0 +1,42 @@
|
||||
|
||||
ARCH = $(shell uname)
|
||||
|
||||
TASK_CXX=../tasks_pthreads.cpp
|
||||
TASK_LIB=-lpthread
|
||||
|
||||
ifeq ($(ARCH), Darwin)
|
||||
TASK_CXX=../tasks_gcd.cpp
|
||||
TASK_LIB=
|
||||
endif
|
||||
|
||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64 --math-lib=fast
|
||||
|
||||
OBJS=objs/main.o objs/common.o objs/kernels_ispc.o objs/dynamic_c.o objs/dynamic_cilk.o
|
||||
|
||||
default: deferred_shading
|
||||
|
||||
.PHONY: dirs clean
|
||||
.PRECIOUS: objs/kernels_ispc.h
|
||||
|
||||
dirs:
|
||||
/bin/mkdir -p objs/
|
||||
|
||||
clean:
|
||||
/bin/rm -rf objs *~ deferred_shading
|
||||
|
||||
deferred_shading: dirs $(OBJS) $(TASK_OBJ)
|
||||
$(CXX) $(CXXFLAGS) -o $@ $(OBJS) $(TASK_OBJ) -lm $(TASK_LIB)
|
||||
|
||||
objs/%.o: %.cpp objs/kernels_ispc.h deferred.h
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/%.o: ../%.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
209
examples/deferred/common.cpp
Normal file
209
examples/deferred/common.cpp
Normal file
@@ -0,0 +1,209 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define ISPC_IS_WINDOWS
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include <stdint.h>
|
||||
#include <algorithm>
|
||||
#include <assert.h>
|
||||
#include <vector>
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <windows.h>
|
||||
#endif
|
||||
#ifdef ISPC_IS_LINUX
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
#include "deferred.h"
|
||||
#include "../timing.h"
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
static void *
|
||||
lAlignedMalloc(int64_t size, int32_t alignment) {
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
return _aligned_malloc(size, alignment);
|
||||
#endif
|
||||
#ifdef ISPC_IS_LINUX
|
||||
return memalign(alignment, size);
|
||||
#endif
|
||||
#ifdef ISPC_IS_APPLE
|
||||
void *mem = malloc(size + (alignment-1) + sizeof(void*));
|
||||
char *amem = ((char*)mem) + sizeof(void*);
|
||||
amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
|
||||
(alignment - 1)));
|
||||
((void**)amem)[-1] = mem;
|
||||
return amem;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
lAlignedFree(void *ptr) {
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
_aligned_free(ptr);
|
||||
#endif
|
||||
#ifdef ISPC_IS_LINUX
|
||||
free(ptr);
|
||||
#endif
|
||||
#ifdef ISPC_IS_APPLE
|
||||
free(((void**)ptr)[-1]);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
Framebuffer::Framebuffer(int width, int height) {
|
||||
nPixels = width*height;
|
||||
r = (uint8_t *)lAlignedMalloc(nPixels, ALIGNMENT_BYTES);
|
||||
g = (uint8_t *)lAlignedMalloc(nPixels, ALIGNMENT_BYTES);
|
||||
b = (uint8_t *)lAlignedMalloc(nPixels, ALIGNMENT_BYTES);
|
||||
}
|
||||
|
||||
|
||||
Framebuffer::~Framebuffer() {
|
||||
lAlignedFree(r);
|
||||
lAlignedFree(g);
|
||||
lAlignedFree(b);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
Framebuffer::clear() {
|
||||
memset(r, 0, nPixels);
|
||||
memset(g, 0, nPixels);
|
||||
memset(b, 0, nPixels);
|
||||
}
|
||||
|
||||
InputData *
|
||||
CreateInputDataFromFile(const char *path) {
|
||||
FILE *in = fopen(path, "rb");
|
||||
if (!in) return 0;
|
||||
|
||||
InputData *input = new InputData;
|
||||
|
||||
// Load header
|
||||
if (fread(&input->header, sizeof(ispc::InputHeader), 1, in) != 1) {
|
||||
fprintf(stderr, "Preumature EOF reading file \"%s\"\n", path);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Load data chunk and update pointers
|
||||
input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize,
|
||||
ALIGNMENT_BYTES);
|
||||
if (fread(input->chunk, input->header.inputDataChunkSize, 1, in) != 1) {
|
||||
fprintf(stderr, "Preumature EOF reading file \"%s\"\n", path);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
input->arrays.zBuffer =
|
||||
(float *)&input->chunk[input->header.inputDataArrayOffsets[idaZBuffer]];
|
||||
input->arrays.normalEncoded_x =
|
||||
(uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_x]];
|
||||
input->arrays.normalEncoded_y =
|
||||
(uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_y]];
|
||||
input->arrays.specularAmount =
|
||||
(uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularAmount]];
|
||||
input->arrays.specularPower =
|
||||
(uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularPower]];
|
||||
input->arrays.albedo_x =
|
||||
(uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_x]];
|
||||
input->arrays.albedo_y =
|
||||
(uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_y]];
|
||||
input->arrays.albedo_z =
|
||||
(uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_z]];
|
||||
input->arrays.lightPositionView_x =
|
||||
(float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_x]];
|
||||
input->arrays.lightPositionView_y =
|
||||
(float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_y]];
|
||||
input->arrays.lightPositionView_z =
|
||||
(float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_z]];
|
||||
input->arrays.lightAttenuationBegin =
|
||||
(float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationBegin]];
|
||||
input->arrays.lightColor_x =
|
||||
(float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_x]];
|
||||
input->arrays.lightColor_y =
|
||||
(float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_y]];
|
||||
input->arrays.lightColor_z =
|
||||
(float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_z]];
|
||||
input->arrays.lightAttenuationEnd =
|
||||
(float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationEnd]];
|
||||
|
||||
fclose(in);
|
||||
return input;
|
||||
}
|
||||
|
||||
|
||||
void DeleteInputData(InputData *input)
|
||||
{
|
||||
lAlignedFree(input->chunk);
|
||||
}
|
||||
|
||||
|
||||
void WriteFrame(const char *filename, const InputData *input,
|
||||
const Framebuffer &framebuffer) {
|
||||
// Deswizzle and copy to RGBA output
|
||||
// Doesn't need to be fast... only happens once
|
||||
size_t imageBytes = 3 * input->header.framebufferWidth *
|
||||
input->header.framebufferHeight;
|
||||
uint8_t* framebufferAOS = (uint8_t *)lAlignedMalloc(imageBytes, ALIGNMENT_BYTES);
|
||||
memset(framebufferAOS, 0, imageBytes);
|
||||
|
||||
for (int i = 0; i < input->header.framebufferWidth *
|
||||
input->header.framebufferHeight; ++i) {
|
||||
framebufferAOS[3 * i + 0] = framebuffer.r[i];
|
||||
framebufferAOS[3 * i + 1] = framebuffer.g[i];
|
||||
framebufferAOS[3 * i + 2] = framebuffer.b[i];
|
||||
}
|
||||
|
||||
// Write out simple PPM file
|
||||
FILE *out = fopen(filename, "wb");
|
||||
fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth,
|
||||
input->header.framebufferHeight);
|
||||
fwrite(framebufferAOS, imageBytes, 1, out);
|
||||
|
||||
lAlignedFree(framebufferAOS);
|
||||
}
|
||||
BIN
examples/deferred/data/pp1280x720.bin
Normal file
BIN
examples/deferred/data/pp1280x720.bin
Normal file
Binary file not shown.
BIN
examples/deferred/data/pp1920x1200.bin
Normal file
BIN
examples/deferred/data/pp1920x1200.bin
Normal file
Binary file not shown.
108
examples/deferred/deferred.h
Normal file
108
examples/deferred/deferred.h
Normal file
@@ -0,0 +1,108 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef DEFERRED_H
|
||||
#define DEFERRED_H
|
||||
|
||||
// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
|
||||
#define MIN_TILE_WIDTH 16
|
||||
#define MIN_TILE_HEIGHT 16
|
||||
#define MAX_LIGHTS 1024
|
||||
|
||||
enum InputDataArraysEnum {
|
||||
idaZBuffer = 0,
|
||||
idaNormalEncoded_x,
|
||||
idaNormalEncoded_y,
|
||||
idaSpecularAmount,
|
||||
idaSpecularPower,
|
||||
idaAlbedo_x,
|
||||
idaAlbedo_y,
|
||||
idaAlbedo_z,
|
||||
idaLightPositionView_x,
|
||||
idaLightPositionView_y,
|
||||
idaLightPositionView_z,
|
||||
idaLightAttenuationBegin,
|
||||
idaLightColor_x,
|
||||
idaLightColor_y,
|
||||
idaLightColor_z,
|
||||
idaLightAttenuationEnd,
|
||||
|
||||
idaNum
|
||||
};
|
||||
|
||||
#ifndef ISPC
|
||||
|
||||
#include <stdint.h>
|
||||
#include "kernels_ispc.h"
|
||||
|
||||
#define ALIGNMENT_BYTES 64
|
||||
|
||||
#define MAX_LIGHTS 1024
|
||||
|
||||
#define VISUALIZE_LIGHT_COUNT 0
|
||||
|
||||
struct InputData
|
||||
{
|
||||
ispc::InputHeader header;
|
||||
ispc::InputDataArrays arrays;
|
||||
uint8_t *chunk;
|
||||
};
|
||||
|
||||
|
||||
struct Framebuffer {
|
||||
Framebuffer(int width, int height);
|
||||
~Framebuffer();
|
||||
|
||||
void clear();
|
||||
|
||||
uint8_t *r, *g, *b;
|
||||
|
||||
private:
|
||||
int nPixels;
|
||||
Framebuffer(const Framebuffer &);
|
||||
Framebuffer &operator=(const Framebuffer *);
|
||||
};
|
||||
|
||||
|
||||
InputData *CreateInputDataFromFile(const char *path);
|
||||
void DeleteInputData(InputData *input);
|
||||
void WriteFrame(const char *filename, const InputData *input,
|
||||
const Framebuffer &framebuffer);
|
||||
void InitDynamicC(InputData *input);
|
||||
void InitDynamicCilk(InputData *input);
|
||||
void DispatchDynamicC(InputData *input, Framebuffer *framebuffer);
|
||||
void DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer);
|
||||
|
||||
#endif // !ISPC
|
||||
|
||||
#endif // DEFERRED_H
|
||||
170
examples/deferred/deferred_shading.vcxproj
Executable file
170
examples/deferred/deferred_shading.vcxproj
Executable file
@@ -0,0 +1,170 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|Win32">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
<ProjectGuid>{87f53c53-957e-4e91-878a-bc27828fb9eb}</ProjectGuid>
|
||||
<Keyword>Win32Proj</Keyword>
|
||||
<RootNamespace>mandelbrot</RootNamespace>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="common.cpp" />
|
||||
<ClCompile Include="dynamic_c.cpp" />
|
||||
<ClCompile Include="dynamic_cilk.cpp" />
|
||||
<ClCompile Include="main.cpp" />
|
||||
<ClCompile Include="../tasks_concrt.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="kernels.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
871
examples/deferred/dynamic_c.cpp
Normal file
871
examples/deferred/dynamic_c.cpp
Normal file
@@ -0,0 +1,871 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "deferred.h"
|
||||
#include "kernels_ispc.h"
|
||||
#include <algorithm>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define ISPC_IS_WINDOWS
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
#ifdef ISPC_IS_LINUX
|
||||
#include <malloc.h>
|
||||
#endif // ISPC_IS_LINUX
|
||||
|
||||
// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
|
||||
#define MIN_TILE_WIDTH 16
|
||||
#define MIN_TILE_HEIGHT 16
|
||||
|
||||
|
||||
#define DYNAMIC_TREE_LEVELS 5
|
||||
// If this is set to 1 then the result will be identical to the static version
|
||||
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
|
||||
|
||||
static void *
|
||||
lAlignedMalloc(int64_t size, int32_t alignment) {
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
return _aligned_malloc(size, alignment);
|
||||
#endif
|
||||
#ifdef ISPC_IS_LINUX
|
||||
return memalign(alignment, size);
|
||||
#endif
|
||||
#ifdef ISPC_IS_APPLE
|
||||
void *mem = malloc(size + (alignment-1) + sizeof(void*));
|
||||
char *amem = ((char*)mem) + sizeof(void*);
|
||||
amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
|
||||
(alignment - 1)));
|
||||
((void**)amem)[-1] = mem;
|
||||
return amem;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
lAlignedFree(void *ptr) {
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
_aligned_free(ptr);
|
||||
#endif
|
||||
#ifdef ISPC_IS_LINUX
|
||||
free(ptr);
|
||||
#endif
|
||||
#ifdef ISPC_IS_APPLE
|
||||
free(((void**)ptr)[-1]);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
ComputeZBounds(int tileStartX, int tileEndX,
|
||||
int tileStartY, int tileEndY,
|
||||
// G-buffer data
|
||||
float zBuffer[],
|
||||
int gBufferWidth,
|
||||
// Camera data
|
||||
float cameraProj_33, float cameraProj_43,
|
||||
float cameraNear, float cameraFar,
|
||||
// Output
|
||||
float *minZ, float *maxZ)
|
||||
{
|
||||
// Find Z bounds
|
||||
float laneMinZ = cameraFar;
|
||||
float laneMaxZ = cameraNear;
|
||||
for (int y = tileStartY; y < tileEndY; ++y) {
|
||||
for (int x = tileStartX; x < tileEndX; ++x) {
|
||||
// Unproject depth buffer Z value into view space
|
||||
float z = zBuffer[(y * gBufferWidth + x)];
|
||||
float viewSpaceZ = cameraProj_43 / (z - cameraProj_33);
|
||||
|
||||
// Work out Z bounds for our samples
|
||||
// Avoid considering skybox/background or otherwise invalid pixels
|
||||
if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) {
|
||||
laneMinZ = std::min(laneMinZ, viewSpaceZ);
|
||||
laneMaxZ = std::max(laneMaxZ, viewSpaceZ);
|
||||
}
|
||||
}
|
||||
}
|
||||
*minZ = laneMinZ;
|
||||
*maxZ = laneMaxZ;
|
||||
}
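
// The unprojection above assumes a depth buffer written by a standard
// D3D-style perspective projection (the cameraProj_33 / cameraProj_43 naming
// and the DX11 notes later in this file suggest as much).  A hypothetical
// round-trip check of that formula:
static void
CheckDepthUnprojection() {
    float n = 1.0f, f = 100.0f;
    float proj_33 = f / (f - n);              // third row, third column
    float proj_43 = -n * f / (f - n);         // fourth row, third column
    float viewZ = 25.0f;
    float depth = proj_33 + proj_43 / viewZ;  // what the depth buffer stores
    float recovered = proj_43 / (depth - proj_33);
    assert(fabsf(recovered - viewZ) < 1e-3f); // matches the unprojection above
}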
|
||||
|
||||
|
||||
static void
|
||||
ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight,
|
||||
int numTilesX, int numTilesY,
|
||||
// G-buffer data
|
||||
float zBuffer[],
|
||||
int gBufferWidth,
|
||||
// Camera data
|
||||
float cameraProj_33, float cameraProj_43,
|
||||
float cameraNear, float cameraFar,
|
||||
// Output
|
||||
float minZArray[],
|
||||
float maxZArray[])
|
||||
{
|
||||
for (int tileX = 0; tileX < numTilesX; ++tileX) {
|
||||
float minZ, maxZ;
|
||||
ComputeZBounds(
|
||||
tileX * tileWidth, tileX * tileWidth + tileWidth,
|
||||
tileY * tileHeight, tileY * tileHeight + tileHeight,
|
||||
zBuffer, gBufferWidth,
|
||||
cameraProj_33, cameraProj_43, cameraNear, cameraFar,
|
||||
&minZ, &maxZ);
|
||||
minZArray[tileX] = minZ;
|
||||
maxZArray[tileX] = maxZ;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class MinMaxZTree
|
||||
{
|
||||
public:
|
||||
// Currently (min) tile dimensions must divide gBuffer dimensions evenly
|
||||
// Levels must be small enough that neither dimension goes below one tile
|
||||
MinMaxZTree(
|
||||
int tileWidth, int tileHeight, int levels,
|
||||
int gBufferWidth, int gBufferHeight)
|
||||
: mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
|
||||
{
|
||||
mNumTilesX = gBufferWidth / mTileWidth;
|
||||
mNumTilesY = gBufferHeight / mTileHeight;
|
||||
|
||||
// Allocate arrays
|
||||
mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||
mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||
for (int i = 0; i < mLevels; ++i) {
|
||||
int x = NumTilesX(i);
|
||||
int y = NumTilesY(i);
|
||||
assert(x > 0);
|
||||
assert(y > 0);
|
||||
// NOTE: If the following two asserts fire it probably means that
|
||||
// the base tile dimensions do not evenly divide the G-buffer dimensions
|
||||
assert(x * (mTileWidth << i) >= gBufferWidth);
|
||||
assert(y * (mTileHeight << i) >= gBufferHeight);
|
||||
mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
|
||||
mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
|
||||
}
|
||||
}
|
||||
|
||||
void Update(float *zBuffer, int gBufferPitchInElements,
|
||||
float cameraProj_33, float cameraProj_43,
|
||||
float cameraNear, float cameraFar)
|
||||
{
|
||||
for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
|
||||
ComputeZBoundsRow(tileY, mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
|
||||
zBuffer, gBufferPitchInElements,
|
||||
cameraProj_33, cameraProj_43, cameraNear, cameraFar,
|
||||
mMinZArrays[0] + (tileY * mNumTilesX),
|
||||
mMaxZArrays[0] + (tileY * mNumTilesX));
|
||||
}
|
||||
|
||||
// Generate other levels
|
||||
for (int level = 1; level < mLevels; ++level) {
|
||||
int destTilesX = NumTilesX(level);
|
||||
int destTilesY = NumTilesY(level);
|
||||
int srcLevel = level - 1;
|
||||
int srcTilesX = NumTilesX(srcLevel);
|
||||
int srcTilesY = NumTilesY(srcLevel);
|
||||
for (int y = 0; y < destTilesY; ++y) {
|
||||
for (int x = 0; x < destTilesX; ++x) {
|
||||
int srcX = x << 1;
|
||||
int srcY = y << 1;
|
||||
// NOTE: Ugly branches to deal with non-multiple dimensions at some levels
|
||||
// TODO: SSE branchless min/max is probably better...
|
||||
float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||
float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||
if (srcX + 1 < srcTilesX) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
if (srcY + 1 < srcTilesY) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
}
|
||||
}
|
||||
if (srcY + 1 < srcTilesY) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX )]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX )]);
|
||||
}
|
||||
mMinZArrays[level][y * destTilesX + x] = minZ;
|
||||
mMaxZArrays[level][y * destTilesX + x] = maxZ;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
~MinMaxZTree() {
|
||||
for (int i = 0; i < mLevels; ++i) {
|
||||
lAlignedFree(mMinZArrays[i]);
|
||||
lAlignedFree(mMaxZArrays[i]);
|
||||
}
|
||||
lAlignedFree(mMinZArrays);
|
||||
lAlignedFree(mMaxZArrays);
|
||||
}
|
||||
|
||||
int Levels() const { return mLevels; }
|
||||
|
||||
// These round UP, so beware that the last tile for a given level may not be completely full
|
||||
// TODO: Verify this...
|
||||
int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
|
||||
int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
|
||||
int TileWidth(int level = 0) const { return (mTileWidth << level); }
|
||||
int TileHeight(int level = 0) const { return (mTileHeight << level); }
|
||||
|
||||
float MinZ(int level, int tileX, int tileY) const {
|
||||
return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
|
||||
}
|
||||
float MaxZ(int level, int tileX, int tileY) const {
|
||||
return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
|
||||
}
|
||||
|
||||
private:
|
||||
int mTileWidth;
|
||||
int mTileHeight;
|
||||
int mLevels;
|
||||
int mNumTilesX;
|
||||
int mNumTilesY;
|
||||
|
||||
// One array for each "level" in the tree
|
||||
float **mMinZArrays;
|
||||
float **mMaxZArrays;
|
||||
};
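
// A minimal usage sketch of the tree above (hypothetical sizes and a synthetic
// depth buffer; the projection constants are made up but keep the unprojected
// depth inside [cameraNear, cameraFar)):
static void
MinMaxZTreeExample() {
    const int kWidth = 256, kHeight = 128;
    float *zBuffer = (float *)lAlignedMalloc(sizeof(float) * kWidth * kHeight, 16);
    for (int i = 0; i < kWidth * kHeight; ++i)
        zBuffer[i] = 0.5f;

    // 16x16 base tiles, 3 levels: level 2 tiles cover 64x64 pixels
    MinMaxZTree tree(16, 16, 3, kWidth, kHeight);
    tree.Update(zBuffer, kWidth, 1.01f, -1.01f, 1.0f, 100.0f);

    float coarseMin = tree.MinZ(2, 0, 0);   // bounds of the top-left 64x64 block
    float coarseMax = tree.MaxZ(2, 0, 0);
    assert(coarseMin <= coarseMax);
    lAlignedFree(zBuffer);
}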
|
||||
|
||||
static MinMaxZTree *gMinMaxZTree = 0;
|
||||
|
||||
void InitDynamicC(InputData *input) {
|
||||
gMinMaxZTree =
|
||||
new MinMaxZTree(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
|
||||
input->header.framebufferWidth,
|
||||
input->header.framebufferHeight);
|
||||
}
|
||||
|
||||
|
||||
// numLights need not be a multiple of programCount here, but the input and output arrays
|
||||
// should be able to handle programCount-sized load/stores.
|
||||
static void
|
||||
SplitTileMinMax(
|
||||
int tileMidX, int tileMidY,
|
||||
// Subtile data (00, 10, 01, 11)
|
||||
float subtileMinZ[],
|
||||
float subtileMaxZ[],
|
||||
// G-buffer data
|
||||
int gBufferWidth, int gBufferHeight,
|
||||
// Camera data
|
||||
float cameraProj_11, float cameraProj_22,
|
||||
// Light Data
|
||||
int lightIndices[],
|
||||
int numLights,
|
||||
float light_positionView_x_array[],
|
||||
float light_positionView_y_array[],
|
||||
float light_positionView_z_array[],
|
||||
float light_attenuationEnd_array[],
|
||||
// Outputs
|
||||
int subtileIndices[],
|
||||
int subtileIndicesPitch,
|
||||
int subtileNumLights[]
|
||||
)
|
||||
{
|
||||
float gBufferScale_x = 0.5f * (float)gBufferWidth;
|
||||
float gBufferScale_y = 0.5f * (float)gBufferHeight;
|
||||
|
||||
float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x),
|
||||
(cameraProj_22 * gBufferScale_y) };
|
||||
float frustumPlanes_z[2] = { tileMidX - gBufferScale_x,
|
||||
tileMidY - gBufferScale_y };
|
||||
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
// Normalize
|
||||
float norm = 1.f / sqrtf(frustumPlanes_xy[i] * frustumPlanes_xy[i] +
|
||||
frustumPlanes_z[i] * frustumPlanes_z[i]);
|
||||
frustumPlanes_xy[i] *= norm;
|
||||
frustumPlanes_z[i] *= norm;
|
||||
}
|
||||
|
||||
// Initialize
|
||||
int subtileLightOffset[4];
|
||||
subtileLightOffset[0] = 0 * subtileIndicesPitch;
|
||||
subtileLightOffset[1] = 1 * subtileIndicesPitch;
|
||||
subtileLightOffset[2] = 2 * subtileIndicesPitch;
|
||||
subtileLightOffset[3] = 3 * subtileIndicesPitch;
|
||||
|
||||
for (int i = 0; i < numLights; ++i) {
|
||||
int lightIndex = lightIndices[i];
|
||||
|
||||
float light_positionView_x = light_positionView_x_array[lightIndex];
|
||||
float light_positionView_y = light_positionView_y_array[lightIndex];
|
||||
float light_positionView_z = light_positionView_z_array[lightIndex];
|
||||
float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
|
||||
float light_attenuationEndNeg = -light_attenuationEnd;
|
||||
|
||||
// Test lights against subtile z bounds
|
||||
bool inFrustum[4];
|
||||
inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
|
||||
(subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
|
||||
inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) &&
|
||||
(subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg);
|
||||
inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) &&
|
||||
(subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg);
|
||||
inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) &&
|
||||
(subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg);
|
||||
|
||||
float dx = light_positionView_z * frustumPlanes_z[0] +
|
||||
light_positionView_x * frustumPlanes_xy[0];
|
||||
float dy = light_positionView_z * frustumPlanes_z[1] +
|
||||
light_positionView_y * frustumPlanes_xy[1];
|
||||
|
||||
if (fabsf(dx) > light_attenuationEnd) {
|
||||
bool positiveX = dx > 0.0f;
|
||||
inFrustum[0] = inFrustum[0] && positiveX; // 00 subtile
|
||||
inFrustum[1] = inFrustum[1] && !positiveX; // 10 subtile
|
||||
inFrustum[2] = inFrustum[2] && positiveX; // 01 subtile
|
||||
inFrustum[3] = inFrustum[3] && !positiveX; // 11 subtile
|
||||
}
|
||||
if (fabsf(dy) > light_attenuationEnd) {
|
||||
bool positiveY = dy > 0.0f;
|
||||
inFrustum[0] = inFrustum[0] && positiveY; // 00 subtile
|
||||
inFrustum[1] = inFrustum[1] && positiveY; // 10 subtile
|
||||
inFrustum[2] = inFrustum[2] && !positiveY; // 01 subtile
|
||||
inFrustum[3] = inFrustum[3] && !positiveY; // 11 subtile
|
||||
}
|
||||
|
||||
if (inFrustum[0])
|
||||
subtileIndices[subtileLightOffset[0]++] = lightIndex;
|
||||
if (inFrustum[1])
|
||||
subtileIndices[subtileLightOffset[1]++] = lightIndex;
|
||||
if (inFrustum[2])
|
||||
subtileIndices[subtileLightOffset[2]++] = lightIndex;
|
||||
if (inFrustum[3])
|
||||
subtileIndices[subtileLightOffset[3]++] = lightIndex;
|
||||
}
|
||||
|
||||
subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch;
|
||||
subtileNumLights[1] = subtileLightOffset[1] - 1 * subtileIndicesPitch;
|
||||
subtileNumLights[2] = subtileLightOffset[2] - 2 * subtileIndicesPitch;
|
||||
subtileNumLights[3] = subtileLightOffset[3] - 3 * subtileIndicesPitch;
|
||||
}
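
// Hypothetical call shape for the routine above: one light, and four subtile
// index lists laid out back to back with a common pitch (MAX_LIGHTS is assumed
// to come from deferred.h, as elsewhere in this file):
static void
SplitTileMinMaxExample() {
    int   lightIds[1] = { 0 };
    float lx[1] = { 0.0f }, ly[1] = { 0.0f }, lz[1] = { 10.0f }, lend[1] = { 5.0f };
    // Subtile order is 00, 10, 01, 11, matching the comments above
    float subMinZ[4] = { 8.0f, 8.0f, 8.0f, 8.0f };
    float subMaxZ[4] = { 12.0f, 12.0f, 12.0f, 12.0f };

    int subtileIndices[4 * MAX_LIGHTS];
    int subtileNumLights[4];
    SplitTileMinMax(128, 128,                // split planes through the tile center
                    subMinZ, subMaxZ,
                    256, 256,                // G-buffer dimensions
                    1.0f, 1.0f,              // cameraProj_11, cameraProj_22
                    lightIds, 1,
                    lx, ly, lz, lend,
                    subtileIndices, MAX_LIGHTS, subtileNumLights);
    // This light overlaps the Z range and sits on both splitting planes, so it
    // lands in all four lists: subtileNumLights[i] == 1, and its index is at
    // subtileIndices[i * MAX_LIGHTS].
}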
|
||||
|
||||
|
||||
static inline float
dot3(float x, float y, float z, float a, float b, float c) {
    return (x*a + y*b + z*c);
}


static inline void
normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
    float n = 1.f / sqrtf(x*x + y*y + z*z);
    ox = x * n;
    oy = y * n;
    oz = z * n;
}


static inline float
Unorm8ToFloat32(uint8_t u) {
    return (float)u * (1.0f / 255.0f);
}


static inline uint8_t
Float32ToUnorm8(float f) {
    return (uint8_t)(f * 255.0f);
}
|
||||
|
||||
|
||||
static inline float half_to_float_fast(uint16_t h) {
|
||||
uint32_t hs = h & (int32_t)0x8000u; // Pick off sign bit
|
||||
uint32_t he = h & (int32_t)0x7C00u; // Pick off exponent bits
|
||||
uint32_t hm = h & (int32_t)0x03FFu; // Pick off mantissa bits
|
||||
|
||||
// sign
|
||||
uint32_t xs = ((uint32_t) hs) << 16;
|
||||
// Exponent: unbias the halfp, then bias the single
|
||||
int32_t xes = ((int32_t) (he >> 10)) - 15 + 127;
|
||||
// Exponent
|
||||
uint32_t xe = (uint32_t) (xes << 23);
|
||||
// Mantissa
|
||||
uint32_t xm = ((uint32_t) hm) << 13;
|
||||
|
||||
uint32_t bits = (xs | xe | xm);
|
||||
float *fp = reinterpret_cast<float *>(&bits);
|
||||
return *fp;
|
||||
}
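
// half_to_float_fast above only handles normalized half-precision values
// (zero, infinities, NaNs and denormals are not special-cased), which is
// enough for the encoded G-buffer channels.  Quick sanity check against two
// well-known bit patterns:
static void
CheckHalfDecode() {
    assert(half_to_float_fast(0x3C00) == 1.0f);   // +1.0 in IEEE half
    assert(half_to_float_fast(0xC000) == -2.0f);  // -2.0 in IEEE half
}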
|
||||
|
||||
|
||||
static void
|
||||
ShadeTileC(
|
||||
int32_t tileStartX, int32_t tileEndX,
|
||||
int32_t tileStartY, int32_t tileEndY,
|
||||
int32_t gBufferWidth, int32_t gBufferHeight,
|
||||
const ispc::InputDataArrays &inputData,
|
||||
// Camera data
|
||||
float cameraProj_11, float cameraProj_22,
|
||||
float cameraProj_33, float cameraProj_43,
|
||||
// Light list
|
||||
int32_t tileLightIndices[],
|
||||
int32_t tileNumLights,
|
||||
// UI
|
||||
bool visualizeLightCount,
|
||||
// Output
|
||||
uint8_t framebuffer_r[],
|
||||
uint8_t framebuffer_g[],
|
||||
uint8_t framebuffer_b[]
|
||||
)
|
||||
{
|
||||
if (tileNumLights == 0 || visualizeLightCount) {
|
||||
uint8_t c = (uint8_t)(std::min(tileNumLights << 2, 255));
|
||||
for (int32_t y = tileStartY; y < tileEndY; ++y) {
|
||||
for (int32_t x = tileStartX; x < tileEndX; ++x) {
|
||||
int32_t framebufferIndex = (y * gBufferWidth + x);
|
||||
framebuffer_r[framebufferIndex] = c;
|
||||
framebuffer_g[framebufferIndex] = c;
|
||||
framebuffer_b[framebufferIndex] = c;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
float twoOverGBufferWidth = 2.0f / gBufferWidth;
|
||||
float twoOverGBufferHeight = 2.0f / gBufferHeight;
|
||||
|
||||
for (int32_t y = tileStartY; y < tileEndY; ++y) {
|
||||
float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
|
||||
|
||||
for (int32_t x = tileStartX; x < tileEndX; ++x) {
|
||||
int32_t gBufferOffset = y * gBufferWidth + x;
|
||||
|
||||
// Reconstruct position and (negative) view vector from G-buffer
|
||||
float surface_positionView_x, surface_positionView_y, surface_positionView_z;
|
||||
float Vneg_x, Vneg_y, Vneg_z;
|
||||
|
||||
float z = inputData.zBuffer[gBufferOffset];
|
||||
|
||||
// Compute screen/clip-space position
|
||||
// NOTE: Mind DX11 viewport transform and pixel center!
|
||||
float positionScreen_x = (0.5f + (float)(x)) *
|
||||
twoOverGBufferWidth - 1.0f;
|
||||
|
||||
// Unproject depth buffer Z value into view space
|
||||
surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
|
||||
surface_positionView_x = positionScreen_x * surface_positionView_z /
|
||||
cameraProj_11;
|
||||
surface_positionView_y = positionScreen_y * surface_positionView_z /
|
||||
cameraProj_22;
|
||||
|
||||
// We actually end up with a vector pointing *at* the
|
||||
// surface (i.e. the negative view vector)
|
||||
normalize3(surface_positionView_x, surface_positionView_y,
|
||||
surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
|
||||
|
||||
// Reconstruct normal from G-buffer
|
||||
float surface_normal_x, surface_normal_y, surface_normal_z;
|
||||
float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
|
||||
float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
|
||||
|
||||
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
|
||||
float m = sqrtf(4.0f * f - 1.0f);
|
||||
|
||||
surface_normal_x = m * (4.0f * normal_x - 2.0f);
|
||||
surface_normal_y = m * (4.0f * normal_y - 2.0f);
|
||||
surface_normal_z = 3.0f - 8.0f * f;
|
||||
|
||||
// Load other G-buffer parameters
|
||||
float surface_specularAmount =
|
||||
half_to_float_fast(inputData.specularAmount[gBufferOffset]);
|
||||
float surface_specularPower =
|
||||
half_to_float_fast(inputData.specularPower[gBufferOffset]);
|
||||
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
|
||||
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
|
||||
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
|
||||
|
||||
float lit_x = 0.0f;
|
||||
float lit_y = 0.0f;
|
||||
float lit_z = 0.0f;
|
||||
for (int32_t tileLightIndex = 0; tileLightIndex < tileNumLights;
|
||||
++tileLightIndex) {
|
||||
int32_t lightIndex = tileLightIndices[tileLightIndex];
|
||||
|
||||
// Gather light data relevant to initial culling
|
||||
float light_positionView_x =
|
||||
inputData.lightPositionView_x[lightIndex];
|
||||
float light_positionView_y =
|
||||
inputData.lightPositionView_y[lightIndex];
|
||||
float light_positionView_z =
|
||||
inputData.lightPositionView_z[lightIndex];
|
||||
float light_attenuationEnd =
|
||||
inputData.lightAttenuationEnd[lightIndex];
|
||||
|
||||
// Compute light vector
|
||||
float L_x = light_positionView_x - surface_positionView_x;
|
||||
float L_y = light_positionView_y - surface_positionView_y;
|
||||
float L_z = light_positionView_z - surface_positionView_z;
|
||||
|
||||
float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
|
||||
|
||||
// Clip at end of attenuation
|
||||
float light_attenuationEnd2 = light_attenuationEnd * light_attenuationEnd;
|
||||
|
||||
if (distanceToLight2 < light_attenuationEnd2) {
|
||||
float distanceToLight = sqrtf(distanceToLight2);
|
||||
|
||||
float distanceToLightRcp = 1.f / distanceToLight;
|
||||
L_x *= distanceToLightRcp;
|
||||
L_y *= distanceToLightRcp;
|
||||
L_z *= distanceToLightRcp;
|
||||
|
||||
// Start computing brdf
|
||||
float NdotL = dot3(surface_normal_x, surface_normal_y,
|
||||
surface_normal_z, L_x, L_y, L_z);
|
||||
|
||||
// Clip back facing
|
||||
if (NdotL > 0.0f) {
|
||||
float light_attenuationBegin =
|
||||
inputData.lightAttenuationBegin[lightIndex];
|
||||
|
||||
// Light distance attenuation (linstep)
|
||||
float lightRange = (light_attenuationEnd - light_attenuationBegin);
|
||||
float falloffPosition = (light_attenuationEnd - distanceToLight);
|
||||
float attenuation = std::min(falloffPosition / lightRange, 1.0f);
|
||||
|
||||
float H_x = (L_x - Vneg_x);
|
||||
float H_y = (L_y - Vneg_y);
|
||||
float H_z = (L_z - Vneg_z);
|
||||
normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
|
||||
|
||||
float NdotH = dot3(surface_normal_x, surface_normal_y,
|
||||
surface_normal_z, H_x, H_y, H_z);
|
||||
NdotH = std::max(NdotH, 0.0f);
|
||||
|
||||
float specular = powf(NdotH, surface_specularPower);
|
||||
float specularNorm = (surface_specularPower + 2.0f) *
|
||||
(1.0f / 8.0f);
|
||||
float specularContrib = surface_specularAmount *
|
||||
specularNorm * specular;
|
||||
|
||||
float k = attenuation * NdotL * (1.0f + specularContrib);
|
||||
|
||||
float light_color_x = inputData.lightColor_x[lightIndex];
|
||||
float light_color_y = inputData.lightColor_y[lightIndex];
|
||||
float light_color_z = inputData.lightColor_z[lightIndex];
|
||||
|
||||
float lightContrib_x = surface_albedo_x * light_color_x;
|
||||
float lightContrib_y = surface_albedo_y * light_color_y;
|
||||
float lightContrib_z = surface_albedo_z * light_color_z;
|
||||
|
||||
lit_x += lightContrib_x * k;
|
||||
lit_y += lightContrib_y * k;
|
||||
lit_z += lightContrib_z * k;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Gamma correct
|
||||
float gamma = 1.0f / 2.2f;
|
||||
lit_x = powf(std::min(std::max(lit_x, 0.0f), 1.0f), gamma);
|
||||
lit_y = powf(std::min(std::max(lit_y, 0.0f), 1.0f), gamma);
|
||||
lit_z = powf(std::min(std::max(lit_z, 0.0f), 1.0f), gamma);
|
||||
|
||||
framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
|
||||
framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
|
||||
framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
|
||||
int *lightIndices, int numLights,
|
||||
Framebuffer *framebuffer) {
|
||||
const MinMaxZTree *minMaxZTree = gMinMaxZTree;
|
||||
|
||||
// If we have few enough lights or this is the base case (last level), shade
|
||||
// this full tile directly
|
||||
if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
int startX = tileX * width;
|
||||
int startY = tileY * height;
|
||||
int endX = std::min(input->header.framebufferWidth, startX + width);
|
||||
int endY = std::min(input->header.framebufferHeight, startY + height);
|
||||
|
||||
// Skip entirely offscreen tiles
|
||||
if (endX > startX && endY > startY) {
|
||||
ShadeTileC(startX, endX, startY, endY,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->arrays,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||
lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
|
||||
framebuffer->r, framebuffer->g, framebuffer->b);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Otherwise, subdivide and 4-way recurse using X and Y splitting planes
|
||||
// Move down a level in the tree
|
||||
--level;
|
||||
tileX <<= 1;
|
||||
tileY <<= 1;
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
|
||||
// Work out splitting coords
|
||||
int midX = (tileX + 1) * width;
|
||||
int midY = (tileY + 1) * height;
|
||||
|
||||
// Read subtile min/max data
|
||||
// NOTE: We must be sure to handle out-of-bounds access here since
|
||||
// sometimes we'll only have 1 or 2 subtiles for non-pow-2
|
||||
// framebuffer sizes.
|
||||
bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level));
|
||||
bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level));
|
||||
|
||||
// NOTE: Order is 00, 10, 01, 11
|
||||
// Set defaults up to cull all lights if the tile doesn't exist (offscreen)
|
||||
float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
|
||||
input->header.cameraFar, input->header.cameraFar};
|
||||
float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
|
||||
input->header.cameraNear, input->header.cameraNear};
|
||||
|
||||
minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
|
||||
maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY);
|
||||
if (rightTileExists) {
|
||||
minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY);
|
||||
maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY);
|
||||
if (bottomTileExists) {
|
||||
minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1);
|
||||
maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1);
|
||||
}
|
||||
}
|
||||
if (bottomTileExists) {
|
||||
minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1);
|
||||
maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1);
|
||||
}
|
||||
|
||||
// Cull lights into subtile lists
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
#endif
|
||||
int subtileLightIndices[4][MAX_LIGHTS]
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
__attribute__ ((aligned(ALIGNMENT_BYTES)))
|
||||
#endif
|
||||
;
|
||||
int subtileNumLights[4];
|
||||
SplitTileMinMax(midX, midY, minZ, maxZ,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
lightIndices, numLights, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
input->arrays.lightAttenuationEnd,
|
||||
subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
|
||||
|
||||
// Recurse into subtiles
|
||||
ShadeDynamicTileRecurse(input, level, tileX , tileY,
|
||||
subtileLightIndices[0], subtileNumLights[0],
|
||||
framebuffer);
|
||||
ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
|
||||
subtileLightIndices[1], subtileNumLights[1],
|
||||
framebuffer);
|
||||
ShadeDynamicTileRecurse(input, level, tileX , tileY + 1,
|
||||
subtileLightIndices[2], subtileNumLights[2],
|
||||
framebuffer);
|
||||
ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1,
|
||||
subtileLightIndices[3], subtileNumLights[3],
|
||||
framebuffer);
|
||||
}
|
||||
}
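
// Illustration of the tile coordinate convention used by the recursion above
// (hypothetical numbers): a level-L tile covers a (MIN_TILE_WIDTH << L) x
// (MIN_TILE_HEIGHT << L) pixel block, and its four level-(L-1) children are
// (2*tileX + dx, 2*tileY + dy) for dx, dy in {0, 1}.
static void
TileCoordinateExample() {
    int level = 2, tileX = 1, tileY = 3;
    int width  = MIN_TILE_WIDTH << level;     // 64 pixels with the 16-pixel base tile
    int height = MIN_TILE_HEIGHT << level;    // 64 pixels
    int startX = tileX * width;               // pixel column 64
    int startY = tileY * height;              // pixel row 192
    int child00X = tileX << 1, child00Y = tileY << 1;   // (2, 6) at level 1
    (void)startX; (void)startY; (void)child00X; (void)child00Y;
}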
|
||||
|
||||
|
||||
static int
|
||||
IntersectLightsWithTileMinMax(
|
||||
int tileStartX, int tileEndX,
|
||||
int tileStartY, int tileEndY,
|
||||
// Tile data
|
||||
float minZ,
|
||||
float maxZ,
|
||||
// G-buffer data
|
||||
int gBufferWidth, int gBufferHeight,
|
||||
// Camera data
|
||||
float cameraProj_11, float cameraProj_22,
|
||||
// Light Data
|
||||
int numLights,
|
||||
float light_positionView_x_array[],
|
||||
float light_positionView_y_array[],
|
||||
float light_positionView_z_array[],
|
||||
float light_attenuationEnd_array[],
|
||||
// Output
|
||||
int tileLightIndices[]
|
||||
)
|
||||
{
|
||||
float gBufferScale_x = 0.5f * (float)gBufferWidth;
|
||||
float gBufferScale_y = 0.5f * (float)gBufferHeight;
|
||||
|
||||
float frustumPlanes_xy[4];
|
||||
float frustumPlanes_z[4];
|
||||
|
||||
// This one is totally constant over the whole screen... worth pulling it up at all?
|
||||
float frustumPlanes_xy_v[4] = { -(cameraProj_11 * gBufferScale_x),
|
||||
(cameraProj_11 * gBufferScale_x),
|
||||
(cameraProj_22 * gBufferScale_y),
|
||||
-(cameraProj_22 * gBufferScale_y) };
|
||||
|
||||
float frustumPlanes_z_v[4] = { tileEndX - gBufferScale_x,
|
||||
-tileStartX + gBufferScale_x,
|
||||
tileEndY - gBufferScale_y,
|
||||
-tileStartY + gBufferScale_y };
|
||||
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
float norm = 1.f / sqrtf(frustumPlanes_xy_v[i] * frustumPlanes_xy_v[i] +
|
||||
frustumPlanes_z_v[i] * frustumPlanes_z_v[i]);
|
||||
frustumPlanes_xy_v[i] *= norm;
|
||||
frustumPlanes_z_v[i] *= norm;
|
||||
|
||||
frustumPlanes_xy[i] = frustumPlanes_xy_v[i];
|
||||
frustumPlanes_z[i] = frustumPlanes_z_v[i];
|
||||
}
|
||||
|
||||
int tileNumLights = 0;
|
||||
|
||||
for (int lightIndex = 0; lightIndex < numLights; ++lightIndex) {
|
||||
float light_positionView_z = light_positionView_z_array[lightIndex];
|
||||
float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
|
||||
float light_attenuationEndNeg = -light_attenuationEnd;
|
||||
|
||||
float d = light_positionView_z - minZ;
|
||||
bool inFrustum = (d >= light_attenuationEndNeg);
|
||||
|
||||
d = maxZ - light_positionView_z;
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
if (!inFrustum)
|
||||
continue;
|
||||
|
||||
float light_positionView_x = light_positionView_x_array[lightIndex];
|
||||
float light_positionView_y = light_positionView_y_array[lightIndex];
|
||||
|
||||
d = light_positionView_z * frustumPlanes_z[0] +
|
||||
light_positionView_x * frustumPlanes_xy[0];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
d = light_positionView_z * frustumPlanes_z[1] +
|
||||
light_positionView_x * frustumPlanes_xy[1];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
d = light_positionView_z * frustumPlanes_z[2] +
|
||||
light_positionView_y * frustumPlanes_xy[2];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
d = light_positionView_z * frustumPlanes_z[3] +
|
||||
light_positionView_y * frustumPlanes_xy[3];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
// Pack and store intersecting lights
|
||||
if (inFrustum)
|
||||
tileLightIndices[tileNumLights++] = lightIndex;
|
||||
}
|
||||
|
||||
return tileNumLights;
|
||||
}
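
// Each side plane of the tile's sub-frustum above passes through the eye, so it
// is fully described by its normalized normal; the x-or-y component is kept in
// frustumPlanes_xy and the z component in frustumPlanes_z.  Sketch of the
// per-plane test with made-up numbers:
static void
PlaneTestExample() {
    float nx = -0.8f, nz = 0.6f;             // already-normalized plane components
    float lightX = 2.0f, lightZ = 10.0f, attenuationEnd = 5.0f;
    float d = lightZ * nz + lightX * nx;     // signed distance from the plane: 4.4
    bool keep = (d >= -attenuationEnd);      // true: the light's sphere reaches the frustum
    assert(keep);
}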
|
||||
|
||||
|
||||
void
|
||||
ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
|
||||
Framebuffer *framebuffer) {
|
||||
const MinMaxZTree *minMaxZTree = gMinMaxZTree;
|
||||
|
||||
// Get Z min/max for this tile
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
float minZ = minMaxZTree->MinZ(level, tileX, tileY);
|
||||
float maxZ = minMaxZTree->MaxZ(level, tileX, tileY);
|
||||
|
||||
int startX = tileX * width;
|
||||
int startY = tileY * height;
|
||||
int endX = std::min(input->header.framebufferWidth, startX + width);
|
||||
int endY = std::min(input->header.framebufferHeight, startY + height);
|
||||
|
||||
// This is a root tile, so first do a full 6-plane cull
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
#endif
|
||||
int lightIndices[MAX_LIGHTS]
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
__attribute__ ((aligned(ALIGNMENT_BYTES)))
|
||||
#endif
|
||||
;
|
||||
int numLights = IntersectLightsWithTileMinMax(
|
||||
startX, endX, startY, endY, minZ, maxZ,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
MAX_LIGHTS, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
input->arrays.lightAttenuationEnd, lightIndices);
|
||||
|
||||
// Now kick off the recursive process for this tile
|
||||
ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
|
||||
numLights, framebuffer);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
DispatchDynamicC(InputData *input, Framebuffer *framebuffer)
|
||||
{
|
||||
MinMaxZTree *minMaxZTree = gMinMaxZTree;
|
||||
|
||||
// Update min/max Z tree
|
||||
minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
|
||||
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||
input->header.cameraNear, input->header.cameraFar);
|
||||
|
||||
int rootLevel = minMaxZTree->Levels() - 1;
|
||||
int rootTilesX = minMaxZTree->NumTilesX(rootLevel);
|
||||
int rootTilesY = minMaxZTree->NumTilesY(rootLevel);
|
||||
int rootTiles = rootTilesX * rootTilesY;
|
||||
for (int g = 0; g < rootTiles; ++g) {
|
||||
uint32_t tileY = g / rootTilesX;
|
||||
uint32_t tileX = g % rootTilesX;
|
||||
ShadeDynamicTile(input, rootLevel, tileX, tileY, framebuffer);
|
||||
}
|
||||
}
|
||||
398
examples/deferred/dynamic_cilk.cpp
Normal file
@@ -0,0 +1,398 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef __cilkplusplus
|
||||
|
||||
#include "deferred.h"
|
||||
#include "kernels_ispc.h"
|
||||
#include <algorithm>
|
||||
#include <assert.h>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define ISPC_IS_WINDOWS
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
#ifdef ISPC_IS_LINUX
|
||||
#include <malloc.h>
|
||||
#endif // ISPC_IS_LINUX
|
||||
|
||||
// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
|
||||
#define MIN_TILE_WIDTH 16
|
||||
#define MIN_TILE_HEIGHT 16
|
||||
|
||||
|
||||
#define DYNAMIC_TREE_LEVELS 5
|
||||
// If this is set to 1 then the result will be identical to the static version
|
||||
#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
|
||||
|
||||
static void *
|
||||
lAlignedMalloc(int64_t size, int32_t alignment) {
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
return _aligned_malloc(size, alignment);
|
||||
#endif
|
||||
#ifdef ISPC_IS_LINUX
|
||||
return memalign(alignment, size);
|
||||
#endif
|
||||
#ifdef ISPC_IS_APPLE
|
||||
void *mem = malloc(size + (alignment-1) + sizeof(void*));
|
||||
char *amem = ((char*)mem) + sizeof(void*);
|
||||
amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
|
||||
(alignment - 1)));
|
||||
((void**)amem)[-1] = mem;
|
||||
return amem;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
lAlignedFree(void *ptr) {
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
_aligned_free(ptr);
|
||||
#endif
|
||||
#ifdef ISPC_IS_LINUX
|
||||
free(ptr);
|
||||
#endif
|
||||
#ifdef ISPC_IS_APPLE
|
||||
free(((void**)ptr)[-1]);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
class MinMaxZTreeCilk
|
||||
{
|
||||
public:
|
||||
// Currently (min) tile dimensions must divide gBuffer dimensions evenly
|
||||
// Levels must be small enough that neither dimension goes below one tile
|
||||
MinMaxZTreeCilk(
|
||||
int tileWidth, int tileHeight, int levels,
|
||||
int gBufferWidth, int gBufferHeight)
|
||||
: mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
|
||||
{
|
||||
mNumTilesX = gBufferWidth / mTileWidth;
|
||||
mNumTilesY = gBufferHeight / mTileHeight;
|
||||
|
||||
// Allocate arrays
|
||||
mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||
mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
|
||||
for (int i = 0; i < mLevels; ++i) {
|
||||
int x = NumTilesX(i);
|
||||
int y = NumTilesY(i);
|
||||
assert(x > 0);
|
||||
assert(y > 0);
|
||||
// NOTE: If the following two asserts fire it probably means that
|
||||
// the base tile dimensions do not evenly divide the G-buffer dimensions
|
||||
assert(x * (mTileWidth << i) >= gBufferWidth);
|
||||
assert(y * (mTileHeight << i) >= gBufferHeight);
|
||||
mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
|
||||
mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
|
||||
}
|
||||
}
|
||||
|
||||
void Update(float *zBuffer, int gBufferPitchInElements,
|
||||
float cameraProj_33, float cameraProj_43,
|
||||
float cameraNear, float cameraFar)
|
||||
{
|
||||
// Compute level 0 in parallel. The outer loop is here since we use Cilk
|
||||
_Cilk_for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
|
||||
ispc::ComputeZBoundsRow(tileY,
|
||||
mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
|
||||
zBuffer, gBufferPitchInElements,
|
||||
cameraProj_33, cameraProj_43, cameraNear, cameraFar,
|
||||
mMinZArrays[0] + (tileY * mNumTilesX),
|
||||
mMaxZArrays[0] + (tileY * mNumTilesX));
|
||||
}
|
||||
|
||||
// Generate other levels
|
||||
// NOTE: We currently don't use ispc here since it's sort of an
|
||||
// awkward gather-based reduction. Using SSE odd pack/unpack
|
||||
// instructions might actually work here when we need to optimize
|
||||
for (int level = 1; level < mLevels; ++level) {
|
||||
int destTilesX = NumTilesX(level);
|
||||
int destTilesY = NumTilesY(level);
|
||||
int srcLevel = level - 1;
|
||||
int srcTilesX = NumTilesX(srcLevel);
|
||||
int srcTilesY = NumTilesY(srcLevel);
|
||||
_Cilk_for (int y = 0; y < destTilesY; ++y) {
|
||||
for (int x = 0; x < destTilesX; ++x) {
|
||||
int srcX = x << 1;
|
||||
int srcY = y << 1;
|
||||
// NOTE: Ugly branches to deal with non-multiple dimensions at some levels
|
||||
// TODO: SSE branchless min/max is probably better...
|
||||
float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||
float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
|
||||
if (srcX + 1 < srcTilesX) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
if (srcY + 1 < srcTilesY) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX + 1)]);
|
||||
}
|
||||
}
|
||||
if (srcY + 1 < srcTilesY) {
|
||||
minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX )]);
|
||||
maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
|
||||
(srcX )]);
|
||||
}
|
||||
mMinZArrays[level][y * destTilesX + x] = minZ;
|
||||
mMaxZArrays[level][y * destTilesX + x] = maxZ;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
~MinMaxZTreeCilk() {
|
||||
for (int i = 0; i < mLevels; ++i) {
|
||||
lAlignedFree(mMinZArrays[i]);
|
||||
lAlignedFree(mMaxZArrays[i]);
|
||||
}
|
||||
lAlignedFree(mMinZArrays);
|
||||
lAlignedFree(mMaxZArrays);
|
||||
}
|
||||
|
||||
int Levels() const { return mLevels; }
|
||||
|
||||
// These round UP, so beware that the last tile for a given level may not be completely full
|
||||
// TODO: Verify this...
|
||||
int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
|
||||
int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
|
||||
int TileWidth(int level = 0) const { return (mTileWidth << level); }
|
||||
int TileHeight(int level = 0) const { return (mTileHeight << level); }
|
||||
|
||||
float MinZ(int level, int tileX, int tileY) const {
|
||||
return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
|
||||
}
|
||||
float MaxZ(int level, int tileX, int tileY) const {
|
||||
return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
|
||||
}
|
||||
|
||||
private:
|
||||
int mTileWidth;
|
||||
int mTileHeight;
|
||||
int mLevels;
|
||||
int mNumTilesX;
|
||||
int mNumTilesY;
|
||||
|
||||
// One array for each "level" in the tree
|
||||
float **mMinZArrays;
|
||||
float **mMaxZArrays;
|
||||
};
|
||||
|
||||
static MinMaxZTreeCilk *gMinMaxZTreeCilk = 0;
|
||||
|
||||
void InitDynamicCilk(InputData *input) {
|
||||
gMinMaxZTreeCilk =
|
||||
new MinMaxZTreeCilk(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
|
||||
input->header.framebufferWidth,
|
||||
input->header.framebufferHeight);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
|
||||
int *lightIndices, int numLights,
|
||||
Framebuffer *framebuffer) {
|
||||
const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
|
||||
|
||||
// If we have few enough lights or this is the base case (last level), shade
|
||||
// this full tile directly
|
||||
if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
int startX = tileX * width;
|
||||
int startY = tileY * height;
|
||||
int endX = std::min(input->header.framebufferWidth, startX + width);
|
||||
int endY = std::min(input->header.framebufferHeight, startY + height);
|
||||
|
||||
// Skip entirely offscreen tiles
|
||||
if (endX > startX && endY > startY) {
|
||||
ispc::ShadeTile(
|
||||
startX, endX, startY, endY,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
&input->arrays,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||
lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
|
||||
framebuffer->r, framebuffer->g, framebuffer->b);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Otherwise, subdivide and 4-way recurse using X and Y splitting planes
|
||||
// Move down a level in the tree
|
||||
--level;
|
||||
tileX <<= 1;
|
||||
tileY <<= 1;
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
|
||||
// Work out splitting coords
|
||||
int midX = (tileX + 1) * width;
|
||||
int midY = (tileY + 1) * height;
|
||||
|
||||
// Read subtile min/max data
|
||||
// NOTE: We must be sure to handle out-of-bounds access here since
|
||||
// sometimes we'll only have 1 or 2 subtiles for non-pow-2
|
||||
// framebuffer sizes.
|
||||
bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level));
|
||||
bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level));
|
||||
|
||||
// NOTE: Order is 00, 10, 01, 11
|
||||
// Set defaults up to cull all lights if the tile doesn't exist (offscreen)
|
||||
float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
|
||||
input->header.cameraFar, input->header.cameraFar};
|
||||
float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
|
||||
input->header.cameraNear, input->header.cameraNear};
|
||||
|
||||
minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
|
||||
maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY);
|
||||
if (rightTileExists) {
|
||||
minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY);
|
||||
maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY);
|
||||
if (bottomTileExists) {
|
||||
minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1);
|
||||
maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1);
|
||||
}
|
||||
}
|
||||
if (bottomTileExists) {
|
||||
minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1);
|
||||
maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1);
|
||||
}
|
||||
|
||||
// Cull lights into subtile lists
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
#endif
|
||||
int subtileLightIndices[4][MAX_LIGHTS]
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
__attribute__ ((aligned(ALIGNMENT_BYTES)))
|
||||
#endif
|
||||
;
|
||||
int subtileNumLights[4];
|
||||
ispc::SplitTileMinMax(midX, midY, minZ, maxZ,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
lightIndices, numLights, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
input->arrays.lightAttenuationEnd,
|
||||
subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
|
||||
|
||||
// Recurse into subtiles
|
||||
_Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX , tileY,
|
||||
subtileLightIndices[0], subtileNumLights[0],
|
||||
framebuffer);
|
||||
_Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
|
||||
subtileLightIndices[1], subtileNumLights[1],
|
||||
framebuffer);
|
||||
_Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX , tileY + 1,
|
||||
subtileLightIndices[2], subtileNumLights[2],
|
||||
framebuffer);
|
||||
ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1,
|
||||
subtileLightIndices[3], subtileNumLights[3],
|
||||
framebuffer);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
|
||||
Framebuffer *framebuffer) {
|
||||
const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
|
||||
|
||||
// Get Z min/max for this tile
|
||||
int width = minMaxZTree->TileWidth(level);
|
||||
int height = minMaxZTree->TileHeight(level);
|
||||
float minZ = minMaxZTree->MinZ(level, tileX, tileY);
|
||||
float maxZ = minMaxZTree->MaxZ(level, tileX, tileY);
|
||||
|
||||
int startX = tileX * width;
|
||||
int startY = tileY * height;
|
||||
int endX = std::min(input->header.framebufferWidth, startX + width);
|
||||
int endY = std::min(input->header.framebufferHeight, startY + height);
|
||||
|
||||
// This is a root tile, so first do a full 6-plane cull
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
__declspec(align(ALIGNMENT_BYTES))
|
||||
#endif
|
||||
int lightIndices[MAX_LIGHTS]
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
__attribute__ ((aligned(ALIGNMENT_BYTES)))
|
||||
#endif
|
||||
;
|
||||
int numLights = ispc::IntersectLightsWithTileMinMax(
|
||||
startX, endX, startY, endY, minZ, maxZ,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight,
|
||||
input->header.cameraProj[0][0], input->header.cameraProj[1][1],
|
||||
MAX_LIGHTS, input->arrays.lightPositionView_x,
|
||||
input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
|
||||
input->arrays.lightAttenuationEnd, lightIndices);
|
||||
|
||||
// Now kick off the recursive process for this tile
|
||||
ShadeDynamicTileRecurse(input, level, tileX, tileY, lightIndices,
|
||||
numLights, framebuffer);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer)
|
||||
{
|
||||
MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
|
||||
|
||||
// Update min/max Z tree
|
||||
minMaxZTree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
|
||||
input->header.cameraProj[2][2], input->header.cameraProj[3][2],
|
||||
input->header.cameraNear, input->header.cameraFar);
|
||||
|
||||
// Launch the "root" tiles. Ideally these should at least fill the
|
||||
// machine... at the moment we have a static number of "levels" to the
|
||||
// mip tree but it might make sense to compute it based on the width of
|
||||
// the machine.
|
||||
int rootLevel = minMaxZTree->Levels() - 1;
|
||||
int rootTilesX = minMaxZTree->NumTilesX(rootLevel);
|
||||
int rootTilesY = minMaxZTree->NumTilesY(rootLevel);
|
||||
int rootTiles = rootTilesX * rootTilesY;
|
||||
_Cilk_for (int g = 0; g < rootTiles; ++g) {
|
||||
uint32_t tileY = g / rootTilesX;
|
||||
uint32_t tileX = g % rootTilesX;
|
||||
ShadeDynamicTile(input, rootLevel, tileX, tileY, framebuffer);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // __cilkplusplus
|
||||
717
examples/deferred/kernels.ispc
Normal file
@@ -0,0 +1,717 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "deferred.h"
|
||||
|
||||
struct InputDataArrays
|
||||
{
|
||||
uniform float zBuffer[];
|
||||
uniform unsigned int16 normalEncoded_x[]; // half float
|
||||
uniform unsigned int16 normalEncoded_y[]; // half float
|
||||
uniform unsigned int16 specularAmount[]; // half float
|
||||
uniform unsigned int16 specularPower[]; // half float
|
||||
uniform unsigned int8 albedo_x[]; // unorm8
|
||||
uniform unsigned int8 albedo_y[]; // unorm8
|
||||
uniform unsigned int8 albedo_z[]; // unorm8
|
||||
uniform float lightPositionView_x[];
|
||||
uniform float lightPositionView_y[];
|
||||
uniform float lightPositionView_z[];
|
||||
uniform float lightAttenuationBegin[];
|
||||
uniform float lightColor_x[];
|
||||
uniform float lightColor_y[];
|
||||
uniform float lightColor_z[];
|
||||
uniform float lightAttenuationEnd[];
|
||||
};
|
||||
|
||||
struct InputHeader
|
||||
{
|
||||
uniform float cameraProj[4][4];
|
||||
uniform float cameraNear;
|
||||
uniform float cameraFar;
|
||||
|
||||
uniform int32 framebufferWidth;
|
||||
uniform int32 framebufferHeight;
|
||||
uniform int32 numLights;
|
||||
uniform int32 inputDataChunkSize;
|
||||
uniform int32 inputDataArrayOffsets[idaNum];
|
||||
};
|
||||
|
||||
export void foo(reference InputHeader h) { }
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Common utility routines
|
||||
|
||||
static inline float
|
||||
dot3(float x, float y, float z, float a, float b, float c) {
|
||||
return (x*a + y*b + z*c);
|
||||
}
|
||||
|
||||
|
||||
static inline void
|
||||
normalize3(float x, float y, float z, reference float ox,
|
||||
reference float oy, reference float oz) {
|
||||
float n = rsqrt(x*x + y*y + z*z);
|
||||
ox = x * n;
|
||||
oy = y * n;
|
||||
oz = z * n;
|
||||
}
|
||||
|
||||
|
||||
static inline float
|
||||
Unorm8ToFloat32(unsigned int8 u) {
|
||||
return (float)u * (1.0f / 255.0f);
|
||||
}
|
||||
|
||||
|
||||
static inline unsigned int8
|
||||
Float32ToUnorm8(float f) {
|
||||
return (unsigned int8)(f * 255.0f);
|
||||
}
|
||||
|
||||
|
||||
// tile width must be a multiple of programCount (SIMD size)
|
||||
static void
|
||||
ComputeZBounds(
|
||||
uniform int32 tileStartX, uniform int32 tileEndX,
|
||||
uniform int32 tileStartY, uniform int32 tileEndY,
|
||||
// G-buffer data
|
||||
uniform float zBuffer[],
|
||||
uniform int32 gBufferWidth,
|
||||
// Camera data
|
||||
uniform float cameraProj_33, uniform float cameraProj_43,
|
||||
uniform float cameraNear, uniform float cameraFar,
|
||||
// Output
|
||||
reference uniform float minZ,
|
||||
reference uniform float maxZ
|
||||
)
|
||||
{
|
||||
// Find Z bounds
|
||||
float laneMinZ = cameraFar;
|
||||
float laneMaxZ = cameraNear;
|
||||
for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
|
||||
for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
|
||||
// Unproject depth buffer Z value into view space
|
||||
float z = zBuffer[(y * gBufferWidth + x) + programIndex];
|
||||
float viewSpaceZ = cameraProj_43 / (z - cameraProj_33);
|
||||
|
||||
// Work out Z bounds for our samples
|
||||
// Avoid considering skybox/background or otherwise invalid pixels
|
||||
if ((viewSpaceZ < cameraFar) && (viewSpaceZ >= cameraNear)) {
|
||||
laneMinZ = min(laneMinZ, viewSpaceZ);
|
||||
laneMaxZ = max(laneMaxZ, viewSpaceZ);
|
||||
}
|
||||
}
|
||||
}
|
||||
minZ = reduce_min(laneMinZ);
|
||||
maxZ = reduce_max(laneMaxZ);
|
||||
}
|
||||
|
||||
|
||||
// tile width must be a multiple of programCount (SIMD size)
|
||||
// numLights must currently be a multiple of programCount (SIMD size)
|
||||
export uniform int32
|
||||
IntersectLightsWithTileMinMax(
|
||||
uniform int32 tileStartX, uniform int32 tileEndX,
|
||||
uniform int32 tileStartY, uniform int32 tileEndY,
|
||||
// Tile data
|
||||
uniform float minZ,
|
||||
uniform float maxZ,
|
||||
// G-buffer data
|
||||
uniform int32 gBufferWidth, uniform int32 gBufferHeight,
|
||||
// Camera data
|
||||
uniform float cameraProj_11, uniform float cameraProj_22,
|
||||
// Light Data
|
||||
uniform int32 numLights,
|
||||
uniform float light_positionView_x_array[],
|
||||
uniform float light_positionView_y_array[],
|
||||
uniform float light_positionView_z_array[],
|
||||
uniform float light_attenuationEnd_array[],
|
||||
// Output
|
||||
reference uniform int32 tileLightIndices[]
|
||||
)
|
||||
{
|
||||
uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
|
||||
uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
|
||||
|
||||
// Parallelize across frustum planes.
|
||||
// We really only have four side planes here, but write the code to
|
||||
// handle programCount > 4 robustly
|
||||
uniform float frustumPlanes_xy[programCount];
|
||||
uniform float frustumPlanes_z[programCount];
|
||||
|
||||
// TODO: If programIndex < 4 here? Don't care about masking off the
|
||||
// rest but if interleaving ("x2" modes) the other lanes should ideally
|
||||
// not be emitted...
|
||||
{
|
||||
// This one is totally constant over the whole screen... worth pulling it up at all?
|
||||
float frustumPlanes_xy_v;
|
||||
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
|
||||
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1, (cameraProj_11 * gBufferScale_x));
|
||||
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 2, (cameraProj_22 * gBufferScale_y));
|
||||
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 3, -(cameraProj_22 * gBufferScale_y));
|
||||
|
||||
float frustumPlanes_z_v;
|
||||
frustumPlanes_z_v = insert(frustumPlanes_z_v, 0, tileEndX - gBufferScale_x);
|
||||
frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, -tileStartX + gBufferScale_x);
|
||||
frustumPlanes_z_v = insert(frustumPlanes_z_v, 2, tileEndY - gBufferScale_y);
|
||||
frustumPlanes_z_v = insert(frustumPlanes_z_v, 3, -tileStartY + gBufferScale_y);
|
||||
|
||||
// Normalize
|
||||
float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v +
|
||||
frustumPlanes_z_v * frustumPlanes_z_v);
|
||||
frustumPlanes_xy_v *= norm;
|
||||
frustumPlanes_z_v *= norm;
|
||||
|
||||
// Save out for uniform use later
|
||||
frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
|
||||
frustumPlanes_z[programIndex] = frustumPlanes_z_v;
|
||||
}
|
||||
|
||||
uniform int32 tileNumLights = 0;
|
||||
|
||||
for (uniform int32 baseLightIndex = 0; baseLightIndex < numLights;
|
||||
baseLightIndex += programCount) {
|
||||
int32 lightIndex = baseLightIndex + programIndex;
|
||||
float light_positionView_z = light_positionView_z_array[lightIndex];
|
||||
float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
|
||||
float light_attenuationEndNeg = -light_attenuationEnd;
|
||||
|
||||
float d = light_positionView_z - minZ;
|
||||
bool inFrustum = (d >= light_attenuationEndNeg);
|
||||
|
||||
d = maxZ - light_positionView_z;
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
// This seems better than cif(!inFrustum) ccontinue; here since we
|
||||
// don't actually need to mask the rest of this function - this is
|
||||
// just a greedy early-out. Could also structure all of this as
|
||||
// nested if() statements, but this is a bit easier to read
|
||||
if (!any(inFrustum))
|
||||
continue;
|
||||
|
||||
float light_positionView_x = light_positionView_x_array[lightIndex];
|
||||
float light_positionView_y = light_positionView_y_array[lightIndex];
|
||||
|
||||
d = light_positionView_z * frustumPlanes_z[0] +
|
||||
light_positionView_x * frustumPlanes_xy[0];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
d = light_positionView_z * frustumPlanes_z[1] +
|
||||
light_positionView_x * frustumPlanes_xy[1];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
d = light_positionView_z * frustumPlanes_z[2] +
|
||||
light_positionView_y * frustumPlanes_xy[2];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
d = light_positionView_z * frustumPlanes_z[3] +
|
||||
light_positionView_y * frustumPlanes_xy[3];
|
||||
inFrustum = inFrustum && (d >= light_attenuationEndNeg);
|
||||
|
||||
// Pack and store intersecting lights
|
||||
cif (inFrustum) {
|
||||
tileNumLights += packed_store_active(tileLightIndices, tileNumLights,
|
||||
lightIndex);
|
||||
}
|
||||
}
|
||||
|
||||
return tileNumLights;
|
||||
}
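
// packed_store_active above appends the lightIndex values of the active lanes
// contiguously starting at tileLightIndices + tileNumLights and returns how
// many it wrote.  A scalar C++ model of that compaction step (hypothetical
// names, not the ispc standard library itself):
static int
PackedStoreActiveModel(const bool active[], const int values[], int laneCount,
                       int dst[], int offset) {
    int written = 0;
    for (int lane = 0; lane < laneCount; ++lane)
        if (active[lane])
            dst[offset + written++] = values[lane];   // keep only the surviving lights
    return written;                                   // caller adds this to its count
}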
|
||||
|
||||
|
||||
// tile width must be a multiple of programCount (SIMD size)
|
||||
// numLights must currently be a multiple of programCount (SIMD size)
|
||||
static uniform int32
|
||||
IntersectLightsWithTile(
|
||||
uniform int32 tileStartX, uniform int32 tileEndX,
|
||||
uniform int32 tileStartY, uniform int32 tileEndY,
|
||||
uniform int32 gBufferWidth, uniform int32 gBufferHeight,
|
||||
// G-buffer data
|
||||
uniform float zBuffer[],
|
||||
// Camera data
|
||||
uniform float cameraProj_11, uniform float cameraProj_22,
|
||||
uniform float cameraProj_33, uniform float cameraProj_43,
|
||||
uniform float cameraNear, uniform float cameraFar,
|
||||
// Light Data
|
||||
uniform int32 numLights,
|
||||
uniform float light_positionView_x_array[],
|
||||
uniform float light_positionView_y_array[],
|
||||
uniform float light_positionView_z_array[],
|
||||
uniform float light_attenuationEnd_array[],
|
||||
// Output
|
||||
reference uniform int32 tileLightIndices[]
|
||||
)
|
||||
{
|
||||
uniform float minZ, maxZ;
|
||||
ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY,
|
||||
zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, cameraNear, cameraFar,
|
||||
minZ, maxZ);
|
||||
|
||||
uniform int32 tileNumLights = IntersectLightsWithTileMinMax(
|
||||
tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
|
||||
gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
|
||||
MAX_LIGHTS, light_positionView_x_array, light_positionView_y_array,
|
||||
light_positionView_z_array, light_attenuationEnd_array,
|
||||
tileLightIndices);
|
||||
|
||||
return tileNumLights;
|
||||
}
|
||||
|
||||
|
||||
// tile width must be a multiple of programCount (SIMD size)
|
||||
export void
|
||||
ShadeTile(
|
||||
uniform int32 tileStartX, uniform int32 tileEndX,
|
||||
uniform int32 tileStartY, uniform int32 tileEndY,
|
||||
uniform int32 gBufferWidth, uniform int32 gBufferHeight,
|
||||
reference uniform InputDataArrays inputData,
|
||||
// Camera data
|
||||
uniform float cameraProj_11, uniform float cameraProj_22,
|
||||
uniform float cameraProj_33, uniform float cameraProj_43,
|
||||
// Light list
|
||||
reference uniform int32 tileLightIndices[],
|
||||
uniform int32 tileNumLights,
|
||||
// UI
|
||||
uniform bool visualizeLightCount,
|
||||
// Output
|
||||
reference uniform unsigned int8 framebuffer_r[],
|
||||
reference uniform unsigned int8 framebuffer_g[],
|
||||
reference uniform unsigned int8 framebuffer_b[]
|
||||
)
|
||||
{
|
||||
if (tileNumLights == 0 || visualizeLightCount) {
|
||||
uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
|
||||
for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
|
||||
for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
|
||||
int32 framebufferIndex = (y * gBufferWidth + x) + programIndex;
|
||||
framebuffer_r[framebufferIndex] = c;
|
||||
framebuffer_g[framebufferIndex] = c;
|
||||
framebuffer_b[framebufferIndex] = c;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
uniform float twoOverGBufferWidth = 2.0f / gBufferWidth;
|
||||
uniform float twoOverGBufferHeight = 2.0f / gBufferHeight;
|
||||
|
||||
for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
|
||||
uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
|
||||
|
||||
for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
|
||||
uniform int32 gBufferOffsetBase = y * gBufferWidth + x;
|
||||
int32 gBufferOffset = gBufferOffsetBase + programIndex;
|
||||
|
||||
// Reconstruct position and (negative) view vector from G-buffer
|
||||
float surface_positionView_x, surface_positionView_y, surface_positionView_z;
|
||||
float Vneg_x, Vneg_y, Vneg_z;
|
||||
|
||||
float z = inputData.zBuffer[gBufferOffset];
|
||||
|
||||
// Compute screen/clip-space position
|
||||
// NOTE: Mind DX11 viewport transform and pixel center!
|
||||
float positionScreen_x = (0.5f + (float)(x + programIndex)) *
|
||||
twoOverGBufferWidth - 1.0f;
|
||||
|
||||
// Unproject depth buffer Z value into view space
|
||||
surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
|
||||
surface_positionView_x = positionScreen_x * surface_positionView_z /
|
||||
cameraProj_11;
|
||||
surface_positionView_y = positionScreen_y * surface_positionView_z /
|
||||
cameraProj_22;
|
||||
|
||||
// We actually end up with a vector pointing *at* the
|
||||
// surface (i.e. the negative view vector)
|
||||
normalize3(surface_positionView_x, surface_positionView_y,
|
||||
surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
|
||||
|
||||
// Reconstruct normal from G-buffer
|
||||
float surface_normal_x, surface_normal_y, surface_normal_z;
|
||||
float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
|
||||
float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
|
||||
|
||||
float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
|
||||
float m = sqrt(4.0f * f - 1.0f);
|
||||
|
||||
surface_normal_x = m * (4.0f * normal_x - 2.0f);
|
||||
surface_normal_y = m * (4.0f * normal_y - 2.0f);
|
||||
surface_normal_z = 3.0f - 8.0f * f;
|
||||
|
||||
// Load other G-buffer parameters
|
||||
float surface_specularAmount =
|
||||
half_to_float_fast(inputData.specularAmount[gBufferOffset]);
|
||||
float surface_specularPower =
|
||||
half_to_float_fast(inputData.specularPower[gBufferOffset]);
|
||||
float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
|
||||
float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
|
||||
float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
|
||||
|
||||
float lit_x = 0.0f;
|
||||
float lit_y = 0.0f;
|
||||
float lit_z = 0.0f;
|
||||
for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights;
|
||||
++tileLightIndex) {
|
||||
uniform int32 lightIndex = tileLightIndices[tileLightIndex];
|
||||
|
||||
// Gather light data relevant to initial culling
|
||||
uniform float light_positionView_x =
|
||||
inputData.lightPositionView_x[lightIndex];
|
||||
uniform float light_positionView_y =
|
||||
inputData.lightPositionView_y[lightIndex];
|
||||
uniform float light_positionView_z =
|
||||
inputData.lightPositionView_z[lightIndex];
|
||||
uniform float light_attenuationEnd =
|
||||
inputData.lightAttenuationEnd[lightIndex];
|
||||
|
||||
// Compute light vector
|
||||
float L_x = light_positionView_x - surface_positionView_x;
|
||||
float L_y = light_positionView_y - surface_positionView_y;
|
||||
float L_z = light_positionView_z - surface_positionView_z;
|
||||
|
||||
float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
|
||||
|
||||
// Clip at end of attenuation
float light_attenuationEnd2 = light_attenuationEnd * light_attenuationEnd;

cif (distanceToLight2 < light_attenuationEnd2) {
|
||||
float distanceToLight = sqrt(distanceToLight2);
|
||||
|
||||
// HLSL "rcp" is allowed to be fairly inaccurate
|
||||
float distanceToLightRcp = rcp(distanceToLight);
|
||||
L_x *= distanceToLightRcp;
|
||||
L_y *= distanceToLightRcp;
|
||||
L_z *= distanceToLightRcp;
|
||||
|
||||
// Start computing brdf
|
||||
float NdotL = dot3(surface_normal_x, surface_normal_y,
|
||||
surface_normal_z, L_x, L_y, L_z);
|
||||
|
||||
// Clip back facing
|
||||
cif (NdotL > 0.0f) {
|
||||
uniform float light_attenuationBegin =
|
||||
inputData.lightAttenuationBegin[lightIndex];
|
||||
|
||||
// Light distance attenuation (linstep)
|
||||
float lightRange = (light_attenuationEnd - light_attenuationBegin);
|
||||
float falloffPosition = (light_attenuationEnd - distanceToLight);
|
||||
float attenuation = min(falloffPosition / lightRange, 1.0f);
|
||||
|
||||
float H_x = (L_x - Vneg_x);
|
||||
float H_y = (L_y - Vneg_y);
|
||||
float H_z = (L_z - Vneg_z);
|
||||
normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
|
||||
|
||||
float NdotH = dot3(surface_normal_x, surface_normal_y,
|
||||
surface_normal_z, H_x, H_y, H_z);
|
||||
NdotH = max(NdotH, 0.0f);
|
||||
|
||||
float specular = pow(NdotH, surface_specularPower);
|
||||
float specularNorm = (surface_specularPower + 2.0f) *
|
||||
(1.0f / 8.0f);
|
||||
float specularContrib = surface_specularAmount *
|
||||
specularNorm * specular;
|
||||
|
||||
float k = attenuation * NdotL * (1.0f + specularContrib);
|
||||
|
||||
uniform float light_color_x = inputData.lightColor_x[lightIndex];
|
||||
uniform float light_color_y = inputData.lightColor_y[lightIndex];
|
||||
uniform float light_color_z = inputData.lightColor_z[lightIndex];
|
||||
|
||||
float lightContrib_x = surface_albedo_x * light_color_x;
|
||||
float lightContrib_y = surface_albedo_y * light_color_y;
|
||||
float lightContrib_z = surface_albedo_z * light_color_z;
|
||||
|
||||
lit_x += lightContrib_x * k;
|
||||
lit_y += lightContrib_y * k;
|
||||
lit_z += lightContrib_z * k;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Gamma correct
|
||||
// These pows are pretty slow right now, but we can do
|
||||
// something faster if really necessary to squeeze every
|
||||
// last bit of performance out of it
|
||||
float gamma = 1.0 / 2.2f;
|
||||
lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
|
||||
lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
|
||||
lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
|
||||
|
||||
framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
|
||||
framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
|
||||
framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
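
ShadeTile reconstructs the view-space position from the depth buffer (z = cameraProj_43 / (depth - cameraProj_33), then x and y from the screen-space position divided by the projection diagonal), decodes the normal, and accumulates a distance-attenuated, normalized Blinn-Phong term per light before gamma correction. Below is a scalar C++ sketch of the per-light factor k computed inside the loop above; the Vec3 helper and function names are hypothetical, and V is the unit vector toward the camera, i.e. the negation of Vneg in the kernel.

```cpp
#include <algorithm>
#include <cmath>

struct Vec3 { float x, y, z; };

static float Dot(Vec3 a, Vec3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; }

static Vec3 Normalize(Vec3 v) {
    float inv = 1.0f / std::sqrt(Dot(v, v));
    return {v.x * inv, v.y * inv, v.z * inv};
}

// Returns k = attenuation * NdotL * (1 + specularContrib); the kernel then
// multiplies k into surface_albedo * light_color and accumulates into lit_*.
static float BlinnPhongFactor(Vec3 N, Vec3 V, Vec3 L, float distanceToLight,
                              float attenuationBegin, float attenuationEnd,
                              float specularAmount, float specularPower) {
    float NdotL = Dot(N, L);
    if (NdotL <= 0.0f || distanceToLight >= attenuationEnd)
        return 0.0f;                                  // back-facing or out of range

    // Linear falloff ("linstep") between attenuationBegin and attenuationEnd.
    float attenuation = std::min((attenuationEnd - distanceToLight) /
                                 (attenuationEnd - attenuationBegin), 1.0f);

    // Half vector and normalized Blinn-Phong lobe; (power + 2) / 8 matches the
    // specularNorm factor in the kernel.
    Vec3 H = Normalize({L.x + V.x, L.y + V.y, L.z + V.z});
    float NdotH = std::max(Dot(N, H), 0.0f);
    float specularContrib = specularAmount * ((specularPower + 2.0f) / 8.0f) *
                            std::pow(NdotH, specularPower);

    return attenuation * NdotL * (1.0f + specularContrib);
}
```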
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Static decomposition
|
||||
|
||||
task void
|
||||
RenderTile(uniform int g, uniform int num_groups_x, uniform int num_groups_y,
|
||||
reference uniform InputHeader inputHeader,
|
||||
reference uniform InputDataArrays inputData,
|
||||
uniform int visualizeLightCount,
|
||||
// Output
|
||||
reference uniform unsigned int8 framebuffer_r[],
|
||||
reference uniform unsigned int8 framebuffer_g[],
|
||||
reference uniform unsigned int8 framebuffer_b[]) {
|
||||
uniform int32 group_y = g / num_groups_x;
|
||||
uniform int32 group_x = g % num_groups_x;
|
||||
uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH;
|
||||
uniform int32 tile_start_y = group_y * MIN_TILE_HEIGHT;
|
||||
uniform int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH;
|
||||
uniform int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT;
|
||||
|
||||
uniform int sTileNumLights = 0;
|
||||
uniform int sTileLightIndices[MAX_LIGHTS]; // Light list for the tile
|
||||
|
||||
uniform int framebufferWidth = inputHeader.framebufferWidth;
|
||||
uniform int framebufferHeight = inputHeader.framebufferHeight;
|
||||
uniform float cameraProj_00 = inputHeader.cameraProj[0][0];
|
||||
uniform float cameraProj_11 = inputHeader.cameraProj[1][1];
|
||||
uniform float cameraProj_22 = inputHeader.cameraProj[2][2];
|
||||
uniform float cameraProj_32 = inputHeader.cameraProj[3][2];
|
||||
|
||||
// Light intersection
|
||||
sTileNumLights =
|
||||
IntersectLightsWithTile(tile_start_x, tile_end_x,
|
||||
tile_start_y, tile_end_y,
|
||||
framebufferWidth, framebufferHeight,
|
||||
inputData.zBuffer,
|
||||
cameraProj_00, cameraProj_11,
|
||||
cameraProj_22, cameraProj_32,
|
||||
inputHeader.cameraNear, inputHeader.cameraFar,
|
||||
MAX_LIGHTS,
|
||||
inputData.lightPositionView_x,
|
||||
inputData.lightPositionView_y,
|
||||
inputData.lightPositionView_z,
|
||||
inputData.lightAttenuationEnd,
|
||||
sTileLightIndices);
|
||||
|
||||
ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
|
||||
framebufferWidth, framebufferHeight, inputData,
|
||||
cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
|
||||
sTileLightIndices, sTileNumLights, visualizeLightCount,
|
||||
framebuffer_r, framebuffer_g, framebuffer_b);
|
||||
}
|
||||
|
||||
|
||||
export void
|
||||
RenderStatic(reference uniform InputHeader inputHeader,
|
||||
reference uniform InputDataArrays inputData,
|
||||
uniform int visualizeLightCount,
|
||||
// Output
|
||||
reference uniform unsigned int8 framebuffer_r[],
|
||||
reference uniform unsigned int8 framebuffer_g[],
|
||||
reference uniform unsigned int8 framebuffer_b[]) {
|
||||
uniform int num_groups_x = (inputHeader.framebufferWidth +
|
||||
MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
|
||||
uniform int num_groups_y = (inputHeader.framebufferHeight +
|
||||
MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
|
||||
uniform int num_groups = num_groups_x * num_groups_y;
|
||||
|
||||
for (uniform int g = 0; g < num_groups; ++g)
|
||||
launch < RenderTile(g, num_groups_x, num_groups_y,
|
||||
inputHeader, inputData, visualizeLightCount,
|
||||
framebuffer_r, framebuffer_g, framebuffer_b) >;
|
||||
}
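
RenderStatic uses a fixed decomposition: the framebuffer is cut into MIN_TILE_WIDTH x MIN_TILE_HEIGHT tiles (rounding the tile counts up) and one task is launched per tile, with RenderTile recovering its tile rectangle from the flat group index g. A small C++ sketch of the same index math follows; the tile dimensions here are assumed values, not the example's actual constants.

```cpp
#include <cstdio>

// Assumed tile size for illustration; the example defines its own
// MIN_TILE_WIDTH / MIN_TILE_HEIGHT.
constexpr int kTileWidth = 16;
constexpr int kTileHeight = 16;

struct TileRect { int x0, y0, x1, y1; };

// Map a flat group index g to its tile rectangle, as RenderTile does with
// group_x / group_y.
static TileRect TileFromGroupIndex(int g, int numGroupsX) {
    int gy = g / numGroupsX;
    int gx = g % numGroupsX;
    return {gx * kTileWidth, gy * kTileHeight,
            gx * kTileWidth + kTileWidth, gy * kTileHeight + kTileHeight};
}

int main() {
    int width = 1024, height = 768;
    int numGroupsX = (width + kTileWidth - 1) / kTileWidth;    // round up
    int numGroupsY = (height + kTileHeight - 1) / kTileHeight;
    for (int g = 0; g < numGroupsX * numGroupsY && g < 3; ++g) {
        TileRect t = TileFromGroupIndex(g, numGroupsX);
        std::printf("tile %d: (%d,%d)-(%d,%d)\n", g, t.x0, t.y0, t.x1, t.y1);
    }
    return 0;
}
```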
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Routines for dynamic decomposition path
|
||||
|
||||
// tile width must be a multiple of programCount (SIMD size)
|
||||
export void
|
||||
ComputeZBoundsRow(
|
||||
uniform int32 tileY,
|
||||
uniform int32 tileWidth, uniform int32 tileHeight,
|
||||
uniform int32 numTilesX, uniform int32 numTilesY,
|
||||
// G-buffer data
|
||||
uniform float zBuffer[],
|
||||
uniform int32 gBufferWidth,
|
||||
// Camera data
|
||||
uniform float cameraProj_33, uniform float cameraProj_43,
|
||||
uniform float cameraNear, uniform float cameraFar,
|
||||
// Output
|
||||
reference uniform float minZArray[],
|
||||
reference uniform float maxZArray[]
|
||||
)
|
||||
{
|
||||
for (uniform int32 tileX = 0; tileX < numTilesX; ++tileX) {
|
||||
uniform float minZ, maxZ;
|
||||
ComputeZBounds(
|
||||
tileX * tileWidth, tileX * tileWidth + tileWidth,
|
||||
tileY * tileHeight, tileY * tileHeight + tileHeight,
|
||||
zBuffer, gBufferWidth,
|
||||
cameraProj_33, cameraProj_43, cameraNear, cameraFar,
|
||||
minZ, maxZ);
|
||||
minZArray[tileX] = minZ;
|
||||
maxZArray[tileX] = maxZ;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// numLights need not be a multiple of programCount here, but the input and output arrays
|
||||
// should be able to handle programCount-sized load/stores.
|
||||
export void
|
||||
SplitTileMinMax(
|
||||
uniform int32 tileMidX, uniform int32 tileMidY,
|
||||
// Subtile data (00, 10, 01, 11)
|
||||
uniform float subtileMinZ[],
|
||||
uniform float subtileMaxZ[],
|
||||
// G-buffer data
|
||||
uniform int32 gBufferWidth, uniform int32 gBufferHeight,
|
||||
// Camera data
|
||||
uniform float cameraProj_11, uniform float cameraProj_22,
|
||||
// Light Data
|
||||
reference uniform int32 lightIndices[],
|
||||
uniform int32 numLights,
|
||||
uniform float light_positionView_x_array[],
|
||||
uniform float light_positionView_y_array[],
|
||||
uniform float light_positionView_z_array[],
|
||||
uniform float light_attenuationEnd_array[],
|
||||
// Outputs
|
||||
// TODO: ISPC doesn't currently like multidimensional arrays so we'll do the
// indexing math ourselves
|
||||
reference uniform int32 subtileIndices[],
|
||||
uniform int32 subtileIndicesPitch,
|
||||
reference uniform int32 subtileNumLights[]
|
||||
)
|
||||
{
|
||||
uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
|
||||
uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
|
||||
|
||||
// Parallelize across frustum planes
// Only have 2 frustum split planes here so may not be worth it, but
// we'll do it for now for consistency
|
||||
uniform float frustumPlanes_xy[programCount];
|
||||
uniform float frustumPlanes_z[programCount];
|
||||
|
||||
// This one is totally constant over the whole screen... worth pulling it up at all?
|
||||
float frustumPlanes_xy_v;
|
||||
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
|
||||
frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1, (cameraProj_22 * gBufferScale_y));
|
||||
|
||||
float frustumPlanes_z_v;
|
||||
frustumPlanes_z_v = insert(frustumPlanes_z_v, 0, tileMidX - gBufferScale_x);
|
||||
frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, tileMidY - gBufferScale_y);
|
||||
|
||||
// Normalize
|
||||
float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v +
|
||||
frustumPlanes_z_v * frustumPlanes_z_v);
|
||||
frustumPlanes_xy_v *= norm;
|
||||
frustumPlanes_z_v *= norm;
|
||||
|
||||
// Save out for uniform use later
|
||||
frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
|
||||
frustumPlanes_z[programIndex] = frustumPlanes_z_v;
|
||||
|
||||
// Initialize
|
||||
uniform int32 subtileLightOffset[4];
|
||||
subtileLightOffset[0] = 0 * subtileIndicesPitch;
|
||||
subtileLightOffset[1] = 1 * subtileIndicesPitch;
|
||||
subtileLightOffset[2] = 2 * subtileIndicesPitch;
|
||||
subtileLightOffset[3] = 3 * subtileIndicesPitch;
|
||||
|
||||
for (int32 i = programIndex; i < numLights; i += programCount) {
|
||||
// TODO: ISPC says gather required here when it actually
// isn't... this could be fixed by nesting an if() within a
// uniform loop, but I'm not totally sure if that's a win
// overall. For now we'll just eat the perf cost for cleanliness
// since the below are real gathers anyways.
|
||||
int32 lightIndex = lightIndices[i];
|
||||
|
||||
float light_positionView_x = light_positionView_x_array[lightIndex];
|
||||
float light_positionView_y = light_positionView_y_array[lightIndex];
|
||||
float light_positionView_z = light_positionView_z_array[lightIndex];
|
||||
float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
|
||||
float light_attenuationEndNeg = -light_attenuationEnd;
|
||||
|
||||
// Test lights against subtile z bounds
|
||||
bool inFrustum[4];
|
||||
inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
|
||||
(subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
|
||||
inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) &&
|
||||
(subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg);
|
||||
inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) &&
|
||||
(subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg);
|
||||
inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) &&
|
||||
(subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg);
|
||||
|
||||
float dx = light_positionView_z * frustumPlanes_z[0] +
|
||||
light_positionView_x * frustumPlanes_xy[0];
|
||||
float dy = light_positionView_z * frustumPlanes_z[1] +
|
||||
light_positionView_y * frustumPlanes_xy[1];
|
||||
|
||||
cif (abs(dx) > light_attenuationEnd) {
|
||||
bool positiveX = dx > 0.0f;
|
||||
inFrustum[0] = inFrustum[0] && positiveX; // 00 subtile
|
||||
inFrustum[1] = inFrustum[1] && !positiveX; // 10 subtile
|
||||
inFrustum[2] = inFrustum[2] && positiveX; // 01 subtile
|
||||
inFrustum[3] = inFrustum[3] && !positiveX; // 11 subtile
|
||||
}
|
||||
cif (abs(dy) > light_attenuationEnd) {
|
||||
bool positiveY = dy > 0.0f;
|
||||
inFrustum[0] = inFrustum[0] && positiveY; // 00 subtile
|
||||
inFrustum[1] = inFrustum[1] && positiveY; // 10 subtile
|
||||
inFrustum[2] = inFrustum[2] && !positiveY; // 01 subtile
|
||||
inFrustum[3] = inFrustum[3] && !positiveY; // 11 subtile
|
||||
}
|
||||
|
||||
// Pack and store intersecting lights
|
||||
// TODO: Experiment with a loop here instead
|
||||
cif (inFrustum[0])
|
||||
subtileLightOffset[0] += packed_store_active(subtileIndices,
|
||||
subtileLightOffset[0],
|
||||
lightIndex);
|
||||
cif (inFrustum[1])
|
||||
subtileLightOffset[1] += packed_store_active(subtileIndices,
|
||||
subtileLightOffset[1],
|
||||
lightIndex);
|
||||
cif (inFrustum[2])
|
||||
subtileLightOffset[2] += packed_store_active(subtileIndices,
|
||||
subtileLightOffset[2],
|
||||
lightIndex);
|
||||
cif (inFrustum[3])
|
||||
subtileLightOffset[3] += packed_store_active(subtileIndices,
|
||||
subtileLightOffset[3],
|
||||
lightIndex);
|
||||
}
|
||||
|
||||
subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch;
|
||||
subtileNumLights[1] = subtileLightOffset[1] - 1 * subtileIndicesPitch;
|
||||
subtileNumLights[2] = subtileLightOffset[2] - 2 * subtileIndicesPitch;
|
||||
subtileNumLights[3] = subtileLightOffset[3] - 3 * subtileIndicesPitch;
|
||||
}
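
SplitTileMinMax refines a tile's light list into its four 2x2 subtiles: the Z-range test is redone per subtile, and the signed distances dx/dy to the two split planes are used only when the whole sphere lies on one side of a plane (|d| > attenuationEnd), in which case the two subtiles on the far side are ruled out. A scalar C++ sketch of that classification for one light:

```cpp
#include <cmath>

// Scalar sketch of the subtile classification above for a single light.
// inFrustum[0..3] correspond to the 00, 10, 01 and 11 subtiles and already hold
// the per-subtile Z test results; dx/dy are the signed distances to the x and y
// split planes, as computed in the kernel.
static void ClassifyLightIntoSubtiles(float dx, float dy, float attenuationEnd,
                                      bool inFrustum[4]) {
    if (std::fabs(dx) > attenuationEnd) {
        // Sphere lies entirely on one side of the x split plane.
        bool positiveX = dx > 0.0f;
        inFrustum[0] = inFrustum[0] && positiveX;    // 00 subtile
        inFrustum[1] = inFrustum[1] && !positiveX;   // 10 subtile
        inFrustum[2] = inFrustum[2] && positiveX;    // 01 subtile
        inFrustum[3] = inFrustum[3] && !positiveX;   // 11 subtile
    }
    if (std::fabs(dy) > attenuationEnd) {
        // Same idea for the y split plane.
        bool positiveY = dy > 0.0f;
        inFrustum[0] = inFrustum[0] && positiveY;
        inFrustum[1] = inFrustum[1] && positiveY;
        inFrustum[2] = inFrustum[2] && !positiveY;
        inFrustum[3] = inFrustum[3] && !positiveY;
    }
}
```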
|
||||
137
examples/deferred/main.cpp
Normal file
@@ -0,0 +1,137 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define ISPC_IS_WINDOWS
|
||||
#define NOMINMAX
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include <stdint.h>
|
||||
#include <algorithm>
|
||||
#include <assert.h>
|
||||
#include <vector>
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <windows.h>
|
||||
#endif
|
||||
#include "deferred.h"
|
||||
#include "kernels_ispc.h"
|
||||
#include "../timing.h"
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
if (argc != 2) {
|
||||
printf("usage: deferred_shading <input_file>\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
InputData *input = CreateInputDataFromFile(argv[1]);
|
||||
if (!input) {
|
||||
printf("Failed to load input file \"%s\"!\n", argv[1]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
Framebuffer framebuffer(input->header.framebufferWidth,
|
||||
input->header.framebufferHeight);
|
||||
|
||||
InitDynamicC(input);
|
||||
#ifdef __cilkplusplus
|
||||
InitDynamicCilk(input);
|
||||
#endif // __cilkplusplus
|
||||
|
||||
int nframes = 5;
|
||||
double ispcCycles = 1e30;
|
||||
for (int i = 0; i < 5; ++i) {
|
||||
framebuffer.clear();
|
||||
reset_and_start_timer();
|
||||
for (int j = 0; j < nframes; ++j)
|
||||
ispc::RenderStatic(&input->header, &input->arrays,
|
||||
VISUALIZE_LIGHT_COUNT,
|
||||
framebuffer.r, framebuffer.g, framebuffer.b);
|
||||
double mcycles = get_elapsed_mcycles() / nframes;
|
||||
ispcCycles = std::min(ispcCycles, mcycles);
|
||||
}
|
||||
printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render "
|
||||
"%d x %d image\n", ispcCycles,
|
||||
input->header.framebufferWidth, input->header.framebufferHeight);
|
||||
WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
|
||||
|
||||
double serialCycles = 1e30;
|
||||
for (int i = 0; i < 5; ++i) {
|
||||
framebuffer.clear();
|
||||
reset_and_start_timer();
|
||||
for (int j = 0; j < nframes; ++j)
|
||||
DispatchDynamicC(input, &framebuffer);
|
||||
double mcycles = get_elapsed_mcycles() / nframes;
|
||||
serialCycles = std::min(serialCycles, mcycles);
|
||||
}
|
||||
printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles\n",
|
||||
serialCycles);
|
||||
WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer);
|
||||
|
||||
#ifdef __cilkplusplus
|
||||
double dynamicCilkCycles = 1e30;
|
||||
for (int i = 0; i < 5; ++i) {
|
||||
framebuffer.clear();
|
||||
reset_and_start_timer();
|
||||
for (int j = 0; j < nframes; ++j)
|
||||
DispatchDynamicCilk(input, &framebuffer);
|
||||
double mcycles = get_elapsed_mcycles() / nframes;
|
||||
dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles);
|
||||
}
|
||||
printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles\n",
|
||||
dynamicCilkCycles);
|
||||
WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer);
|
||||
|
||||
printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n",
|
||||
serialCycles/ispcCycles, serialCycles/dynamicCilkCycles);
|
||||
#else
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", serialCycles/ispcCycles);
|
||||
#endif // __cilkplusplus
|
||||
|
||||
DeleteInputData(input);
|
||||
|
||||
return 0;
|
||||
}
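
main() measures each path by rendering nframes frames per run, repeating the run five times, and reporting the minimum of the per-frame averages, which filters out scheduler and cache noise. A self-contained C++ sketch of the same pattern, with std::chrono standing in for reset_and_start_timer()/get_elapsed_mcycles() and a dummy workload in place of the renderers:

```cpp
#include <algorithm>
#include <chrono>
#include <cstdio>
#include <functional>

// "Minimum over repeated runs of the per-frame average" timing pattern used in
// main(), expressed with std::chrono instead of the example's cycle counter.
static double BenchmarkMs(const std::function<void()> &renderFrame,
                          int nframes = 5, int nruns = 5) {
    double best = 1e30;
    for (int run = 0; run < nruns; ++run) {
        auto start = std::chrono::steady_clock::now();
        for (int f = 0; f < nframes; ++f)
            renderFrame();
        std::chrono::duration<double, std::milli> elapsed =
            std::chrono::steady_clock::now() - start;
        best = std::min(best, elapsed.count() / nframes);  // per-frame average
    }
    return best;  // keep the fastest run, as the example does
}

int main() {
    double sink = 0.0;
    double ms = BenchmarkMs([&] {                 // placeholder workload
        for (int i = 0; i < 1000000; ++i) sink += i * 0.5;
    });
    std::printf("best per-frame time: %.3f ms (sink=%g)\n", ms, sink);
    return 0;
}
```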
|
||||
@@ -15,8 +15,14 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mandelbrot_tasks", "mandelb
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "aobench_instrumented", "aobench_instrumented\aobench_instrumented.vcxproj", "{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}"
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "noise", "noise\noise.vcxproj", "{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}"
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "noise", "noise\noise.vcxproj", "{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}"
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "volume", "volume_rendering\volume.vcxproj", "{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}"
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "stencil", "stencil\stencil.vcxproj", "{2EF070A1-F62F-4E6A-944B-88D140945C3C}"
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "deferred_shading", "deferred\deferred_shading.vcxproj", "{87F53C53-957E-4E91-878A-BC27828FB9EB}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|Win32 = Debug|Win32
|
||||
@@ -81,14 +87,38 @@ Global
|
||||
{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|Win32.Build.0 = Release|Win32
|
||||
{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|x64.ActiveCfg = Release|x64
|
||||
{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|x64.Build.0 = Release|x64
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|Win32.Build.0 = Debug|Win32
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|x64.Build.0 = Debug|x64
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|Win32.Build.0 = Release|Win32
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|x64.ActiveCfg = Release|x64
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|x64.Build.0 = Release|x64
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|Win32.Build.0 = Debug|Win32
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|x64.Build.0 = Debug|x64
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|Win32.Build.0 = Release|Win32
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|x64.ActiveCfg = Release|x64
|
||||
{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|x64.Build.0 = Release|x64
|
||||
{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Debug|Win32.Build.0 = Debug|Win32
|
||||
{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Debug|x64.Build.0 = Debug|x64
|
||||
{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Release|Win32.Build.0 = Release|Win32
|
||||
{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Release|x64.ActiveCfg = Release|x64
|
||||
{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Release|x64.Build.0 = Release|x64
|
||||
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Debug|Win32.Build.0 = Debug|Win32
|
||||
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Debug|x64.Build.0 = Debug|x64
|
||||
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|Win32.Build.0 = Release|Win32
|
||||
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|x64.ActiveCfg = Release|x64
|
||||
{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|x64.Build.0 = Release|x64
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|Win32.Build.0 = Debug|Win32
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|x64.Build.0 = Debug|x64
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.Build.0 = Release|Win32
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.ActiveCfg = Release|x64
|
||||
{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.Build.0 = Release|x64
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
|
||||
@@ -64,6 +64,7 @@ writePPM(int *buf, int width, int height, const char *fn) {
|
||||
fputc(c, fp);
|
||||
}
|
||||
fclose(fp);
|
||||
printf("Wrote image file %s\n", fn);
|
||||
}
|
||||
|
||||
|
||||
|
||||
0
examples/mandelbrot/mandelbrot.vcxproj
Executable file → Normal file
@@ -36,7 +36,7 @@ static int mandel(float c_re, float c_im, int count) {
|
||||
float z_re = c_re, z_im = c_im;
|
||||
int i;
|
||||
for (i = 0; i < count; ++i) {
|
||||
if (z_re * z_re + z_im * z_im > 4.)
|
||||
if (z_re * z_re + z_im * z_im > 4.f)
|
||||
break;
|
||||
|
||||
float new_re = z_re*z_re - z_im*z_im;
|
||||
|
||||
@@ -1,18 +1,12 @@
|
||||
|
||||
ARCH = $(shell uname)
|
||||
|
||||
TASK_CXX=tasks_pthreads.cpp
|
||||
TASK_CXX=../tasksys.cpp
|
||||
TASK_LIB=-lpthread
|
||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
|
||||
ifeq ($(ARCH), Darwin)
|
||||
TASK_CXX=tasks_gcd.cpp
|
||||
TASK_LIB=
|
||||
endif
|
||||
|
||||
TASK_OBJ=$(addprefix objs/, $(TASK_CXX:.cpp=.o))
|
||||
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||
|
||||
@@ -32,6 +26,9 @@ mandelbrot: dirs objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/%.o: ../%.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/mandelbrot.o: objs/mandelbrot_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
|
||||
@@ -40,6 +40,7 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include <string.h>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
#include "mandelbrot_ispc.h"
|
||||
@@ -64,6 +65,7 @@ writePPM(int *buf, int width, int height, const char *fn) {
|
||||
fputc(c, fp);
|
||||
}
|
||||
fclose(fp);
|
||||
printf("Wrote image file %s\n", fn);
|
||||
}
|
||||
|
||||
|
||||
@@ -98,8 +100,12 @@ ensureTargetISAIsSupported() {
|
||||
}
|
||||
}
|
||||
|
||||
static void usage() {
|
||||
fprintf(stderr, "usage: mandelbrot [--scale=<factor>]\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
int main() {
|
||||
int main(int argc, char *argv[]) {
|
||||
unsigned int width = 1536;
|
||||
unsigned int height = 1024;
|
||||
float x0 = -2;
|
||||
@@ -107,10 +113,26 @@ int main() {
|
||||
float y0 = -1;
|
||||
float y1 = 1;
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
if (argc == 1)
|
||||
;
|
||||
else if (argc == 2) {
|
||||
if (strncmp(argv[1], "--scale=", 8) == 0) {
|
||||
float scale = atof(argv[1] + 8);
|
||||
if (scale == 0.f)
|
||||
usage();
|
||||
width *= scale;
|
||||
height *= scale;
|
||||
// round up to multiples of 16
|
||||
width = (width + 0xf) & ~0xf;
|
||||
height = (height + 0xf) & ~0xf;
|
||||
}
|
||||
else
|
||||
usage();
|
||||
}
|
||||
else
|
||||
usage();
|
||||
|
||||
extern void TasksInit();
|
||||
TasksInit();
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
int maxIterations = 512;
|
||||
int *buf = new int[width*height];
|
||||
@@ -121,6 +143,9 @@ int main() {
|
||||
//
|
||||
double minISPC = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
// Clear out the buffer
|
||||
for (unsigned int i = 0; i < width * height; ++i)
|
||||
buf[i] = 0;
|
||||
reset_and_start_timer();
|
||||
mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
|
||||
double dt = get_elapsed_mcycles();
|
||||
@@ -130,9 +155,6 @@ int main() {
|
||||
printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC);
|
||||
writePPM(buf, width, height, "mandelbrot-ispc.ppm");
|
||||
|
||||
// Clear out the buffer
|
||||
for (unsigned int i = 0; i < width * height; ++i)
|
||||
buf[i] = 0;
|
||||
|
||||
//
|
||||
// And run the serial implementation 3 times, again reporting the
|
||||
@@ -140,6 +162,9 @@ int main() {
|
||||
//
|
||||
double minSerial = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
// Clear out the buffer
|
||||
for (unsigned int i = 0; i < width * height; ++i)
|
||||
buf[i] = 0;
|
||||
reset_and_start_timer();
|
||||
mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf);
|
||||
double dt = get_elapsed_mcycles();
|
||||
|
||||
@@ -53,11 +53,14 @@ mandel(float c_re, float c_im, int count) {
|
||||
[ystart,yend).
|
||||
*/
|
||||
task void
|
||||
mandelbrot_scanlines(uniform int ystart, uniform int yend,
|
||||
mandelbrot_scanlines(uniform int ybase, uniform int span,
|
||||
uniform float x0, uniform float dx,
|
||||
uniform float y0, uniform float dy,
|
||||
uniform int width, uniform int maxIterations,
|
||||
reference uniform int output[]) {
|
||||
uniform int ystart = ybase + taskIndex * span;
|
||||
uniform int yend = ystart + span;
|
||||
|
||||
for (uniform int j = ystart; j < yend; ++j) {
|
||||
for (uniform int i = 0; i < width; i += programCount) {
|
||||
float x = x0 + (programIndex + i) * dx;
|
||||
@@ -70,6 +73,20 @@ mandelbrot_scanlines(uniform int ystart, uniform int yend,
|
||||
}
|
||||
|
||||
|
||||
task void
|
||||
mandelbrot_chunk(uniform float x0, uniform float dx,
|
||||
uniform float y0, uniform float dy,
|
||||
uniform int width, uniform int height,
|
||||
uniform int maxIterations, reference uniform int output[]) {
|
||||
uniform int ystart = taskIndex * (height/taskCount);
|
||||
uniform int yend = (taskIndex+1) * (height/taskCount);
|
||||
uniform int span = 1;
|
||||
|
||||
launch[(yend-ystart)/span] < mandelbrot_scanlines(ystart, span, x0, dx, y0, dy,
|
||||
width, maxIterations, output) >;
|
||||
}
|
||||
|
||||
|
||||
export void
|
||||
mandelbrot_ispc(uniform float x0, uniform float y0,
|
||||
uniform float x1, uniform float y1,
|
||||
@@ -78,9 +95,6 @@ mandelbrot_ispc(uniform float x0, uniform float y0,
|
||||
uniform float dx = (x1 - x0) / width;
|
||||
uniform float dy = (y1 - y0) / height;
|
||||
|
||||
/* Launch task to compute results for spans of 'span' scanlines. */
|
||||
uniform int span = 2;
|
||||
for (uniform int j = 0; j < height; j += span)
|
||||
launch < mandelbrot_scanlines(j, j+span, x0, dx, y0, dy, width,
|
||||
maxIterations, output) >;
|
||||
launch[32] < mandelbrot_chunk(x0, dx, y0, dy, width, height,
|
||||
maxIterations, output) >;
|
||||
}
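
The change above replaces the host-side loop of per-scanline launches with a single launch[32] of mandelbrot_chunk, which partitions the rows among tasks using taskIndex/taskCount and then launches one scanline task per row of its band. The row math, written as plain C++ for reference (taskIndex and taskCount become ordinary parameters here; like the example, this assumes height divides evenly by taskCount):

```cpp
// Row band owned by one chunk task, mirroring mandelbrot_chunk above.
struct RowRange { int ystart, yend; };

static RowRange RowsForTask(int taskIndex, int taskCount, int height) {
    int ystart = taskIndex * (height / taskCount);
    int yend = (taskIndex + 1) * (height / taskCount);
    return {ystart, yend};
}

// Example: RowsForTask(5, 32, 1024) covers rows [160, 192).
```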
|
||||
|
||||
@@ -36,7 +36,7 @@ static int mandel(float c_re, float c_im, int count) {
|
||||
float z_re = c_re, z_im = c_im;
|
||||
int i;
|
||||
for (i = 0; i < count; ++i) {
|
||||
if (z_re * z_re + z_im * z_im > 4.)
|
||||
if (z_re * z_re + z_im * z_im > 4.f)
|
||||
break;
|
||||
|
||||
float new_re = z_re*z_re - z_im*z_im;
|
||||
|
||||
4
examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
Executable file → Normal file
@@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
@@ -143,7 +143,7 @@
|
||||
<ItemGroup>
|
||||
<ClCompile Include="mandelbrot.cpp" />
|
||||
<ClCompile Include="mandelbrot_serial.cpp" />
|
||||
<ClCompile Include="tasks_concrt.cpp" />
|
||||
<ClCompile Include="../tasksys.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="mandelbrot.ispc">
|
||||
|
||||
@@ -1,141 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/* Simple task system implementation for ispc based on Microsoft's
|
||||
Concurrency Runtime. */
|
||||
|
||||
#include <windows.h>
|
||||
#include <concrt.h>
|
||||
using namespace Concurrency;
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// ispc expects these functions to have C linkage / not be mangled
|
||||
extern "C" {
|
||||
void ISPCLaunch(void *f, void *data);
|
||||
void ISPCSync();
|
||||
void *ISPCMalloc(int64_t size, int32_t alignment);
|
||||
void ISPCFree(void *ptr);
|
||||
}
|
||||
|
||||
typedef void (*TaskFuncType)(void *, int, int);
|
||||
|
||||
struct TaskInfo {
|
||||
TaskFuncType ispcFunc;
|
||||
void *ispcData;
|
||||
};
|
||||
|
||||
// This is a simple implementation that just aborts if more than MAX_TASKS
|
||||
// are launched. It could easily be extended to be more general...
|
||||
|
||||
#define MAX_TASKS 4096
|
||||
static int taskOffset;
|
||||
static TaskInfo taskInfo[MAX_TASKS];
|
||||
static event *events[MAX_TASKS];
|
||||
static CRITICAL_SECTION criticalSection;
|
||||
static bool initialized = false;
|
||||
|
||||
void
|
||||
TasksInit() {
|
||||
InitializeCriticalSection(&criticalSection);
|
||||
for (int i = 0; i < MAX_TASKS; ++i)
|
||||
events[i] = new event;
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
|
||||
void __cdecl
|
||||
lRunTask(LPVOID param) {
|
||||
TaskInfo *ti = (TaskInfo *)param;
|
||||
|
||||
// Actually run the task.
|
||||
// FIXME: like the tasks_gcd.cpp implementation, this is passing bogus
|
||||
// values for the threadIndex and threadCount builtins, which in turn
|
||||
// will cause bugs in code that uses those. FWIW this example doesn't
|
||||
// use them...
|
||||
int threadIndex = 0;
|
||||
int threadCount = 1;
|
||||
ti->ispcFunc(ti->ispcData, threadIndex, threadCount);
|
||||
|
||||
// Signal the event that this task is done
|
||||
int taskNum = ti - &taskInfo[0];
|
||||
events[taskNum]->set();
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
ISPCLaunch(void *func, void *data) {
|
||||
if (!initialized) {
|
||||
fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Get a TaskInfo struct for this task
|
||||
EnterCriticalSection(&criticalSection);
|
||||
TaskInfo *ti = &taskInfo[taskOffset++];
|
||||
assert(taskOffset < MAX_TASKS);
|
||||
LeaveCriticalSection(&criticalSection);
|
||||
|
||||
// And pass it on to the Concurrency Runtime...
|
||||
ti->ispcFunc = (TaskFuncType)func;
|
||||
ti->ispcData = data;
|
||||
CurrentScheduler::ScheduleTask(lRunTask, ti);
|
||||
}
|
||||
|
||||
|
||||
void ISPCSync() {
|
||||
if (!initialized) {
|
||||
fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
event::wait_for_multiple(&events[0], taskOffset, true,
|
||||
COOPERATIVE_TIMEOUT_INFINITE);
|
||||
|
||||
for (int i = 0; i < taskOffset; ++i)
|
||||
events[i]->reset();
|
||||
|
||||
taskOffset = 0;
|
||||
}
|
||||
|
||||
|
||||
void *ISPCMalloc(int64_t size, int32_t alignment) {
|
||||
return _aligned_malloc(size, alignment);
|
||||
}
|
||||
|
||||
|
||||
void ISPCFree(void *ptr) {
|
||||
_aligned_free(ptr);
|
||||
}
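
tasks_concrt.cpp is one of the per-example task systems this change removes in favor of the shared ../tasksys.cpp; all of them implement the same two entry points that ispc-generated code calls: ISPCLaunch(func, data) to enqueue a task and ISPCSync() to block until every launched task has completed, where func has the signature void(void *data, int threadIndex, int threadCount). The following is a minimal illustration of that contract built on std::async; it is not one of the example's real backends.

```cpp
#include <future>
#include <mutex>
#include <vector>

// ispc expects these functions to have C linkage / not be mangled.
extern "C" {
void ISPCLaunch(void *f, void *data);
void ISPCSync();
}

typedef void (*TaskFuncType)(void *, int, int);

static std::mutex gTaskMutex;
static std::vector<std::future<void>> gTasks;

void ISPCLaunch(void *f, void *data) {
    TaskFuncType func = (TaskFuncType)f;
    std::lock_guard<std::mutex> lock(gTaskMutex);
    gTasks.push_back(std::async(std::launch::async, [=] {
        // threadIndex/threadCount are bogus here, just as in tasks_gcd.cpp.
        func(data, 0, 1);
    }));
}

void ISPCSync() {
    // Drain batches until no task is left; tasks may launch further tasks
    // (mandelbrot_chunk does exactly that), so loop rather than wait once.
    for (;;) {
        std::vector<std::future<void>> pending;
        {
            std::lock_guard<std::mutex> lock(gTaskMutex);
            if (gTasks.empty())
                return;
            pending.swap(gTasks);
        }
        for (auto &t : pending)
            t.get();
    }
}
```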
|
||||
@@ -1,103 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/* A simple task system for ispc programs based on Apple's Grand Central
|
||||
Dispatch. */
|
||||
|
||||
#include <dispatch/dispatch.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
static bool initialized = false;
|
||||
static dispatch_queue_t gcdQueue;
|
||||
static dispatch_group_t gcdGroup;
|
||||
|
||||
// ispc expects these functions to have C linkage / not be mangled
|
||||
extern "C" {
|
||||
void ISPCLaunch(void *f, void *data);
|
||||
void ISPCSync();
|
||||
}
|
||||
|
||||
struct TaskInfo {
|
||||
void *func;
|
||||
void *data;
|
||||
};
|
||||
|
||||
|
||||
void
|
||||
TasksInit() {
|
||||
gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
|
||||
gcdGroup = dispatch_group_create();
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
lRunTask(void *ti) {
|
||||
typedef void (*TaskFuncType)(void *, int, int);
|
||||
TaskInfo *taskInfo = (TaskInfo *)ti;
|
||||
|
||||
TaskFuncType func = (TaskFuncType)(taskInfo->func);
|
||||
|
||||
// FIXME: these are bogus values; may cause bugs in code that depends
|
||||
// on them having unique values in different threads.
|
||||
int threadIndex = 0;
|
||||
int threadCount = 1;
|
||||
// Actually run the task
|
||||
func(taskInfo->data, threadIndex, threadCount);
|
||||
|
||||
// FIXME: taskInfo leaks...
|
||||
}
|
||||
|
||||
|
||||
void ISPCLaunch(void *func, void *data) {
|
||||
if (!initialized) {
|
||||
fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
|
||||
exit(1);
|
||||
}
|
||||
TaskInfo *ti = new TaskInfo;
|
||||
ti->func = func;
|
||||
ti->data = data;
|
||||
dispatch_group_async_f(gcdGroup, gcdQueue, ti, lRunTask);
|
||||
}
|
||||
|
||||
|
||||
void ISPCSync() {
|
||||
if (!initialized) {
|
||||
fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Wait for all of the tasks in the group to complete before returning
|
||||
dispatch_group_wait(gcdGroup, DISPATCH_TIME_FOREVER);
|
||||
}
|
||||
@@ -1,295 +0,0 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <pthread.h>
|
||||
#include <semaphore.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <fcntl.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/param.h>
|
||||
#include <sys/sysctl.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <errno.h>
|
||||
#include <vector>
|
||||
|
||||
// ispc expects these functions to have C linkage / not be mangled
|
||||
extern "C" {
|
||||
void ISPCLaunch(void *f, void *data);
|
||||
void ISPCSync();
|
||||
}
|
||||
|
||||
|
||||
static int nThreads;
|
||||
static pthread_t *threads;
|
||||
static pthread_mutex_t taskQueueMutex;
|
||||
static std::vector<std::pair<void *, void *> > taskQueue;
|
||||
static sem_t *workerSemaphore;
|
||||
static uint32_t numUnfinishedTasks;
|
||||
static pthread_mutex_t tasksRunningConditionMutex;
|
||||
static pthread_cond_t tasksRunningCondition;
|
||||
|
||||
static void *lTaskEntry(void *arg);
|
||||
|
||||
/** Figure out how many CPU cores there are in the system
|
||||
*/
|
||||
static int
|
||||
lNumCPUCores() {
|
||||
#if defined(__linux__)
|
||||
return sysconf(_SC_NPROCESSORS_ONLN);
|
||||
#else
|
||||
// Mac
|
||||
int mib[2];
|
||||
mib[0] = CTL_HW;
|
||||
size_t length = 2;
|
||||
if (sysctlnametomib("hw.logicalcpu", mib, &length) == -1) {
|
||||
fprintf(stderr, "sysctlnametomib() failed. Guessing 2 cores.");
|
||||
return 2;
|
||||
}
|
||||
assert(length == 2);
|
||||
|
||||
int nCores = 0;
|
||||
size_t size = sizeof(nCores);
|
||||
|
||||
if (sysctl(mib, 2, &nCores, &size, NULL, 0) == -1) {
|
||||
fprintf(stderr, "sysctl() to find number of cores present failed. Guessing 2.");
|
||||
return 2;
|
||||
}
|
||||
return nCores;
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
TasksInit() {
|
||||
nThreads = lNumCPUCores();
|
||||
|
||||
threads = new pthread_t[nThreads];
|
||||
|
||||
int err;
|
||||
if ((err = pthread_mutex_init(&taskQueueMutex, NULL)) != 0) {
|
||||
fprintf(stderr, "Error creating mutex: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
char name[32];
|
||||
sprintf(name, "mandelbrot.%d", (int)getpid());
|
||||
workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0);
|
||||
if (!workerSemaphore) {
|
||||
fprintf(stderr, "Error creating semaphore: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if ((err = pthread_cond_init(&tasksRunningCondition, NULL)) != 0) {
|
||||
fprintf(stderr, "Error creating condition variable: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if ((err = pthread_mutex_init(&tasksRunningConditionMutex, NULL)) != 0) {
|
||||
fprintf(stderr, "Error creating mutex: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
for (int i = 0; i < nThreads; ++i) {
|
||||
err = pthread_create(&threads[i], NULL, &lTaskEntry, reinterpret_cast<void *>(i));
|
||||
if (err != 0) {
|
||||
fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
ISPCLaunch(void *f, void *d) {
|
||||
if (threads == NULL) {
|
||||
fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
//
|
||||
// Acquire mutex, add task
|
||||
//
|
||||
int err;
|
||||
if ((err = pthread_mutex_lock(&taskQueueMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
taskQueue.push_back(std::make_pair(f, d));
|
||||
|
||||
if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
//
|
||||
// Update count of number of tasks left to run
|
||||
//
|
||||
if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
++numUnfinishedTasks;
|
||||
|
||||
if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
//
|
||||
// Post to the worker semaphore to wake up worker threads that are
|
||||
// sleeping waiting for tasks to show up
|
||||
//
|
||||
if ((err = sem_post(workerSemaphore)) != 0) {
|
||||
fprintf(stderr, "Error from sem_post: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void *
|
||||
lTaskEntry(void *arg) {
|
||||
int threadIndex = int(reinterpret_cast<int64_t>(arg));
|
||||
int threadCount = nThreads;
|
||||
|
||||
while (true) {
|
||||
int err;
|
||||
if ((err = sem_wait(workerSemaphore)) != 0) {
|
||||
fprintf(stderr, "Error from sem_wait: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
std::pair<void *, void *> myTask;
|
||||
//
|
||||
// Acquire mutex, get task
|
||||
//
|
||||
if ((err = pthread_mutex_lock(&taskQueueMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
if (taskQueue.size() == 0) {
|
||||
//
|
||||
// Task queue is empty, go back and wait on the semaphore
|
||||
//
|
||||
if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
myTask = taskQueue.back();
|
||||
taskQueue.pop_back();
|
||||
|
||||
if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
//
|
||||
// Do work for _myTask_
|
||||
//
|
||||
typedef void (*TaskFunType)(void *, int, int);
|
||||
TaskFunType func = (TaskFunType)myTask.first;
|
||||
func(myTask.second, threadIndex, threadCount);
|
||||
|
||||
//
|
||||
// Decrement the number of unfinished tasks counter
|
||||
//
|
||||
if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
int unfinished = --numUnfinishedTasks;
|
||||
if (unfinished == 0) {
|
||||
//
|
||||
// Signal the "no more tasks are running" condition if all of
|
||||
// them are done.
|
||||
//
|
||||
int err;
|
||||
if ((err = pthread_cond_signal(&tasksRunningCondition)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_cond_signal: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
pthread_exit(NULL);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
void ISPCSync() {
|
||||
if (threads == NULL) {
|
||||
fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
int err;
|
||||
if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// As long as there are tasks running, wait on the condition variable;
|
||||
// doing so causes this thread to go to sleep until someone signals on
|
||||
// the tasksRunningCondition condition variable.
|
||||
while (numUnfinishedTasks > 0) {
|
||||
if ((err = pthread_cond_wait(&tasksRunningCondition,
|
||||
&tasksRunningConditionMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_cond_wait: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
// We acquire ownership of the condition variable mutex when the above
|
||||
// pthread_cond_wait returns.
|
||||
// FIXME: is there a lurking issue here if numUnfinishedTasks gets back
|
||||
// to zero by the time we get to ISPCSync() and thence we're trying to
|
||||
// unlock a mutex we don't have a lock on?
|
||||
if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) {
|
||||
fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
|
||||
exit(1);
|
||||
}
|
||||
}
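
The pthreads backend above tracks completion with a counter (numUnfinishedTasks) protected by a mutex: ISPCLaunch increments it, each worker decrements it after running a task and signals a condition variable when it reaches zero, and ISPCSync sleeps on that condition until the counter is zero. The same synchronization idea in compact C++11 form, with std::mutex/std::condition_variable standing in for the pthread primitives:

```cpp
#include <condition_variable>
#include <mutex>

// Completion counting as done by the pthreads task system above.
class TaskCompletionCounter {
public:
    void TaskLaunched() {
        std::lock_guard<std::mutex> lock(mutex_);
        ++numUnfinished_;
    }
    void TaskFinished() {
        std::lock_guard<std::mutex> lock(mutex_);
        if (--numUnfinished_ == 0)
            allDone_.notify_all();          // wake anyone blocked in Sync()
    }
    void Sync() {
        std::unique_lock<std::mutex> lock(mutex_);
        // Same loop as ISPCSync(): sleep until the counter drops to zero.
        allDone_.wait(lock, [this] { return numUnfinished_ == 0; });
    }
private:
    std::mutex mutex_;
    std::condition_variable allDone_;
    unsigned numUnfinished_ = 0;
};
```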
|
||||
@@ -131,11 +131,11 @@ static float Noise(float x, float y, float z) {
|
||||
}
|
||||
|
||||
|
||||
static float Turbulence(float x, float y, float z, int octaves) {
|
||||
static float Turbulence(float x, float y, float z, uniform int octaves) {
|
||||
float omega = 0.6;
|
||||
|
||||
float sum = 0., lambda = 1., o = 1.;
|
||||
for (int i = 0; i < octaves; ++i) {
|
||||
for (uniform int i = 0; i < octaves; ++i) {
|
||||
sum += abs(o * Noise(lambda * x, lambda * y, lambda * z));
|
||||
lambda *= 1.99f;
|
||||
o *= omega;
|
||||
|
||||
2
examples/noise/noise.vcxproj
Executable file → Normal file
@@ -164,4 +164,4 @@
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
||||
@@ -104,7 +104,7 @@ inline float NoiseWeight(float t) {
|
||||
|
||||
|
||||
inline float Lerp(float t, float low, float high) {
|
||||
return (1. - t) * low + t * high;
|
||||
return (1.f - t) * low + t * high;
|
||||
}
|
||||
|
||||
|
||||
@@ -147,7 +147,7 @@ static float Turbulence(float x, float y, float z, int octaves) {
|
||||
lambda *= 1.99f;
|
||||
o *= omega;
|
||||
}
|
||||
return sum * 0.5;
|
||||
return sum * 0.5f;
|
||||
}
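
Turbulence() sums the absolute value of successive noise octaves, scaling the frequency by 1.99 each step (a value just under 2, presumably so octaves don't align exactly) and the amplitude by omega = 0.6. A small C++ sketch of the accumulation; since the Noise() implementation itself is not reproduced in this hunk, it is passed in as a callable:

```cpp
#include <cmath>
#include <functional>

// Octave accumulation as in Turbulence() above; the noise function is a
// parameter because the example's Noise() is not shown here in full.
static float TurbulenceRef(float x, float y, float z, int octaves,
                           const std::function<float(float, float, float)> &noise) {
    const float omega = 0.6f;          // per-octave amplitude falloff
    float sum = 0.f, lambda = 1.f, o = 1.f;
    for (int i = 0; i < octaves; ++i) {
        sum += std::fabs(o * noise(lambda * x, lambda * y, lambda * z));
        lambda *= 1.99f;               // frequency nearly doubles each octave
        o *= omega;
    }
    return sum * 0.5f;
}
```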
|
||||
|
||||
|
||||
@@ -163,7 +163,7 @@ void noise_serial(float x0, float y0, float x1, float y1,
|
||||
float y = y0 + j * dy;
|
||||
|
||||
int index = (j * width + i);
|
||||
output[index] = Turbulence(x, y, 0.6, 8);
|
||||
output[index] = Turbulence(x, y, 0.6f, 8);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
0
examples/options/options.vcxproj
Executable file → Normal file
@@ -47,7 +47,7 @@ static inline float
|
||||
CND(float X) {
|
||||
float L = fabsf(X);
|
||||
|
||||
float k = 1.0 / (1.0 + 0.2316419 * L);
|
||||
float k = 1.f / (1.f + 0.2316419f * L);
|
||||
float k2 = k*k;
|
||||
float k3 = k2*k;
|
||||
float k4 = k2*k2;
|
||||
@@ -59,7 +59,7 @@ CND(float X) {
|
||||
w *= invSqrt2Pi * expf(-L * L * .5f);
|
||||
|
||||
if (X > 0.f)
|
||||
w = 1.0 - w;
|
||||
w = 1.f - w;
|
||||
return w;
|
||||
}
|
||||
|
||||
@@ -94,7 +94,7 @@ binomial_put_serial(float Sa[], float Xa[], float Ta[],
|
||||
|
||||
float dt = T / BINOMIAL_NUM;
|
||||
float u = expf(v * sqrtf(dt));
|
||||
float d = 1. / u;
|
||||
float d = 1.f / u;
|
||||
float disc = expf(r * dt);
|
||||
float Pu = (disc - d) / (u - d);
|
||||
|
||||
|
||||
@@ -1,6 +1,12 @@
|
||||
|
||||
CXX=g++ -m64
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall
|
||||
ARCH = $(shell uname)
|
||||
|
||||
TASK_CXX=../tasksys.cpp
|
||||
TASK_LIB=-lpthread
|
||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||
|
||||
@@ -14,11 +20,16 @@ dirs:
|
||||
clean:
|
||||
/bin/rm -rf objs *~ rt
|
||||
|
||||
rt: dirs objs/rt.o objs/rt_serial.o objs/rt_ispc.o
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/rt.o objs/rt_ispc.o objs/rt_serial.o -lm
|
||||
rt: dirs objs/rt.o objs/rt_serial.o objs/rt_ispc.o $(TASK_OBJ)
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/rt.o objs/rt_ispc.o objs/rt_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
|
||||
|
||||
objs/%.o: %.cpp objs/rt_ispc.h
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/%.o: ../%.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/rt.o: objs/rt_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
|
||||
@@ -42,6 +42,7 @@
|
||||
#include <math.h>
|
||||
#include <algorithm>
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
@@ -51,7 +52,8 @@ using namespace ispc;
|
||||
|
||||
typedef unsigned int uint;
|
||||
|
||||
extern void raytrace_serial(int width, int height, const float raster2camera[4][4],
|
||||
extern void raytrace_serial(int width, int height, int baseWidth, int baseHeight,
|
||||
const float raster2camera[4][4],
|
||||
const float camera2world[4][4], float image[],
|
||||
int id[], const LinearBVHNode nodes[],
|
||||
const Triangle triangles[]);
|
||||
@@ -90,6 +92,7 @@ static void writeImage(int *idImage, float *depthImage, int width, int height,
|
||||
}
|
||||
}
|
||||
fclose(f);
|
||||
printf("Wrote image file %s\n", filename);
|
||||
}
|
||||
|
||||
|
||||
@@ -125,11 +128,28 @@ ensureTargetISAIsSupported() {
|
||||
}
|
||||
|
||||
|
||||
static void usage() {
|
||||
fprintf(stderr, "rt [--scale=<factor>] <scene name base>\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
if (argc != 2) {
|
||||
fprintf(stderr, "usage: rt <filename base>\n");
|
||||
exit(1);
|
||||
float scale = 1.f;
|
||||
const char *filename = NULL;
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
if (strncmp(argv[i], "--scale=", 8) == 0) {
|
||||
scale = atof(argv[i] + 8);
|
||||
if (scale == 0.f)
|
||||
usage();
|
||||
}
|
||||
else if (filename != NULL)
|
||||
usage();
|
||||
else
|
||||
filename = argv[i];
|
||||
}
|
||||
if (filename == NULL)
|
||||
usage();
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
@@ -143,10 +163,10 @@ int main(int argc, char *argv[]) {
|
||||
// Read the camera specification information from the camera file
|
||||
//
|
||||
char fnbuf[1024];
|
||||
sprintf(fnbuf, "%s.camera", argv[1]);
|
||||
sprintf(fnbuf, "%s.camera", filename);
|
||||
FILE *f = fopen(fnbuf, "rb");
|
||||
if (!f) {
|
||||
perror(argv[1]);
|
||||
perror(fnbuf);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -154,20 +174,20 @@ int main(int argc, char *argv[]) {
|
||||
// Nothing fancy, and trouble if we run on a big-endian system, just
|
||||
// fread in the bits
|
||||
//
|
||||
int width, height;
|
||||
int baseWidth, baseHeight;
|
||||
float camera2world[4][4], raster2camera[4][4];
|
||||
READ(width, 1);
|
||||
READ(height, 1);
|
||||
READ(baseWidth, 1);
|
||||
READ(baseHeight, 1);
|
||||
READ(camera2world[0][0], 16);
|
||||
READ(raster2camera[0][0], 16);
|
||||
|
||||
//
|
||||
// Read in the serialized BVH
|
||||
//
|
||||
sprintf(fnbuf, "%s.bvh", argv[1]);
|
||||
sprintf(fnbuf, "%s.bvh", filename);
|
||||
f = fopen(fnbuf, "rb");
|
||||
if (!f) {
|
||||
perror(argv[2]);
|
||||
perror(fnbuf);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -214,10 +234,10 @@ int main(int argc, char *argv[]) {
|
||||
}
|
||||
fclose(f);
|
||||
|
||||
// round image resolution up to multiple of 4 to makethings easy for
|
||||
// round image resolution up to multiple of 16 to make things easy for
|
||||
// the code that assigns pixels to ispc program instances
|
||||
height = (height + 3) & ~3;
|
||||
width = (width + 3) & ~3;
|
||||
int height = (int(baseHeight * scale) + 0xf) & ~0xf;
|
||||
int width = (int(baseWidth * scale) + 0xf) & ~0xf;
|
||||
|
||||
// allocate images; one to hold hit object ids, one to hold depth to
|
||||
// the first intersection
|
||||
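The rounding in the hunk above uses the usual power-of-two trick: adding 0xf and masking off the low four bits bumps a dimension up to the next multiple of 16, so the 4x4-pixel blocks used by the ispc kernel always tile the image exactly. In isolation:

// Round n up to the next multiple of 16 (n >= 0); works for any power-of-two step.
inline int roundUpTo16(int n) { return (n + 0xf) & ~0xf; }
// roundUpTo16(1) == 16, roundUpTo16(16) == 16, roundUpTo16(17) == 32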
@@ -225,19 +245,42 @@ int main(int argc, char *argv[]) {
|
||||
float *image = new float[width*height];
|
||||
|
||||
//
|
||||
// Run 3 iterations with ispc, record the minimum time
|
||||
// Run 3 iterations with ispc + 1 core, record the minimum time
|
||||
//
|
||||
double minTimeISPC = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
raytrace(width, height, raster2camera, camera2world,
|
||||
image, id, nodes, triangles);
|
||||
raytrace_ispc(width, height, baseWidth, baseHeight, raster2camera,
|
||||
camera2world, image, id, nodes, triangles);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minTimeISPC = std::min(dt, minTimeISPC);
|
||||
}
|
||||
printf("[rt ispc]:\t\t\t[%.3f] million cycles for %d x %d image\n", minTimeISPC, width, height);
|
||||
printf("[rt ispc, 1 core]:\t\t[%.3f] million cycles for %d x %d image\n",
|
||||
minTimeISPC, width, height);
|
||||
|
||||
writeImage(id, image, width, height, "rt-ispc.ppm");
|
||||
writeImage(id, image, width, height, "rt-ispc-1core.ppm");
|
||||
|
||||
memset(id, 0, width*height*sizeof(int));
|
||||
memset(image, 0, width*height*sizeof(float));
|
||||
|
||||
//
|
||||
// Run 3 iterations with ispc + tasks, record the minimum time
|
||||
//
|
||||
double minTimeISPCtasks = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
raytrace_ispc_tasks(width, height, baseWidth, baseHeight, raster2camera,
|
||||
camera2world, image, id, nodes, triangles);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minTimeISPCtasks = std::min(dt, minTimeISPCtasks);
|
||||
}
|
||||
printf("[rt ispc + tasks]:\t\t[%.3f] million cycles for %d x %d image\n",
|
||||
minTimeISPCtasks, width, height);
|
||||
|
||||
writeImage(id, image, width, height, "rt-ispc-tasks.ppm");
|
||||
|
||||
memset(id, 0, width*height*sizeof(int));
|
||||
memset(image, 0, width*height*sizeof(float));
|
||||
|
||||
//
|
||||
// And 3 iterations with the serial implementation, reporting the
|
||||
@@ -246,14 +289,15 @@ int main(int argc, char *argv[]) {
|
||||
double minTimeSerial = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
raytrace_serial(width, height, raster2camera, camera2world,
|
||||
image, id, nodes, triangles);
|
||||
raytrace_serial(width, height, baseWidth, baseHeight, raster2camera,
|
||||
camera2world, image, id, nodes, triangles);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minTimeSerial = std::min(dt, minTimeSerial);
|
||||
}
|
||||
printf("[rt serial]:\t\t\t[%.3f] million cycles for %d x %d image\n",
|
||||
minTimeSerial, width, height);
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minTimeSerial / minTimeISPC);
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2f from ISPC + tasks)\n",
|
||||
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCtasks);
|
||||
|
||||
writeImage(id, image, width, height, "rt-serial.ppm");
|
||||
|
||||
|
||||
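All of the benchmark loops above share one shape: run the kernel three times and report the minimum measurement, which makes the comparison less sensitive to warm-up and scheduling noise. A sketch of that pattern using std::chrono in place of the example's own reset_and_start_timer()/get_elapsed_mcycles() helpers (which count cycles rather than wall time):

// Sketch only: min-of-N timing with std::chrono; the examples themselves use
// the cycle-counting helpers from ../timing.h instead of wall-clock time.
#include <algorithm>
#include <chrono>

template <typename Kernel>
double minSeconds(Kernel &&kernel, int runs = 3) {
    double best = 1e30;
    for (int i = 0; i < runs; ++i) {
        auto start = std::chrono::steady_clock::now();
        kernel();
        std::chrono::duration<double> dt = std::chrono::steady_clock::now() - start;
        best = std::min(best, dt.count());
    }
    return best;
}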
@@ -226,20 +226,26 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
|
||||
}
|
||||
|
||||
|
||||
export void raytrace(uniform int width, uniform int height,
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform float image[], uniform int id[],
|
||||
const LinearBVHNode nodes[],
|
||||
const Triangle triangles[]) {
|
||||
static void raytrace_tile(uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int width, uniform int height,
|
||||
uniform int baseWidth, uniform int baseHeight,
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform float image[], uniform int id[],
|
||||
const LinearBVHNode nodes[],
|
||||
const Triangle triangles[]) {
|
||||
uniform float widthScale = (float)(baseWidth) / (float)(width);
|
||||
uniform float heightScale = (float)(baseHeight) / (float)(height);
|
||||
|
||||
static const uniform float udx[16] = { 0, 1, 0, 1, 2, 3, 2, 3,
|
||||
0, 1, 0, 1, 2, 3, 2, 3 };
|
||||
static const uniform float udy[16] = { 0, 0, 1, 1, 0, 0, 1, 1,
|
||||
2, 2, 3, 3, 2, 2, 3, 3 };
|
||||
|
||||
// The outer loops are always over blocks of 4x4 pixels
|
||||
for (uniform int y = 0; y < height; y += 4) {
|
||||
for (uniform int x = 0; x < width; x += 4) {
|
||||
for (uniform int y = y0; y < y1; y += 4) {
|
||||
for (uniform int x = x0; x < x1; x += 4) {
|
||||
// Now we have a block of 4x4=16 pixels to process; it will
|
||||
// take 16/programCount iterations of this loop to process
|
||||
// them.
|
||||
@@ -251,7 +257,8 @@ export void raytrace(uniform int width, uniform int height,
|
||||
const float dy = udy[o * programCount + programIndex];
|
||||
|
||||
Ray ray;
|
||||
generateRay(raster2camera, camera2world, x+dx, y+dy, ray);
|
||||
generateRay(raster2camera, camera2world, (x+dx)*widthScale,
|
||||
(y+dy)*heightScale, ray);
|
||||
BVHIntersect(nodes, triangles, ray);
|
||||
|
||||
int offset = (y + (int)dy) * width + (x + (int)dx);
|
||||
@@ -261,3 +268,54 @@ export void raytrace(uniform int width, uniform int height,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
export void raytrace_ispc(uniform int width, uniform int height,
|
||||
uniform int baseWidth, uniform int baseHeight,
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform float image[], uniform int id[],
|
||||
const LinearBVHNode nodes[],
|
||||
const Triangle triangles[]) {
|
||||
raytrace_tile(0, width, 0, height, width, height, baseWidth, baseHeight,
|
||||
raster2camera, camera2world, image,
|
||||
id, nodes, triangles);
|
||||
}
|
||||
|
||||
|
||||
task void raytrace_tile_task(uniform int y0, uniform int y1,
|
||||
uniform int width, uniform int height,
|
||||
uniform int baseWidth, uniform int baseHeight,
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform float image[], uniform int id[],
|
||||
const LinearBVHNode nodes[],
|
||||
const Triangle triangles[]) {
|
||||
uniform int dx = 16; // must match dx below
|
||||
uniform int xTasks = (width + (dx-1)) / dx;
|
||||
uniform int x0 = (taskIndex % xTasks) * dx;
|
||||
uniform int x1 = x0 + dx;
|
||||
x1 = min(x1, width);
|
||||
|
||||
raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
|
||||
raster2camera, camera2world, image,
|
||||
id, nodes, triangles);
|
||||
}
|
||||
|
||||
|
||||
export void raytrace_ispc_tasks(uniform int width, uniform int height,
|
||||
uniform int baseWidth, uniform int baseHeight,
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform float image[], uniform int id[],
|
||||
const LinearBVHNode nodes[],
|
||||
const Triangle triangles[]) {
|
||||
uniform int dx = 16, dy = 16;
|
||||
uniform int nTasks = (width + (dx-1)) / dx;
|
||||
for (uniform int y = 0; y < height; y += dy) {
|
||||
uniform int y1 = min(y + dy, height);
|
||||
launch[nTasks] < raytrace_tile_task(y, y1, width, height, baseWidth,
|
||||
baseHeight, raster2camera, camera2world,
|
||||
image, id, nodes, triangles) >;
|
||||
}
|
||||
}
|
||||
|
||||
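raytrace_ispc_tasks() above decomposes the image into 16x16-pixel tiles: each launch covers one 16-row strip, and within a strip every task recovers its 16-pixel-wide column range from taskIndex. The same index arithmetic in plain C++ (a sketch of the decomposition only, not of the ispc launch machinery):

// Sketch of the tile decomposition used by raytrace_ispc_tasks() above.
#include <algorithm>
#include <functional>

void forEachTile(int width, int height,
                 const std::function<void(int x0, int x1, int y0, int y1)> &tile) {
    const int dx = 16, dy = 16;
    const int xTasks = (width + dx - 1) / dx;    // == the launch[nTasks] count above
    for (int y0 = 0; y0 < height; y0 += dy) {
        const int y1 = std::min(y0 + dy, height);
        for (int taskIndex = 0; taskIndex < xTasks; ++taskIndex) {
            const int x0 = (taskIndex % xTasks) * dx;
            const int x1 = std::min(x0 + dx, width);
            tile(x0, x1, y0, y1);                // stands in for raytrace_tile()
        }
    }
}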
3
examples/rt/rt.vcxproj
Executable file → Normal file
@@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
@@ -164,6 +164,7 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
<ItemGroup>
|
||||
<ClCompile Include="rt.cpp" />
|
||||
<ClCompile Include="rt_serial.cpp" />
|
||||
<ClCompile Include="../tasksys.cpp" />
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
|
||||
@@ -258,17 +258,21 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
|
||||
}
|
||||
|
||||
|
||||
void raytrace_serial(int width, int height,
|
||||
void raytrace_serial(int width, int height, int baseWidth, int baseHeight,
|
||||
const float raster2camera[4][4],
|
||||
const float camera2world[4][4],
|
||||
float image[],
|
||||
int id[],
|
||||
const LinearBVHNode nodes[],
|
||||
const Triangle triangles[]) {
|
||||
float widthScale = float(baseWidth) / float(width);
|
||||
float heightScale = float(baseHeight) / float(height);
|
||||
|
||||
for (int y = 0; y < height; ++y) {
|
||||
for (int x = 0; x < width; ++x) {
|
||||
Ray ray;
|
||||
generateRay(raster2camera, camera2world, x, y, ray);
|
||||
generateRay(raster2camera, camera2world, x * widthScale,
|
||||
y * heightScale, ray);
|
||||
BVHIntersect(nodes, triangles, ray);
|
||||
|
||||
int offset = y * width + x;
|
||||
|
||||
6
examples/simple/simple.vcxproj
Executable file → Normal file
@@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
@@ -28,7 +28,7 @@
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
ispc -O2 %(Filename).ispco %(Filename).obj -h %(Filename)_ispc.h
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||
@@ -161,4 +161,4 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
||||
2
examples/stencil/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
stencil
|
||||
objs
|
||||
35
examples/stencil/Makefile
Normal file
@@ -0,0 +1,35 @@
|
||||
|
||||
ARCH = $(shell uname)
|
||||
|
||||
TASK_CXX=../tasksys.cpp
|
||||
TASK_LIB=-lpthread
|
||||
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
|
||||
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
|
||||
ISPC=ispc
|
||||
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
|
||||
|
||||
default: stencil
|
||||
|
||||
.PHONY: dirs clean
|
||||
|
||||
dirs:
|
||||
/bin/mkdir -p objs/
|
||||
|
||||
clean:
|
||||
/bin/rm -rf objs *~ stencil
|
||||
|
||||
stencil: dirs objs/stencil.o objs/stencil_serial.o objs/stencil_ispc.o $(TASK_OBJ)
|
||||
$(CXX) $(CXXFLAGS) -o $@ objs/stencil.o objs/stencil_ispc.o objs/stencil_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
|
||||
|
||||
objs/%.o: %.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/%.o: ../%.cpp
|
||||
$(CXX) $< $(CXXFLAGS) -c -o $@
|
||||
|
||||
objs/stencil.o: objs/stencil_ispc.h
|
||||
|
||||
objs/%_ispc.h objs/%_ispc.o: %.ispc
|
||||
$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
|
||||
186
examples/stencil/stencil.cpp
Normal file
@@ -0,0 +1,186 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define NOMINMAX
|
||||
#pragma warning (disable: 4244)
|
||||
#pragma warning (disable: 4305)
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include <math.h>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
#include "stencil_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
|
||||
int y0, int y1, int z0, int z1,
|
||||
int Nx, int Ny, int Nz,
|
||||
const float coef[5],
|
||||
const float vsq[],
|
||||
float Aeven[], float Aodd[]);
|
||||
|
||||
|
||||
void InitData(int Nx, int Ny, int Nz, float *A[2], float *vsq) {
|
||||
int offset = 0;
|
||||
for (int z = 0; z < Nz; ++z)
|
||||
for (int y = 0; y < Ny; ++y)
|
||||
for (int x = 0; x < Nx; ++x, ++offset) {
|
||||
A[0][offset] = (x < Nx / 2) ? x / float(Nx) : y / float(Ny);
|
||||
A[1][offset] = 0;
|
||||
vsq[offset] = x*y*z / float(Nx * Ny * Nz);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main() {
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
int Nx = 256, Ny = 256, Nz = 256;
|
||||
int width = 4;
|
||||
float *Aserial[2], *Aispc[2];
|
||||
Aserial[0] = new float [Nx * Ny * Nz];
|
||||
Aserial[1] = new float [Nx * Ny * Nz];
|
||||
Aispc[0] = new float [Nx * Ny * Nz];
|
||||
Aispc[1] = new float [Nx * Ny * Nz];
|
||||
float *vsq = new float [Nx * Ny * Nz];
|
||||
|
||||
float coeff[4] = { 0.5, -.25, .125, -.0625 };
|
||||
|
||||
InitData(Nx, Ny, Nz, Aispc, vsq);
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation on one core; report
|
||||
// the minimum time of three runs.
|
||||
//
|
||||
double minTimeISPC = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
|
||||
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||
Aispc[0], Aispc[1]);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minTimeISPC = std::min(minTimeISPC, dt);
|
||||
}
|
||||
|
||||
printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC);
|
||||
|
||||
InitData(Nx, Ny, Nz, Aispc, vsq);
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation with tasks; report
|
||||
// the minimum time of three runs.
|
||||
//
|
||||
double minTimeISPCTasks = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
|
||||
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||
Aispc[0], Aispc[1]);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
|
||||
}
|
||||
|
||||
printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks);
|
||||
|
||||
InitData(Nx, Ny, Nz, Aserial, vsq);
|
||||
|
||||
//
|
||||
// And run the serial implementation 3 times, again reporting the
|
||||
// minimum time.
|
||||
//
|
||||
double minTimeSerial = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width,
|
||||
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||
Aserial[0], Aserial[1]);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minTimeSerial = std::min(minTimeSerial, dt);
|
||||
}
|
||||
|
||||
printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial);
|
||||
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2f from ISPC + tasks)\n",
|
||||
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
|
||||
|
||||
// Check for agreement
|
||||
int offset = 0;
|
||||
for (int z = 0; z < Nz; ++z)
|
||||
for (int y = 0; y < Ny; ++y)
|
||||
for (int x = 0; x < Nx; ++x, ++offset) {
|
||||
float error = fabsf((Aserial[1][offset] - Aispc[1][offset]) /
|
||||
Aserial[1][offset]);
|
||||
if (error > 1e-4)
|
||||
printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n",
|
||||
x, y, z, Aispc[1][offset], Aserial[1][offset]);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
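The verification loop at the end of main() above uses a relative-error test to compare the ispc and serial results. Extracted as a helper (a sketch; 1e-4 is the threshold the example uses, and the reference value is assumed to be nonzero, as it is here):

// Relative-error check matching the verification loop above.
#include <cmath>

inline bool closeEnough(float reference, float value, float tol = 1e-4f) {
    return std::fabs((reference - value) / reference) <= tol;
}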
129
examples/stencil/stencil.ispc
Normal file
@@ -0,0 +1,129 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
static void
|
||||
stencil_step(uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int z0, uniform int z1,
|
||||
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||
uniform const float coef[4], uniform const float vsq[],
|
||||
uniform const float Ain[], uniform float Aout[]) {
|
||||
const uniform int Nxy = Nx * Ny;
|
||||
|
||||
for (uniform int z = z0; z < z1; ++z) {
|
||||
for (uniform int y = y0; y < y1; ++y) {
|
||||
// Assumes that (x1-x0) % programCount == 0
|
||||
for (uniform int x = x0; x < x1; x += programCount) {
|
||||
int index = (z * Nxy) + (y * Nx) + x + programIndex;
|
||||
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||
float div = coef[0] * A_cur(0, 0, 0) +
|
||||
coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
|
||||
A_cur(0, +1, 0) + A_cur(0, -1, 0) +
|
||||
A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
|
||||
coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
|
||||
A_cur(0, +2, 0) + A_cur(0, -2, 0) +
|
||||
A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
|
||||
coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
|
||||
A_cur(0, +3, 0) + A_cur(0, -3, 0) +
|
||||
A_cur(0, 0, +3) + A_cur(0, 0, -3));
|
||||
|
||||
A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
|
||||
vsq[index] * div;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static task void
|
||||
stencil_step_task(uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int z0, uniform int z1,
|
||||
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||
uniform const float coef[4], uniform const float vsq[],
|
||||
uniform const float Ain[], uniform float Aout[]) {
|
||||
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, Ain, Aout);
|
||||
}
|
||||
|
||||
|
||||
export void
|
||||
loop_stencil_ispc_tasks(uniform int t0, uniform int t1,
|
||||
uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int z0, uniform int z1,
|
||||
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||
uniform const float coef[4],
|
||||
uniform const float vsq[],
|
||||
uniform float Aeven[], uniform float Aodd[])
|
||||
{
|
||||
for (uniform int t = t0; t < t1; ++t) {
|
||||
// Parallelize across cores as well: each task will work on a slice
|
||||
// of "dz" in the z extent of the volume. (dz=1 seems to work
|
||||
// better than any larger values.)
|
||||
uniform int dz = 1;
|
||||
for (uniform int z = z0; z < z1; z += dz) {
|
||||
if ((t & 1) == 0)
|
||||
launch < stencil_step_task(x0, x1, y0, y1, z, z+dz, Nx, Ny, Nz,
|
||||
coef, vsq, Aeven, Aodd) >;
|
||||
else
|
||||
launch < stencil_step_task(x0, x1, y0, y1, z, z+dz, Nx, Ny, Nz,
|
||||
coef, vsq, Aodd, Aeven) >;
|
||||
}
|
||||
// We need to wait for all of the launched tasks to finish before
|
||||
// starting the next iteration.
|
||||
sync;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
export void
|
||||
loop_stencil_ispc(uniform int t0, uniform int t1,
|
||||
uniform int x0, uniform int x1,
|
||||
uniform int y0, uniform int y1,
|
||||
uniform int z0, uniform int z1,
|
||||
uniform int Nx, uniform int Ny, uniform int Nz,
|
||||
uniform const float coef[4],
|
||||
uniform const float vsq[],
|
||||
uniform float Aeven[], uniform float Aodd[])
|
||||
{
|
||||
for (uniform int t = t0; t < t1; ++t) {
|
||||
if ((t & 1) == 0)
|
||||
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
|
||||
Aeven, Aodd);
|
||||
else
|
||||
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
|
||||
Aodd, Aeven);
|
||||
}
|
||||
}
|
||||
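Both loop_stencil_ispc variants above advance time with a ping-pong buffer pair: even steps read Aeven and write Aodd, odd steps swap the roles, so no copying is needed between steps. The same control flow as a small C++ sketch (step() stands in for stencil_step()):

// Ping-pong time stepping as used above; step() stands in for stencil_step().
#include <utility>

void timeLoop(int t0, int t1, float *Aeven, float *Aodd,
              void (*step)(const float *in, float *out)) {
    float *in = Aeven, *out = Aodd;
    for (int t = t0; t < t1; ++t) {
        step(in, out);         // even t: Aeven -> Aodd; odd t: Aodd -> Aeven
        std::swap(in, out);
    }
}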
172
examples/stencil/stencil.vcxproj
Normal file
@@ -0,0 +1,172 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|Win32">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
<ProjectGuid>{2ef070a1-f62f-4e6a-944b-88d140945c3c}</ProjectGuid>
|
||||
<Keyword>Win32Proj</Keyword>
|
||||
<RootNamespace>rt</RootNamespace>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="stencil.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="stencil.cpp" />
|
||||
<ClCompile Include="stencil_serial.cpp" />
|
||||
<ClCompile Include="../tasksys.cpp" />
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
86
examples/stencil/stencil_serial.cpp
Normal file
@@ -0,0 +1,86 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
static void
|
||||
stencil_step(int x0, int x1,
|
||||
int y0, int y1,
|
||||
int z0, int z1,
|
||||
int Nx, int Ny, int Nz,
|
||||
const float coef[4], const float vsq[],
|
||||
const float Ain[], float Aout[]) {
|
||||
int Nxy = Nx * Ny;
|
||||
|
||||
for (int z = z0; z < z1; ++z) {
|
||||
for (int y = y0; y < y1; ++y) {
|
||||
for (int x = x0; x < x1; ++x) {
|
||||
int index = (z * Nxy) + (y * Nx) + x;
|
||||
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
|
||||
float div = coef[0] * A_cur(0, 0, 0) +
|
||||
coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
|
||||
A_cur(0, +1, 0) + A_cur(0, -1, 0) +
|
||||
A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
|
||||
coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
|
||||
A_cur(0, +2, 0) + A_cur(0, -2, 0) +
|
||||
A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
|
||||
coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
|
||||
A_cur(0, +3, 0) + A_cur(0, -3, 0) +
|
||||
A_cur(0, 0, +3) + A_cur(0, 0, -3));
|
||||
|
||||
A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
|
||||
vsq[index] * div;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void loop_stencil_serial(int t0, int t1,
|
||||
int x0, int x1,
|
||||
int y0, int y1,
|
||||
int z0, int z1,
|
||||
int Nx, int Ny, int Nz,
|
||||
const float coef[4],
|
||||
const float vsq[],
|
||||
float Aeven[], float Aodd[])
|
||||
{
|
||||
for (int t = t0; t < t1; ++t) {
|
||||
if ((t & 1) == 0)
|
||||
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
|
||||
Aeven, Aodd);
|
||||
else
|
||||
stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
|
||||
Aodd, Aeven);
|
||||
}
|
||||
}
|
||||
868
examples/tasksys.cpp
Normal file
@@ -0,0 +1,868 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/*
|
||||
This file implements simple task systems that provide the three
|
||||
entrypoints used by ispc-generated code to handle 'launch' and 'sync'
|
||||
statements in ispc programs. See the section "Task Parallelism: Language
|
||||
Syntax" in the ispc documentation for information about using task
|
||||
parallelism in ispc programs, and see the section "Task Parallelism:
|
||||
Runtime Requirements" for information about the task-related entrypoints
|
||||
that are implemented here.
|
||||
|
||||
There are three task systems in this file: one built using Microsoft's
|
||||
Concurrency Runtime, one built with Apple's Grand Central Dispatch, and
|
||||
one built on top of bare pthreads.
|
||||
*/
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
#define ISPC_IS_WINDOWS
|
||||
#define ISPC_USE_CONCRT
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#define ISPC_USE_PTHREADS
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
// pthreads is noticeably more efficient than GCD on OS X
|
||||
#define ISPC_USE_PTHREADS
|
||||
//#define ISPC_USE_GCD
|
||||
#endif
|
||||
|
||||
#define DBG(x)
|
||||
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#define NOMINMAX
|
||||
#include <windows.h>
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
#ifdef ISPC_USE_CONCRT
|
||||
#include <concrt.h>
|
||||
using namespace Concurrency;
|
||||
#endif // ISPC_USE_CONCRT
|
||||
#ifdef ISPC_USE_GCD
|
||||
#include <dispatch/dispatch.h>
|
||||
#include <pthread.h>
|
||||
#endif // ISPC_USE_GCD
|
||||
#ifdef ISPC_USE_PTHREADS
|
||||
#include <pthread.h>
|
||||
#include <semaphore.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/param.h>
|
||||
#include <sys/sysctl.h>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#endif // ISPC_USE_PTHREADS
|
||||
#ifdef ISPC_IS_LINUX
|
||||
#include <malloc.h>
|
||||
#endif // ISPC_IS_LINUX
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
#include <algorithm>
|
||||
|
||||
// Signature of ispc-generated 'task' functions
|
||||
typedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount,
|
||||
int taskIndex, int taskCount);
|
||||
|
||||
// Small structure used to hold the data for each task
|
||||
struct TaskInfo {
|
||||
TaskFuncType func;
|
||||
void *data;
|
||||
int taskIndex, taskCount;
|
||||
#if defined(ISPC_IS_WINDOWS)
|
||||
event taskEvent;
|
||||
#endif
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// TaskGroupBase
|
||||
|
||||
#define LOG_TASK_QUEUE_CHUNK_SIZE 12
|
||||
#define MAX_TASK_QUEUE_CHUNKS 8
|
||||
#define TASK_QUEUE_CHUNK_SIZE (1<<LOG_TASK_QUEUE_CHUNK_SIZE)
|
||||
|
||||
#define MAX_LAUNCHED_TASKS (MAX_TASK_QUEUE_CHUNKS * TASK_QUEUE_CHUNK_SIZE)
|
||||
|
||||
#define NUM_MEM_BUFFERS 16
|
||||
|
||||
class TaskGroup;
|
||||
|
||||
/** The TaskGroupBase structure provides common functionality for "task
|
||||
groups"; a task group is the set of tasks launched from within a single
|
||||
ispc function. When the function is ready to return, it waits for all
|
||||
of the tasks in its task group to finish before it actually returns.
|
||||
*/
|
||||
class TaskGroupBase {
|
||||
public:
|
||||
void Reset();
|
||||
|
||||
int AllocTaskInfo(int count);
|
||||
TaskInfo *GetTaskInfo(int index);
|
||||
|
||||
void *AllocMemory(int64_t size, int32_t alignment);
|
||||
|
||||
protected:
|
||||
TaskGroupBase();
|
||||
~TaskGroupBase();
|
||||
|
||||
int nextTaskInfoIndex;
|
||||
|
||||
private:
|
||||
/* We allocate blocks of TASK_QUEUE_CHUNK_SIZE TaskInfo structures as
|
||||
needed by the calling function. We hold up to MAX_TASK_QUEUE_CHUNKS
|
||||
of these (and then exit at runtime if more than this many tasks are
|
||||
launched.)
|
||||
*/
|
||||
TaskInfo *taskInfo[MAX_TASK_QUEUE_CHUNKS];
|
||||
|
||||
/* We also allocate chunks of memory to service ISPCAlloc() calls. The
|
||||
memBuffers[] array holds pointers to this memory. The first element
|
||||
of this array is initialized to point to mem and then any subsequent
|
||||
elements required are initialized with dynamic allocation.
|
||||
*/
|
||||
int curMemBuffer, curMemBufferOffset;
|
||||
int memBufferSize[NUM_MEM_BUFFERS];
|
||||
char *memBuffers[NUM_MEM_BUFFERS];
|
||||
char mem[256];
|
||||
|
||||
};
|
||||
|
||||
|
||||
inline TaskGroupBase::TaskGroupBase() {
|
||||
nextTaskInfoIndex = 0;
|
||||
|
||||
curMemBuffer = 0;
|
||||
curMemBufferOffset = 0;
|
||||
memBuffers[0] = mem;
|
||||
memBufferSize[0] = sizeof(mem) / sizeof(mem[0]);
|
||||
for (int i = 1; i < NUM_MEM_BUFFERS; ++i) {
|
||||
memBuffers[i] = NULL;
|
||||
memBufferSize[i] = 0;
|
||||
}
|
||||
|
||||
for (int i = 0; i < MAX_TASK_QUEUE_CHUNKS; ++i)
|
||||
taskInfo[i] = NULL;
|
||||
}
|
||||
|
||||
|
||||
inline TaskGroupBase::~TaskGroupBase() {
|
||||
// Note: don't delete memBuffers[0], since it points to the start of
|
||||
// the "mem" member!
|
||||
for (int i = 1; i < NUM_MEM_BUFFERS; ++i)
|
||||
delete[] memBuffers[i];
|
||||
}
|
||||
|
||||
|
||||
inline void
|
||||
TaskGroupBase::Reset() {
|
||||
nextTaskInfoIndex = 0;
|
||||
curMemBuffer = 0;
|
||||
curMemBufferOffset = 0;
|
||||
}
|
||||
|
||||
|
||||
inline int
|
||||
TaskGroupBase::AllocTaskInfo(int count) {
|
||||
int ret = nextTaskInfoIndex;
|
||||
nextTaskInfoIndex += count;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
inline TaskInfo *
|
||||
TaskGroupBase::GetTaskInfo(int index) {
|
||||
int chunk = (index >> LOG_TASK_QUEUE_CHUNK_SIZE);
|
||||
int offset = index & (TASK_QUEUE_CHUNK_SIZE-1);
|
||||
|
||||
if (chunk == MAX_TASK_QUEUE_CHUNKS) {
|
||||
fprintf(stderr, "A total of %d tasks have been launched from the "
|
||||
"current function--the simple built-in task system can handle "
|
||||
"no more. You can increase the values of TASK_QUEUE_CHUNK_SIZE "
|
||||
"and LOG_TASK_QUEUE_CHUNK_SIZE to work around this limitation. "
|
||||
"Sorry! Exiting.\n", index);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (taskInfo[chunk] == NULL)
|
||||
taskInfo[chunk] = new TaskInfo[TASK_QUEUE_CHUNK_SIZE];
|
||||
return &taskInfo[chunk][offset];
|
||||
}
|
||||
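GetTaskInfo() above treats a task index as a two-level address: the high bits pick one of the lazily allocated chunks of TASK_QUEUE_CHUNK_SIZE TaskInfo entries, and the low bits pick the slot within that chunk. The index math on its own:

// Chunked index math used by GetTaskInfo(): with LOG_TASK_QUEUE_CHUNK_SIZE == 12,
// indices 0..4095 land in chunk 0, 4096..8191 in chunk 1, and so on.
#define LOG_TASK_QUEUE_CHUNK_SIZE 12
#define TASK_QUEUE_CHUNK_SIZE (1 << LOG_TASK_QUEUE_CHUNK_SIZE)

inline void splitTaskIndex(int index, int *chunk, int *offset) {
    *chunk  = index >> LOG_TASK_QUEUE_CHUNK_SIZE;
    *offset = index & (TASK_QUEUE_CHUNK_SIZE - 1);
}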
|
||||
|
||||
inline void *
|
||||
TaskGroupBase::AllocMemory(int64_t size, int32_t alignment) {
|
||||
char *basePtr = memBuffers[curMemBuffer];
|
||||
int64_t iptr = (int64_t)(basePtr + curMemBufferOffset);
|
||||
iptr = (iptr + (alignment-1)) & ~(alignment-1);
|
||||
|
||||
int newOffset = int(iptr + size - (int64_t)basePtr);
|
||||
if (newOffset < memBufferSize[curMemBuffer]) {
|
||||
curMemBufferOffset = newOffset;
|
||||
return (char *)iptr;
|
||||
}
|
||||
|
||||
++curMemBuffer;
|
||||
curMemBufferOffset = 0;
|
||||
assert(curMemBuffer < NUM_MEM_BUFFERS);
|
||||
|
||||
int allocSize = 1 << (12 + curMemBuffer);
|
||||
allocSize = std::max(int(size+alignment), allocSize);
|
||||
char *newBuf = new char[allocSize];
|
||||
memBufferSize[curMemBuffer] = allocSize;
|
||||
memBuffers[curMemBuffer] = newBuf;
|
||||
return AllocMemory(size, alignment);
|
||||
}
|
||||
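AllocMemory() above is a bump allocator: it rounds the current offset up to the requested alignment, returns that pointer, and only moves on to a new, larger buffer once the current one is full. The align-up step in isolation (alignment must be a power of two, as it is for the callers here):

// Align-up as used by the bump allocator above; alignment must be a power of 2.
#include <cstdint>

inline char *alignUp(char *p, uint32_t alignment) {
    uintptr_t ip = (uintptr_t)p;
    ip = (ip + (alignment - 1)) & ~(uintptr_t)(alignment - 1);
    return (char *)ip;
}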
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Atomics and the like
|
||||
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
static inline void
|
||||
lMemFence() {
|
||||
__asm__ __volatile__("mfence":::"memory");
|
||||
}
|
||||
#endif // !ISPC_IS_WINDOWS
|
||||
|
||||
|
||||
#if (__SIZEOF_POINTER__ == 4) || defined(__i386__) || defined(_WIN32)
|
||||
#define ISPC_POINTER_BYTES 4
|
||||
#elif (__SIZEOF_POINTER__ == 8) || defined(__x86_64__) || defined(__amd64__) || defined(_WIN64)
|
||||
#define ISPC_POINTER_BYTES 8
|
||||
#else
|
||||
#error "Pointer size unknown!"
|
||||
#endif // __SIZEOF_POINTER__
|
||||
|
||||
|
||||
static void *
|
||||
lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
return InterlockedCompareExchangePointer(v, newValue, oldValue);
|
||||
#else
|
||||
void *result;
|
||||
#if (ISPC_POINTER_BYTES == 4)
|
||||
__asm__ __volatile__("lock\ncmpxchgl %2,%1"
|
||||
: "=a"(result), "=m"(*v)
|
||||
: "q"(newValue), "0"(oldValue)
|
||||
: "memory");
|
||||
#else
|
||||
__asm__ __volatile__("lock\ncmpxchgq %2,%1"
|
||||
: "=a"(result), "=m"(*v)
|
||||
: "q"(newValue), "0"(oldValue)
|
||||
: "memory");
|
||||
#endif // ISPC_POINTER_BYTES
|
||||
lMemFence();
|
||||
return result;
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
}
|
||||
|
||||
|
||||
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
static int32_t
|
||||
lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) {
|
||||
int32_t result;
|
||||
__asm__ __volatile__("lock\ncmpxchgl %2,%1"
|
||||
: "=a"(result), "=m"(*v)
|
||||
: "q"(newValue), "0"(oldValue)
|
||||
: "memory");
|
||||
lMemFence();
|
||||
return result;
|
||||
}
|
||||
#endif // !ISPC_IS_WINDOWS
|
||||
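The helpers above implement compare-and-swap with hand-written lock cmpxchg assembly plus an explicit mfence. On GCC and Clang the same operations are available as builtins; a sketch of the equivalents, noting that the builtins take (pointer, oldValue, newValue) while the functions above take (pointer, newValue, oldValue):

// Sketch: __sync builtin equivalents of the inline-assembly CAS helpers above.
// Both builtins are full barriers, like the explicit mfence in lMemFence().
#include <stdint.h>

static inline void *casPointer(void **v, void *newValue, void *oldValue) {
    return __sync_val_compare_and_swap(v, oldValue, newValue);
}

static inline int32_t cas32(volatile int32_t *v, int32_t newValue, int32_t oldValue) {
    return __sync_val_compare_and_swap(v, oldValue, newValue);
}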
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifdef ISPC_USE_CONCRT
|
||||
// With ConcRT, we don't need to extend TaskGroupBase at all.
|
||||
class TaskGroup : public TaskGroupBase {
|
||||
public:
|
||||
void Launch(int baseIndex, int count);
|
||||
void Sync();
|
||||
};
|
||||
#endif // ISPC_USE_CONCRT
|
||||
|
||||
#ifdef ISPC_USE_GCD
|
||||
/* With Grand Central Dispatch, we associate a GCD dispatch group with each
|
||||
task group. (We'll later wait on this dispatch group when we need to
|
||||
wait on all of the tasks in the group to finish.)
|
||||
*/
|
||||
class TaskGroup : public TaskGroupBase {
|
||||
public:
|
||||
TaskGroup() {
|
||||
gcdGroup = dispatch_group_create();
|
||||
}
|
||||
|
||||
void Launch(int baseIndex, int count);
|
||||
void Sync();
|
||||
|
||||
private:
|
||||
dispatch_group_t gcdGroup;
|
||||
};
|
||||
#endif // ISPC_USE_GCD
|
||||
|
||||
#ifdef ISPC_USE_PTHREADS
|
||||
static void *lTaskEntry(void *arg);
|
||||
|
||||
class TaskGroup : public TaskGroupBase {
|
||||
public:
|
||||
TaskGroup() {
|
||||
numUnfinishedTasks = 0;
|
||||
waitingTasks.reserve(128);
|
||||
inActiveList = false;
|
||||
}
|
||||
|
||||
void Reset() {
|
||||
TaskGroupBase::Reset();
|
||||
numUnfinishedTasks = 0;
|
||||
assert(inActiveList == false);
|
||||
lMemFence();
|
||||
}
|
||||
|
||||
void Launch(int baseIndex, int count);
|
||||
void Sync();
|
||||
|
||||
private:
|
||||
friend void *lTaskEntry(void *arg);
|
||||
|
||||
int32_t numUnfinishedTasks;
|
||||
int32_t pad[3];
|
||||
std::vector<int> waitingTasks;
|
||||
bool inActiveList;
|
||||
};
|
||||
|
||||
#endif // ISPC_USE_PTHREADS
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Grand Central Dispatch
|
||||
|
||||
#ifdef ISPC_USE_GCD
|
||||
|
||||
/* A simple task system for ispc programs based on Apple's Grand Central
|
||||
Dispatch. */
|
||||
|
||||
static dispatch_queue_t gcdQueue;
|
||||
static volatile int32_t lock = 0;
|
||||
|
||||
static void
|
||||
InitTaskSystem() {
|
||||
if (gcdQueue != NULL)
|
||||
return;
|
||||
|
||||
while (1) {
|
||||
if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {
|
||||
if (gcdQueue == NULL) {
|
||||
gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
|
||||
assert(gcdQueue != NULL);
|
||||
lMemFence();
|
||||
}
|
||||
lock = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
lRunTask(void *ti) {
|
||||
TaskInfo *taskInfo = (TaskInfo *)ti;
|
||||
// FIXME: these are bogus values; may cause bugs in code that depends
|
||||
// on them having unique values in different threads.
|
||||
int threadIndex = 0;
|
||||
int threadCount = 1;
|
||||
|
||||
// Actually run the task
|
||||
taskInfo->func(taskInfo->data, threadIndex, threadCount,
|
||||
taskInfo->taskIndex, taskInfo->taskCount);
|
||||
}
|
||||
|
||||
|
||||
inline void
|
||||
TaskGroup::Launch(int baseIndex, int count) {
|
||||
for (int i = 0; i < count; ++i) {
|
||||
TaskInfo *ti = GetTaskInfo(baseIndex + i);
|
||||
dispatch_group_async_f(gcdGroup, gcdQueue, ti, lRunTask);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
inline void
|
||||
TaskGroup::Sync() {
|
||||
dispatch_group_wait(gcdGroup, DISPATCH_TIME_FOREVER);
|
||||
}
|
||||
|
||||
#endif // ISPC_USE_GCD
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
// Concurrency Runtime

#ifdef ISPC_USE_CONCRT

static void
InitTaskSystem() {
    // No initialization needed
}


static void __cdecl
lRunTask(LPVOID param) {
    TaskInfo *ti = (TaskInfo *)param;

    // Actually run the task.
    // FIXME: like the GCD implementation for OS X, this is passing bogus
    // values for the threadIndex and threadCount builtins, which in turn
    // will cause bugs in code that uses those.
    int threadIndex = 0;
    int threadCount = 1;
    ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount);

    // Signal the event that this task is done
    ti->taskEvent.set();
}


inline void
TaskGroup::Launch(int baseIndex, int count) {
    for (int i = 0; i < count; ++i)
        CurrentScheduler::ScheduleTask(lRunTask, GetTaskInfo(baseIndex + i));
}


inline void
TaskGroup::Sync() {
    for (int i = 0; i < nextTaskInfoIndex; ++i) {
        TaskInfo *ti = GetTaskInfo(i);
        ti->taskEvent.wait();
        ti->taskEvent.reset();
    }
}

#endif // ISPC_USE_CONCRT

///////////////////////////////////////////////////////////////////////////
// pthreads

#ifdef ISPC_USE_PTHREADS

static volatile int32_t lock = 0;

static int nThreads;
static pthread_t *threads = NULL;

static pthread_mutex_t taskSysMutex;
static std::vector<TaskGroup *> activeTaskGroups;
static sem_t *workerSemaphore;


static inline int32_t
lAtomicAdd(int32_t *v, int32_t delta) {
    int32_t origValue;
    __asm__ __volatile__("lock\n"
                         "xaddl %0,%1"
                         : "=r"(origValue), "=m"(*v) : "0"(delta)
                         : "memory");
    return origValue;
}

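lAtomicAdd() above is x86-specific inline assembly (a locked xadd that returns the previous value). On compilers and targets where that syntax is unavailable, the same fetch-and-add can be expressed with a compiler intrinsic; the sketch below is an illustrative alternative, not what this commit uses.

// Illustrative portable equivalent of lAtomicAdd(); hypothetical, not part
// of this commit.  Returns the value *v held before the addition.
#include <stdint.h>

static inline int32_t atomicAdd32(int32_t *v, int32_t delta) {
    // GCC/Clang builtin: atomically performs { old = *v; *v += delta; },
    // returns old, and acts as a full memory barrier.
    return __sync_fetch_and_add(v, delta);
}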
static void *
lTaskEntry(void *arg) {
    int threadIndex = (int)((int64_t)arg);
    int threadCount = nThreads;

    while (1) {
        int err;
        //
        // Wait on the semaphore until we're woken up due to the arrival of
        // more work.
        //
        if ((err = sem_wait(workerSemaphore)) != 0) {
            fprintf(stderr, "Error from sem_wait: %s\n", strerror(err));
            exit(1);
        }

        //
        // Acquire the mutex
        //
        if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) {
            fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
            exit(1);
        }

        if (activeTaskGroups.size() == 0) {
            //
            // Task queue is empty, go back and wait on the semaphore
            //
            if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
                fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
                exit(1);
            }
            continue;
        }

        //
        // Get the last task group on the active list and the last task
        // from its waiting tasks list.
        //
        TaskGroup *tg = activeTaskGroups.back();
        assert(tg->waitingTasks.size() > 0);
        int taskNumber = tg->waitingTasks.back();
        tg->waitingTasks.pop_back();

        if (tg->waitingTasks.size() == 0) {
            // We just took the last task from this task group, so remove
            // it from the active list.
            activeTaskGroups.pop_back();
            tg->inActiveList = false;
        }

        if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
            fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
            exit(1);
        }

        //
        // And now actually run the task
        //
        DBG(fprintf(stderr, "running task %d from group %p\n", taskNumber, tg));
        TaskInfo *myTask = tg->GetTaskInfo(taskNumber);
        myTask->func(myTask->data, threadIndex, threadCount, myTask->taskIndex,
                     myTask->taskCount);

        //
        // Decrement the "number of unfinished tasks" counter in the task
        // group.
        //
        lMemFence();
        lAtomicAdd(&tg->numUnfinishedTasks, -1);
    }

    pthread_exit(NULL);
    return 0;
}

static void
InitTaskSystem() {
    if (threads == NULL) {
        while (1) {
            if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {
                if (threads == NULL) {
                    // We launch one fewer thread than there are cores,
                    // since the main thread here will also grab jobs from
                    // the task queue itself.
                    nThreads = sysconf(_SC_NPROCESSORS_ONLN) - 1;

                    int err;
                    if ((err = pthread_mutex_init(&taskSysMutex, NULL)) != 0) {
                        fprintf(stderr, "Error creating mutex: %s\n", strerror(err));
                        exit(1);
                    }

                    char name[32];
                    sprintf(name, "ispc_task.%d", (int)getpid());
                    workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0);
                    if (!workerSemaphore) {
                        fprintf(stderr, "Error creating semaphore: %s\n", strerror(err));
                        exit(1);
                    }

                    threads = (pthread_t *)malloc(nThreads * sizeof(pthread_t));
                    for (int i = 0; i < nThreads; ++i) {
                        err = pthread_create(&threads[i], NULL, &lTaskEntry, (void *)(i));
                        if (err != 0) {
                            fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err));
                            exit(1);
                        }
                    }

                    activeTaskGroups.reserve(64);
                }

                // Make sure all of the above goes to memory before we
                // clear the lock.
                lMemFence();
                lock = 0;
                break;
            }
        }
    }
}

inline void
TaskGroup::Launch(int baseCoord, int count) {
    //
    // Acquire mutex, add task
    //
    int err;
    if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) {
        fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
        exit(1);
    }

    // Add the corresponding set of tasks to the waiting-to-be-run list for
    // this task group.
    //
    // FIXME: it's a little ugly to hold a global mutex for this when we
    // only need to make sure no one else is accessing this task group's
    // waitingTasks list. (But a small experiment in switching to a
    // per-TaskGroup mutex showed worse performance!)
    for (int i = 0; i < count; ++i)
        waitingTasks.push_back(baseCoord + i);

    // Add the task group to the global active list if it isn't there
    // already.
    if (inActiveList == false) {
        activeTaskGroups.push_back(this);
        inActiveList = true;
    }

    if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
        fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
        exit(1);
    }

    //
    // Update the count of the number of tasks left to run in this task
    // group.
    //
    lMemFence();
    lAtomicAdd(&numUnfinishedTasks, count);

    //
    // Post to the worker semaphore to wake up worker threads that are
    // sleeping waiting for tasks to show up
    //
    for (int i = 0; i < count; ++i)
        if ((err = sem_post(workerSemaphore)) != 0) {
            fprintf(stderr, "Error from sem_post: %s\n", strerror(err));
            exit(1);
        }
}

inline void
TaskGroup::Sync() {
    DBG(fprintf(stderr, "syncing %p - %d unfinished\n", this, numUnfinishedTasks));

    while (numUnfinishedTasks > 0) {
        // Not all of the tasks in this group have finished yet. We'll try
        // to help out here since we don't have anything else to do...

        DBG(fprintf(stderr, "while syncing %p - %d unfinished\n", this,
                    numUnfinishedTasks));

        //
        // Acquire the global task system mutex to grab a task to work on
        //
        int err;
        if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) {
            fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
            exit(1);
        }

        TaskInfo *myTask = NULL;
        TaskGroup *runtg = this;
        if (waitingTasks.size() > 0) {
            int taskNumber = waitingTasks.back();
            waitingTasks.pop_back();

            if (waitingTasks.size() == 0) {
                // There's nothing left to start running from this group,
                // so remove it from the active task list.
                activeTaskGroups.erase(std::find(activeTaskGroups.begin(),
                                                 activeTaskGroups.end(), this));
                inActiveList = false;
            }
            myTask = GetTaskInfo(taskNumber);
            DBG(fprintf(stderr, "running task %d from group %p in sync\n", taskNumber, this));
        }
        else {
            // Other threads are already working on all of the tasks in
            // this group, so we can't help out by running one ourselves.
            // We'll try to run one from another group to make ourselves
            // useful here.
            if (activeTaskGroups.size() == 0) {
                // No active task groups left--there's nothing for us to do.
                if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
                    fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
                    exit(1);
                }
                // FIXME: We basically end up busy-waiting here, which is
                // extra wasteful in a world with hyperthreading. It would
                // be much better to put this thread to sleep on a
                // condition variable that was signaled when the last task
                // in this group was finished.
                sleep(0);
                continue;
            }

            // Get a task to run from another task group.
            runtg = activeTaskGroups.back();
            assert(runtg->waitingTasks.size() > 0);

            int taskNumber = runtg->waitingTasks.back();
            runtg->waitingTasks.pop_back();
            if (runtg->waitingTasks.size() == 0) {
                // There's nothing left to start running from this group,
                // so remove it from the active task list.
                activeTaskGroups.pop_back();
                runtg->inActiveList = false;
            }
            myTask = runtg->GetTaskInfo(taskNumber);
            DBG(fprintf(stderr, "running task %d from other group %p in sync\n",
                        taskNumber, runtg));
        }

        if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
            fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
            exit(1);
        }

        //
        // Do work for _myTask_
        //
        // FIXME: bogus values for thread index/thread count here as well...
        myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount);

        //
        // Decrement the number of unfinished tasks counter
        //
        lMemFence();
        lAtomicAdd(&runtg->numUnfinishedTasks, -1);
    }
    DBG(fprintf(stderr, "sync for %p done!\n", this));
}

#endif // ISPC_USE_PTHREADS

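The FIXME in Sync() notes that, once no runnable tasks remain anywhere, the syncing thread spins on sleep(0) until the group's counter drops to zero. A sketch of the condition-variable approach the comment suggests is shown below; it only illustrates the idea, and the names and structure here are invented for the example rather than taken from the ispc sources.

// Hypothetical sketch of the "sleep on a condition variable" idea from the
// FIXME above; not the implementation used by this commit.
#include <pthread.h>

struct GroupWait {
    pthread_mutex_t mutex;
    pthread_cond_t  allDone;      // signaled when 'unfinished' hits zero
    int             unfinished;   // protected by 'mutex'
};

// Called by a worker after it finishes one task from the group.
static void taskFinished(GroupWait *g) {
    pthread_mutex_lock(&g->mutex);
    if (--g->unfinished == 0)
        pthread_cond_broadcast(&g->allDone);
    pthread_mutex_unlock(&g->mutex);
}

// Called by the thread executing Sync(): blocks instead of busy-waiting.
static void waitForGroup(GroupWait *g) {
    pthread_mutex_lock(&g->mutex);
    while (g->unfinished > 0)
        pthread_cond_wait(&g->allDone, &g->mutex);
    pthread_mutex_unlock(&g->mutex);
}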
///////////////////////////////////////////////////////////////////////////

#define MAX_FREE_TASK_GROUPS 64
static TaskGroup *freeTaskGroups[MAX_FREE_TASK_GROUPS];

static inline TaskGroup *
AllocTaskGroup() {
    for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) {
        TaskGroup *tg = freeTaskGroups[i];
        if (tg != NULL) {
            void *ptr = lAtomicCompareAndSwapPointer((void **)(&freeTaskGroups[i]), NULL, tg);
            if (ptr != NULL) {
                assert(ptr == tg);
                return (TaskGroup *)ptr;
            }
        }
    }

    return new TaskGroup;
}


static inline void
FreeTaskGroup(TaskGroup *tg) {
    tg->Reset();

    for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) {
        if (freeTaskGroups[i] == NULL) {
            void *ptr = lAtomicCompareAndSwapPointer((void **)&freeTaskGroups[i], tg, NULL);
            if (ptr == NULL)
                return;
        }
    }

    delete tg;
}

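AllocTaskGroup() and FreeTaskGroup() implement a small lock-free free list: a slot is claimed by compare-and-swapping its pointer to NULL, and a group is returned by compare-and-swapping a NULL slot back to a pointer, with the heap as the fallback in both directions. The same idea expressed with std::atomic is sketched below purely for comparison; the names are invented here, and the code above deliberately uses its own CAS helpers rather than <atomic> so it can build with the compilers of its era.

// Illustrative std::atomic version of the free-list trick above;
// hypothetical, not part of this commit.
#include <atomic>

struct Slot {};  // stand-in for TaskGroup

static const int kMaxFree = 64;
static std::atomic<Slot *> freeList[kMaxFree];

static Slot *takeFromFreeList() {
    for (int i = 0; i < kMaxFree; ++i) {
        Slot *expected = freeList[i].load();
        // Claim the slot only if it still holds the pointer we just read.
        if (expected != nullptr &&
            freeList[i].compare_exchange_strong(expected, nullptr))
            return expected;
    }
    return new Slot;  // free list empty: fall back to the heap
}

static void returnToFreeList(Slot *s) {
    for (int i = 0; i < kMaxFree; ++i) {
        Slot *expected = nullptr;
        // Park the pointer in the first slot that is still empty.
        if (freeList[i].compare_exchange_strong(expected, s))
            return;
    }
    delete s;  // no room: just release it
}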
///////////////////////////////////////////////////////////////////////////

// ispc expects these functions to have C linkage / not be mangled
extern "C" {
    void ISPCLaunch(void **handlePtr, void *f, void *data, int count);
    void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
    void ISPCSync(void *handle);
}

void
ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count) {
    TaskGroup *taskGroup;
    if (*taskGroupPtr == NULL) {
        InitTaskSystem();
        taskGroup = AllocTaskGroup();
        *taskGroupPtr = taskGroup;
    }
    else
        taskGroup = (TaskGroup *)(*taskGroupPtr);

    int baseIndex = taskGroup->AllocTaskInfo(count);
    for (int i = 0; i < count; ++i) {
        TaskInfo *ti = taskGroup->GetTaskInfo(baseIndex+i);
        ti->func = (TaskFuncType)func;
        ti->data = data;
        ti->taskIndex = i;
        ti->taskCount = count;
    }
    taskGroup->Launch(baseIndex, count);
}


void
ISPCSync(void *h) {
    TaskGroup *taskGroup = (TaskGroup *)h;
    if (taskGroup != NULL) {
        taskGroup->Sync();
        FreeTaskGroup(taskGroup);
    }
}


void *
ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment) {
    TaskGroup *taskGroup;
    if (*taskGroupPtr == NULL) {
        InitTaskSystem();
        taskGroup = AllocTaskGroup();
        *taskGroupPtr = taskGroup;
    }
    else
        taskGroup = (TaskGroup *)(*taskGroupPtr);

    return taskGroup->AllocMemory(size, alignment);
}
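These three extern "C" functions are the entire interface that compiler-generated code needs from the task system. Roughly, and only as a hand-written paraphrase rather than literal compiler output, a launch in an ispc program turns into a calling sequence of the following shape on the C/C++ side:

// Rough, hand-written illustration of the sequence ispc-generated code
// follows when using this task system; not actual compiler output.
#include <stdint.h>
#include <string.h>

extern "C" {
    void  ISPCLaunch(void **handlePtr, void *f, void *data, int count);
    void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
    void  ISPCSync(void *handle);
}

// Task function with the signature the task system invokes (TaskFuncType).
static void myTask(void *data, int threadIndex, int threadCount,
                   int taskIndex, int taskCount) {
    /* ... unpack data and do one slice of the work ... */
}

void launchAndWait(const float *args, int64_t argBytes, int nTasks) {
    void *handle = NULL;            // one handle == one task group

    // 1. Get storage for the task arguments from the task system.
    void *data = ISPCAlloc(&handle, argBytes, 32);
    memcpy(data, args, argBytes);

    // 2. Enqueue nTasks instances of myTask sharing that argument block.
    ISPCLaunch(&handle, (void *)myTask, data, nTasks);

    // 3. Block until every launched task has finished; the task group is
    //    then recycled by ISPCSync().
    ISPCSync(handle);
}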
examples/timing.h
@@ -38,7 +38,9 @@
#include <windows.h>
#define rdtsc __rdtsc
#else
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
__inline__ uint64_t rdtsc() {
    uint32_t low, high;
    __asm__ __volatile__ (
@@ -48,7 +50,9 @@ extern "C" {
    "rdtsc" : "=a" (low), "=d" (high));
    return (uint64_t)high << 32 | low;
}
#ifdef __cplusplus
}
#endif /* __cplusplus */
#endif

static uint64_t start, end;

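This rdtsc() wrapper feeds the reset_and_start_timer() / get_elapsed_mcycles() helpers that the example programs call in their benchmark loops (see volume.cpp below). The sketch that follows is a simplified illustration of that measurement pattern, not the literal contents of the timing header; the helper bodies and the cycle divisor are assumptions for the example.

// Simplified illustration of the rdtsc-based measurement pattern used by
// the examples; hypothetical standalone code, not part of this commit.
#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>   // __rdtsc() intrinsic (GCC/Clang)

static uint64_t timerStart;

static void reset_and_start_timer() {
    timerStart = __rdtsc();
}

// Elapsed time since the last reset, in millions of cycles.
static double get_elapsed_mcycles() {
    return (__rdtsc() - timerStart) / (1024. * 1024.);
}

// Usage mirrors volume.cpp: run the kernel three times, report the minimum.
void benchmark(void (*kernel)()) {
    double best = 1e30;
    for (int i = 0; i < 3; ++i) {
        reset_and_start_timer();
        kernel();
        double mcycles = get_elapsed_mcycles();
        if (mcycles < best)
            best = mcycles;
    }
    printf("best of 3 runs: %.3f million cycles\n", best);
}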
2	examples/volume_rendering/.gitignore	(vendored) Normal file
@@ -0,0 +1,2 @@
mandelbrot
*.ppm
35	examples/volume_rendering/Makefile	Normal file
@@ -0,0 +1,35 @@

ARCH = $(shell uname)

TASK_CXX=../tasksys.cpp
TASK_LIB=-lpthread
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))

CXX=g++
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
ISPC=ispc
ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64

default: volume

.PHONY: dirs clean

dirs:
	/bin/mkdir -p objs/

clean:
	/bin/rm -rf objs *~ volume

volume: dirs objs/volume.o objs/volume_serial.o objs/volume_ispc.o $(TASK_OBJ)
	$(CXX) $(CXXFLAGS) -o $@ objs/volume.o objs/volume_ispc.o objs/volume_serial.o $(TASK_OBJ) -lm $(TASK_LIB)

objs/%.o: %.cpp
	$(CXX) $< $(CXXFLAGS) -c -o $@

objs/%.o: ../%.cpp
	$(CXX) $< $(CXXFLAGS) -c -o $@

objs/volume.o: objs/volume_ispc.h

objs/%_ispc.h objs/%_ispc.o: %.ispc
	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
11	examples/volume_rendering/camera.dat	Normal file
@@ -0,0 +1,11 @@
896 1184

0.000155 0.000000 0.000000 -0.069927
0.000000 -0.000155 0.000000 0.093236
0.000000 0.000000 0.000000 1.000000
0.000000 0.000000 -99.999001 100.000000

1.000000 0.000000 0.000000 1.000000
0.000000 0.980129 -0.198360 2.900000
0.000000 0.198360 0.980129 -10.500000
0.000000 0.000000 0.000000 1.000000
5	examples/volume_rendering/density_highres.vol	Normal file
File diff suppressed because one or more lines are too long
4	examples/volume_rendering/density_lowres.vol	Normal file
File diff suppressed because one or more lines are too long
248	examples/volume_rendering/volume.cpp	Normal file
@@ -0,0 +1,248 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define NOMINMAX
|
||||
#pragma warning (disable: 4244)
|
||||
#pragma warning (disable: 4305)
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include "../timing.h"
|
||||
#include "../cpuid.h"
|
||||
#include "volume_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
extern void volume_serial(float density[], int nVoxels[3],
|
||||
const float raster2camera[4][4],
|
||||
const float camera2world[4][4],
|
||||
int width, int height, float image[]);
|
||||
|
||||
/* Write a PPM image file with the image */
|
||||
static void
|
||||
writePPM(float *buf, int width, int height, const char *fn) {
|
||||
FILE *fp = fopen(fn, "wb");
|
||||
fprintf(fp, "P6\n");
|
||||
fprintf(fp, "%d %d\n", width, height);
|
||||
fprintf(fp, "255\n");
|
||||
for (int i = 0; i < width*height; ++i) {
|
||||
float v = buf[i] * 255.f;
|
||||
if (v < 0.f) v = 0.f;
|
||||
else if (v > 255.f) v = 255.f;
|
||||
unsigned char c = (unsigned char)v;
|
||||
for (int j = 0; j < 3; ++j)
|
||||
fputc(c, fp);
|
||||
}
|
||||
fclose(fp);
|
||||
printf("Wrote image file %s\n", fn);
|
||||
}
|
||||
|
||||
|
||||
// Make sure that the vector ISA used during compilation is supported by
|
||||
// the processor. The ISPC_TARGET_* macro is set in the ispc-generated
|
||||
// header file that we include above.
|
||||
static void
|
||||
ensureTargetISAIsSupported() {
|
||||
#if defined(ISPC_TARGET_SSE2)
|
||||
bool isaSupported = CPUSupportsSSE2();
|
||||
const char *target = "SSE2";
|
||||
#elif defined(ISPC_TARGET_SSE4)
|
||||
bool isaSupported = CPUSupportsSSE4();
|
||||
const char *target = "SSE4";
|
||||
#elif defined(ISPC_TARGET_AVX)
|
||||
bool isaSupported = CPUSupportsAVX();
|
||||
const char *target = "AVX";
|
||||
#else
|
||||
#error "Unknown ISPC_TARGET_* value"
|
||||
#endif
|
||||
if (!isaSupported) {
|
||||
fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
|
||||
"set, which isn't\n*** supported by this computer's CPU!\n", target);
|
||||
fprintf(stderr, "***\n*** Please modify the "
|
||||
#ifdef _MSC_VER
|
||||
"MSVC project file "
|
||||
#else
|
||||
"Makefile "
|
||||
#endif
|
||||
"to select another target (e.g. sse2)\n***\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
/* Load image and viewing parameters from a camera data file.
|
||||
FIXME: we should add support to be able to specify viewing parameters
|
||||
in the program here directly. */
|
||||
static void
|
||||
loadCamera(const char *fn, int *width, int *height, float raster2camera[4][4],
|
||||
float camera2world[4][4]) {
|
||||
FILE *f = fopen(fn, "r");
|
||||
if (!f) {
|
||||
perror(fn);
|
||||
exit(1);
|
||||
}
|
||||
if (fscanf(f, "%d %d", width, height) != 2) {
|
||||
fprintf(stderr, "Unexpected end of file in camera file\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
if (fscanf(f, "%f", &raster2camera[i][j]) != 1) {
|
||||
fprintf(stderr, "Unexpected end of file in camera file\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
if (fscanf(f, "%f", &camera2world[i][j]) != 1) {
|
||||
fprintf(stderr, "Unexpected end of file in camera file\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
fclose(f);
|
||||
}
|
||||
|
||||
|
||||
/* Load a volume density file. Expects the number of x, y, and z samples
|
||||
as the first three values (as integer strings), then x*y*z
|
||||
floating-point values (also as strings) to give the densities. */
|
||||
static float *
|
||||
loadVolume(const char *fn, int n[3]) {
|
||||
FILE *f = fopen(fn, "r");
|
||||
if (!f) {
|
||||
perror(fn);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (fscanf(f, "%d %d %d", &n[0], &n[1], &n[2]) != 3) {
|
||||
fprintf(stderr, "Couldn't find resolution at start of density file\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
int count = n[0] * n[1] * n[2];
|
||||
float *v = new float[count];
|
||||
for (int i = 0; i < count; ++i) {
|
||||
if (fscanf(f, "%f", &v[i]) != 1) {
|
||||
fprintf(stderr, "Unexpected end of file at %d'th density value\n", i);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
if (argc != 3) {
|
||||
fprintf(stderr, "usage: volume <camera.dat> <volume_density.vol>\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
ensureTargetISAIsSupported();
|
||||
|
||||
//
|
||||
// Load viewing data and the volume density data
|
||||
//
|
||||
int width, height;
|
||||
float raster2camera[4][4], camera2world[4][4];
|
||||
loadCamera(argv[1], &width, &height, raster2camera, camera2world);
|
||||
float *image = new float[width*height];
|
||||
|
||||
int n[3];
|
||||
float *density = loadVolume(argv[2], n);
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation; report the minimum
|
||||
// time of three runs.
|
||||
//
|
||||
double minISPC = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
volume_ispc(density, n, raster2camera, camera2world,
|
||||
width, height, image);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minISPC = std::min(minISPC, dt);
|
||||
}
|
||||
|
||||
printf("[volume ispc 1 core]:\t\t[%.3f] million cycles\n", minISPC);
|
||||
writePPM(image, width, height, "volume-ispc-1core.ppm");
|
||||
|
||||
// Clear out the buffer
|
||||
for (int i = 0; i < width * height; ++i)
|
||||
image[i] = 0.;
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation that also uses
|
||||
// tasks; report the minimum time of three runs.
|
||||
//
|
||||
double minISPCtasks = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
volume_ispc_tasks(density, n, raster2camera, camera2world,
|
||||
width, height, image);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minISPCtasks = std::min(minISPCtasks, dt);
|
||||
}
|
||||
|
||||
printf("[volume ispc + tasks]:\t\t[%.3f] million cycles\n", minISPCtasks);
|
||||
writePPM(image, width, height, "volume-ispc-tasks.ppm");
|
||||
|
||||
// Clear out the buffer
|
||||
for (int i = 0; i < width * height; ++i)
|
||||
image[i] = 0.;
|
||||
|
||||
//
|
||||
// And run the serial implementation 3 times, again reporting the
|
||||
// minimum time.
|
||||
//
|
||||
double minSerial = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
volume_serial(density, n, raster2camera, camera2world,
|
||||
width, height, image);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minSerial = std::min(minSerial, dt);
|
||||
}
|
||||
|
||||
printf("[volume serial]:\t\t[%.3f] million cycles\n", minSerial);
|
||||
writePPM(image, width, height, "volume-serial.ppm");
|
||||
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC serial, %.2fx from ISPC+tasks)\n",
|
||||
minSerial/minISPC, minSerial / minISPCtasks);
|
||||
|
||||
return 0;
|
||||
}
|
||||
385	examples/volume_rendering/volume.ispc	Normal file
@@ -0,0 +1,385 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
typedef float<3> float3;
|
||||
|
||||
struct Ray {
|
||||
float3 origin, dir;
|
||||
};
|
||||
|
||||
|
||||
static void
|
||||
generateRay(const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
float x, float y, reference Ray ray) {
|
||||
// transform raster coordinate (x, y, 0) to camera space
|
||||
float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3];
|
||||
float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3];
|
||||
float camz = raster2camera[2][3];
|
||||
float camw = raster2camera[3][3];
|
||||
camx /= camw;
|
||||
camy /= camw;
|
||||
camz /= camw;
|
||||
|
||||
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz;
|
||||
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz;
|
||||
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz;
|
||||
|
||||
ray.origin.x = camera2world[0][3] / camera2world[3][3];
|
||||
ray.origin.y = camera2world[1][3] / camera2world[3][3];
|
||||
ray.origin.z = camera2world[2][3] / camera2world[3][3];
|
||||
}
|
||||
|
||||
|
||||
static inline bool
|
||||
Inside(float3 p, float3 pMin, float3 pMax) {
|
||||
return (p.x >= pMin.x && p.x <= pMax.x &&
|
||||
p.y >= pMin.y && p.y <= pMax.y &&
|
||||
p.z >= pMin.z && p.z <= pMax.z);
|
||||
}
|
||||
|
||||
|
||||
static bool
|
||||
IntersectP(Ray ray, float3 pMin, float3 pMax, reference float hit0, reference float hit1) {
|
||||
float t0 = -1e30, t1 = 1e30;
|
||||
|
||||
float3 tNear = (pMin - ray.origin) / ray.dir;
|
||||
float3 tFar = (pMax - ray.origin) / ray.dir;
|
||||
if (tNear.x > tFar.x) {
|
||||
float tmp = tNear.x;
|
||||
tNear.x = tFar.x;
|
||||
tFar.x = tmp;
|
||||
}
|
||||
t0 = max(tNear.x, t0);
|
||||
t1 = min(tFar.x, t1);
|
||||
|
||||
if (tNear.y > tFar.y) {
|
||||
float tmp = tNear.y;
|
||||
tNear.y = tFar.y;
|
||||
tFar.y = tmp;
|
||||
}
|
||||
t0 = max(tNear.y, t0);
|
||||
t1 = min(tFar.y, t1);
|
||||
|
||||
if (tNear.z > tFar.z) {
|
||||
float tmp = tNear.z;
|
||||
tNear.z = tFar.z;
|
||||
tFar.z = tmp;
|
||||
}
|
||||
t0 = max(tNear.z, t0);
|
||||
t1 = min(tFar.z, t1);
|
||||
|
||||
if (t0 <= t1) {
|
||||
hit0 = t0;
|
||||
hit1 = t1;
|
||||
return true;
|
||||
}
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
static inline float Lerp(float t, float a, float b) {
|
||||
return (1.f - t) * a + t * b;
|
||||
}
|
||||
|
||||
|
||||
static inline float D(int x, int y, int z, uniform int nVoxels[3],
|
||||
uniform float density[]) {
|
||||
x = clamp(x, 0, nVoxels[0]-1);
|
||||
y = clamp(y, 0, nVoxels[1]-1);
|
||||
z = clamp(z, 0, nVoxels[2]-1);
|
||||
|
||||
return density[z*nVoxels[0]*nVoxels[1] + y*nVoxels[0] + x];
|
||||
}
|
||||
|
||||
|
||||
static inline float Du(uniform int x, uniform int y, uniform int z,
|
||||
uniform int nVoxels[3], uniform float density[]) {
|
||||
x = clamp(x, 0, nVoxels[0]-1);
|
||||
y = clamp(y, 0, nVoxels[1]-1);
|
||||
z = clamp(z, 0, nVoxels[2]-1);
|
||||
|
||||
return density[z*nVoxels[0]*nVoxels[1] + y*nVoxels[0] + x];
|
||||
}
|
||||
|
||||
|
||||
static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
|
||||
return (p - pMin) / (pMax - pMin);
|
||||
}
|
||||
|
||||
|
||||
static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
|
||||
uniform float density[], uniform int nVoxels[3],
|
||||
reference uniform bool checkForSameVoxel) {
|
||||
if (!Inside(Pobj, pMin, pMax))
|
||||
return 0;
|
||||
// Compute voxel coordinates and offsets for _Pobj_
|
||||
float3 vox = Offset(Pobj, pMin, pMax);
|
||||
vox.x = vox.x * nVoxels[0] - .5f;
|
||||
vox.y = vox.y * nVoxels[1] - .5f;
|
||||
vox.z = vox.z * nVoxels[2] - .5f;
|
||||
int vx = (int)(vox.x), vy = (int)(vox.y), vz = (int)(vox.z);
|
||||
float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz;
|
||||
|
||||
// Trilinearly interpolate density values to compute local density
|
||||
float d00, d10, d01, d11;
|
||||
uniform int uvx, uvy, uvz;
|
||||
if (checkForSameVoxel && reduce_equal(vx, uvx) && reduce_equal(vy, uvy) &&
|
||||
reduce_equal(vz, uvz)) {
|
||||
// If all of the program instances are inside the same voxel, then
|
||||
// we'll call the 'uniform' variant of the voxel density lookup
|
||||
// function, thus doing a single load for each value rather than a
|
||||
// gather.
|
||||
d00 = Lerp(dx, Du(uvx, uvy, uvz, nVoxels, density),
|
||||
Du(uvx+1, uvy, uvz, nVoxels, density));
|
||||
d10 = Lerp(dx, Du(uvx, uvy+1, uvz, nVoxels, density),
|
||||
Du(uvx+1, uvy+1, uvz, nVoxels, density));
|
||||
d01 = Lerp(dx, Du(uvx, uvy, uvz+1, nVoxels, density),
|
||||
Du(uvx+1, uvy, uvz+1, nVoxels, density));
|
||||
d11 = Lerp(dx, Du(uvx, uvy+1, uvz+1, nVoxels, density),
|
||||
Du(uvx+1, uvy+1, uvz+1, nVoxels, density));
|
||||
}
|
||||
else {
|
||||
// Otherwise, we have to do an actual gather in the more general
|
||||
// D() function. Once the reduce_equal tests above fail, we stop
|
||||
// checking in subsequent steps, since it's unlikely that this will
|
||||
// be true in the future once they've diverged into different
|
||||
// voxels.
|
||||
checkForSameVoxel = false;
|
||||
d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
|
||||
D(vx+1, vy, vz, nVoxels, density));
|
||||
d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
|
||||
D(vx+1, vy+1, vz, nVoxels, density));
|
||||
d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
|
||||
D(vx+1, vy, vz+1, nVoxels, density));
|
||||
d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
|
||||
D(vx+1, vy+1, vz+1, nVoxels, density));
|
||||
}
|
||||
float d0 = Lerp(dy, d00, d10);
|
||||
float d1 = Lerp(dy, d01, d11);
|
||||
return Lerp(dz, d0, d1);
|
||||
}
|
||||
|
||||
|
||||
/* Returns the transmittance between two points p0 and p1, in a volume
|
||||
with extent (pMin,pMax) with transmittance coefficient sigma_t,
|
||||
defined by nVoxels[3] voxels in each dimension in the given density
|
||||
array. */
|
||||
static float
|
||||
transmittance(uniform float3 p0, float3 p1, uniform float3 pMin,
|
||||
uniform float3 pMax, uniform float sigma_t,
|
||||
uniform float density[], uniform int nVoxels[3]) {
|
||||
float rayT0, rayT1;
|
||||
Ray ray;
|
||||
ray.origin = p1;
|
||||
ray.dir = p0 - p1;
|
||||
|
||||
// Find the parametric t range along the ray that is inside the volume.
|
||||
if (!IntersectP(ray, pMin, pMax, rayT0, rayT1))
|
||||
return 1.;
|
||||
|
||||
rayT0 = max(rayT0, 0.f);
|
||||
|
||||
// Accumulate beam transmittance in tau
|
||||
float tau = 0;
|
||||
float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
|
||||
ray.dir.z * ray.dir.z);
|
||||
uniform float stepDist = 0.2;
|
||||
float stepT = stepDist / rayLength;
|
||||
|
||||
float t = rayT0;
|
||||
float3 pos = ray.origin + ray.dir * rayT0;
|
||||
float3 dirStep = ray.dir * stepT;
|
||||
uniform bool checkForSameVoxel = true;
|
||||
while (t < rayT1) {
|
||||
tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels,
|
||||
checkForSameVoxel);
|
||||
pos = pos + dirStep;
|
||||
t += stepT;
|
||||
}
|
||||
|
||||
return exp(-tau);
|
||||
}
|
||||
|
||||
|
||||
static inline float
|
||||
distanceSquared(float3 a, float3 b) {
|
||||
float3 d = a-b;
|
||||
return d.x*d.x + d.y*d.y + d.z*d.z;
|
||||
}
|
||||
|
||||
|
||||
static float
|
||||
raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
|
||||
float rayT0, rayT1;
|
||||
uniform float3 pMin = {.3, -.2, .3}, pMax = {1.8, 2.3, 1.8};
|
||||
uniform float3 lightPos = { -1, 4, 1.5 };
|
||||
|
||||
cif (!IntersectP(ray, pMin, pMax, rayT0, rayT1))
|
||||
return 0.;
|
||||
|
||||
rayT0 = max(rayT0, 0.f);
|
||||
|
||||
// Parameters that define the volume scattering characteristics and
|
||||
// sampling rate for raymarching
|
||||
uniform float Le = .25; // Emission coefficient
|
||||
uniform float sigma_a = 10; // Absorption coefficient
|
||||
uniform float sigma_s = 10; // Scattering coefficient
|
||||
uniform float stepDist = 0.025; // Ray step amount
|
||||
uniform float lightIntensity = 40; // Light source intensity
|
||||
|
||||
float tau = 0.f; // accumulated beam transmittance
|
||||
float L = 0; // radiance along the ray
|
||||
float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
|
||||
ray.dir.z * ray.dir.z);
|
||||
float stepT = stepDist / rayLength;
|
||||
|
||||
float t = rayT0;
|
||||
float3 pos = ray.origin + ray.dir * rayT0;
|
||||
float3 dirStep = ray.dir * stepT;
|
||||
uniform bool checkForSameVoxel = true;
|
||||
cwhile (t < rayT1) {
|
||||
float d = Density(pos, pMin, pMax, density, nVoxels, checkForSameVoxel);
|
||||
|
||||
// terminate once attenuation is high
|
||||
float atten = exp(-tau);
|
||||
if (atten < .005)
|
||||
cbreak;
|
||||
|
||||
// direct lighting
|
||||
float Li = lightIntensity / distanceSquared(lightPos, pos) *
|
||||
transmittance(lightPos, pos, pMin, pMax, sigma_a + sigma_s,
|
||||
density, nVoxels);
|
||||
L += stepDist * atten * d * sigma_s * (Li + Le);
|
||||
|
||||
// update beam transmittance
|
||||
tau += stepDist * (sigma_a + sigma_s) * d;
|
||||
|
||||
pos = pos + dirStep;
|
||||
t += stepT;
|
||||
}
|
||||
|
||||
// Gamma correction
|
||||
return pow(L, 1.f / 2.2f);
|
||||
}
|
||||
|
||||
|
||||
/* Utility routine used by both the task-based and the single-core entrypoints.
|
||||
Renders a tile of the image, covering [x0,x1) * [y0, y1), storing the
|
||||
result into the image[] array.
|
||||
*/
|
||||
static void
|
||||
volume_tile(uniform int x0, uniform int y0, uniform int x1,
|
||||
uniform int y1, uniform float density[], uniform int nVoxels[3],
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform int width, uniform int height, uniform float image[]) {
|
||||
// Work on 4x4=16 pixel big tiles of the image. This function thus
|
||||
// implicitly assumes that both (x1-x0) and (y1-y0) are evenly divisible
|
||||
// by 4.
|
||||
for (uniform int y = y0; y < y1; y += 4) {
|
||||
for (uniform int x = x0; x < x1; x += 4) {
|
||||
// For each such tile, process programCount pixels at a time,
|
||||
// until we've done all 16 of them. Thus, we're also assuming
|
||||
// that programCount <= 16 and that 16 is evenly divisible by
|
||||
// programCount.
|
||||
for (uniform int o = 0; o < 16; o += programCount) {
|
||||
// These two arrays encode the mapping from [0,15] to
|
||||
// offsets within the 4x4 pixel block so that we render
|
||||
// each pixel inside the block
|
||||
const uniform int xoffsets[16] = { 0, 1, 0, 1, 2, 3, 2, 3,
|
||||
0, 1, 0, 1, 2, 3, 2, 3 };
|
||||
const uniform int yoffsets[16] = { 0, 0, 1, 1, 0, 0, 1, 1,
|
||||
2, 2, 3, 3, 2, 2, 3, 3 };
|
||||
|
||||
// Figure out the pixel to render for this program instance
|
||||
int xo = x + xoffsets[o + programIndex];
|
||||
int yo = y + yoffsets[o + programIndex];
|
||||
|
||||
// Use viewing parameters to compute the corresponding ray
|
||||
// for the pixel
|
||||
Ray ray;
|
||||
generateRay(raster2camera, camera2world, xo, yo, ray);
|
||||
|
||||
// And raymarch through the volume to compute the pixel's
|
||||
// value
|
||||
int offset = yo * width + xo;
|
||||
image[offset] = raymarch(density, nVoxels, ray);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
task void
|
||||
volume_task(uniform float density[], uniform int nVoxels[3],
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform int width, uniform int height, uniform float image[]) {
|
||||
uniform int dx = 8, dy = 8; // must match value in volume_ispc_tasks
|
||||
uniform int xbuckets = (width + (dx-1)) / dx;
|
||||
uniform int ybuckets = (height + (dy-1)) / dy;
|
||||
|
||||
uniform int x0 = (taskIndex % xbuckets) * dx;
|
||||
uniform int y0 = (taskIndex / ybuckets) * dy;
|
||||
uniform int x1 = x0 + dx, y1 = y0 + dy;
|
||||
x1 = min(x1, width);
|
||||
y1 = min(y1, height);
|
||||
|
||||
volume_tile(x0, y0, x1, y1, density, nVoxels, raster2camera,
|
||||
camera2world, width, height, image);
|
||||
}
|
||||
|
||||
|
||||
export void
|
||||
volume_ispc(uniform float density[], uniform int nVoxels[3],
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform int width, uniform int height, uniform float image[]) {
|
||||
volume_tile(0, 0, width, height, density, nVoxels, raster2camera,
|
||||
camera2world, width, height, image);
|
||||
}
|
||||
|
||||
|
||||
export void
|
||||
volume_ispc_tasks(uniform float density[], uniform int nVoxels[3],
|
||||
const uniform float raster2camera[4][4],
|
||||
const uniform float camera2world[4][4],
|
||||
uniform int width, uniform int height, uniform float image[]) {
|
||||
// Launch tasks to work on (dx,dy)-sized tiles of the image
|
||||
uniform int dx = 8, dy = 8;
|
||||
uniform int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy);
|
||||
launch[nTasks] < volume_task(density, nVoxels, raster2camera, camera2world,
|
||||
width, height, image) >;
|
||||
}
|
||||
168	examples/volume_rendering/volume.vcxproj	Normal file
@@ -0,0 +1,168 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|Win32">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
<ProjectGuid>{dee5733a-e93e-449d-9114-9bffcaeb4df9}</ProjectGuid>
|
||||
<Keyword>Win32Proj</Keyword>
|
||||
<RootNamespace>volume</RootNamespace>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="volume.cpp" />
|
||||
<ClCompile Include="volume_serial.cpp" />
|
||||
<ClCompile Include="../tasksys.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="volume.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
305	examples/volume_rendering/volume_serial.cpp	Normal file
@@ -0,0 +1,305 @@
|
||||
/*
|
||||
Copyright (c) 2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <algorithm>
|
||||
|
||||
// Just enough of a float3 class to do what we need in this file.
|
||||
#ifdef _MSC_VER
|
||||
__declspec(align(16))
|
||||
#endif
|
||||
struct float3 {
|
||||
float3() { }
|
||||
float3(float xx, float yy, float zz) { x = xx; y = yy; z = zz; }
|
||||
|
||||
float3 operator*(float f) const { return float3(x*f, y*f, z*f); }
|
||||
float3 operator-(const float3 &f2) const {
|
||||
return float3(x-f2.x, y-f2.y, z-f2.z);
|
||||
}
|
||||
float3 operator*(const float3 &f2) const {
|
||||
return float3(x*f2.x, y*f2.y, z*f2.z);
|
||||
}
|
||||
float3 operator+(const float3 &f2) const {
|
||||
return float3(x+f2.x, y+f2.y, z+f2.z);
|
||||
}
|
||||
float3 operator/(const float3 &f2) const {
|
||||
return float3(x/f2.x, y/f2.y, z/f2.z);
|
||||
}
|
||||
float operator[](int i) const { return (&x)[i]; }
|
||||
float &operator[](int i) { return (&x)[i]; }
|
||||
|
||||
float x, y, z;
|
||||
float pad; // match padding/alignment of ispc version
|
||||
}
|
||||
#ifndef _MSC_VER
|
||||
__attribute__ ((aligned(16)))
|
||||
#endif
|
||||
;
|
||||
|
||||
struct Ray {
|
||||
float3 origin, dir;
|
||||
};
|
||||
|
||||
|
||||
static void
|
||||
generateRay(const float raster2camera[4][4], const float camera2world[4][4],
|
||||
float x, float y, Ray &ray) {
|
||||
// transform raster coordinate (x, y, 0) to camera space
|
||||
float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3];
|
||||
float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3];
|
||||
float camz = raster2camera[2][3];
|
||||
float camw = raster2camera[3][3];
|
||||
camx /= camw;
|
||||
camy /= camw;
|
||||
camz /= camw;
|
||||
|
||||
ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz;
|
||||
ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz;
|
||||
ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz;
|
||||
|
||||
ray.origin.x = camera2world[0][3] / camera2world[3][3];
|
||||
ray.origin.y = camera2world[1][3] / camera2world[3][3];
|
||||
ray.origin.z = camera2world[2][3] / camera2world[3][3];
|
||||
}
|
||||
|
||||
|
||||
static bool
|
||||
Inside(float3 p, float3 pMin, float3 pMax) {
|
||||
return (p.x >= pMin.x && p.x <= pMax.x &&
|
||||
p.y >= pMin.y && p.y <= pMax.y &&
|
||||
p.z >= pMin.z && p.z <= pMax.z);
|
||||
}
|
||||
|
||||
|
||||
static bool
|
||||
IntersectP(const Ray &ray, float3 pMin, float3 pMax, float *hit0, float *hit1) {
|
||||
float t0 = -1e30f, t1 = 1e30f;
|
||||
|
||||
float3 tNear = (pMin - ray.origin) / ray.dir;
|
||||
float3 tFar = (pMax - ray.origin) / ray.dir;
|
||||
if (tNear.x > tFar.x) {
|
||||
float tmp = tNear.x;
|
||||
tNear.x = tFar.x;
|
||||
tFar.x = tmp;
|
||||
}
|
||||
t0 = std::max(tNear.x, t0);
|
||||
t1 = std::min(tFar.x, t1);
|
||||
|
||||
if (tNear.y > tFar.y) {
|
||||
float tmp = tNear.y;
|
||||
tNear.y = tFar.y;
|
||||
tFar.y = tmp;
|
||||
}
|
||||
t0 = std::max(tNear.y, t0);
|
||||
t1 = std::min(tFar.y, t1);
|
||||
|
||||
if (tNear.z > tFar.z) {
|
||||
float tmp = tNear.z;
|
||||
tNear.z = tFar.z;
|
||||
tFar.z = tmp;
|
||||
}
|
||||
t0 = std::max(tNear.z, t0);
|
    t1 = std::min(tFar.z, t1);

    if (t0 <= t1) {
        *hit0 = t0;
        *hit1 = t1;
        return true;
    }
    else
        return false;
}


static inline float Lerp(float t, float a, float b) {
    return (1.f - t) * a + t * b;
}


static inline int Clamp(int v, int low, int high) {
    return std::min(std::max(v, low), high);
}


static inline float D(int x, int y, int z, int nVoxels[3], float density[]) {
    x = Clamp(x, 0, nVoxels[0]-1);
    y = Clamp(y, 0, nVoxels[1]-1);
    z = Clamp(z, 0, nVoxels[2]-1);
    return density[z*nVoxels[0]*nVoxels[1] + y*nVoxels[0] + x];
}


static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
    return float3((p.x - pMin.x) / (pMax.x - pMin.x),
                  (p.y - pMin.y) / (pMax.y - pMin.y),
                  (p.z - pMin.z) / (pMax.z - pMin.z));
}


static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
                            float density[], int nVoxels[3]) {
    if (!Inside(Pobj, pMin, pMax))
        return 0;
    // Compute voxel coordinates and offsets for _Pobj_
    float3 vox = Offset(Pobj, pMin, pMax);
    vox.x = vox.x * nVoxels[0] - .5f;
    vox.y = vox.y * nVoxels[1] - .5f;
    vox.z = vox.z * nVoxels[2] - .5f;
    int vx = (int)(vox.x), vy = (int)(vox.y), vz = (int)(vox.z);
    float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz;

    // Trilinearly interpolate density values to compute local density
    float d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
                     D(vx+1, vy, vz, nVoxels, density));
    float d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
                     D(vx+1, vy+1, vz, nVoxels, density));
    float d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
                     D(vx+1, vy, vz+1, nVoxels, density));
    float d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
                     D(vx+1, vy+1, vz+1, nVoxels, density));
    float d0 = Lerp(dy, d00, d10);
    float d1 = Lerp(dy, d01, d11);
    return Lerp(dz, d0, d1);
}


static float
transmittance(float3 p0, float3 p1, float3 pMin,
              float3 pMax, float sigma_t, float density[], int nVoxels[3]) {
    float rayT0, rayT1;
    Ray ray;
    ray.origin = p1;
    ray.dir = p0 - p1;

    // Find the parametric t range along the ray that is inside the volume.
    if (!IntersectP(ray, pMin, pMax, &rayT0, &rayT1))
        return 1.;

    rayT0 = std::max(rayT0, 0.f);

    // Accumulate beam transmittance in tau
    float tau = 0;
    float rayLength = sqrtf(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
                            ray.dir.z * ray.dir.z);
    float stepDist = 0.2f;
    float stepT = stepDist / rayLength;

    float t = rayT0;
    float3 pos = ray.origin + ray.dir * rayT0;
    float3 dirStep = ray.dir * stepT;
    while (t < rayT1) {
        tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels);
        pos = pos + dirStep;
        t += stepT;
    }

    return expf(-tau);
}


static float
distanceSquared(float3 a, float3 b) {
    float3 d = a-b;
    return d.x*d.x + d.y*d.y + d.z*d.z;
}


static float
raymarch(float density[], int nVoxels[3], const Ray &ray) {
    float rayT0, rayT1;
    float3 pMin(.3f, -.2f, .3f), pMax(1.8f, 2.3f, 1.8f);
    float3 lightPos(-1.f, 4.f, 1.5f);

    if (!IntersectP(ray, pMin, pMax, &rayT0, &rayT1))
        return 0.;

    rayT0 = std::max(rayT0, 0.f);

    // Parameters that define the volume scattering characteristics and
    // sampling rate for raymarching
    float Le = .25f;            // Emission coefficient
    float sigma_a = 10;         // Absorption coefficient
    float sigma_s = 10;         // Scattering coefficient
    float stepDist = 0.025f;    // Ray step amount
    float lightIntensity = 40;  // Light source intensity

    float tau = 0.f;            // accumulated beam transmittance
    float L = 0;                // radiance along the ray
    float rayLength = sqrtf(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
                            ray.dir.z * ray.dir.z);
    float stepT = stepDist / rayLength;

    float t = rayT0;
    float3 pos = ray.origin + ray.dir * rayT0;
    float3 dirStep = ray.dir * stepT;
    while (t < rayT1) {
        float d = Density(pos, pMin, pMax, density, nVoxels);

        // terminate once attenuation is high
        float atten = expf(-tau);
        if (atten < .005f)
            break;

        // direct lighting
        float Li = lightIntensity / distanceSquared(lightPos, pos) *
            transmittance(lightPos, pos, pMin, pMax, sigma_a + sigma_s,
                          density, nVoxels);
        L += stepDist * atten * d * sigma_s * (Li + Le);

        // update beam transmittance
        tau += stepDist * (sigma_a + sigma_s) * d;

        pos = pos + dirStep;
        t += stepT;
    }

    // Gamma correction
    return powf(L, 1.f / 2.2f);
}


void
volume_serial(float density[], int nVoxels[3], const float raster2camera[4][4],
              const float camera2world[4][4],
              int width, int height, float image[]) {
    int offset = 0;
    for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x, ++offset) {
            Ray ray;
            generateRay(raster2camera, camera2world, x, y, ray);
            image[offset] = raymarch(density, nVoxels, ray);
        }
    }
}
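For orientation, a minimal driver for volume_serial() above might look like the sketch below; the grid size, the constant density value, and the identity camera matrices are placeholders for illustration, not the data the shipped example actually reads from its input files.

#include <cstdio>
#include <vector>

void volume_serial(float density[], int nVoxels[3],
                   const float raster2camera[4][4],
                   const float camera2world[4][4],
                   int width, int height, float image[]);

int main() {
    const int width = 64, height = 64;
    int nVoxels[3] = { 32, 32, 32 };            // placeholder grid size
    std::vector<float> density(nVoxels[0] * nVoxels[1] * nVoxels[2], 0.5f);
    std::vector<float> image(width * height, 0.f);

    // Identity matrices stand in for the real raster-to-camera and
    // camera-to-world transforms.
    float raster2camera[4][4] = {}, camera2world[4][4] = {};
    for (int i = 0; i < 4; ++i)
        raster2camera[i][i] = camera2world[i][i] = 1.f;

    volume_serial(density.data(), nVoxels, raster2camera, camera2world,
                  width, height, image.data());
    printf("center pixel = %f\n", image[(height / 2) * width + width / 2]);
    return 0;
}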
377
expr.cpp
@@ -741,6 +741,12 @@ UnaryExpr::TypeCheck() {
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
UnaryExpr::EstimateCost() const {
|
||||
return (expr ? expr->EstimateCost() : 0) + COST_SIMPLE_ARITH_LOGIC_OP;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
UnaryExpr::Print() const {
|
||||
if (!expr || !GetType())
|
||||
@@ -799,11 +805,17 @@ lOpString(BinaryExpr::Op op) {
|
||||
*/
|
||||
static llvm::Value *
|
||||
lEmitBinaryBitOp(BinaryExpr::Op op, llvm::Value *arg0Val,
|
||||
llvm::Value *arg1Val, FunctionEmitContext *ctx) {
|
||||
llvm::Value *arg1Val, bool isUnsigned,
|
||||
FunctionEmitContext *ctx) {
|
||||
llvm::Instruction::BinaryOps inst;
|
||||
switch (op) {
|
||||
case BinaryExpr::Shl: inst = llvm::Instruction::Shl; break;
|
||||
case BinaryExpr::Shr: inst = llvm::Instruction::AShr; break;
|
||||
case BinaryExpr::Shr:
|
||||
if (isUnsigned)
|
||||
inst = llvm::Instruction::LShr;
|
||||
else
|
||||
inst = llvm::Instruction::AShr;
|
||||
break;
|
||||
case BinaryExpr::BitAnd: inst = llvm::Instruction::And; break;
|
||||
case BinaryExpr::BitXor: inst = llvm::Instruction::Xor; break;
|
||||
case BinaryExpr::BitOr: inst = llvm::Instruction::Or; break;
|
||||
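The new isUnsigned parameter matters because right shift has two different machine semantics, and the compiler now preserves the C-style distinction. A tiny standalone illustration:

#include <cstdint>
#include <cstdio>

int main() {
    int32_t  s = -8;             // bit pattern 0xFFFFFFF8
    uint32_t u = 0xFFFFFFF8u;

    // Arithmetic shift (what AShr produces): the sign bit is replicated,
    // so -8 >> 1 is typically -4 (implementation-defined before C++20).
    printf("signed:   %d\n", s >> 1);
    // Logical shift (what LShr produces): zeros shift in from the top.
    printf("unsigned: 0x%X\n", u >> 1);   // 0x7FFFFFFC
    return 0;
}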
@@ -949,7 +961,8 @@ BinaryExpr::GetValue(FunctionEmitContext *ctx) const {
|
||||
dynamic_cast<ConstExpr *>(arg1) == NULL)
|
||||
PerformanceWarning(pos, "Shift right is extremely inefficient for "
|
||||
"varying shift amounts.");
|
||||
return lEmitBinaryBitOp(op, e0Val, e1Val, ctx);
|
||||
return lEmitBinaryBitOp(op, e0Val, e1Val,
|
||||
arg0->GetType()->IsUnsignedType(), ctx);
|
||||
}
|
||||
case LogicalAnd:
|
||||
return ctx->BinaryOperator(llvm::Instruction::And, e0Val, e1Val,
|
||||
@@ -1176,10 +1189,10 @@ BinaryExpr::Optimize() {
|
||||
m->symbolTable->LookupFunction("rcp");
|
||||
if (rcpFuns != NULL) {
|
||||
assert(rcpFuns->size() == 2);
|
||||
Expr *rcpSymExpr = new FunctionSymbolExpr(rcpFuns, pos);
|
||||
Expr *rcpSymExpr = new FunctionSymbolExpr("rcp", rcpFuns, pos);
|
||||
ExprList *args = new ExprList(arg1, arg1->pos);
|
||||
Expr *rcpCall = new FunctionCallExpr(rcpSymExpr, args,
|
||||
arg1->pos, false);
|
||||
arg1->pos);
|
||||
rcpCall = rcpCall->TypeCheck();
|
||||
if (rcpCall == NULL)
|
||||
return NULL;
|
||||
@@ -1292,6 +1305,17 @@ BinaryExpr::TypeCheck() {
|
||||
if (type0 == NULL || type1 == NULL)
|
||||
return NULL;
|
||||
|
||||
if (dynamic_cast<const ReferenceType *>(type0) != NULL) {
|
||||
arg0 = new DereferenceExpr(arg0, arg0->pos);
|
||||
type0 = arg0->GetType();
|
||||
assert(type0 != NULL);
|
||||
}
|
||||
if (dynamic_cast<const ReferenceType *>(type1) != NULL) {
|
||||
arg1 = new DereferenceExpr(arg1, arg1->pos);
|
||||
type1 = arg1->GetType();
|
||||
assert(type1 != NULL);
|
||||
}
|
||||
|
||||
switch (op) {
|
||||
case Shl:
|
||||
case Shr:
|
||||
@@ -1438,6 +1462,15 @@ BinaryExpr::TypeCheck() {
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
BinaryExpr::EstimateCost() const {
|
||||
return ((arg0 ? arg0->EstimateCost() : 0) +
|
||||
(arg1 ? arg1->EstimateCost() : 0) +
|
||||
((op == Div || op == Mod) ? COST_COMPLEX_ARITH_OP :
|
||||
COST_SIMPLE_ARITH_LOGIC_OP));
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
BinaryExpr::Print() const {
|
||||
if (!arg0 || !arg1 || !GetType())
|
||||
@@ -1533,7 +1566,8 @@ lEmitOpAssign(AssignExpr::Op op, Expr *arg0, Expr *arg1, const Type *type,
|
||||
case AssignExpr::AndAssign:
|
||||
case AssignExpr::XorAssign:
|
||||
case AssignExpr::OrAssign:
|
||||
newValue = lEmitBinaryBitOp(basicop, oldLHS, rvalue, ctx);
|
||||
newValue = lEmitBinaryBitOp(basicop, oldLHS, rvalue,
|
||||
arg0->GetType()->IsUnsignedType(), ctx);
|
||||
break;
|
||||
default:
|
||||
FATAL("logic error in lEmitOpAssign");
|
||||
@@ -1688,6 +1722,20 @@ AssignExpr::TypeCheck() {
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
AssignExpr::EstimateCost() const {
|
||||
int cost = ((lvalue ? lvalue->EstimateCost() : 0) +
|
||||
(rvalue ? rvalue->EstimateCost() : 0));
|
||||
cost += COST_ASSIGN;
|
||||
if (op == Assign)
|
||||
return cost;
|
||||
if (op == DivAssign || op == ModAssign)
|
||||
return cost + COST_COMPLEX_ARITH_OP;
|
||||
else
|
||||
return cost + COST_SIMPLE_ARITH_LOGIC_OP;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
AssignExpr::Print() const {
|
||||
if (!lvalue || !rvalue || !GetType())
|
||||
@@ -1936,6 +1984,12 @@ SelectExpr::TypeCheck() {
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
SelectExpr::EstimateCost() const {
|
||||
return COST_SELECT;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
SelectExpr::Print() const {
|
||||
if (!test || !expr1 || !expr2 || !GetType())
|
||||
@@ -2100,7 +2154,8 @@ FunctionCallExpr::tryResolve(bool (*matchFunc)(Expr *, const Type *)) {
|
||||
// It's kind of silly to redundantly discover this for each
|
||||
// potential match versus detecting this earlier in the
|
||||
// matching process and just giving up.
|
||||
if (!callArgs[i] || !callArgs[i]->GetType() || !candArgTypes[i])
|
||||
if (!callArgs[i] || !callArgs[i]->GetType() || !candArgTypes[i] ||
|
||||
dynamic_cast<const FunctionType *>(callArgs[i]->GetType()) != NULL)
|
||||
return false;
|
||||
|
||||
// See if this caller argument matches the type of the
|
||||
@@ -2158,7 +2213,7 @@ FunctionCallExpr::tryResolve(bool (*matchFunc)(Expr *, const Type *)) {
|
||||
|
||||
|
||||
void
|
||||
FunctionCallExpr::resolveFunctionOverloads() {
|
||||
FunctionCallExpr::resolveFunctionOverloads(bool exactMatchOnly) {
|
||||
FunctionSymbolExpr *fse = dynamic_cast<FunctionSymbolExpr *>(func);
|
||||
if (!fse)
|
||||
// error will be issued later if not calling an actual function
|
||||
@@ -2172,93 +2227,55 @@ FunctionCallExpr::resolveFunctionOverloads() {
|
||||
if (tryResolve(lExactMatch))
|
||||
return;
|
||||
|
||||
// Try to find a single match ignoring references
|
||||
if (tryResolve(lMatchIgnoringReferences))
|
||||
return;
|
||||
if (!exactMatchOnly) {
|
||||
// Try to find a single match ignoring references
|
||||
if (tryResolve(lMatchIgnoringReferences))
|
||||
return;
|
||||
|
||||
// TODO: next, try to find an exact match via type promotion--i.e. char
|
||||
// -> int, etc--things that don't lose data
|
||||
// TODO: next, try to find an exact match via type promotion--i.e. char
|
||||
// -> int, etc--things that don't lose data
|
||||
|
||||
// Next try to see if there's a match via just uniform -> varying
|
||||
// promotions. TODO: look for one with a minimal number of them?
|
||||
if (tryResolve(lMatchIgnoringUniform))
|
||||
return;
|
||||
// Next try to see if there's a match via just uniform -> varying
|
||||
// promotions. TODO: look for one with a minimal number of them?
|
||||
if (tryResolve(lMatchIgnoringUniform))
|
||||
return;
|
||||
|
||||
// Try to find a match via type conversion, but don't change
|
||||
// unif->varying
|
||||
if (tryResolve(lMatchWithTypeConvSameVariability))
|
||||
return;
|
||||
// Try to find a match via type conversion, but don't change
|
||||
// unif->varying
|
||||
if (tryResolve(lMatchWithTypeConvSameVariability))
|
||||
return;
|
||||
|
||||
// Last chance: try to find a match via arbitrary type conversion.
|
||||
if (tryResolve(lMatchWithTypeConv))
|
||||
return;
|
||||
// Last chance: try to find a match via arbitrary type conversion.
|
||||
if (tryResolve(lMatchWithTypeConv))
|
||||
return;
|
||||
}
|
||||
|
||||
// failure :-(
|
||||
const char *funName = fse->candidateFunctions->front()->name.c_str();
|
||||
Error(pos, "Unable to find matching overload for call to function \"%s\".",
|
||||
funName);
|
||||
Error(pos, "Unable to find matching overload for call to function \"%s\"%s.",
|
||||
funName, exactMatchOnly ? " only considering exact matches" : "");
|
||||
fprintf(stderr, "Candidates are:\n");
|
||||
lPrintFunctionOverloads(*fse->candidateFunctions);
|
||||
lPrintPassedTypes(funName, args->exprs);
|
||||
}
|
||||
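The resolution order above is a fixed cascade: each rule is strictly more permissive than the one before it, and the first rule that yields a unique match wins. A small sketch of that pattern follows; the Candidate type and rule list are hypothetical, not the compiler's actual data structures.

#include <functional>
#include <vector>

struct Candidate { int id; };
using MatchRule = std::function<bool(const Candidate &)>;

// Try each matching rule in order, from strictest to most permissive;
// return the candidate selected by the first rule with a unique match.
const Candidate *resolveOverload(const std::vector<Candidate> &candidates,
                                 const std::vector<MatchRule> &rulesInOrder) {
    for (const MatchRule &rule : rulesInOrder) {
        const Candidate *found = nullptr;
        bool ambiguous = false;
        for (const Candidate &c : candidates) {
            if (rule(c)) {
                if (found != nullptr) { ambiguous = true; break; }
                found = &c;
            }
        }
        if (found != nullptr && !ambiguous)
            return found;
    }
    return nullptr;   // caller reports "unable to find matching overload"
}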
|
||||
|
||||
FunctionCallExpr::FunctionCallExpr(Expr *f, ExprList *a, SourcePos p, bool il)
|
||||
: Expr(p) {
|
||||
FunctionCallExpr::FunctionCallExpr(Expr *f, ExprList *a, SourcePos p,
|
||||
bool il, Expr *lce)
|
||||
: Expr(p), isLaunch(il) {
|
||||
func = f;
|
||||
args = a;
|
||||
isLaunch = il;
|
||||
launchCountExpr = lce;
|
||||
|
||||
resolveFunctionOverloads();
|
||||
}
|
||||
|
||||
|
||||
/** Starting from the function initialFunction, we're calling into
|
||||
calledFunc. The question is: is this a recursive call back to
|
||||
initialFunc? If it definitely is or if it may be, then return true.
|
||||
Return false if it definitely is not.
|
||||
*/
|
||||
static bool
|
||||
lMayBeRecursiveCall(llvm::Function *calledFunc,
|
||||
llvm::Function *initialFunc,
|
||||
std::set<llvm::Function *> &seenFuncs) {
|
||||
// Easy case: intrinsics aren't going to call functions themselves
|
||||
if (calledFunc->isIntrinsic())
|
||||
return false;
|
||||
|
||||
std::string name = calledFunc->getName();
|
||||
if (name.size() > 2 && name[0] == '_' && name[1] == '_')
|
||||
// builtin stdlib function; none of these are recursive...
|
||||
return false;
|
||||
|
||||
if (calledFunc->isDeclaration())
|
||||
// There's no visibility into what the called function does without a
|
||||
// definition, so we have to be conservative
|
||||
return true;
|
||||
|
||||
if (calledFunc == initialFunc)
|
||||
// hello recursive call
|
||||
return true;
|
||||
|
||||
// Otherwise iterate over all of the instructions in the function. If
|
||||
// any of them is a function call then check recursively..
|
||||
llvm::inst_iterator iter;
|
||||
for (iter = llvm::inst_begin(calledFunc);
|
||||
iter != llvm::inst_end(calledFunc); ++iter) {
|
||||
llvm::Instruction *inst = &*iter;
|
||||
llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst);
|
||||
if (ci != NULL) {
|
||||
llvm::Function *nextCalledFunc = ci->getCalledFunction();
|
||||
// Don't repeatedly test functions we've seen before
|
||||
if (seenFuncs.find(nextCalledFunc) == seenFuncs.end()) {
|
||||
seenFuncs.insert(nextCalledFunc);
|
||||
if (lMayBeRecursiveCall(nextCalledFunc, initialFunc,
|
||||
seenFuncs))
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
FunctionSymbolExpr *fse = dynamic_cast<FunctionSymbolExpr *>(func);
|
||||
// Functions with names that start with "__" should only be various
|
||||
// builtins. For those, we'll demand an exact match, since we'll
|
||||
// expect whichever function in stdlib.ispc is calling out to one of
|
||||
// those to be matching the argument types exactly; this is to be a bit
|
||||
// extra safe to be sure that the expected builtin is in fact being
|
||||
// called.
|
||||
bool exactMatchOnly = (fse != NULL) && (fse->name.substr(0,2) == "__");
|
||||
resolveFunctionOverloads(exactMatchOnly);
|
||||
}
|
||||
|
||||
|
||||
@@ -2382,47 +2399,18 @@ FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const {
|
||||
}
|
||||
}
|
||||
|
||||
// We sometimes need to check to see if the mask is all off here;
|
||||
// specifically, if the mask is all off and we call a recursive
|
||||
// function, then we will probably have an undesirable infinite loop.
|
||||
ctx->SetDebugPos(pos);
|
||||
llvm::BasicBlock *bDoCall = ctx->CreateBasicBlock("funcall_mask_ok");
|
||||
llvm::BasicBlock *bSkip = ctx->CreateBasicBlock("funcall_mask_off");
|
||||
llvm::BasicBlock *bAfter = ctx->CreateBasicBlock("after_funcall");
|
||||
llvm::Function *currentFunc = ctx->GetCurrentBasicBlock()->getParent();
|
||||
|
||||
// If we need to check the mask (it may be a recursive call, possibly
|
||||
// transitively), or we're launching a task, which is expensive and
|
||||
// thus probably always worth checking, then use the mask to choose
|
||||
// whether to go to the bDoCallBlock or the bSkip block
|
||||
std::set<llvm::Function *> seenFuncs;
|
||||
seenFuncs.insert(currentFunc);
|
||||
if (ft->isTask || lMayBeRecursiveCall(callee, currentFunc, seenFuncs)) {
|
||||
Debug(pos, "Checking mask before function call \"%s\".", funSym->name.c_str());
|
||||
ctx->BranchIfMaskAny(bDoCall, bSkip);
|
||||
}
|
||||
else
|
||||
// If we don't need to check the mask, then always do the call;
|
||||
// just jump to bDoCall
|
||||
ctx->BranchInst(bDoCall);
|
||||
|
||||
// And the bSkip block just jumps immediately to bAfter. So why do we
|
||||
// need it? So the phi node below can easily tell what paths are
|
||||
// going into it
|
||||
ctx->SetCurrentBasicBlock(bSkip);
|
||||
ctx->BranchInst(bAfter);
|
||||
|
||||
// Emit the code to do the function call
|
||||
ctx->SetCurrentBasicBlock(bDoCall);
|
||||
|
||||
llvm::Value *retVal = NULL;
|
||||
ctx->SetDebugPos(pos);
|
||||
if (ft->isTask)
|
||||
ctx->LaunchInst(callee, argVals);
|
||||
if (ft->isTask) {
|
||||
assert(launchCountExpr != NULL);
|
||||
llvm::Value *launchCount = launchCountExpr->GetValue(ctx);
|
||||
if (launchCount != NULL)
|
||||
ctx->LaunchInst(callee, argVals, launchCount);
|
||||
}
|
||||
else {
|
||||
// Most of the time, the mask is passed as the last argument. This
|
||||
// isn't the case for things like SSE intrinsics and extern "C"
|
||||
// functions from the application.
|
||||
// isn't the case for things like intrinsics, builtins, and extern
|
||||
// "C" functions from the application.
|
||||
assert(callargs.size() + 1 == callee->arg_size() ||
|
||||
callargs.size() == callee->arg_size());
|
||||
|
||||
@@ -2449,22 +2437,10 @@ FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const {
|
||||
}
|
||||
}
|
||||
|
||||
// And jump out to the 'after function call' basic block
|
||||
ctx->BranchInst(bAfter);
|
||||
ctx->SetCurrentBasicBlock(bAfter);
|
||||
|
||||
if (isVoidFunc)
|
||||
return NULL;
|
||||
|
||||
// The return value for the non-void case is either undefined or the
|
||||
// function return value, depending on whether we actually ran the code
|
||||
// path that called the function or not.
|
||||
LLVM_TYPE_CONST llvm::Type *lrType = ft->GetReturnType()->LLVMType(g->ctx);
|
||||
llvm::PHINode *ret = ctx->PhiNode(lrType, 2, "fun_ret");
|
||||
assert(retVal != NULL);
|
||||
ret->addIncoming(llvm::UndefValue::get(lrType), bSkip);
|
||||
ret->addIncoming(retVal, bDoCall);
|
||||
return ret;
|
||||
else
|
||||
return retVal;
|
||||
}
|
||||
|
||||
|
||||
@@ -2506,10 +2482,21 @@ FunctionCallExpr::TypeCheck() {
|
||||
if (!isLaunch)
|
||||
Error(pos, "\"launch\" expression needed to call function "
|
||||
"with \"task\" qualifier.");
|
||||
if (!launchCountExpr)
|
||||
return NULL;
|
||||
|
||||
launchCountExpr =
|
||||
launchCountExpr->TypeConv(AtomicType::UniformInt32,
|
||||
"task launch count");
|
||||
if (!launchCountExpr)
|
||||
return NULL;
|
||||
}
|
||||
else {
|
||||
if (isLaunch)
|
||||
Error(pos, "\"launch\" expression illegal with non-\"task\"-"
|
||||
"qualified function.");
|
||||
assert(launchCountExpr == NULL);
|
||||
}
|
||||
else if (isLaunch)
|
||||
Error(pos, "\"launch\" expression illegal with non-\"task\"-"
|
||||
"qualified function.");
|
||||
}
|
||||
else
|
||||
Error(pos, "Valid function name must be used for function call.");
|
||||
@@ -2525,6 +2512,13 @@ FunctionCallExpr::TypeCheck() {
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
FunctionCallExpr::EstimateCost() const {
|
||||
return ((args ? args->EstimateCost() : 0) +
|
||||
(isLaunch ? COST_TASK_LAUNCH : COST_FUNCALL));
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
FunctionCallExpr::Print() const {
|
||||
if (!func || !args || !GetType())
|
||||
@@ -2613,7 +2607,7 @@ ExprList::GetConstant(const Type *type) const {
|
||||
}
|
||||
|
||||
if (dynamic_cast<const StructType *>(type) != NULL) {
|
||||
#if defined(LLVM_2_8) || defined(LLVM_2_9)
|
||||
#if defined(LLVM_2_9)
|
||||
return llvm::ConstantStruct::get(*g->ctx, cv, false);
|
||||
#else
|
||||
LLVM_TYPE_CONST llvm::StructType *llvmStructType =
|
||||
@@ -2636,6 +2630,17 @@ ExprList::GetConstant(const Type *type) const {
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
ExprList::EstimateCost() const {
|
||||
int cost = 0;
|
||||
for (unsigned int i = 0; i < exprs.size(); ++i) {
|
||||
if (exprs[i] != NULL)
|
||||
cost += exprs[i]->EstimateCost();
|
||||
}
|
||||
return cost;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
ExprList::Print() const {
|
||||
printf("expr list (");
|
||||
@@ -2766,6 +2771,22 @@ IndexExpr::GetLValue(FunctionEmitContext *ctx) const {
|
||||
if (!basePtr)
|
||||
return NULL;
|
||||
|
||||
// If the array index is a compile time constant, check to see if it
|
||||
// may lead to an out-of-bounds access.
|
||||
ConstExpr *ce = dynamic_cast<ConstExpr *>(index);
|
||||
const SequentialType *seqType = dynamic_cast<const SequentialType *>(type);
|
||||
assert(seqType != NULL);
|
||||
int nElements = seqType->GetElementCount();
|
||||
if (ce != NULL && nElements > 0) {
|
||||
int32_t indices[ISPC_MAX_NVEC];
|
||||
int count = ce->AsInt32(indices);
|
||||
for (int i = 0; i < count; ++i) {
|
||||
if (indices[i] < 0 || indices[i] >= nElements)
|
||||
Warning(index->pos, "Array index \"%d\" may be out of bounds for "
|
||||
"\"%d\" element array.", indices[i], nElements);
|
||||
}
|
||||
}
|
||||
|
||||
basePtr = lCastUniformVectorBasePtr(basePtr, ctx);
|
||||
|
||||
ctx->SetDebugPos(pos);
|
||||
@@ -2818,6 +2839,16 @@ IndexExpr::TypeCheck() {
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
IndexExpr::EstimateCost() const {
|
||||
// be pessimistic
|
||||
if (index && index->GetType()->IsVaryingType())
|
||||
return COST_GATHER;
|
||||
else
|
||||
return COST_LOAD;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
IndexExpr::Print() const {
|
||||
if (!arrayOrVector || !index || !GetType())
|
||||
@@ -3117,6 +3148,7 @@ MemberExpr::create(Expr *e, const char *id, SourcePos p, SourcePos idpos) {
|
||||
return new MemberExpr(e, id, p, idpos);
|
||||
}
|
||||
|
||||
|
||||
MemberExpr::MemberExpr(Expr *e, const char *id, SourcePos p, SourcePos idpos)
|
||||
: Expr(p), identifierPos(idpos) {
|
||||
expr = e;
|
||||
@@ -3213,6 +3245,14 @@ MemberExpr::Optimize() {
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
MemberExpr::EstimateCost() const {
|
||||
// FIXME: return gather cost when we can tell a gather is going to be
|
||||
// needed
|
||||
return COST_SIMPLE_ARITH_LOGIC_OP;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
MemberExpr::Print() const {
|
||||
if (!expr || !GetType())
|
||||
@@ -3280,7 +3320,7 @@ ConstExpr::ConstExpr(const Type *t, uint8_t u, SourcePos p)
|
||||
: Expr(p) {
|
||||
type = t;
|
||||
type = type->GetAsConstType();
|
||||
assert(type == AtomicType::UniformUInt8);
|
||||
assert(type == AtomicType::UniformConstUInt8);
|
||||
uint8Val[0] = u;
|
||||
}
|
||||
|
||||
@@ -3320,7 +3360,7 @@ ConstExpr::ConstExpr(const Type *t, uint16_t u, SourcePos p)
|
||||
: Expr(p) {
|
||||
type = t;
|
||||
type = type->GetAsConstType();
|
||||
assert(type == AtomicType::UniformUInt16);
|
||||
assert(type == AtomicType::UniformConstUInt16);
|
||||
uint16Val[0] = u;
|
||||
}
|
||||
|
||||
@@ -3423,7 +3463,7 @@ ConstExpr::ConstExpr(const Type *t, uint64_t u, SourcePos p)
|
||||
: Expr(p) {
|
||||
type = t;
|
||||
type = type->GetAsConstType();
|
||||
assert(type == AtomicType::UniformUInt64);
|
||||
assert(type == AtomicType::UniformConstUInt64);
|
||||
uint64Val[0] = u;
|
||||
}
|
||||
|
||||
@@ -4008,6 +4048,12 @@ ConstExpr::TypeCheck() {
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
ConstExpr::EstimateCost() const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
ConstExpr::Print() const {
|
||||
printf("[%s] (", GetType()->GetString().c_str());
|
||||
@@ -4094,7 +4140,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
|
||||
case AtomicType::TYPE_BOOL:
|
||||
if (fromType->IsVaryingType() &&
|
||||
LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
|
||||
// If we have a bool vector of i32 element,s first truncate
|
||||
// If we have a bool vector of i32 elements, first truncate
|
||||
// down to a single bit
|
||||
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
|
||||
// And then do an unsigned int->float cast
|
||||
@@ -4154,9 +4200,6 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
|
||||
case AtomicType::TYPE_UINT16:
|
||||
case AtomicType::TYPE_UINT32:
|
||||
case AtomicType::TYPE_UINT64:
|
||||
if (fromType->IsVaryingType())
|
||||
PerformanceWarning(pos, "Conversion from unsigned int64 to float is slow. "
|
||||
"Use \"int64\" if possible");
|
||||
cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int
|
||||
exprVal, targetType, "uint2double");
|
||||
break;
|
||||
@@ -4928,6 +4971,13 @@ TypeCastExpr::Optimize() {
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
TypeCastExpr::EstimateCost() const {
|
||||
// FIXME: return COST_TYPECAST_COMPLEX when appropriate
|
||||
return COST_TYPECAST_SIMPLE;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
TypeCastExpr::Print() const {
|
||||
printf("[%s] type cast (", GetType()->GetString().c_str());
|
||||
@@ -4993,6 +5043,12 @@ ReferenceExpr::TypeCheck() {
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
ReferenceExpr::EstimateCost() const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
ReferenceExpr::Print() const {
|
||||
if (expr == NULL || GetType() == NULL)
|
||||
@@ -5071,6 +5127,12 @@ DereferenceExpr::Optimize() {
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
DereferenceExpr::EstimateCost() const {
|
||||
return COST_DEREF;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
DereferenceExpr::Print() const {
|
||||
if (expr == NULL || GetType() == NULL)
|
||||
@@ -5142,6 +5204,15 @@ SymbolExpr::Optimize() {
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
SymbolExpr::EstimateCost() const {
|
||||
if (symbol->constValue != NULL)
|
||||
return 0;
|
||||
else
|
||||
return COST_LOAD;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
SymbolExpr::Print() const {
|
||||
if (symbol == NULL || GetType() == NULL)
|
||||
@@ -5156,9 +5227,11 @@ SymbolExpr::Print() const {
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// FunctionSymbolExpr
|
||||
|
||||
FunctionSymbolExpr::FunctionSymbolExpr(std::vector<Symbol *> *candidates,
|
||||
FunctionSymbolExpr::FunctionSymbolExpr(const char *n,
|
||||
std::vector<Symbol *> *candidates,
|
||||
SourcePos p)
|
||||
: Expr(p) {
|
||||
name = n;
|
||||
matchingFunc = NULL;
|
||||
candidateFunctions = candidates;
|
||||
}
|
||||
@@ -5195,6 +5268,12 @@ FunctionSymbolExpr::Optimize() {
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
FunctionSymbolExpr::EstimateCost() const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
FunctionSymbolExpr::Print() const {
|
||||
if (!matchingFunc || !GetType())
|
||||
@@ -5218,14 +5297,14 @@ SyncExpr::GetType() const {
|
||||
llvm::Value *
|
||||
SyncExpr::GetValue(FunctionEmitContext *ctx) const {
|
||||
ctx->SetDebugPos(pos);
|
||||
std::vector<llvm::Value *> noArg;
|
||||
llvm::Function *fsync = m->module->getFunction("ISPCSync");
|
||||
if (fsync == NULL) {
|
||||
FATAL("Couldn't find ISPCSync declaration?!");
|
||||
return NULL;
|
||||
}
|
||||
ctx->SyncInst();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return ctx->CallInst(fsync, noArg, "");
|
||||
|
||||
int
|
||||
SyncExpr::EstimateCost() const {
|
||||
return COST_SYNC;
|
||||
}
|
||||
|
||||
|
||||
|
||||
52
expr.h
@@ -121,8 +121,8 @@ public:
|
||||
void Print() const;
|
||||
Expr *Optimize();
|
||||
Expr *TypeCheck();
|
||||
int EstimateCost() const;
|
||||
|
||||
private:
|
||||
const Op op;
|
||||
Expr *expr;
|
||||
};
|
||||
@@ -164,8 +164,8 @@ public:
|
||||
|
||||
Expr *Optimize();
|
||||
Expr *TypeCheck();
|
||||
int EstimateCost() const;
|
||||
|
||||
private:
|
||||
const Op op;
|
||||
Expr *arg0, *arg1;
|
||||
};
|
||||
@@ -196,8 +196,8 @@ public:
|
||||
|
||||
Expr *Optimize();
|
||||
Expr *TypeCheck();
|
||||
int EstimateCost() const;
|
||||
|
||||
private:
|
||||
const Op op;
|
||||
Expr *lvalue, *rvalue;
|
||||
};
|
||||
@@ -217,8 +217,8 @@ public:
|
||||
|
||||
Expr *Optimize();
|
||||
Expr *TypeCheck();
|
||||
int EstimateCost() const;
|
||||
|
||||
private:
|
||||
Expr *test, *expr1, *expr2;
|
||||
};
|
||||
|
||||
@@ -240,6 +240,7 @@ public:
|
||||
llvm::Constant *GetConstant(const Type *type) const;
|
||||
ExprList *Optimize();
|
||||
ExprList *TypeCheck();
|
||||
int EstimateCost() const;
|
||||
|
||||
std::vector<Expr *> exprs;
|
||||
};
|
||||
@@ -249,7 +250,8 @@ public:
|
||||
*/
|
||||
class FunctionCallExpr : public Expr {
|
||||
public:
|
||||
FunctionCallExpr(Expr *func, ExprList *args, SourcePos p, bool isLaunch);
|
||||
FunctionCallExpr(Expr *func, ExprList *args, SourcePos p,
|
||||
bool isLaunch = false, Expr *launchCountExpr = NULL);
|
||||
|
||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||
const Type *GetType() const;
|
||||
@@ -257,13 +259,15 @@ public:
|
||||
|
||||
Expr *Optimize();
|
||||
Expr *TypeCheck();
|
||||
int EstimateCost() const;
|
||||
|
||||
private:
|
||||
Expr *func;
|
||||
ExprList *args;
|
||||
bool isLaunch;
|
||||
Expr *launchCountExpr;
|
||||
|
||||
void resolveFunctionOverloads();
|
||||
private:
|
||||
void resolveFunctionOverloads(bool exactMatchOnly);
|
||||
bool tryResolve(bool (*matchFunc)(Expr *, const Type *));
|
||||
};
|
||||
|
||||
@@ -285,8 +289,8 @@ public:
|
||||
|
||||
Expr *Optimize();
|
||||
Expr *TypeCheck();
|
||||
int EstimateCost() const;
|
||||
|
||||
private:
|
||||
Expr *arrayOrVector, *index;
|
||||
};
|
||||
|
||||
@@ -303,16 +307,17 @@ public:
|
||||
MemberExpr(Expr *expr, const char *identifier, SourcePos pos,
|
||||
SourcePos identifierPos);
|
||||
|
||||
virtual llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||
virtual llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
|
||||
virtual const Type *GetType() const;
|
||||
virtual Symbol *GetBaseSymbol() const;
|
||||
virtual void Print() const;
|
||||
virtual Expr *Optimize();
|
||||
virtual Expr *TypeCheck();
|
||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||
llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
|
||||
const Type *GetType() const;
|
||||
Symbol *GetBaseSymbol() const;
|
||||
void Print() const;
|
||||
Expr *Optimize();
|
||||
Expr *TypeCheck();
|
||||
int EstimateCost() const;
|
||||
|
||||
virtual int getElementNumber() const;
|
||||
|
||||
protected:
|
||||
std::string getCandidateNearMatches() const;
|
||||
|
||||
Expr *expr;
|
||||
@@ -392,6 +397,7 @@ public:
|
||||
|
||||
Expr *TypeCheck();
|
||||
Expr *Optimize();
|
||||
int EstimateCost() const;
|
||||
|
||||
/** Return the ConstExpr's values as booleans, doing type conversion
|
||||
from the actual type if needed. If forceVarying is true, then type
|
||||
@@ -495,8 +501,8 @@ public:
|
||||
void Print() const;
|
||||
Expr *TypeCheck();
|
||||
Expr *Optimize();
|
||||
int EstimateCost() const;
|
||||
|
||||
private:
|
||||
const Type *type;
|
||||
Expr *expr;
|
||||
};
|
||||
@@ -514,8 +520,8 @@ public:
|
||||
void Print() const;
|
||||
Expr *TypeCheck();
|
||||
Expr *Optimize();
|
||||
int EstimateCost() const;
|
||||
|
||||
private:
|
||||
Expr *expr;
|
||||
};
|
||||
|
||||
@@ -533,8 +539,8 @@ public:
|
||||
void Print() const;
|
||||
Expr *TypeCheck();
|
||||
Expr *Optimize();
|
||||
int EstimateCost() const;
|
||||
|
||||
private:
|
||||
Expr *expr;
|
||||
};
|
||||
|
||||
@@ -551,6 +557,7 @@ public:
|
||||
Expr *TypeCheck();
|
||||
Expr *Optimize();
|
||||
void Print() const;
|
||||
int EstimateCost() const;
|
||||
|
||||
private:
|
||||
Symbol *symbol;
|
||||
@@ -562,7 +569,7 @@ private:
|
||||
*/
|
||||
class FunctionSymbolExpr : public Expr {
|
||||
public:
|
||||
FunctionSymbolExpr(std::vector<Symbol *> *candidateFunctions,
|
||||
FunctionSymbolExpr(const char *name, std::vector<Symbol *> *candidateFunctions,
|
||||
SourcePos pos);
|
||||
|
||||
llvm::Value *GetValue(FunctionEmitContext *ctx) const;
|
||||
@@ -571,10 +578,14 @@ public:
|
||||
Expr *TypeCheck();
|
||||
Expr *Optimize();
|
||||
void Print() const;
|
||||
int EstimateCost() const;
|
||||
|
||||
private:
|
||||
friend class FunctionCallExpr;
|
||||
|
||||
/** Name of the function that is being called. */
|
||||
std::string name;
|
||||
|
||||
/** All of the functions with the name given in the function call;
|
||||
there may be more than one, in which case we need to resolve which
|
||||
overload is the best match. */
|
||||
@@ -597,6 +608,7 @@ public:
|
||||
Expr *TypeCheck();
|
||||
Expr *Optimize();
|
||||
void Print() const;
|
||||
int EstimateCost() const;
|
||||
};
|
||||
|
||||
#endif // ISPC_EXPR_H
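With EstimateCost() added to every class in this header (and declared on ASTNode), any future expression type has to supply its own estimate. A hypothetical pass-through subclass, sketched only to show the shape of the interface; NoOpExpr is an invented name, and only methods visible in this header are assumed.

// Invented example class; not part of expr.h.
class NoOpExpr : public Expr {
public:
    NoOpExpr(Expr *e, SourcePos p) : Expr(p), expr(e) { }

    llvm::Value *GetValue(FunctionEmitContext *ctx) const {
        return expr ? expr->GetValue(ctx) : NULL;
    }
    const Type *GetType() const { return expr ? expr->GetType() : NULL; }
    void Print() const { if (expr) expr->Print(); }
    Expr *Optimize() { return this; }
    Expr *TypeCheck() { return this; }
    int EstimateCost() const {
        // Wrapper adds nothing of its own, so just forward the estimate.
        return expr ? expr->EstimateCost() : 0;
    }

private:
    Expr *expr;
};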
|
||||
|
||||
@@ -14,7 +14,7 @@ export void f_fu(uniform float ret[], uniform float aa[], uniform float b) {
|
||||
varying int3 vv = array[a];
|
||||
++vv.y;
|
||||
array[a] = vv;
|
||||
print("fin %\n", array[programIndex].y);
|
||||
//CO print("fin %\n", array[programIndex].y);
|
||||
ret[programIndex] = array[programIndex].y;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,19 +1,14 @@
|
||||
static float float4(uniform float a, uniform float b, uniform float c,
|
||||
uniform float d) {
|
||||
float ret = 0;
|
||||
for (uniform int i = 0; i < programCount; i += 4) {
|
||||
ret = insert(ret, i + 0, a);
|
||||
ret = insert(ret, i + 1, b);
|
||||
ret = insert(ret, i + 2, c);
|
||||
ret = insert(ret, i + 3, d);
|
||||
}
|
||||
return ret;
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float r[], uniform float a[]) {
|
||||
unsigned int i = (unsigned int)a[programIndex];
|
||||
r[programIndex] = max((unsigned int)2, i);
|
||||
}
|
||||
|
||||
export float f_f(float a) {
|
||||
unsigned int i = (unsigned int)a;
|
||||
return max((unsigned int)2, i);
|
||||
export void result(uniform float r[]) {
|
||||
r[programIndex] = 1+programIndex;
|
||||
r[0] = 2;
|
||||
}
|
||||
|
||||
export float result() { return float4(2,2,3,4); }
|
||||
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
|
||||
export float f_f(float a) {
|
||||
unsigned int i = (unsigned int)a;
|
||||
return max((unsigned int)10, i);
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float result[], uniform float aa[]) {
|
||||
unsigned int i = (unsigned int)aa[programIndex];
|
||||
result[programIndex] = max((unsigned int)100, i);
|
||||
}
|
||||
|
||||
export float result() { return 10; }
|
||||
export void result(uniform float r[]) { r[programIndex] = 100; }
|
||||
|
||||
|
||||
@@ -1,19 +1,14 @@
|
||||
static float float4(uniform float a, uniform float b, uniform float c,
|
||||
uniform float d) {
|
||||
float ret = 0;
|
||||
for (uniform int i = 0; i < programCount; i += 4) {
|
||||
ret = insert(ret, i + 0, a);
|
||||
ret = insert(ret, i + 1, b);
|
||||
ret = insert(ret, i + 2, c);
|
||||
ret = insert(ret, i + 3, d);
|
||||
}
|
||||
return ret;
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float result[], uniform float aa[]) {
|
||||
unsigned int i = (unsigned int)aa[programIndex];
|
||||
result[programIndex] = min((unsigned int)2, i);
|
||||
}
|
||||
|
||||
export float f_f(float a) {
|
||||
unsigned int i = (unsigned int)a;
|
||||
return min((unsigned int)2, i);
|
||||
export void result(uniform float r[]) {
|
||||
r[programIndex] = 2;
|
||||
r[0] = 1;
|
||||
}
|
||||
|
||||
export float result() { return float4(1,2,2,2); }
|
||||
|
||||
|
||||
@@ -1,19 +1,13 @@
|
||||
static float float4(uniform float a, uniform float b, uniform float c,
|
||||
uniform float d) {
|
||||
float ret = 0;
|
||||
for (uniform int i = 0; i < programCount; i += 4) {
|
||||
ret = insert(ret, i + 0, a);
|
||||
ret = insert(ret, i + 1, b);
|
||||
ret = insert(ret, i + 2, c);
|
||||
ret = insert(ret, i + 3, d);
|
||||
}
|
||||
return ret;
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float r[], uniform float a[]) {
|
||||
unsigned int i = (unsigned int)a[programIndex];
|
||||
r[programIndex] = min((unsigned int)20, i);
|
||||
}
|
||||
|
||||
export float f_f(float a) {
|
||||
unsigned int i = (unsigned int)a;
|
||||
return min((unsigned int)20, i);
|
||||
export void result(uniform float r[]) {
|
||||
r[programIndex] = 1+programIndex;
|
||||
}
|
||||
|
||||
export float result() { return float4(1,2,3,4); }
|
||||
|
||||
|
||||
@@ -1,11 +0,0 @@
|
||||
|
||||
struct Foo {
|
||||
float f;
|
||||
};
|
||||
|
||||
|
||||
export float foo(Foo f[], int i, uniform int j) {
|
||||
Foo x = f[i];
|
||||
return x.f;
|
||||
}
|
||||
|
||||
209
ispc.cpp
@@ -42,14 +42,25 @@
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#include <windows.h>
|
||||
#include <direct.h>
|
||||
#define strcasecmp stricmp
|
||||
#endif
|
||||
#include <llvm/LLVMContext.h>
|
||||
#include <llvm/Module.h>
|
||||
#ifndef LLVM_2_8
|
||||
#include <llvm/Analysis/DIBuilder.h>
|
||||
#endif
|
||||
#include <llvm/Analysis/DebugInfo.h>
|
||||
#include <llvm/Support/Dwarf.h>
|
||||
#include <llvm/Target/TargetMachine.h>
|
||||
#include <llvm/Target/TargetOptions.h>
|
||||
#include <llvm/Target/TargetData.h>
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
#include <llvm/Support/TargetRegistry.h>
|
||||
#include <llvm/Support/TargetSelect.h>
|
||||
#else
|
||||
#include <llvm/Target/TargetRegistry.h>
|
||||
#include <llvm/Target/TargetSelect.h>
|
||||
#include <llvm/Target/SubtargetFeature.h>
|
||||
#endif
|
||||
#include <llvm/Support/Host.h>
|
||||
|
||||
Globals *g;
|
||||
Module *m;
|
||||
@@ -57,20 +68,198 @@ Module *m;
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Target
|
||||
|
||||
Target::Target() {
|
||||
arch = "x86-64";
|
||||
cpu = "nehalem";
|
||||
isa = SSE4;
|
||||
nativeVectorWidth = 4;
|
||||
vectorWidth = 4;
|
||||
bool
|
||||
Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||
bool pic, Target *t) {
|
||||
if (cpu == NULL) {
|
||||
std::string hostCPU = llvm::sys::getHostCPUName();
|
||||
if (hostCPU.size() > 0)
|
||||
cpu = hostCPU.c_str();
|
||||
else {
|
||||
fprintf(stderr, "Warning: unable to determine host CPU!\n");
|
||||
cpu = "generic";
|
||||
}
|
||||
}
|
||||
t->cpu = cpu;
|
||||
|
||||
if (isa == NULL) {
|
||||
if (!strcasecmp(cpu, "atom"))
|
||||
isa = "sse2";
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0_svn)
|
||||
else if (!strcasecmp(cpu, "sandybridge") ||
|
||||
!strcasecmp(cpu, "corei7-avx"))
|
||||
isa = "avx";
|
||||
#endif // LLVM_3_0
|
||||
else
|
||||
isa = "sse4";
|
||||
}
|
||||
if (arch == NULL)
|
||||
arch = "x86-64";
|
||||
|
||||
bool error = false;
|
||||
|
||||
t->generatePIC = pic;
|
||||
|
||||
// Make sure the target architecture is a known one; print an error
|
||||
// with the valid ones otherwise.
|
||||
t->target = NULL;
|
||||
for (llvm::TargetRegistry::iterator iter = llvm::TargetRegistry::begin();
|
||||
iter != llvm::TargetRegistry::end(); ++iter) {
|
||||
if (std::string(arch) == iter->getName()) {
|
||||
t->target = &*iter;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (t->target == NULL) {
|
||||
fprintf(stderr, "Invalid architecture \"%s\"\nOptions: ", arch);
|
||||
llvm::TargetRegistry::iterator iter;
|
||||
for (iter = llvm::TargetRegistry::begin();
|
||||
iter != llvm::TargetRegistry::end(); ++iter)
|
||||
fprintf(stderr, "%s ", iter->getName());
|
||||
fprintf(stderr, "\n");
|
||||
error = true;
|
||||
}
|
||||
else {
|
||||
t->arch = arch;
|
||||
}
|
||||
|
||||
if (!strcasecmp(isa, "sse2")) {
|
||||
t->isa = Target::SSE2;
|
||||
t->nativeVectorWidth = 4;
|
||||
t->vectorWidth = 4;
|
||||
t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
|
||||
}
|
||||
else if (!strcasecmp(isa, "sse4")) {
|
||||
t->isa = Target::SSE4;
|
||||
t->nativeVectorWidth = 4;
|
||||
t->vectorWidth = 4;
|
||||
t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
|
||||
}
|
||||
else if (!strcasecmp(isa, "sse4x2")) {
|
||||
t->isa = Target::SSE4;
|
||||
t->nativeVectorWidth = 4;
|
||||
t->vectorWidth = 8;
|
||||
t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
|
||||
}
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
else if (!strcasecmp(isa, "avx")) {
|
||||
t->isa = Target::AVX;
|
||||
t->nativeVectorWidth = 8;
|
||||
t->vectorWidth = 8;
|
||||
t->attributes = "+avx,+popcnt,+cmov";
|
||||
}
|
||||
else if (!strcasecmp(isa, "avx-x2")) {
|
||||
t->isa = Target::AVX;
|
||||
t->nativeVectorWidth = 8;
|
||||
t->vectorWidth = 16;
|
||||
t->attributes = "+avx,+popcnt,+cmov";
|
||||
}
|
||||
#endif // LLVM 3.0
|
||||
else {
|
||||
fprintf(stderr, "Target ISA \"%s\" is unknown. Choices are: %s\n",
|
||||
isa, SupportedTargetISAs());
|
||||
error = true;
|
||||
}
|
||||
|
||||
if (!error) {
|
||||
llvm::TargetMachine *targetMachine = t->GetTargetMachine();
|
||||
const llvm::TargetData *targetData = targetMachine->getTargetData();
|
||||
t->is32bit = (targetData->getPointerSize() == 4);
|
||||
}
|
||||
|
||||
return !error;
|
||||
}
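A call site for the new static initializer might look like the following sketch; it assumes the compiler's own ispc.h is available, and "sse4" is just one of the ISA names accepted above (NULL for arch and cpu falls back to the host defaults).

#include <cstdio>
#include "ispc.h"

static bool initTarget(Target *t) {
    // NULL arch/cpu lets GetTarget() autodetect the host; "sse4" picks the
    // 4-wide SSE4 configuration set up above.
    if (!Target::GetTarget(NULL, NULL, "sse4", /* pic */ false, t)) {
        fprintf(stderr, "unsupported target configuration\n");
        return false;
    }
    // t->vectorWidth, t->attributes, t->is32bit etc. are now filled in.
    return true;
}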
|
||||
|
||||
|
||||
const char *
|
||||
Target::SupportedTargetCPUs() {
|
||||
return "atom, barcelona, core2, corei7, "
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0_svn)
|
||||
"corei7-avx, "
|
||||
#endif
|
||||
"istanbul, nocona, penryn, "
|
||||
#ifdef LLVM_2_9
|
||||
"sandybridge, "
|
||||
#endif
|
||||
"westmere";
|
||||
}
|
||||
|
||||
|
||||
const char *
|
||||
Target::SupportedTargetArchs() {
|
||||
return "x86, x86-64";
|
||||
}
|
||||
|
||||
|
||||
const char *
|
||||
Target::SupportedTargetISAs() {
|
||||
return "sse2, sse4, sse4x2"
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0_svn)
|
||||
", avx, avx-x2"
|
||||
#endif
|
||||
;
|
||||
}
|
||||
|
||||
|
||||
std::string
|
||||
Target::GetTripleString() const {
|
||||
llvm::Triple triple;
|
||||
// Start with the host triple as the default
|
||||
triple.setTriple(llvm::sys::getHostTriple());
|
||||
|
||||
// And override the arch in the host triple based on what the user
|
||||
// specified. Here we need to deal with the fact that LLVM uses one
|
||||
// naming convention for targets in the TargetRegistry, but wants some
|
||||
// slightly different ones for the triple. TODO: is there a way to
|
||||
// have it do this remapping, which would presumably be a bit less
|
||||
// error prone?
|
||||
if (arch == "x86")
|
||||
triple.setArchName("i386");
|
||||
else if (arch == "x86-64")
|
||||
triple.setArchName("x86_64");
|
||||
else
|
||||
triple.setArchName(arch);
|
||||
|
||||
return triple.str();
|
||||
}
|
||||
|
||||
|
||||
llvm::TargetMachine *
|
||||
Target::GetTargetMachine() const {
|
||||
std::string triple = GetTripleString();
|
||||
|
||||
llvm::Reloc::Model relocModel = generatePIC ? llvm::Reloc::PIC_ :
|
||||
llvm::Reloc::Default;
|
||||
#if defined(LLVM_3_0svn) || defined(LLVM_3_0)
|
||||
std::string featuresString = attributes;
|
||||
llvm::TargetMachine *targetMachine =
|
||||
target->createTargetMachine(triple, cpu, featuresString, relocModel);
|
||||
#else
|
||||
#ifdef ISPC_IS_APPLE
|
||||
relocModel = llvm::Reloc::PIC_;
|
||||
#endif // ISPC_IS_APPLE
|
||||
std::string featuresString = cpu + std::string(",") + attributes;
|
||||
llvm::TargetMachine *targetMachine =
|
||||
target->createTargetMachine(triple, featuresString);
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
targetMachine->setRelocationModel(relocModel);
|
||||
#endif // !ISPC_IS_WINDOWS
|
||||
#endif
|
||||
assert(targetMachine != NULL);
|
||||
|
||||
targetMachine->setAsmVerbosityDefault(true);
|
||||
return targetMachine;
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Opt
|
||||
|
||||
Opt::Opt() {
|
||||
level = 1;
|
||||
fastMath = false;
|
||||
fastMaskedVload = false;
|
||||
unrollLoops = true;
|
||||
disableBlendedMaskedStores = false;
|
||||
disableCoherentControlFlow = false;
|
||||
disableUniformControlFlow = false;
|
||||
@@ -120,13 +309,9 @@ SourcePos::SourcePos(const char *n, int l, int c) {
|
||||
}
|
||||
|
||||
llvm::DIFile SourcePos::GetDIFile() const {
|
||||
#ifdef LLVM_2_8
|
||||
return llvm::DIFile();
|
||||
#else
|
||||
std::string directory, filename;
|
||||
GetDirectoryAndFileName(g->currentDirectory, name, &directory, &filename);
|
||||
return m->diBuilder->createFile(filename, directory);
|
||||
#endif // LLVM_2_8
|
||||
}
|
||||
|
||||
|
||||
|
||||
76
ispc.h
@@ -69,6 +69,8 @@ namespace llvm {
|
||||
class FunctionType;
|
||||
class LLVMContext;
|
||||
class Module;
|
||||
class Target;
|
||||
class TargetMachine;
|
||||
class Type;
|
||||
class Value;
|
||||
}
|
||||
@@ -146,6 +148,8 @@ public:
|
||||
pointer in place of the original ASTNode *. */
|
||||
virtual ASTNode *TypeCheck() = 0;
|
||||
|
||||
virtual int EstimateCost() const = 0;
|
||||
|
||||
/** All AST nodes must track the file position where they are
|
||||
defined. */
|
||||
const SourcePos pos;
|
||||
@@ -156,7 +160,34 @@ public:
|
||||
This structure defines a compilation target for the ispc compiler.
|
||||
*/
|
||||
struct Target {
|
||||
Target();
|
||||
/** Initializes the given Target pointer for a target of the given
|
||||
name, if the name is a known target. Returns true if the
|
||||
target was initialized and false if the name is unknown. */
|
||||
static bool GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||
bool pic, Target *);
|
||||
|
||||
/** Returns a comma-delimited string giving the names of the currently
|
||||
supported target ISAs. */
|
||||
static const char *SupportedTargetISAs();
|
||||
|
||||
/** Returns a comma-delimited string giving the names of the currently
|
||||
supported target CPUs. */
|
||||
static const char *SupportedTargetCPUs();
|
||||
|
||||
/** Returns a comma-delimited string giving the names of the currently
|
||||
supported target architectures. */
|
||||
static const char *SupportedTargetArchs();
|
||||
|
||||
/** Returns a triple string specifying the target architecture, vendor,
|
||||
and environment. */
|
||||
std::string GetTripleString() const;
|
||||
|
||||
/** Returns the LLVM TargetMachine object corresponding to this
|
||||
target. */
|
||||
llvm::TargetMachine *GetTargetMachine() const;
|
||||
|
||||
/** llvm Target object representing this target. */
|
||||
const llvm::Target *target;
|
||||
|
||||
/** Enumerator giving the instruction sets that the compiler can
|
||||
target. */
|
||||
@@ -168,9 +199,15 @@ struct Target {
|
||||
/** Target system architecture. (e.g. "x86-64", "x86"). */
|
||||
std::string arch;
|
||||
|
||||
/** Is the target architecture 32 or 64 bit */
|
||||
bool is32bit;
|
||||
|
||||
/** Target CPU. (e.g. "corei7", "corei7-avx", ..) */
|
||||
std::string cpu;
|
||||
|
||||
/** Target-specific attributes to pass along to the LLVM backend */
|
||||
std::string attributes;
|
||||
|
||||
/** Native vector width of the vector instruction set. Note that this
|
||||
value is directly derived from the ISA being used (e.g. it's 4 for
|
||||
SSE, 8 for AVX, etc.) */
|
||||
@@ -180,8 +217,12 @@ struct Target {
|
||||
integer multiple of the native vector width, for example if we're
|
||||
"doubling up" and compiling 8-wide on a 4-wide SSE system. */
|
||||
int vectorWidth;
|
||||
|
||||
/** Indicates whether position independent code should be generated. */
|
||||
bool generatePIC;
|
||||
};
|
||||
|
||||
|
||||
/** @brief Structure that collects optimization options
|
||||
|
||||
This structure collects all of the options related to optimization of
|
||||
@@ -199,6 +240,16 @@ struct Opt {
|
||||
should be performed. This is false by default. */
|
||||
bool fastMath;
|
||||
|
||||
/** Indicates whether a vector load should be issued for masked loads
|
||||
on platforms that don't have a native masked vector load. (This may
|
||||
lead to accessing memory up to programCount-1 elements past the end of
|
||||
arrays, so is unsafe in general.) */
|
||||
bool fastMaskedVload;
|
||||
|
||||
/** Indicates whether loops should be unrolled (when doing so seems like
|
||||
it will make sense). */
|
||||
bool unrollLoops;
|
||||
|
||||
/** On targets that don't have a masked store instruction but do have a
|
||||
blending instruction, by default, we simulate masked stores by
|
||||
loading the old value, blending, and storing the result. This can
|
||||
@@ -316,6 +367,29 @@ struct Globals {
|
||||
std::vector<std::string> cppArgs;
|
||||
};
|
||||
|
||||
enum {
|
||||
COST_ASSIGN = 1,
|
||||
COST_COHERENT_BREAK_CONTINE = 4,
|
||||
COST_COMPLEX_ARITH_OP = 4,
|
||||
COST_DEREF = 4,
|
||||
COST_FUNCALL = 4,
|
||||
COST_GATHER = 8,
|
||||
COST_LOAD = 2,
|
||||
COST_REGULAR_BREAK_CONTINUE = 2,
|
||||
COST_RETURN = 4,
|
||||
COST_SELECT = 4,
|
||||
COST_SIMPLE_ARITH_LOGIC_OP = 1,
|
||||
COST_SYNC = 32,
|
||||
COST_TASK_LAUNCH = 16,
|
||||
COST_TYPECAST_COMPLEX = 4,
|
||||
COST_TYPECAST_SIMPLE = 1,
|
||||
COST_UNIFORM_LOOP = 4,
|
||||
COST_VARYING_LOOP = 6,
|
||||
|
||||
CHECK_MASK_AT_FUNCTION_START_COST = 16,
|
||||
PREDICATE_SAFE_IF_STATEMENT_COST = 6,
|
||||
};
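These constants are the weights behind the new EstimateCost() methods: a node's estimate is roughly the sum of its children's estimates plus a per-operation constant, and thresholds such as PREDICATE_SAFE_IF_STATEMENT_COST let later passes decide whether, say, an if body is cheap enough to predicate. A toy illustration of that accumulation; the Node type and constant names are invented, not the compiler's.

#include <vector>

enum { TOY_COST_SIMPLE_OP = 1, TOY_COST_COMPLEX_OP = 4, TOY_SAFE_IF_COST = 6 };

struct Node {
    bool isComplex;
    std::vector<Node> children;
};

// Bottom-up cost: own operation cost plus the cost of every child.
static int estimateCost(const Node &n) {
    int cost = n.isComplex ? TOY_COST_COMPLEX_OP : TOY_COST_SIMPLE_OP;
    for (size_t i = 0; i < n.children.size(); ++i)
        cost += estimateCost(n.children[i]);
    return cost;
}

// Example policy: emit an if body unconditionally (predicated) only
// when its estimated cost stays under the safety threshold.
static bool cheapEnoughToPredicate(const Node &ifBody) {
    return estimateCost(ifBody) <= TOY_SAFE_IF_COST;
}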
|
||||
|
||||
extern Globals *g;
|
||||
extern Module *m;
|
||||
|
||||
|
||||
43
ispc.vcxproj
@@ -1,4 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
@@ -16,7 +16,9 @@
|
||||
<ClCompile Include="decl.cpp" />
|
||||
<ClCompile Include="expr.cpp" />
|
||||
<ClCompile Include="gen-bitcode-avx.cpp" />
|
||||
<ClCompile Include="gen-bitcode-c.cpp" />
|
||||
<ClCompile Include="gen-bitcode-avx-x2.cpp" />
|
||||
<ClCompile Include="gen-bitcode-c-32.cpp" />
|
||||
<ClCompile Include="gen-bitcode-c-64.cpp" />
|
||||
<ClCompile Include="gen-bitcode-sse2.cpp" />
|
||||
<ClCompile Include="gen-bitcode-sse4.cpp" />
|
||||
<ClCompile Include="gen-bitcode-sse4x2.cpp" />
|
||||
@@ -29,12 +31,14 @@
|
||||
<ClCompile Include="opt.cpp" />
|
||||
<ClCompile Include="parse.cc" />
|
||||
<CustomBuild Include="builtins-c.c">
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c.c > gen-bitcode-c.cpp</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c > gen-bitcode-c-32.cpp;
|
||||
%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c > gen-bitcode-c-64.cpp</Command>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clang builtins-c.c</Message>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c.c > gen-bitcode-c.cpp</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c > gen-bitcode-c-32.cpp;
|
||||
%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c > gen-bitcode-c-64.cpp</Command>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clang builtins-c.c</Message>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-c.cpp</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-c.cpp</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-c-32.cpp;gen-bitcore-c-64.cpp</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-c-32.cpp;gen-bitcore-c-64.cpp</Outputs>
|
||||
</CustomBuild>
|
||||
<ClCompile Include="stmt.cpp" />
|
||||
<ClCompile Include="sym.cpp" />
|
||||
@@ -59,9 +63,9 @@
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="stdlib.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-stdlib.cpp</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-stdlib.cpp</Outputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-stdlib.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-stdlib.cpp</Message>
|
||||
@@ -119,6 +123,19 @@
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="builtins-avx-x2.ll">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-avx-x2.ll | python bitcode2cpp.py builtins-avx-x2.ll > gen-bitcode-avx-x2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx-x2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-avx-x2.ll | python bitcode2cpp.py builtins-avx-x2.ll > gen-bitcode-avx-x2.cpp</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx-x2.cpp</Outputs>
|
||||
<AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx-x2.cpp</Message>
|
||||
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx-x2.cpp</Message>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="lex.ll">
|
||||
<FileType>Document</FileType>
|
||||
@@ -179,7 +196,7 @@
|
||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>NOMINMAX;LLVM_2_9</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>NOMINMAX;LLVM_3_0</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
|
||||
<DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
|
||||
</ClCompile>
|
||||
@@ -187,7 +204,7 @@
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
|
||||
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMDebugInfo.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCDisassembler.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
@@ -197,7 +214,7 @@
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>NOMINMAX;LLVM_2_9</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>NOMINMAX;LLVM_3_0</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
|
||||
<DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
|
||||
</ClCompile>
|
||||
@@ -207,10 +224,10 @@
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
|
||||
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMDebugInfo.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCDisassembler.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
|
||||
180
ispc_test.cpp
@@ -33,12 +33,25 @@
|
||||
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
#define ISPC_IS_WINDOWS
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#define NOMINMAX
|
||||
#include <windows.h>
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <memory.h>
|
||||
#ifdef ISPC_IS_LINUX
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
|
||||
#ifdef ISPC_HAVE_SVML
|
||||
#include <xmmintrin.h>
|
||||
@@ -61,8 +74,14 @@ extern "C" {
|
||||
#include <llvm/DerivedTypes.h>
|
||||
#include <llvm/Instructions.h>
|
||||
#include <llvm/ExecutionEngine/ExecutionEngine.h>
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
#include <llvm/Support/TargetRegistry.h>
|
||||
#include <llvm/Support/TargetSelect.h>
|
||||
#else
|
||||
#include <llvm/Target/TargetRegistry.h>
|
||||
#include <llvm/Target/TargetSelect.h>
|
||||
#endif
|
||||
#include <llvm/ExecutionEngine/JIT.h>
|
||||
#include <llvm/Target/TargetSelect.h>
|
||||
#include <llvm/Target/TargetOptions.h>
|
||||
#include <llvm/Target/TargetData.h>
|
||||
#include <llvm/Transforms/Scalar.h>
|
||||
@@ -74,42 +93,53 @@ extern "C" {
|
||||
#include <llvm/Support/raw_ostream.h>
|
||||
#include <llvm/Bitcode/ReaderWriter.h>
|
||||
#include <llvm/Support/MemoryBuffer.h>
|
||||
#ifndef LLVM_2_8
|
||||
#include <llvm/Support/system_error.h>
|
||||
#endif
|
||||
|
||||
bool shouldFail = false;

extern "C" {
void ISPCLaunch(void *, void *);
void ISPCSync();
void *ISPCMalloc(int64_t size, int32_t alignment);
void ISPCFree(void *ptr);
void ISPCLaunch(void **, void *, void *, int32_t);
void ISPCSync(void *);
void *ISPCAlloc(void **, int64_t size, int32_t alignment);
}

void ISPCLaunch(void *func, void *data) {
typedef void (*TaskFuncType)(void *, int, int);
void ISPCLaunch(void **handle, void *func, void *data, int32_t count) {
*handle = (void *)0xdeadbeef;
typedef void (*TaskFuncType)(void *, int, int, int, int);
TaskFuncType tft = (TaskFuncType)(func);
tft(data, 0, 1);
for (int i = 0; i < count; ++i)
tft(data, 0, 1, i, count);
}
|
||||
|
||||
|
||||
void ISPCSync() {
|
||||
void ISPCSync(void *) {
|
||||
}
|
||||
|
||||
|
||||
void *ISPCAlloc(void **handle, int64_t size, int32_t alignment) {
|
||||
*handle = (void *)0xdeadbeef;
|
||||
// leak time!
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
void *ISPCMalloc(int64_t size, int32_t alignment) {
|
||||
return _aligned_malloc(size, alignment);
|
||||
}
|
||||
|
||||
|
||||
void ISPCFree(void *ptr) {
|
||||
_aligned_free(ptr);
|
||||
}
|
||||
#endif
|
||||
#ifdef ISPC_IS_LINUX
|
||||
return memalign(alignment, size);
|
||||
#endif
|
||||
#ifdef ISPC_IS_APPLE
|
||||
void *mem = malloc(size + (alignment-1) + sizeof(void*));
|
||||
char *amem = ((char*)mem) + sizeof(void*);
|
||||
amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
|
||||
(alignment - 1)));
|
||||
((void**)amem)[-1] = mem;
|
||||
return amem;
|
||||
#endif
|
||||
}
|
||||
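The ISPCAlloc stub above deliberately leaks every allocation (hence the "leak time!" comment), which is fine for short-lived test programs. As a point of comparison only, and not part of this commit, here is a hedged sketch of how a host runtime could use the handle shared by ISPCAlloc and ISPCSync to release task memory at sync time; TaskGroup and its bookkeeping are hypothetical names, and alignment is assumed to be a power of two as in the code above.

#include <stdint.h>
#include <stdlib.h>
#include <vector>

struct TaskGroup {                      // hypothetical bookkeeping object
    std::vector<void *> allocations;    // raw malloc() pointers to release
};

void *ISPCAlloc(void **handle, int64_t size, int32_t alignment) {
    if (*handle == NULL)
        *handle = new TaskGroup;
    TaskGroup *tg = (TaskGroup *)*handle;
    // Over-allocate and align by hand, as the Apple path above does; the
    // extra 'alignment' bytes of slack cover the adjustment below.
    void *mem = malloc(size + alignment + sizeof(void *));
    char *amem = (char *)mem + sizeof(void *);
    amem += alignment - (reinterpret_cast<uintptr_t>(amem) & (alignment - 1));
    tg->allocations.push_back(mem);
    return amem;
}

void ISPCSync(void *handle) {
    TaskGroup *tg = (TaskGroup *)handle;
    if (tg == NULL)
        return;
    for (size_t i = 0; i < tg->allocations.size(); ++i)
        free(tg->allocations[i]);
    delete tg;
}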
|
||||
|
||||
static void usage(int ret) {
|
||||
fprintf(stderr, "usage: ispc_test\n");
|
||||
fprintf(stderr, "\t[-h/--help]\tprint help\n");
|
||||
fprintf(stderr, "\t[-f]\t\tindicates that test is expected to fail\n");
|
||||
fprintf(stderr, "\t<files>\n");
|
||||
exit(ret);
|
||||
}
|
||||
@@ -119,20 +149,22 @@ static void svml_missing() {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// On Windows, sin() is an overloaded function, so we need an unambiguous
// function we can take the address of when wiring up the external references
// below.

double Sin(double x) { return sin(x); }
double Cos(double x) { return cos(x); }
double Tan(double x) { return tan(x); }
double Atan(double x) { return atan(x); }
double Atan2(double y, double x) { return atan2(y, x); }
double Pow(double a, double b) { return pow(a, b); }
double Exp(double x) { return exp(x); }
double Log(double x) { return log(x); }
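To make the comment above concrete: in C++ mode, Microsoft's math headers declare float, double, and long double overloads of sin, so the bare name has no single address to take. This snippet is an illustration only (not part of the diff); the explicit cast and the wrapper are the two ways out, and the file opts for wrappers.

#include <math.h>

double Sin(double x) { return sin(x); }

void illustrate() {
    // void *a = (void *)sin;                  // ambiguous on MSVC: three overloads
    void *b = (void *)(double (*)(double))sin; // an explicit cast selects one overload
    void *c = (void *)Sin;                     // the wrapper has exactly one type
    (void)b; (void)c;
}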
|
||||
static bool lRunTest(const char *fn) {
|
||||
llvm::LLVMContext *ctx = new llvm::LLVMContext;
|
||||
|
||||
#ifdef LLVM_2_8
|
||||
std::string err;
|
||||
llvm::MemoryBuffer *buf = llvm::MemoryBuffer::getFileOrSTDIN(fn, &err);
|
||||
if (!buf) {
|
||||
fprintf(stderr, "Unable to open file \"%s\": %s\n", fn, err.c_str());
|
||||
delete ctx;
|
||||
return false;
|
||||
}
|
||||
std::string bcErr;
|
||||
llvm::Module *module = llvm::ParseBitcodeFile(buf, *ctx, &bcErr);
|
||||
#else
|
||||
llvm::OwningPtr<llvm::MemoryBuffer> buf;
|
||||
llvm::error_code err = llvm::MemoryBuffer::getFileOrSTDIN(fn, buf);
|
||||
if (err) {
|
||||
@@ -142,7 +174,6 @@ static bool lRunTest(const char *fn) {
|
||||
}
|
||||
std::string bcErr;
|
||||
llvm::Module *module = llvm::ParseBitcodeFile(buf.get(), *ctx, &bcErr);
|
||||
#endif
|
||||
|
||||
if (!module) {
|
||||
fprintf(stderr, "Bitcode reader failed for \"%s\": %s\n", fn, bcErr.c_str());
|
||||
@@ -151,7 +182,21 @@ static bool lRunTest(const char *fn) {
|
||||
}
|
||||
|
||||
std::string eeError;
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
llvm::EngineBuilder engineBuilder(module);
|
||||
engineBuilder.setErrorStr(&eeError);
|
||||
engineBuilder.setEngineKind(llvm::EngineKind::JIT);
|
||||
#if 0
|
||||
std::vector<std::string> attributes;
|
||||
if (target != NULL && !strcmp(target, "avx"))
|
||||
attributes.push_back("+avx");
|
||||
engineBuilder.setMAttrs(attributes);
|
||||
engineBuilder.setUseMCJIT(true);
|
||||
#endif
|
||||
llvm::ExecutionEngine *ee = engineBuilder.create();
|
||||
#else
|
||||
llvm::ExecutionEngine *ee = llvm::ExecutionEngine::createJIT(module, &eeError);
|
||||
#endif
|
||||
if (!ee) {
|
||||
fprintf(stderr, "Unable to create ExecutionEngine: %s\n", eeError.c_str());
|
||||
return false;
|
||||
@@ -163,10 +208,7 @@ static bool lRunTest(const char *fn) {
|
||||
ee->addGlobalMapping(func, (void *)FUNC)
|
||||
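Only the closing line of the DO_FUNC helper is visible in this hunk. For readability, here is a plausible reconstruction of the whole macro; the actual definition in ispc_test.cpp may differ in details (error handling, variable reuse), so treat this as an assumption rather than the author's code.

#define DO_FUNC(FUNC, FUNCNAME)                                \
    do {                                                       \
        llvm::Function *func = module->getFunction(FUNCNAME);  \
        if (func != NULL)                                      \
            ee->addGlobalMapping(func, (void *)FUNC);          \
    } while (0)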
DO_FUNC(ISPCLaunch, "ISPCLaunch");
|
||||
DO_FUNC(ISPCSync, "ISPCSync");
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
DO_FUNC(ISPCMalloc, "ISPCMalloc");
|
||||
DO_FUNC(ISPCFree, "ISPCFree");
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
DO_FUNC(ISPCAlloc, "ISPCAlloc");
|
||||
DO_FUNC(putchar, "putchar");
|
||||
DO_FUNC(printf, "printf");
|
||||
DO_FUNC(fflush, "fflush");
|
||||
@@ -178,14 +220,14 @@ static bool lRunTest(const char *fn) {
|
||||
DO_FUNC(powf, "powf");
|
||||
DO_FUNC(expf, "expf");
|
||||
DO_FUNC(logf, "logf");
|
||||
DO_FUNC(sin, "sin");
|
||||
DO_FUNC(cos, "cos");
|
||||
DO_FUNC(tan, "tan");
|
||||
DO_FUNC(atan, "atan");
|
||||
DO_FUNC(atan2, "atan2");
|
||||
DO_FUNC(pow, "pow");
|
||||
DO_FUNC(exp, "exp");
|
||||
DO_FUNC(log, "log");
|
||||
DO_FUNC(Sin, "sin");
|
||||
DO_FUNC(Cos, "cos");
|
||||
DO_FUNC(Tan, "tan");
|
||||
DO_FUNC(Atan, "atan");
|
||||
DO_FUNC(Atan2, "atan2");
|
||||
DO_FUNC(Pow, "pow");
|
||||
DO_FUNC(Exp, "exp");
|
||||
DO_FUNC(Log, "log");
|
||||
DO_FUNC(memset, "memset");
|
||||
#ifdef ISPC_IS_APPLE
|
||||
DO_FUNC(memset_pattern4, "memset_pattern4");
|
||||
@@ -233,7 +275,6 @@ static bool lRunTest(const char *fn) {
|
||||
float result[16];
|
||||
for (int i = 0; i < 16; ++i)
|
||||
result[i] = 0;
|
||||
bool ok = true;
|
||||
if (foundResult) {
|
||||
typedef void (*PFN)(float *);
|
||||
PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
|
||||
@@ -290,50 +331,49 @@ static bool lRunTest(const char *fn) {
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "Unable to find runnable function in file \"%s\"\n", fn);
|
||||
ok = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
// see if we got the right result
|
||||
if (ok) {
|
||||
if (foundResult) {
|
||||
for (int i = 0; i < width; ++i)
|
||||
if (returned[i] != result[i]) {
|
||||
ok = false;
|
||||
fprintf(stderr, "Test \"%s\" RETURNED %d: %g / %a EXPECTED %g / %a\n",
|
||||
fn, i, returned[i], returned[i], result[i], result[i]);
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int i = 0; i < width; ++i)
|
||||
fprintf(stderr, "Test \"%s\" returned %d: %g / %a\n",
|
||||
fn, i, returned[i], returned[i]);
|
||||
}
|
||||
bool resultsMatch = true;
|
||||
if (foundResult) {
|
||||
for (int i = 0; i < width; ++i)
|
||||
if (returned[i] != result[i]) {
|
||||
resultsMatch = false;
|
||||
fprintf(stderr, "Test \"%s\" RETURNED %d: %g / %a EXPECTED %g / %a\n",
|
||||
fn, i, returned[i], returned[i], result[i], result[i]);
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int i = 0; i < width; ++i)
|
||||
fprintf(stderr, "Test \"%s\" returned %d: %g / %a\n",
|
||||
fn, i, returned[i], returned[i]);
|
||||
}
|
||||
if (foundResult && shouldFail && resultsMatch)
|
||||
fprintf(stderr, "Test %s unexpectedly passed\n", fn);
|
||||
|
||||
delete ee;
|
||||
delete ctx;
|
||||
|
||||
return ok && foundResult;
|
||||
return foundResult && resultsMatch;
|
||||
}
|
||||
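The checks above compare each lane for exact floating-point equality and print results with both %g and %a. The hex-float form (%a) is what makes near-miss failures diagnosable, since %g's default precision can round two distinct values to the same text. A tiny standalone illustration:

#include <stdio.h>

int main() {
    float expected = 0.1f;
    float returned = expected + 1e-8f;          // off by a couple of ulps
    printf("%g vs %g\n", expected, returned);   // typically prints identical text
    printf("%a vs %a\n", expected, returned);   // hex floats expose the difference
    return (expected == returned) ? 0 : 1;
}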
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
llvm::InitializeNativeTarget();
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
LLVMLinkInJIT();
|
||||
#endif
|
||||
|
||||
std::vector<const char *> files;
|
||||
const char *filename = NULL;
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
|
||||
usage(0);
|
||||
if (!strcmp(argv[i], "-f"))
|
||||
shouldFail = true;
|
||||
else
|
||||
files.push_back(argv[i]);
|
||||
filename = argv[i];
|
||||
}
|
||||
|
||||
int passes = 0, fails = 0;
|
||||
for (unsigned int i = 0; i < files.size(); ++i) {
|
||||
if (lRunTest(files[i])) ++passes;
|
||||
else ++fails;
|
||||
}
|
||||
|
||||
if (fails > 0)
|
||||
fprintf(stderr, "%d/%d tests passed\n", passes, passes+fails);
|
||||
return fails > 0;
|
||||
return (lRunTest(filename) == true) ? 0 : 1;
|
||||
}
|
||||
|
||||
@@ -52,14 +52,14 @@
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>ISPC_IS_WINDOWS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>LLVM_3_0;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)/lib</AdditionalLibraryDirectories>
|
||||
<AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Desc.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
@@ -70,7 +70,7 @@
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>ISPC_IS_WINDOWS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<PreprocessorDefinitions>LLVM_3_0;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
@@ -79,10 +79,10 @@
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)/lib</AdditionalLibraryDirectories>
|
||||
<AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Desc.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
|
||||
153
main.cpp
@@ -40,10 +40,14 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <llvm/Support/PrettyStackTrace.h>
|
||||
#ifdef LLVM_2_8
|
||||
#include <llvm/System/Signals.h>
|
||||
#else
|
||||
#include <llvm/Support/Signals.h>
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
#include <llvm/Support/TargetRegistry.h>
|
||||
#include <llvm/Support/TargetSelect.h>
|
||||
#else
|
||||
#include <llvm/Target/TargetRegistry.h>
|
||||
#include <llvm/Target/TargetSelect.h>
|
||||
#include <llvm/Target/SubtargetFeature.h>
|
||||
#endif
|
||||
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
@@ -53,36 +57,36 @@
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
|
||||
static void usage(int ret) {
|
||||
printf("This is the Intel(r) SPMD Program Compiler (ispc), build %s (%s)\n\n", BUILD_DATE, BUILD_VERSION);
|
||||
printf("This is the Intel(r) SPMD Program Compiler (ispc), build %s (%s)\n\n",
|
||||
BUILD_DATE, BUILD_VERSION);
|
||||
printf("usage: ispc\n");
|
||||
printf(" [--arch={x86,x86-64}]\t\tSelect target architecture\n");
|
||||
printf(" [--arch={%s}]\t\tSelect target architecture\n",
|
||||
Target::SupportedTargetArchs());
|
||||
printf(" [--cpu=<cpu>]\t\t\tSelect target CPU type\n");
|
||||
printf(" (atom, barcelona, core2, corei7, corei7-avx, istanbul, nocona,\n");
|
||||
printf(" penryn, westmere)\n");
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
printf(" [-D<foo>]\t\t\t\t#define value when running preprocessor\n");
|
||||
#endif
|
||||
printf(" <cpu>={%s}\n", Target::SupportedTargetCPUs());
|
||||
printf(" [-D<foo>]\t\t\t\t#define given value when running preprocessor\n");
|
||||
printf(" [--debug]\t\t\t\tPrint information useful for debugging ispc\n");
|
||||
printf(" [--emit-asm]\t\t\tGenerate assembly language file as output\n");
|
||||
printf(" [--emit-llvm]\t\t\tEmit LLVM bitode file as output\n");
|
||||
printf(" [--emit-obj]\t\t\tGenerate object file file as output\n");
|
||||
printf(" [--fast-math]\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n");
|
||||
printf(" [--emit-obj]\t\t\tGenerate object file file as output (default)\n");
|
||||
printf(" [-g]\t\t\t\tGenerate debugging information\n");
|
||||
printf(" [--help]\t\t\t\tPrint help\n");
|
||||
printf(" [-h] <name>\t\t\t\tOutput filename for header\n");
|
||||
printf(" [-h <name>/--header-outfile=<name>]\tOutput filename for header\n");
|
||||
printf(" [--instrument]\t\t\tEmit instrumentation to gather performance data\n");
|
||||
printf(" [--math-lib=<option>]\t\tSelect math library\n");
|
||||
printf(" default\t\t\t\tUse ispc's built-in math functions\n");
|
||||
printf(" fast\t\t\t\tUse high-performance but lower-accuracy math functions\n");
|
||||
printf(" svml\t\t\t\tUse the Intel SVML math libraries\n");
|
||||
printf(" svml\t\t\t\tUse the Intel(r) SVML math libraries\n");
|
||||
printf(" system\t\t\t\tUse the system's math library (*may be quite slow*)\n");
|
||||
printf(" [--nostdlib]\t\t\tDon't make the ispc standard library available\n");
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
printf(" [--nocpp]\t\t\t\tDon't run the C preprocessor\n");
|
||||
#endif
|
||||
printf(" [-o/--outfile] <name>\t\tOutput filename for bitcode (may be \"-\" for standard output)\n");
|
||||
printf(" [-O0/-O1]\t\t\t\tSet optimization level\n");
|
||||
printf(" [-o <name>/--outfile=<name>]\tOutput filename (may be \"-\" for standard output)\n");
|
||||
printf(" [-O0/-O1]\t\t\t\tSet optimization level (-O1 is default)\n");
|
||||
printf(" [--opt=<option>]\t\t\tSet optimization option\n");
|
||||
printf(" disable-loop-unroll\t\tDisable loop unrolling.\n");
|
||||
printf(" fast-masked-vload\t\tFaster masked vector loads on SSE (may go past end of array)\n");
|
||||
printf(" fast-math\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n");
|
||||
#if 0
|
||||
printf(" disable-blended-masked-stores\t\tScalarize masked stores on SSE (vs. using vblendps)\n");
|
||||
printf(" disable-coherent-control-flow\t\tDisable coherent control flow optimizations\n");
|
||||
printf(" disable-uniform-control-flow\t\tDisable uniform control flow optimizations\n");
|
||||
@@ -91,11 +95,11 @@ static void usage(int ret) {
|
||||
printf(" disable-gather-scatter-flattening\tDisable flattening when all lanes are on\n");
|
||||
printf(" disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n");
|
||||
printf(" disable-masked-store-optimizations\tDisable lowering to regular stores when possible\n");
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
printf(" [--target={sse2,sse4,sse4x2,avx}] Select target ISA (SSE4 is default unless compiling for atom; then SSE2 is.)\n");
|
||||
#else
|
||||
printf(" [--target={sse2,sse4,sse4x2}] Select target ISA (SSE4 is default unless compiling for atom; then SSE2 is.)\n");
|
||||
#endif // LLVM 3.0
|
||||
#endif
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
printf(" [--pic]\t\t\t\tGenerate position-independent code\n");
|
||||
#endif // !ISPC_IS_WINDOWS
|
||||
printf(" [--target=<isa>]\t\t\tSelect target ISA. <isa>={%s}\n", Target::SupportedTargetISAs());
|
||||
printf(" [--version]\t\t\t\tPrint ispc version\n");
|
||||
printf(" [--woff]\t\t\t\tDisable warnings\n");
|
||||
printf(" [--wno-perf]\t\t\tDon't issue warnings related to performance-related issues\n");
|
||||
@@ -103,35 +107,6 @@ static void usage(int ret) {
|
||||
exit(ret);
|
||||
}
|
||||
|
||||
/** Given a target name string, set initialize the global g->target
|
||||
structure appropriately.
|
||||
*/
|
||||
static void lDoTarget(const char *target) {
|
||||
if (!strcasecmp(target, "sse2")) {
|
||||
g->target.isa = Target::SSE2;
|
||||
g->target.nativeVectorWidth = 4;
|
||||
g->target.vectorWidth = 4;
|
||||
}
|
||||
else if (!strcasecmp(target, "sse4")) {
|
||||
g->target.isa = Target::SSE4;
|
||||
g->target.nativeVectorWidth = 4;
|
||||
g->target.vectorWidth = 4;
|
||||
}
|
||||
else if (!strcasecmp(target, "sse4x2")) {
|
||||
g->target.isa = Target::SSE4;
|
||||
g->target.nativeVectorWidth = 4;
|
||||
g->target.vectorWidth = 8;
|
||||
}
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
else if (!strcasecmp(target, "avx")) {
|
||||
g->target.isa = Target::AVX;
|
||||
g->target.nativeVectorWidth = 8;
|
||||
g->target.vectorWidth = 8;
|
||||
}
|
||||
#endif // LLVM 3.0
|
||||
else
|
||||
usage(1);
|
||||
}
|
||||
|
||||
|
||||
/** We take arguments from both the command line as well as from the
|
||||
@@ -190,6 +165,16 @@ int main(int Argc, char *Argv[]) {
|
||||
llvm::sys::PrintStackTraceOnErrorSignal();
|
||||
llvm::PrettyStackTraceProgram X(argc, argv);
|
||||
|
||||
// initialize available LLVM targets
|
||||
LLVMInitializeX86TargetInfo();
|
||||
LLVMInitializeX86Target();
|
||||
LLVMInitializeX86AsmPrinter();
|
||||
LLVMInitializeX86AsmParser();
|
||||
LLVMInitializeX86Disassembler();
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
LLVMInitializeX86TargetMC();
|
||||
#endif
|
||||
|
||||
char *file = NULL;
|
||||
const char *headerFileName = NULL;
|
||||
const char *outFileName = NULL;
|
||||
@@ -198,23 +183,29 @@ int main(int Argc, char *Argv[]) {
|
||||
// as we're parsing below
|
||||
g = new Globals;
|
||||
|
||||
bool debugSet = false, optSet = false, targetSet = false;
|
||||
bool debugSet = false, optSet = false;
|
||||
Module::OutputType ot = Module::Object;
|
||||
bool generatePIC = false;
|
||||
const char *arch = NULL, *cpu = NULL, *target = NULL;
|
||||
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
if (!strcmp(argv[i], "--help"))
|
||||
usage(0);
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
else if (!strncmp(argv[i], "-D", 2)) {
|
||||
else if (!strncmp(argv[i], "-D", 2))
|
||||
g->cppArgs.push_back(argv[i]);
|
||||
}
|
||||
#endif // !ISPC_IS_WINDOWS
|
||||
else if (!strncmp(argv[i], "--arch=", 7))
|
||||
g->target.arch = argv[i] + 7;
|
||||
arch = argv[i] + 7;
|
||||
else if (!strncmp(argv[i], "--cpu=", 6))
|
||||
g->target.cpu = argv[i] + 6;
|
||||
else if (!strcmp(argv[i], "--fast-math"))
|
||||
g->opt.fastMath = true;
|
||||
cpu = argv[i] + 6;
|
||||
else if (!strcmp(argv[i], "--fast-math")) {
|
||||
fprintf(stderr, "--fast-math option has been renamed to --opt=fast-math!\n");
|
||||
usage(1);
|
||||
}
|
||||
else if (!strcmp(argv[i], "--fast-masked-vload")) {
|
||||
fprintf(stderr, "--fast-masked-vload option has been renamed to "
|
||||
"--opt=fast-masked-vload!\n");
|
||||
usage(1);
|
||||
}
|
||||
else if (!strcmp(argv[i], "--debug"))
|
||||
g->debugPrint = true;
|
||||
else if (!strcmp(argv[i], "--instrument"))
|
||||
@@ -230,14 +221,12 @@ int main(int Argc, char *Argv[]) {
|
||||
else if (!strcmp(argv[i], "--emit-obj"))
|
||||
ot = Module::Object;
|
||||
else if (!strcmp(argv[i], "--target")) {
|
||||
// FIXME: should remove this way of specifying the target...
|
||||
if (++i == argc) usage(1);
|
||||
lDoTarget(argv[i]);
|
||||
targetSet = true;
|
||||
}
|
||||
else if (!strncmp(argv[i], "--target=", 9)) {
|
||||
const char *target = argv[i] + 9;
|
||||
lDoTarget(target);
|
||||
target = argv[i];
|
||||
}
|
||||
else if (!strncmp(argv[i], "--target=", 9))
|
||||
target = argv[i] + 9;
|
||||
else if (!strncmp(argv[i], "--math-lib=", 11)) {
|
||||
const char *lib = argv[i] + 11;
|
||||
if (!strcmp(lib, "default"))
|
||||
@@ -253,7 +242,16 @@ int main(int Argc, char *Argv[]) {
|
||||
}
|
||||
else if (!strncmp(argv[i], "--opt=", 6)) {
|
||||
const char *opt = argv[i] + 6;
|
||||
if (!strcmp(opt, "disable-blended-masked-stores"))
|
||||
if (!strcmp(opt, "fast-math"))
|
||||
g->opt.fastMath = true;
|
||||
else if (!strcmp(opt, "fast-masked-vload"))
|
||||
g->opt.fastMaskedVload = true;
|
||||
else if (!strcmp(opt, "disable-loop-unroll"))
|
||||
g->opt.unrollLoops = false;
|
||||
|
||||
// These are only used for performance tests of specific
|
||||
// optimizations
|
||||
else if (!strcmp(opt, "disable-blended-masked-stores"))
|
||||
g->opt.disableBlendedMaskedStores = true;
|
||||
else if (!strcmp(opt, "disable-coherent-control-flow"))
|
||||
g->opt.disableCoherentControlFlow = true;
|
||||
@@ -278,14 +276,19 @@ int main(int Argc, char *Argv[]) {
|
||||
}
|
||||
else if (!strcmp(argv[i], "--wno-perf") || !strcmp(argv[i], "-wno-perf"))
|
||||
g->emitPerfWarnings = false;
|
||||
else if (!strcmp(argv[i], "-o") || !strcmp(argv[i], "--outfile")) {
|
||||
else if (!strcmp(argv[i], "-o")) {
|
||||
if (++i == argc) usage(1);
|
||||
outFileName = argv[i];
|
||||
}
|
||||
else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--header-outfile")) {
|
||||
else if (!strcmp(argv[i], "--outfile="))
|
||||
outFileName = argv[i] + strlen("--outfile=");
|
||||
else if (!strcmp(argv[i], "-h")) {
|
||||
if (++i == argc) usage(1);
|
||||
headerFileName = argv[i];
|
||||
}
|
||||
else if (!strcmp(argv[i], "--header-outfile=")) {
|
||||
headerFileName = argv[i] + strlen("--header-outfile=");
|
||||
}
|
||||
else if (!strcmp(argv[i], "-O0")) {
|
||||
g->opt.level = 0;
|
||||
optSet = true;
|
||||
@@ -301,6 +304,10 @@ int main(int Argc, char *Argv[]) {
|
||||
g->includeStdlib = false;
|
||||
else if (!strcmp(argv[i], "--nocpp"))
|
||||
g->runCPP = false;
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
else if (!strcmp(argv[i], "--pic"))
|
||||
generatePIC = true;
|
||||
#endif // !ISPC_IS_WINDOWS
|
||||
else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version")) {
|
||||
printf("Intel(r) SPMD Program Compiler (ispc) build %s (%s)\n",
|
||||
BUILD_DATE, BUILD_VERSION);
|
||||
@@ -322,10 +329,8 @@ int main(int Argc, char *Argv[]) {
|
||||
if (debugSet && !optSet)
|
||||
g->opt.level = 0;
|
||||
|
||||
// Make SSE2 the default target on atom unless the target has been set
|
||||
// explicitly.
|
||||
if (!targetSet && (g->target.cpu == "atom"))
|
||||
lDoTarget("sse2");
|
||||
if (!Target::GetTarget(arch, cpu, target, generatePIC, &g->target))
|
||||
usage(1);
|
||||
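The hand-written lDoTarget() switch above is removed in favor of a single Target::GetTarget(arch, cpu, target, generatePIC, &g->target) call. As a reading aid only, here is a sketch of the kind of ISA dispatch that call presumably centralizes, reconstructed from the deleted code; the real Target::GetTarget() also validates the architecture and CPU strings and is defined elsewhere in the tree.

#include <strings.h>   // strcasecmp(); use _stricmp() on Windows

// Minimal stand-in for the relevant fields of ispc's Target structure:
struct TargetInfo {
    enum ISA { SSE2, SSE4, AVX } isa;
    int nativeVectorWidth;
    int vectorWidth;
};

// Hypothetical helper mirroring the removed lDoTarget() logic:
static bool lSelectISA(const char *isa, TargetInfo *t) {
    if (!strcasecmp(isa, "sse2")) {
        t->isa = TargetInfo::SSE2;  t->nativeVectorWidth = 4;  t->vectorWidth = 4;
    }
    else if (!strcasecmp(isa, "sse4")) {
        t->isa = TargetInfo::SSE4;  t->nativeVectorWidth = 4;  t->vectorWidth = 4;
    }
    else if (!strcasecmp(isa, "sse4x2")) {
        t->isa = TargetInfo::SSE4;  t->nativeVectorWidth = 4;  t->vectorWidth = 8;
    }
    else if (!strcasecmp(isa, "avx")) {
        t->isa = TargetInfo::AVX;   t->nativeVectorWidth = 8;  t->vectorWidth = 8;
    }
    else
        return false;
    return true;
}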
|
||||
m = new Module(file);
|
||||
if (m->CompileFile() == 0) {
|
||||
|
||||
189
module.cpp
@@ -72,23 +72,17 @@
|
||||
#include <llvm/Support/FormattedStream.h>
|
||||
#include <llvm/Support/FileUtilities.h>
|
||||
#include <llvm/Target/TargetMachine.h>
|
||||
#include <llvm/Target/TargetRegistry.h>
|
||||
#include <llvm/Target/TargetSelect.h>
|
||||
#include <llvm/Target/TargetOptions.h>
|
||||
#include <llvm/Target/TargetData.h>
|
||||
#include <llvm/Target/SubtargetFeature.h>
|
||||
#include <llvm/PassManager.h>
|
||||
#include <llvm/Analysis/Verifier.h>
|
||||
#include <llvm/Support/CFG.h>
|
||||
#include <clang/Frontend/CompilerInstance.h>
|
||||
#include <clang/Frontend/TextDiagnosticPrinter.h>
|
||||
#include <clang/Frontend/Utils.h>
|
||||
#include <clang/Basic/TargetInfo.h>
|
||||
#ifndef LLVM_2_8
|
||||
#include <llvm/Support/ToolOutputFile.h>
|
||||
#include <llvm/Support/Host.h>
|
||||
#else // !LLVM_2_8
|
||||
#include <llvm/System/Host.h>
|
||||
#endif // LLVM_2_8
|
||||
#include <llvm/Assembly/PrintModulePass.h>
|
||||
#include <llvm/Support/raw_ostream.h>
|
||||
#include <llvm/Bitcode/ReaderWriter.h>
|
||||
@@ -107,14 +101,13 @@ Module::Module(const char *fn) {
|
||||
symbolTable = new SymbolTable;
|
||||
module = new llvm::Module(filename ? filename : "<stdin>", *g->ctx);
|
||||
|
||||
#ifndef LLVM_2_8
|
||||
module->setTargetTriple(g->target.GetTripleString());
|
||||
|
||||
if (g->generateDebuggingSymbols)
|
||||
diBuilder = new llvm::DIBuilder(*module);
|
||||
else
|
||||
diBuilder = NULL;
|
||||
#endif // LLVM_2_8
|
||||
|
||||
#ifndef LLVM_2_8
|
||||
// If we're generating debugging symbols, let the DIBuilder know that
|
||||
// we're starting a new compilation unit.
|
||||
if (diBuilder != NULL) {
|
||||
@@ -140,7 +133,6 @@ Module::Module(const char *fn) {
|
||||
0 /* run time version */);
|
||||
}
|
||||
}
|
||||
#endif // LLVM_2_8
|
||||
}
|
||||
|
||||
|
||||
@@ -154,6 +146,9 @@ extern void yy_delete_buffer(YY_BUFFER_STATE);
|
||||
|
||||
int
|
||||
Module::CompileFile() {
|
||||
if (g->opt.fastMath == true)
|
||||
llvm::UnsafeFPMath = true;
|
||||
|
||||
// FIXME: it'd be nice to do this in the Module constructor, but this
|
||||
// function ends up calling into routines that expect the global
|
||||
// variable 'm' to be initialized and available (which it isn't until
|
||||
@@ -458,6 +453,10 @@ Module::AddGlobal(DeclSpecs *ds, Declarator *decl) {
|
||||
// declarations, typedefs, and global variables declarations /
|
||||
// definitions. Figure out what we've got and take care of it.
|
||||
|
||||
if (ds == NULL || decl == NULL)
|
||||
// Error happened earlier during parsing
|
||||
return;
|
||||
|
||||
if (decl->isFunction) {
|
||||
// function declaration
|
||||
const Type *t = decl->GetType(ds);
|
||||
@@ -558,7 +557,6 @@ Module::AddGlobal(DeclSpecs *ds, Declarator *decl) {
|
||||
decl->sym->name.c_str());
|
||||
m->symbolTable->AddVariable(decl->sym);
|
||||
|
||||
#ifndef LLVM_2_8
|
||||
if (diBuilder && (ds->storageClass != SC_EXTERN)) {
|
||||
llvm::DIFile file = decl->pos.GetDIFile();
|
||||
diBuilder->createGlobalVariable(decl->sym->name,
|
||||
@@ -568,7 +566,6 @@ Module::AddGlobal(DeclSpecs *ds, Declarator *decl) {
|
||||
(ds->storageClass == SC_STATIC),
|
||||
decl->sym->storagePtr);
|
||||
}
|
||||
#endif // LLVM_2_8
|
||||
}
|
||||
}
|
||||
|
||||
@@ -606,6 +603,7 @@ lCopyInTaskParameter(int i, llvm::Value *structArgPtr, Declarator *decl,
|
||||
// memory
|
||||
llvm::Value *ptrval = ctx->LoadInst(ptr, NULL, sym->name.c_str());
|
||||
ctx->StoreInst(ptrval, sym->storagePtr);
|
||||
ctx->EmitFunctionParameterDebugInfo(sym);
|
||||
}
|
||||
|
||||
|
||||
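For readers new to the task ABI: lCopyInTaskParameter() above pulls each parameter of a launched task out of a single argument structure that the caller packed before the launch, and copies it into local storage so the parameter's Symbol has a normal address. A purely illustrative C++ picture of that convention (all names and fields hypothetical):

struct TaskArgs {        // one field per parameter of the ispc 'task' function
    float *data;
    int    count;
};

// Shape of the per-task entry point, matching the five-argument TaskFuncType
// used by ispc_test.cpp earlier in this changeset:
void example_task_entry(void *args, int threadIndex, int threadCount,
                        int taskIndex, int taskCount) {
    TaskArgs *a = (TaskArgs *)args;  // lCopyInTaskParameter emits the equivalent
    (void)a;                         // of these field loads for each parameter
}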
@@ -629,6 +627,8 @@ lEmitFunctionCode(FunctionEmitContext *ctx, llvm::Function *function,
|
||||
llvm::Value *structParamPtr = argIter++;
|
||||
llvm::Value *threadIndex = argIter++;
|
||||
llvm::Value *threadCount = argIter++;
|
||||
llvm::Value *taskIndex = argIter++;
|
||||
llvm::Value *taskCount = argIter++;
|
||||
|
||||
// Copy the function parameter values from the structure into local
|
||||
// storage
|
||||
@@ -656,13 +656,17 @@ lEmitFunctionCode(FunctionEmitContext *ctx, llvm::Function *function,
|
||||
threadCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "threadCount");
|
||||
ctx->StoreInst(threadCount, threadCountSym->storagePtr);
|
||||
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
// On Windows, we dynamically-allocate space for the task arguments
|
||||
// (see FunctionEmitContext::LaunchInst().) Here is where we emit
|
||||
// the code to free that memory, now that we've copied the
|
||||
// parameter values out of the structure.
|
||||
ctx->EmitFree(structParamPtr);
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
// Copy taskIndex and taskCount into stack-allocated storage so
|
||||
// that their symbols point to something reasonable.
|
||||
Symbol *taskIndexSym = m->symbolTable->LookupVariable("taskIndex");
|
||||
assert(taskIndexSym);
|
||||
taskIndexSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex");
|
||||
ctx->StoreInst(taskIndex, taskIndexSym->storagePtr);
|
||||
|
||||
Symbol *taskCountSym = m->symbolTable->LookupVariable("taskCount");
|
||||
assert(taskCountSym);
|
||||
taskCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount");
|
||||
ctx->StoreInst(taskCount, taskCountSym->storagePtr);
|
||||
}
|
||||
else {
|
||||
// Regular, non-task function
|
||||
@@ -700,8 +704,18 @@ lEmitFunctionCode(FunctionEmitContext *ctx, llvm::Function *function,
|
||||
|
||||
// Finally, we can generate code for the function
|
||||
if (code != NULL) {
|
||||
int costEstimate = code->EstimateCost();
|
||||
bool checkMask = (ft->isTask == true) ||
|
||||
(function->hasFnAttr(llvm::Attribute::AlwaysInline) == false);
|
||||
((function->hasFnAttr(llvm::Attribute::AlwaysInline) == false) &&
|
||||
costEstimate > CHECK_MASK_AT_FUNCTION_START_COST);
|
||||
Debug(code->pos, "Estimated cost for function \"%s\" = %d\n",
|
||||
funSym->name.c_str(), costEstimate);
|
||||
// If the body of the function is non-trivial, then we wrap the
|
||||
// entire thing around a varying "cif (true)" test in order to reap
|
||||
// the side-effect benefit of checking to see if the execution mask
|
||||
// is all on and thence having a specialized code path for that
|
||||
// case. If this is a simple function, then this isn't worth the
|
||||
// code bloat / overhead.
|
||||
if (checkMask) {
|
||||
bool allTrue[ISPC_MAX_NVEC];
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
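The comment above describes wrapping a sufficiently expensive function body in a varying "cif (true)" purely for its side effect: the emitted code tests whether the execution mask is all-on and runs a specialized path when it is. A conceptual C++ rendering of that structure (illustration only; the real work happens on LLVM IR):

// Hypothetical stand-ins for the real mask representation and body emission:
struct Mask {
    bool lane[16];
    int  width;
    bool allOn() const {
        for (int i = 0; i < width; ++i)
            if (!lane[i]) return false;
        return true;
    }
};

void emitted_function_body(const Mask &mask) {
    if (mask.allOn()) {
        // specialized path: every lane is active, per-lane masking can be skipped
    } else {
        // general path: honor the execution mask on every store and side effect
    }
}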
@@ -849,6 +863,11 @@ Module::AddFunction(DeclSpecs *ds, Declarator *decl, Stmt *code) {
|
||||
|
||||
bool
|
||||
Module::WriteOutput(OutputType outputType, const char *outFileName) {
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
if (diBuilder != NULL && outputType != Header)
|
||||
diBuilder->finalize();
|
||||
#endif // LLVM_3_0
|
||||
|
||||
// First, issue a warning if the output file suffix and the type of
|
||||
// file being created seem to mismatch. This can help catch missing
|
||||
// command-line arguments specifying the output file type.
|
||||
@@ -909,12 +928,7 @@ Module::WriteOutput(OutputType outputType, const char *outFileName) {
|
||||
return true;
|
||||
}
|
||||
else {
|
||||
#ifdef LLVM_2_8
|
||||
fprintf(stderr, "Direct object file emission not supported in this build.\n");
|
||||
return false;
|
||||
#else
|
||||
return writeObjectFileOrAssembly(outputType, outFileName);
|
||||
#endif // LLVM_2_8
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -922,79 +936,7 @@ Module::WriteOutput(OutputType outputType, const char *outFileName) {
|
||||
|
||||
bool
|
||||
Module::writeObjectFileOrAssembly(OutputType outputType, const char *outFileName) {
|
||||
llvm::InitializeAllTargets();
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
llvm::InitializeAllTargetMCs();
|
||||
#endif
|
||||
llvm::InitializeAllAsmPrinters();
|
||||
llvm::InitializeAllAsmParsers();
|
||||
|
||||
llvm::Triple triple(module->getTargetTriple());
|
||||
if (triple.getTriple().empty())
|
||||
triple.setTriple(llvm::sys::getHostTriple());
|
||||
|
||||
const llvm::Target *target = NULL;
|
||||
if (g->target.arch != "") {
|
||||
// If the user specified a target architecture, see if it's a known
|
||||
// one; print an error with the valid ones otherwise.
|
||||
for (llvm::TargetRegistry::iterator iter = llvm::TargetRegistry::begin();
|
||||
iter != llvm::TargetRegistry::end(); ++iter) {
|
||||
if (g->target.arch == iter->getName()) {
|
||||
target = &*iter;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!target) {
|
||||
fprintf(stderr, "Invalid target \"%s\"\nOptions: ",
|
||||
g->target.arch.c_str());
|
||||
llvm::TargetRegistry::iterator iter;
|
||||
for (iter = llvm::TargetRegistry::begin();
|
||||
iter != llvm::TargetRegistry::end(); ++iter)
|
||||
fprintf(stderr, "%s ", iter->getName());
|
||||
fprintf(stderr, "\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
llvm::Triple::ArchType archType =
|
||||
llvm::Triple::getArchTypeForLLVMName(g->target.arch);
|
||||
if (archType != llvm::Triple::UnknownArch)
|
||||
triple.setArch(archType);
|
||||
}
|
||||
else {
|
||||
// Otherwise get the target either based on the host or the
|
||||
// module's target, if it has been set there.
|
||||
std::string error;
|
||||
target = llvm::TargetRegistry::lookupTarget(triple.getTriple(), error);
|
||||
if (!target) {
|
||||
fprintf(stderr, "Unable to select target for module: %s\n",
|
||||
error.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
std::string featuresString;
|
||||
llvm::TargetMachine *targetMachine = NULL;
|
||||
#if defined LLVM_3_0svn || defined LLVM_3_0
|
||||
if (g->target.isa == Target::AVX)
|
||||
featuresString = "+avx";
|
||||
targetMachine = target->createTargetMachine(triple.getTriple(), g->target.cpu,
|
||||
featuresString);
|
||||
#else
|
||||
if (g->target.cpu.size()) {
|
||||
llvm::SubtargetFeatures features;
|
||||
features.setCPU(g->target.cpu);
|
||||
featuresString = features.getString();
|
||||
}
|
||||
|
||||
targetMachine = target->createTargetMachine(triple.getTriple(),
|
||||
featuresString);
|
||||
#endif
|
||||
if (targetMachine == NULL) {
|
||||
fprintf(stderr, "Unable to create target machine for target \"%s\"!",
|
||||
triple.str().c_str());
|
||||
return false;
|
||||
}
|
||||
targetMachine->setAsmVerbosityDefault(true);
|
||||
llvm::TargetMachine *targetMachine = g->target.GetTargetMachine();
|
||||
|
||||
// Figure out if we're generating object file or assembly output, and
|
||||
// set binary output for object files
|
||||
@@ -1021,9 +963,8 @@ Module::writeObjectFileOrAssembly(OutputType outputType, const char *outFileName
|
||||
(g->opt.level > 0) ? llvm::CodeGenOpt::Aggressive : llvm::CodeGenOpt::None;
|
||||
|
||||
if (targetMachine->addPassesToEmitFile(pm, fos, fileType, optLevel)) {
|
||||
fprintf(stderr, "Fatal error adding passes to emit object file for "
|
||||
"target %s!\n", triple.str().c_str());
|
||||
return false;
|
||||
fprintf(stderr, "Fatal error adding passes to emit object file!");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Finally, run the passes to emit the object file/assembly
|
||||
@@ -1189,6 +1130,12 @@ lEmitVectorTypedefs(const std::vector<const VectorType *> &types, FILE *file) {
|
||||
for (unsigned int i = 0; i < types.size(); ++i) {
|
||||
std::string baseDecl;
|
||||
const VectorType *vt = types[i]->GetAsNonConstType();
|
||||
if (!vt->IsUniformType())
|
||||
// Varying stuff shouldn't be visible to / used by the
|
||||
// application, so at least make it not simple to access it by
|
||||
// not declaring the type here...
|
||||
continue;
|
||||
|
||||
int size = vt->GetElementCount();
|
||||
|
||||
baseDecl = vt->GetBaseType()->GetCDeclaration("");
|
||||
@@ -1361,6 +1308,7 @@ Module::writeHeader(const char *fn) {
|
||||
default:
|
||||
FATAL("Unhandled target in header emission");
|
||||
}
|
||||
fprintf(f, "#define ISPC_TARGET_VECTOR_WIDTH %d\n", g->target.vectorWidth);
|
||||
|
||||
fprintf(f, "#ifdef __cplusplus\nnamespace ispc {\n#endif // __cplusplus\n\n");
|
||||
|
||||
@@ -1397,14 +1345,6 @@ Module::writeHeader(const char *fn) {
|
||||
lEmitEnumDecls(exportedEnumTypes, f);
|
||||
lEmitStructDecls(exportedStructTypes, f);
|
||||
|
||||
// emit externs for globals
|
||||
if (externGlobals.size() > 0) {
|
||||
fprintf(f, "///////////////////////////////////////////////////////////////////////////\n");
|
||||
fprintf(f, "// Globals declared \"extern\" from ispc code\n");
|
||||
fprintf(f, "///////////////////////////////////////////////////////////////////////////\n");
|
||||
lPrintExternGlobals(f, externGlobals);
|
||||
}
|
||||
|
||||
// emit function declarations for exported stuff...
|
||||
if (exportedFuncs.size() > 0) {
|
||||
fprintf(f, "\n");
|
||||
@@ -1426,6 +1366,15 @@ Module::writeHeader(const char *fn) {
|
||||
// end namespace
|
||||
fprintf(f, "\n#ifdef __cplusplus\n}\n#endif // __cplusplus\n");
|
||||
|
||||
// and only now emit externs for globals, outside of the ispc namespace
|
||||
if (externGlobals.size() > 0) {
|
||||
fprintf(f, "\n");
|
||||
fprintf(f, "///////////////////////////////////////////////////////////////////////////\n");
|
||||
fprintf(f, "// Globals declared \"extern\" from ispc code\n");
|
||||
fprintf(f, "///////////////////////////////////////////////////////////////////////////\n");
|
||||
lPrintExternGlobals(f, externGlobals);
|
||||
}
|
||||
|
||||
// end guard
|
||||
fprintf(f, "\n#endif // %s\n", guard.c_str());
|
||||
|
||||
@@ -1441,23 +1390,26 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre
|
||||
std::string error;
|
||||
|
||||
inst.createFileManager();
|
||||
inst.createDiagnostics(0, NULL);
|
||||
clang::TargetOptions& options = inst.getTargetOpts();
|
||||
|
||||
llvm::raw_fd_ostream stderrRaw(2, false);
|
||||
clang::TextDiagnosticPrinter *diagPrinter =
|
||||
new clang::TextDiagnosticPrinter(stderrRaw, clang::DiagnosticOptions());
|
||||
inst.createDiagnostics(0, NULL, diagPrinter);
|
||||
|
||||
clang::TargetOptions &options = inst.getTargetOpts();
|
||||
llvm::Triple triple(module->getTargetTriple());
|
||||
if (triple.getTriple().empty())
|
||||
triple.setTriple(llvm::sys::getHostTriple());
|
||||
|
||||
options.Triple = triple.getTriple();
|
||||
|
||||
clang::TargetInfo* target
|
||||
= clang::TargetInfo::CreateTargetInfo(inst.getDiagnostics(), options);
|
||||
clang::TargetInfo *target =
|
||||
clang::TargetInfo::CreateTargetInfo(inst.getDiagnostics(), options);
|
||||
|
||||
inst.setTarget(target);
|
||||
inst.createSourceManager(inst.getFileManager());
|
||||
inst.InitializeSourceManager(infilename);
|
||||
|
||||
clang::PreprocessorOptions& opts = inst.getPreprocessorOpts();
|
||||
clang::PreprocessorOptions &opts = inst.getPreprocessorOpts();
|
||||
|
||||
//Add defs for ISPC and PI
|
||||
opts.addMacroDef("ISPC");
|
||||
@@ -1470,7 +1422,10 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre
|
||||
}
|
||||
}
|
||||
inst.createPreprocessor();
|
||||
|
||||
clang::LangOptions langOptions;
|
||||
diagPrinter->BeginSourceFile(langOptions, &inst.getPreprocessor());
|
||||
clang::DoPrintPreprocessedInput(inst.getPreprocessor(),
|
||||
ostream, inst.getPreprocessorOutputOpts());
|
||||
diagPrinter->EndSourceFile();
|
||||
}
|
||||
|
||||
|
||||
5
module.h
@@ -91,11 +91,8 @@ public:
|
||||
/** llvm Module object into which globals and functions are added. */
|
||||
llvm::Module *module;
|
||||
|
||||
#ifndef LLVM_2_8
|
||||
/** The diBuilder manages generating debugging information (only
|
||||
supported in LLVM 2.9 and beyond...) */
|
||||
/** The diBuilder manages generating debugging information */
|
||||
llvm::DIBuilder *diBuilder;
|
||||
#endif
|
||||
|
||||
GatherBuffer *gatherBuffer;
|
||||
|
||||
|
||||
287
opt.cpp
@@ -55,13 +55,12 @@
|
||||
#include <llvm/Instructions.h>
|
||||
#include <llvm/Intrinsics.h>
|
||||
#include <llvm/Constants.h>
|
||||
#ifndef LLVM_2_8
|
||||
#include <llvm/Target/TargetLibraryInfo.h>
|
||||
#ifdef LLVM_2_9
|
||||
#include <llvm/Support/StandardPasses.h>
|
||||
#else
|
||||
#include <llvm/Support/PassManagerBuilder.h>
|
||||
#endif // LLVM_2_9
|
||||
#include <llvm/Analysis/ConstantFolding.h>
|
||||
#include <llvm/Target/TargetLibraryInfo.h>
|
||||
#ifdef LLVM_2_9
|
||||
#include <llvm/Support/StandardPasses.h>
|
||||
#else
|
||||
#include <llvm/Transforms/IPO/PassManagerBuilder.h>
|
||||
#endif // LLVM_2_8
|
||||
#include <llvm/ADT/Triple.h>
|
||||
#include <llvm/Transforms/Scalar.h>
|
||||
@@ -69,13 +68,18 @@
|
||||
#include <llvm/Transforms/Utils/BasicBlockUtils.h>
|
||||
#include <llvm/Target/TargetOptions.h>
|
||||
#include <llvm/Target/TargetData.h>
|
||||
#include <llvm/Target/TargetMachine.h>
|
||||
#include <llvm/Analysis/Verifier.h>
|
||||
#include <llvm/Support/raw_ostream.h>
|
||||
#ifndef LLVM_2_8
|
||||
#include <llvm/Analysis/DIBuilder.h>
|
||||
#endif
|
||||
#include <llvm/Analysis/DebugInfo.h>
|
||||
#include <llvm/Support/Dwarf.h>
|
||||
#ifdef ISPC_IS_LINUX
|
||||
#include <alloca.h>
|
||||
#elif defined(ISPC_IS_WINDOWS)
|
||||
#include <malloc.h>
|
||||
#define alloca _alloca
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
|
||||
static llvm::Pass *CreateIntrinsicsOptPass();
|
||||
static llvm::Pass *CreateGatherScatterFlattenPass();
|
||||
@@ -178,19 +182,22 @@ Optimize(llvm::Module *module, int optLevel) {
|
||||
llvm::PassManager optPM;
|
||||
llvm::FunctionPassManager funcPM(module);
|
||||
|
||||
#ifndef LLVM_2_8
|
||||
llvm::TargetLibraryInfo *targetLibraryInfo =
|
||||
new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple()));
|
||||
optPM.add(targetLibraryInfo);
|
||||
#endif
|
||||
optPM.add(new llvm::TargetData(module));
|
||||
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
optPM.add(llvm::createIndVarSimplifyPass());
|
||||
#endif
|
||||
|
||||
if (optLevel == 0) {
|
||||
// This is more or less the minimum set of optimizations that we
|
||||
// need to do to generate code that will actually run. (We can't
|
||||
// run absolutely no optimizations, since the front-end needs us to
|
||||
// take the various __pseudo_* functions it has emitted and turn
|
||||
// them into something that can actually execute.
|
||||
optPM.add(llvm::createPromoteMemoryToRegisterPass());
|
||||
optPM.add(CreateGatherScatterFlattenPass());
|
||||
optPM.add(CreateLowerGatherScatterPass());
|
||||
optPM.add(CreateLowerMaskedStorePass());
|
||||
@@ -211,7 +218,6 @@ Optimize(llvm::Module *module, int optLevel) {
|
||||
// only later in the optimization process as things like constant
|
||||
// propagation have done their thing, and then when they do kick
|
||||
// in, they can often open up new opportunities for optimization...
|
||||
#ifndef LLVM_2_8
|
||||
llvm::PassRegistry *registry = llvm::PassRegistry::getPassRegistry();
|
||||
llvm::initializeCore(*registry);
|
||||
llvm::initializeScalarOpts(*registry);
|
||||
@@ -222,7 +228,7 @@ Optimize(llvm::Module *module, int optLevel) {
|
||||
llvm::initializeInstCombine(*registry);
|
||||
llvm::initializeInstrumentation(*registry);
|
||||
llvm::initializeTarget(*registry);
|
||||
#endif
|
||||
|
||||
// Early optimizations to try to reduce the total amount of code to
|
||||
// work with if we can
|
||||
optPM.add(CreateGatherScatterFlattenPass());
|
||||
@@ -279,13 +285,11 @@ Optimize(llvm::Module *module, int optLevel) {
|
||||
optPM.add(llvm::createConstantPropagationPass());
|
||||
optPM.add(CreateIntrinsicsOptPass());
|
||||
|
||||
#if defined(LLVM_2_8)
|
||||
optPM.add(CreateIsCompileTimeConstantPass(true));
|
||||
#elif defined(LLVM_2_9)
|
||||
#if defined(LLVM_2_9)
|
||||
llvm::createStandardModulePasses(&optPM, 3,
|
||||
false /* opt size */,
|
||||
true /* unit at a time */,
|
||||
false /* unroll loops */,
|
||||
g->opt.unrollLoops,
|
||||
true /* simplify lib calls */,
|
||||
false /* may have exceptions */,
|
||||
llvm::createFunctionInliningPass());
|
||||
@@ -300,7 +304,7 @@ Optimize(llvm::Module *module, int optLevel) {
|
||||
llvm::createStandardModulePasses(&optPM, 3,
|
||||
false /* opt size */,
|
||||
true /* unit at a time */,
|
||||
false /* unroll loops */,
|
||||
g->opt.unrollLoops,
|
||||
true /* simplify lib calls */,
|
||||
false /* may have exceptions */,
|
||||
llvm::createFunctionInliningPass());
|
||||
@@ -309,6 +313,8 @@ Optimize(llvm::Module *module, int optLevel) {
|
||||
llvm::PassManagerBuilder builder;
|
||||
builder.OptLevel = 3;
|
||||
builder.Inliner = llvm::createFunctionInliningPass();
|
||||
if (g->opt.unrollLoops == false)
|
||||
builder.DisableUnrollLoops = true;
|
||||
builder.populateFunctionPassManager(funcPM);
|
||||
builder.populateModulePassManager(optPM);
|
||||
optPM.add(CreateIsCompileTimeConstantPass(true));
|
||||
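For orientation, here is a condensed, self-contained sketch of the LLVM 3.0-style PassManagerBuilder flow the branch above uses; it is not ispc's exact pass lineup, which also interleaves the custom passes created in this file.

#include <llvm/Module.h>
#include <llvm/PassManager.h>
#include <llvm/Transforms/IPO.h>
#include <llvm/Transforms/IPO/PassManagerBuilder.h>

void runStandardPasses(llvm::Module *module) {
    llvm::PassManager modulePasses;
    llvm::FunctionPassManager functionPasses(module);

    llvm::PassManagerBuilder builder;
    builder.OptLevel = 3;
    builder.Inliner = llvm::createFunctionInliningPass();
    builder.populateFunctionPassManager(functionPasses);
    builder.populateModulePassManager(modulePasses);

    functionPasses.doInitialization();
    for (llvm::Module::iterator fi = module->begin(); fi != module->end(); ++fi)
        functionPasses.run(*fi);
    functionPasses.doFinalization();

    modulePasses.run(*module);
}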
@@ -421,8 +427,11 @@ IntrinsicsOpt::IntrinsicsOpt()
|
||||
blendInstructions.push_back(BlendInstruction(
|
||||
llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse41_blendvps),
|
||||
0xf, 0, 1, 2));
|
||||
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
|
||||
blendInstructions.push_back(BlendInstruction(
|
||||
m->module->getFunction("llvm.x86.avx.blendvps"), 0xff, 0, 1, 2));
|
||||
llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_blendv_ps_256),
|
||||
0xff, 0, 1, 2));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -469,8 +478,18 @@ lGetMask(llvm::Value *factor) {
|
||||
else if (llvm::isa<llvm::ConstantAggregateZero>(factor))
|
||||
return 0;
|
||||
else {
|
||||
#if 0
|
||||
llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(factor);
|
||||
if (ce != NULL) {
|
||||
llvm::TargetMachine *targetMachine = g->target.GetTargetMachine();
|
||||
const llvm::TargetData *td = targetMachine->getTargetData();
|
||||
llvm::Constant *c = llvm::ConstantFoldConstantExpression(ce, td);
|
||||
c->dump();
|
||||
factor = c;
|
||||
}
|
||||
// else we should be able to handle it above...
|
||||
assert(!llvm::isa<llvm::Constant>(factor));
|
||||
#endif
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
@@ -608,9 +627,10 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
|
||||
llvm::PointerType::get(returnType, 0),
|
||||
"ptr2vec", callInst);
|
||||
lCopyMetadata(castPtr, callInst);
|
||||
int align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
|
||||
llvm::Instruction *loadInst =
|
||||
new llvm::LoadInst(castPtr, "load", false /* not volatile */,
|
||||
0 /* align */, (llvm::Instruction *)NULL);
|
||||
align, (llvm::Instruction *)NULL);
|
||||
lCopyMetadata(loadInst, callInst);
|
||||
llvm::ReplaceInstWithInst(callInst, loadInst);
|
||||
modifiedAny = true;
|
||||
@@ -630,17 +650,21 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
|
||||
}
|
||||
else if (mask == 0xff) {
|
||||
// all lanes storing, so replace with a regular store
|
||||
llvm::Value *rvalue = callInst->getArgOperand(1);
|
||||
llvm::Value *rvalue = callInst->getArgOperand(2);
|
||||
llvm::Type *storeType = rvalue->getType();
|
||||
llvm::Value *castPtr =
|
||||
new llvm::BitCastInst(callInst->getArgOperand(0),
|
||||
llvm::PointerType::get(storeType, 0),
|
||||
"ptr2vec", callInst);
|
||||
lCopyMetadata(castPtr, callInst);
|
||||
llvm::Instruction *storeInst =
|
||||
|
||||
llvm::StoreInst *storeInst =
|
||||
new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
|
||||
int align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
|
||||
storeInst->setAlignment(align);
|
||||
lCopyMetadata(storeInst, callInst);
|
||||
llvm::ReplaceInstWithInst(callInst, storeInst);
|
||||
|
||||
modifiedAny = true;
|
||||
goto restart;
|
||||
}
|
||||
@@ -1416,15 +1440,12 @@ LowerMaskedStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
|
||||
llvm::Value *rvalue = callInst->getArgOperand(1);
|
||||
llvm::Value *mask = callInst->getArgOperand(2);
|
||||
|
||||
// On SSE, we need to choose between doing the load + blend + store
// trick, or serializing the masked store. On targets with a
// native masked store instruction, the implementations of
// __masked_store_blend_* should be the same as __masked_store_*,
// so this doesn't matter. On SSE, blending is generally more
// efficient and is always safe to do on stack-allocated values.(?)
bool doBlend = lIsStackVariablePointer(lvalue);
if (g->target.isa == Target::SSE4 || g->target.isa == Target::SSE2)
doBlend |= !g->opt.disableBlendedMaskedStores;
// We need to choose between doing the load + blend + store trick,
// or serializing the masked store. Even on targets with a native
// masked store instruction, this is preferable since it lets us
// keep values in registers rather than going out to the stack.
bool doBlend = (!g->opt.disableBlendedMaskedStores ||
lIsStackVariablePointer(lvalue));
|
||||
|
||||
// Generate the call to the appropriate masked store function and
|
||||
// replace the __pseudo_* one with it.
|
||||
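As a plain-C++ picture of the trade-off discussed above (illustration only; the real code operates on whole vectors via the __masked_store_* and __masked_store_blend_* builtins):

// load + blend + store: reads the destination first, so it must only be used
// when the full destination vector is known to be accessible (e.g. a stack
// variable, or when the option above allows it)
void masked_store_blend(float *dst, const float *src, const bool *mask, int width) {
    for (int i = 0; i < width; ++i) {
        float oldVal = dst[i];               // read even for inactive lanes
        dst[i] = mask[i] ? src[i] : oldVal;  // blend keeps inactive lanes unchanged
    }
}

// serialized masked store: only active lanes ever touch memory
void masked_store_serialized(float *dst, const float *src, const bool *mask, int width) {
    for (int i = 0; i < width; ++i)
        if (mask[i])
            dst[i] = src[i];
}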
@@ -1502,8 +1523,8 @@ static void lPrintVector(const char *info, llvm::Value *elements[ISPC_MAX_NVEC])
|
||||
|
||||
|
||||
/** Given an LLVM vector in vec, return a 'scalarized' version of the
|
||||
vector in the provided offsets[] array. For example, if the vector
|
||||
value passed in is:
|
||||
vector in the provided scalarizedVector[] array. For example, if the
|
||||
vector value passed in is:
|
||||
|
||||
add <4 x i32> %a_smear, <4 x i32> <4, 8, 12, 16>,
|
||||
|
||||
@@ -1524,28 +1545,39 @@ static void lPrintVector(const char *info, llvm::Value *elements[ISPC_MAX_NVEC])
|
||||
@param vec Vector to be scalarized
|
||||
@param scalarizedVector Array in which to store the individual vector
|
||||
elements
|
||||
@param vectorLength Number of elements in the given vector. (The
|
||||
passed scalarizedVector array must also be at least
|
||||
this length as well.)
|
||||
@returns True if the vector was successfully scalarized and
|
||||
the values in offsets[] are valid; false otherwise
|
||||
*/
|
||||
static bool
|
||||
lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC]) {
|
||||
lScalarizeVector(llvm::Value *vec, llvm::Value **scalarizedVector,
|
||||
int vectorLength) {
|
||||
// First initialize the values of scalarizedVector[] to NULL.
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
for (int i = 0; i < vectorLength; ++i)
|
||||
scalarizedVector[i] = NULL;
|
||||
|
||||
// It may be ok for the vector to be an undef vector; these come up for
|
||||
// example in shufflevector instructions. As long as elements of the
|
||||
// undef vector aren't referenced by the shuffle indices, this is fine.
|
||||
if (llvm::isa<llvm::UndefValue>(vec))
|
||||
return true;
|
||||
|
||||
// ConstantVectors are easy; just pull out the individual constant
|
||||
// element values
|
||||
llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(vec);
|
||||
if (cv != NULL) {
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
for (int i = 0; i < vectorLength; ++i)
|
||||
scalarizedVector[i] = cv->getOperand(i);
|
||||
return true;
|
||||
}
|
||||
|
||||
// It's also easy if it's just a vector of all zeros
|
||||
llvm::ConstantAggregateZero *caz = llvm::dyn_cast<llvm::ConstantAggregateZero>(vec);
|
||||
if (caz) {
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i)
|
||||
llvm::ConstantAggregateZero *caz =
|
||||
llvm::dyn_cast<llvm::ConstantAggregateZero>(vec);
|
||||
if (caz != NULL) {
|
||||
for (int i = 0; i < vectorLength; ++i)
|
||||
scalarizedVector[i] = LLVMInt32(0);
|
||||
return true;
|
||||
}
|
||||
@@ -1557,13 +1589,16 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
|
||||
// scalar values we return from here are synthesized with scalar
|
||||
// versions of the original vector binary operator
|
||||
llvm::Instruction::BinaryOps opcode = bo->getOpcode();
|
||||
llvm::Value *v0[ISPC_MAX_NVEC], *v1[ISPC_MAX_NVEC];
|
||||
llvm::Value **v0 =
|
||||
(llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));
|
||||
llvm::Value **v1 =
|
||||
(llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));
|
||||
|
||||
if (!lScalarizeVector(bo->getOperand(0), v0) ||
|
||||
!lScalarizeVector(bo->getOperand(1), v1))
|
||||
if (!lScalarizeVector(bo->getOperand(0), v0, vectorLength) ||
|
||||
!lScalarizeVector(bo->getOperand(1), v1, vectorLength))
|
||||
return false;
|
||||
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i) {
|
||||
for (int i = 0; i < vectorLength; ++i) {
|
||||
scalarizedVector[i] =
|
||||
llvm::BinaryOperator::Create(opcode, v0[i], v1[i], "flat_bop", bo);
|
||||
lCopyMetadata(scalarizedVector[i], bo);
|
||||
@@ -1588,7 +1623,7 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
|
||||
// value in scalarizedVector[] based on the value being inserted.
|
||||
while (ie != NULL) {
|
||||
uint64_t iOffset = lGetIntValue(ie->getOperand(2));
|
||||
assert((int)iOffset < g->target.vectorWidth);
|
||||
assert((int)iOffset < vectorLength);
|
||||
assert(scalarizedVector[iOffset] == NULL);
|
||||
|
||||
scalarizedVector[iOffset] = ie->getOperand(1);
|
||||
@@ -1602,15 +1637,17 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
|
||||
}
|
||||
|
||||
llvm::CastInst *ci = llvm::dyn_cast<llvm::CastInst>(vec);
|
||||
if (ci) {
|
||||
if (ci != NULL) {
|
||||
// Casts are similar to BinaryOperators in that we attempt to
|
||||
// scalarize the vector being cast and if successful, we apply
|
||||
// equivalent scalar cast operators to each of the values in the
|
||||
// scalarized vector.
|
||||
llvm::Instruction::CastOps op = ci->getOpcode();
|
||||
|
||||
llvm::Value *scalarizedTarget[ISPC_MAX_NVEC];
|
||||
if (!lScalarizeVector(ci->getOperand(0), scalarizedTarget))
|
||||
llvm::Value **scalarizedTarget =
|
||||
(llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));
|
||||
if (!lScalarizeVector(ci->getOperand(0), scalarizedTarget,
|
||||
vectorLength))
|
||||
return false;
|
||||
|
||||
LLVM_TYPE_CONST llvm::Type *destType = ci->getDestTy();
|
||||
@@ -1619,7 +1656,7 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
|
||||
assert(vectorDestType != NULL);
|
||||
LLVM_TYPE_CONST llvm::Type *elementType = vectorDestType->getElementType();
|
||||
|
||||
for (int i = 0; i < g->target.vectorWidth; ++i) {
|
||||
for (int i = 0; i < vectorLength; ++i) {
|
||||
scalarizedVector[i] =
|
||||
llvm::CastInst::Create(op, scalarizedTarget[i], elementType,
|
||||
"cast", ci);
|
||||
@@ -1629,16 +1666,11 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
|
||||
}
|
||||
|
||||
llvm::ShuffleVectorInst *svi = llvm::dyn_cast<llvm::ShuffleVectorInst>(vec);
|
||||
if (svi) {
|
||||
// Note that the code for shufflevector instructions is untested.
|
||||
// (We haven't yet had a case where it needs to run). Therefore,
|
||||
// an assert at the bottom of this routine will hit the first time
|
||||
// it runs as a reminder that this needs to be tested further.
|
||||
|
||||
if (svi != NULL) {
|
||||
LLVM_TYPE_CONST llvm::VectorType *svInstType =
|
||||
llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(svi->getType());
|
||||
assert(svInstType != NULL);
|
||||
assert((int)svInstType->getNumElements() == g->target.vectorWidth);
|
||||
assert((int)svInstType->getNumElements() == vectorLength);
|
||||
|
||||
// Scalarize the two vectors being shuffled. First figure out how
|
||||
// big they are.
|
||||
@@ -1653,58 +1685,90 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
|
||||
int n0 = vectorType0->getNumElements();
|
||||
int n1 = vectorType1->getNumElements();
|
||||
|
||||
// FIXME: It's actually totally legitimate for these two to have
|
||||
// different sizes; the final result just needs to have the native
|
||||
// vector width. To handle this, not only do we need to
|
||||
// potentially dynamically allocate space for the arrays passed
|
||||
// into lScalarizeVector, but we need to change the rest of its
|
||||
// implementation to not key off g->target.vectorWidth everywhere
|
||||
// to get the sizes of the arrays to iterate over, etc.
|
||||
assert(n0 == g->target.vectorWidth && n1 == g->target.vectorWidth);
|
||||
|
||||
// Go ahead and scalarize the two input vectors now.
|
||||
// FIXME: it's ok if some or all of the values of these two vectors
|
||||
// have undef values, so long as we don't try to access undef
|
||||
// values with the vector indices provided to the instruction.
|
||||
// Should fix lScalarizeVector so that it doesn't return false in
|
||||
// this case and just leaves the elements of the arrays with undef
|
||||
// values as NULL.
|
||||
llvm::Value *v0[ISPC_MAX_NVEC], *v1[ISPC_MAX_NVEC];
|
||||
if (!lScalarizeVector(svi->getOperand(0), v0) ||
|
||||
!lScalarizeVector(svi->getOperand(1), v1))
|
||||
llvm::Value **v0 = (llvm::Value **)alloca(n0 * sizeof(llvm::Value *));
|
||||
llvm::Value **v1 = (llvm::Value **)alloca(n1 * sizeof(llvm::Value *));
|
||||
|
||||
if (!lScalarizeVector(svi->getOperand(0), v0, n0) ||
|
||||
!lScalarizeVector(svi->getOperand(1), v1, n1))
|
||||
return false;
|
||||
|
||||
llvm::ConstantVector *shuffleIndicesVector =
|
||||
llvm::dyn_cast<llvm::ConstantVector>(svi->getOperand(2));
|
||||
// I think this has to be a ConstantVector. If this ever hits,
|
||||
// we'll dig into what we got instead and figure out how to handle
|
||||
// that...
|
||||
assert(shuffleIndicesVector != NULL);
|
||||
|
||||
// Get the integer indices for each element of the returned vector
|
||||
llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> shuffleIndices;
|
||||
shuffleIndicesVector->getVectorElements(shuffleIndices);
|
||||
assert((int)shuffleIndices.size() == g->target.vectorWidth);
|
||||
|
||||
// And loop over the indices, setting the i'th element of the
|
||||
// result vector with the source vector element that corresponds to
|
||||
// the i'th shuffle index value.
|
||||
for (unsigned int i = 0; i < shuffleIndices.size(); ++i) {
|
||||
if (!llvm::isa<llvm::ConstantInt>(shuffleIndices[i]))
|
||||
// I'm not sure when this case would ever happen, though..
|
||||
return false;
|
||||
int offset = (int)lGetIntValue(shuffleIndices[i]);
|
||||
assert(offset >= 0 && offset < n0+n1);
|
||||
|
||||
if (offset < n0)
|
||||
// Offsets from 0 to n0-1 index into the first vector
|
||||
scalarizedVector[i] = v0[offset];
|
||||
else
|
||||
// And offsets from n0 to (n0+n1-1) index into the second
|
||||
// vector
|
||||
scalarizedVector[i] = v1[offset - n0];
|
||||
llvm::ConstantAggregateZero *caz =
|
||||
llvm::dyn_cast<llvm::ConstantAggregateZero>(svi->getOperand(2));
|
||||
if (caz != NULL) {
|
||||
for (int i = 0; i < vectorLength; ++i)
|
||||
scalarizedVector[i] = v0[0];
|
||||
}
|
||||
else {
|
||||
llvm::ConstantVector *shuffleIndicesVector =
|
||||
llvm::dyn_cast<llvm::ConstantVector>(svi->getOperand(2));
|
||||
// I think this has to be a ConstantVector. If this ever hits,
|
||||
// we'll dig into what we got instead and figure out how to handle
|
||||
// that...
|
||||
assert(shuffleIndicesVector != NULL);
|
||||
|
||||
// Get the integer indices for each element of the returned vector
|
||||
llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> shuffleIndices;
|
||||
shuffleIndicesVector->getVectorElements(shuffleIndices);
|
||||
assert((int)shuffleIndices.size() == vectorLength);
|
||||
|
||||
// And loop over the indices, setting the i'th element of the
|
||||
// result vector with the source vector element that corresponds to
|
||||
// the i'th shuffle index value.
|
||||
for (unsigned int i = 0; i < shuffleIndices.size(); ++i) {
|
||||
// I'm not sure when this case would ever happen, though..
|
||||
assert(llvm::isa<llvm::ConstantInt>(shuffleIndices[i]));
|
||||
|
||||
int offset = (int)lGetIntValue(shuffleIndices[i]);
|
||||
assert(offset >= 0 && offset < n0+n1);
|
||||
|
||||
if (offset < n0)
|
||||
// Offsets from 0 to n0-1 index into the first vector
|
||||
scalarizedVector[i] = v0[offset];
|
||||
else
|
||||
// And offsets from n0 to (n0+n1-1) index into the second
|
||||
// vector
|
||||
scalarizedVector[i] = v1[offset - n0];
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
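For reference, the index mapping used in the shufflevector handling above can be summarized with a small standalone C++ sketch (types and names are illustrative, not the compiler's own):

#include <cassert>
#include <vector>

// Hypothetical helper: apply a constant shuffle mask to two scalarized vectors.
// Indices 0..n0-1 select from the first operand; n0..n0+n1-1 from the second.
std::vector<int> applyShuffleMask(const std::vector<int> &v0,
                                  const std::vector<int> &v1,
                                  const std::vector<int> &mask) {
    int n0 = (int)v0.size(), n1 = (int)v1.size();
    std::vector<int> result(mask.size());
    for (size_t i = 0; i < mask.size(); ++i) {
        int offset = mask[i];
        assert(offset >= 0 && offset < n0 + n1);
        result[i] = (offset < n0) ? v0[offset] : v1[offset - n0];
    }
    return result;
}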
llvm::LoadInst *li = llvm::dyn_cast<llvm::LoadInst>(vec);
|
||||
if (li != NULL) {
|
||||
llvm::Value *baseAddr = li->getOperand(0);
|
||||
llvm::Value *baseInt = new llvm::PtrToIntInst(baseAddr, LLVMTypes::Int64Type,
|
||||
"base2int", li);
|
||||
lCopyMetadata(baseInt, li);
|
||||
|
||||
LLVM_TYPE_CONST llvm::PointerType *ptrType =
|
||||
llvm::dyn_cast<llvm::PointerType>(baseAddr->getType());
|
||||
assert(ptrType != NULL);
|
||||
LLVM_TYPE_CONST llvm::VectorType *vecType =
|
||||
llvm::dyn_cast<llvm::VectorType>(ptrType->getElementType());
|
||||
assert(vecType != NULL);
|
||||
LLVM_TYPE_CONST llvm::Type *elementType = vecType->getElementType();
|
||||
uint64_t elementSize;
|
||||
bool sizeKnown = lSizeOfIfKnown(elementType, &elementSize);
|
||||
assert(sizeKnown == true);
|
||||
|
||||
LLVM_TYPE_CONST llvm::Type *eltPtrType = llvm::PointerType::get(elementType, 0);
|
||||
|
||||
for (int i = 0; i < vectorLength; ++i) {
|
||||
llvm::Value *intPtrOffset =
|
||||
llvm::BinaryOperator::Create(llvm::Instruction::Add, baseInt,
|
||||
LLVMInt64(i * elementSize), "baseoffset",
|
||||
li);
|
||||
lCopyMetadata(intPtrOffset, li);
|
||||
llvm::Value *scalarLoadPtr =
|
||||
new llvm::IntToPtrInst(intPtrOffset, eltPtrType, "int2ptr", li);
|
||||
lCopyMetadata(scalarLoadPtr, li);
|
||||
|
||||
llvm::Instruction *scalarLoad =
|
||||
new llvm::LoadInst(scalarLoadPtr, "loadelt", li);
|
||||
lCopyMetadata(scalarLoad, li);
|
||||
scalarizedVector[i] = scalarLoad;
|
||||
}
|
||||
FATAL("the above code is untested so far; check now that it's actually running");
|
||||
return true;
|
||||
}
|
||||
|
||||
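For reference, a rough C++ analogue of the scalarized load emitted above: one scalar load per lane at base + i * elementSize (the float element type and the names are assumptions for illustration only):

#include <cstdint>
#include <cstring>

// Hypothetical analogue of the ptrtoint/add/inttoptr/load sequence: each lane
// is read through its own element-sized offset from the vector's base address.
void scalarizedLoad(const void *base, float out[], int vectorLength) {
    const uint8_t *p = static_cast<const uint8_t *>(base);
    for (int i = 0; i < vectorLength; ++i)
        std::memcpy(&out[i], p + i * sizeof(float), sizeof(float));
}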
@@ -2116,11 +2180,18 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
|
||||
if (ce && ce->getOpcode() == llvm::Instruction::BitCast)
|
||||
base = ce->getOperand(0);
|
||||
|
||||
// Try to out the offsets; the i'th element of the offsetElements
|
||||
// array should be an i32 with the value of the offset for the i'th
|
||||
// vector lane. This may fail; if so, just give up.
|
||||
// Try to find out the offsets; the i'th element of the
|
||||
// offsetElements array should be an i32 with the value of the
|
||||
// offset for the i'th vector lane. This may fail; if so, just
|
||||
// give up.
|
||||
llvm::Value *vecValue = callInst->getArgOperand(1);
|
||||
LLVM_TYPE_CONST llvm::VectorType *vt =
|
||||
llvm::dyn_cast<llvm::VectorType>(vecValue->getType());
|
||||
assert(vt != NULL);
|
||||
int vecLength = vt->getNumElements();
|
||||
assert(vecLength == g->target.vectorWidth);
|
||||
llvm::Value *offsetElements[ISPC_MAX_NVEC];
|
||||
if (!lScalarizeVector(callInst->getArgOperand(1), offsetElements))
|
||||
if (!lScalarizeVector(vecValue, offsetElements, vecLength))
|
||||
continue;
|
||||
|
||||
llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 2 : 3);
|
||||
@@ -2497,7 +2568,7 @@ llvm::RegisterPass<MakeInternalFuncsStaticPass>
|
||||
bool
|
||||
MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
|
||||
const char *names[] = {
|
||||
"__do_print",
|
||||
"__do_print", "__fast_masked_vload", "__num_cores",
|
||||
"__gather_base_offsets_i8", "__gather_base_offsets_i16",
|
||||
"__gather_base_offsets_i32", "__gather_base_offsets_i64",
|
||||
"__gather_elt_8", "__gather_elt_16",
|
||||
|
||||
75
parse.yy
@@ -165,7 +165,7 @@ static const char *lParamListTokens[] = {
|
||||
%token TOKEN_CBREAK TOKEN_CCONTINUE TOKEN_CRETURN TOKEN_SYNC TOKEN_PRINT
|
||||
|
||||
%type <expr> primary_expression postfix_expression
|
||||
%type <expr> unary_expression cast_expression
|
||||
%type <expr> unary_expression cast_expression launch_expression
|
||||
%type <expr> multiplicative_expression additive_expression shift_expression
|
||||
%type <expr> relational_expression equality_expression and_expression
|
||||
%type <expr> exclusive_or_expression inclusive_or_expression
|
||||
@@ -177,6 +177,7 @@ static const char *lParamListTokens[] = {
|
||||
%type <stmt> statement labeled_statement compound_statement for_init_statement
|
||||
%type <stmt> expression_statement selection_statement iteration_statement
|
||||
%type <stmt> jump_statement statement_list declaration_statement print_statement
|
||||
%type <stmt> sync_statement
|
||||
|
||||
%type <declaration> declaration parameter_declaration
|
||||
%type <declarators> init_declarator_list
|
||||
@@ -221,7 +222,7 @@ primary_expression
|
||||
else {
|
||||
std::vector<Symbol *> *funs = m->symbolTable->LookupFunction(name);
|
||||
if (funs)
|
||||
$$ = new FunctionSymbolExpr(funs, @1);
|
||||
$$ = new FunctionSymbolExpr(name, funs, @1);
|
||||
}
|
||||
if ($$ == NULL) {
|
||||
std::vector<std::string> alternates =
|
||||
@@ -256,18 +257,32 @@ primary_expression
|
||||
| '(' expression ')' { $$ = $2; }
|
||||
;
|
||||
|
||||
launch_expression
|
||||
: TOKEN_LAUNCH '<' postfix_expression '(' argument_expression_list ')' '>'
|
||||
{
|
||||
ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @3);
|
||||
$$ = new FunctionCallExpr($3, $5, @3, true, oneExpr);
|
||||
}
|
||||
| TOKEN_LAUNCH '<' postfix_expression '(' ')' '>'
|
||||
{
|
||||
ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @3);
|
||||
$$ = new FunctionCallExpr($3, new ExprList(@3), @3, true, oneExpr);
|
||||
}
|
||||
| TOKEN_LAUNCH '[' expression ']' '<' postfix_expression '(' argument_expression_list ')' '>'
|
||||
{ $$ = new FunctionCallExpr($6, $8, @6, true, $3); }
|
||||
| TOKEN_LAUNCH '[' expression ']' '<' postfix_expression '(' ')' '>'
|
||||
{ $$ = new FunctionCallExpr($6, new ExprList(@6), @6, true, $3); }
|
||||
;
|
||||
|
||||
postfix_expression
|
||||
: primary_expression
|
||||
| postfix_expression '[' expression ']'
|
||||
{ $$ = new IndexExpr($1, $3, @1); }
|
||||
| postfix_expression '(' ')'
|
||||
{ $$ = new FunctionCallExpr($1, new ExprList(@1), @1, false); }
|
||||
{ $$ = new FunctionCallExpr($1, new ExprList(@1), @1); }
|
||||
| postfix_expression '(' argument_expression_list ')'
|
||||
{ $$ = new FunctionCallExpr($1, $3, @1, false); }
|
||||
| TOKEN_LAUNCH '<' postfix_expression '(' argument_expression_list ')' '>'
|
||||
{ $$ = new FunctionCallExpr($3, $5, @3, true); }
|
||||
| TOKEN_LAUNCH '<' postfix_expression '(' ')' '>'
|
||||
{ $$ = new FunctionCallExpr($3, new ExprList(@3), @3, true); }
|
||||
{ $$ = new FunctionCallExpr($1, $3, @1); }
|
||||
| launch_expression
|
||||
| postfix_expression '.' TOKEN_IDENTIFIER
|
||||
{ $$ = MemberExpr::create($1, yytext, @1, @3); }
|
||||
/* | postfix_expression TOKEN_PTR_OP TOKEN_IDENTIFIER
|
||||
@@ -436,8 +451,6 @@ assignment_expression
|
||||
|
||||
expression
|
||||
: assignment_expression
|
||||
| TOKEN_SYNC
|
||||
{ $$ = new SyncExpr(@1); }
|
||||
| expression ',' assignment_expression
|
||||
{ $$ = new BinaryExpr(BinaryExpr::Comma, $1, $3, @2); }
|
||||
;
|
||||
@@ -928,9 +941,13 @@ parameter_list
|
||||
builtinTokens.push_back(*token);
|
||||
++token;
|
||||
}
|
||||
std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
|
||||
std::string alts = lGetAlternates(alternates);
|
||||
Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
|
||||
if (strlen(yytext) == 0)
|
||||
Error(@1, "Syntax error--premature end of file.");
|
||||
else {
|
||||
std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
|
||||
std::string alts = lGetAlternates(alternates);
|
||||
Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
|
||||
}
|
||||
$$ = NULL;
|
||||
}
|
||||
;
|
||||
@@ -1019,6 +1036,7 @@ statement
|
||||
| jump_statement
|
||||
| declaration_statement
|
||||
| print_statement
|
||||
| sync_statement
|
||||
| error
|
||||
{
|
||||
std::vector<std::string> builtinTokens;
|
||||
@@ -1027,9 +1045,13 @@ statement
|
||||
builtinTokens.push_back(*token);
|
||||
++token;
|
||||
}
|
||||
std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
|
||||
std::string alts = lGetAlternates(alternates);
|
||||
Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
|
||||
if (strlen(yytext) == 0)
|
||||
Error(@1, "Syntax error--premature end of file.");
|
||||
else {
|
||||
std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
|
||||
std::string alts = lGetAlternates(alternates);
|
||||
Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
|
||||
}
|
||||
$$ = NULL;
|
||||
}
|
||||
;
|
||||
@@ -1155,6 +1177,11 @@ jump_statement
|
||||
{ $$ = new ReturnStmt($2, true, @1); }
|
||||
;
|
||||
|
||||
sync_statement
|
||||
: TOKEN_SYNC
|
||||
{ $$ = new ExprStmt(new SyncExpr(@1), @1); }
|
||||
;
|
||||
|
||||
print_statement
|
||||
: TOKEN_PRINT '(' string_constant ')'
|
||||
{
|
||||
@@ -1177,9 +1204,13 @@ translation_unit
|
||||
builtinTokens.push_back(*token);
|
||||
++token;
|
||||
}
|
||||
std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
|
||||
std::string alts = lGetAlternates(alternates);
|
||||
Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
|
||||
if (strlen(yytext) == 0)
|
||||
Error(@1, "Syntax error--premature end of file.");
|
||||
else {
|
||||
std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
|
||||
std::string alts = lGetAlternates(alternates);
|
||||
Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
|
||||
}
|
||||
}
|
||||
;
|
||||
|
||||
@@ -1266,6 +1297,12 @@ static void lAddThreadIndexCountToSymbolTable(SourcePos pos) {
|
||||
|
||||
Symbol *threadCountSym = new Symbol("threadCount", pos, AtomicType::UniformConstUInt32);
|
||||
m->symbolTable->AddVariable(threadCountSym);
|
||||
|
||||
Symbol *taskIndexSym = new Symbol("taskIndex", pos, AtomicType::UniformConstUInt32);
|
||||
m->symbolTable->AddVariable(taskIndexSym);
|
||||
|
||||
Symbol *taskCountSym = new Symbol("taskCount", pos, AtomicType::UniformConstUInt32);
|
||||
m->symbolTable->AddVariable(taskCountSym);
|
||||
}
|
||||
|
||||
|
||||
|
||||
218
run_tests.py
Executable file
@@ -0,0 +1,218 @@
|
||||
#!/usr/bin/python
|
||||
|
||||
# test-running driver for ispc
|
||||
|
||||
# TODO: windows support (mostly should be calling CL.exe rather than gcc
|
||||
# for static linking?)
|
||||
|
||||
from optparse import OptionParser
|
||||
import multiprocessing
|
||||
from ctypes import c_int
|
||||
import os
|
||||
import sys
|
||||
import glob
|
||||
import re
|
||||
import signal
|
||||
import random
|
||||
import string
|
||||
import mutex
|
||||
import subprocess
|
||||
import platform
|
||||
|
||||
parser = OptionParser()
|
||||
parser.add_option("-r", "--random-shuffle", dest="random", help="Randomly order tests",
|
||||
default=False, action="store_true")
|
||||
parser.add_option("-s", "--static-exe", dest="static_exe",
|
||||
help="Create and run a regular executable for each test (rather than using the LLVM JIT).",
|
||||
default=False, action="store_true")
|
||||
parser.add_option('-t', '--target', dest='target',
|
||||
help='Set compilation target (sse2, sse4, sse4x2, avx, avx-x2)',
|
||||
default="sse4")
|
||||
parser.add_option('-a', '--arch', dest='arch',
|
||||
help='Set architecture (x86, x86-64)',
|
||||
default="x86-64")
|
||||
parser.add_option('-o', '--no-opt', dest='no_opt', help='Disable optimization',
|
||||
default=False, action="store_true")
|
||||
|
||||
(options, args) = parser.parse_args()
|
||||
|
||||
# if no specific test files are specified, run all of the tests in tests/
|
||||
# and failing_tests/
|
||||
if len(args) == 0:
|
||||
files = glob.glob("tests/*ispc") + glob.glob("failing_tests/*ispc")
|
||||
else:
|
||||
files = args
|
||||
|
||||
# randomly shuffle the tests if asked to do so
|
||||
if (options.random):
|
||||
random.seed()
|
||||
random.shuffle(files)
|
||||
|
||||
# counter
|
||||
total_tests = 0
|
||||
finished_tests_counter = multiprocessing.Value(c_int)
|
||||
|
||||
# We'd like to use the Lock class from the multiprocessing package to
|
||||
# serialize accesses to finished_tests_counter. Unfortunately, the version of
|
||||
# python that ships with OSX 10.5 has this bug:
|
||||
# http://bugs.python.org/issue5261. Therefore, we use the (deprecated but
|
||||
# still available) mutex class.
|
||||
#finished_tests_counter_lock = multiprocessing.Lock()
|
||||
finished_tests_mutex = mutex.mutex()
|
||||
|
||||
# utility routine to print an update on the number of tests that have been
|
||||
# finished. Should be called with the mutex (or lock) held..
|
||||
def update_progress(fn):
|
||||
finished_tests_counter.value = finished_tests_counter.value + 1
|
||||
progress_str = " Done %d / %d [%s]" % (finished_tests_counter.value, total_tests, fn)
|
||||
# spaces to clear out detritus from previous printing...
|
||||
for x in range(30):
|
||||
progress_str += ' '
|
||||
progress_str += '\r'
|
||||
sys.stdout.write(progress_str)
|
||||
sys.stdout.flush()
|
||||
finished_tests_mutex.unlock()
|
||||
|
||||
fnull = open(os.devnull, 'w')
|
||||
|
||||
# run the commands in cmd_list
|
||||
def run_cmds(cmd_list, filename, expect_failure):
|
||||
for cmd in cmd_list:
|
||||
if expect_failure:
|
||||
failed = (subprocess.call(cmd, shell = True, stdout = fnull, stderr = fnull) != 0)
|
||||
else:
|
||||
failed = (os.system(cmd) != 0)
|
||||
if failed:
|
||||
break
|
||||
|
||||
surprise = ((expect_failure and not failed) or (not expect_failure and failed))
|
||||
if surprise == True:
|
||||
print "Test %s %s " % \
|
||||
(filename, "unexpectedly passed" if expect_failure else "failed")
|
||||
return surprise
|
||||
|
||||
|
||||
# pull tests to run from the given queue and run them. Multiple copies of
|
||||
# this function will be running in parallel across all of the CPU cores of
|
||||
# the system.
|
||||
def run_tasks_from_queue(queue):
|
||||
error_count = 0
|
||||
while True:
|
||||
filename = queue.get()
|
||||
if (filename == 'STOP'):
|
||||
sys.exit(error_count)
|
||||
|
||||
# do we expect this test to fail?
|
||||
should_fail = (filename.find("failing_") != -1)
|
||||
|
||||
if options.static_exe == True:
|
||||
# if the user wants us to build a static executable to run for
|
||||
# this test, we need to figure out the signature of the test
|
||||
# function that this test has.
|
||||
sig2def = { "f_v(" : 0, "f_f(" : 1, "f_fu(" : 2, "f_fi(" : 3,
|
||||
"f_du(" : 4, "f_duf(" : 5, "f_di(" : 6 }
|
||||
file = open(filename, 'r')
|
||||
match = -1
|
||||
for line in file:
|
||||
# look for lines with 'export'...
|
||||
if line.find("export") == -1:
|
||||
continue
|
||||
# one of them should have a function with one of the
|
||||
# declarations in sig2def
|
||||
for pattern, ident in sig2def.items():
|
||||
if line.find(pattern) != -1:
|
||||
match = ident
|
||||
break
|
||||
file.close()
|
||||
if match == -1:
|
||||
print "Fatal error: unable to find function signature in test %s" % filename
|
||||
error_count += 1
|
||||
else:
|
||||
obj_name = "%s.o" % filename
|
||||
exe_name = "%s.run" % filename
|
||||
ispc_cmd = "ispc --woff %s -o %s --arch=%s --target=%s" % \
|
||||
(filename, obj_name, options.arch, options.target)
|
||||
if options.no_opt:
|
||||
ispc_cmd += " -O0"
|
||||
if options.arch == 'x86':
|
||||
gcc_arch = '-m32'
|
||||
else:
|
||||
gcc_arch = '-m64'
|
||||
gcc_cmd = "g++ %s test_static.cpp -DTEST_SIG=%d %s.o -o %s" % \
|
||||
(gcc_arch, match, filename, exe_name)
|
||||
if platform.system() == 'Darwin':
|
||||
gcc_cmd += ' -Wl,-no_pie'
|
||||
if should_fail:
|
||||
gcc_cmd += " -DEXPECT_FAILURE"
|
||||
|
||||
# compile the ispc code, make the executable, and run it...
|
||||
error_count += run_cmds([ispc_cmd, gcc_cmd, exe_name], filename, should_fail)
|
||||
|
||||
# clean up after running the test
|
||||
try:
|
||||
os.unlink(exe_name)
|
||||
os.unlink(obj_name)
|
||||
except:
|
||||
None
|
||||
else:
|
||||
# otherwise we'll use ispc_test + the LLVM JIT to run the test
|
||||
bitcode_file = "%s.bc" % filename
|
||||
compile_cmd = "ispc --woff --emit-llvm %s --target=%s -o %s" % \
|
||||
(filename, options.target, bitcode_file)
|
||||
if options.no_opt:
|
||||
compile_cmd += " -O0"
|
||||
test_cmd = "ispc_test %s" % bitcode_file
|
||||
|
||||
error_count += run_cmds([compile_cmd, test_cmd], filename, should_fail)
|
||||
|
||||
try:
|
||||
os.unlink(bitcode_file)
|
||||
except:
|
||||
None
|
||||
|
||||
# If not for http://bugs.python.org/issue5261 on OSX, we'd like to do this:
|
||||
#with finished_tests_counter_lock:
|
||||
#update_progress(filename)
|
||||
# but instead we do this...
|
||||
finished_tests_mutex.lock(update_progress, filename)
|
||||
|
||||
|
||||
task_threads = []
|
||||
|
||||
def sigint(signum, frame):
|
||||
for t in task_threads:
|
||||
t.terminate()
|
||||
sys.exit(1)
|
||||
|
||||
if __name__ == '__main__':
|
||||
nthreads = multiprocessing.cpu_count()
|
||||
total_tests = len(files)
|
||||
print "Found %d CPUs. Running %d tests." % (nthreads, total_tests)
|
||||
|
||||
# put each of the test filenames into a queue
|
||||
q = multiprocessing.Queue()
|
||||
for fn in files:
|
||||
q.put(fn)
|
||||
for x in range(nthreads):
|
||||
q.put('STOP')
|
||||
|
||||
# need to catch sigint so that we can terminate all of the tasks if
|
||||
# we're interrupted
|
||||
signal.signal(signal.SIGINT, sigint)
|
||||
|
||||
# launch jobs to run tests
|
||||
for x in range(nthreads):
|
||||
t = multiprocessing.Process(target=run_tasks_from_queue, args=(q,))
|
||||
task_threads.append(t)
|
||||
t.start()
|
||||
|
||||
# wait for them to all finish and then return the number that failed
|
||||
# (i.e. return 0 if all is ok)
|
||||
error_count = 0
|
||||
for t in task_threads:
|
||||
t.join()
|
||||
error_count += t.exitcode
|
||||
print
|
||||
if error_count > 0:
|
||||
print "%d / %d tests FAILED!" % (error_count, total_tests)
|
||||
sys.exit(error_count)
|
||||
10
run_tests.sh
@@ -23,6 +23,12 @@ EOF
|
||||
esac
|
||||
done
|
||||
|
||||
ISPC_ARCH=x86-64
|
||||
if [[ $OS == "Windows_NT" ]]; then
|
||||
ISPC_ARCH=x86
|
||||
fi
|
||||
ISPC_ARGS="--target=$target --arch=$ISPC_ARCH -O2 --woff"
|
||||
|
||||
shift $(( $OPTIND - 1 ))
|
||||
if [[ "$1" > 0 ]]; then
|
||||
while [[ "$1" > 0 ]]; do
|
||||
@@ -31,7 +37,7 @@ if [[ "$1" > 0 ]]; then
|
||||
echo Running test $i
|
||||
|
||||
bc=${i%%ispc}bc
|
||||
ispc -O2 $i -woff -o $bc --emit-llvm --target=$target
|
||||
ispc $ISPC_ARGS $i -o $bc --emit-llvm
|
||||
if [[ $? != 0 ]]; then
|
||||
surprises=1
|
||||
echo Test $i FAILED ispc compile
|
||||
@@ -55,7 +61,7 @@ else
|
||||
fi
|
||||
(( counter++ ))
|
||||
bc=${i%%ispc}bc
|
||||
ispc -O2 $i -woff -o $bc --emit-llvm --target=$target
|
||||
ispc $ISPC_ARGS $i -o $bc --emit-llvm
|
||||
if [[ $? != 0 ]]; then
|
||||
surprises=1
|
||||
echo Test $i FAILED ispc compile
|
||||
|
||||
335
stdlib.ispc
@@ -315,6 +315,39 @@ static inline uniform int lanemask() {
|
||||
return __movmsk(__mask);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Prefetching
|
||||
|
||||
#define PREFETCHES(NAME, TYPE) \
|
||||
static inline void prefetch_l1(const reference TYPE ptr) { \
|
||||
__prefetch_read_1_##NAME##_refsconst(ptr); \
|
||||
} \
|
||||
static inline void prefetch_l2(const reference TYPE ptr) { \
|
||||
__prefetch_read_2_##NAME##_refsconst(ptr); \
|
||||
} \
|
||||
static inline void prefetch_l3(const reference TYPE ptr) { \
|
||||
__prefetch_read_3_##NAME##_refsconst(ptr); \
|
||||
} \
|
||||
static inline void prefetch_nt(const reference TYPE ptr) { \
|
||||
__prefetch_read_nt_##NAME##_refsconst(ptr); \
|
||||
}
|
||||
|
||||
PREFETCHES(uniform_int8, uniform int8)
|
||||
PREFETCHES(uniform_int16, uniform int16)
|
||||
PREFETCHES(uniform_int32, uniform int32)
|
||||
PREFETCHES(uniform_int64, uniform int64)
|
||||
PREFETCHES(uniform_float, uniform float)
|
||||
PREFETCHES(uniform_double, uniform double)
|
||||
|
||||
PREFETCHES(varying_int8, int8)
|
||||
PREFETCHES(varying_int16, int16)
|
||||
PREFETCHES(varying_int32, int32)
|
||||
PREFETCHES(varying_int64, int64)
|
||||
PREFETCHES(varying_float, float)
|
||||
PREFETCHES(varying_double, double)
|
||||
|
||||
#undef PREFETCHES
|
||||
|
||||
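For reference, a rough C++/GCC analogue of the wrappers this macro generates, one per cache level; the mapping of cache level onto the __builtin_prefetch locality hint is an assumption for illustration, not taken from the patch:

// Hypothetical scalar counterparts of prefetch_l1/l2/l3/nt for float pointers.
static inline void prefetch_l1(const float *ptr) { __builtin_prefetch(ptr, 0, 3); }
static inline void prefetch_l2(const float *ptr) { __builtin_prefetch(ptr, 0, 2); }
static inline void prefetch_l3(const float *ptr) { __builtin_prefetch(ptr, 0, 1); }
static inline void prefetch_nt(const float *ptr) { __builtin_prefetch(ptr, 0, 0); }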
///////////////////////////////////////////////////////////////////////////
|
||||
// Horizontal ops / reductions
|
||||
|
||||
@@ -336,7 +369,7 @@ static inline uniform float reduce_min(float v) {
|
||||
static inline uniform float reduce_max(float v) {
|
||||
// For the lanes where the mask is off, replace the given value with
|
||||
// negative infinity, so that it doesn't affect the result.
|
||||
const uniform int iflt_neg_max = 0xff800000; // -infinity
|
||||
const int iflt_neg_max = 0xff800000; // -infinity
|
||||
// Must use __floatbits_varying_int32, not floatbits(), since with the
|
||||
// latter the current mask enters into the returned result...
|
||||
return __reduce_max_float(__mask ? v : __floatbits_varying_int32(iflt_neg_max));
|
||||
@@ -394,7 +427,7 @@ static inline uniform double reduce_min(double v) {
|
||||
}
|
||||
|
||||
static inline uniform double reduce_max(double v) {
|
||||
const uniform int64 iflt_neg_max = 0xfff0000000000000; // -infinity
|
||||
const int64 iflt_neg_max = 0xfff0000000000000; // -infinity
|
||||
// Must use __doublebits_varying_int64, not doublebits(), since with the
|
||||
// latter the current mask enters into the returned result...
|
||||
return __reduce_max_double(__mask ? v : __doublebits_varying_int64(iflt_neg_max));
|
||||
@@ -438,29 +471,110 @@ static inline uniform unsigned int64 reduce_max(unsigned int64 v) {
|
||||
return __reduce_max_uint64(__mask ? v : 0);
|
||||
}
|
||||
|
||||
#define REDUCE_EQUAL(TYPE, FUNCTYPE, MASKTYPE) \
|
||||
static inline uniform bool reduce_equal(TYPE v) { \
|
||||
uniform TYPE unusedValue; \
|
||||
return __reduce_equal_##FUNCTYPE(v, unusedValue, (MASKTYPE)__mask); \
|
||||
} \
|
||||
static inline uniform bool reduce_equal(TYPE v, reference uniform TYPE value) { \
|
||||
return __reduce_equal_##FUNCTYPE(v, value, (MASKTYPE)__mask); \
|
||||
}
|
||||
|
||||
REDUCE_EQUAL(int32, int32, int32)
|
||||
REDUCE_EQUAL(unsigned int32, int32, unsigned int32)
|
||||
REDUCE_EQUAL(float, float, int32)
|
||||
REDUCE_EQUAL(int64, int64, int32)
|
||||
REDUCE_EQUAL(unsigned int64, int64, unsigned int32)
|
||||
REDUCE_EQUAL(double, double, int32)
|
||||
|
||||
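For reference, a small C++ sketch of the semantics reduce_equal() provides: true when every active lane holds the same value, with that value optionally returned (the empty-mask behavior below is a choice made for the sketch, not taken from the patch):

// Hypothetical scalar model: 'active' plays the role of the execution mask.
bool reduceEqual(const int values[], const bool active[], int width, int *sameValue) {
    bool seen = false;
    int candidate = 0;
    for (int i = 0; i < width; ++i) {
        if (!active[i])
            continue;
        if (!seen) { candidate = values[i]; seen = true; }
        else if (values[i] != candidate) return false;
    }
    if (seen && sameValue != nullptr)
        *sameValue = candidate;
    return seen;   // false when no lane is active (sketch's choice)
}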
static int32 exclusive_scan_add(int32 v) {
|
||||
return __exclusive_scan_add_i32(v, (int32)__mask);
|
||||
}
|
||||
|
||||
static unsigned int32 exclusive_scan_add(unsigned int32 v) {
|
||||
return __exclusive_scan_add_i32(v, __mask);
|
||||
}
|
||||
|
||||
static float exclusive_scan_add(float v) {
|
||||
return __exclusive_scan_add_float(v, __mask);
|
||||
}
|
||||
|
||||
static int64 exclusive_scan_add(int64 v) {
|
||||
return __exclusive_scan_add_i64(v, (int32)__mask);
|
||||
}
|
||||
|
||||
static unsigned int64 exclusive_scan_add(unsigned int64 v) {
|
||||
return __exclusive_scan_add_i64(v, __mask);
|
||||
}
|
||||
|
||||
static double exclusive_scan_add(double v) {
|
||||
return __exclusive_scan_add_double(v, __mask);
|
||||
}
|
||||
|
||||
static int32 exclusive_scan_and(int32 v) {
|
||||
return __exclusive_scan_and_i32(v, (int32)__mask);
|
||||
}
|
||||
|
||||
static unsigned int32 exclusive_scan_and(unsigned int32 v) {
|
||||
return __exclusive_scan_and_i32(v, __mask);
|
||||
}
|
||||
|
||||
static int64 exclusive_scan_and(int64 v) {
|
||||
return __exclusive_scan_and_i64(v, (int32)__mask);
|
||||
}
|
||||
|
||||
static unsigned int64 exclusive_scan_and(unsigned int64 v) {
|
||||
return __exclusive_scan_and_i64(v, __mask);
|
||||
}
|
||||
|
||||
static int32 exclusive_scan_or(int32 v) {
|
||||
return __exclusive_scan_or_i32(v, (int32)__mask);
|
||||
}
|
||||
|
||||
static unsigned int32 exclusive_scan_or(unsigned int32 v) {
|
||||
return __exclusive_scan_or_i32(v, __mask);
|
||||
}
|
||||
|
||||
static int64 exclusive_scan_or(int64 v) {
|
||||
return __exclusive_scan_or_i64(v, (int32)__mask);
|
||||
}
|
||||
|
||||
static unsigned int64 exclusive_scan_or(unsigned int64 v) {
|
||||
return __exclusive_scan_or_i64(v, __mask);
|
||||
}
|
||||
|
||||
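For reference, what an exclusive add-scan computes, as a plain C++ loop (illustrative, not the ispc built-in): element i of the result combines all inputs before lane i, so lane 0 always receives the identity value.

#include <vector>

// Hypothetical scalar model of exclusive_scan_add: {1,2,3,4} -> {0,1,3,6}.
std::vector<int> exclusiveScanAdd(const std::vector<int> &v) {
    std::vector<int> out(v.size());
    int sum = 0;                 // identity for addition
    for (size_t i = 0; i < v.size(); ++i) {
        out[i] = sum;            // sum of everything before element i
        sum += v[i];
    }
    return out;
}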
///////////////////////////////////////////////////////////////////////////
|
||||
// packed load, store
|
||||
|
||||
static inline uniform int
|
||||
packed_load_active(uniform unsigned int a[], uniform int start,
|
||||
reference unsigned int vals) {
|
||||
return __packed_load_active(a, start, vals, __mask);
|
||||
return __packed_load_active(a, (unsigned int)start, vals,
|
||||
(unsigned int32)__mask);
|
||||
}
|
||||
|
||||
static inline uniform int
|
||||
packed_store_active(uniform unsigned int a[], uniform int start,
|
||||
unsigned int vals) {
|
||||
return __packed_store_active(a, start, vals, __mask);
|
||||
return __packed_store_active(a, (unsigned int)start, vals,
|
||||
(unsigned int32)__mask);
|
||||
}
|
||||
|
||||
static inline uniform int packed_load_active(uniform int a[], uniform int start,
|
||||
reference int vals) {
|
||||
return __packed_load_active(a, start, vals, __mask);
|
||||
return __packed_load_active(a, start, vals, (int32)__mask);
|
||||
}
|
||||
|
||||
static inline uniform int packed_store_active(uniform int a[], uniform int start,
|
||||
int vals) {
|
||||
return __packed_store_active(a, start, vals, __mask);
|
||||
return __packed_store_active(a, start, vals, (int32)__mask);
|
||||
}
|
||||
|
||||
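For reference, a C++ sketch of packed_store_active semantics (illustrative; 'active' stands in for the execution mask): values from active lanes are written to consecutive slots starting at a[start], and the number written is returned. packed_load_active is the inverse, reading consecutive slots back into the active lanes.

// Hypothetical scalar model of packed_store_active.
int packedStoreActive(unsigned int a[], int start, const unsigned int vals[],
                      const bool active[], int vectorWidth) {
    int count = 0;
    for (int i = 0; i < vectorWidth; ++i)
        if (active[i])
            a[start + count++] = vals[i];   // compact active lanes contiguously
    return count;
}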
///////////////////////////////////////////////////////////////////////////
|
||||
// System information
|
||||
|
||||
static inline int num_cores() {
|
||||
return __num_cores();
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
@@ -470,73 +584,110 @@ static inline void memory_barrier() {
|
||||
__memory_barrier();
|
||||
}
|
||||
|
||||
#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB) \
|
||||
#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE) \
|
||||
static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \
|
||||
memory_barrier(); \
|
||||
TA ret = __atomic_##OPB##_##TB##_global(ref, value, __mask); \
|
||||
TA ret = __atomic_##OPB##_##TB##_global(ref, value, (MASKTYPE)__mask); \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
} \
|
||||
static inline uniform TA atomic_##OPA##_global(uniform reference TA ref, \
|
||||
uniform TA value) { \
|
||||
memory_barrier(); \
|
||||
uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ref, value, (MASKTYPE)__mask); \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
}
|
||||
|
||||
DEFINE_ATOMIC_OP(int32,int32,add,add)
|
||||
DEFINE_ATOMIC_OP(int32,int32,subtract,sub)
|
||||
DEFINE_ATOMIC_OP(int32,int32,min,min)
|
||||
DEFINE_ATOMIC_OP(int32,int32,max,max)
|
||||
DEFINE_ATOMIC_OP(int32,int32,and,and)
|
||||
DEFINE_ATOMIC_OP(int32,int32,or,or)
|
||||
DEFINE_ATOMIC_OP(int32,int32,xor,xor)
|
||||
DEFINE_ATOMIC_OP(int32,int32,swap,swap)
|
||||
#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB, MASKTYPE) \
|
||||
static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \
|
||||
uniform TA oneval = reduce_##OPA(value); \
|
||||
TA ret; \
|
||||
if (lanemask() != 0) { \
|
||||
memory_barrier(); \
|
||||
ret = __atomic_##OPB##_uniform_##TB##_global(ref, oneval, (MASKTYPE)__mask); \
|
||||
memory_barrier(); \
|
||||
} \
|
||||
return ret; \
|
||||
} \
|
||||
static inline uniform TA atomic_##OPA##_global(uniform reference TA ref, \
|
||||
uniform TA value) { \
|
||||
memory_barrier(); \
|
||||
uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ref, value, (MASKTYPE)__mask); \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
}
|
||||
|
||||
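For reference, the idea behind the separate DEFINE_ATOMIC_MINMAX_OP above, as an illustrative C++ sketch: because min/max commute across lanes, the per-lane values can be reduced first and a single atomic update issued against memory instead of one per lane. Here std::atomic with a CAS loop stands in for the uniform atomic built-in, and at least one lane is assumed active:

#include <algorithm>
#include <atomic>

// Hypothetical scalar model: reduce the lanes, then one atomic min on memory.
int atomicMinGlobal(std::atomic<int> &ref, const int laneValues[], int width) {
    int localMin = laneValues[0];
    for (int i = 1; i < width; ++i)
        localMin = std::min(localMin, laneValues[i]);
    int old = ref.load();
    while (localMin < old && !ref.compare_exchange_weak(old, localMin)) {
        // compare_exchange_weak reloads 'old' on failure; retry until done.
    }
    return old;   // previous memory value, as the ispc wrappers return
}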
DEFINE_ATOMIC_OP(int32,int32,add,add,int32)
|
||||
DEFINE_ATOMIC_OP(int32,int32,subtract,sub,int32)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,int32)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,int32)
|
||||
DEFINE_ATOMIC_OP(int32,int32,and,and,int32)
|
||||
DEFINE_ATOMIC_OP(int32,int32,or,or,int32)
|
||||
DEFINE_ATOMIC_OP(int32,int32,xor,xor,int32)
|
||||
DEFINE_ATOMIC_OP(int32,int32,swap,swap,int32)
|
||||
|
||||
// For everything but atomic min and max, we can use the same
|
||||
// implementations for unsigned as for signed.
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,add,add)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,uint32,min,umin)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,uint32,max,umax)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,and,and)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,or,or)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,unsigned int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,unsigned int32)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,unsigned int32)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,unsigned int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,unsigned int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,unsigned int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,unsigned int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,unsigned int32)
|
||||
|
||||
DEFINE_ATOMIC_OP(float,float,swap,swap)
|
||||
DEFINE_ATOMIC_OP(float,float,swap,swap,int32)
|
||||
|
||||
DEFINE_ATOMIC_OP(int64,int64,add,add)
|
||||
DEFINE_ATOMIC_OP(int64,int64,subtract,sub)
|
||||
DEFINE_ATOMIC_OP(int64,int64,min,min)
|
||||
DEFINE_ATOMIC_OP(int64,int64,max,max)
|
||||
DEFINE_ATOMIC_OP(int64,int64,and,and)
|
||||
DEFINE_ATOMIC_OP(int64,int64,or,or)
|
||||
DEFINE_ATOMIC_OP(int64,int64,xor,xor)
|
||||
DEFINE_ATOMIC_OP(int64,int64,swap,swap)
|
||||
DEFINE_ATOMIC_OP(int64,int64,add,add,int32)
|
||||
DEFINE_ATOMIC_OP(int64,int64,subtract,sub,int32)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,int32)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,int32)
|
||||
DEFINE_ATOMIC_OP(int64,int64,and,and,int32)
|
||||
DEFINE_ATOMIC_OP(int64,int64,or,or,int32)
|
||||
DEFINE_ATOMIC_OP(int64,int64,xor,xor,int32)
|
||||
DEFINE_ATOMIC_OP(int64,int64,swap,swap,int32)
|
||||
|
||||
// For everything but atomic min and max, we can use the same
|
||||
// implementations for unsigned as for signed.
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,add,add)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,uint64,min,umin)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,uint64,max,umax)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,and,and)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,or,or)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,unsigned int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,unsigned int32)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,unsigned int32)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,unsigned int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,unsigned int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,unsigned int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,unsigned int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,unsigned int32)
|
||||
|
||||
DEFINE_ATOMIC_OP(double,double,swap,swap)
|
||||
DEFINE_ATOMIC_OP(double,double,swap,swap,int32)
|
||||
|
||||
#define ATOMIC_DECL_CMPXCHG(TA, TB) \
|
||||
#undef DEFINE_ATOMIC_OP
|
||||
|
||||
#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE) \
|
||||
static inline TA atomic_compare_exchange_global( \
|
||||
uniform reference TA ref, TA oldval, TA newval) { \
|
||||
memory_barrier(); \
|
||||
TA ret = __atomic_compare_exchange_##TB##_global(ref, oldval, newval, __mask); \
|
||||
TA ret = __atomic_compare_exchange_##TB##_global(ref, oldval, newval, (MASKTYPE)__mask); \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
} \
|
||||
static inline uniform TA atomic_compare_exchange_global( \
|
||||
uniform reference TA ref, uniform TA oldval, uniform TA newval) { \
|
||||
memory_barrier(); \
|
||||
uniform TA ret = __atomic_compare_exchange_uniform_##TB##_global(ref, oldval, newval, (MASKTYPE)__mask); \
|
||||
memory_barrier(); \
|
||||
return ret; \
|
||||
}
|
||||
|
||||
ATOMIC_DECL_CMPXCHG(int32, int32)
|
||||
ATOMIC_DECL_CMPXCHG(unsigned int32, int32)
|
||||
ATOMIC_DECL_CMPXCHG(float, float)
|
||||
ATOMIC_DECL_CMPXCHG(int64, int64)
|
||||
ATOMIC_DECL_CMPXCHG(unsigned int64, int64)
|
||||
ATOMIC_DECL_CMPXCHG(double, double)
|
||||
ATOMIC_DECL_CMPXCHG(int32, int32, int32)
|
||||
ATOMIC_DECL_CMPXCHG(unsigned int32, int32, unsigned int32)
|
||||
ATOMIC_DECL_CMPXCHG(float, float, int32)
|
||||
ATOMIC_DECL_CMPXCHG(int64, int64, int32)
|
||||
ATOMIC_DECL_CMPXCHG(unsigned int64, int64, unsigned int32)
|
||||
ATOMIC_DECL_CMPXCHG(double, double, int32)
|
||||
|
||||
#undef ATOMIC_DECL_CMPXCHG
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Floating-Point Math
|
||||
@@ -2600,6 +2751,80 @@ static inline int16 float_to_half(float f) {
|
||||
}
|
||||
|
||||
|
||||
static inline uniform float half_to_float_fast(uniform unsigned int16 h) {
|
||||
uniform unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit
|
||||
uniform unsigned int32 he = h & (int32)0x7C00u; // Pick off exponent bits
|
||||
uniform unsigned int32 hm = h & (int32)0x03FFu; // Pick off mantissa bits
|
||||
|
||||
// sign
|
||||
uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
|
||||
// Exponent: unbias the halfp, then bias the single
|
||||
uniform int32 xes = ((int32) (he >> 10)) - 15 + 127;
|
||||
// Exponent
|
||||
uniform unsigned int32 xe = (unsigned int32) (xes << 23);
|
||||
// Mantissa
|
||||
uniform unsigned int32 xm = ((unsigned int32) hm) << 13;
|
||||
return floatbits(xs | xe | xm);
|
||||
|
||||
}
|
||||
|
||||
static inline float half_to_float_fast(unsigned int16 h) {
|
||||
unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit
|
||||
unsigned int32 he = h & (int32)0x7C00u; // Pick off exponent bits
|
||||
unsigned int32 hm = h & (int32)0x03FFu; // Pick off mantissa bits
|
||||
|
||||
// sign
|
||||
unsigned int32 xs = ((unsigned int32) hs) << 16;
|
||||
// Exponent: unbias the halfp, then bias the single
|
||||
int32 xes = ((int32) (he >> 10)) - 15 + 127;
|
||||
// Exponent
|
||||
unsigned int32 xe = (unsigned int32) (xes << 23);
|
||||
// Mantissa
|
||||
unsigned int32 xm = ((unsigned int32) hm) << 13;
|
||||
return floatbits(xs | xe | xm);
|
||||
|
||||
}
|
||||
|
||||
static inline uniform int16 float_to_half_fast(uniform float f) {
|
||||
uniform int32 x = intbits(f);
|
||||
uniform unsigned int32 xs = x & 0x80000000u; // Pick off sign bit
|
||||
uniform unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits
|
||||
uniform unsigned int32 xm = x & 0x007FFFFFu; // Pick off mantissa bits
|
||||
|
||||
uniform unsigned int32 hs = (xs >> 16); // Sign bit
|
||||
// Exponent unbias the single, then bias the halfp
|
||||
uniform int32 hes = ((int)(xe >> 23)) - 127 + 15;
|
||||
uniform unsigned int32 he = (hes << 10); // Exponent
|
||||
uniform int32 hm = (xm >> 13); // Mantissa
|
||||
uniform int32 ret = (hs | he | hm);
|
||||
|
||||
if (xm & 0x00001000u) // Check for rounding
|
||||
// Round, might overflow to inf, this is OK
|
||||
ret += 1u;
|
||||
|
||||
return (int16)ret;
|
||||
}
|
||||
|
||||
static inline int16 float_to_half_fast(float f) {
|
||||
int32 x = intbits(f);
|
||||
unsigned int32 xs = x & 0x80000000u; // Pick off sign bit
|
||||
unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits
|
||||
unsigned int32 xm = x & 0x007FFFFFu; // Pick off mantissa bits
|
||||
|
||||
unsigned int32 hs = (xs >> 16); // Sign bit
|
||||
// Exponent unbias the single, then bias the halfp
|
||||
int32 hes = ((int)(xe >> 23)) - 127 + 15;
|
||||
unsigned int32 he = (hes << 10); // Exponent
|
||||
int32 hm = (xm >> 13); // Mantissa
|
||||
int32 ret = (hs | he | hm);
|
||||
|
||||
if (xm & 0x00001000u) // Check for rounding
|
||||
// Round, might overflow to inf, this is OK
|
||||
ret += 1u;
|
||||
|
||||
return (int16)ret;
|
||||
}
|
||||
|
||||
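For reference, a standalone C++ version of the "fast" half-to-float path added above; like the ispc code it re-biases the exponent (15 -> 127), widens the 10-bit mantissa to 23 bits, and deliberately ignores denormals, infinities, and NaNs:

#include <cstdint>
#include <cstring>

float halfToFloatFast(uint16_t h) {
    uint32_t hs = h & 0x8000u;                      // sign bit
    uint32_t he = h & 0x7C00u;                      // 5 exponent bits
    uint32_t hm = h & 0x03FFu;                      // 10 mantissa bits

    uint32_t xs = hs << 16;                         // sign into bit 31
    int32_t  xes = (int32_t)(he >> 10) - 15 + 127;  // re-bias the exponent
    uint32_t xe = (uint32_t)xes << 23;              // exponent into bits 30..23
    uint32_t xm = hm << 13;                         // mantissa widened to 23 bits

    uint32_t bits = xs | xe | xm;
    float f;
    std::memcpy(&f, &bits, sizeof f);               // bit-cast, like floatbits()
    return f;                                       // e.g. 0x3C00 -> 1.0f
}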
///////////////////////////////////////////////////////////////////////////
|
||||
// RNG stuff
|
||||
|
||||
@@ -2624,7 +2849,9 @@ static inline unsigned int random(reference RNGState state)
|
||||
|
||||
static inline float frandom(reference RNGState state)
|
||||
{
|
||||
return ((int)(random(state) & ((1<<24)-1))) / (float)(1 << 24);
|
||||
unsigned int irand = random(state);
|
||||
irand &= (1<<23)-1;
|
||||
return floatbits(0x3F800000 | irand)-1.0f;
|
||||
}
|
||||
|
||||
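For reference, why the new frandom() body works, shown as an illustrative C++ helper: OR-ing 23 random mantissa bits under the exponent pattern 0x3F800000 yields a float uniformly distributed in [1, 2), so subtracting 1.0f gives a value in [0, 1).

#include <cstdint>
#include <cstring>

float bitsToUnitFloat(uint32_t irand) {
    irand &= (1u << 23) - 1;               // keep only the mantissa bits
    uint32_t bits = 0x3F800000u | irand;   // exponent of 1.0f plus random mantissa
    float f;
    std::memcpy(&f, &bits, sizeof f);      // equivalent of ispc's floatbits()
    return f - 1.0f;                       // maps [1, 2) onto [0, 1)
}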
static inline uniform unsigned int __seed4(reference RNGState state,
|
||||
@@ -2665,6 +2892,12 @@ static inline void seed_rng(reference uniform RNGState state, uniform unsigned i
|
||||
seed = __seed4(state, 0, seed);
|
||||
if (programCount == 8)
|
||||
__seed4(state, 4, seed ^ 0xbeeff00d);
|
||||
if (programCount == 16) {
|
||||
__seed4(state, 4, seed ^ 0xbeeff00d);
|
||||
__seed4(state, 8, ((seed & 0xffff) << 16) | (seed >> 16));
|
||||
__seed4(state, 12, (((seed & 0xff) << 24) | ((seed & 0xff00) << 8) |
|
||||
((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24));
|
||||
}
|
||||
}
|
||||
|
||||
static inline void fastmath() {
|
||||
|
||||