update doxygen, release notes for 1.0.8 release

Predicated 'if' statement performance improvements.
Go back to running both sides of 'if' statements with masking and without branching if we can determine that the code is relatively simple (as per the simple cost model), and is safe to run even if the mask is 'all off'. This gives a bit of a performance improvement for some of the examples (most notably, the ray tracer), and is the code that one wants generated in this case anyhow.
2011-09-19 15:22:25 -07:00 · 2011-09-19 09:54:09 -07:00 · 2011-09-17 13:42:46 -07:00 · 2011-09-17 13:38:51 -07:00 · 2011-09-17 13:18:59 -07:00 · 2011-09-17 13:03:51 -07:00
129 changed files with 7099 additions and 1399 deletions
--- a/40
+++ b/40
@@ -10,9 +10,15 @@ CLANG_LIBS = -lclangFrontend -lclangDriver \
             -lclangSerialization -lclangParse -lclangSema \
             -lclangAnalysis -lclangAST -lclangLex -lclangBasic

-LLVM_LIBS=$(shell llvm-config --ldflags --libs) -lpthread -ldl
+ISPC_LIBS=$(CLANG_LIBS) \
+	$(shell llvm-config --ldflags --libs) \
+	-lpthread -ldl
+ISPC_TEST_LIBS=$(shell llvm-config --ldflags --libs) \
+	-lpthread -ldl
+
 LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
-LLVM_VERSION_DEF=-DLLVM_$(shell llvm-config --version | sed s/\\./_/)
+LLVM_VERSION:=$(shell llvm-config --version | sed s/\\./_/)# simply expanded: llvm-config runs once at parse time, not on every reference
+LLVM_VERSION_DEF:=-DLLVM_$(LLVM_VERSION)

 BUILD_DATE=$(shell date +%Y%m%d)
 BUILD_VERSION=$(shell git log --abbrev-commit --abbrev=16 | head -1)
@@ -43,12 +49,14 @@ CXX_SRC=builtins.cpp ctx.cpp decl.cpp expr.cpp ispc.cpp \
 	util.cpp
 HEADERS=builtins.h ctx.h decl.h expr.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
-BUILTINS_SRC=builtins-avx.ll builtins-sse2.ll builtins-sse4.ll builtins-sse4x2.ll
+BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll \
+	builtins-sse4.ll builtins-sse4x2.ll
 BISON_SRC=parse.yy
 FLEX_SRC=lex.ll

-OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_SRC:.ll=.o) builtins-c.o stdlib_ispc.o \
-	$(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o))
+OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_SRC:.ll=.o) \
+	builtins-c-32.o builtins-c-64.o stdlib_ispc.o $(BISON_SRC:.yy=.o) \
+	$(FLEX_SRC:.ll=.o))

 default: ispc ispc_test

@@ -77,11 +85,11 @@ doxygen:

 ispc: print_llvm_src dirs $(OBJS)
 	@echo Creating ispc executable
-	@$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(CLANG_LIBS) $(LLVM_LIBS)
+	@$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)

 ispc_test: dirs ispc_test.cpp
 	@echo Creating ispc_test executable
-	@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $@ ispc_test.cpp $(LLVM_LIBS)
+	@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $@ ispc_test.cpp $(ISPC_TEST_LIBS)

 objs/%.o: %.cpp
 	@echo Compiling $<
@@ -103,19 +111,27 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
 	@echo Compiling $<
 	@$(CXX) $(CXXFLAGS) -o $@ -c $<

-objs/builtins-%.cpp: builtins-%.ll builtins.m4 builtins-sse.ll
+objs/builtins-%.cpp: builtins-%.ll builtins.m4 builtins-sse.ll builtins-avx-common.ll
 	@echo Creating C++ source from builtin definitions file $<
-	@m4 builtins.m4 $< | ./bitcode2cpp.py $< > $@
+	@m4 -DLLVM_VERSION=$(LLVM_VERSION) builtins.m4 $< | ./bitcode2cpp.py $< > $@

 objs/builtins-%.o: objs/builtins-%.cpp
 	@echo Compiling $<
 	@$(CXX) $(CXXFLAGS) -o $@ -c $<

-objs/builtins-c.cpp: builtins-c.c
+objs/builtins-c-32.cpp: builtins-c.c
 	@echo Creating C++ source from builtins definition file $<
-	@$(CLANG) -I /opt/l1om/usr/include/ -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py $< > $@
+	@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-32.c > $@

-objs/builtins-c.o: objs/builtins-c.cpp
+objs/builtins-c-32.o: objs/builtins-c-32.cpp
+	@echo Compiling $<
+	@$(CXX) $(CXXFLAGS) -o $@ -c $<
+
+objs/builtins-c-64.cpp: builtins-c.c
+	@echo Creating C++ source from builtins definition file $<
+	@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-64.c > $@
+
+objs/builtins-c-64.o: objs/builtins-c-64.cpp
 	@echo Compiling $<
 	@$(CXX) $(CXXFLAGS) -o $@ -c $<

--- a/builtins-avx-common.ll
+++ b/builtins-avx-common.ll
@@ -0,0 +1,278 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; *** Untested *** AVX target implementation.
+;;
+;; The LLVM AVX code generator is incomplete, so the ispc AVX target
+;; hasn't yet been tested.  There is therefore a higher-than-normal
+;; chance that there are bugs in the code in this file.
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
+
+define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
+  ; Scalar reciprocal: take the rcpss estimate of the argument from lane 0,
+  ; then sharpen it with one Newton-Raphson step, est * (2 - x * est).
+  %x_vec = insertelement <4 x float> undef, float %0, i32 0
+  %est_vec = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %x_vec)
+  %est = extractelement <4 x float> %est_vec, i32 0
+
+  ; refinement step
+  %x_est = fmul float %0, %est
+  %residual = fsub float 2., %x_est
+  %refined = fmul float %est, %residual
+  ret float %refined
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding floats
+
+declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
+
+define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
+  ; roundss with immediate 8: nearest (0b00) | no precision exceptions (0b1000).
+  ;
+  ; The intrinsic has an awkward two-vector signature; the documented C form is
+  ;
+  ;  __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
+  ;
+  ;  where only the low 32 bits of the result come from rounding b, and the
+  ;  upper 96 bits are copied from a:
+  ;
+  ;  r0 = RND(b0)
+  ;  r1 = a1
+  ;  r2 = a2
+  ;  r3 = a3
+  ;
+  ;  Just r0 is extracted below, so the value passed for a is irrelevant;
+  ;  the same vector is therefore used for both operands.
+  %in_vec = insertelement <4 x float> undef, float %0, i32 0
+  %out_vec = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %in_vec, <4 x float> %in_vec, i32 8)
+  %lane0 = extractelement <4 x float> %out_vec, i32 0
+  ret float %lane0
+}
+
+define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
+  ; see __round_uniform_float for the discussion of the round_ss intrinsic operands
+  %xi = insertelement <4 x float> undef, float %0, i32 0
+  ; roundss, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
+  %rs = extractelement <4 x float> %xr, i32 0
+  ret float %rs
+}
+
+define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
+  ; see __round_uniform_float for the discussion of the round_ss intrinsic operands
+  %xi = insertelement <4 x float> undef, float %0, i32 0
+  ; roundss, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
+  %rs = extractelement <4 x float> %xr, i32 0
+  ret float %rs
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
+  %in_vec = insertelement <2 x double> undef, double %0, i32 0
+  %out_vec = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %in_vec, <2 x double> %in_vec, i32 8) ; 8 = nearest, no precision exceptions
+  %lane0 = extractelement <2 x double> %out_vec, i32 0
+  ret double %lane0
+}
+
+define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
+  ; see __round_uniform_float for the discussion of the scalar round intrinsic operands
+  %xi = insertelement <2 x double> undef, double %0, i32 0
+  ; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
+  %rs = extractelement <2 x double> %xr, i32 0
+  ret double %rs
+}
+
+define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
+  ; see __round_uniform_float for the discussion of the scalar round intrinsic operands
+  %xi = insertelement <2 x double> undef, double %0, i32 0
+  ; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
+  %rs = extractelement <2 x double> %xr, i32 0
+  ret double %rs
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
+
+define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
+  ; initial estimate: rsqrtss on the argument placed in lane 0
+  %x_vec = insertelement <4 x float> undef, float %0, i32 0
+  %est_vec = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %x_vec)
+  %est = extractelement <4 x float> %est_vec, i32 0
+
+  ; one Newton-Raphson step: 0.5 * est * (3 - x * est * est)
+  %x_est = fmul float %0, %est
+  %x_est2 = fmul float %x_est, %est
+  %residual = fsub float 3., %x_est2
+  %scaled = fmul float %est, %residual
+  %refined = fmul float 0.5, %scaled
+  ret float %refined
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; sqrt
+
+declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
+
+define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
+  sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
+  ret float %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; fastmath
+
+declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
+declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
+
+define internal void @__fastmath() nounwind alwaysinline {
+  %csr_slot = alloca i32
+  %csr_slot_i8 = bitcast i32 * %csr_slot to i8 *
+  call void @llvm.x86.sse.stmxcsr(i8 * %csr_slot_i8)
+  %csr_old = load i32 *%csr_slot
+  ; read-modify-write of the SSE control/status register through a stack temporary:
+  ; set the DAZ (64) and FTZ (32768) bits, 64 + 32768 = 32832
+  %csr_new = or i32 %csr_old, 32832
+  store i32 %csr_new, i32 *%csr_slot
+  call void @llvm.x86.sse.ldmxcsr(i8 * %csr_slot_i8)
+  ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
+  ret float %ret
+}
+
+define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
+  ret float %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
+  ret i32 %ret
+}
+
+define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
+  ret i32 %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
+  ret i32 %ret
+}
+
+define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
+  ret i32 %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops
+
+declare i32 @llvm.ctpop.i32(i32) nounwind readnone
+; count of set bits in a 32-bit value
+define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
+  %ones = call i32 @llvm.ctpop.i32(i32 %0)
+  ret i32 %ones
+}
+
+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
+; count of set bits in a 64-bit value
+define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
+  %ones = call i64 @llvm.ctpop.i64(i64 %0)
+  ret i64 %ones
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt (sqrtsd is an SSE2 instruction, so the intrinsic is llvm.x86.sse2.sqrt.sd)
+
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
+define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
+  sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
+  ret double %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
+  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
+  ret double %ret
+}
+
+define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
+  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
+  ret double %ret
+}
--- a/builtins-avx-x2.ll
+++ b/builtins-avx-x2.ll
@@ -0,0 +1,665 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; *** Untested *** AVX target implementation.
+;;
+;; The LLVM AVX code generator is incomplete, so the ispc AVX target
+;; hasn't yet been tested.  There is therefore a higher-than-normal
+;; chance that there are bugs in the code in this file.
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Basic 16-wide definitions
+
+stdlib_core(16)
+packed_load_and_store(16)
+scans(16)
+int64minmax(16)
+
+include(`builtins-avx-common.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
+
+define internal <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
+  ;  float iv = __rcp_v(v);          -- hardware reciprocal estimate
+  ;  return iv * (2. - v * iv);      -- one Newton-Raphson refinement of it
+  ; NOTE(review): the m4 helper below presumably runs the 8-wide intrinsic on each half of %0 and yields %call -- confirm
+  unary8to16(call, float, @llvm.x86.avx.rcp.ps.256, %0)
+  ; do one Newton-Raphson (N-R) iteration
+  %v_iv = fmul <16 x float> %0, %call
+  %two_minus = fsub <16 x float> <float 2., float 2., float 2., float 2.,
+                                  float 2., float 2., float 2., float 2.,
+                                  float 2., float 2., float 2., float 2.,
+                                  float 2., float 2., float 2., float 2.>, %v_iv  
+  %iv_mul = fmul <16 x float> %call, %two_minus
+  ret <16 x float> %iv_mul
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding floats
+
+declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
+
+define internal <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
+  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
+  round8to16(%0, 8)
+}
+
+define internal <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  round8to16(%0, 9)
+}
+
+define internal <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  round8to16(%0, 10)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
+
+define internal <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline {
+  round4to16double(%0, 8)
+}
+
+define internal <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline {
+  round4to16double(%0, 9)
+}
+
+define internal <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline {
+  round4to16double(%0, 10)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
+declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
+
+define internal <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
+  ;  float is = __rsqrt_v(v);   -- hardware estimate; NOTE(review): presumably delivered as %is by the m4 helper below, confirm
+  unary8to16(is, float, @llvm.x86.avx.rsqrt.ps.256, %v)
+  ;  one Newton-Raphson step:  return 0.5 * is * (3. - (v * is) * is);
+  %v_is = fmul <16 x float> %v, %is
+  %v_is_is = fmul <16 x float> %v_is, %is
+  %three_sub = fsub <16 x float> <float 3., float 3., float 3., float 3.,
+                                  float 3., float 3., float 3., float 3.,
+                                  float 3., float 3., float 3., float 3.,
+                                  float 3., float 3., float 3., float 3.>, %v_is_is
+  %is_mul = fmul <16 x float> %is, %three_sub
+  %half_scale = fmul <16 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
+                                   float 0.5, float 0.5, float 0.5, float 0.5,
+                                   float 0.5, float 0.5, float 0.5, float 0.5,
+                                   float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
+  ret <16 x float> %half_scale
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; sqrt
+
+declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
+
+define internal <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline {
+  unary8to16(call, float, @llvm.x86.avx.sqrt.ps.256, %0)
+  ret <16 x float> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; svml
+
+; FIXME: these need to either be wired up to the 8-wide SVML entrypoints
+; or be called through a macro that invokes the 4-wide ones 4x with
+; pieces of our 16-wide vectors...
+
+declare <16 x float> @__svml_sin(<16 x float>)
+declare <16 x float> @__svml_cos(<16 x float>)
+declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *)
+declare <16 x float> @__svml_tan(<16 x float>)
+declare <16 x float> @__svml_atan(<16 x float>)
+declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>)
+declare <16 x float> @__svml_exp(<16 x float>)
+declare <16 x float> @__svml_log(<16 x float>)
+declare <16 x float> @__svml_pow(<16 x float>, <16 x float>)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
+declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define internal <16 x float> @__max_varying_float(<16 x float>,
+                                                  <16 x float>) nounwind readonly alwaysinline {
+  binary8to16(call, float, @llvm.x86.avx.max.ps.256, %0, %1)
+  ret <16 x float> %call
+}
+
+define internal <16 x float> @__min_varying_float(<16 x float>,
+                                                  <16 x float>) nounwind readonly alwaysinline {
+  binary8to16(call, float, @llvm.x86.avx.min.ps.256, %0, %1)
+  ret <16 x float> %call
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+define internal <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
+  ret <16 x i32> %ret
+}
+
+define internal <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
+  ret <16 x i32> %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+define internal <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
+  ret <16 x i32> %ret
+}
+
+define internal <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
+  ret <16 x i32> %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops
+
+declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
+
+define internal i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
+  %as_float = bitcast <16 x i32> %0 to <16 x float>
+  %lanes_lo = shufflevector <16 x float> %as_float, <16 x float> undef,
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %lanes_hi = shufflevector <16 x float> %as_float, <16 x float> undef,
+          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %bits_lo = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %lanes_lo) nounwind readnone
+  %bits_hi = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %lanes_hi) nounwind readnone
+  ; pack the two 8-wide movmsk results: lanes 8-15 go to bits 8-15, lanes 0-7 stay in bits 0-7
+  %bits_hi_shl = shl i32 %bits_hi, 8
+  %bits_all = or i32 %bits_hi_shl, %bits_lo
+  ret i32 %bits_all
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal float ops
+
+declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define internal float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
+  %half_lo = shufflevector <16 x float> %0, <16 x float> undef,
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %half_hi = shufflevector <16 x float> %0, <16 x float> undef,
+          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %sums_of_2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %half_lo, <8 x float> %half_hi)
+  %sums_of_4 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %sums_of_2, <8 x float> %sums_of_2)
+  %sums_of_8 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %sums_of_4, <8 x float> %sums_of_4)
+  %part_a = extractelement <8 x float> %sums_of_8, i32 0 ; the 256-bit hadd works within each 128-bit half,
+  %part_b = extractelement <8 x float> %sums_of_8, i32 4 ; so elements 0 and 4 each hold the sum of eight inputs
+  %total = fadd float %part_a, %part_b
+  ret float %total
+}
+
+
+define internal float @__reduce_min_float(<16 x float>) nounwind readnone alwaysinline {
+  reduce16(float, @__min_varying_float, @__min_uniform_float)
+}
+
+
+define internal float @__reduce_max_float(<16 x float>) nounwind readnone alwaysinline {
+  reduce16(float, @__max_varying_float, @__max_uniform_float)
+}
+
+reduce_equal(16)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int32 ops
+
+define internal <16 x i32> @__add_varying_int32(<16 x i32>,
+                                                <16 x i32>) nounwind readnone alwaysinline {
+  %lane_sums = add <16 x i32> %0, %1 ; elementwise 32-bit add of the two vectors
+  ret <16 x i32> %lane_sums
+}
+; scalar counterpart of the vector add above
+define internal i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
+  %scalar_sum = add i32 %0, %1
+  ret i32 %scalar_sum
+}
+
+define internal i32 @__reduce_add_int32(<16 x i32>) nounwind readnone alwaysinline {
+  reduce16(i32, @__add_varying_int32, @__add_uniform_int32)
+}
+
+
+define internal i32 @__reduce_min_int32(<16 x i32>) nounwind readnone alwaysinline {
+  reduce16(i32, @__min_varying_int32, @__min_uniform_int32)
+}
+
+
+define internal i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
+  reduce16(i32, @__max_varying_int32, @__max_uniform_int32)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; horizontal uint32 ops
+
+define internal i32 @__reduce_add_uint32(<16 x i32> %v) nounwind readnone alwaysinline {
+  %r = call i32 @__reduce_add_int32(<16 x i32> %v) ; wrapping integer add is sign-agnostic, so the signed reduction is reused as is
+  ret i32 %r
+}
+
+define internal i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone alwaysinline {
+  reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32)
+}
+
+
+define internal i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinline {
+  reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal double ops
+
+declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define internal double @__reduce_add_double(<16 x double>) nounwind readonly alwaysinline {
+  %q0 = shufflevector <16 x double> %0, <16 x double> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %q1 = shufflevector <16 x double> %0, <16 x double> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %q2 = shufflevector <16 x double> %0, <16 x double> undef,
+         <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %q3 = shufflevector <16 x double> %0, <16 x double> undef,
+         <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  %q01 = fadd <4 x double> %q0, %q1
+  %q23 = fadd <4 x double> %q2, %q3
+  ; two horizontal adds leave partial sums in elements 0 and 2, which are then combined
+  %pair_sums = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %q01, <4 x double> %q23)
+  %half_sums = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %pair_sums, <4 x double> %pair_sums)
+  %part_lo = extractelement <4 x double> %half_sums, i32 0
+  %part_hi = extractelement <4 x double> %half_sums, i32 2
+  %total = fadd double %part_lo, %part_hi
+  ret double %total
+}
+
+define internal double @__reduce_min_double(<16 x double>) nounwind readnone alwaysinline {
+  reduce16(double, @__min_varying_double, @__min_uniform_double)
+}
+
+
+define internal double @__reduce_max_double(<16 x double>) nounwind readnone alwaysinline {
+  reduce16(double, @__max_varying_double, @__max_uniform_double)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int64 ops
+
+define internal <16 x i64> @__add_varying_int64(<16 x i64>,
+                                                <16 x i64>) nounwind readnone alwaysinline {
+  %lane_sums = add <16 x i64> %0, %1 ; elementwise 64-bit add of the two vectors
+  ret <16 x i64> %lane_sums
+}
+; scalar counterpart of the vector add above
+define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
+  %scalar_sum = add i64 %0, %1
+  ret i64 %scalar_sum
+}
+
+define internal i64 @__reduce_add_int64(<16 x i64>) nounwind readnone alwaysinline {
+  reduce16(i64, @__add_varying_int64, @__add_uniform_int64)
+}
+
+
+define internal i64 @__reduce_min_int64(<16 x i64>) nounwind readnone alwaysinline {
+  reduce16(i64, @__min_varying_int64, @__min_uniform_int64)
+}
+
+
+define internal i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
+  reduce16(i64, @__max_varying_int64, @__max_uniform_int64)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; horizontal uint64 ops
+
+define internal i64 @__reduce_add_uint64(<16 x i64> %v) nounwind readnone alwaysinline {
+  %r = call i64 @__reduce_add_int64(<16 x i64> %v) ; wrapping integer add is sign-agnostic, so the signed reduction is reused as is
+  ret i64 %r
+}
+
+define internal i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone alwaysinline {
+  reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
+}
+
+
+define internal i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
+  reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+load_and_broadcast(16, i8, 8)
+load_and_broadcast(16, i16, 16)
+load_and_broadcast(16, i32, 32)
+load_and_broadcast(16, i64, 64)
+
+; no masked load instruction for i8 and i16 types??
+load_masked(16, i8,  8,  1)
+load_masked(16, i16, 16, 2)
+
+declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
+declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
+ 
+define <16 x i32> @__load_masked_32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
+  %mask_f = bitcast <16 x i32> %mask to <16 x float>
+  %mask_lo = shufflevector <16 x float> %mask_f, <16 x float> undef,
+     <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %mask_hi = shufflevector <16 x float> %mask_f, <16 x float> undef,
+     <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %addr_hi = getelementptr i8 * %0, i32 32   ;; upper half starts 8 lanes x 4 bytes = 32 bytes in
+  %data_lo = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %mask_lo)
+  %data_hi = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %addr_hi, <8 x float> %mask_hi)
+  ; concatenate the two 8-wide results and hand them back as integers
+  %data_all = shufflevector <8 x float> %data_lo, <8 x float> %data_hi,
+     <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                 i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %as_i32 = bitcast <16 x float> %data_all to <16 x i32>
+  ret <16 x i32> %as_i32
+}
+
+
+define <16 x i64> @__load_masked_64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
+  ; repeat every 32-bit mask lane twice so it spans a 64-bit element; four elements per quarter
+  %m32_q0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %m32_q1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  %m32_q2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
+  %m32_q3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+  %m64_q0 = bitcast <8 x i32> %m32_q0 to <4 x double>
+  %m64_q1 = bitcast <8 x i32> %m32_q1 to <4 x double>
+  %m64_q2 = bitcast <8 x i32> %m32_q2 to <4 x double>
+  %m64_q3 = bitcast <8 x i32> %m32_q3 to <4 x double>
+  ; byte offsets of quarters 1-3: four 8-byte elements = 32 bytes per quarter
+  %addr_q1 = getelementptr i8 * %0, i32 32
+  %addr_q2 = getelementptr i8 * %0, i32 64
+  %addr_q3 = getelementptr i8 * %0, i32 96
+  %data_q0 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %m64_q0)
+  %data_q1 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %addr_q1, <4 x double> %m64_q1)
+  %data_q2 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %addr_q2, <4 x double> %m64_q2)
+  %data_q3 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %addr_q3, <4 x double> %m64_q3)
+  ; concatenate the quarters back into one 16-wide vector and view it as i64
+  %data_q01 = shufflevector <4 x double> %data_q0, <4 x double> %data_q1,
+      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %data_q23 = shufflevector <4 x double> %data_q2, <4 x double> %data_q3,
+      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %data_all = shufflevector <8 x double> %data_q01, <8 x double> %data_q23,
+      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %as_i64 = bitcast <16 x double> %data_all to <16 x i64>
+  ret <16 x i64> %as_i64
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+; FIXME: there is no AVX instruction for these, but we could be clever
+; by packing the bits down and setting the last 3/4 or half, respectively,
+; of the mask to zero...  Not sure if this would be a win in the end
+; Until then, the 8- and 16-bit masked stores are produced by the generic
+; m4 macro invoked below.
+; NOTE(review): arguments appear to be vector width, element type and
+; element size in bits -- confirm against the macro definition.
+gen_masked_store(16, i8, 8)
+gen_masked_store(16, i16, 16)
+
+; note that mask is the 2nd parameter, not the 3rd one!!
+; i.e. the operand order is (i8 pointer, mask, value); the masked-store
+; functions in this file pass their mask pieces second and their value
+; pieces third accordingly.
+declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
+declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
+
+;; Masked store of sixteen 32-bit values as two 8-wide AVX maskstore.ps
+;; calls: lanes 0-7 at the destination, lanes 8-15 at 32 bytes past it.
+;; Unnamed arguments: %0 is the destination pointer, %1 the values to
+;; store and %2 the mask.
+define void @__masked_store_32(<16 x i32>* nocapture, <16 x i32>,
+                               <16 x i32>) nounwind alwaysinline {
+  ; the intrinsic works on an i8 pointer and on float vectors
+  %dst_lo = bitcast <16 x i32> * %0 to i8 *
+  %val_f32 = bitcast <16 x i32> %1 to <16 x float>
+  %mask_f32 = bitcast <16 x i32> %2 to <16 x float>
+
+  ; low half: lanes 0-7
+  %mask_lo = shufflevector <16 x float> %mask_f32, <16 x float> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %val_lo = shufflevector <16 x float> %val_f32, <16 x float> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+  ; high half: lanes 8-15
+  %mask_hi = shufflevector <16 x float> %mask_f32, <16 x float> undef,
+        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %val_hi = shufflevector <16 x float> %val_f32, <16 x float> undef,
+        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  ; operand order is (pointer, mask, value)
+  call void @llvm.x86.avx.maskstore.ps.256(i8 * %dst_lo, <8 x float> %mask_lo, <8 x float> %val_lo)
+  ; 8 elements * 4 bytes = 32 bytes to the start of the high half
+  %dst_hi = getelementptr i8 * %dst_lo, i32 32
+  call void @llvm.x86.avx.maskstore.ps.256(i8 * %dst_hi, <8 x float> %mask_hi, <8 x float> %val_hi)
+
+  ret void
+}
+
+;; Masked store of sixteen 64-bit values as four 4-wide AVX maskstore.pd
+;; calls at byte offsets 0, 32, 64 and 96 from the destination.  Every
+;; 32-bit mask lane is written twice in a row so that, viewed as
+;; <4 x double>, each 64-bit mask element holds its lane's mask bits in
+;; both 32-bit halves.
+;; Unnamed arguments: %0 is the destination pointer, %1 the values.
+define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>,
+                               <16 x i32> %mask) nounwind alwaysinline {
+  ; the intrinsic works on an i8 pointer and on double vectors
+  %dst_q0 = bitcast <16 x i64> * %0 to i8 *
+  %val_f64 = bitcast <16 x i64> %1 to <16 x double>
+
+  ; quarter 0: lanes 0-3
+  %mask_q0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %maskd_q0 = bitcast <8 x i32> %mask_q0 to <4 x double>
+  %val_q0 = shufflevector <16 x double> %val_f64, <16 x double> undef,
+     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+
+  ; quarter 1: lanes 4-7
+  %mask_q1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  %maskd_q1 = bitcast <8 x i32> %mask_q1 to <4 x double>
+  %val_q1 = shufflevector <16 x double> %val_f64, <16 x double> undef,
+     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+  ; quarter 2: lanes 8-11
+  %mask_q2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
+  %maskd_q2 = bitcast <8 x i32> %mask_q2 to <4 x double>
+  %val_q2 = shufflevector <16 x double> %val_f64, <16 x double> undef,
+     <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+
+  ; quarter 3: lanes 12-15
+  %mask_q3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+  %maskd_q3 = bitcast <8 x i32> %mask_q3 to <4 x double>
+  %val_q3 = shufflevector <16 x double> %val_f64, <16 x double> undef,
+     <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+
+  ; issue the four stores in lane order; operand order is
+  ; (pointer, mask, value) and each quarter is 4 * 8 = 32 bytes wide
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %dst_q0, <4 x double> %maskd_q0, <4 x double> %val_q0)
+  %dst_q1 = getelementptr i8 * %dst_q0, i32 32
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %dst_q1, <4 x double> %maskd_q1, <4 x double> %val_q1)
+  %dst_q2 = getelementptr i8 * %dst_q0, i32 64
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %dst_q2, <4 x double> %maskd_q2, <4 x double> %val_q2)
+  %dst_q3 = getelementptr i8 * %dst_q0, i32 96
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %dst_q3, <4 x double> %maskd_q3, <4 x double> %val_q3)
+
+  ret void
+}
+
+
+;; NOTE(review): going by its name, the m4 macro invoked below supplies the
+;; blend-style masked stores for 8- and 16-bit elements on a 16-wide
+;; target; its body is not in this section -- confirm there.
+masked_store_blend_8_16_by_16()
+
+;; AVX variable blend on eight floats: three <8 x float> operands in, one
+;; <8 x float> out.
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
+                                                <8 x float>) nounwind readnone
+
+;; Blend-style masked store of sixteen 32-bit values: read the sixteen
+;; values currently in memory, combine them with the new values through two
+;; 8-wide blendv.ps calls, and write the full vector back.  Every lane of
+;; the destination is therefore both read and rewritten.
+;; Unnamed arguments: %0 is the destination pointer, %1 the new values and
+;; %2 the mask.
+;; NOTE(review): relies on blendv picking its second operand (new) where the
+;; mask element's top bit is set and its first (old) otherwise -- see the
+;; VBLENDVPS instruction description.
+define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>,
+                                     <16 x i32>) nounwind alwaysinline {
+  ; bring old values, new values and mask into float form for the intrinsic
+  %old_i32 = load <16 x i32>* %0, align 4
+  %old_f32 = bitcast <16 x i32> %old_i32 to <16 x float>
+  %new_f32 = bitcast <16 x i32> %1 to <16 x float>
+  %mask_f32 = bitcast <16 x i32> %2 to <16 x float>
+
+  ; low half: lanes 0-7
+  %old_lo = shufflevector <16 x float> %old_f32, <16 x float> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %new_lo = shufflevector <16 x float> %new_f32, <16 x float> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %mask_lo = shufflevector <16 x float> %mask_f32, <16 x float> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %blend_lo = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old_lo,
+                                                           <8 x float> %new_lo,
+                                                           <8 x float> %mask_lo)
+
+  ; high half: lanes 8-15
+  %old_hi = shufflevector <16 x float> %old_f32, <16 x float> undef,
+        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %new_hi = shufflevector <16 x float> %new_f32, <16 x float> undef,
+        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %mask_hi = shufflevector <16 x float> %mask_f32, <16 x float> undef,
+        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %blend_hi = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old_hi,
+                                                           <8 x float> %new_hi,
+                                                           <8 x float> %mask_hi)
+
+  ; rejoin the halves and write all sixteen lanes back
+  %blend_f32 = shufflevector <8 x float> %blend_lo, <8 x float> %blend_hi,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %blend_i32 = bitcast <16 x float> %blend_f32 to <16 x i32>
+  store <16 x i32> %blend_i32, <16 x i32>* %0, align 4
+  ret void
+}
+
+
+;; AVX variable blend on four doubles: three <4 x double> operands in, one
+;; <4 x double> out.
+declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
+                                                 <4 x double>) nounwind readnone
+
+;; Blend-style masked store of sixteen 64-bit values: read the sixteen
+;; values currently at %ptr, combine them with %newi64 through four 4-wide
+;; blendv.pd calls, and write the full vector back.  Every lane of the
+;; destination is therefore both read and rewritten.
+;;
+;;   %ptr     destination vector in memory (only 8-byte alignment assumed)
+;;   %newi64  values to store in the lanes the mask selects
+;;   %mask    one 32-bit mask element per lane
+;;
+;; NOTE(review): relies on blendv picking its second operand (new) where the
+;; mask element's top bit is set and its first (old) otherwise -- see the
+;; VBLENDVPD instruction description.
+define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64,
+                                     <16 x i32> %mask) nounwind alwaysinline {
+  ; current memory contents, viewed as doubles and cut into four quarters
+  %oldValue = load <16 x i64>* %ptr, align 8
+  %old = bitcast <16 x i64> %oldValue to <16 x double>
+  %old0d = shufflevector <16 x double> %old, <16 x double> undef,
+     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %old1d = shufflevector <16 x double> %old, <16 x double> undef,
+     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %old2d = shufflevector <16 x double> %old, <16 x double> undef,
+     <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %old3d = shufflevector <16 x double> %old, <16 x double> undef,
+     <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+
+  ; the new values, cut up the same way
+  %new = bitcast <16 x i64> %newi64 to <16 x double>
+  %new0d = shufflevector <16 x double> %new, <16 x double> undef,
+     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %new1d = shufflevector <16 x double> %new, <16 x double> undef,
+     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %new2d = shufflevector <16 x double> %new, <16 x double> undef,
+     <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %new3d = shufflevector <16 x double> %new, <16 x double> undef,
+     <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+
+  ; write every 32-bit mask lane twice in a row so that, viewed as
+  ; <4 x double>, each 64-bit mask element holds its lane's mask bits in
+  ; both 32-bit halves
+  %mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %mask1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  %mask2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
+  %mask3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
+  %mask1d = bitcast <8 x i32> %mask1 to <4 x double>
+  %mask2d = bitcast <8 x i32> %mask2 to <4 x double>
+  %mask3d = bitcast <8 x i32> %mask3 to <4 x double>
+
+  ; per-quarter blend, operands in (old, new, mask) order
+  %result0d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old0d,
+                                 <4 x double> %new0d, <4 x double> %mask0d)
+  %result1d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old1d,
+                                 <4 x double> %new1d, <4 x double> %mask1d)
+  %result2d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old2d,
+                                 <4 x double> %new2d, <4 x double> %mask2d)
+  %result3d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old3d,
+                                 <4 x double> %new3d, <4 x double> %mask3d)
+
+  ; stitch the quarters back together: 4+4 -> 8, then 8+8 -> 16
+  %result01 = shufflevector <4 x double> %result0d, <4 x double> %result1d,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %result23 = shufflevector <4 x double> %result2d, <4 x double> %result3d,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+  %result = shufflevector <8 x double> %result01, <8 x double> %result23,
+           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %result64 = bitcast <16 x double> %result to <16 x i64>
+  ; Spell out the same 8-byte alignment the load above assumes.  With the
+  ; align omitted, the store would claim the ABI alignment of the whole
+  ; <16 x i64> type, which the destination is not guaranteed to have.
+  store <16 x i64> %result64, <16 x i64> * %ptr, align 8
+  ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather/scatter
+
+;; Instantiate the gather builtins and then the scatter builtins for this
+;; 16-wide target, once per integer element type.
+;; NOTE(review): arguments appear to be vector width and element type; the
+;; macro bodies are not in this section -- confirm there.
+gen_gather(16, i8)
+gen_gather(16, i16)
+gen_gather(16, i32)
+gen_gather(16, i64)
+
+gen_scatter(16, i8)
+gen_scatter(16, i16)
+gen_scatter(16, i32)
+gen_scatter(16, i64)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+;; 4-wide AVX double-precision square root intrinsic.
+declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
+
+;; 16-wide double-precision square root built on the 4-wide intrinsic.
+;; The m4 macro invoked below is given a result name (ret), the element
+;; type, the intrinsic and the input %0; the function then returns %ret.
+;; NOTE(review): presumably the macro applies the intrinsic to four 4-lane
+;; slices of %0 and reassembles them into %ret -- confirm in its source.
+define internal <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alwaysinline {
+  unary4to16(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
+  ret <16 x double> %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+;; 4-wide AVX double-precision max and min intrinsics; each takes two
+;; <4 x double> operands and returns a <4 x double>.
+declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
+declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+;; 16-wide double-precision min built on the 4-wide AVX min.pd intrinsic.
+;; The m4 macro invoked below is given a result name (ret), the element
+;; type, the intrinsic and the two inputs %0 and %1; the function then
+;; returns %ret.
+;; NOTE(review): presumably the macro applies the intrinsic to matching
+;; 4-lane slices of both inputs and reassembles %ret -- confirm in its source.
+define internal <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
+  binary4to16(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
+  ret <16 x double> %ret
+}
+
+;; 16-wide double-precision max built on the 4-wide AVX max.pd intrinsic.
+;; The m4 macro invoked below is given a result name (ret), the element
+;; type, the intrinsic and the two inputs %0 and %1; the function then
+;; returns %ret.
+;; NOTE(review): presumably the macro applies the intrinsic to matching
+;; 4-lane slices of both inputs and reassembles %ret -- confirm in its source.
+define internal <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
+  binary4to16(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
+  ret <16 x double> %ret
+}
--- a/builtins-avx.ll
+++ b/builtins-avx.ll
@@ -41,13 +41,15 @@

 stdlib_core(8)
 packed_load_and_store(8)
+scans(8)
 int64minmax(8)

+include(`builtins-avx-common.ll')
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp

 declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
-declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone

 define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ;  float iv = __rcp_v(v);
@@ -62,25 +64,10 @@ define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly
  ret <8 x float> %iv_mul
 }

-define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
-;    uniform float iv = extract(__rcp_u(v), 0);
-;    return iv * (2. - v * iv);
-  %vecval = insertelement <4 x float> undef, float %0, i32 0
-  %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
-  %scall = extractelement <4 x float> %call, i32 0
-
-  ; do one N-R iteration
-  %v_iv = fmul float %0, %scall
-  %two_minus = fsub float 2., %v_iv  
-  %iv_mul = fmul float %scall, %two_minus
-  ret float %iv_mul
-}
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding floats

 declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
-declare <4 x float> @llvm.x86.sse.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone

 define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
@@ -88,111 +75,43 @@ define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonl
  ret <8 x float> %call
 }

-define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
-  ; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
-  ; the roundss intrinsic is a total mess--docs say:
-  ;
-  ;  __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
-  ;       
-  ;  b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
-  ;  on b0. The higher order 96 bits are copied directly from input parameter a. The
-  ;  return value is described by the following equations:
-  ;
-  ;  r0 = RND(b0)
-  ;  r1 = a1
-  ;  r2 = a2
-  ;  r3 = a3
-  ;
-  ;  It doesn't matter what we pass as a, since we only need the r0 value
-  ;  here.  So we pass the same register for both.
-  %xi = insertelement <4 x float> undef, float %0, i32 0
-  %xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
-  %rs = extractelement <4 x float> %xr, i32 0
-  ret float %rs
-}
-
 define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
-  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 0b1001 = 9
  %call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 9)
  ret <8 x float> %call
 }

-define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
-  ; see above for round_ss instrinsic discussion...
-  %xi = insertelement <4 x float> undef, float %0, i32 0
-  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
-  %xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
-  %rs = extractelement <4 x float> %xr, i32 0
-  ret float %rs
-}
-
 define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 0b1010 = 10
  %call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 10)
  ret <8 x float> %call
 }

-define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
-  ; see above for round_ss instrinsic discussion...
-  %xi = insertelement <4 x float> undef, float %0, i32 0
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
-  %xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
-  %rs = extractelement <4 x float> %xr, i32 0
-  ret float %rs
-}
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding doubles

 declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
-declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone

 define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
  round4to8double(%0, 8)
 }

-define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
-  %xi = insertelement <2 x double> undef, double %0, i32 0
-  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
-  %rs = extractelement <2 x double> %xr, i32 0
-  ret double %rs
-}
-
 define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
  round4to8double(%0, 9)
 }

-define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
-  ; see above for round_ss instrinsic discussion...
-  %xi = insertelement <2 x double> undef, double %0, i32 0
-  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
-  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
-  %rs = extractelement <2 x double> %xr, i32 0
-  ret double %rs
-}

 define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
  round4to8double(%0, 10)
 }

-define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
-  ; see above for round_ss instrinsic discussion...
-  %xi = insertelement <2 x double> undef, double %0, i32 0
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
-  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
-  %rs = extractelement <2 x double> %xr, i32 0
-  ret double %rs
-}
-
-

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rsqrt

 declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
-declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone

 define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
  ;  float is = __rsqrt_v(v);
@@ -200,64 +119,24 @@ define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind read
  ;  return 0.5 * is * (3. - (v * is) * is);
  %v_is = fmul <8 x float> %v, %is
  %v_is_is = fmul <8 x float> %v_is, %is
-  %three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3., float 3., float 3., float 3., float 3.>, %v_is_is
+  %three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3.,
+                                 float 3., float 3., float 3., float 3.>, %v_is_is
  %is_mul = fmul <8 x float> %is, %three_sub
-  %half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
+  %half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
+                                  float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
  ret <8 x float> %half_scale
 }

-define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
-  ;  uniform float is = extract(__rsqrt_u(v), 0);
-  %v = insertelement <4 x float> undef, float %0, i32 0
-  %vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
-  %is = extractelement <4 x float> %vis, i32 0
-
-  ;  return 0.5 * is * (3. - (v * is) * is);
-  %v_is = fmul float %0, %is
-  %v_is_is = fmul float %v_is, %is
-  %three_sub = fsub float 3., %v_is_is
-  %is_mul = fmul float %is, %three_sub
-  %half_scale = fmul float 0.5, %is_mul
-  ret float %half_scale
-}
-
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; sqrt

 declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
-declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone

 define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
  %call = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %0)
  ret <8 x float> %call
 }

-define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
-  sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
-  ret float %ret
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; fastmath
-
-declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
-declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
-
-define internal void @__fastmath() nounwind alwaysinline {
-  %ptr = alloca i32
-  %ptr8 = bitcast i32 * %ptr to i8 *
-  call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
-  %oldval = load i32 *%ptr
-
-  ; turn on DAZ (64)/FTZ (32768) -> 32832
-  %update = or i32 %oldval, 32832
-  store i32 %update, i32 *%ptr
-  call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
-  ret void
-}
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; svml

@@ -279,9 +158,7 @@ declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
 ;; float min/max

 declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
-declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
 declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
-declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone

 define internal <8 x float> @__max_varying_float(<8 x float>,
                                                 <8 x float>) nounwind readonly alwaysinline {
@@ -289,97 +166,43 @@ define internal <8 x float> @__max_varying_float(<8 x float>,
  ret <8 x float> %call
 }

-define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
-  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
-  ret float %ret
-}
-
 define internal <8 x float> @__min_varying_float(<8 x float>,
                                                 <8 x float>) nounwind readonly alwaysinline {
  %call = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %0, <8 x float> %1)
  ret <8 x float> %call
 }

-define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
-  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
-  ret float %ret
-}
-

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int min/max

-declare <8 x i32> @llvm.x86.avx.min.sd.256(<8 x i32>, <8 x i32>) nounwind readnone
-declare <8 x i32> @llvm.x86.avx.max.sd.256(<8 x i32>, <8 x i32>) nounwind readnone
-
 define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-  %call = call <8 x i32> @llvm.x86.avx.min.sd.256(<8 x i32> %0, <8 x i32> %1)
-  ret <8 x i32> %call
-}
-
-define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
-  sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.min.sd.256, %0, %1)
-  ret i32 %ret
+  binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
+  ret <8 x i32> %ret
 }

 define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-  %call = call <8 x i32> @llvm.x86.avx.max.sd.256(<8 x i32> %0, <8 x i32> %1)
-  ret <8 x i32> %call
-}
-
-define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
-  sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.max.sd.256, %0, %1)
-  ret i32 %ret
+  binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
+  ret <8 x i32> %ret
 }


 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unsigned int min/max

-; FIXME: looks like these aren't available in LLVM?
-declare <8 x i32> @llvm.x86.avx.min.ud.256(<8 x i32>, <8 x i32>) nounwind readnone
-declare <8 x i32> @llvm.x86.avx.max.ud.256(<8 x i32>, <8 x i32>) nounwind readnone
-
-define internal <8 x i32> @__min_varying_uint32(<8 x i32>,
-                                                <8 x i32>) nounwind readonly alwaysinline {
-  %call = call <8 x i32> @llvm.x86.avx.min.ud.256(<8 x i32> %0, <8 x i32> %1)
-  ret <8 x i32> %call
+define internal <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
+  ret <8 x i32> %ret
 }

-define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
-  sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.min.ud.256, %0, %1)
-  ret i32 %ret
+define internal <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
+  ret <8 x i32> %ret
 }

-define internal <8 x i32> @__max_varying_uint32(<8 x i32>,
-                                                <8 x i32>) nounwind readonly alwaysinline {
-  %call = call <8 x i32> @llvm.x86.avx.max.ud.256(<8 x i32> %0, <8 x i32> %1)
-  ret <8 x i32> %call
-}
-
-define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
-  sse_binary_scalar(ret, 8, i32, @llvm.x86.avx.max.ud.256, %0, %1)
-  ret i32 %ret
-}
-
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops

-declare i32 @llvm.ctpop.i32(i32) nounwind readnone
-
-define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
-  %call = call i32 @llvm.ctpop.i32(i32 %0)
-  ret i32 %call
-}
-
-declare i64 @llvm.ctpop.i64(i64) nounwind readnone
-
-define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
-  %call = call i64 @llvm.ctpop.i64(i64 %0)
-  ret i64 %call
-}
-
 declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone

 define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
@@ -412,6 +235,7 @@ define internal float @__reduce_max_float(<8 x float>) nounwind readnone alwaysi
  reduce8(float, @__max_varying_float, @__max_uniform_float)
 }

+reduce_equal(8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal int32 ops
@@ -472,9 +296,10 @@ define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwa
                      <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
  %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
-  %scalar1 = extractelement <4 x double> %sum0, i32 0
-  %scalar2 = extractelement <4 x double> %sum1, i32 1
-  %sum = fadd double %scalar1, %scalar2
+  %final0 = extractelement <4 x double> %sum1, i32 0
+  %final1 = extractelement <4 x double> %sum1, i32 2
+  %sum = fadd double %final0, %final1
+
  ret double %sum
 }

@@ -624,13 +449,14 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
  ret void
 }

+
+
 masked_store_blend_8_16_by_8()

 declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
                                                <8 x float>) nounwind readnone

-
-define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
+define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>, 
                                     <8 x i32>) nounwind alwaysinline {
  %mask_as_float = bitcast <8 x i32> %2 to <8 x float>
  %oldValue = load <8 x i32>* %0, align 4
@@ -645,7 +471,7 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
 }


-define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
+define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new, 
                                     <8 x i32> %i32mask) nounwind alwaysinline {
  %oldValue = load <8 x i64>* %ptr, align 8
  %mask = bitcast <8 x i32> %i32mask to <8 x float>
@@ -695,6 +521,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
  ret void
 }

+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter

@@ -712,43 +539,26 @@ gen_scatter(8, i64)
 ;; double precision sqrt

 declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse.sqrt.sd(<2 x double>) nounwind readnone

 define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
  unary4to8(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
  ret <8 x double> %ret
 }

-define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
-  sse_unary_scalar(ret, 2, double, @llvm.x86.sse.sqrt.sd, %0)
-  ret double %ret
-}
-

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; double precision min/max

 declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse.max.sd(<2 x double>, <2 x double>) nounwind readnone
 declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse.min.sd(<2 x double>, <2 x double>) nounwind readnone

 define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
  binary4to8(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
  ret <8 x double> %ret
 }

-define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
-  sse_binary_scalar(ret, 2, double, @llvm.x86.sse.min.sd, %0, %1)
-  ret double %ret
-}
-
 define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
  binary4to8(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
  ret <8 x double> %ret
 }

-define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
-  sse_binary_scalar(ret, 2, double, @llvm.x86.sse.max.sd, %0, %1)
-  ret double %ret
-}
--- a/builtins-sse.ll
+++ b/builtins-sse.ll
@@ -376,6 +376,7 @@ define internal i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
  reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
 }

+reduce_equal(4)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store
--- a/builtins-sse2.ll
+++ b/builtins-sse2.ll
@@ -35,6 +35,7 @@
 ; Define some basics for a 4-wide target
 stdlib_core(4)
 packed_load_and_store(4)
+scans(4)

 ; Include the various definitions of things that only require SSE1 and SSE2
 include(`builtins-sse.ll')
@@ -276,41 +277,17 @@ define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinli
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions

-; FIXME: this is very inefficient, loops over all 32 bits...
-
-; we could use the LLVM intrinsic declare i32 @llvm.ctpop.i32(i32),
-; although that currently ends up generating a POPCNT instruction even
-; if we give --target=sse2 on the command line.  We probably need to
-; pipe through the 'sse2' request to LLVM via the 'features' string
-; at codegen time...  (If e.g. --cpu=penryn is also passed along, then
-; it does generate non-POPCNT code and in particular better code than
-; the below does.)
+declare i32 @llvm.ctpop.i32(i32)
+declare i64 @llvm.ctpop.i64(i64)

 define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
-entry:
-  br label %loop
-
-loop:
-  %count = phi i32 [ 0, %entry ], [ %newcount, %loop ]
-  %val = phi i32 [ %0, %entry ], [ %newval, %loop ]
-  %delta = and i32 %val, 1
-  %newcount = add i32 %count, %delta
-  %newval = lshr i32 %val, 1
-  %done = icmp eq i32 %newval, 0
-  br i1 %done, label %exit, label %loop
-
-exit:
-  ret i32 %newcount
+  %val = call i32 @llvm.ctpop.i32(i32 %0)
+  ret i32 %val
 }

-define internal i32 @__popcnt_int64(i64) nounwind readnone alwaysinline {
-  %vec = bitcast i64 %0 to <2 x i32>
-  %v0 = extractelement <2 x i32> %vec, i32 0
-  %v1 = extractelement <2 x i32> %vec, i32 1
-  %c0 = call i32 @__popcnt_int32(i32 %v0)
-  %c1 = call i32 @__popcnt_int32(i32 %v1)
-  %sum = add i32 %c0, %c1
-  ret i32 %sum
+define internal i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
+  %val = call i64 @llvm.ctpop.i64(i64 %0)
+  ret i64 %val
 }


--- a/builtins-sse4.ll
+++ b/builtins-sse4.ll
@@ -35,6 +35,7 @@
 ; Define common 4-wide stuff
 stdlib_core(4)
 packed_load_and_store(4)
+scans(4)

 ; Define the stuff that can be done with base SSE1/SSE2 instructions
 include(`builtins-sse.ll')
@@ -76,7 +77,7 @@ define internal float @__round_uniform_float(float) nounwind readonly alwaysinli
 }

 define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
-  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
  ret <4 x float> %call
 }
@@ -84,14 +85,14 @@ define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonl
 define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
  ; see above for round_ss instrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
-  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
 }

 define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
  ret <4 x float> %call
 }
@@ -99,7 +100,7 @@ define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly
 define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
  ; see above for round_ss instrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
@@ -123,28 +124,28 @@ define internal double @__round_uniform_double(double) nounwind readonly alwaysi
 }

 define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
-  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  round2to4double(%0, 9)
 }

 define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
  ; see above for round_ss instrinsic discussion...
  %xi = insertelement <2 x double> undef, double %0, i32 0
-  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
  %rs = extractelement <2 x double> %xr, i32 0
  ret double %rs
 }

 define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
-  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  round2to4double(%0, 10)
 }

 define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
  ; see above for round_ss instrinsic discussion...
  %xi = insertelement <2 x double> undef, double %0, i32 0
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
  %rs = extractelement <2 x double> %xr, i32 0
  ret double %rs
@@ -229,7 +230,6 @@ define internal float @__reduce_add_float(<4 x float>) nounwind readonly alwaysi
  ret float %scalar
 }

-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store

--- a/builtins-sse4x2.ll
+++ b/builtins-sse4x2.ll
@@ -38,6 +38,7 @@

 stdlib_core(8)
 packed_load_and_store(8)
+scans(8)
 int64minmax(8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -434,6 +435,8 @@ define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
  reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
 }

+reduce_equal(8)
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts

@@ -495,28 +498,28 @@ define internal float @__round_uniform_float(float) nounwind readonly alwaysinli
 }

 define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
-  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  round4to8(%0, 9)
 }

 define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
  ; see above for round_ss instrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
-  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
 }

 define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  round4to8(%0, 10)
 }

 define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
  ; see above for round_ss instrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
@@ -540,28 +543,28 @@ define internal double @__round_uniform_double(double) nounwind readonly alwaysi
 }

 define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
-  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  round2to8double(%0, 9)
 }

 define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
  ; see above for round_ss instrinsic discussion...
  %xi = insertelement <2 x double> undef, double %0, i32 0
-  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
  %rs = extractelement <2 x double> %xr, i32 0
  ret double %rs
 }

 define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
-  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  round2to8double(%0, 10)
 }

 define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
  ; see above for round_ss instrinsic discussion...
  %xi = insertelement <2 x double> undef, double %0, i32 0
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
  %rs = extractelement <2 x double> %xr, i32 0
  ret double %rs
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -54,6 +54,8 @@
 #include <llvm/Instructions.h>
 #include <llvm/Intrinsics.h>
 #include <llvm/Linker.h>
+#include <llvm/Target/TargetMachine.h>
+#include <llvm/ADT/Triple.h>
 #include <llvm/Support/MemoryBuffer.h>
 #include <llvm/Bitcode/ReaderWriter.h>

@@ -170,6 +172,27 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
 }


+static void
+lCreateSymbol(const std::string &name, const Type *returnType, 
+              const std::vector<const Type *> &argTypes, 
+              const llvm::FunctionType *ftype, llvm::Function *func, 
+              SymbolTable *symbolTable) {
+    SourcePos noPos;
+    noPos.name = "__stdlib";
+
+    FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
+    // set NULL default arguments
+    std::vector<ConstExpr *> defaults;
+    for (unsigned int j = 0; j < ftype->getNumParams(); ++j)
+        defaults.push_back(NULL);
+    funcType->SetArgumentDefaults(defaults);
+
+    Symbol *sym = new Symbol(name, noPos, funcType);
+    sym->function = func;
+    symbolTable->AddFunction(sym);
+}
+
+
 /** Given an LLVM function declaration, synthesize the equivalent ispc
    symbol for the function (if possible).  Returns true on success, false
    on failure.
@@ -221,7 +244,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {

        // Iterate over the arguments and try to find their equivalent ispc
        // types.  Track if any of the arguments has an integer type.
-        bool anyIntArgs = false;
+        bool anyIntArgs = false, anyReferenceArgs = false;
        std::vector<const Type *> argTypes;
        for (unsigned int j = 0; j < ftype->getNumParams(); ++j) {
            const llvm::Type *llvmArgType = ftype->getParamType(j);
@@ -230,22 +253,26 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
                return false;
            anyIntArgs |= 
                (Type::Equal(type, lLLVMTypeToISPCType(llvmArgType, !intAsUnsigned)) == false);
+            anyReferenceArgs |= (dynamic_cast<const ReferenceType *>(type) != NULL);
            argTypes.push_back(type);
        }

        // Always create the symbol the first time through, in particular
        // so that we get symbols for things with no integer types!
-        if (i == 0 || anyIntArgs == true) {
-            FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
-            // set NULL default arguments
-            std::vector<ConstExpr *> defaults;
-            for (unsigned int j = 0; j < ftype->getNumParams(); ++j)
-                defaults.push_back(NULL);
-            funcType->SetArgumentDefaults(defaults);
+        if (i == 0 || anyIntArgs == true)
+            lCreateSymbol(name, returnType, argTypes, ftype, func, symbolTable);

-            Symbol *sym = new Symbol(name, noPos, funcType);
-            sym->function = func;
-            symbolTable->AddFunction(sym);
+        // If there are any reference types, also make a variant of the
+        // symbol that has them as const references.  This obviously
+        // doesn't make sense for many builtins, but we'll give the stdlib
+        // the option to call one if it needs one.
+        if (anyReferenceArgs == true) {
+            // Constify every reference argument first, then register the variant once.
+            for (unsigned int j = 0; j < argTypes.size(); ++j)
+                if (dynamic_cast<const ReferenceType *>(argTypes[j]) != NULL)
+                    argTypes[j] = argTypes[j]->GetAsConstType();
+            lCreateSymbol(name + "_refsconst", returnType, argTypes,
+                          ftype, func, symbolTable);
         }
    }

@@ -319,6 +346,22 @@ lAddBitcode(const unsigned char *bitcode, int length,
    if (!bcModule)
        Error(SourcePos(), "Error parsing stdlib bitcode: %s", bcErr.c_str());
    else {
+        // FIXME: this feels like a bad idea, but the issue is that when we
+        // set the llvm::Module's target triple in the ispc Module::Module
+        // constructor, we start by calling llvm::sys::getHostTriple() (and
+        // then change the arch if needed).  Somehow that ends up giving us
+        // strings like 'x86_64-apple-darwin11.0.0', while the stuff we
+        // compile to bitcode with clang has module triples like
+        // 'i386-apple-macosx10.7.0'.  And then LLVM issues a warning about
+        // linking together modules with incompatible target triples..
+        llvm::Triple mTriple(m->module->getTargetTriple());
+        llvm::Triple bcTriple(bcModule->getTargetTriple());
+        assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
+               mTriple.getArch() == bcTriple.getArch());
+        assert(bcTriple.getVendor() == llvm::Triple::UnknownVendor ||
+               mTriple.getVendor() == bcTriple.getVendor());
+        bcModule->setTargetTriple(mTriple.str());
+
        std::string(linkError);
        if (llvm::Linker::LinkModules(module, bcModule, &linkError))
            Error(SourcePos(), "Error linking stdlib bitcode: %s", linkError.c_str());
@@ -346,6 +389,27 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
 }


+/** Emit an always-inline body returning 'val' for the pre-declared LLVM function 'name', and add a symbol for it. */
+static void
+lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
+                       SymbolTable *symbolTable) {
+    std::vector<const Type *> args;
+    FunctionType *ft = new FunctionType(AtomicType::UniformInt32, args, SourcePos());
+    Symbol *sym = new Symbol(name, SourcePos(), ft);
+    sym->isStatic = true;
+
+    llvm::Function *func = module->getFunction(name);
+    assert(func != NULL); // it should be declared already...
+    func->addFnAttr(llvm::Attribute::AlwaysInline);
+    llvm::BasicBlock *bblock = llvm::BasicBlock::Create(*g->ctx, "entry", func, 0);
+    llvm::ReturnInst::Create(*g->ctx, LLVMInt32(val), bblock);
+
+    sym->function = func;
+    symbolTable->AddVariable(sym);
+}
+
+
+
 static void
 lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
    Symbol *pidx = new Symbol("programIndex", SourcePos(), 
@@ -370,9 +434,18 @@ void
 DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
             bool includeStdlibISPC) {
    // Add the definitions from the compiled builtins-c.c file
-    extern unsigned char builtins_bitcode_c[];
-    extern int builtins_bitcode_c_length;
-    lAddBitcode(builtins_bitcode_c, builtins_bitcode_c_length, module, symbolTable);
+    if (g->target.is32bit) {
+        extern unsigned char builtins_bitcode_c_32[];
+        extern int builtins_bitcode_c_32_length;
+        lAddBitcode(builtins_bitcode_c_32, builtins_bitcode_c_32_length, 
+                    module, symbolTable);
+    }
+    else {
+        extern unsigned char builtins_bitcode_c_64[];
+        extern int builtins_bitcode_c_64_length;
+        lAddBitcode(builtins_bitcode_c_64, builtins_bitcode_c_64_length, 
+                    module, symbolTable);
+    }

    // Next, add the target's custom implementations of the various needed
    // builtin functions (e.g. __masked_store_32(), etc).
@@ -402,10 +475,22 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
        }
        break;
    case Target::AVX:
-        extern unsigned char builtins_bitcode_avx[];
-        extern int builtins_bitcode_avx_length;
-        lAddBitcode(builtins_bitcode_avx, builtins_bitcode_avx_length, module, 
-                    symbolTable);
+        switch (g->target.vectorWidth) {
+        case 8:
+            extern unsigned char builtins_bitcode_avx[];
+            extern int builtins_bitcode_avx_length;
+            lAddBitcode(builtins_bitcode_avx, builtins_bitcode_avx_length, module, 
+                        symbolTable);
+            break;
+        case 16:
+            extern unsigned char builtins_bitcode_avx_x2[];
+            extern int builtins_bitcode_avx_x2_length;
+            lAddBitcode(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
+                        module,  symbolTable);
+            break;
+        default:
+            FATAL("logic error in DefineStdlib");
+        }
        break;
    default:
        FATAL("logic error");
@@ -428,6 +513,8 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
                       symbolTable);
    lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module,
                       symbolTable);
+    lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
+                           symbolTable);

    if (includeStdlibISPC) {
        // If the user wants the standard library to be included, parse the
--- a/builtins.m4
+++ b/builtins.m4
@@ -111,6 +111,32 @@ define(`reduce8', `
 '
 )

+define(`reduce16', `
+  %v1 = shufflevector <16 x $1> %0, <16 x $1> undef,
+        <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef>
+  %m1 = call <16 x $1> $2(<16 x $1> %v1, <16 x $1> %0)
+  %v2 = shufflevector <16 x $1> %m1, <16 x $1> undef,
+        <16 x i32> <i32 4, i32 5, i32 6, i32 7,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef>
+  %m2 = call <16 x $1> $2(<16 x $1> %v2, <16 x $1> %m1)
+  %v3 = shufflevector <16 x $1> %m2, <16 x $1> undef,
+        <16 x i32> <i32 2, i32 3, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef>
+  %m3 = call <16 x $1> $2(<16 x $1> %v3, <16 x $1> %m2)
+
+  %m3a = extractelement <16 x $1> %m3, i32 0
+  %m3b = extractelement <16 x $1> %m3, i32 1
+  %m = call $1 $3($1 %m3a, $1 %m3b)
+  ret $1 %m
+'
+)
+
 ;; Do an reduction over an 8-wide vector, using a vector reduction function
 ;; that only takes 4-wide vectors
 ;; $1: type of final scalar result
@@ -211,6 +237,45 @@ define(`unary4to8', `
 '
 )

+define(`unary4to16', `
+  %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0)
+  %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1)
+  %$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %v$1_2 = call <4 x $2> $3(<4 x $2> %$1_2)
+  %$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  %v$1_3 = call <4 x $2> $3(<4 x $2> %$1_3)
+
+  %$1a = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, 
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %$1b = shufflevector <4 x $2> %v$1_2, <4 x $2> %v$1_3, 
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %$1 = shufflevector <8 x $2> %$1a, <8 x $2> %$1b,
+           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+'
+)
+
+;; And so forth...
+;; $1: name of variable into which the final result should go
+;; $2: scalar type of the vector elements
+;; $3: 8-wide unary vector function to apply
+;; $4: 16-wide operand value
+
+define(`unary8to16', `
+  %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef,
+             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v$1_0 = call <8 x $2> $3(<8 x $2> %$1_0)
+  %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef,
+             <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %v$1_1 = call <8 x $2> $3(<8 x $2> %$1_1)
+  %$1 = shufflevector <8 x $2> %v$1_0, <8 x $2> %v$1_1, 
+           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+'
+)
+
 ;; And along the lines of `binary2to4', this maps a 4-wide binary function to
 ;; two 8-wide vector operands
 ;; $1: name of variable into which the final result should go
@@ -231,6 +296,57 @@ define(`binary4to8', `
 '
 )

+define(`binary8to16', `
+%$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef,
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+%$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef,
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+%v$1_0 = call <8 x $2> $3(<8 x $2> %$1_0a, <8 x $2> %$1_0b)
+%$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef,
+          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+%$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef,
+          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+%v$1_1 = call <8 x $2> $3(<8 x $2> %$1_1a, <8 x $2> %$1_1b)
+%$1 = shufflevector <8 x $2> %v$1_0, <8 x $2> %v$1_1, 
+         <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                     i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+'
+)
+
+define(`binary4to16', `
+%$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+%$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+%r$1_0 = call <4 x $2> $3(<4 x $2> %$1_0a, <4 x $2> %$1_0b) 
+
+%$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef,
+          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+%$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef,
+          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+%r$1_1 = call <4 x $2> $3(<4 x $2> %$1_1a, <4 x $2> %$1_1b) 
+
+%$1_2a = shufflevector <16 x $2> $4, <16 x $2> undef,
+          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+%$1_2b = shufflevector <16 x $2> $5, <16 x $2> undef,
+          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+%r$1_2 = call <4 x $2> $3(<4 x $2> %$1_2a, <4 x $2> %$1_2b) 
+
+%$1_3a = shufflevector <16 x $2> $4, <16 x $2> undef,
+          <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+%$1_3b = shufflevector <16 x $2> $5, <16 x $2> undef,
+          <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+%r$1_3 = call <4 x $2> $3(<4 x $2> %$1_3a, <4 x $2> %$1_3b)
+
+%r$1_01 = shufflevector <4 x $2> %r$1_0, <4 x $2> %r$1_1, 
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+%r$1_23 = shufflevector <4 x $2> %r$1_2, <4 x $2> %r$1_3, 
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+%$1 = shufflevector <8 x $2> %r$1_01, <8 x $2> %r$1_23, 
+          <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                      i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+')

 ;; Maps a 2-wide unary function to an 8-wide vector operand, returning an 
 ;; 8-wide vector result
@@ -306,6 +422,20 @@ ret <8 x float> %ret
 '
 )

+define(`round8to16', `
+%v0 = shufflevector <16 x float> $1, <16 x float> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+%v1 = shufflevector <16 x float> $1, <16 x float> undef,
+        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+%r0 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %v0, i32 $2)
+%r1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %v1, i32 $2)
+%ret = shufflevector <8 x float> %r0, <8 x float> %r1, 
+         <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                     i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ret <16 x float> %ret
+'
+)
+
 define(`round4to8double', `
 %v0 = shufflevector <8 x double> $1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 %v1 = shufflevector <8 x double> $1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -349,6 +479,30 @@ ret <8 x double> %ret
 '
 )

+define(`round4to16double', `
+%v0 = shufflevector <16 x double> $1, <16 x double> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+%v1 = shufflevector <16 x double> $1, <16 x double> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+%v2 = shufflevector <16 x double> $1, <16 x double> undef,
+         <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+%v3 = shufflevector <16 x double> $1, <16 x double> undef,
+         <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+%r0 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v0, i32 $2)
+%r1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v1, i32 $2)
+%r2 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v2, i32 $2)
+%r3 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v3, i32 $2)
+%ret0 = shufflevector <4 x double> %r0, <4 x double> %r1, 
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+%ret1 = shufflevector <4 x double> %r2, <4 x double> %r3, 
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+%ret = shufflevector <8 x double> %ret0, <8 x double> %ret1,
+          <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                      i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ret <16 x double> %ret
+'
+)
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; forloop macro

@@ -502,6 +656,108 @@ define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
 }
 ')

+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; global_atomic_associative
+;; More efficient implementation for atomics that are associative (e.g.,
+;; add, and, ...).  If a basic implementation would do something like:
+;; result0 = atomic_op(ptr, val0)
+;; result1 = atomic_op(ptr, val1)
+;; ..
+;; Then instead we can do:
+;; tmp = (val0 op val1 op ...)
+;; result0 = atomic_op(ptr, tmp)
+;; result1 = (result0 op val0)
+;; ..
+;; And more efficiently compute the same result
+;;
+;; Takes five parameters:
+;; $1: vector width of the target
+;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
+;;     (add, sub...)
+;; $3: return type of the LLVM atomic (e.g. i32)
+;; $4: return type of the LLVM atomic type, in ispc naming parlance (e.g. int32)
+;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...)
+
+define(`global_atomic_associative', `
+
+declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta)
+
+;; note that the mask is expected to be of type $3, so the caller must ensure
+;; that for 64-bit types, the mask is cast to a signed int before being passed
+;; to this so that it is properly sign extended...  (The code in stdlib.ispc
+;; does do this..)
+
+define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
+                                                 <$1 x $3> %mask) nounwind alwaysinline {
+  ; first, for any lanes where the mask is off, compute a vector where those lanes
+  ; hold the identity value..
+
+  ; zero out any lanes that are off
+  %valoff = and <$1 x $3> %val, %mask
+
+  ; compute an identity vector that is zero in the on lanes and has the identity value
+  ; in the off lanes
+  %idv1 = bitcast $3 $5 to <1 x $3>
+  %idvec = shufflevector <1 x $3> %idv1, <1 x $3> undef,
+     <$1 x i32> < forloop(i, 1, eval($1-1), `i32 0, ') i32 0 >
+  %notmask = xor <$1 x $3> %mask, < forloop(i, 1, eval($1-1), `$3 -1, ') $3 -1 >
+  %idoff = and <$1 x $3> %idvec, %notmask
+
+  ; and compute the merged vector that holds the identity in the off lanes
+  %valp = or <$1 x $3> %valoff, %idoff
+
+  ; now compute the local reduction (val0 op val1 op ... )--initialize
+  ; %eltvec so that the 0th element is the identity, the first is val0,
+  ; the second is (val0 op val1), ..
+  %red0 = extractelement <$1 x $3> %valp, i32 0
+  %eltvec0 = insertelement <$1 x $3> undef, $3 $5, i32 0
+
+  forloop(i, 1, eval($1-1), `
+  %elt`'i = extractelement <$1 x $3> %valp, i32 i
+  %red`'i = $2 $3 %red`'eval(i-1), %elt`'i
+  %eltvec`'i = insertelement <$1 x $3> %eltvec`'eval(i-1), $3 %red`'eval(i-1), i32 i')
+
+  ; make the atomic call, passing it the final reduced value
+  %final0 = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %red`'eval($1-1))
+
+  ; now go back and compute the values to be returned for each program 
+  ; instance--this just involves smearing the old value returned from the
+  ; actual atomic call across the vector and applying the vector op to the
+  ; %eltvec vector computed above..
+  %finalv1 = bitcast $3 %final0 to <1 x $3>
+  %final_base = shufflevector <1 x $3> %finalv1, <1 x $3> undef,
+     <$1 x i32> < forloop(i, 1, eval($1-1), `i32 0, ') i32 0 >
+  %r = $2 <$1 x $3> %final_base, %eltvec`'eval($1-1)
+
+  ret <$1 x $3> %r
+}
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; global_atomic_uniform
+;; Defines the implementation of a function that handles the mapping from
+;; an ispc atomic function to the underlying LLVM intrinsics.  This variant
+;; just calls the atomic once, for the given uniform value
+;;
+;; Takes four parameters:
+;; $1: vector width of the target
+;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
+;;     (add, sub...)
+;; $3: return type of the LLVM atomic (e.g. i32)
+;; $4: return type of the LLVM atomic type, in ispc naming parlance (e.g. int32)
+
+define(`global_atomic_uniform', `
+
+declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta)
+
+define internal $3 @__atomic_$2_$4_global($3 * %ptr, $3 %val,
+                                          <$1 x i32> %mask) nounwind alwaysinline {
+  %r = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %val)
+  ret $3 %r
+}
+')
+
 ;; Macro to declare the function that implements the swap atomic.  
 ;; Takes three parameters:
 ;; $1: vector width of the target
@@ -557,9 +813,46 @@ define internal <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $
 }
 ')

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; prefetch definitions
+
+; prefetch has a new parameter in LLVM3.0, to distinguish between instruction
+; and data caches--the declaration is now:
+; declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality,
+;                             i32 %cachetype)  (cachetype 1 == data cache)
+; however, the version below seems to still work...
+; the _1, _2, _3 and _nt variants below pass locality 3, 2, 1 and 0 respectively
+declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality)
+; the macro below takes $1: suffix for the generated function names, $2: LLVM type pointed to
+define(`prefetch_read', `
+define internal void @__prefetch_read_1_$1($2 *) alwaysinline {
+  %ptr8 = bitcast $2 * %0 to i8 *
+  call void @llvm.prefetch(i8 * %ptr8, i32 0, i32 3)
+  ret void
+}
+define internal void @__prefetch_read_2_$1($2 *) alwaysinline {
+  %ptr8 = bitcast $2 * %0 to i8 *
+  call void @llvm.prefetch(i8 * %ptr8, i32 0, i32 2)
+  ret void
+}
+define internal void @__prefetch_read_3_$1($2 *) alwaysinline {
+  %ptr8 = bitcast $2 * %0 to i8 *
+  call void @llvm.prefetch(i8 * %ptr8, i32 0, i32 1)
+  ret void
+}
+define internal void @__prefetch_read_nt_$1($2 *) alwaysinline {
+  %ptr8 = bitcast $2 * %0 to i8 *
+  call void @llvm.prefetch(i8 * %ptr8, i32 0, i32 0)
+  ret void
+}
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 define(`stdlib_core', `

+declare i32 @__fast_masked_vload()
+
 declare i8* @ISPCMalloc(i64, i32) nounwind
 declare i8* @ISPCFree(i8*) nounwind
 declare void @ISPCLaunch(i8*, i8*) nounwind
@@ -779,6 +1072,25 @@ define internal <$1 x i32> @__sext_varying_bool(<$1 x i32>) nounwind readnone al
  ret <$1 x i32> %0
 }

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; prefetching
+
+prefetch_read(uniform_bool, i1)
+prefetch_read(uniform_int8, i8)
+prefetch_read(uniform_int16, i16)
+prefetch_read(uniform_int32, i32)
+prefetch_read(uniform_int64, i64)
+prefetch_read(uniform_float, float)
+prefetch_read(uniform_double, double)
+
+prefetch_read(varying_bool, <$1 x i32>)
+prefetch_read(varying_int8, <$1 x i8>)
+prefetch_read(varying_int16, <$1 x i16>)
+prefetch_read(varying_int32, <$1 x i32>)
+prefetch_read(varying_int64, <$1 x i64>)
+prefetch_read(varying_float, <$1 x float>)
+prefetch_read(varying_double, <$1 x double>)
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; stdlib transcendentals
 ;;
@@ -911,25 +1223,25 @@ define internal void @__memory_barrier() nounwind readnone alwaysinline {
  ret void
 }

-global_atomic($1, add, i32, int32)
-global_atomic($1, sub, i32, int32)
-global_atomic($1, and, i32, int32)
-global_atomic($1, or, i32, int32)
-global_atomic($1, xor, i32, int32)
-global_atomic($1, min, i32, int32)
-global_atomic($1, max, i32, int32)
-global_atomic($1, umin, i32, uint32)
-global_atomic($1, umax, i32, uint32)
+global_atomic_associative($1, add, i32, int32, 0)
+global_atomic_associative($1, sub, i32, int32, 0)
+global_atomic_associative($1, and, i32, int32, -1)
+global_atomic_associative($1, or, i32, int32, 0)
+global_atomic_associative($1, xor, i32, int32, 0)
+global_atomic_uniform($1, min, i32, int32)
+global_atomic_uniform($1, max, i32, int32)
+global_atomic_uniform($1, umin, i32, uint32)
+global_atomic_uniform($1, umax, i32, uint32)

-global_atomic($1, add, i64, int64)
-global_atomic($1, sub, i64, int64)
-global_atomic($1, and, i64, int64)
-global_atomic($1, or, i64, int64)
-global_atomic($1, xor, i64, int64)
-global_atomic($1, min, i64, int64)
-global_atomic($1, max, i64, int64)
-global_atomic($1, umin, i64, uint64)
-global_atomic($1, umax, i64, uint64)
+global_atomic_associative($1, add, i64, int64, 0)
+global_atomic_associative($1, sub, i64, int64, 0)
+global_atomic_associative($1, and, i64, int64, -1)
+global_atomic_associative($1, or, i64, int64, 0)
+global_atomic_associative($1, xor, i64, int64, 0)
+global_atomic_uniform($1, min, i64, int64)
+global_atomic_uniform($1, max, i64, int64)
+global_atomic_uniform($1, umin, i64, uint64)
+global_atomic_uniform($1, umax, i64, uint64)

 global_swap($1, i32, int32)
 global_swap($1, i64, int64)
@@ -1034,12 +1346,6 @@ i64minmax($1,max,uint64,ugt)

 define(`load_and_broadcast', `
 define <$1 x $2> @__load_and_broadcast_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
-  ; must not load if the mask is all off; the address may be invalid
-  %mm = call i32 @__movmsk(<$1 x i32> %mask)
-  %any_on = icmp ne i32 %mm, 0
-  br i1 %any_on, label %load, label %skip
-
-load:
  %ptr = bitcast i8 * %0 to $2 *
  %val = load $2 * %ptr

@@ -1047,9 +1353,6 @@ load:
  forloop(i, 1, eval($1-1), `
  %ret`'i = insertelement <$1 x $2> %ret`'eval(i-1), $2 %val, i32 i')
  ret <$1 x $2> %ret`'eval($1-1)
-
-skip:
-  ret <$1 x $2> undef
 }
 ')

@@ -1065,14 +1368,20 @@ define(`load_masked', `
 define <$1 x $2> @__load_masked_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
 entry:
  %mm = call i32 @__movmsk(<$1 x i32> %mask)
+  
  ; if the first lane and the last lane are on, then it is safe to do a vector load
  ; of the whole thing--what the lanes in the middle want turns out to not matter...
  %mm_and = and i32 %mm, eval(1 | (1<<($1-1)))
  %can_vload = icmp eq i32 %mm_and, eval(1 | (1<<($1-1)))
+
+  %fast32 = call i32 @__fast_masked_vload()
+  %fast_i1 = trunc i32 %fast32 to i1
+  %can_vload_maybe_fast = or i1 %fast_i1, %can_vload
+
  ; if we are not able to do a singe vload, we will accumulate lanes in this memory..
  %retptr = alloca <$1 x $2>
  %retptr32 = bitcast <$1 x $2> * %retptr to $2 *
-  br i1 %can_vload, label %load, label %loop
+  br i1 %can_vload_maybe_fast, label %load, label %loop

 load: 
  %ptr = bitcast i8 * %0 to <$1 x $2> *
@@ -1207,6 +1516,46 @@ define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>,
 ')


+define(`masked_store_blend_8_16_by_16', `
+define void @__masked_store_blend_8(<16 x i8>* nocapture, <16 x i8>,
+                                    <16 x i32>) nounwind alwaysinline {
+  %old = load <16 x i8> * %0
+  %old128 = bitcast <16 x i8> %old to i128
+  %new128 = bitcast <16 x i8> %1 to i128
+  ; narrow each i32 mask lane to the element width and view the mask as one wide integer
+  %mask8 = trunc <16 x i32> %2 to <16 x i8>
+  %mask128 = bitcast <16 x i8> %mask8 to i128
+  %notmask128 = xor i128 %mask128, -1
+  ; bitwise select: new bits where the mask bits are set, old memory contents elsewhere
+  %newmasked = and i128 %new128, %mask128
+  %oldmasked = and i128 %old128, %notmask128
+  %result = or i128 %newmasked, %oldmasked
+
+  %resultvec = bitcast i128 %result to <16 x i8>
+  store <16 x i8> %resultvec, <16 x i8> * %0
+  ret void
+}
+
+define void @__masked_store_blend_16(<16 x i16>* nocapture, <16 x i16>,
+                                     <16 x i32>) nounwind alwaysinline {
+  %old = load <16 x i16> * %0
+  %old256 = bitcast <16 x i16> %old to i256
+  %new256 = bitcast <16 x i16> %1 to i256
+  ; narrow each i32 mask lane to the element width and view the mask as one wide integer
+  %mask16 = trunc <16 x i32> %2 to <16 x i16>
+  %mask256 = bitcast <16 x i16> %mask16 to i256
+  %notmask256 = xor i256 %mask256, -1
+  ; bitwise select: new bits where the mask bits are set, old memory contents elsewhere
+  %newmasked = and i256 %new256, %mask256
+  %oldmasked = and i256 %old256, %notmask256
+  %result = or i256 %newmasked, %oldmasked
+
+  %resultvec = bitcast i256 %result to <16 x i16>
+  store <16 x i16> %resultvec, <16 x i16> * %0
+  ret void
+}
+')
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; packed load and store functions
 ;;
@@ -1234,7 +1583,7 @@ entry:

 known_mask:
  %allon = icmp eq i32 %mask, eval((1 << $1) -1)
-  br i1 %allon, label %all_on, label %not_all_on
+  br i1 %allon, label %all_on, label %unknown_mask

 all_on:
  ;; everyone wants to load, so just load an entire vector width in a single
@@ -1244,14 +1593,6 @@ all_on:
  store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr, align 4
  ret i32 $1

-not_all_on:
-  %alloff = icmp eq i32 %mask, 0
-  br i1 %alloff, label %all_off, label %unknown_mask
-
-all_off:
-  ;; no one wants to load
-  ret i32 0
-
 unknown_mask:
  br label %loop

@@ -1298,20 +1639,13 @@ entry:

 known_mask:
  %allon = icmp eq i32 %mask, eval((1 << $1) -1)
-  br i1 %allon, label %all_on, label %not_all_on
+  br i1 %allon, label %all_on, label %unknown_mask

 all_on:
  %vecptr = bitcast i32 *%startptr to <$1 x i32> *
  store <$1 x i32> %vals, <$1 x i32> * %vecptr, align 4
  ret i32 $1

-not_all_on:
-  %alloff = icmp eq i32 %mask, 0
-  br i1 %alloff, label %all_off, label %unknown_mask
-
-all_off:
-  ret i32 0
-
 unknown_mask:
  br label %loop

@@ -1346,6 +1680,150 @@ done:
 }
 ')

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; reduce_equal
+
+; count trailing zeros (used below to find the first active lane in the mask)
+declare i32 @llvm.cttz.i32(i32)
+; parameters: $1 vector width, $2 element type, $3 function name suffix, $4 same-size integer type, $5 compare instruction, $6 element bit width
+define(`reduce_equal_aux', `
+define internal i1 @__reduce_equal_$3(<$1 x $2> %v, $2 * %samevalue,
+                                      <$1 x i32> %mask) nounwind alwaysinline {
+entry:
+   %mm = call i32 @__movmsk(<$1 x i32> %mask)
+   %allon = icmp eq i32 %mm, eval((1<<$1)-1)
+   br i1 %allon, label %check_neighbors, label %domixed
+
+domixed:
+  ; First, figure out which lane is the first active one
+  %first = call i32 @llvm.cttz.i32(i32 %mm)
+  %baseval = extractelement <$1 x $2> %v, i32 %first
+  %basev1 = bitcast $2 %baseval to <1 x $2>
+  ; get a vector that is that value smeared across all elements
+  %basesmear = shufflevector <1 x $2> %basev1, <1 x $2> undef,
+        <$1 x i32> < forloop(i, 0, eval($1-2), `i32 0, ') i32 0 >
+
+  ; now do a blend of that vector with the original vector, such that the
+  ; result will be the original value for the active lanes, and the value
+  ; from the first active lane for the inactive lanes.  Given that, we can
+  ; just unconditionally check if the lanes are all equal in check_neighbors
+  ; below without worrying about inactive lanes...
+  %ptr = alloca <$1 x $2>
+  store <$1 x $2> %basesmear, <$1 x $2> * %ptr
+  %castptr = bitcast <$1 x $2> * %ptr to <$1 x $4> *
+  %castv = bitcast <$1 x $2> %v to <$1 x $4>
+  call void @__masked_store_blend_$6(<$1 x $4> * %castptr, <$1 x $4> %castv, <$1 x i32> %mask)
+  %blendvec = load <$1 x $2> * %ptr
+  br label %check_neighbors
+
+check_neighbors:
+  %vec = phi <$1 x $2> [ %blendvec, %domixed ], [ %v, %entry ]
+  ifelse($6, `32', `
+  ; For 32-bit elements, we rotate once and compare with the vector, which ends 
+  ; up comparing each element to its neighbor on the right.  Then see if
+  ; all of those values are true; if so, then all of the elements are equal..
+  %castvec = bitcast <$1 x $2> %vec to <$1 x $4>
+  %castvr = call <$1 x $4> @__rotate_int$6(<$1 x $4> %castvec, i32 1)
+  %vr = bitcast <$1 x $4> %castvr to <$1 x $2>
+  %eq = $5 eq <$1 x $2> %vec, %vr
+  %eq32 = sext <$1 x i1> %eq to <$1 x i32>
+  %eqmm = call i32 @__movmsk(<$1 x i32> %eq32)
+  %alleq = icmp eq i32 %eqmm, eval((1<<$1)-1)
+  br i1 %alleq, label %all_equal, label %not_all_equal
+  ', `
+  ; But for 64-bit elements, it turns out to be more efficient to just
+  ; scalarize and do individual pairwise comparisons and AND those
+  ; all together..
+  forloop(i, 0, eval($1-1), `
+  %v`'i = extractelement <$1 x $2> %vec, i32 i')
+
+  forloop(i, 0, eval($1-2), `
+  %eq`'i = $5 eq $2 %v`'i, %v`'eval(i+1)')
+
+  %and0 = and i1 %eq0, %eq1
+  forloop(i, 1, eval($1-3), `
+  %and`'i = and i1 %and`'eval(i-1), %eq`'eval(i+1)')
+
+  br i1 %and`'eval($1-3), label %all_equal, label %not_all_equal
+  ')
+
+all_equal:
+  %the_value = extractelement <$1 x $2> %vec, i32 0
+  store $2 %the_value, $2 * %samevalue
+  ret i1 true
+
+not_all_equal:
+  ret i1 false
+}
+')
+; instantiate the helper above for i32, float, i64 and double; $1 is the vector width
+define(`reduce_equal', `
+reduce_equal_aux($1, i32, int32, i32, icmp, 32)
+reduce_equal_aux($1, float, float, i32, fcmp, 32)
+reduce_equal_aux($1, i64, int64, i64, icmp, 64)
+reduce_equal_aux($1, double, double, i64, fcmp, 64)
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; prefix sum stuff
+
+; $1: vector width (e.g. 4)
+; $2: vector element type (e.g. float)
+; $3: bit width of vector element type (e.g. 32); picks the masked store blend variant and integer type
+; $4: operator to apply (e.g. fadd)
+; $5: identity element value for the operator (e.g. 0); also the value returned for lane 0
+; $6: suffix for function (e.g. add_float)
+
+define(`exclusive_scan', `
+define internal <$1 x $2> @__exclusive_scan_$6(<$1 x $2> %v,
+                                  <$1 x i32> %mask) nounwind alwaysinline {
+  ; first, set the value of any off lanes to the identity value
+  %ptr = alloca <$1 x $2>
+  %idvec1 = bitcast $2 $5 to <1 x $2>
+  %idvec = shufflevector <1 x $2> %idvec1, <1 x $2> undef,
+      <$1 x i32> < forloop(i, 0, eval($1-2), `i32 0, ') i32 0 >
+  store <$1 x $2> %idvec, <$1 x $2> * %ptr
+  %ptr`'$3 = bitcast <$1 x $2> * %ptr to <$1 x i`'$3> *
+  %vi = bitcast <$1 x $2> %v to <$1 x i`'$3>
+  call void @__masked_store_blend_$3(<$1 x i`'$3> * %ptr`'$3, <$1 x i`'$3> %vi,
+                                     <$1 x i32> %mask)
+  %v_id = load <$1 x $2> * %ptr
+
+  ; extract elements of the vector to use in computing the scan
+  forloop(i, 0, eval($1-1), `
+  %v`'i = extractelement <$1 x $2> %v_id, i32 i')
+
+  ; and just compute the scan directly.
+  ; 0th element is the identity (so nothing to do here),
+  ; 1st element is identity (op) the 0th element of the original vector,
+  ; each successive element is the previous element (op) the previous element
+  ;  of the original vector
+  %s1 = $4 $2 $5, %v0
+  forloop(i, 2, eval($1-1), `
+  %s`'i = $4 $2 %s`'eval(i-1), %v`'eval(i-1)')
+
+  ; and fill in the result vector
+  %r0 = insertelement <$1 x $2> undef, $2 $5, i32 0  ; 0th element gets identity
+  forloop(i, 1, eval($1-1), `
+  %r`'i = insertelement <$1 x $2> %r`'eval(i-1), $2 %s`'i, i32 i')
+
+  ret <$1 x $2> %r`'eval($1-1)
+}
+')
+; instantiations for vector width $1: add over i32, float, i64 and double; bitwise and / or over i32 and i64 only
+define(`scans', `
+exclusive_scan($1, i32, 32, add, 0, add_i32)
+exclusive_scan($1, float, 32, fadd, zeroinitializer, add_float)
+exclusive_scan($1, i64, 64, add, 0, add_i64)
+exclusive_scan($1, double, 64, fadd, zeroinitializer, add_double)
+
+exclusive_scan($1, i32, 32, and, -1, and_i32)
+exclusive_scan($1, i64, 64, and, -1, and_i64)
+
+exclusive_scan($1, i32, 32, or, 0, or_i32)
+exclusive_scan($1, i64, 64, or, 0, or_i64)
+')
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; per_lane
 ;;
@@ -1371,7 +1849,7 @@ pl_known_mask:
  ;; the mask is known at compile time; see if it is something we can
  ;; handle more efficiently
  %pl_is_allon = icmp eq i32 %pl_mask, eval((1<<$1)-1)
-  br i1 %pl_is_allon, label %pl_all_on, label %pl_not_all_on
+  br i1 %pl_is_allon, label %pl_all_on, label %pl_unknown_mask

 pl_all_on:
  ;; the mask is all on--just expand the code for each lane sequentially
@@ -1379,19 +1857,14 @@ pl_all_on:
          `patsubst(`$3', `ID\|LANE', i)')
  br label %pl_done

-pl_not_all_on:
-  ;; not all on--see if it is all off or mixed
-  ;; for the mixed case, we just run the general case, though we could
+pl_unknown_mask:
+  ;; we just run the general case, though we could
  ;; try to be smart and just emit the code based on what it actually is,
  ;; for example by emitting the code straight-line without a loop and doing 
  ;; the lane tests explicitly, leaving later optimization passes to eliminate
  ;; the stuff that is definitely not needed.  Not clear if we will frequently 
  ;; encounter a mask that is known at compile-time but is not either all on or
  ;; all off...
-  %pl_alloff = icmp eq i32 %pl_mask, 0
-  br i1 %pl_alloff, label %pl_done, label %pl_unknown_mask
-
-pl_unknown_mask:
  br label %pl_loop

 pl_loop:
@@ -1447,20 +1920,6 @@ define internal <$1 x $2> @__gather_elt_$2(i8 * %ptr, <$1 x i32> %offsets, <$1 x

 define <$1 x $2> @__gather_base_offsets_$2(i8 * %ptr, <$1 x i32> %offsets,
                                           <$1 x i32> %vecmask) nounwind readonly alwaysinline {
-entry:
-  %mask = call i32 @__movmsk(<$1 x i32> %vecmask)
-
-  %maskKnown = call i1 @__is_compile_time_constant_mask(<$1 x i32> %vecmask)
-  br i1 %maskKnown, label %known_mask, label %unknown_mask
-
-known_mask:
-  %alloff = icmp eq i32 %mask, 0
-  br i1 %alloff, label %gather_all_off, label %unknown_mask
-
-gather_all_off:
-  ret <$1 x $2> undef
-
-unknown_mask:
  ; We can be clever and avoid the per-lane stuff for gathers if we are willing
  ; to require that the 0th element of the array being gathered from is always
  ; legal to read from (and we do indeed require that, given the benefits!) 
--- a/contrib/ispc.vim
+++ b/contrib/ispc.vim
@@ -0,0 +1,32 @@
+" Vim syntax file
+" Language:	ISPC
+" Maintainer:	Andreas Wendleder <andreas.wendleder@gmail.com>
+" Last Change:	2011 Aug 3
+
+" Quit when a syntax file was already loaded for this buffer
+if exists("b:current_syntax")
+  finish
+endif
+
+" Read the C syntax to start with, then unset b:current_syntax so that it can be set to "ispc" at the end of this file
+runtime! syntax/c.vim
+unlet b:current_syntax
+
+" New keywords, added on top of the C syntax loaded above
+syn keyword	ispcStatement	cbreak ccontinue creturn launch print reference soa sync task
+syn keyword	ispcConditional	cif
+syn keyword	ispcRepeat	cdo cfor cwhile
+syn keyword	ispcBuiltin	programCount programIndex	
+syn keyword	ispcType	export int8 int16 int32 int64
+
+" Default highlighting: HiLink is a temporary helper command that is deleted again once the links are defined
+command -nargs=+ HiLink hi def link <args>
+HiLink ispcStatement	Statement
+HiLink ispcConditional	Conditional
+HiLink ispcRepeat	Repeat
+HiLink ispcBuiltin	Statement
+HiLink ispcType		Type
+delcommand HiLink
+
+let b:current_syntax = "ispc"
+
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -153,7 +153,6 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
        StoreInst(llvm::Constant::getNullValue(ftype), returnValuePtr);
    }

-#ifndef LLVM_2_8
    if (m->diBuilder) {
        /* If debugging is enabled, tell the debug information emission
           code about this new function */
@@ -174,7 +173,6 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
        /* And start a scope representing the initial function scope */
        StartScope();
    }
-#endif // LLVM_2_8

    launchedTasks = false;

@@ -183,7 +181,6 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
    assert(maskSymbol != NULL);
    maskSymbol->storagePtr = maskPtr;

-#ifndef LLVM_2_8
    // add debugging info for __mask, programIndex, ...
    if (m->diBuilder) {
        maskSymbol->pos = funcStartPos;
@@ -208,15 +205,12 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
                                           true /* static */,
                                           programCountSymbol->storagePtr);
    }
-#endif
 }


 FunctionEmitContext::~FunctionEmitContext() {
    assert(controlFlowInfo.size() == 0);
-#ifndef LLVM_2_8
    assert(debugScopes.size() == (m->diBuilder ? 1 : 0));
-#endif
 }


@@ -704,6 +698,7 @@ FunctionEmitContext::LaneMask(llvm::Value *v) {

 llvm::Value *
 FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
+#if 0
    // Compare the two masks to get a vector of i1s
    llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
                               v1, v2, "v1==v2");
@@ -711,6 +706,12 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
    cmp = I1VecToBoolVec(cmp);
    // And see if it's all on
    return All(cmp);
+#else
+    llvm::Value *mm1 = LaneMask(v1);
+    llvm::Value *mm2 = LaneMask(v2);
+    return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mm1, mm2,
+                   "v1==v2");
+#endif
 }


@@ -850,7 +851,6 @@ FunctionEmitContext::GetDebugPos() const {
 void
 FunctionEmitContext::AddDebugPos(llvm::Value *value, const SourcePos *pos, 
                                 llvm::DIScope *scope) {
-#ifndef LLVM_2_8
    llvm::Instruction *inst = llvm::dyn_cast<llvm::Instruction>(value);
    if (inst != NULL && m->diBuilder) {
        SourcePos p = pos ? *pos : currentPos;
@@ -861,13 +861,11 @@ FunctionEmitContext::AddDebugPos(llvm::Value *value, const SourcePos *pos,
            inst->setDebugLoc(llvm::DebugLoc::get(p.first_line, p.first_column, 
                                                  scope ? *scope : GetDIScope()));
    }
-#endif
 }


 void
 FunctionEmitContext::StartScope() {
-#ifndef LLVM_2_8
    if (m->diBuilder != NULL) {
        llvm::DIScope parentScope;
        if (debugScopes.size() > 0)
@@ -881,18 +879,15 @@ FunctionEmitContext::StartScope() {
                                             currentPos.first_column);
        debugScopes.push_back(lexicalBlock);
    }
-#endif
 }


 void
 FunctionEmitContext::EndScope() {
-#ifndef LLVM_2_8
    if (m->diBuilder != NULL) {
        assert(debugScopes.size() > 0);
        debugScopes.pop_back();
    }
-#endif
 }


@@ -905,7 +900,6 @@ FunctionEmitContext::GetDIScope() const {

 void
 FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) {
-#ifndef LLVM_2_8
    if (m->diBuilder == NULL)
        return;

@@ -921,13 +915,11 @@ FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) {
    llvm::Instruction *declareInst = 
        m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
    AddDebugPos(declareInst, &sym->pos, &scope);
-#endif
 }


 void
 FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) {
-#ifndef LLVM_2_8
    if (m->diBuilder == NULL)
        return;

@@ -943,7 +935,6 @@ FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) {
    llvm::Instruction *declareInst = 
        m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
    AddDebugPos(declareInst, &sym->pos, &scope);
-#endif
 }


@@ -1501,27 +1492,15 @@ FunctionEmitContext::gather(llvm::Value *lvalue, const Type *type,
 void
 FunctionEmitContext::addGSMetadata(llvm::Instruction *inst, SourcePos pos) {
    llvm::Value *str = llvm::MDString::get(*g->ctx, pos.name);
-#ifdef LLVM_2_8
-    llvm::MDNode *md = llvm::MDNode::get(*g->ctx, &str, 1);
-#else
    llvm::MDNode *md = llvm::MDNode::get(*g->ctx, str);
-#endif
    inst->setMetadata("filename", md);

    llvm::Value *line = LLVMInt32(pos.first_line);
-#ifdef LLVM_2_8
-    md = llvm::MDNode::get(*g->ctx, &line, 1);
-#else
    md = llvm::MDNode::get(*g->ctx, line);
-#endif
    inst->setMetadata("line", md);

    llvm::Value *column = LLVMInt32(pos.first_column);
-#ifdef LLVM_2_8
-    md = llvm::MDNode::get(*g->ctx, &column, 1);
-#else
    md = llvm::MDNode::get(*g->ctx, column);
-#endif
    inst->setMetadata("column", md);
 }

@@ -1838,9 +1817,9 @@ llvm::PHINode *
 FunctionEmitContext::PhiNode(LLVM_TYPE_CONST llvm::Type *type, int count, 
                             const char *name) {
    llvm::PHINode *pn = llvm::PHINode::Create(type, 
-#if !defined(LLVM_2_8) && !defined(LLVM_2_9)
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
                                              count, 
-#endif // !LLVM_2_8 && !LLVM_2_9
+#endif // LLVM_3_0
                                              name ? name : "phi", bblock);
    AddDebugPos(pn);
    return pn;
@@ -1982,17 +1961,26 @@ FunctionEmitContext::LaunchInst(llvm::Function *callee,
    assert(argStructType->getNumElements() == argVals.size() + 1);

    int align = 4 * RoundUpPow2(g->target.nativeVectorWidth);
+    llvm::Value *argmem;
 #ifdef ISPC_IS_WINDOWS
    // Use malloc() to allocate storage on Windows, since the stack is
    // generally not big enough there to do enough allocations for lots of
    // tasks and then things crash horribly...
-    llvm::Value *argmem = EmitMalloc(argStructType, align);
+    argmem = EmitMalloc(argStructType, align);
 #else
-    // Use alloca for space for the task args on OSX And Linux.  KEY
-    // DETAIL: pass false to the call of FunctionEmitContext::AllocaInst so
-    // that the alloca doesn't happen just once at the top of the function,
-    // but happens each time the enclosing basic block executes.
-    llvm::Value *argmem = AllocaInst(argStructType, "argmem", align, false);
+    // Otherwise, use alloca for space for the task args, ** unless we're 
+    // compiling to AVX, in which case we use malloc after all **. (See
+    // http://llvm.org/bugs/show_bug.cgi?id=10841 for details.  There are
+    // limitations in LLVM with respect to dynamic allocas of this sort
+    // when the stack also has to be 32-byte aligned...).
+    if (g->target.isa == Target::AVX)
+        argmem = EmitMalloc(argStructType, align);
+    else
+        // KEY DETAIL: pass false to the call of
+        // FunctionEmitContext::AllocaInst so that the alloca doesn't
+        // happen just once at the top of the function, but happens each
+        // time the enclosing basic block executes.
+        argmem = AllocaInst(argStructType, "argmem", align, false);
 #endif // ISPC_IS_WINDOWS
    llvm::Value *voidmem = BitCastInst(argmem, LLVMTypes::VoidPointerType);

--- a/docs/ReleaseNotes.txt
+++ b/docs/ReleaseNotes.txt
@@ -1,3 +1,84 @@
+=== v1.0.8 === (19 September 2011)
+
+A number of improvements have been made to handling of 'if' statements in
+the language:
+  - A bug was fixed where invalid memory could be incorrectly accessed even
+    if none of the running program instances wanted to execute the
+    corresponding instructions (https://github.com/ispc/ispc/issues/74).
+  - The code generated for 'if' statements is a bit simpler and thus more
+    efficient.
+
+There is now a '--pic' command-line argument that causes position-independent
+code to be generated (Linux and OSX only).
+
+A number of additional performance improvements:
+  - Loops are now unrolled by default; the --opt=disable-loop-unroll
+    command-line argument can be used to disable this behavior.
+    (https://github.com/ispc/ispc/issues/78)
+  - A few more cases where gathers/scatters could be determined at compile
+    time to actually access contiguous locations have been added.
+    (https://github.com/ispc/ispc/issues/79)
+
+Finally, warnings are now issued (if possible) when it can be determined
+at compile-time that an out-of-bounds array index is being used.
+(https://github.com/ispc/ispc/issues/98).
+
+
+=== v1.0.7 === (3 September 2011)
+
+The various atomic_*_global() standard library functions are generally
+substantially more efficient.  They all previously issued one hardware
+atomic instruction for each running program instance but now locally
+compute a reduction over the operands and issue a single hardware atomic,
+giving the same effect and results in the end (issue #57).
+
+CPU/ISA target handling has been substantially improved.  If no CPU is
+specified, the host CPU type is used, not just a default of "nehalem".  A
+number of bugs were fixed that ensure that LLVM doesn't generate SSE>2
+instructions when using the SSE2 target (fixes issue #82).
+
+Shift rights of unsigned integer types use a logical shift right
+instruction now, not an arithmetic shift right (fixed issue #88).
+
+When emitting header files, 'extern' declarations of globals used in ispc
+code are now outside of the ispc namespace.  Fixes issue #64.
+
+The stencil example has been modified to do runs with and without
+parallelism.
+
+Many other small bugfixes and improvements.
+
+=== v1.0.6 === (17 August 2011)
+
+Some additional cross-program instance operations have been added to the
+standard library.  reduce_equal() checks to see if the given value is the
+same across all running program instances, and exclusive_scan_{add,and,or}()
+computes a scan over the given value in the running program instances.
+See the documentation of these new routines for more information:
+http://ispc.github.com/ispc.html#cross-program-instance-operations.
+
+The simple task system implementations used in the examples have been
+improved.  The Windows version no longer has a hard limit on the number of
+tasks that can be launched, and all versions have less dynamic memory
+allocation and less locking.  More of the examples now have paths that also
+measure performance using tasks along with SPMD vectorization.
+
+Two new examples have been added: one that shows the implementation of a
+ray-marching volume rendering algorithm, and one that shows a 3D stencil
+computation, as might be done for PDE solutions.
+
+Standard library routines to issue prefetches have been added.  See the
+documentation for more details: http://ispc.github.com/ispc.html#prefetches.
+
+Fast versions of the float to half-precision float conversion routines have
+been added.  For more details, see:
+http://ispc.github.com/ispc.html#conversions-to-and-from-half-precision-floats.
+
+There is the usual set of small bug fixes.  Notably, a number of details
+related to handling 32 versus 64 bit targets have been fixed, which in turn
+has fixed a bug related to tasks having incorrect values for pointers
+passed to them.
+
 === v1.0.5 === (1 August 2011)

 Multi-element vector swizzles are supported; for example, given a 3-wide
--- a/docs/ispc.txt
+++ b/docs/ispc.txt
@@ -33,6 +33,17 @@ The main goals behind ``ispc`` are to:
 number of non-trivial workloads that aren't handled well by other
 compilation approaches (e.g. loop auto-vectorization.)

+**We are very interested in your feedback and comments about ispc and
+in hearing your experiences using the system.  We are especially interested
+in hearing if you try using ispc but see results that are not as you
+were expecting or hoping for.** We encourage you to send a note with your
+experiences or comments to the `ispc-users`_ mailing list or to file bug or
+feature requests with the ``ispc`` `bug tracker`_. (Thanks!)
+
+.. _ispc-users: http://groups.google.com/group/ispc-users
+.. _bug tracker: https://github.com/ispc/ispc/issues?state=open
+
+
 Contents:

 * `Recent Changes to ISPC`_
@@ -79,6 +90,7 @@ Contents:
  + `Packed Load and Store Operations`_
  + `Conversions To and From Half-Precision Floats`_
  + `Atomic Operations and Memory Fences`_
+  + `Prefetches`_
  + `Low-Level Bits`_

 * `Interoperability with the Application`_
@@ -100,6 +112,9 @@ Contents:
  + `"Inline" Aggressively`_
  + `Small Performance Tricks`_
  + `Instrumenting Your ISPC Programs`_
+  + `Using Scan Operations For Variable Output`_
+  + `Application-Supplied Execution Masks`_
+  + `Explicit Vector Programming With Uniform Short Vector Types`_

 * `Disclaimer and Legal Information`_

@@ -1172,7 +1187,7 @@ This code implicitly assumes that ``programCount`` evenly divides
 ::

    for (uniform int i = 0; i < count; i += programCount) {
-        if (i + programIndex < programCount) {
+        if (i + programIndex < count) {
            float d = data[i + programIndex];
            ...

@@ -1822,6 +1837,71 @@ given value across all of the currently-executing vector lanes.
    uniform int reduce_max(int a, int b)
    uniform unsigned int reduce_max(unsigned int a, unsigned int b)

+Finally, you can check to see if a particular value has the same value in
+all of the currently-running program instances:
+
+::
+
+    uniform bool reduce_equal(int32 v)
+    uniform bool reduce_equal(unsigned int32 v)
+    uniform bool reduce_equal(float v)
+    uniform bool reduce_equal(int64 v)
+    uniform bool reduce_equal(unsigned int64 v)
+    uniform bool reduce_equal(double v)
+
+There are also variants of these functions that return the value as a
+``uniform`` in the case where the values are all the same.
+
+::
+
+    uniform bool reduce_equal(int32 v, reference uniform int32 sameval)
+    uniform bool reduce_equal(unsigned int32 v,
+                              reference uniform unsigned int32 sameval)
+    uniform bool reduce_equal(float v, reference uniform float sameval)
+    uniform bool reduce_equal(int64 v, reference uniform int64 sameval)
+    uniform bool reduce_equal(unsigned int64 v,
+                              reference uniform unsigned int64 sameval)
+    uniform bool reduce_equal(double v, reference uniform double sameval)
+
+If called when none of the program instances are running,
+``reduce_equal()`` will return ``false``.
+
+There are also a number of functions to compute "scan"s of values across
+the program instances.  For example, the ``exclusive_scan_add()`` function
+computes, for each program instance, the sum of the given value over all of
+the preceding program instances.  (The scans currently available in
+``ispc`` are all so-called "exclusive" scans, meaning that the value
+computed for a given element does not include the value provided for that
+element.)  In C code, an exclusive add scan over an array might be
+implemented as:
+
+::
+
+    void scan_add(int *in_array, int *result_array, int count) {
+        result_array[0] = 0;
+        for (int i = 1; i < count; ++i)
+            result_array[i] = result_array[i-1] + in_array[i-1];
+    }
+
+``ispc`` provides the following scan functions--addition, bitwise-and, and
+bitwise-or are available:
+
+::
+
+    int32 exclusive_scan_add(int32 v) 
+    unsigned int32 exclusive_scan_add(unsigned int32 v) 
+    float exclusive_scan_add(float v) 
+    int64 exclusive_scan_add(int64 v) 
+    unsigned int64 exclusive_scan_add(unsigned int64 v) 
+    double exclusive_scan_add(double v) 
+    int32 exclusive_scan_and(int32 v) 
+    unsigned int32 exclusive_scan_and(unsigned int32 v) 
+    int64 exclusive_scan_and(int64 v) 
+    unsigned int64 exclusive_scan_and(unsigned int64 v) 
+    int32 exclusive_scan_or(int32 v) 
+    unsigned int32 exclusive_scan_or(unsigned int32 v) 
+    int64 exclusive_scan_or(int64 v) 
+    unsigned int64 exclusive_scan_or(unsigned int64 v) 


 Packed Load and Store Operations
@@ -1921,6 +2001,18 @@ function returns the 16 bits that are the closest match to the given
    int16 float_to_half(float f)
    uniform int16 float_to_half(uniform float f)

+There are also faster versions of these functions that don't worry about
+handling floating point infinity, "not a number" and denormalized numbers
+correctly.  These are faster than the above functions, but are less
+precise.
+
+::
+
+    float half_to_float_fast(unsigned int16 h)
+    uniform float half_to_float_fast(uniform unsigned int16 h)
+    int16 float_to_half_fast(float f)
+    uniform int16 float_to_half_fast(uniform float f)
+

 Atomic Operations and Memory Fences
 -----------------------------------
@@ -1990,6 +2082,39 @@ code.
    void memory_barrier();


+Prefetches
+----------
+
+The standard library has a variety of functions to prefetch data into the
+processor's cache.  While modern CPUs have automatic prefetchers that do a
+reasonable job of prefetching data to the cache before it's needed, high
+performance applications may find it helpful to explicitly prefetch data
+ahead of its use.
+
+For example, this code shows how to prefetch data to the processor's L1
+cache while iterating over the items in an array.  
+
+::
+
+   uniform int32 array[...];
+   for (uniform int i = 0; i < count; ++i) {
+       // do computation with array[i]
+       prefetch_l1(array[i+32]);
+   }
+
+The standard library has routines to prefetch to the L1, L2, and L3
+caches.  It also has a variant, ``prefetch_nt()``, that indicates that the
+value being prefetched isn't expected to be used more than once (so should
+be high priority to be evicted from the cache).
+
+::
+
+    void prefetch_{l1,l2,l3,nt}(reference TYPE)
+
+These functions are available for all of the basic types in the
+language--``int8``, ``int16``, ``int32``, ``float``, and so forth.
+
+
 Low-Level Bits
 --------------

@@ -2097,14 +2222,14 @@ Both the ``foo`` and ``bar`` global variables can be accessed on each
 side.

 ``ispc`` code can also call back to C/C++.  On the ``ispc`` side, any
-application functions to be called must be declared with the ``export "C"``
+application functions to be called must be declared with the ``extern "C"``
 qualifier.

 ::

   extern "C" void foo(uniform float f, uniform float g);

-Unlike in C++, ``export "C"`` doesn't take braces to delineate
+Unlike in C++, ``extern "C"`` doesn't take braces to delineate
 multiple functions to be declared; thus, multiple C functions to be called
 from ``ispc`` must be declared as follows:

@@ -2699,6 +2824,123 @@ active upon function entry.
    ao.ispc(0088) - function entry: 36928 calls (0 / 0.00% all off!), 97.40% active lanes
    ...

+
+Using Scan Operations For Variable Output
+-----------------------------------------
+
+One important application of the ``exclusive_scan_add()`` function in the
+standard library is when program instances want to generate a variable amount
+of output and when one would like that output to be densely packed in a
+single array.  For example, consider the code fragment below:
+
+::
+
+    uniform int func(uniform float outArray[], ...) {
+       int numOut = ...;  // figure out how many to be output
+       float outLocal[MAX_OUT]; // staging area
+       // put results in outLocal[0], ..., outLocal[numOut-1]
+       int startOffset = exclusive_scan_add(numOut);
+       for (int i = 0; i < numOut; ++i)
+           outArray[startOffset + i] = outLocal[i];
+       return reduce_add(numOut);
+    }
+
+Here, each program instance has computed a number, ``numOut``, of values to
+output, and has stored them in the ``outLocal`` array.  Assume that four
+program instances are running and that the first one wants to output one
+value, the second two values, and the third and fourth three values each.
+In this case, ``exclusive_scan_add()`` will return the values (0, 1, 3, 6)
+to the four program instances, respectively.  The first program instance
+will write its one result to ``outArray[0]``, the second will write its two
+values to ``outArray[1]`` and ``outArray[2]``, and so forth.  The
+``reduce_add`` call at the end returns the total number of values that the
+program instances have written to the array.
+
+Application-Supplied Execution Masks
+------------------------------------
+
+Recall that when execution transitions from the application code to an
+``ispc`` function, all of the program instances are initially executing.
+In some cases, it may be desired that only some of them are running, based on
+a data-dependent condition computed in the application program.  This
+situation can easily be handled via an additional parameter from the
+application.
+
+As a simple example, consider a case where the application code has an
+array of ``float`` values and we'd like the ``ispc`` code to update
+only specific values in that array, where the application has determined
+which of those values are to be updated.  In C++ code, we might
+have:
+
+::
+
+    int count = ...;
+    float *array = new float[count];
+    bool *shouldUpdate = new bool[count];
+    // initialize array and shouldUpdate
+    ispc_func(array, shouldUpdate, count);
+
+Then, the ``ispc`` code could process this update as:
+
+::
+
+    export void ispc_func(uniform float array[], uniform bool update[],
+                          uniform int count) {
+        for (uniform int i = 0; i < count; i += programCount) {
+            cif (update[i+programIndex] == true)
+                // update array[i+programIndex]...
+        }
+    }
+
+(In this case a "coherent" if statement is likely to be worthwhile if the
+``update`` array will tend to have sections that are either all-true or
+all-false.)
+
+Explicit Vector Programming With Uniform Short Vector Types
+-----------------------------------------------------------
+
+The typical model for programming in ``ispc`` is an *implicit* parallel
+model, where one writes a program that is apparently doing scalar
+computation on values and the program is then vectorized to run in parallel
+across the SIMD lanes of a processor.  However, ``ispc`` also has some
+support for explicit vector unit programming, where the vectorization is
+explicit.  Some computations may be more effectively described in the
+explicit model rather than the implicit model.
+
+This support is provided via ``uniform`` instances of short vectors 
+(as were introduced in the `Short Vector Types`_ section).  Specifically, 
+if this short program
+
+::
+
+    export uniform float<8> madd(uniform float<8> a, 
+                                 uniform float<8> b, uniform float<8> c) {
+        return a + b * c;
+    }
+
+is compiled with the AVX target, ``ispc`` generates the following
+assembly::
+
+    _madd:
+	vmulps	%ymm2, %ymm1, %ymm1
+	vaddps	%ymm0, %ymm1, %ymm0
+	ret
+
+(And similarly, if compiled with a 4-wide SSE target, two ``mulps`` and two
+``addps`` instructions are generated, and so forth.)
+
+Note that ``ispc`` doesn't currently support control-flow based on
+``uniform`` short vector types; it is thus not possible to write code like:
+
+::
+
+    export uniform int<8> count(uniform float<8> a, uniform float<8> b) {
+        uniform int<8> sum = 0;
+        while (a++ < b)
+            ++sum;
+    }
+
+
 Disclaimer and Legal Information
 ================================

--- a/doxygen.cfg
+++ b/doxygen.cfg
@@ -31,7 +31,7 @@ PROJECT_NAME           = "Intel SPMD Program Compiler"
 # This could be handy for archiving the generated documentation or
 # if some version control system is used.

-PROJECT_NUMBER         = 1.0.5
+PROJECT_NUMBER         = 1.0.8

 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
 # base path where the generated documentation will be put.
--- a/examples/README.txt
+++ b/examples/README.txt
@@ -93,3 +93,17 @@ Simple
 This is a simple "hello world" type program that shows a ~10 line
 application program calling out to a ~5 line ispc program to do a simple
 computation.
+
+Volume
+======
+
+Ray-marching volume rendering, with a single-scattering lighting model.  To
+run it, specify a camera parameter file and a volume density file, e.g.:
+
+volume camera.dat density_highres.vol
+
+(See, e.g. Chapters 11 and 16 of "Physically Based Rendering" for
+information about the algorithm implemented here.)  The volume data set
+included here was generated by the example implementation of the "Wavelet
+Turbulence for Fluid Simulation" SIGGRAPH 2008 paper by Kim et
+al. (http://www.cs.cornell.edu/~tedkim/WTURB/)
--- a/examples/aobench/Makefile
+++ b/examples/aobench/Makefile
@@ -1,8 +1,20 @@

-CXX=g++ -m64
-CXXFLAGS=-Iobjs/ -O3 -Wall
+ARCH = $(shell uname)
+
+TASK_CXX=../tasks_pthreads.cpp
+TASK_LIB=-lpthread
+
+ifeq ($(ARCH), Darwin)
+  TASK_CXX=../tasks_gcd.cpp
+  TASK_LIB=
+endif
+
+TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
+
+CXX=g++
+CXXFLAGS=-Iobjs/ -O3 -Wall -m64
 ISPC=ispc
-ISPCFLAGS=-O2 --fast-math --arch=x86-64
+ISPCFLAGS=-O2 --target=sse4 --arch=x86-64

 default: ao

@@ -14,12 +26,15 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ ao

-ao: dirs objs/ao.o objs/ao_serial.o objs/ao_ispc.o
-	$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/ao_serial.o -lm -lpthread
+ao: dirs objs/ao.o objs/ao_serial.o objs/ao_ispc.o $(TASK_OBJ)
+	$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/ao_serial.o $(TASK_OBJ) -lm $(TASK_LIB)

 objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@

+objs/%.o: ../%.cpp
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
 objs/ao.o: objs/ao_ispc.h 

 objs/%_ispc.h objs/%_ispc.o: %.ispc
--- a/examples/aobench/ao.cpp
+++ b/examples/aobench/ao.cpp
@@ -101,6 +101,7 @@ savePPM(const char *fname, int w, int h)
    fprintf(fp, "255\n");
    fwrite(img, w * h * 3, 1, fp);
    fclose(fp);
+    printf("Wrote image file %s\n", fname);
 }


@@ -172,10 +173,30 @@ int main(int argc, char **argv)
    }

    // Report results and save image
-    printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n", minTimeISPC, 
-           width, height);
+    printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n", 
+           minTimeISPC, width, height);
    savePPM("ao-ispc.ppm", width, height); 

+    //
+    // Run the ispc + tasks path, test_iterations times, and report the
+    // minimum time for any of them.
+    //
+    double minTimeISPCTasks = 1e30;
+    for (unsigned int i = 0; i < test_iterations; i++) {
+        memset((void *)fimg, 0, sizeof(float) * width * height * 3);
+        assert(NSUBSAMPLES == 2);
+
+        reset_and_start_timer();
+        ao_ispc_tasks(width, height, NSUBSAMPLES, fimg);
+        double t = get_elapsed_mcycles();
+        minTimeISPCTasks = std::min(minTimeISPCTasks, t);
+    }
+
+    // Report results and save image
+    printf("[aobench ispc + tasks]:\t\t[%.3f] M cycles (%d x %d image)\n", 
+           minTimeISPCTasks, width, height);
+    savePPM("ao-ispc-tasks.ppm", width, height); 
+
    //
    // Run the serial path, again test_iteration times, and report the
    // minimum time.
@@ -192,7 +213,8 @@ int main(int argc, char **argv)
    // Report more results, save another image...
    printf("[aobench serial]:\t\t[%.3f] M cycles (%d x %d image)\n", minTimeSerial, 
           width, height);
-    printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minTimeSerial / minTimeISPC);
+    printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", 
+           minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
    savePPM("ao-serial.ppm", width, height); 
        
    return 0;
--- a/examples/aobench/ao.ispc
+++ b/examples/aobench/ao.ispc
@@ -203,8 +203,9 @@ ambient_occlusion(reference Isect isect, reference Plane plane,
 /* Compute the image for the scanlines from [y0,y1), for an overall image
   of width w and height h.
 */
-void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h, 
-                  uniform int nsubsamples, reference uniform float image[]) {
+static void ao_scanlines(uniform int y0, uniform int y1, uniform int w, 
+                         uniform int h,  uniform int nsubsamples, 
+                         reference uniform float image[]) {
    static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
    static Sphere spheres[3] = {
        { { -2.0f, 0.0f, -3.5f }, 0.5f },
@@ -231,6 +232,9 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
    // direction we do per iteration and ny the number in y.
    uniform int nx = 1, ny = 1;

+    // FIXME: We actually need ny to be 1 regardless of the decomposition,
+    // since the task decomposition is one scanline high.
+
    if (programCount == 8) {
        // Do two pixels at once in the x direction
        nx = 2;
@@ -239,19 +243,21 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
            ++du;
    }
    else if (programCount == 16) {
-        // Two at once in both x and y
-        nx = ny = 2;
-        if ((programIndex >= 4 && programIndex < 8) || programIndex >= 12)
+        nx = 4;
+        ny = 1;
+        if (programIndex >= 4 && programIndex < 8)
            ++du;
-        if (programIndex >= 8)  
-            ++dv;
+        if (programIndex >= 8 && programIndex < 12)
+            du += 2;
+        if (programIndex >= 12)
+            du += 3;
    }

    // Now loop over all of the pixels, stepping in x and y as calculated
    // above.  (Assumes that ny divides y and nx divides x...)
    for (uniform int y = y0; y < y1; y += ny) {
        for (uniform int x = 0; x < w; x += nx)  {
-            // Figur out x,y pixel in NDC
+            // Figure out x,y pixel in NDC
            float px =  (x + du - (w / 2.0f)) / (w / 2.0f);
            float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
            float ret = 0.f;
@@ -293,7 +299,7 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,

            // offset to the first pixel in the image
            uniform int offset = 3 * (y * w + x);
-            for (uniform int p = 0; p < programCount; p += 4, ++offset) {
+            for (uniform int p = 0; p < programCount; p += 4, offset += 3) {
                // Get the four sample values for this pixel
                uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
                    retArray[p+3];
@@ -315,3 +321,18 @@ export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
                    uniform float image[]) {
    ao_scanlines(0, h, w, h, nsubsamples, image);
 }
+
+
+static void task ao_task(uniform int y0, uniform int y1, uniform int width,
+                         uniform int height, uniform int nsubsamples, 
+                         uniform float image[]) {
+    ao_scanlines(y0, y1, width, height, nsubsamples, image);  // task entry point: forwards scanlines [y0, y1) to ao_scanlines()
+}
+
+
+export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, 
+                          uniform float image[]) {
+    uniform int dy = 1;  // number of scanlines handed to each task
+    for (uniform int y = 0; y < h; y += dy)
+        launch < ao_task(y, y+dy, w, h, nsubsamples, image) >;  // one ao_task per band of scanlines [y, y+dy)
+}
--- a/examples/aobench/ao_serial.cpp
+++ b/examples/aobench/ao_serial.cpp
@@ -140,7 +140,7 @@ ray_plane_intersect(Isect &isect, Ray &ray,
    float d = -dot(plane.p, plane.n);
    float v = dot(ray.dir, plane.n);

-    if (fabsf(v) < 1.0e-17) 
+    if (fabsf(v) < 1.0e-17f) 
        return;
    else {
        float t = -(dot(ray.org, plane.n) + d) / v;
@@ -183,11 +183,11 @@ orthoBasis(vec basis[3], const vec &n) {
    basis[2] = n;
    basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;

-    if ((n.x < 0.6) && (n.x > -0.6)) {
+    if ((n.x < 0.6f) && (n.x > -0.6f)) {
        basis[1].x = 1.0;
-    } else if ((n.y < 0.6) && (n.y > -0.6)) {
+    } else if ((n.y < 0.6f) && (n.y > -0.6f)) {
        basis[1].y = 1.0;
-    } else if ((n.z < 0.6) && (n.z > -0.6)) {
+    } else if ((n.z < 0.6f) && (n.z > -0.6f)) {
        basis[1].z = 1.0;
    } else {
        basis[1].x = 1.0;
@@ -224,7 +224,7 @@ ambient_occlusion(Isect &isect, Plane &plane,
            float phi   = 2.0f * M_PI * drand48();
            float x = cosf(phi) * theta;
            float y = sinf(phi) * theta;
-            float z = sqrtf(1.0 - theta * theta);
+            float z = sqrtf(1.0f - theta * theta);

            // local . global
            float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
@@ -236,14 +236,14 @@ ambient_occlusion(Isect &isect, Plane &plane,
            ray.dir.y = ry;
            ray.dir.z = rz;

-            occIsect.t   = 1.0e+17;
+            occIsect.t   = 1.0e+17f;
            occIsect.hit = 0;

            for (int snum = 0; snum < 3; ++snum)
                ray_sphere_intersect(occIsect, ray, spheres[snum]); 
            ray_plane_intersect (occIsect, ray, plane); 

-            if (occIsect.hit) occlusion += 1.0;
+            if (occIsect.hit) occlusion += 1.f;
        }
    }

@@ -280,10 +280,10 @@ static void ao_scanlines(int y0, int y1, int w, int h, int nsubsamples,

                    ray.dir.x = px;
                    ray.dir.y = py;
-                    ray.dir.z = -1.0;
+                    ray.dir.z = -1.0f;
                    vnormalize(ray.dir);

-                    isect.t   = 1.0e+17;
+                    isect.t   = 1.0e+17f;
                    isect.hit = 0;

                    for (int snum = 0; snum < 3; ++snum)
--- a/examples/aobench/aobench.vcxproj
+++ b/examples/aobench/aobench.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -21,6 +21,7 @@
  <ItemGroup>
    <ClCompile Include="ao.cpp" />
    <ClCompile Include="ao_serial.cpp" />
+    <ClCompile Include="../tasks_concrt.cpp" />
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="ao.ispc">
--- a/examples/aobench_instrumented/Makefile
+++ b/examples/aobench_instrumented/Makefile
@@ -2,7 +2,7 @@
 CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -g3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --fast-math --instrument --arch=x86-64
+ISPCFLAGS=-O2 --instrument --arch=x86-64

 default: ao

--- a/examples/aobench_instrumented/ao.cpp
+++ b/examples/aobench_instrumented/ao.cpp
@@ -100,6 +100,7 @@ savePPM(const char *fname, int w, int h)
    fprintf(fp, "255\n");
    fwrite(img, w * h * 3, 1, fp);
    fclose(fp);
+    printf("Wrote image file %s\n", fname);
 }


--- a/examples/examples.sln
+++ b/examples/examples.sln
@@ -15,8 +15,11 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mandelbrot_tasks", "mandelb
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "aobench_instrumented", "aobench_instrumented\aobench_instrumented.vcxproj", "{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}"
 EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "noise", "noise\noise.vcxproj", "{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}"
-EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "noise", "noise\noise.vcxproj", "{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "volume", "volume_rendering\volume.vcxproj", "{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "stencil", "stencil\stencil.vcxproj", "{2EF070A1-F62F-4E6A-944B-88D140945C3C}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Win32 = Debug|Win32
@@ -81,14 +84,30 @@ Global
 		{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|Win32.Build.0 = Release|Win32
 		{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|x64.ActiveCfg = Release|x64
 		{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|x64.Build.0 = Release|x64
-		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|Win32.ActiveCfg = Debug|Win32
-		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|Win32.Build.0 = Debug|Win32
-		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|x64.ActiveCfg = Debug|x64
-		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|x64.Build.0 = Debug|x64
-		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|Win32.ActiveCfg = Release|Win32
-		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|Win32.Build.0 = Release|Win32
-		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|x64.ActiveCfg = Release|x64
-		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|x64.Build.0 = Release|x64
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|Win32.ActiveCfg = Debug|Win32
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|Win32.Build.0 = Debug|Win32
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|x64.ActiveCfg = Debug|x64
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|x64.Build.0 = Debug|x64
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|Win32.ActiveCfg = Release|Win32
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|Win32.Build.0 = Release|Win32
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|x64.ActiveCfg = Release|x64
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|x64.Build.0 = Release|x64
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Debug|Win32.ActiveCfg = Debug|Win32
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Debug|Win32.Build.0 = Debug|Win32
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Debug|x64.ActiveCfg = Debug|x64
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Debug|x64.Build.0 = Debug|x64
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Release|Win32.ActiveCfg = Release|Win32
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Release|Win32.Build.0 = Release|Win32
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Release|x64.ActiveCfg = Release|x64
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Release|x64.Build.0 = Release|x64
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Debug|Win32.ActiveCfg = Debug|Win32
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Debug|Win32.Build.0 = Debug|Win32
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Debug|x64.ActiveCfg = Debug|x64
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Debug|x64.Build.0 = Debug|x64
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|Win32.ActiveCfg = Release|Win32
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|Win32.Build.0 = Release|Win32
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|x64.ActiveCfg = Release|x64
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|x64.Build.0 = Release|x64
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
--- a/examples/mandelbrot/mandelbrot.cpp
+++ b/examples/mandelbrot/mandelbrot.cpp
@@ -64,6 +64,7 @@ writePPM(int *buf, int width, int height, const char *fn) {
            fputc(c, fp);
    }
    fclose(fp);
+    printf("Wrote image file %s\n", fn);
 }


--- a/examples/mandelbrot/mandelbrot_serial.cpp
+++ b/examples/mandelbrot/mandelbrot_serial.cpp
@@ -36,7 +36,7 @@ static int mandel(float c_re, float c_im, int count) {
    float z_re = c_re, z_im = c_im;
    int i;
    for (i = 0; i < count; ++i) {
-        if (z_re * z_re + z_im * z_im > 4.)
+        if (z_re * z_re + z_im * z_im > 4.f)
            break;

        float new_re = z_re*z_re - z_im*z_im;
--- a/examples/mandelbrot_tasks/Makefile
+++ b/examples/mandelbrot_tasks/Makefile
@@ -1,18 +1,18 @@

 ARCH = $(shell uname)

-TASK_CXX=tasks_pthreads.cpp
+TASK_CXX=../tasks_pthreads.cpp
 TASK_LIB=-lpthread

 ifeq ($(ARCH), Darwin)
-  TASK_CXX=tasks_gcd.cpp
+  TASK_CXX=../tasks_gcd.cpp
  TASK_LIB=
 endif

-TASK_OBJ=$(addprefix objs/, $(TASK_CXX:.cpp=.o))
+TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))

-CXX=g++ -m64
-CXXFLAGS=-Iobjs/ -O3 -Wall
+CXX=g++
+CXXFLAGS=-Iobjs/ -O3 -Wall -m64
 ISPC=ispc
 ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64

@@ -32,6 +32,9 @@ mandelbrot: dirs objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc
 objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@

+objs/%.o: ../%.cpp
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
 objs/mandelbrot.o: objs/mandelbrot_ispc.h 

 objs/%_ispc.h objs/%_ispc.o: %.ispc
--- a/examples/mandelbrot_tasks/mandelbrot.cpp
+++ b/examples/mandelbrot_tasks/mandelbrot.cpp
@@ -40,6 +40,7 @@

 #include <stdio.h>
 #include <algorithm>
+#include <string.h>
 #include "../timing.h"
 #include "../cpuid.h"
 #include "mandelbrot_ispc.h"
@@ -64,6 +65,7 @@ writePPM(int *buf, int width, int height, const char *fn) {
            fputc(c, fp);
    }
    fclose(fp);
+    printf("Wrote image file %s\n", fn);
 }


@@ -98,8 +100,12 @@ ensureTargetISAIsSupported() {
    }
 }

+static void usage() {  // print the command-line synopsis to stderr, then exit with status 1
+    fprintf(stderr, "usage: mandelbrot [--scale=<factor>]\n");
+    exit(1);
+}

-int main() {
+int main(int argc, char *argv[]) {
    unsigned int width = 1536;
    unsigned int height = 1024;
    float x0 = -2;
@@ -107,10 +113,26 @@ int main() {
    float y0 = -1;
    float y1 = 1;

-    ensureTargetISAIsSupported();
+    if (argc == 1)
+        ;
+    else if (argc == 2) {
+        if (strncmp(argv[1], "--scale=", 8) == 0) {
+            float scale = atof(argv[1] + 8);
+            if (scale == 0.f)
+                usage();
+            width *= scale;
+            height *= scale;
+            // round up to multiples of 16
+            width = (width + 0xf) & ~0xf;
+            height = (height + 0xf) & ~0xf;
+        }
+        else 
+            usage();
+    }
+    else
+        usage();

-    extern void TasksInit();
-    TasksInit();
+    ensureTargetISAIsSupported();

    int maxIterations = 512;
    int *buf = new int[width*height];
--- a/examples/mandelbrot_tasks/mandelbrot_serial.cpp
+++ b/examples/mandelbrot_tasks/mandelbrot_serial.cpp
@@ -36,7 +36,7 @@ static int mandel(float c_re, float c_im, int count) {
    float z_re = c_re, z_im = c_im;
    int i;
    for (i = 0; i < count; ++i) {
-        if (z_re * z_re + z_im * z_im > 4.)
+        if (z_re * z_re + z_im * z_im > 4.f)
            break;

        float new_re = z_re*z_re - z_im*z_im;
--- a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
+++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -143,7 +143,7 @@
  <ItemGroup>
    <ClCompile Include="mandelbrot.cpp" />
    <ClCompile Include="mandelbrot_serial.cpp" />
-    <ClCompile Include="tasks_concrt.cpp" />
+    <ClCompile Include="../tasks_concrt.cpp" />
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="mandelbrot.ispc">
--- a/examples/noise/noise.ispc
+++ b/examples/noise/noise.ispc
@@ -131,11 +131,11 @@ static float Noise(float x, float y, float z) {
 }


-static float Turbulence(float x, float y, float z, int octaves) {
+static float Turbulence(float x, float y, float z, uniform int octaves) {
    float omega = 0.6;

    float sum = 0., lambda = 1., o = 1.;
-    for (int i = 0; i < octaves; ++i) {
+    for (uniform int i = 0; i < octaves; ++i) {
        sum += abs(o * Noise(lambda * x, lambda * y, lambda * z));
        lambda *= 1.99f;
        o *= omega;
--- a/examples/noise/noise_serial.cpp
+++ b/examples/noise/noise_serial.cpp
@@ -104,7 +104,7 @@ inline float NoiseWeight(float t) {


 inline float Lerp(float t, float low, float high) {
-    return (1. - t) * low + t * high;
+    return (1.f - t) * low + t * high;
 }


@@ -147,7 +147,7 @@ static float Turbulence(float x, float y, float z, int octaves) {
        lambda *= 1.99f;
        o *= omega;
    }
-    return sum * 0.5;
+    return sum * 0.5f;
 }


@@ -163,7 +163,7 @@ void noise_serial(float x0, float y0, float x1, float y1,
            float y = y0 + j * dy;

            int index = (j * width + i);
-            output[index] = Turbulence(x, y, 0.6, 8);
+            output[index] = Turbulence(x, y, 0.6f, 8);
        }
    }
 }
--- a/examples/options/options_serial.cpp
+++ b/examples/options/options_serial.cpp
@@ -47,7 +47,7 @@ static inline float
 CND(float X) {
    float L = fabsf(X);

-    float k = 1.0 / (1.0 + 0.2316419 * L);
+    float k = 1.f / (1.f + 0.2316419f * L);
    float k2 = k*k;
    float k3 = k2*k;
    float k4 = k2*k2;
@@ -59,7 +59,7 @@ CND(float X) {
    w *= invSqrt2Pi * expf(-L * L * .5f);

    if (X > 0.f)
-        w = 1.0 - w;
+        w = 1.f - w;
    return w;
 }

@@ -94,7 +94,7 @@ binomial_put_serial(float Sa[], float Xa[], float Ta[],

        float dt = T / BINOMIAL_NUM;
        float u = expf(v * sqrtf(dt));
-        float d = 1. / u;
+        float d = 1.f / u;
        float disc = expf(r * dt);
        float Pu = (disc - d) / (u - d);

--- a/examples/rt/Makefile
+++ b/examples/rt/Makefile
@@ -1,6 +1,18 @@

-CXX=g++ -m64
-CXXFLAGS=-Iobjs/ -O3 -Wall
+ARCH = $(shell uname)
+
+TASK_CXX=../tasks_pthreads.cpp
+TASK_LIB=-lpthread
+
+ifeq ($(ARCH), Darwin)
+  TASK_CXX=../tasks_gcd.cpp
+  TASK_LIB=
+endif
+
+TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
+
+CXX=g++
+CXXFLAGS=-Iobjs/ -O3 -Wall -m64
 ISPC=ispc
 ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64

@@ -14,11 +26,16 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ rt

-rt: dirs objs/rt.o objs/rt_serial.o objs/rt_ispc.o
-	$(CXX) $(CXXFLAGS) -o $@ objs/rt.o objs/rt_ispc.o objs/rt_serial.o -lm
+rt: dirs objs/rt.o objs/rt_serial.o objs/rt_ispc.o $(TASK_OBJ)
+	$(CXX) $(CXXFLAGS) -o $@ objs/rt.o objs/rt_ispc.o objs/rt_serial.o $(TASK_OBJ) -lm $(TASK_LIB)

-objs/%.o: %.cpp objs/rt_ispc.h
+objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@

+objs/%.o: ../%.cpp
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+objs/rt.o: objs/rt_ispc.h 
+
 objs/%_ispc.h objs/%_ispc.o: %.ispc
 	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/rt/rt.cpp
+++ b/examples/rt/rt.cpp
@@ -42,6 +42,7 @@
 #include <math.h>
 #include <algorithm>
 #include <assert.h>
+#include <string.h>
 #include <sys/types.h>
 #include "../timing.h"
 #include "../cpuid.h"
@@ -51,7 +52,8 @@ using namespace ispc;

 typedef unsigned int uint;

-extern void raytrace_serial(int width, int height, const float raster2camera[4][4], 
+extern void raytrace_serial(int width, int height, int baseWidth, int baseHeight,
+                            const float raster2camera[4][4], 
                            const float camera2world[4][4], float image[],
                            int id[], const LinearBVHNode nodes[],
                            const Triangle triangles[]);
@@ -90,6 +92,7 @@ static void writeImage(int *idImage, float *depthImage, int width, int height,
        }
    }            
    fclose(f);
+    printf("Wrote image file %s\n", filename);
 }


@@ -125,11 +128,28 @@ ensureTargetISAIsSupported() {
 }


+static void usage() {  // print the command-line synopsis to stderr, then exit with status 1
+    fprintf(stderr, "usage: rt [--scale=<factor>] <scene name base>\n");
+    exit(1);
+}
+
+
 int main(int argc, char *argv[]) {
-    if (argc != 2) {
-        fprintf(stderr, "usage: rt <filename base>\n");
-        exit(1);
+    float scale = 1.f;
+    const char *filename = NULL;
+    for (int i = 1; i < argc; ++i) {
+        if (strncmp(argv[i], "--scale=", 8) == 0) {
+            scale = atof(argv[i] + 8);
+            if (scale == 0.f)
+                usage();
+        }
+        else if (filename != NULL)
+            usage();
+        else
+            filename = argv[i];
    }
+    if (filename == NULL)
+        usage();

    ensureTargetISAIsSupported();

@@ -143,10 +163,10 @@ int main(int argc, char *argv[]) {
    // Read the camera specification information from the camera file
    //
    char fnbuf[1024];
-    sprintf(fnbuf, "%s.camera", argv[1]);
+    sprintf(fnbuf, "%s.camera", filename);
    FILE *f = fopen(fnbuf, "rb");
    if (!f) {
-        perror(argv[1]);
+        perror(fnbuf);
        return 1;
    }

@@ -154,20 +174,20 @@ int main(int argc, char *argv[]) {
    // Nothing fancy, and trouble if we run on a big-endian system, just
    // fread in the bits
    //
-    int width, height;
+    int baseWidth, baseHeight;
    float camera2world[4][4], raster2camera[4][4];
-    READ(width, 1);
-    READ(height, 1);
+    READ(baseWidth, 1);
+    READ(baseHeight, 1);
    READ(camera2world[0][0], 16);
    READ(raster2camera[0][0], 16);

    //
    // Read in the serialized BVH 
    //
-    sprintf(fnbuf, "%s.bvh", argv[1]);
+    sprintf(fnbuf, "%s.bvh", filename);
    f = fopen(fnbuf, "rb");
    if (!f) {
-        perror(argv[2]);
+        perror(fnbuf);
        return 1;
    }

@@ -214,10 +234,10 @@ int main(int argc, char *argv[]) {
    }
    fclose(f);

-    // round image resolution up to multiple of 4 to makethings easy for
+    // round image resolution up to multiple of 16 to make things easy for
    // the code that assigns pixels to ispc program instances
-    height = (height + 3) & ~3;
-    width = (width + 3) & ~3;
+    int height = (int(baseHeight * scale) + 0xf) & ~0xf;
+    int width = (int(baseWidth * scale) + 0xf) & ~0xf;

    // allocate images; one to hold hit object ids, one to hold depth to
    // the first interseciton
@@ -225,19 +245,42 @@ int main(int argc, char *argv[]) {
    float *image = new float[width*height];

    //
-    // Run 3 iterations with ispc, record the minimum time
+    // Run 3 iterations with ispc + 1 core, record the minimum time
    //
    double minTimeISPC = 1e30;
    for (int i = 0; i < 3; ++i) {
        reset_and_start_timer();
-        raytrace(width, height, raster2camera, camera2world, 
-                 image, id, nodes, triangles);
+        raytrace_ispc(width, height, baseWidth, baseHeight, raster2camera, 
+                      camera2world, image, id, nodes, triangles);
        double dt = get_elapsed_mcycles();
        minTimeISPC = std::min(dt, minTimeISPC);
    }
-    printf("[rt ispc]:\t\t\t[%.3f] million cycles for %d x %d image\n", minTimeISPC, width, height);
+    printf("[rt ispc, 1 core]:\t\t[%.3f] million cycles for %d x %d image\n", 
+           minTimeISPC, width, height);

-    writeImage(id, image, width, height, "rt-ispc.ppm");
+    writeImage(id, image, width, height, "rt-ispc-1core.ppm");
+
+    memset(id, 0, width*height*sizeof(int));
+    memset(image, 0, width*height*sizeof(float));
+
+    //
+    // Run 3 iterations with ispc + tasks, record the minimum time
+    //
+    double minTimeISPCtasks = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        raytrace_ispc_tasks(width, height, baseWidth, baseHeight, raster2camera,
+                            camera2world, image, id, nodes, triangles);
+        double dt = get_elapsed_mcycles();
+        minTimeISPCtasks = std::min(dt, minTimeISPCtasks);
+    }
+    printf("[rt ispc + tasks]:\t\t[%.3f] million cycles for %d x %d image\n", 
+           minTimeISPCtasks, width, height);
+
+    writeImage(id, image, width, height, "rt-ispc-tasks.ppm");
+
+    memset(id, 0, width*height*sizeof(int));
+    memset(image, 0, width*height*sizeof(float));

    //
    // And 3 iterations with the serial implementation, reporting the
@@ -246,14 +289,15 @@ int main(int argc, char *argv[]) {
    double minTimeSerial = 1e30;
    for (int i = 0; i < 3; ++i) {
        reset_and_start_timer();
-        raytrace_serial(width, height, raster2camera, camera2world, 
-                        image, id, nodes, triangles);
+        raytrace_serial(width, height, baseWidth, baseHeight, raster2camera, 
+                        camera2world, image, id, nodes, triangles);
        double dt = get_elapsed_mcycles();
        minTimeSerial = std::min(dt, minTimeSerial);
    }
    printf("[rt serial]:\t\t\t[%.3f] million cycles for %d x %d image\n", 
           minTimeSerial, width, height);
-    printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minTimeSerial / minTimeISPC);
+    printf("\t\t\t\t(%.2fx speedup from ISPC, %.2f from ISPC + tasks)\n", 
+           minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCtasks);

    writeImage(id, image, width, height, "rt-serial.ppm");

--- a/examples/rt/rt.ispc
+++ b/examples/rt/rt.ispc
@@ -226,20 +226,26 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
 }


-export void raytrace(uniform int width, uniform int height,
-                     const uniform float raster2camera[4][4], 
-                     const uniform float camera2world[4][4],
-                     uniform float image[], uniform int id[],
-                     const LinearBVHNode nodes[],
-                     const Triangle triangles[]) {
+static void raytrace_tile(uniform int x0, uniform int x1,
+                          uniform int y0, uniform int y1, 
+                          uniform int width, uniform int height,
+                          uniform int baseWidth, uniform int baseHeight,
+                          const uniform float raster2camera[4][4], 
+                          const uniform float camera2world[4][4],
+                          uniform float image[], uniform int id[],
+                          const LinearBVHNode nodes[],
+                          const Triangle triangles[]) {
+    uniform float widthScale = (float)(baseWidth) / (float)(width);
+    uniform float heightScale = (float)(baseHeight) / (float)(height);
+
    static const uniform float udx[16] = { 0, 1, 0, 1, 2, 3, 2, 3, 
                                           0, 1, 0, 1, 2, 3, 2, 3 };
    static const uniform float udy[16] = { 0, 0, 1, 1, 0, 0, 1, 1, 
                                           2, 2, 3, 3, 2, 2, 3, 3 };

    // The outer loops are always over blocks of 4x4 pixels
-    for (uniform int y = 0; y < height; y += 4) {
-        for (uniform int x = 0; x < width; x += 4) {
+    for (uniform int y = y0; y < y1; y += 4) {
+        for (uniform int x = x0; x < x1; x += 4) {
            // Now we have a block of 4x4=16 pixels to process; it will
            // take 16/programCount iterations of this loop to process
            // them.
@@ -251,7 +257,8 @@ export void raytrace(uniform int width, uniform int height,
                const float dy = udy[o * programCount + programIndex];

                Ray ray;
-                generateRay(raster2camera, camera2world, x+dx, y+dy, ray);
+                generateRay(raster2camera, camera2world, (x+dx)*widthScale,
+                            (y+dy)*heightScale, ray);
                BVHIntersect(nodes, triangles, ray);

                int offset = (y + (int)dy) * width + (x + (int)dx);
@@ -261,3 +268,51 @@ export void raytrace(uniform int width, uniform int height,
        }
    }
 }
+
+// Exported entry point: traces the whole image with one raytrace_tile call over [0,width) x [0,height).
+export void raytrace_ispc(uniform int width, uniform int height,
+                          uniform int baseWidth, uniform int baseHeight,
+                          const uniform float raster2camera[4][4], 
+                          const uniform float camera2world[4][4],
+                          uniform float image[], uniform int id[],
+                          const LinearBVHNode nodes[],
+                          const Triangle triangles[]) {
+    raytrace_tile(0, width, 0, height, width, height, baseWidth, baseHeight,
+                  raster2camera, camera2world, image,
+                  id, nodes, triangles);
+}
+
+// Task wrapper so a tile can be started with "launch"; forwards every argument to raytrace_tile.
+task void raytrace_tile_task(uniform int x0, uniform int x1,
+                             uniform int y0, uniform int y1, 
+                             uniform int width, uniform int height,
+                             uniform int baseWidth, uniform int baseHeight,
+                             const uniform float raster2camera[4][4], 
+                             const uniform float camera2world[4][4],
+                             uniform float image[], uniform int id[],
+                             const LinearBVHNode nodes[],
+                             const Triangle triangles[]) {
+    raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight, 
+                  raster2camera, camera2world, image,
+                  id, nodes, triangles);
+}
+
+// Exported entry point: cuts the image into tiles of up to 16x16 pixels and launches one raytrace_tile_task per tile.
+export void raytrace_ispc_tasks(uniform int width, uniform int height,
+                                uniform int baseWidth, uniform int baseHeight,
+                                const uniform float raster2camera[4][4], 
+                                const uniform float camera2world[4][4],
+                                uniform float image[], uniform int id[],
+                                const LinearBVHNode nodes[],
+                                const Triangle triangles[]) {
+    uniform int dx = 16, dy = 16;
+    for (uniform int y = 0; y < height; y += dy) {
+        uniform int y1 = min(y + dy, height);
+        for (uniform int x = 0; x < width; x += dx) {
+            uniform int x1 = min(x + dx, width);
+            launch < raytrace_tile_task(x, x1, y, y1, width, height, baseWidth,
+                                        baseHeight, raster2camera, camera2world, 
+                                        image, id, nodes, triangles) >;
+         }
+    }
+}
--- a/examples/rt/rt.vcxproj
+++ b/examples/rt/rt.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -164,6 +164,7 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
  <ItemGroup>
    <ClCompile Include="rt.cpp" />
    <ClCompile Include="rt_serial.cpp" />
+    <ClCompile Include="../tasks_concrt.cpp" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
--- a/examples/rt/rt_serial.cpp
+++ b/examples/rt/rt_serial.cpp
@@ -258,17 +258,21 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
 }


-void raytrace_serial(int width, int height,
+void raytrace_serial(int width, int height, int baseWidth, int baseHeight,
                     const float raster2camera[4][4], 
                     const float camera2world[4][4],
                     float image[],
                     int id[],
                     const LinearBVHNode nodes[],
                     const Triangle triangles[]) {
+    float widthScale = float(baseWidth) / float(width);
+    float heightScale = float(baseHeight) / float(height);
+
    for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x) {
                Ray ray;
-                generateRay(raster2camera, camera2world, x, y, ray);
+                generateRay(raster2camera, camera2world, x * widthScale,
+                            y * heightScale, ray);
                BVHIntersect(nodes, triangles, ray);

                int offset = y * width + x;
--- a/examples/stencil/.gitignore
+++ b/examples/stencil/.gitignore
@@ -0,0 +1,2 @@
+stencil
+objs
--- a/examples/stencil/Makefile
+++ b/examples/stencil/Makefile
@@ -0,0 +1,41 @@
+# Builds the "stencil" example: C++ driver, serial reference, ispc kernels and a task system.
+ARCH = $(shell uname)
+
+TASK_CXX=../tasks_pthreads.cpp
+TASK_LIB=-lpthread
+# Darwin builds use ../tasks_gcd.cpp instead and need no extra task library.
+ifeq ($(ARCH), Darwin)
+  TASK_CXX=../tasks_gcd.cpp
+  TASK_LIB=
+endif
+
+TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
+
+CXX=g++
+CXXFLAGS=-Iobjs/ -O3 -Wall -m64
+ISPC=ispc
+ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
+
+default: stencil
+
+.PHONY: default dirs clean
+
+dirs:
+	/bin/mkdir -p objs/
+
+clean:
+	/bin/rm -rf objs *~ stencil
+# "dirs" is order-only everywhere: objs/ must exist first, but must never force a rebuild.
+stencil: objs/stencil.o objs/stencil_serial.o objs/stencil_ispc.o $(TASK_OBJ) | dirs
+	$(CXX) $(CXXFLAGS) -o $@ objs/stencil.o objs/stencil_ispc.o objs/stencil_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
+
+objs/%.o: %.cpp | dirs
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+objs/%.o: ../%.cpp | dirs
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+# stencil.cpp includes the ispc-generated header, so it must be generated before compiling.
+objs/stencil.o: objs/stencil_ispc.h 
+
+objs/%_ispc.h objs/%_ispc.o: %.ispc | dirs
+	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/stencil/stencil.cpp
+++ b/examples/stencil/stencil.cpp
@@ -0,0 +1,186 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define NOMINMAX
+#pragma warning (disable: 4244)
+#pragma warning (disable: 4305)
+#endif
+
+#include <stdio.h>
+#include <algorithm>
+#include <math.h>
+#include "../timing.h"
+#include "../cpuid.h"
+#include "stencil_ispc.h"
+using namespace ispc;
+
+
+// Verify at startup that the CPU supports the vector ISA the ispc code was
+// compiled for (selected by the ISPC_TARGET_* macro from the generated header
+// included above); otherwise print an explanation to stderr and exit(1).
+static void
+ensureTargetISAIsSupported() {
+#if defined(ISPC_TARGET_SSE2)
+    bool isaSupported = CPUSupportsSSE2();
+    const char *target = "SSE2";
+#elif defined(ISPC_TARGET_SSE4)
+    bool isaSupported = CPUSupportsSSE4();
+    const char *target = "SSE4";
+#elif defined(ISPC_TARGET_AVX)
+    bool isaSupported = CPUSupportsAVX();
+    const char *target = "AVX";
+#else
+#error "Unknown ISPC_TARGET_* value"
+#endif
+    if (!isaSupported) {
+        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
+                "set, which isn't\n***        supported by this computer's CPU!\n", target);
+        fprintf(stderr, "***\n***        Please modify the "
+#ifdef _MSC_VER
+                "MSVC project file "
+#else
+                "Makefile "
+#endif
+                "to select another target (e.g. sse2)\n***\n");
+        exit(1);
+    }
+}
+
+// Serial reference implementation (linked from another translation unit); coef has 4 entries, as in the ispc entry points.
+extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
+                                int y0, int y1, int z0, int z1,
+                                int Nx, int Ny, int Nz,
+                                const float coef[4], 
+                                const float vsq[],
+                                float Aeven[], float Aodd[]);
+
+// Fills A[0] with a ramp (x/Nx in the lower x half, y/Ny elsewhere), zeroes A[1], and sets vsq to x*y*z/(Nx*Ny*Nz).
+void InitData(int Nx, int Ny, int Nz, float *A[2], float *vsq) {
+    float *ramp = A[0], *zeros = A[1];
+    for (int z = 0, i = 0; z < Nz; ++z)
+        for (int y = 0; y < Ny; ++y)
+            for (int x = 0; x < Nx; ++x, ++i) {
+                ramp[i] = (x >= Nx / 2) ? y / float(Ny) : x / float(Nx);
+                zeros[i] = 0;
+                vsq[i] = x*y*z / float(Nx * Ny * Nz);
+            }
+}
+
+// Times the ispc (1 core), ispc + tasks and serial stencil runs, then checks that ispc and serial results agree.
+int main() {
+    ensureTargetISAIsSupported();
+
+    int Nx = 256, Ny = 256, Nz = 256;
+    int width = 4;  // border left untouched on every face of the volume
+    float *Aserial[2], *Aispc[2];
+    Aserial[0] = new float [Nx * Ny * Nz];
+    Aserial[1] = new float [Nx * Ny * Nz];
+    Aispc[0] = new float [Nx * Ny * Nz];
+    Aispc[1] = new float [Nx * Ny * Nz];
+    float *vsq = new float [Nx * Ny * Nz];
+
+    float coeff[4] = { 0.5, -.25, .125, -.0625 }; 
+
+    InitData(Nx, Ny, Nz, Aispc, vsq);
+
+    //
+    // Run the stencil using the ispc implementation on one core; report
+    // the minimum time of three runs.
+    //
+    double minTimeISPC = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
+                          width, Nz - width, Nx, Ny, Nz, coeff, vsq,
+                          Aispc[0], Aispc[1]);
+        double dt = get_elapsed_mcycles();
+        minTimeISPC = std::min(minTimeISPC, dt);
+    }
+
+    printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC);
+
+    InitData(Nx, Ny, Nz, Aispc, vsq);
+
+    //
+    // Run the stencil using the ispc implementation with tasks; report
+    // the minimum time of three runs.
+    //
+    double minTimeISPCTasks = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
+                                width, Nz - width, Nx, Ny, Nz, coeff, vsq,
+                                Aispc[0], Aispc[1]);
+        double dt = get_elapsed_mcycles();
+        minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
+    }
+
+    printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks);
+
+    InitData(Nx, Ny, Nz, Aserial, vsq);
+
+    // 
+    // And run the serial implementation 3 times, again reporting the
+    // minimum time.
+    //
+    double minTimeSerial = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width,
+                            width, Nz - width, Nx, Ny, Nz, coeff, vsq,
+                            Aserial[0], Aserial[1]);
+        double dt = get_elapsed_mcycles();
+        minTimeSerial = std::min(minTimeSerial, dt);
+    }
+
+    printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial);
+
+    printf("\t\t\t\t(%.2fx speedup from ISPC, %.2f from ISPC + tasks)\n", 
+           minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
+
+    // Check for agreement
+    int offset = 0;
+    for (int z = 0; z < Nz; ++z)
+        for (int y = 0; y < Ny; ++y)
+            for (int x = 0; x < Nx; ++x, ++offset) {
+                // Relative test written without a division, so zero-valued cells cannot yield NaN.
+                float diff = fabsf(Aserial[1][offset] - Aispc[1][offset]);
+                if (diff > 1e-4f * fabsf(Aserial[1][offset]))
+                    printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n",
+                           x, y, z, Aispc[1][offset], Aserial[1][offset]);
+            }
+
+    return 0;
+}
--- a/examples/stencil/stencil.ispc
+++ b/examples/stencil/stencil.ispc
@@ -0,0 +1,129 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+// One time step on [x0,x1)x[y0,y1)x[z0,z1): Aout = 2*Ain - Aout + vsq*div; div reads Ain up to 3 cells away per axis.
+static void
+stencil_step(uniform int x0, uniform int x1,
+             uniform int y0, uniform int y1,
+             uniform int z0, uniform int z1,
+             uniform int Nx, uniform int Ny, uniform int Nz,
+             uniform const float coef[4], uniform const float vsq[],
+             uniform const float Ain[], uniform float Aout[]) {
+    const uniform int Nxy = Nx * Ny;
+
+    for (uniform int z = z0; z < z1; ++z) {
+        for (uniform int y = y0; y < y1; ++y) {
+            // Assumes that (x1-x0) % programCount == 0
+            for (uniform int x = x0; x < x1; x += programCount) {
+                int index = (z * Nxy) + (y * Nx) + x + programIndex;
+#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
+#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
+                float div = coef[0] * A_cur(0, 0, 0) +
+                            coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
+                                       A_cur(0, +1, 0) + A_cur(0, -1, 0) +
+                                       A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
+                            coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
+                                       A_cur(0, +2, 0) + A_cur(0, -2, 0) +
+                                       A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
+                            coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
+                                       A_cur(0, +3, 0) + A_cur(0, -3, 0) +
+                                       A_cur(0, 0, +3) + A_cur(0, 0, -3));
+
+                A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) + 
+                    vsq[index] * div;
+            }
+        }
+    }
+}
+
+// Task wrapper so a stencil step can be started with "launch"; forwards every argument to stencil_step.
+static task void
+stencil_step_task(uniform int x0, uniform int x1,
+                  uniform int y0, uniform int y1,
+                  uniform int z0, uniform int z1,
+                  uniform int Nx, uniform int Ny, uniform int Nz,
+                  uniform const float coef[4], uniform const float vsq[],
+                  uniform const float Ain[], uniform float Aout[]) {
+    stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, Ain, Aout);
+}
+
+// Runs time steps t0..t1-1 with one task per z slab; even steps read Aeven and write Aodd, odd steps the reverse.
+export void
+loop_stencil_ispc_tasks(uniform int t0, uniform int t1, 
+                        uniform int x0, uniform int x1,
+                        uniform int y0, uniform int y1,
+                        uniform int z0, uniform int z1,
+                        uniform int Nx, uniform int Ny, uniform int Nz,
+                        uniform const float coef[4], 
+                        uniform const float vsq[],
+                        uniform float Aeven[], uniform float Aodd[])
+{
+    for (uniform int t = t0; t < t1; ++t) {
+        // Parallelize across cores as well: each task works on a slab of at
+        // most "dz" in z, clamped to z1 so dz need not divide z1-z0.  (dz=1
+        // seems to work better than any larger values.)
+        uniform int dz = 1;
+        for (uniform int z = z0; z < z1; z += dz) {
+            if ((t & 1) == 0)
+                launch < stencil_step_task(x0, x1, y0, y1, z, min(z+dz, z1), Nx, 
+                                           Ny, Nz, coef, vsq, Aeven, Aodd) >;
+            else
+                launch < stencil_step_task(x0, x1, y0, y1, z, min(z+dz, z1), Nx, 
+                                           Ny, Nz, coef, vsq, Aodd, Aeven) >;
+        }
+        // We need to wait for all of the launched tasks to finish before
+        // starting the next iteration.
+        sync;
+    }
+}
+
+// Runs time steps t0..t1-1 without tasks; odd steps read Aodd and write Aeven, even steps the reverse.
+export void
+loop_stencil_ispc(uniform int t0, uniform int t1, 
+                  uniform int x0, uniform int x1,
+                  uniform int y0, uniform int y1,
+                  uniform int z0, uniform int z1,
+                  uniform int Nx, uniform int Ny, uniform int Nz,
+                  uniform const float coef[4], 
+                  uniform const float vsq[],
+                  uniform float Aeven[], uniform float Aodd[])
+{
+    for (uniform int t = t0; t < t1; ++t) {
+        if ((t & 1) != 0)
+            stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
+                         Aodd, Aeven);
+        else
+            stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
+                         Aeven, Aodd);
+    }
+}
--- a/examples/stencil/stencil.vcxproj
+++ b/examples/stencil/stencil.vcxproj
@@ -0,0 +1,172 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{2ef070a1-f62f-4e6a-944b-88d140945c3c}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>stencil</RootNamespace> <!-- named after this project -->
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CustomBuild Include="stencil.ispc">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="stencil.cpp" />
+    <ClCompile Include="stencil_serial.cpp" />
+    <ClCompile Include="../tasks_concrt.cpp" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/examples/stencil/stencil_serial.cpp
+++ b/examples/stencil/stencil_serial.cpp
@@ -0,0 +1,86 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+
+// Applies one time step of the radius-3 stencil (centre plus +-1..3 on each
+// axis) over [x0,x1) x [y0,y1) x [z0,z1): reads Ain, updates Aout in place.
+static void
+stencil_step(int x0, int x1,
+             int y0, int y1,
+             int z0, int z1,
+             int Nx, int Ny, int Nz,
+             const float coef[4], const float vsq[],
+             const float Ain[], float Aout[]) {
+    const int strideY = Nx;       // elements between neighbours along y
+    const int strideZ = Nx * Ny;  // elements between neighbours along z
+    for (int z = z0; z < z1; ++z) {
+        for (int y = y0; y < y1; ++y) {
+            for (int x = x0; x < x1; ++x) {
+                int index = (z * strideZ) + (y * strideY) + x;
+                const float *cur = Ain + index;  // centre cell in the input grid
+                float *next = Aout + index;      // same cell in the output grid
+                float div = coef[0] * cur[0] +
+                            coef[1] * (cur[+1] + cur[-1] +
+                                       cur[+strideY] + cur[-strideY] +
+                                       cur[+strideZ] + cur[-strideZ]) +
+                            coef[2] * (cur[+2] + cur[-2] +
+                                       cur[+2 * strideY] + cur[-2 * strideY] +
+                                       cur[+2 * strideZ] + cur[-2 * strideZ]) +
+                            coef[3] * (cur[+3] + cur[-3] +
+                                       cur[+3 * strideY] + cur[-3 * strideY] +
+                                       cur[+3 * strideZ] + cur[-3 * strideZ]);
+                next[0] = 2 * cur[0] - next[0] + vsq[index] * div;
+            }
+        }
+    }
+}
+
+
+// Runs time steps t0..t1-1, ping-ponging between the two grids: even steps
+// read Aeven and update Aodd, odd steps read Aodd and update Aeven.
+void loop_stencil_serial(int t0, int t1,
+                         int x0, int x1,
+                         int y0, int y1,
+                         int z0, int z1,
+                         int Nx, int Ny, int Nz,
+                         const float coef[4],
+                         const float vsq[],
+                         float Aeven[], float Aodd[])
+{
+    for (int t = t0; t < t1; ++t) {
+        const bool evenStep = ((t & 1) == 0);
+        const float *src = evenStep ? Aeven : Aodd;
+        float *dst = evenStep ? Aodd : Aeven;
+        stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, src, dst);
+    }
+}
--- a/examples/taskinfo.h
+++ b/examples/taskinfo.h
@@ -0,0 +1,180 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifndef TASKINFO_H
+#define TASKINFO_H 1
+
+#ifdef _MSC_VER
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#ifdef ISPC_IS_WINDOWS
+#define NOMINMAX
+#include <windows.h>
+#include <concrt.h>
+using namespace Concurrency;
+#endif // ISPC_IS_WINDOWS
+
+#if (__SIZEOF_POINTER__ == 4) || defined(__i386__) || (defined(_WIN32) && !defined(_WIN64))
+#define ISPC_POINTER_BYTES 4   // _WIN32 alone is not enough: it is also defined on 64-bit Windows
+#elif (__SIZEOF_POINTER__ == 8) || defined(__x86_64__) || defined(__amd64__) || defined(_WIN64)
+#define ISPC_POINTER_BYTES 8   // selects the 64-bit compare-and-swap path further down
+#else
+#error "Pointer size unknown!"
+#endif // __SIZEOF_POINTER__
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+
+typedef struct TaskInfo {   // one launched task: what to call and with which argument
+    void *func;   // task entry point, stored untyped; the task runners cast it to TaskFuncType
+    void *data;   // passed as the first argument when func is invoked
+#if defined(ISPC_IS_WINDOWS)
+    event taskEvent;   // Concurrency Runtime event; the Windows runner sets it when the task finishes
+#endif
+} TaskInfo;
+
+
+#ifndef ISPC_IS_WINDOWS
+static int32_t   // returns the value *v held before the operation
+lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) {
+    int32_t result;
+    __asm__ __volatile__("lock\ncmpxchgl %2,%1"   // if (*v == eax) *v = newValue; else eax = *v
+                          : "=a"(result), "=m"(*v)
+                          : "q"(newValue), "0"(oldValue)   // "0" places oldValue in the "=a" register
+                          : "memory");
+    __asm__ __volatile__("mfence":::"memory");   // memory fence issued after the swap
+    return result;
+}
+#endif // !ISPC_IS_WINDOWS
+
+
+// Atomically performs: if (*v == oldValue) *v = newValue.  Returns the value
+// *v held beforehand, so the swap took place iff the result equals oldValue.
+static void *
+lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
+#ifdef ISPC_IS_WINDOWS
+    return InterlockedCompareExchangePointer(v, newValue, oldValue);
+#else
+    void *result;
+    // "0"(oldValue) places oldValue in the "=a" register that cmpxchg compares with *v.
+#if (ISPC_POINTER_BYTES == 4)
+    // The 32-bit operand form is "cmpxchgl"; "cmpxchgd" is not a valid mnemonic.
+    __asm__ __volatile__("lock\ncmpxchgl %2,%1" : "=a"(result), "=m"(*v)
+                          : "q"(newValue), "0"(oldValue) : "memory");
+#else
+    __asm__ __volatile__("lock\ncmpxchgq %2,%1" : "=a"(result), "=m"(*v)
+                          : "q"(newValue), "0"(oldValue) : "memory");
+#endif // ISPC_POINTER_BYTES
+    __asm__ __volatile__("mfence":::"memory");
+    return result;
+#endif // ISPC_IS_WINDOWS
+}
+
+
+#ifndef ISPC_IS_WINDOWS
+static int32_t   // returns the value *v held before delta was added
+lAtomicAdd32(volatile int32_t *v, int32_t delta) {
+    // Do atomic add with gcc x86 inline assembly
+    int32_t origValue;
+    __asm__ __volatile__("lock\n"
+                         "xaddl %0,%1"   // xadd: *v += %0, and %0 receives the previous *v
+                         : "=r"(origValue), "=m"(*v) : "0"(delta)   // %0 starts out holding delta
+                         : "memory");
+    return origValue;
+}
+#endif
+
+#define LOG_TASK_QUEUE_CHUNK_SIZE 13   // log2 of the number of TaskInfo slots in one chunk
+#define MAX_TASK_QUEUE_CHUNKS 1024   // length of the taskInfo[] chunk-pointer table
+#define TASK_QUEUE_CHUNK_SIZE (1<<LOG_TASK_QUEUE_CHUNK_SIZE)   // 8192 slots per chunk
+
+#define MAX_LAUNCHED_TASKS (MAX_TASK_QUEUE_CHUNKS * TASK_QUEUE_CHUNK_SIZE)   // total slots across all chunks
+
+typedef void (*TaskFuncType)(void *, int, int);   // task runners call it as (data, threadIndex, threadCount)
+
+#ifdef ISPC_IS_WINDOWS
+static volatile LONG nextTaskInfoCoordinate;   // LONG because it is passed to InterlockedAdd
+#else
+static volatile int nextTaskInfoCoordinate;   // number of slots handed out since the last reset
+#endif
+
+static TaskInfo *taskInfo[MAX_TASK_QUEUE_CHUNKS];   // chunk table; static, so each including file gets its own
+
+static inline void
+lInitTaskInfo() {   // allocates the first chunk of TaskInfo slots up front
+    taskInfo[0] = new TaskInfo[TASK_QUEUE_CHUNK_SIZE];
+}
+
+
+// Reserves the next free TaskInfo slot and returns it.  The coordinate's high
+// bits select a lazily allocated chunk, its low bits the slot inside it.
+static inline TaskInfo *
+lGetTaskInfo() {
+#ifdef ISPC_IS_WINDOWS
+    int myCoord = InterlockedAdd(&nextTaskInfoCoordinate, 1)-1;
+#else
+    int myCoord = lAtomicAdd32(&nextTaskInfoCoordinate, 1);
+#endif
+    int index = (myCoord >> LOG_TASK_QUEUE_CHUNK_SIZE);
+    int offset = myCoord & (TASK_QUEUE_CHUNK_SIZE-1);
+    if (index >= MAX_TASK_QUEUE_CHUNKS) {
+        fprintf(stderr, "A total of %d tasks have been launched--the simple "
+                "built-in task system can handle no more. Exiting.", myCoord);
+        exit(1);
+    }
+    if (taskInfo[index] == NULL) {
+        TaskInfo *newChunk = new TaskInfo[TASK_QUEUE_CHUNK_SIZE];
+        if (lAtomicCompareAndSwapPointer((void **)&taskInfo[index], newChunk,
+                                         NULL) != NULL) {
+            // lost the race to another thread; new[] memory needs delete[], not free()
+            assert(taskInfo[index] != NULL);
+            delete[] newChunk;
+        }
+    }
+    return &taskInfo[index][offset];
+}
+
+
+static inline void
+lResetTaskInfo() {   // rewinds the slot counter only; already-allocated chunks stay in taskInfo[]
+    nextTaskInfoCoordinate = 0;
+}
+
+#endif // TASKINFO_H
--- a/examples/mandelbrot_tasks/tasks_concrt.cpp
+++ b/examples/mandelbrot_tasks/tasks_concrt.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2011, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -31,6 +31,8 @@
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */

+#include "taskinfo.h"
+
 /* Simple task system implementation for ispc based on Microsoft's
   Concurrency Runtime. */

@@ -41,6 +43,7 @@ using namespace Concurrency;
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <algorithm>

 // ispc expects these functions to have C linkage / not be mangled
 extern "C" { 
@@ -50,84 +53,44 @@ extern "C" {
    void ISPCFree(void *ptr);
 }

-typedef void (*TaskFuncType)(void *, int, int);
-
-struct TaskInfo {
-    TaskFuncType ispcFunc;
-    void *ispcData;
-};
-
-// This is a simple implementation that just aborts if more than MAX_TASKS
-// are launched.  It could easily be extended to be more general...
-
-#define MAX_TASKS 4096
-static int taskOffset;
-static TaskInfo taskInfo[MAX_TASKS];
-static event *events[MAX_TASKS];
-static CRITICAL_SECTION criticalSection;
-static bool initialized = false;
-
-void
-TasksInit() {
-    InitializeCriticalSection(&criticalSection);
-    for (int i = 0; i < MAX_TASKS; ++i)
-        events[i] = new event;
-    initialized = true;
-}
-

 void __cdecl
 lRunTask(LPVOID param) {
    TaskInfo *ti = (TaskInfo *)param;
    
    // Actually run the task. 
-    // FIXME: like the tasks_gcd.cpp implementation, this is passing bogus
+    // FIXME: like the GCD implementation for OS X, this is passing bogus
    // values for the threadIndex and threadCount builtins, which in turn
-    // will cause bugs in code that uses those.  FWIW this example doesn't
-    // use them...
+    // will cause bugs in code that uses those.
    int threadIndex = 0;
    int threadCount = 1;
-    ti->ispcFunc(ti->ispcData, threadIndex, threadCount);
+    TaskFuncType func = (TaskFuncType)ti->func;
+    func(ti->data, threadIndex, threadCount);

    // Signal the event that this task is done
-    int taskNum = ti - &taskInfo[0];
-    events[taskNum]->set();
+    ti->taskEvent.set();
 }


 void
 ISPCLaunch(void *func, void *data) {
-    if (!initialized) {
-        fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
-        exit(1);
-    }
-
-    // Get a TaskInfo struct for this task
-    EnterCriticalSection(&criticalSection);
-    TaskInfo *ti = &taskInfo[taskOffset++];
-    assert(taskOffset < MAX_TASKS);
-    LeaveCriticalSection(&criticalSection);
-
-    // And pass it on to the Concurrency Runtime...
-    ti->ispcFunc = (TaskFuncType)func;
-    ti->ispcData = data;
+    TaskInfo *ti = lGetTaskInfo();
+    ti->func = (TaskFuncType)func;
+    ti->data = data;
+	ti->taskEvent.reset();
    CurrentScheduler::ScheduleTask(lRunTask, ti);
 }


 void ISPCSync() {
-    if (!initialized) {
-        fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
-        exit(1);
+    for (int i = 0; i < nextTaskInfoCoordinate; ++i) {
+		int index = (i >> LOG_TASK_QUEUE_CHUNK_SIZE);
+		int offset = i & (TASK_QUEUE_CHUNK_SIZE-1);
+		taskInfo[index][offset].taskEvent.wait();
+		taskInfo[index][offset].taskEvent.reset();
    }

-    event::wait_for_multiple(&events[0], taskOffset, true, 
-                             COOPERATIVE_TIMEOUT_INFINITE);
-
-    for (int i = 0; i < taskOffset; ++i)
-        events[i]->reset();
-
-    taskOffset = 0;
+    lResetTaskInfo();
 }


--- a/examples/mandelbrot_tasks/tasks_gcd.cpp
+++ b/examples/mandelbrot_tasks/tasks_gcd.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2011, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -31,61 +31,69 @@
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */

+#include "taskinfo.h"
+
+#if defined(_WIN32) || defined(_WIN64)
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
 /* A simple task system for ispc programs based on Apple's Grand Central
   Dispatch. */
-
 #include <dispatch/dispatch.h>
 #include <stdio.h>
+#include <stdint.h>
 #include <stdlib.h>

-static bool initialized = false;
+static int initialized = 0;
+static volatile int32_t lock = 0;
 static dispatch_queue_t gcdQueue;
 static dispatch_group_t gcdGroup;

 // ispc expects these functions to have C linkage / not be mangled
-extern "C" {
+extern "C" { 
    void ISPCLaunch(void *f, void *data);
    void ISPCSync();
-}
-
-struct TaskInfo {
-    void *func;
-    void *data;
-};
-
-
-void
-TasksInit() {
-    gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
-    gcdGroup = dispatch_group_create();
-    initialized = true;
+    void *ISPCMalloc(int64_t size, int32_t alignment);
+    void ISPCFree(void *ptr);
 }


 static void
 lRunTask(void *ti) {
-    typedef void (*TaskFuncType)(void *, int, int);
    TaskInfo *taskInfo = (TaskInfo *)ti;
-
-    TaskFuncType func = (TaskFuncType)(taskInfo->func);
-
    // FIXME: these are bogus values; may cause bugs in code that depends
    // on them having unique values in different threads.
    int threadIndex = 0;
    int threadCount = 1;
+    TaskFuncType func = (TaskFuncType)(taskInfo->func);
+
    // Actually run the task
    func(taskInfo->data, threadIndex, threadCount);
-
-    // FIXME: taskInfo leaks...
 }


 void ISPCLaunch(void *func, void *data) {
    if (!initialized) {
-        fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
-        exit(1);
+        while (1) {
+            if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {
+                if (!initialized) {
+                    gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
+                    gcdGroup = dispatch_group_create();
+                    lInitTaskInfo();
+                    __asm__ __volatile__("mfence":::"memory");
+                    initialized = 1;
+                }
+                lock = 0;
+                break;
+            }
+        }
    }
-    TaskInfo *ti = new TaskInfo;
+
+    TaskInfo *ti = lGetTaskInfo();
    ti->func = func;
    ti->data = data;
    dispatch_group_async_f(gcdGroup, gcdQueue, ti, lRunTask);
@@ -93,11 +101,26 @@ void ISPCLaunch(void *func, void *data) {


 void ISPCSync() {
-    if (!initialized) {
-        fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
-        exit(1);
-    }
+    if (!initialized)
+        return;

    // Wait for all of the tasks in the group to complete before returning
    dispatch_group_wait(gcdGroup, DISPATCH_TIME_FOREVER);
+
+    lResetTaskInfo();
 }
+
+// Aligned malloc for power-of-two alignments; release with ISPCFree().
+void *ISPCMalloc(int64_t size, int32_t alignment) {
+    void *mem = malloc(size + (alignment-1) + sizeof(void*));
+    uint64_t a = reinterpret_cast<uint64_t>(mem) + sizeof(void*);
+    a = (a + (alignment - 1)) & ~uint64_t(alignment - 1);  // pad by 0..alignment-1 so size bytes fit
+    ((void**)a)[-1] = mem;  // malloc() base pointer, read back by ISPCFree()
+    return (void*)a;
+}
+
+
+void ISPCFree(void *ptr) {   // ptr must have been returned by ISPCMalloc()
+    free(((void**)ptr)[-1]);   // the pointer-sized word just below ptr holds the malloc() base
+}
+
--- a/examples/mandelbrot_tasks/tasks_pthreads.cpp
+++ b/examples/mandelbrot_tasks/tasks_pthreads.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2011, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -31,6 +31,15 @@
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */

+#if defined(_WIN32) || defined(_WIN64)
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#include "taskinfo.h"
 #include <pthread.h>
 #include <semaphore.h>
 #include <string.h>
@@ -45,59 +54,45 @@
 #include <stdint.h>
 #include <stdlib.h>
 #include <errno.h>
-#include <vector>
+#ifdef ISPC_IS_LINUX
+#include <malloc.h>
+#endif
+
+static int initialized = 0;
+static volatile int32_t lock = 0;
+
+static int nThreads;
+static pthread_t *threads;
+static pthread_mutex_t taskQueueMutex;
+static int nextTaskToRun;
+static sem_t *workerSemaphore;
+static uint32_t numUnfinishedTasks;
+static pthread_mutex_t tasksRunningConditionMutex;
+static pthread_cond_t tasksRunningCondition;

 // ispc expects these functions to have C linkage / not be mangled
 extern "C" { 
    void ISPCLaunch(void *f, void *data);
    void ISPCSync();
+    void *ISPCMalloc(int64_t size, int32_t alignment);
+    void ISPCFree(void *ptr);
 }

-
-static int nThreads;
-static pthread_t *threads;
-static pthread_mutex_t taskQueueMutex;
-static std::vector<std::pair<void *, void *> > taskQueue;
-static sem_t *workerSemaphore;
-static uint32_t numUnfinishedTasks;
-static pthread_mutex_t tasksRunningConditionMutex;
-static pthread_cond_t tasksRunningCondition;
-
 static void *lTaskEntry(void *arg);

 /** Figure out how many CPU cores there are in the system
 */
 static int
 lNumCPUCores() {
-#if defined(__linux__)
    return sysconf(_SC_NPROCESSORS_ONLN);
-#else
-    // Mac
-    int mib[2];
-    mib[0] = CTL_HW;
-    size_t length = 2;
-    if (sysctlnametomib("hw.logicalcpu", mib, &length) == -1) {
-        fprintf(stderr, "sysctlnametomib() filed.  Guessing 2 cores.");
-        return 2;
-    }
-    assert(length == 2);
-
-    int nCores = 0;
-    size_t size = sizeof(nCores);
-
-    if (sysctl(mib, 2, &nCores, &size, NULL, 0) == -1) {
-        fprintf(stderr, "sysctl() to find number of cores present failed.  Guessing 2.");
-        return 2;
-    }
-    return nCores;
-#endif
 }

-void
-TasksInit() {
+
+static void
+lTasksInit() {
    nThreads = lNumCPUCores();

-    threads = new pthread_t[nThreads];
+    threads = (pthread_t *)malloc(nThreads * sizeof(pthread_t));

    int err;
    if ((err = pthread_mutex_init(&taskQueueMutex, NULL)) != 0) {
@@ -106,7 +101,7 @@ TasksInit() {
    }

    char name[32];
-    sprintf(name, "mandelbrot.%d", (int)getpid());
+    sprintf(name, "ispc_task.%d", (int)getpid());
    workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0);
    if (!workerSemaphore) {
        fprintf(stderr, "Error creating semaphore: %s\n", strerror(err));
@@ -124,7 +119,7 @@ TasksInit() {
    }

    for (int i = 0; i < nThreads; ++i) {
-        err = pthread_create(&threads[i], NULL, &lTaskEntry, reinterpret_cast<void *>(i));
+        err = pthread_create(&threads[i], NULL, &lTaskEntry, (void *)(i));
        if (err != 0) {
            fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err));
            exit(1);
@@ -135,21 +130,35 @@ TasksInit() {

 void
 ISPCLaunch(void *f, void *d) {
-    if (threads == NULL) {
-        fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
-        exit(1);
+    int err;
+
+    if (!initialized) {
+        while (1) {
+            if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {
+                if (!initialized) {
+                    lTasksInit();
+                    __asm__ __volatile__("mfence":::"memory");
+                    initialized = 1;
+                }
+                lock = 0;
+                break;
+            }
+        }
    }

    //
    // Acquire mutex, add task
    //
-    int err;
    if ((err = pthread_mutex_lock(&taskQueueMutex)) != 0) {
        fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
        exit(1);
    }

-    taskQueue.push_back(std::make_pair(f, d));
+    // Need a mutex here to ensure we get this filled in before a worker
+    // grabs it and starts running...
+    TaskInfo *ti = lGetTaskInfo();
+    ti->func = f;
+    ti->data = d;

    if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
        fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
@@ -164,6 +173,7 @@ ISPCLaunch(void *f, void *d) {
        exit(1);
    }

+    // FIXME: is this redundant with nextTaskInfoCoordinate?
    ++numUnfinishedTasks;

    if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) {
@@ -184,17 +194,17 @@ ISPCLaunch(void *f, void *d) {

 static void *
 lTaskEntry(void *arg) {
-    int threadIndex = int(reinterpret_cast<int64_t>(arg));
+    int threadIndex = (int)((int64_t)arg);
    int threadCount = nThreads;
+    TaskFuncType func;

-    while (true) {
+    while (1) {
        int err;
        if ((err = sem_wait(workerSemaphore)) != 0) {
            fprintf(stderr, "Error from sem_wait: %s\n", strerror(err));
            exit(1);
        }

-        std::pair<void *, void *> myTask;
        //
        // Acquire mutex, get task
        //
@@ -202,7 +212,8 @@ lTaskEntry(void *arg) {
            fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
            exit(1);
        }
-        if (taskQueue.size() == 0) {
+
+        if (nextTaskToRun == nextTaskInfoCoordinate) {
            //
            // Task queue is empty, go back and wait on the semaphore
            //
@@ -213,8 +224,10 @@ lTaskEntry(void *arg) {
            continue;
        }

-        myTask = taskQueue.back();
-        taskQueue.pop_back();
+        int runCoord = nextTaskToRun++;
+        int index = (runCoord >> LOG_TASK_QUEUE_CHUNK_SIZE);
+        int offset = runCoord & (TASK_QUEUE_CHUNK_SIZE-1);
+        TaskInfo *myTask = &taskInfo[index][offset];

        if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
            fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
@@ -224,9 +237,8 @@ lTaskEntry(void *arg) {
        //
        // Do work for _myTask_
        //
-        typedef void (*TaskFunType)(void *, int, int);
-        TaskFunType func = (TaskFunType)myTask.first;
-        func(myTask.second, threadIndex, threadCount);
+        func = (TaskFuncType)myTask->func;
+        func(myTask->data, threadIndex, threadCount);

        //
        // Decrement the number of unfinished tasks counter
@@ -236,6 +248,8 @@ lTaskEntry(void *arg) {
            exit(1);
        }

+        // FIXME: can this be a comparison of (nextTaskToRun == nextTaskInfoCoordinate)?
+        // (I don't think so--think there is a race...)
        int unfinished = --numUnfinishedTasks;
        if (unfinished == 0) {
            //
@@ -261,11 +275,6 @@ lTaskEntry(void *arg) {


 void ISPCSync() {
-    if (threads == NULL) {
-        fprintf(stderr, "You must call TasksInit() before launching tasks.\n");
-        exit(1);
-    }
-
    int err;
    if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) {
        fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
@@ -283,6 +292,9 @@ void ISPCSync() {
        }
    }
    
+    lResetTaskInfo();
+    nextTaskToRun = 0;
+
    // We acquire ownership of the condition variable mutex when the above
    // pthread_cond_wait returns.
    // FIXME: is there a lurking issue here if numUnfinishedTasks gets back
@@ -293,3 +305,35 @@ void ISPCSync() {
        exit(1);
    }
 }
+
+
+// Aligned allocation, one implementation per platform; release with ISPCFree().
+void *ISPCMalloc(int64_t size, int32_t alignment) {
+#ifdef ISPC_IS_WINDOWS
+    return _aligned_malloc(size, alignment);
+#endif
+#ifdef ISPC_IS_LINUX
+    return memalign(alignment, size);
+#endif
+#ifdef ISPC_IS_APPLE
+    void *mem = malloc(size + (alignment-1) + sizeof(void*));
+    uint64_t a = reinterpret_cast<uint64_t>(mem) + sizeof(void*);
+    a = (a + (alignment - 1)) & ~uint64_t(alignment - 1);  // power-of-two alignment; pad 0..alignment-1
+    ((void**)a)[-1] = mem;  // malloc() base pointer, read back by ISPCFree()
+    return (void*)a;
+#endif
+}
+
+
+void ISPCFree(void *ptr) {   // releases memory obtained from ISPCMalloc()
+#ifdef ISPC_IS_WINDOWS
+    _aligned_free(ptr);   // pairs with _aligned_malloc() in ISPCMalloc()
+#endif
+#ifdef ISPC_IS_LINUX
+    free(ptr);   // pairs with memalign() in ISPCMalloc()
+#endif
+#ifdef ISPC_IS_APPLE
+    free(((void**)ptr)[-1]);   // the malloc() base pointer is stored just below ptr
+#endif
+}
+
--- a/examples/timing.h
+++ b/examples/timing.h
@@ -38,7 +38,9 @@
 #include <windows.h>
 #define rdtsc __rdtsc
 #else
+#ifdef __cplusplus
 extern "C" {
+#endif /* __cplusplus */
    __inline__ uint64_t rdtsc() {
        uint32_t low, high;
        __asm__ __volatile__ (
@@ -48,7 +50,9 @@ extern "C" {
                              "rdtsc" : "=a" (low), "=d" (high));
        return (uint64_t)high << 32 | low;
    }
+#ifdef __cplusplus
 }
+#endif /* __cplusplus */
 #endif            
            
 static uint64_t start, end;
--- a/examples/volume_rendering/.gitignore
+++ b/examples/volume_rendering/.gitignore
@@ -0,0 +1,2 @@
+volume
+*.ppm
--- a/examples/volume_rendering/Makefile
+++ b/examples/volume_rendering/Makefile
@@ -0,0 +1,41 @@
+
+ARCH := $(shell uname)
+# Task system: pthreads by default, Grand Central Dispatch when building on OS X.
+TASK_CXX=../tasks_pthreads.cpp
+TASK_LIB=-lpthread
+
+ifeq ($(ARCH), Darwin)
+  TASK_CXX=../tasks_gcd.cpp
+  TASK_LIB=
+endif
+
+TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
+
+CXX=g++
+CXXFLAGS=-Iobjs/ -O3 -Wall -m64
+ISPC=ispc
+ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
+
+default: volume
+
+.PHONY: default dirs clean
+# objs/ is an order-only prerequisite of everything built into it; 'dirs' stays as an alias.
+dirs objs:
+	/bin/mkdir -p objs/
+
+clean:
+	/bin/rm -rf objs *~ volume
+
+volume: objs/volume.o objs/volume_serial.o objs/volume_ispc.o $(TASK_OBJ)
+	$(CXX) $(CXXFLAGS) -o $@ objs/volume.o objs/volume_ispc.o objs/volume_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
+
+objs/%.o: %.cpp | objs
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+objs/%.o: ../%.cpp | objs
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+objs/volume.o: objs/volume_ispc.h
+# A single ispc invocation emits both the object file and the generated header.
+objs/%_ispc.h objs/%_ispc.o: %.ispc | objs
+	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/volume_rendering/camera.dat
+++ b/examples/volume_rendering/camera.dat
@@ -0,0 +1,11 @@
+896 1184
+
+0.000155 0.000000 0.000000 -0.069927
+0.000000 -0.000155 0.000000 0.093236
+0.000000 0.000000 0.000000 1.000000
+0.000000 0.000000 -99.999001 100.000000
+
+1.000000 0.000000 0.000000 1.000000
+0.000000 0.980129 -0.198360 2.900000
+0.000000 0.198360 0.980129 -10.500000
+0.000000 0.000000 0.000000 1.000000
--- a/examples/volume_rendering/density_highres.vol
+++ b/examples/volume_rendering/density_highres.vol
--- a/examples/volume_rendering/density_lowres.vol
+++ b/examples/volume_rendering/density_lowres.vol
--- a/examples/volume_rendering/volume.cpp
+++ b/examples/volume_rendering/volume.cpp
@@ -0,0 +1,248 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define NOMINMAX
+#pragma warning (disable: 4244)
+#pragma warning (disable: 4305)
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <algorithm>
+#include "../timing.h"
+#include "../cpuid.h"
+#include "volume_ispc.h"
+using namespace ispc;
+
+// Serial C++ reference renderer.  main() below calls it with the same
+// arguments as the ispc entry points so that the timings can be compared.
+// Only declared here; the definition lives in another translation unit
+// (presumably volume_serial.cpp, which the Makefile links in -- confirm).
+extern void volume_serial(float density[], int nVoxels[3], 
+                          const float raster2camera[4][4],
+                          const float camera2world[4][4], 
+                          int width, int height, float image[]);
+
+/* Write the image in buf[] (width*height floats, one value per pixel) to
+   the file fn as a binary PPM ("P6").  Each value is scaled by 255,
+   clamped to [0,255] and written to all three color channels, so the
+   result is a grayscale image.  If the file can't be opened, an error is
+   printed and the program exits. */
+static void
+writePPM(float *buf, int width, int height, const char *fn) {
+    FILE *fp = fopen(fn, "wb");
+    if (!fp) {
+        // Same failure handling as loadCamera() and loadVolume(): report
+        // why the open failed rather than handing a NULL FILE* to fprintf().
+        perror(fn);
+        exit(1);
+    }
+    fprintf(fp, "P6\n");
+    fprintf(fp, "%d %d\n", width, height);
+    fprintf(fp, "255\n");
+    for (int i = 0; i < width*height; ++i) {
+        float v = buf[i] * 255.f;
+        if (v < 0.f) v = 0.f;
+        else if (v > 255.f) v = 255.f;
+        unsigned char c = (unsigned char)v;
+        // Same value for red, green and blue
+        for (int j = 0; j < 3; ++j)
+            fputc(c, fp);
+    }
+    fclose(fp);
+    printf("Wrote image file %s\n", fn);
+}
+
+
+// Check that the CPU we're running on can execute the vector instruction
+// set that the ispc code was compiled for; if it can't, print an error
+// message and exit.  Which ISPC_TARGET_* macro is defined is determined by
+// the ispc-generated header file that we include above.
+static void
+ensureTargetISAIsSupported() {
+#if defined(ISPC_TARGET_SSE2)
+    const char *isaName = "SSE2";
+    const bool cpuHasISA = CPUSupportsSSE2();
+#elif defined(ISPC_TARGET_SSE4)
+    const char *isaName = "SSE4";
+    const bool cpuHasISA = CPUSupportsSSE4();
+#elif defined(ISPC_TARGET_AVX)
+    const char *isaName = "AVX";
+    const bool cpuHasISA = CPUSupportsAVX();
+#else
+#error "Unknown ISPC_TARGET_* value"
+#endif
+    if (cpuHasISA)
+        return;
+
+    fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
+            "set, which isn't\n***        supported by this computer's CPU!\n", isaName);
+    // The advice names whichever build file applies to this compiler.
+    fprintf(stderr, "***\n***        Please modify the "
+#ifdef _MSC_VER
+            "MSVC project file "
+#else
+            "Makefile "
+#endif
+            "to select another target (e.g. sse2)\n***\n");
+    exit(1);
+}
+
+/* Read the image resolution and viewing parameters from the text file fn:
+   two integers (stored to *width and *height), then the 16 entries of
+   raster2camera followed by the 16 entries of camera2world, each matrix
+   in row-major order.  A failure to open or parse the file prints a
+   message and exits.
+   FIXME: we should add support to be able to specify viewing parameters
+   in the program here directly. */
+static void
+loadCamera(const char *fn, int *width, int *height, float raster2camera[4][4],
+           float camera2world[4][4]) {
+    FILE *in = fopen(fn, "r");
+    if (in == NULL) {
+        perror(fn);
+        exit(1);
+    }
+
+    if (fscanf(in, "%d %d", width, height) != 2) {
+        fprintf(stderr, "Unexpected end of file in camera file\n");
+        exit(1);
+    }
+
+    // The two matrices are stored back to back in the file, so a single
+    // loop reads both of them, raster2camera first.
+    float (*matrices[2])[4] = { raster2camera, camera2world };
+    for (int m = 0; m < 2; ++m) {
+        for (int k = 0; k < 16; ++k) {
+            float *entry = &matrices[m][k / 4][k % 4];
+            if (fscanf(in, "%f", entry) != 1) {
+                fprintf(stderr, "Unexpected end of file in camera file\n");
+                exit(1);
+            }
+        }
+    }
+
+    fclose(in);
+}
+
+
+/* Load a volume density file.  Expects the number of x, y, and z samples
+   as the first three values (as integer strings), then x*y*z
+   floating-point values (also as strings) to give the densities.
+
+   The resolution is stored to n[0..2].  Returns an array of
+   n[0]*n[1]*n[2] floats allocated with new[]; the caller owns it and
+   releases it with delete[].  Any failure to open or parse the file
+   prints a message and exits. */
+static float *
+loadVolume(const char *fn, int n[3]) {
+    FILE *f = fopen(fn, "r");
+    if (!f) {
+        perror(fn);
+        exit(1);
+    }
+
+    if (fscanf(f, "%d %d %d", &n[0], &n[1], &n[2]) != 3) {
+        fprintf(stderr, "Couldn't find resolution at start of density file\n");
+        exit(1);
+    }
+    // A zero or negative dimension would give a bogus allocation size below.
+    if (n[0] <= 0 || n[1] <= 0 || n[2] <= 0) {
+        fprintf(stderr, "Invalid resolution %d x %d x %d in density file\n",
+                n[0], n[1], n[2]);
+        exit(1);
+    }
+
+    int count = n[0] * n[1] * n[2];
+    float *v = new float[count];
+    for (int i = 0; i < count; ++i) {
+        if (fscanf(f, "%f", &v[i]) != 1) {
+            fprintf(stderr, "Unexpected end of file at %d'th density value\n", i);
+            exit(1);
+        }
+    }
+
+    // Everything has been read; don't leak the file handle.
+    fclose(f);
+    return v;
+}
+
+
+/* Program entry point.  Usage: volume <camera.dat> <volume_density.vol>
+
+   Loads the camera and the density volume, then renders the image three
+   times with each of: the single-core ispc implementation, the ispc
+   implementation that uses tasks, and the serial C++ implementation.  For
+   each one the minimum time of the three runs is printed and the image
+   is written to a PPM file; finally the speedups of the two ispc variants
+   over the serial code are printed.
+
+   Returns 0 on success, 1 if the arguments are wrong or the camera file
+   gives a non-positive image resolution. */
+int main(int argc, char *argv[]) {
+    if (argc != 3) {
+        fprintf(stderr, "usage: volume <camera.dat> <volume_density.vol>\n");
+        return 1;
+    }
+
+    ensureTargetISAIsSupported();
+
+    //
+    // Load viewing data and the volume density data
+    //
+    int width, height;
+    float raster2camera[4][4], camera2world[4][4];
+    loadCamera(argv[1], &width, &height, raster2camera, camera2world);
+    // The resolution comes straight from the file; don't size an
+    // allocation with a zero or negative value.
+    if (width <= 0 || height <= 0) {
+        fprintf(stderr, "Invalid image resolution %d x %d in camera file\n",
+                width, height);
+        return 1;
+    }
+    float *image = new float[width*height];
+
+    int n[3];
+    float *density = loadVolume(argv[2], n);
+
+    //
+    // Compute the image using the ispc implementation; report the minimum
+    // time of three runs.
+    //
+    double minISPC = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        volume_ispc(density, n, raster2camera, camera2world,
+                    width, height, image);
+        double dt = get_elapsed_mcycles();
+        minISPC = std::min(minISPC, dt);
+    }
+
+    printf("[volume ispc 1 core]:\t\t[%.3f] million cycles\n", minISPC);
+    writePPM(image, width, height, "volume-ispc-1core.ppm");
+
+    // Clear out the buffer
+    for (int i = 0; i < width * height; ++i)
+        image[i] = 0.;
+
+    //
+    // Compute the image using the ispc implementation that also uses
+    // tasks; report the minimum time of three runs.
+    //
+    double minISPCtasks = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        volume_ispc_tasks(density, n, raster2camera, camera2world,
+                          width, height, image);
+        double dt = get_elapsed_mcycles();
+        minISPCtasks = std::min(minISPCtasks, dt);
+    }
+
+    printf("[volume ispc + tasks]:\t\t[%.3f] million cycles\n", minISPCtasks);
+    writePPM(image, width, height, "volume-ispc-tasks.ppm");
+
+    // Clear out the buffer
+    for (int i = 0; i < width * height; ++i)
+        image[i] = 0.;
+
+    //
+    // And run the serial implementation 3 times, again reporting the
+    // minimum time.
+    //
+    double minSerial = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        volume_serial(density, n, raster2camera, camera2world,
+                      width, height, image);
+        double dt = get_elapsed_mcycles();
+        minSerial = std::min(minSerial, dt);
+    }
+
+    printf("[volume serial]:\t\t[%.3f] million cycles\n", minSerial);
+    writePPM(image, width, height, "volume-serial.ppm");
+
+    printf("\t\t\t\t(%.2fx speedup from ISPC serial, %.2fx from ISPC+tasks)\n",
+           minSerial/minISPC, minSerial / minISPCtasks);
+
+    // Release the buffers allocated above with new[]
+    delete[] density;
+    delete[] image;
+
+    return 0;
+}
--- a/examples/volume_rendering/volume.ispc
+++ b/examples/volume_rendering/volume.ispc
@@ -0,0 +1,378 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+// Three-wide float short vector; used below for points and directions.
+typedef float<3> float3;
+
+// A ray; the points along it are origin + t * dir.
+struct Ray {
+    float3 origin;
+    float3 dir;
+};
+
+
+/* Compute the camera ray for the raster position (x, y) and store it in
+   'ray'.  raster2camera takes the raster-space point (x, y, 0) to camera
+   space; camera2world then takes the resulting direction to world space.
+   The ray origin is the last column of camera2world divided by its [3][3]
+   entry. */
+static void
+generateRay(const uniform float raster2camera[4][4], 
+            const uniform float camera2world[4][4],
+            float x, float y, reference Ray ray) {
+    // Raster -> camera space, including the divide by the w component
+    float w = raster2camera[3][3];
+    float cx = (raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3]) / w;
+    float cy = (raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3]) / w;
+    float cz = raster2camera[2][3];
+    cz = cz / w;
+
+    // Camera -> world space for the direction (upper-left 3x3 only)
+    ray.dir.x = camera2world[0][0] * cx + camera2world[0][1] * cy + camera2world[0][2] * cz;
+    ray.dir.y = camera2world[1][0] * cx + camera2world[1][1] * cy + camera2world[1][2] * cz;
+    ray.dir.z = camera2world[2][0] * cx + camera2world[2][1] * cy + camera2world[2][2] * cz;
+
+    // The origin is the same for all rays: the translation column of
+    // camera2world
+    uniform float worldW = camera2world[3][3];
+    ray.origin.x = camera2world[0][3] / worldW;
+    ray.origin.y = camera2world[1][3] / worldW;
+    ray.origin.z = camera2world[2][3] / worldW;
+}
+
+
+// Returns true if the point p is inside the axis-aligned box
+// [pMin, pMax]; points exactly on the boundary count as inside.
+static inline bool
+Inside(float3 p, float3 pMin, float3 pMax) {
+    bool inX = (pMin.x <= p.x) && (p.x <= pMax.x);
+    bool inY = (pMin.y <= p.y) && (p.y <= pMax.y);
+    bool inZ = (pMin.z <= p.z) && (p.z <= pMax.z);
+    return inX && inY && inZ;
+}
+
+
+/* Intersect 'ray' with the axis-aligned box [pMin, pMax], one pair of
+   bounding planes per axis.  If the ray's line overlaps the box, the
+   parametric range of the overlap is stored in hit0 and hit1 and true is
+   returned; otherwise false is returned and hit0/hit1 are left alone.
+   The range isn't clipped against t = 0, so hit0 may be negative. */
+static bool
+IntersectP(Ray ray, float3 pMin, float3 pMax, reference float hit0, reference float hit1) {
+    // Parametric distances to the two bounding planes along each axis
+    float3 tNear = (pMin - ray.origin) / ray.dir;
+    float3 tFar  = (pMax - ray.origin) / ray.dir;
+
+    // Running intersection of the per-axis [near, far] intervals
+    float tEnter = -1e30, tExit = 1e30;
+
+    // x: order the pair so that near <= far, then narrow the interval
+    if (tNear.x > tFar.x) {
+        float held = tFar.x;
+        tFar.x = tNear.x;
+        tNear.x = held;
+    }
+    tEnter = max(tNear.x, tEnter);
+    tExit = min(tFar.x, tExit);
+
+    // y
+    if (tNear.y > tFar.y) {
+        float held = tFar.y;
+        tFar.y = tNear.y;
+        tNear.y = held;
+    }
+    tEnter = max(tNear.y, tEnter);
+    tExit = min(tFar.y, tExit);
+
+    // z
+    if (tNear.z > tFar.z) {
+        float held = tFar.z;
+        tFar.z = tNear.z;
+        tNear.z = held;
+    }
+    tEnter = max(tNear.z, tEnter);
+    tExit = min(tFar.z, tExit);
+
+    // An empty interval means that the ray misses the box
+    if (!(tEnter <= tExit))
+        return false;
+
+    hit0 = tEnter;
+    hit1 = tExit;
+    return true;
+}
+
+
+// Linear interpolation between a and b: gives a for t = 0 and b for t = 1.
+static inline float Lerp(float t, float a, float b) {
+    float weightA = 1.f - t;
+    return weightA * a + t * b;
+}
+
+
+/* Look up the density at voxel (x, y, z), where each program instance may
+   ask for a different voxel.  Each coordinate is first clamped to the
+   valid range for its axis, so out-of-range lookups return the nearest
+   voxel on the border.  density[] is laid out with x varying fastest,
+   then y, then z. */
+static inline float D(int x, int y, int z, uniform int nVoxels[3], 
+                      uniform float density[]) {
+    int cx = clamp(x, 0, nVoxels[0]-1);
+    int cy = clamp(y, 0, nVoxels[1]-1);
+    int cz = clamp(z, 0, nVoxels[2]-1);
+
+    int voxel = cz*nVoxels[0]*nVoxels[1] + cy*nVoxels[0] + cx;
+    return density[voxel];
+}
+
+
+/* 'uniform' variant of D(): looks up the density at the single voxel
+   (x, y, z) that is shared by all of the program instances.  Coordinates
+   are clamped to the valid range and density[] is indexed in the same way
+   as in D(). */
+static inline float Du(uniform int x, uniform int y, uniform int z, 
+                       uniform int nVoxels[3], uniform float density[]) {
+    uniform int cx = clamp(x, 0, nVoxels[0]-1);
+    uniform int cy = clamp(y, 0, nVoxels[1]-1);
+    uniform int cz = clamp(z, 0, nVoxels[2]-1);
+
+    uniform int voxel = cz*nVoxels[0]*nVoxels[1] + cy*nVoxels[0] + cx;
+    return density[voxel];
+}
+
+
+// Position of p relative to the box [pMin, pMax], per component: pMin
+// maps to 0 and pMax maps to 1.
+static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
+    float3 fromMin = p - pMin;
+    float3 extent = pMax - pMin;
+    return fromMin / extent;
+}
+
+
+/* Returns the density at the point Pobj, trilinearly interpolated from
+   the eight surrounding voxel values; points outside of [pMin, pMax] get
+   a density of 0.
+
+   checkForSameVoxel is an in/out flag owned by the caller's marching
+   loop.  While it is true, this function tests whether all of the running
+   program instances fall into the same voxel and, if so, uses the cheaper
+   uniform lookups in Du().  The first time that isn't the case, the flag
+   is set to false and later calls go straight to the general path. */
+static inline float Density(float3 Pobj, float3 pMin, float3 pMax, 
+                            uniform float density[], uniform int nVoxels[3],
+                            reference uniform bool checkForSameVoxel) {
+    if (!Inside(Pobj, pMin, pMax)) 
+        return 0;
+
+    // Continuous voxel coordinates of Pobj; the -.5 shift puts the sample
+    // positions at the voxel centers
+    float3 vox = Offset(Pobj, pMin, pMax);
+    vox.x = vox.x * nVoxels[0] - .5f;
+    vox.y = vox.y * nVoxels[1] - .5f;
+    vox.z = vox.z * nVoxels[2] - .5f;
+
+    // Integer voxel coordinates and the offsets within the voxel
+    int vx = (int)(vox.x);
+    int vy = (int)(vox.y);
+    int vz = (int)(vox.z);
+    float dx = vox.x - vx;
+    float dy = vox.y - vy;
+    float dz = vox.z - vz;
+
+    // reduce_equal() stores the common value in its second argument when
+    // all of the running program instances agree on it
+    uniform int uvx, uvy, uvz;
+    uniform bool sameVoxel = checkForSameVoxel && reduce_equal(vx, uvx) &&
+        reduce_equal(vy, uvy) && reduce_equal(vz, uvz);
+
+    // Interpolate along x on each of the four voxel edges that run in x
+    float d00, d10, d01, d11;
+    if (sameVoxel) {
+        // Everyone is in the same voxel: a single load per corner value
+        // rather than a gather.
+        d00 = Lerp(dx, Du(uvx, uvy, uvz, nVoxels, density),
+                       Du(uvx+1, uvy, uvz, nVoxels, density));
+        d10 = Lerp(dx, Du(uvx, uvy+1, uvz, nVoxels, density),
+                       Du(uvx+1, uvy+1, uvz, nVoxels, density));
+        d01 = Lerp(dx, Du(uvx, uvy, uvz+1, nVoxels, density),
+                       Du(uvx+1, uvy, uvz+1, nVoxels, density));
+        d11 = Lerp(dx, Du(uvx, uvy+1, uvz+1, nVoxels, density),
+                       Du(uvx+1, uvy+1, uvz+1, nVoxels, density));
+    }
+    else {
+        // General case: gathers via D().  Also stop testing for a shared
+        // voxel from now on; once the program instances have diverged
+        // into different voxels they're unlikely to meet up again.
+        checkForSameVoxel = false;
+        d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),
+                       D(vx+1, vy, vz, nVoxels, density));
+        d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),
+                       D(vx+1, vy+1, vz, nVoxels, density));
+        d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),
+                       D(vx+1, vy, vz+1, nVoxels, density));
+        d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density),
+                       D(vx+1, vy+1, vz+1, nVoxels, density));
+    }
+
+    // Then along y, and finally along z
+    return Lerp(dz, Lerp(dy, d00, d10), Lerp(dy, d01, d11));
+}
+
+
+/* Returns the transmittance between two points p0 and p1, in a volume
+   with extent (pMin,pMax) with transmittance coefficient sigma_t,
+   defined by nVoxels[3] voxels in each dimension in the given density
+   array.
+
+   The ray is marched from p1 toward p0 in steps of a fixed length of 0.2,
+   starting where it enters the volume (or at p1, if that comes later),
+   and the result is exp(-tau) for the accumulated tau.  If the ray misses
+   the volume's bounding box, 1 is returned. */
+static float
+transmittance(uniform float3 p0, float3 p1, uniform float3 pMin,
+              uniform float3 pMax, uniform float sigma_t, 
+              uniform float density[], uniform int nVoxels[3]) {
+    Ray ray;
+    ray.origin = p1;
+    ray.dir = p0 - p1;
+
+    // Find the parametric t range along the ray that is inside the volume.
+    float tStart, tEnd;
+    if (!IntersectP(ray, pMin, pMax, tStart, tEnd))
+        return 1.;
+    tStart = max(tStart, 0.f);
+
+    // ray.dir isn't normalized, so convert the step length into a step
+    // in the ray's parametric t
+    float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
+                           ray.dir.z * ray.dir.z);
+    uniform float stepDist = 0.2;
+    float stepT = stepDist / rayLength;
+    float3 dirStep = ray.dir * stepT;
+
+    // Accumulate stepDist * sigma_t * density at each step in tau
+    float tau = 0;
+    float3 pos = ray.origin + ray.dir * tStart;
+    uniform bool checkForSameVoxel = true;
+    for (float t = tStart; t < tEnd; t += stepT) {
+        tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels,
+                                            checkForSameVoxel);
+        pos = pos + dirStep;
+    }
+
+    return exp(-tau);
+}
+
+
+// Squared Euclidean distance between the points a and b.
+static inline float
+distanceSquared(float3 a, float3 b) {
+    float3 delta = a-b;
+    return delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+}
+
+
+/* March 'ray' through the density volume and return the gamma-corrected
+   radiance along it, or 0 if the ray misses the volume's bounding box.
+
+   At each step the local density is looked up, the light arriving from a
+   single point light is computed (attenuated through the volume by
+   transmittance()), and its contribution plus a constant emission term is
+   added, weighted by the attenuation accumulated along the ray so far.
+   Marching stops early once that attenuation drops below .005.  The
+   volume bounds, the light and the scattering parameters are all fixed
+   here in the function. */
+static float 
+raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
+    // Bounding box of the volume and the position of the light
+    uniform float3 pMin = {.3, -.2, .3}, pMax = {1.8, 2.3, 1.8};
+    uniform float3 lightPos = { -1, 4, 1.5 };
+
+    float tStart, tEnd;
+    cif (!IntersectP(ray, pMin, pMax, tStart, tEnd))
+        return 0.;
+    tStart = max(tStart, 0.f);
+
+    // Parameters that define the volume scattering characteristics and
+    // sampling rate for raymarching
+    uniform float Le = .25;            // Emission coefficient
+    uniform float sigma_a = 10;        // Absorption coefficient
+    uniform float sigma_s = 10;        // Scattering coefficient
+    uniform float stepDist = 0.025;    // Ray step amount
+    uniform float lightIntensity = 40; // Light source intensity
+    uniform float sigma_t = sigma_a + sigma_s;  // absorption + scattering
+
+    // ray.dir isn't normalized, so convert the step length into a step
+    // in the ray's parametric t
+    float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
+                           ray.dir.z * ray.dir.z);
+    float stepT = stepDist / rayLength;
+    float3 dirStep = ray.dir * stepT;
+
+    float tau = 0.f;  // accumulated along the ray; attenuation is exp(-tau)
+    float L = 0;      // radiance along the ray
+    float t = tStart;
+    float3 pos = ray.origin + ray.dir * tStart;
+    uniform bool checkForSameVoxel = true;
+    cwhile (t < tEnd) {
+        float d = Density(pos, pMin, pMax, density, nVoxels, checkForSameVoxel);
+
+        // terminate once attenuation is high
+        float atten = exp(-tau);
+        if (atten < .005)
+            cbreak;
+
+        // direct lighting: inverse-square falloff from the light, times
+        // the transmittance between the light and this point
+        float falloff = lightIntensity / distanceSquared(lightPos, pos);
+        float Li = falloff * transmittance(lightPos, pos, pMin, pMax, sigma_t,
+                                           density, nVoxels);
+        L += stepDist * atten * d * sigma_s * (Li + Le);
+
+        // update tau for the step that was just taken
+        tau += stepDist * sigma_t * d;
+
+        pos = pos + dirStep;
+        t += stepT;
+    }
+
+    // Gamma correction
+    return pow(L, 1.f / 2.2f);
+}
+
+
+/* Utility routine used by both the task-based and the single-core entrypoints.
+   Renders a tile of the image, covering [x0,x1) * [y0,y1), storing the
+   result into the image[] array.
+
+   The tile is clipped against the image bounds (width x height), and
+   pixels that fall outside of the clipped tile are skipped, so neither
+   the tile extent nor the image resolution has to be a multiple of the
+   4x4 block size used below.  x0 and y0 are expected to be >= 0.
+ */
+static void
+volume_tile(uniform int x0, uniform int y0, uniform int x1,
+            uniform int y1, uniform float density[], uniform int nVoxels[3], 
+            const uniform float raster2camera[4][4],
+            const uniform float camera2world[4][4], 
+            uniform int width, uniform int height, uniform float image[]) {
+    // Never render (or write) past the right or bottom edge of the image,
+    // even if the caller's tile sticks out over it.
+    uniform int xEnd = (x1 < width) ? x1 : width;
+    uniform int yEnd = (y1 < height) ? y1 : height;
+
+    // Work on 4x4=16 pixel big blocks of the tile.
+    for (uniform int y = y0; y < yEnd; y += 4) {
+        for (uniform int x = x0; x < xEnd; x += 4) {
+            // For each such block, process programCount pixels at a time,
+            // until we've done all 16 of them.  Thus, we're assuming
+            // that programCount <= 16 and that 16 is evenly divisible by
+            // programCount.
+            for (uniform int o = 0; o < 16; o += programCount) {
+                // These two arrays encode the mapping from [0,15] to
+                // offsets within the 4x4 pixel block so that we render
+                // each pixel inside the block
+                const uniform int xoffsets[16] = { 0, 1, 0, 1, 2, 3, 2, 3,
+                                                   0, 1, 0, 1, 2, 3, 2, 3 };
+                const uniform int yoffsets[16] = { 0, 0, 1, 1, 0, 0, 1, 1,
+                                                   2, 2, 3, 3, 2, 2, 3, 3 };
+
+                // Figure out the pixel to render for this program instance
+                int xo = x + xoffsets[o + programIndex];
+                int yo = y + yoffsets[o + programIndex];
+
+                // A block on the right or bottom border may be only
+                // partially inside of the clipped tile; skip the pixels
+                // that aren't.  In the common case all of them are
+                // inside, hence the coherent 'cif'.
+                cif (xo < xEnd && yo < yEnd) {
+                    // Use viewing parameters to compute the corresponding
+                    // ray for the pixel
+                    Ray ray;
+                    generateRay(raster2camera, camera2world, xo, yo, ray);
+
+                    // And raymarch through the volume to compute the
+                    // pixel's value
+                    int offset = yo * width + xo;
+                    image[offset] = raymarch(density, nVoxels, ray);
+                }
+            }
+        }
+    }
+}
+
+
+// Task entry point: renders the tile [x0,x1) * [y0,y1) by forwarding all
+// of its arguments, unchanged, to volume_tile().
+task void
+volume_task(uniform int x0, uniform int y0, uniform int x1,
+            uniform int y1, uniform float density[], uniform int nVoxels[3], 
+            const uniform float raster2camera[4][4],
+            const uniform float camera2world[4][4], 
+            uniform int width, uniform int height, uniform float image[]) {
+    volume_tile(x0, y0, x1, y1,
+                density, nVoxels,
+                raster2camera, camera2world,
+                width, height, image);
+}
+
+
+// Exported single-core entry point: renders the whole image as one tile
+// that covers [0,width) * [0,height).
+export void
+volume_ispc(uniform float density[], uniform int nVoxels[3], 
+            const uniform float raster2camera[4][4],
+            const uniform float camera2world[4][4], 
+            uniform int width, uniform int height, uniform float image[]) {
+    volume_tile(0, 0, width, height,
+                density, nVoxels,
+                raster2camera, camera2world,
+                width, height, image);
+}
+
+
+// Exported multi-core entry point: splits the image into 8x8 pixel tiles
+// and launches one volume_task per tile.
+// NOTE(review): there is no explicit wait for the launched tasks in this
+// function; presumably they are waited for when it returns -- confirm
+// against the ispc documentation for 'launch'.
+export void
+volume_ispc_tasks(uniform float density[], uniform int nVoxels[3], 
+                  const uniform float raster2camera[4][4],
+                  const uniform float camera2world[4][4], 
+                  uniform int width, uniform int height, uniform float image[]) {
+    uniform int tileW = 8, tileH = 8;
+    for (uniform int ty = 0; ty < height; ty += tileH) {
+        for (uniform int tx = 0; tx < width; tx += tileW) {
+            launch < volume_task(tx, ty, tx+tileW, ty+tileH, density, nVoxels, 
+                                 raster2camera, camera2world, width, height, 
+                                 image) >;
+        }
+    }
+}
--- a/examples/volume_rendering/volume.vcxproj
+++ b/examples/volume_rendering/volume.vcxproj
@@ -0,0 +1,168 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{dee5733a-e93e-449d-9114-9bffcaeb4df9}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>volume</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <!-- C++ sources compiled into the application: the driver (volume.cpp),
+       the serial renderer (volume_serial.cpp) and the task system
+       implementation from the parent directory (tasks_concrt.cpp). -->
+  <ItemGroup>
+    <ClCompile Include="volume.cpp" />
+    <ClCompile Include="volume_serial.cpp" />
+    <ClCompile Include="../tasks_concrt.cpp" />
+  </ItemGroup>
+  <!-- Custom build step that runs ispc on volume.ispc.  In every
+       configuration it is declared to produce %(Filename).obj and
+       %(Filename)_ispc.h.  The Win32 command lines pass an explicit x86
+       arch option; the x64 ones pass no arch option (presumably relying
+       on ispc's default; confirm against the ispc documentation). -->
+  <ItemGroup>
+    <CustomBuild Include="volume.ispc">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/examples/volume_rendering/volume_serial.cpp
+++ b/examples/volume_rendering/volume_serial.cpp
@@ -0,0 +1,305 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#include <assert.h>
+#include <math.h>
+#include <algorithm>
+
+// A minimal 3-component float vector: just what this file needs.
+#ifdef _MSC_VER
+__declspec(align(16))
+#endif
+struct float3 {
+    float3() { }
+    float3(float xx, float yy, float zz) : x(xx), y(yy), z(zz) { }
+    // Component-wise arithmetic; operator*(float) scales all components.
+    float3 operator+(const float3 &o) const {
+        return float3(x + o.x, y + o.y, z + o.z);
+    }
+    float3 operator-(const float3 &o) const {
+        return float3(x - o.x, y - o.y, z - o.z);
+    }
+    float3 operator*(const float3 &o) const {
+        return float3(x * o.x, y * o.y, z * o.z);
+    }
+    float3 operator/(const float3 &o) const {
+        return float3(x / o.x, y / o.y, z / o.z);
+    }
+    float3 operator*(float s) const { return float3(x * s, y * s, z * s); }
+    float operator[](int i) const { return *(&x + i); }
+    float &operator[](int i) { return *(&x + i); }
+
+    float x, y, z;
+    float pad;  // match padding/alignment of ispc version
+}
+#ifndef _MSC_VER
+__attribute__ ((aligned(16)))
+#endif
+;
+
+struct Ray {
+    float3 origin, dir;  // start point and (unnormalized) direction
+};
+
+
+static void
+generateRay(const float raster2camera[4][4], const float camera2world[4][4],
+            float x, float y, Ray &ray) {
+    // Transform the raster-space point (x, y, 0) to camera space.
+    float cam[3] = {
+        raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3],
+        raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3],
+        raster2camera[2][3]
+    };
+    const float camw = raster2camera[3][3];
+    for (int i = 0; i < 3; ++i)
+        cam[i] /= camw;
+    // World-space direction and origin from rows 0..2 of camera2world.
+    ray.dir.x = camera2world[0][0] * cam[0] + camera2world[0][1] * cam[1] + camera2world[0][2] * cam[2];
+    ray.dir.y = camera2world[1][0] * cam[0] + camera2world[1][1] * cam[1] + camera2world[1][2] * cam[2];
+    ray.dir.z = camera2world[2][0] * cam[0] + camera2world[2][1] * cam[1] + camera2world[2][2] * cam[2];
+    ray.origin.x = camera2world[0][3] / camera2world[3][3];
+    ray.origin.y = camera2world[1][3] / camera2world[3][3];
+    ray.origin.z = camera2world[2][3] / camera2world[3][3];
+}
+
+
+static bool Inside(float3 p, float3 pMin, float3 pMax) {
+    // p must lie in the closed interval [pMin, pMax] on each of x, y, z.
+    if (!(pMin.x <= p.x && p.x <= pMax.x)) return false;
+    if (!(pMin.y <= p.y && p.y <= pMax.y)) return false;
+    return (pMin.z <= p.z && p.z <= pMax.z);
+}
+
+
+static bool
+IntersectP(const Ray &ray, float3 pMin, float3 pMax, float *hit0, float *hit1) {
+    // Intersects the ray with the axis-aligned box [pMin, pMax].  For each
+    // axis, the parametric distances to the box's two bounding planes are
+    // ordered and used to narrow the interval [t0, t1].  If the interval is
+    // non-empty, it is stored in *hit0 / *hit1 and true is returned;
+    // otherwise the outputs are left unmodified and false is returned.
+    float t0 = -1e30f, t1 = 1e30f;
+    float3 tNear = (pMin - ray.origin) / ray.dir;
+    float3 tFar  = (pMax - ray.origin) / ray.dir;
+
+    // x planes
+    if (tNear.x > tFar.x)
+        std::swap(tNear.x, tFar.x);
+    t0 = std::max(tNear.x, t0);
+    t1 = std::min(tFar.x, t1);
+
+    // y planes
+    if (tNear.y > tFar.y)
+        std::swap(tNear.y, tFar.y);
+    t0 = std::max(tNear.y, t0);
+    t1 = std::min(tFar.y, t1);
+
+    // z planes
+    if (tNear.z > tFar.z)
+        std::swap(tNear.z, tFar.z);
+    t0 = std::max(tNear.z, t0);
+    t1 = std::min(tFar.z, t1);
+
+    // An empty interval (t0 > t1) means the ray misses the box; the test
+    // is written so that a NaN bound is also reported as a miss.
+    if (!(t0 <= t1))
+        return false;
+
+    *hit0 = t0;
+    *hit1 = t1;
+    return true;
+}
+
+
+static inline float Lerp(float t, float a, float b) {
+    return a * (1.f - t) + b * t;  // t = 0 yields a, t = 1 yields b
+}
+
+
+static inline int Clamp(int v, int low, int high) {
+    return std::min((v < low) ? low : v, high);  // high wins if low > high
+}
+
+
+static inline float D(int x, int y, int z, int nVoxels[3], float density[]) {
+    // Fetch the voxel at (x, y, z), clamping each index into the grid.
+    const int cx = Clamp(x, 0, nVoxels[0]-1), cy = Clamp(y, 0, nVoxels[1]-1);
+    const int cz = Clamp(z, 0, nVoxels[2]-1);
+    return density[(cz * nVoxels[1] + cy) * nVoxels[0] + cx];
+}
+
+
+static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
+    // Per-axis position of p relative to the box [pMin, pMax]: 0 at pMin
+    // and 1 at pMax.
+    return (p - pMin) / (pMax - pMin);
+}
+
+
+static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
+                            float density[], int nVoxels[3]) {
+    // Points outside the box [pMin, pMax] have zero density.
+    if (Inside(Pobj, pMin, pMax) == false)
+        return 0;
+    // Voxel-space position of _Pobj_ (shifted by half a voxel).
+    float3 vox = Offset(Pobj, pMin, pMax);
+    vox.x = vox.x * nVoxels[0] - .5f;
+    vox.y = vox.y * nVoxels[1] - .5f;
+    vox.z = vox.z * nVoxels[2] - .5f;
+    // Integer cell (truncated toward zero) and the remainder within it.
+    const int vx = (int)vox.x, vy = (int)vox.y, vz = (int)vox.z;
+    const float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz;
+    // Trilinear interpolation: lerp along x, then y, within the two z
+    // slices vz and vz+1, and finally lerp between the slices along z.
+    float slice[2];
+    for (int k = 0; k < 2; ++k) {
+        const float lo = Lerp(dx, D(vx, vy, vz+k, nVoxels, density),
+                                  D(vx+1, vy, vz+k, nVoxels, density));
+        const float hi = Lerp(dx, D(vx, vy+1, vz+k, nVoxels, density),
+                                  D(vx+1, vy+1, vz+k, nVoxels, density));
+        slice[k] = Lerp(dy, lo, hi);
+    }
+    return Lerp(dz, slice[0], slice[1]);
+}
+
+
+
+static float
+transmittance(float3 p0, float3 p1, float3 pMin,
+              float3 pMax, float sigma_t, float density[], int nVoxels[3]) {
+    // The ray starts at p1 and points toward p0 (t = 1 reaches p0).
+    Ray ray;
+    ray.origin = p1;
+    ray.dir = p0 - p1;
+
+    // Find the parametric t range along the ray that is inside the volume;
+    // a ray that misses the box entirely returns 1.
+    float tEnter, tExit;
+    if (!IntersectP(ray, pMin, pMax, &tEnter, &tExit))
+        return 1.;
+    tEnter = std::max(tEnter, 0.f);
+
+    // Express the fixed step length as a step in the ray parameter t.
+    const float stepDist = 0.2f;
+    float rayLength = sqrtf(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
+                            ray.dir.z * ray.dir.z);
+    const float stepT = stepDist / rayLength;
+    const float3 dirStep = ray.dir * stepT;
+
+    // Accumulate tau over the samples; the result is exp(-tau).
+    float tau = 0;
+    float3 pos = ray.origin + ray.dir * tEnter;
+    for (float t = tEnter; t < tExit; t += stepT) {
+        tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels);
+        pos = pos + dirStep;
+    }
+
+    return expf(-tau);
+}
+
+
+static float distanceSquared(float3 a, float3 b) {
+    // Squared Euclidean distance between a and b (no square root taken).
+    const float3 delta = a - b;
+    return delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+}
+
+
+static float
+raymarch(float density[], int nVoxels[3], const Ray &ray) {
+    // Hard-coded scene: the volume's bounding box and a point light.
+    const float3 pMin(.3f, -.2f, .3f), pMax(1.8f, 2.3f, 1.8f);
+    const float3 lightPos(-1.f, 4.f, 1.5f);
+
+    // A ray that misses the bounding box returns zero.
+    float tEnter, tExit;
+    if (!IntersectP(ray, pMin, pMax, &tEnter, &tExit))
+        return 0.;
+    tEnter = std::max(tEnter, 0.f);
+
+    // Parameters that define the volume scattering characteristics and
+    // sampling rate for raymarching
+    const float Le = .25f;                    // Emission coefficient
+    const float sigma_a = 10;                 // Absorption coefficient
+    const float sigma_s = 10;                 // Scattering coefficient
+    const float sigma_t = sigma_a + sigma_s;  // Their sum
+    const float stepDist = 0.025f;            // Ray step amount
+    const float lightIntensity = 40;          // Light source intensity
+
+    // Express the fixed step length as a step in the ray parameter t.
+    float rayLength = sqrtf(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
+                            ray.dir.z * ray.dir.z);
+    const float stepT = stepDist / rayLength;
+    const float3 dirStep = ray.dir * stepT;
+
+    float tau = 0.f;  // accumulated beam transmittance
+    float L = 0;      // radiance along the ray
+    float3 pos = ray.origin + ray.dir * tEnter;
+    for (float t = tEnter; t < tExit; t += stepT) {
+        const float d = Density(pos, pMin, pMax, density, nVoxels);
+
+        // terminate once attenuation is high
+        const float atten = expf(-tau);
+        if (atten < .005f)
+            break;
+
+        // direct lighting: inverse-square falloff from the light, scaled
+        // by the transmittance between the light and this sample
+        const float falloff = lightIntensity / distanceSquared(lightPos, pos);
+        const float Li = falloff * transmittance(lightPos, pos, pMin, pMax,
+                                                 sigma_t, density, nVoxels);
+        L += stepDist * atten * d * sigma_s * (Li + Le);
+        // update beam transmittance
+        tau += stepDist * sigma_t * d;
+        pos = pos + dirStep;
+    }
+
+    // Gamma correction
+    return powf(L, 1.f / 2.2f);
+}
+
+
+void
+volume_serial(float density[], int nVoxels[3], const float raster2camera[4][4],
+              const float camera2world[4][4],
+              int width, int height, float image[]) {
+    // Shade one ray per pixel, writing the results in row-major order.
+    float *pixel = image;
+    for (int y = 0; y < height; ++y)
+        for (int x = 0; x < width; ++x) {
+            Ray ray;
+            generateRay(raster2camera, camera2world, x, y, ray);
+            *pixel++ = raymarch(density, nVoxels, ray);
+        }
+}
--- a/expr.cpp
+++ b/expr.cpp
@@ -741,6 +741,12 @@ UnaryExpr::TypeCheck() {
 }


+int
+UnaryExpr::EstimateCost() const {
+    return COST_SIMPLE_ARITH_LOGIC_OP + (expr ? expr->EstimateCost() : 0);
+}
+
+
 void
 UnaryExpr::Print() const {
    if (!expr || !GetType())
@@ -799,11 +805,17 @@ lOpString(BinaryExpr::Op op) {
 */
 static llvm::Value *
 lEmitBinaryBitOp(BinaryExpr::Op op, llvm::Value *arg0Val,
-                 llvm::Value *arg1Val, FunctionEmitContext *ctx) {
+                 llvm::Value *arg1Val, bool isUnsigned,
+                 FunctionEmitContext *ctx) {
    llvm::Instruction::BinaryOps inst;
    switch (op) {
    case BinaryExpr::Shl:    inst = llvm::Instruction::Shl;  break;
-    case BinaryExpr::Shr:    inst = llvm::Instruction::AShr; break; 
+    case BinaryExpr::Shr:
+        if (isUnsigned)
+            inst = llvm::Instruction::LShr; 
+        else
+            inst = llvm::Instruction::AShr; 
+        break; 
    case BinaryExpr::BitAnd: inst = llvm::Instruction::And;  break;
    case BinaryExpr::BitXor: inst = llvm::Instruction::Xor;  break;
    case BinaryExpr::BitOr:  inst = llvm::Instruction::Or;   break;
@@ -949,7 +961,8 @@ BinaryExpr::GetValue(FunctionEmitContext *ctx) const {
            dynamic_cast<ConstExpr *>(arg1) == NULL)
            PerformanceWarning(pos, "Shift right is extremely inefficient for "
                               "varying shift amounts.");
-        return lEmitBinaryBitOp(op, e0Val, e1Val, ctx);
+        return lEmitBinaryBitOp(op, e0Val, e1Val, 
+                                arg0->GetType()->IsUnsignedType(), ctx);
    }
    case LogicalAnd:
        return ctx->BinaryOperator(llvm::Instruction::And, e0Val, e1Val,
@@ -1438,6 +1451,15 @@ BinaryExpr::TypeCheck() {
 }


+int
+BinaryExpr::EstimateCost() const {
+    const bool isDivOrMod = (op == Div || op == Mod);  // costed as complex
+    return ((isDivOrMod ? COST_COMPLEX_ARITH_OP : COST_SIMPLE_ARITH_LOGIC_OP) +
+            (arg0 != NULL ? arg0->EstimateCost() : 0) +
+            (arg1 != NULL ? arg1->EstimateCost() : 0));
+}
+
+
 void
 BinaryExpr::Print() const {
    if (!arg0 || !arg1 || !GetType())
@@ -1533,7 +1555,8 @@ lEmitOpAssign(AssignExpr::Op op, Expr *arg0, Expr *arg1, const Type *type,
    case AssignExpr::AndAssign:
    case AssignExpr::XorAssign:
    case AssignExpr::OrAssign:
-        newValue = lEmitBinaryBitOp(basicop, oldLHS, rvalue, ctx);
+        newValue = lEmitBinaryBitOp(basicop, oldLHS, rvalue, 
+                                    arg0->GetType()->IsUnsignedType(), ctx);
        break;
    default:
        FATAL("logic error in lEmitOpAssign");
@@ -1688,6 +1711,20 @@ AssignExpr::TypeCheck() {
 }


+int
+AssignExpr::EstimateCost() const {
+    // Cost of both operands plus COST_ASSIGN; compound assignments also
+    // pay for the arithmetic operation they perform.
+    int total = COST_ASSIGN;
+    if (lvalue != NULL) total += lvalue->EstimateCost();
+    if (rvalue != NULL) total += rvalue->EstimateCost();
+    if (op != Assign)
+        total += (op == DivAssign || op == ModAssign) ?
+            COST_COMPLEX_ARITH_OP : COST_SIMPLE_ARITH_LOGIC_OP;
+    return total;
+}
+
+
 void
 AssignExpr::Print() const {
    if (!lvalue || !rvalue || !GetType())
@@ -1936,6 +1973,12 @@ SelectExpr::TypeCheck() {
 }


+int
+SelectExpr::EstimateCost() const {
+    return COST_SELECT;  // NOTE(review): operand costs not added; intended?
+}
+
+
 void
 SelectExpr::Print() const {
    if (!test || !expr1 || !expr2 || !GetType())
@@ -2100,7 +2143,8 @@ FunctionCallExpr::tryResolve(bool (*matchFunc)(Expr *, const Type *)) {
            // It's kind of a silly to redundantly discover this for each
            // potential match versus detecting this earlier in the
            // matching process and just giving up.
-            if (!callArgs[i] || !callArgs[i]->GetType() || !candArgTypes[i])
+            if (!callArgs[i] || !callArgs[i]->GetType() || !candArgTypes[i] ||
+                dynamic_cast<const FunctionType *>(callArgs[i]->GetType()) != NULL)
                return false;
            
            // See if this caller argument matches the type of the
@@ -2213,55 +2257,6 @@ FunctionCallExpr::FunctionCallExpr(Expr *f, ExprList *a, SourcePos p, bool il)
 }


-/** Starting from the function initialFunction, we're calling into
-    calledFunc.  The question is: is this a recursive call back to
-    initialFunc?  If it definitely is or if it may be, then return true.
-    Return false if it definitely is not.
- */
-static bool
-lMayBeRecursiveCall(llvm::Function *calledFunc, 
-                    llvm::Function *initialFunc,
-                    std::set<llvm::Function *> &seenFuncs) {
-    // Easy case: intrinsics aren't going to call functions themselves
-    if (calledFunc->isIntrinsic())
-        return false;
-
-    std::string name = calledFunc->getName();
-    if (name.size() > 2 && name[0] == '_' && name[1] == '_')
-        // builtin stdlib function; none of these are recursive...
-        return false;
-
-    if (calledFunc->isDeclaration())
-        // There's visibility into what the called function does without a
-        // definition, so we have to be conservative
-        return true;
-
-    if (calledFunc == initialFunc)
-        // hello recursive call
-        return true;
-
-    // Otherwise iterate over all of the instructions in the function.  If
-    // any of them is a function call then check recursively..
-    llvm::inst_iterator iter;
-    for (iter = llvm::inst_begin(calledFunc); 
-         iter != llvm::inst_end(calledFunc); ++iter) {
-        llvm::Instruction *inst = &*iter;
-        llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst);
-        if (ci != NULL) {
-            llvm::Function *nextCalledFunc = ci->getCalledFunction();
-            // Don't repeatedly test functions we've seen before 
-            if (seenFuncs.find(nextCalledFunc) == seenFuncs.end()) {
-                seenFuncs.insert(nextCalledFunc);
-                if (lMayBeRecursiveCall(nextCalledFunc, initialFunc, 
-                                        seenFuncs))
-                    return true;
-            }
-        }
-    }
-    return false;
-}
-
-
 llvm::Value *
 FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const {
    if (!func || !args)
@@ -2382,47 +2377,14 @@ FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const {
        }
    }

-    // We sometimes need to check to see if the mask is all off here;
-    // specifically, if the mask is all off and we call a recursive
-    // function, then we will probably have an unsesirable infinite loop.
-    ctx->SetDebugPos(pos);
-    llvm::BasicBlock *bDoCall = ctx->CreateBasicBlock("funcall_mask_ok");
-    llvm::BasicBlock *bSkip = ctx->CreateBasicBlock("funcall_mask_off");
-    llvm::BasicBlock *bAfter = ctx->CreateBasicBlock("after_funcall");
-    llvm::Function *currentFunc = ctx->GetCurrentBasicBlock()->getParent();
-
-    // If we need to check the mask (it may be a recursive call, possibly
-    // transitively), or we're launching a task, which is expensive and
-    // thus probably always worth checking, then use the mask to choose
-    // whether to go to the bDoCallBlock or the bSkip block
-    std::set<llvm::Function *> seenFuncs;
-    seenFuncs.insert(currentFunc);
-    if (ft->isTask || lMayBeRecursiveCall(callee, currentFunc, seenFuncs)) {
-        Debug(pos, "Checking mask before function call \"%s\".", funSym->name.c_str());
-        ctx->BranchIfMaskAny(bDoCall, bSkip);
-    }
-    else
-        // If we don't need to check the mask, then always to the call;
-        // just jump to bDoCall
-        ctx->BranchInst(bDoCall);
-    
-    // And the bSkip block just jumps immediately to bAfter.  So why do we
-    // need it?  So the phi node below can easily tell what paths are
-    // going into it
-    ctx->SetCurrentBasicBlock(bSkip);
-    ctx->BranchInst(bAfter);
-
-    // Emit the code to do the function call
-    ctx->SetCurrentBasicBlock(bDoCall);
-
    llvm::Value *retVal = NULL;
    ctx->SetDebugPos(pos);
    if (ft->isTask)
        ctx->LaunchInst(callee, argVals);
    else {
        // Most of the time, the mask is passed as the last argument.  this
-        // isn't the case for things like SSE intrinsics and extern "C"
-        // functions from the application.
+        // isn't the case for things like intrinsics, builtins, and extern
+        // "C" functions from the application.
        assert(callargs.size() + 1 == callee->arg_size() ||
               callargs.size() == callee->arg_size());

@@ -2449,22 +2411,10 @@ FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const {
        }
    }

-    // And jump out to the 'after funciton call' basic block
-    ctx->BranchInst(bAfter);
-    ctx->SetCurrentBasicBlock(bAfter);
-
    if (isVoidFunc)
        return NULL;
-
-    // The return value for the non-void case is either undefined or the
-    // function return value, depending on whether we actually ran the code
-    // path that called the function or not.
-    LLVM_TYPE_CONST llvm::Type *lrType = ft->GetReturnType()->LLVMType(g->ctx);
-    llvm::PHINode *ret = ctx->PhiNode(lrType, 2, "fun_ret");
-    assert(retVal != NULL);
-    ret->addIncoming(llvm::UndefValue::get(lrType), bSkip);
-    ret->addIncoming(retVal, bDoCall);
-    return ret;
+    else
+        return retVal;
 }


@@ -2525,6 +2475,13 @@ FunctionCallExpr::TypeCheck() {
 }


+int
+FunctionCallExpr::EstimateCost() const {
+    const int argCost = (args != NULL) ? args->EstimateCost() : 0;
+    return argCost + (isLaunch ? COST_TASK_LAUNCH : COST_FUNCALL);
+}
+
+
 void
 FunctionCallExpr::Print() const {
    if (!func || !args || !GetType())
@@ -2613,7 +2570,7 @@ ExprList::GetConstant(const Type *type) const {
    }

    if (dynamic_cast<const StructType *>(type) != NULL) {
-#if defined(LLVM_2_8) || defined(LLVM_2_9)
+#if defined(LLVM_2_9)
        return llvm::ConstantStruct::get(*g->ctx, cv, false);
 #else
        LLVM_TYPE_CONST llvm::StructType *llvmStructType =
@@ -2636,6 +2593,17 @@ ExprList::GetConstant(const Type *type) const {
 }


+int
+ExprList::EstimateCost() const {
+    // The cost of a list is the sum of the costs of its non-NULL
+    // expressions.
+    int total = 0;
+    for (unsigned int i = 0; i != exprs.size(); ++i)
+        total += (exprs[i] != NULL) ? exprs[i]->EstimateCost() : 0;
+    return total;
+}
+
+
 void
 ExprList::Print() const {
    printf("expr list (");
@@ -2766,6 +2734,22 @@ IndexExpr::GetLValue(FunctionEmitContext *ctx) const {
    if (!basePtr)
        return NULL;

+    // If the array index is a compile time constant, check to see if it
+    // may lead to an out-of-bounds access.
+    ConstExpr *ce = dynamic_cast<ConstExpr *>(index);
+    const SequentialType *seqType = dynamic_cast<const SequentialType *>(type);
+    assert(seqType != NULL);
+    int nElements = seqType->GetElementCount();
+    if (ce != NULL && nElements > 0) {
+        int32_t indices[ISPC_MAX_NVEC];
+        int count = ce->AsInt32(indices);
+        for (int i = 0; i < count; ++i) {
+            if (indices[i] < 0 || indices[i] >= nElements)
+                Warning(index->pos, "Array index \"%d\" may be out of bounds for "
+                        "\"%d\" element array.", indices[i], nElements);
+        }
+    }
+
    basePtr = lCastUniformVectorBasePtr(basePtr, ctx);

    ctx->SetDebugPos(pos);
@@ -2818,6 +2802,16 @@ IndexExpr::TypeCheck() {
 }


+int
+IndexExpr::EstimateCost() const {
+    // be pessimistic about varying indices; GetType() may return NULL
+    const Type *indexType = (index != NULL) ? index->GetType() : NULL;
+    if (indexType != NULL && indexType->IsVaryingType())
+        return COST_GATHER;  // varying index
+    return COST_LOAD;        // uniform, missing, or ill-typed index
+}
+
+
 void
 IndexExpr::Print() const {
    if (!arrayOrVector || !index || !GetType())
@@ -3117,6 +3111,7 @@ MemberExpr::create(Expr *e, const char *id, SourcePos p, SourcePos idpos) {
    return new MemberExpr(e, id, p, idpos);
 }

+
 MemberExpr::MemberExpr(Expr *e, const char *id, SourcePos p, SourcePos idpos) 
    : Expr(p), identifierPos(idpos) {
    expr = e;
@@ -3213,6 +3208,14 @@ MemberExpr::Optimize() {
 }


+int
+MemberExpr::EstimateCost() const {
+    // Charged as a single simple op for now.
+    // FIXME: return gather cost when we can tell a gather is going to be needed
+    return COST_SIMPLE_ARITH_LOGIC_OP;
+}
+
+
 void
 MemberExpr::Print() const {
    if (!expr || !GetType())
@@ -3280,7 +3283,7 @@ ConstExpr::ConstExpr(const Type *t, uint8_t u, SourcePos p)
  : Expr(p) {
    type = t;
    type = type->GetAsConstType();
-    assert(type == AtomicType::UniformUInt8);
+    assert(type == AtomicType::UniformConstUInt8);
    uint8Val[0] = u;
 }

@@ -3320,7 +3323,7 @@ ConstExpr::ConstExpr(const Type *t, uint16_t u, SourcePos p)
  : Expr(p) {
    type = t;
    type = type->GetAsConstType();
-    assert(type == AtomicType::UniformUInt16);
+    assert(type == AtomicType::UniformConstUInt16);
    uint16Val[0] = u;
 }

@@ -3423,7 +3426,7 @@ ConstExpr::ConstExpr(const Type *t, uint64_t u, SourcePos p)
  : Expr(p) {
    type = t;
    type = type->GetAsConstType();
-    assert(type == AtomicType::UniformUInt64);
+    assert(type == AtomicType::UniformConstUInt64);
    uint64Val[0] = u;
 }

@@ -4008,6 +4011,12 @@ ConstExpr::TypeCheck() {
 }


+int
+ConstExpr::EstimateCost() const {
+    return 0;  // constants are charged no cost
+}
+
+
 void
 ConstExpr::Print() const {
    printf("[%s] (", GetType()->GetString().c_str());
@@ -4928,6 +4937,13 @@ TypeCastExpr::Optimize() {
 }


+int
+TypeCastExpr::EstimateCost() const {
+    // FIXME: return COST_TYPECAST_COMPLEX when appropriate
+    return COST_TYPECAST_SIMPLE;  // NOTE(review): expr's cost not added
+}
+
+
 void
 TypeCastExpr::Print() const {
    printf("[%s] type cast (", GetType()->GetString().c_str());
@@ -4993,6 +5009,12 @@ ReferenceExpr::TypeCheck() {
 }


+int
+ReferenceExpr::EstimateCost() const {
+    return 0;  // NOTE(review): expr's own cost is not added; intended?
+}
+
+
 void
 ReferenceExpr::Print() const {
    if (expr == NULL || GetType() == NULL)
@@ -5071,6 +5093,12 @@ DereferenceExpr::Optimize() {
 }


+int
+DereferenceExpr::EstimateCost() const {
+    return COST_DEREF;  // NOTE(review): expr's own cost is not added
+}
+
+
 void
 DereferenceExpr::Print() const {
    if (expr == NULL || GetType() == NULL)
@@ -5142,6 +5170,15 @@ SymbolExpr::Optimize() {
 }


+int
+SymbolExpr::EstimateCost() const {
+    // An unresolved (NULL) symbol or a compile-time constant needs no load.
+    if (symbol == NULL || symbol->constValue != NULL)
+        return 0;
+    return COST_LOAD;
+}
+
+
 void
 SymbolExpr::Print() const {
    if (symbol == NULL || GetType() == NULL)
@@ -5195,6 +5232,12 @@ FunctionSymbolExpr::Optimize() {
 }


+int
+FunctionSymbolExpr::EstimateCost() const {
+    return 0;  // referring to a function is charged no cost
+}
+
+
 void
 FunctionSymbolExpr::Print() const {
    if (!matchingFunc || !GetType())
@@ -5229,6 +5272,12 @@ SyncExpr::GetValue(FunctionEmitContext *ctx) const {
 }


+int
+SyncExpr::EstimateCost() const {
+    return COST_SYNC;  // fixed cost for every sync
+}
+
+
 void
 SyncExpr::Print() const {
    printf("sync");
--- a/expr.h
+++ b/expr.h
@@ -121,8 +121,8 @@ public:
    void Print() const;
    Expr *Optimize();
    Expr *TypeCheck();
+    int EstimateCost() const;

-private:
    const Op op;
    Expr *expr;
 };
@@ -164,8 +164,8 @@ public:

    Expr *Optimize();
    Expr *TypeCheck();
+    int EstimateCost() const;

-private:
    const Op op;
    Expr *arg0, *arg1;
 };
@@ -196,8 +196,8 @@ public:

    Expr *Optimize();
    Expr *TypeCheck();
+    int EstimateCost() const;

-private:
    const Op op;
    Expr *lvalue, *rvalue;
 };
@@ -217,8 +217,8 @@ public:

    Expr *Optimize();
    Expr *TypeCheck();
+    int EstimateCost() const;

-private:
    Expr *test, *expr1, *expr2;
 };

@@ -240,6 +240,7 @@ public:
    llvm::Constant *GetConstant(const Type *type) const;
    ExprList *Optimize();
    ExprList *TypeCheck();
+    int EstimateCost() const;

    std::vector<Expr *> exprs;
 };
@@ -257,12 +258,13 @@ public:

    Expr *Optimize();
    Expr *TypeCheck();
+    int EstimateCost() const;

-private:
    Expr *func;
    ExprList *args;
    bool isLaunch;

+private:
    void resolveFunctionOverloads();
    bool tryResolve(bool (*matchFunc)(Expr *, const Type *));
 };
@@ -285,8 +287,8 @@ public:

    Expr *Optimize();
    Expr *TypeCheck();
+    int EstimateCost() const;

-private:
    Expr *arrayOrVector, *index;
 };

@@ -303,16 +305,17 @@ public:
    MemberExpr(Expr *expr, const char *identifier, SourcePos pos, 
               SourcePos identifierPos);

-    virtual llvm::Value *GetValue(FunctionEmitContext *ctx) const;
-    virtual llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
-    virtual const Type *GetType() const;
-    virtual Symbol *GetBaseSymbol() const;
-    virtual void Print() const;
-    virtual Expr *Optimize();
-    virtual Expr *TypeCheck();
+    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
+    llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
+    const Type *GetType() const;
+    Symbol *GetBaseSymbol() const;
+    void Print() const;
+    Expr *Optimize();
+    Expr *TypeCheck();
+    int EstimateCost() const;
+
    virtual int getElementNumber() const;

-protected:
    std::string getCandidateNearMatches() const;

    Expr *expr;
@@ -392,6 +395,7 @@ public:

    Expr *TypeCheck();
    Expr *Optimize();
+    int EstimateCost() const;

    /** Return the ConstExpr's values as booleans, doing type conversion
        from the actual type if needed.  If forceVarying is true, then type
@@ -495,8 +499,8 @@ public:
    void Print() const;
    Expr *TypeCheck();
    Expr *Optimize();
+    int EstimateCost() const;

-private:
    const Type *type;
    Expr *expr;
 };
@@ -514,8 +518,8 @@ public:
    void Print() const;
    Expr *TypeCheck();
    Expr *Optimize();
+    int EstimateCost() const;

-private:
    Expr *expr;
 };

@@ -533,8 +537,8 @@ public:
    void Print() const;
    Expr *TypeCheck();
    Expr *Optimize();
+    int EstimateCost() const;

-private:
    Expr *expr;
 };

@@ -551,6 +555,7 @@ public:
    Expr *TypeCheck();
    Expr *Optimize();
    void Print() const;
+    int EstimateCost() const;

 private:
    Symbol *symbol;
@@ -571,6 +576,7 @@ public:
    Expr *TypeCheck();
    Expr *Optimize();
    void Print() const;
+    int EstimateCost() const;

 private:
    friend class FunctionCallExpr;
@@ -597,6 +603,7 @@ public:
    Expr *TypeCheck();
    Expr *Optimize();
    void Print() const;
+    int EstimateCost() const;
 };

 #endif // ISPC_EXPR_H
--- a/failing_tests/masked-scatter-vector.ispc
+++ b/failing_tests/masked-scatter-vector.ispc
@@ -14,7 +14,7 @@ export void f_fu(uniform float ret[], uniform float aa[], uniform float b) {
    varying int3 vv = array[a];
    ++vv.y;
    array[a] = vv;
-    print("fin %\n", array[programIndex].y);
+//CO    print("fin %\n", array[programIndex].y);
    ret[programIndex] = array[programIndex].y;
 }

--- a/failing_tests/max-uint-1.ispc
+++ b/failing_tests/max-uint-1.ispc
@@ -1,19 +1,14 @@
-static float float4(uniform float a, uniform float b, uniform float c, 
-                    uniform float d) {
-    float ret = 0;
-    for (uniform int i = 0; i < programCount; i += 4) {
-        ret = insert(ret, i + 0, a);
-        ret = insert(ret, i + 1, b);
-        ret = insert(ret, i + 2, c);
-        ret = insert(ret, i + 3, d);
-    }
-    return ret;
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float r[], uniform float a[]) {
+    unsigned int i = (unsigned int)a[programIndex];
+    r[programIndex] = max((unsigned int)2, i);
 }

-export float f_f(float a) {
-    unsigned int i = (unsigned int)a;
-    return max((unsigned int)2, i);
+export void result(uniform float r[]) { 
+    r[programIndex] = 1+programIndex;
+    r[0] = 2;
 }

-export float result() { return float4(2,2,3,4); }

--- a/failing_tests/max-uint.ispc
+++ b/failing_tests/max-uint.ispc
@@ -1,8 +1,10 @@

-export float f_f(float a) {
-    unsigned int i = (unsigned int)a;
-    return max((unsigned int)10, i);
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float result[], uniform float aa[]) {
+    unsigned int i = (unsigned int)aa[programIndex];
+    result[programIndex] = max((unsigned int)100, i);
 }

-export float result() { return 10; }
+export void result(uniform float r[]) { r[programIndex] = 100; }

--- a/failing_tests/min-uint-1.ispc
+++ b/failing_tests/min-uint-1.ispc
@@ -1,19 +1,14 @@
-static float float4(uniform float a, uniform float b, uniform float c, 
-                    uniform float d) {
-    float ret = 0;
-    for (uniform int i = 0; i < programCount; i += 4) {
-        ret = insert(ret, i + 0, a);
-        ret = insert(ret, i + 1, b);
-        ret = insert(ret, i + 2, c);
-        ret = insert(ret, i + 3, d);
-    }
-    return ret;
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float result[], uniform float aa[]) {
+    unsigned int i = (unsigned int)aa[programIndex];
+    result[programIndex] = min((unsigned int)2, i);
 }

-export float f_f(float a) {
-    unsigned int i = (unsigned int)a;
-    return min((unsigned int)2, i);
+export void result(uniform float r[]) { 
+    r[programIndex] = 2;
+    r[0] = 1;
 }

-export float result() { return float4(1,2,2,2); }

--- a/failing_tests/min-uint-2.ispc
+++ b/failing_tests/min-uint-2.ispc
@@ -1,19 +1,13 @@
-static float float4(uniform float a, uniform float b, uniform float c, 
-                    uniform float d) {
-    float ret = 0;
-    for (uniform int i = 0; i < programCount; i += 4) {
-        ret = insert(ret, i + 0, a);
-        ret = insert(ret, i + 1, b);
-        ret = insert(ret, i + 2, c);
-        ret = insert(ret, i + 3, d);
-    }
-    return ret;
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float r[], uniform float a[]) {
+    unsigned int i = (unsigned int)a[programIndex];
+    r[programIndex] =  min((unsigned int)20, i);
 }

-export float f_f(float a) {
-    unsigned int i = (unsigned int)a;
-    return min((unsigned int)20, i);
+export void result(uniform float r[]) { 
+    r[programIndex] = 1+programIndex;
 }

-export float result() { return float4(1,2,3,4); }

--- a/failing_tests/struct-array-assign.ispc
+++ b/failing_tests/struct-array-assign.ispc
@@ -1,11 +0,0 @@
-
-struct Foo {
-    float f;
-};
-
-
-export float foo(Foo f[], int i, uniform int j) {
-    Foo x = f[i];
-    return x.f;
-}
-
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -42,14 +42,25 @@
 #ifdef ISPC_IS_WINDOWS
 #include <windows.h>
 #include <direct.h>
+#define strcasecmp stricmp
 #endif
 #include <llvm/LLVMContext.h>
 #include <llvm/Module.h>
-#ifndef LLVM_2_8
 #include <llvm/Analysis/DIBuilder.h>
-#endif
 #include <llvm/Analysis/DebugInfo.h>
 #include <llvm/Support/Dwarf.h>
+#include <llvm/Target/TargetMachine.h>
+#include <llvm/Target/TargetOptions.h>
+#include <llvm/Target/TargetData.h>
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+  #include <llvm/Support/TargetRegistry.h>
+  #include <llvm/Support/TargetSelect.h>
+#else
+  #include <llvm/Target/TargetRegistry.h>
+  #include <llvm/Target/TargetSelect.h>
+  #include <llvm/Target/SubtargetFeature.h>
+#endif
+#include <llvm/Support/Host.h>

 Globals *g;
 Module *m;
@@ -57,20 +68,196 @@ Module *m;
 ///////////////////////////////////////////////////////////////////////////
 // Target

-Target::Target() {
-    arch = "x86-64";
-    cpu = "nehalem";
-    isa = SSE4;
-    nativeVectorWidth = 4;
-    vectorWidth = 4;
+bool
+Target::GetTarget(const char *arch, const char *cpu, const char *isa,
+                  bool pic, Target *t) {
+    if (cpu == NULL) {
+        t->cpu = llvm::sys::getHostCPUName();
+        if (t->cpu.size() == 0) {
+            fprintf(stderr, "Warning: unable to determine host CPU!\n");
+            t->cpu = "generic";
+        }
+    }
+    else
+        t->cpu = cpu;
+    cpu = t->cpu.c_str();  // t->cpu owns the name; a block-local string would dangle
+
+    if (isa == NULL) {  // no ISA requested: choose one from the CPU name
+        if (!strcasecmp(cpu, "atom"))
+            isa = "sse2";
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+        else if (!strcasecmp(cpu, "sandybridge") ||
+                 !strcasecmp(cpu, "corei7-avx"))
+            isa = "avx";
+#endif // LLVM_3_0
+        else
+            isa = "sse4";
+    }
+    if (arch == NULL)
+        arch = "x86-64";
+
+    bool error = false;
+
+    t->generatePIC = pic;
+
+    // Make sure the target architecture is a known one; print an error
+    // with the valid ones otherwise.
+    t->target = NULL;
+    for (llvm::TargetRegistry::iterator iter = llvm::TargetRegistry::begin();
+         iter != llvm::TargetRegistry::end(); ++iter) {
+        if (std::string(arch) == iter->getName()) {
+            t->target = &*iter;
+            break;
+        }
+    }
+    if (t->target == NULL) {
+        fprintf(stderr, "Invalid architecture \"%s\"\nOptions: ", arch);
+        llvm::TargetRegistry::iterator iter;
+        for (iter = llvm::TargetRegistry::begin();
+             iter != llvm::TargetRegistry::end(); ++iter)
+            fprintf(stderr, "%s ", iter->getName());
+        fprintf(stderr, "\n");
+        error = true;
+    }
+    else {
+        t->arch = arch;
+    }
+
+    if (!strcasecmp(isa, "sse2")) {
+        t->isa = Target::SSE2;
+        t->nativeVectorWidth = 4;
+        t->vectorWidth = 4;
+        t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
+    }
+    else if (!strcasecmp(isa, "sse4")) {
+        t->isa = Target::SSE4;
+        t->nativeVectorWidth = 4;
+        t->vectorWidth = 4;
+        t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
+    }
+    else if (!strcasecmp(isa, "sse4x2")) {  // SSE4 code, compiled 8-wide
+        t->isa = Target::SSE4;
+        t->nativeVectorWidth = 4;
+        t->vectorWidth = 8;
+        t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
+    }
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+    else if (!strcasecmp(isa, "avx")) {
+        t->isa = Target::AVX;
+        t->nativeVectorWidth = 8;
+        t->vectorWidth = 8;
+        t->attributes = "+avx,+popcnt,+cmov";
+    }
+    else if (!strcasecmp(isa, "avx-x2")) {  // AVX code, compiled 16-wide
+        t->isa = Target::AVX;
+        t->nativeVectorWidth = 8;
+        t->vectorWidth = 16;
+        t->attributes = "+avx,+popcnt,+cmov";
+    }
+#endif // LLVM 3.0
+    else {
+        fprintf(stderr, "Target ISA \"%s\" is unknown.  Choices are: %s\n",
+                isa, SupportedTargetISAs());
+        error = true;
+    }
+
+    if (!error) {  // GetTargetMachine() needs a valid t->target
+        llvm::TargetMachine *targetMachine = t->GetTargetMachine();  // NOTE(review): never released here
+        const llvm::TargetData *targetData = targetMachine->getTargetData();
+        t->is32bit = (targetData->getPointerSize() == 4);
+    }
+
+    return !error;
 }

+
+const char *
+Target::SupportedTargetCPUs() {  // comma-delimited CPU names, printed by usage()
+    return "atom, barcelona, core2, corei7, "
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+        "corei7-avx, "
+#endif
+        "istanbul, nocona, penryn, "
+#ifdef LLVM_2_9
+        "sandybridge, "
+#endif
+        "westmere";
+}
+
+
+const char *
+Target::SupportedTargetArchs() {  // comma-delimited architecture names, printed by usage()
+    return "x86, x86-64";
+}
+
+
+const char *
+Target::SupportedTargetISAs() {  // comma-delimited ISA names accepted by GetTarget()
+    return "sse2, sse4, sse4x2"
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+        ", avx, avx-x2"
+#endif
+        ;
+}
+
+
+std::string
+Target::GetTripleString() const {
+    llvm::Triple triple;
+    // Start with the host triple as the default
+    triple.setTriple(llvm::sys::getHostTriple());
+
+    // Then override the arch in the host triple based on what the user
+    // specified.  Here we need to deal with the fact that LLVM uses one
+    // naming convention for targets in the TargetRegistry, but wants
+    // slightly different names in the triple.  TODO: is there a way to
+    // have LLVM do this remapping itself, which would presumably be a bit
+    // less error prone?
+    if (arch == "x86")
+        triple.setArchName("i386");
+    else if (arch == "x86-64")
+        triple.setArchName("x86_64");
+    else
+        triple.setArchName(arch);  // any other name is passed through unchanged
+
+    return triple.str();
+}
+
+
+llvm::TargetMachine *
+Target::GetTargetMachine() const {  // builds a new TargetMachine from triple, cpu and attributes
+    std::string triple = GetTripleString();
+
+    llvm::Reloc::Model relocModel = generatePIC ? llvm::Reloc::PIC_ : 
+                                                  llvm::Reloc::Default;
+#if defined(LLVM_3_0svn) || defined(LLVM_3_0)
+    std::string featuresString = attributes;
+    llvm::TargetMachine *targetMachine = 
+        target->createTargetMachine(triple, cpu, featuresString, relocModel);
+#else
+#ifdef ISPC_IS_APPLE
+    relocModel = llvm::Reloc::PIC_;  // pre-3.0 path: always PIC on Apple, whatever generatePIC says
+#endif // ISPC_IS_APPLE
+    std::string featuresString = cpu + std::string(",") + attributes;  // CPU name is folded into the feature string here
+    llvm::TargetMachine *targetMachine = 
+        target->createTargetMachine(triple, featuresString);
+    targetMachine->setRelocationModel(relocModel);
+#endif
+    assert(targetMachine != NULL);  // NOTE(review): the pre-3.0 path already dereferenced it above
+
+    targetMachine->setAsmVerbosityDefault(true);
+    return targetMachine;
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 // Opt

 Opt::Opt() {
    level = 1;
    fastMath = false;
+    fastMaskedVload = false;
+    unrollLoops = true;
    disableBlendedMaskedStores = false;
    disableCoherentControlFlow = false;
    disableUniformControlFlow = false;
@@ -120,13 +307,9 @@ SourcePos::SourcePos(const char *n, int l, int c) {
 }

 llvm::DIFile SourcePos::GetDIFile() const {
-#ifdef LLVM_2_8
-    return llvm::DIFile();
-#else
    std::string directory, filename;
    GetDirectoryAndFileName(g->currentDirectory, name, &directory, &filename);
    return m->diBuilder->createFile(filename, directory);
-#endif // LLVM_2_8
 }


--- a/ispc.h
+++ b/ispc.h
@@ -69,6 +69,8 @@ namespace llvm {
    class FunctionType;
    class LLVMContext;
    class Module;
+    class Target;
+    class TargetMachine;
    class Type;
    class Value;
 }
@@ -146,6 +148,8 @@ public:
        pointer in place of the original ASTNode *. */
    virtual ASTNode *TypeCheck() = 0;

+    virtual int EstimateCost() const = 0;
+
    /** All AST nodes must track the file position where they are
        defined. */
    const SourcePos pos;
@@ -156,7 +160,34 @@ public:
    This structure defines a compilation target for the ispc compiler.
 */
 struct Target {
-    Target();
+    /** Initializes the given Target for the given architecture, CPU and
+        ISA names (a NULL name selects a default).  Returns true if the
+        target was initialized and false if the arch or ISA is unknown. */
+    static bool GetTarget(const char *arch, const char *cpu, const char *isa,
+                          bool pic, Target *);
+
+    /** Returns a comma-delimited string giving the names of the currently
+        supported target ISAs. */
+    static const char *SupportedTargetISAs();
+
+    /** Returns a comma-delimited string giving the names of the currently
+        supported target CPUs. */
+    static const char *SupportedTargetCPUs();
+
+    /** Returns a comma-delimited string giving the names of the currently
+        supported target architectures. */
+    static const char *SupportedTargetArchs();
+
+    /** Returns a triple string specifying the target architecture, vendor,
+        and environment. */
+    std::string GetTripleString() const;
+
+    /** Returns the LLVM TargetMachine object corresponding to this
+        target. */
+    llvm::TargetMachine *GetTargetMachine() const;
+
+    /** llvm Target object representing this target. */
+    const llvm::Target *target;

    /** Enumerator giving the instruction sets that the compiler can
        target. */
@@ -168,9 +199,15 @@ struct Target {
    /** Target system architecture.  (e.g. "x86-64", "x86"). */
    std::string arch;

+    /** Is the target architecture 32 or 64 bit */
+    bool is32bit;
+
    /** Target CPU. (e.g. "corei7", "corei7-avx", ..) */
    std::string cpu;

+    /** Target-specific attributes to pass along to the LLVM backend */
+    std::string attributes;
+
    /** Native vector width of the vector instruction set.  Note that this
        value is directly derived from the ISA Being used (e.g. it's 4 for
        SSE, 8 for AVX, etc.) */
@@ -180,8 +217,12 @@ struct Target {
        integer multiple of the native vector width, for example if we're
        "doubling up" and compiling 8-wide on a 4-wide SSE system. */
    int vectorWidth;
+
+    /** Indicates whether position independent code should be generated. */
+    bool generatePIC;
 };

+
 /** @brief Structure that collects optimization options

    This structure collects all of the options related to optimization of
@@ -199,6 +240,16 @@ struct Opt {
        should be performed.  This is false by default. */
    bool fastMath;

+    /** Indicates whether a vector load should be issued for masked loads
+        on platforms that don't have a native masked vector load.  (This may
+        lead to accessing memory up to programCount-1 elements past the end of
+        arrays, so is unsafe in general.) */
+    bool fastMaskedVload;
+
+    /** Indicates when loops should be unrolled (when doing so seems like
+        it will make sense). */
+    bool unrollLoops;
+
    /** On targets that don't have a masked store instruction but do have a
        blending instruction, by default, we simulate masked stores by
        loading the old value, blending, and storing the result.  This can
@@ -316,6 +367,29 @@ struct Globals {
    std::vector<std::string> cppArgs;
 };

+enum {  // presumably the weights summed by ASTNode::EstimateCost() -- TODO confirm
+    COST_ASSIGN = 1,
+    COST_COHERENT_BREAK_CONTINE = 4,  // NOTE(review): "CONTINE" looks like a typo; a rename must update all users
+    COST_COMPLEX_ARITH_OP = 4,
+    COST_DEREF = 4,
+    COST_FUNCALL = 4,
+    COST_GATHER = 8,
+    COST_LOAD = 2,
+    COST_REGULAR_BREAK_CONTINUE = 2,
+    COST_RETURN = 4,
+    COST_SELECT = 4,
+    COST_SIMPLE_ARITH_LOGIC_OP = 1,
+    COST_SYNC = 32,
+    COST_TASK_LAUNCH = 16,
+    COST_TYPECAST_COMPLEX = 4,
+    COST_TYPECAST_SIMPLE = 1,
+    COST_UNIFORM_LOOP = 4,
+    COST_VARYING_LOOP = 6,
+
+    CHECK_MASK_AT_FUNCTION_START_COST = 16,  // presumably thresholds compared against summed costs -- TODO confirm
+    PREDICATE_SAFE_IF_STATEMENT_COST = 6,
+};
+
 extern Globals *g;
 extern Module *m;

--- a/ispc.vcxproj
+++ b/ispc.vcxproj
@@ -16,7 +16,9 @@
    <ClCompile Include="decl.cpp" />
    <ClCompile Include="expr.cpp" />
    <ClCompile Include="gen-bitcode-avx.cpp" />
-    <ClCompile Include="gen-bitcode-c.cpp" />
+    <ClCompile Include="gen-bitcode-avx-x2.cpp" />
+    <ClCompile Include="gen-bitcode-c-32.cpp" />
+    <ClCompile Include="gen-bitcode-c-64.cpp" />
    <ClCompile Include="gen-bitcode-sse2.cpp" />
    <ClCompile Include="gen-bitcode-sse4.cpp" />
    <ClCompile Include="gen-bitcode-sse4x2.cpp" />
@@ -29,9 +31,9 @@
    <ClCompile Include="opt.cpp" />
    <ClCompile Include="parse.cc" />
    <CustomBuild Include="builtins-c.c">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c.c &gt; gen-bitcode-c.cpp</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c &gt; gen-bitcode-c-32.cpp</Command>
      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clang builtins-c.c</Message>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c.c &gt; gen-bitcode-c.cpp</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c &gt; gen-bitcode-c-32.cpp</Command>
      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clang builtins-c.c</Message>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-c.cpp</Outputs>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-c.cpp</Outputs>
@@ -119,6 +121,19 @@
      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx.cpp</Message>
    </CustomBuild>
  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="builtins-avx-x2.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-avx-x2.ll | python bitcode2cpp.py builtins-avx-x2.ll &gt; gen-bitcode-avx-x2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx-x2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-avx-x2.ll | python bitcode2cpp.py builtins-avx-x2.ll &gt; gen-bitcode-avx-x2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx-x2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx-x2.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx-x2.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="lex.ll">
      <FileType>Document</FileType>
--- a/ispc_test.cpp
+++ b/ispc_test.cpp
@@ -33,12 +33,25 @@

 #define _CRT_SECURE_NO_WARNINGS

+#if defined(_WIN32) || defined(_WIN64)
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
 #ifdef ISPC_IS_WINDOWS
 #define NOMINMAX
 #include <windows.h>
 #endif
 #include <stdio.h>
 #include <stdint.h>
+#include <stdlib.h>
+#include <memory.h>
+#ifdef ISPC_IS_LINUX
+#include <malloc.h>
+#endif

 #ifdef ISPC_HAVE_SVML
 #include <xmmintrin.h>
@@ -61,8 +74,15 @@ extern "C" {
 #include <llvm/DerivedTypes.h>
 #include <llvm/Instructions.h>
 #include <llvm/ExecutionEngine/ExecutionEngine.h>
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+  #include <llvm/ExecutionEngine/MCJIT.h>
+  #include <llvm/Support/TargetRegistry.h>
+  #include <llvm/Support/TargetSelect.h>
+#else
+  #include <llvm/Target/TargetRegistry.h>
+  #include <llvm/Target/TargetSelect.h>
+#endif
 #include <llvm/ExecutionEngine/JIT.h>
-#include <llvm/Target/TargetSelect.h>
 #include <llvm/Target/TargetOptions.h>
 #include <llvm/Target/TargetData.h>
 #include <llvm/Transforms/Scalar.h>
@@ -74,9 +94,9 @@ extern "C" {
 #include <llvm/Support/raw_ostream.h>
 #include <llvm/Bitcode/ReaderWriter.h>
 #include <llvm/Support/MemoryBuffer.h>
-#ifndef LLVM_2_8
 #include <llvm/Support/system_error.h>
-#endif
+
+bool shouldFail = false;

 extern "C" { 
    void ISPCLaunch(void *, void *);
@@ -96,20 +116,40 @@ void ISPCSync() {
 }


-#ifdef ISPC_IS_WINDOWS
 void *ISPCMalloc(int64_t size, int32_t alignment) {
+#ifdef ISPC_IS_WINDOWS
    return _aligned_malloc(size, alignment);
+#endif
+#ifdef ISPC_IS_LINUX
+    return memalign(alignment, size);
+#endif
+#ifdef ISPC_IS_APPLE
+    void *mem = malloc(size + (alignment-1) + sizeof(void*));
+    char *amem = ((char*)mem) + sizeof(void*);
+    amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
+                                        (alignment - 1)));
+    ((void**)amem)[-1] = mem;
+    return amem;
+#endif
 }


 void ISPCFree(void *ptr) {
+#ifdef ISPC_IS_WINDOWS
     _aligned_free(ptr);
-}
 #endif
+#ifdef ISPC_IS_LINUX
+    free(ptr);  // pairs with the memalign() call in ISPCMalloc
+#endif
+#ifdef ISPC_IS_APPLE
+    free(((void**)ptr)[-1]);  // ISPCMalloc stored the original malloc() pointer just before ptr
+#endif
+}

 static void usage(int ret) {
    fprintf(stderr, "usage: ispc_test\n");
    fprintf(stderr, "\t[-h/--help]\tprint help\n");
+    fprintf(stderr, "\t[-f]\t\tindicates that test is expected to fail\n");
    fprintf(stderr, "\t<files>\n");
    exit(ret);
 }
@@ -119,20 +159,22 @@ static void svml_missing() {
    exit(1);
 }

+// On Windows, sin() and friends are overloaded, so we need unambiguous
+// functions we can take the address of when wiring up the external references
+// below; each wrapper simply forwards to the double-precision library call.
+
+double Sin(double x) { return sin(x); }
+double Cos(double x) { return cos(x); }
+double Tan(double x) { return tan(x); }
+double Atan(double x) { return atan(x); }
+double Atan2(double y, double x) { return atan2(y, x); }
+double Pow(double a, double b) { return pow(a, b); }
+double Exp(double x) { return exp(x); }
+double Log(double x) { return log(x); }
+
 static bool lRunTest(const char *fn) {
    llvm::LLVMContext *ctx = new llvm::LLVMContext;

-#ifdef LLVM_2_8
-    std::string err;
-    llvm::MemoryBuffer *buf = llvm::MemoryBuffer::getFileOrSTDIN(fn, &err);
-    if (!buf) {
-        fprintf(stderr, "Unable to open file \"%s\": %s\n", fn, err.c_str());
-        delete ctx;
-        return false;
-    }
-    std::string bcErr;
-    llvm::Module *module = llvm::ParseBitcodeFile(buf, *ctx, &bcErr);
-#else
    llvm::OwningPtr<llvm::MemoryBuffer> buf;
    llvm::error_code err = llvm::MemoryBuffer::getFileOrSTDIN(fn, buf);
    if (err) {
@@ -142,7 +184,6 @@ static bool lRunTest(const char *fn) {
    }
    std::string bcErr;
    llvm::Module *module = llvm::ParseBitcodeFile(buf.get(), *ctx, &bcErr);
-#endif

    if (!module) {
        fprintf(stderr, "Bitcode reader failed for \"%s\": %s\n", fn, bcErr.c_str());
@@ -151,7 +192,21 @@ static bool lRunTest(const char *fn) {
    }

    std::string eeError;
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+    llvm::EngineBuilder engineBuilder(module);
+    engineBuilder.setErrorStr(&eeError);
+    engineBuilder.setEngineKind(llvm::EngineKind::JIT);
+#if 0
+    std::vector<std::string> attributes;
+    if (target != NULL && !strcmp(target, "avx"))
+        attributes.push_back("+avx");
+    engineBuilder.setMAttrs(attributes);
+    engineBuilder.setUseMCJIT(true);
+#endif
+    llvm::ExecutionEngine *ee = engineBuilder.create();
+#else
    llvm::ExecutionEngine *ee = llvm::ExecutionEngine::createJIT(module, &eeError);
+#endif
    if (!ee) {
        fprintf(stderr, "Unable to create ExecutionEngine: %s\n", eeError.c_str());
        return false;
@@ -163,10 +218,8 @@ static bool lRunTest(const char *fn) {
        ee->addGlobalMapping(func, (void *)FUNC)
    DO_FUNC(ISPCLaunch, "ISPCLaunch");
    DO_FUNC(ISPCSync, "ISPCSync");
-#ifdef ISPC_IS_WINDOWS
    DO_FUNC(ISPCMalloc, "ISPCMalloc");
    DO_FUNC(ISPCFree, "ISPCFree");
-#endif // ISPC_IS_WINDOWS
    DO_FUNC(putchar, "putchar");
    DO_FUNC(printf, "printf");
    DO_FUNC(fflush, "fflush");
@@ -178,14 +231,14 @@ static bool lRunTest(const char *fn) {
    DO_FUNC(powf, "powf");
    DO_FUNC(expf, "expf");
    DO_FUNC(logf, "logf");
-    DO_FUNC(sin, "sin");
-    DO_FUNC(cos, "cos");
-    DO_FUNC(tan, "tan");
-    DO_FUNC(atan, "atan");
-    DO_FUNC(atan2, "atan2");
-    DO_FUNC(pow, "pow");
-    DO_FUNC(exp, "exp");
-    DO_FUNC(log, "log");
+    DO_FUNC(Sin, "sin");
+    DO_FUNC(Cos, "cos");
+    DO_FUNC(Tan, "tan");
+    DO_FUNC(Atan, "atan");
+    DO_FUNC(Atan2, "atan2");
+    DO_FUNC(Pow, "pow");
+    DO_FUNC(Exp, "exp");
+    DO_FUNC(Log, "log");
    DO_FUNC(memset, "memset");
 #ifdef ISPC_IS_APPLE
    DO_FUNC(memset_pattern4, "memset_pattern4");
@@ -233,7 +286,6 @@ static bool lRunTest(const char *fn) {
    float result[16];
    for (int i = 0; i < 16; ++i)
        result[i] = 0;
-    bool ok = true;
    if (foundResult) {
        typedef void (*PFN)(float *);
        PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
@@ -290,50 +342,49 @@ static bool lRunTest(const char *fn) {
    }
    else {
        fprintf(stderr, "Unable to find runnable function in file \"%s\"\n", fn);
-        ok = false;
+        return false;
    }

    // see if we got the right result
-    if (ok) {
-        if (foundResult) {
-            for (int i = 0; i < width; ++i)
-                if (returned[i] != result[i]) {
-                    ok = false;
-                    fprintf(stderr, "Test \"%s\" RETURNED %d: %g / %a EXPECTED %g / %a\n",
-                            fn, i, returned[i], returned[i], result[i], result[i]);
-                }
-        }
-        else {
-            for (int i = 0; i < width; ++i)
-                fprintf(stderr, "Test \"%s\" returned %d: %g / %a\n",
-                        fn, i, returned[i], returned[i]);
-        }
+    bool resultsMatch = true;
+    if (foundResult) {
+        for (int i = 0; i < width; ++i)
+            if (returned[i] != result[i]) {
+                resultsMatch = false;
+                fprintf(stderr, "Test \"%s\" RETURNED %d: %g / %a EXPECTED %g / %a\n",
+                        fn, i, returned[i], returned[i], result[i], result[i]);
+            }
    }
+    else {
+        for (int i = 0; i < width; ++i)
+            fprintf(stderr, "Test \"%s\" returned %d: %g / %a\n",
+                    fn, i, returned[i], returned[i]);
+    }
+    if (foundResult && shouldFail && resultsMatch)
+        fprintf(stderr, "Test %s unexpectedly passed\n", fn);

    delete ee;
    delete ctx;

-    return ok && foundResult;
+    return foundResult && resultsMatch;
 }

+
 int main(int argc, char *argv[]) {
    llvm::InitializeNativeTarget();
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+    LLVMLinkInJIT();
+#endif

-    std::vector<const char *> files;
+    const char *filename = NULL;  // last non-option argument; the single test to run
    for (int i = 1; i < argc; ++i) {
        if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
            usage(0);
+        if (!strcmp(argv[i], "-f")) shouldFail = true;
        else
-            files.push_back(argv[i]);
+            filename = argv[i];
    }

-    int passes = 0, fails = 0;
-    for (unsigned int i = 0; i < files.size(); ++i) {
-        if (lRunTest(files[i])) ++passes;
-        else ++fails;
-    }
-
-    if (fails > 0)
-        fprintf(stderr, "%d/%d tests passed\n", passes, passes+fails);
-    return fails > 0;
+    if (filename == NULL) usage(1);  // no test file given: usage() exits rather than passing NULL on
+    return (lRunTest(filename) == true) ? 0 : 1;
 }
--- a/main.cpp
+++ b/main.cpp
@@ -40,10 +40,14 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <llvm/Support/PrettyStackTrace.h>
-#ifdef LLVM_2_8
-#include <llvm/System/Signals.h>
-#else
 #include <llvm/Support/Signals.h>
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+  #include <llvm/Support/TargetRegistry.h>
+  #include <llvm/Support/TargetSelect.h>
+#else
+  #include <llvm/Target/TargetRegistry.h>
+  #include <llvm/Target/TargetSelect.h>
+  #include <llvm/Target/SubtargetFeature.h>
 #endif

 #ifdef ISPC_IS_WINDOWS
@@ -53,36 +57,36 @@
 #endif // ISPC_IS_WINDOWS

 static void usage(int ret) {
-    printf("This is the Intel(r) SPMD Program Compiler (ispc), build %s (%s)\n\n", BUILD_DATE, BUILD_VERSION);
+    printf("This is the Intel(r) SPMD Program Compiler (ispc), build %s (%s)\n\n", 
+           BUILD_DATE, BUILD_VERSION);
    printf("usage: ispc\n");
-    printf("    [--arch={x86,x86-64}]\t\tSelect target architecture\n");
+    printf("    [--arch={%s}]\t\tSelect target architecture\n", 
+           Target::SupportedTargetArchs());
    printf("    [--cpu=<cpu>]\t\t\tSelect target CPU type\n");
-    printf("         (atom, barcelona, core2, corei7, corei7-avx, istanbul, nocona,\n");
-    printf("          penryn, westmere)\n");
-#ifndef ISPC_IS_WINDOWS
-    printf("    [-D<foo>]\t\t\t\t#define value when running preprocessor\n");
-#endif
+    printf("         <cpu>={%s}\n", Target::SupportedTargetCPUs());
+    printf("    [-D<foo>]\t\t\t\t#define given value when running preprocessor\n");
    printf("    [--debug]\t\t\t\tPrint information useful for debugging ispc\n");
    printf("    [--emit-asm]\t\t\tGenerate assembly language file as output\n");
    printf("    [--emit-llvm]\t\t\tEmit LLVM bitode file as output\n");
-    printf("    [--emit-obj]\t\t\tGenerate object file file as output\n");
-    printf("    [--fast-math]\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n");
+    printf("    [--emit-obj]\t\t\tGenerate object file file as output (default)\n");
    printf("    [-g]\t\t\t\tGenerate debugging information\n");
    printf("    [--help]\t\t\t\tPrint help\n");
-    printf("    [-h] <name>\t\t\t\tOutput filename for header\n");
+    printf("    [-h <name>/--header-outfile=<name>]\tOutput filename for header\n");
    printf("    [--instrument]\t\t\tEmit instrumentation to gather performance data\n");
    printf("    [--math-lib=<option>]\t\tSelect math library\n");
    printf("        default\t\t\t\tUse ispc's built-in math functions\n");
    printf("        fast\t\t\t\tUse high-performance but lower-accuracy math functions\n");
-    printf("        svml\t\t\t\tUse the Intel SVML math libraries\n");
+    printf("        svml\t\t\t\tUse the Intel(r) SVML math libraries\n");
    printf("        system\t\t\t\tUse the system's math library (*may be quite slow*)\n");
    printf("    [--nostdlib]\t\t\tDon't make the ispc standard library available\n");
-#ifndef ISPC_IS_WINDOWS
    printf("    [--nocpp]\t\t\t\tDon't run the C preprocessor\n");
-#endif
-    printf("    [-o/--outfile] <name>\t\tOutput filename for bitcode (may be \"-\" for standard output)\n");
-    printf("    [-O0/-O1]\t\t\t\tSet optimization level\n");
+    printf("    [-o <name>/--outfile=<name>]\tOutput filename (may be \"-\" for standard output)\n");
+    printf("    [-O0/-O1]\t\t\t\tSet optimization level (-O1 is default)\n");
    printf("    [--opt=<option>]\t\t\tSet optimization option\n");
+    printf("        disable-loop-unroll\t\tDisable loop unrolling.\n");
+    printf("        fast-masked-vload\t\tFaster masked vector loads on SSE (may go past end of array)\n");
+    printf("        fast-math\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n");
+#if 0
    printf("        disable-blended-masked-stores\t\tScalarize masked stores on SSE (vs. using vblendps)\n");
    printf("        disable-coherent-control-flow\t\tDisable coherent control flow optimizations\n");
    printf("        disable-uniform-control-flow\t\tDisable uniform control flow optimizations\n");
@@ -91,11 +95,9 @@ static void usage(int ret) {
    printf("        disable-gather-scatter-flattening\tDisable flattening when all lanes are on\n");
    printf("        disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n");
    printf("        disable-masked-store-optimizations\tDisable lowering to regular stores when possible\n");
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
-    printf("    [--target={sse2,sse4,sse4x2,avx}] Select target ISA (SSE4 is default unless compiling for atom; then SSE2 is.)\n");
-#else
-    printf("    [--target={sse2,sse4,sse4x2}] Select target ISA (SSE4 is default unless compiling for atom; then SSE2 is.)\n");
-#endif // LLVM 3.0
+#endif
+    printf("    [--pic]\t\t\t\tGenerate position-independent code\n");
+    printf("    [--target=<isa>]\t\t\tSelect target ISA. <isa>={%s}\n", Target::SupportedTargetISAs());
    printf("    [--version]\t\t\t\tPrint ispc version\n");
    printf("    [--woff]\t\t\t\tDisable warnings\n");
    printf("    [--wno-perf]\t\t\tDon't issue warnings related to performance-related issues\n");
@@ -103,35 +105,6 @@ static void usage(int ret) {
    exit(ret);
 }

-/** Given a target name string, set initialize the global g->target
-    structure appropriately. 
-*/
-static void lDoTarget(const char *target) {
-    if (!strcasecmp(target, "sse2")) {
-        g->target.isa = Target::SSE2;
-        g->target.nativeVectorWidth = 4;
-        g->target.vectorWidth = 4;
-    }
-    else if (!strcasecmp(target, "sse4")) {
-        g->target.isa = Target::SSE4;
-        g->target.nativeVectorWidth = 4;
-        g->target.vectorWidth = 4;
-    }
-    else if (!strcasecmp(target, "sse4x2")) {
-        g->target.isa = Target::SSE4;
-        g->target.nativeVectorWidth = 4;
-        g->target.vectorWidth = 8;
-    }
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
-    else if (!strcasecmp(target, "avx")) {
-        g->target.isa = Target::AVX;
-        g->target.nativeVectorWidth = 8;
-        g->target.vectorWidth = 8;
-    }
-#endif // LLVM 3.0
-    else
-        usage(1);
-}


 /** We take arguments from both the command line as well as from the
@@ -190,6 +163,16 @@ int main(int Argc, char *Argv[]) {
    llvm::sys::PrintStackTraceOnErrorSignal();
    llvm::PrettyStackTraceProgram X(argc, argv);

+    // initialize available LLVM targets
+    LLVMInitializeX86TargetInfo();
+    LLVMInitializeX86Target();
+    LLVMInitializeX86AsmPrinter();
+    LLVMInitializeX86AsmParser();
+    LLVMInitializeX86Disassembler();
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+    LLVMInitializeX86TargetMC();
+#endif
+
    char *file = NULL;
    const char *headerFileName = NULL;
    const char *outFileName = NULL;
@@ -198,23 +181,29 @@ int main(int Argc, char *Argv[]) {
    // as we're parsing below
    g = new Globals;

-    bool debugSet = false, optSet = false, targetSet = false;
+    bool debugSet = false, optSet = false;
    Module::OutputType ot = Module::Object;
+    bool generatePIC = false;
+    const char *arch = NULL, *cpu = NULL, *target = NULL;

    for (int i = 1; i < argc; ++i) {
        if (!strcmp(argv[i], "--help"))
            usage(0);
-#ifndef ISPC_IS_WINDOWS
-        else if (!strncmp(argv[i], "-D", 2)) {
+        else if (!strncmp(argv[i], "-D", 2))
            g->cppArgs.push_back(argv[i]);
-        }
-#endif // !ISPC_IS_WINDOWS
        else if (!strncmp(argv[i], "--arch=", 7))
-            g->target.arch = argv[i] + 7;
+            arch = argv[i] + 7;
        else if (!strncmp(argv[i], "--cpu=", 6))
-            g->target.cpu = argv[i] + 6;
-        else if (!strcmp(argv[i], "--fast-math"))
-            g->opt.fastMath = true;
+            cpu = argv[i] + 6;
+        else if (!strcmp(argv[i], "--fast-math")) {
+            fprintf(stderr, "--fast-math option has been renamed to --opt=fast-math!\n");
+            usage(1);
+        }
+        else if (!strcmp(argv[i], "--fast-masked-vload")) {
+            fprintf(stderr, "--fast-masked-vload option has been renamed to "
+                    "--opt=fast-masked-vload!\n");
+            usage(1);
+        }
        else if (!strcmp(argv[i], "--debug"))
            g->debugPrint = true;
        else if (!strcmp(argv[i], "--instrument"))
@@ -230,14 +219,12 @@ int main(int Argc, char *Argv[]) {
        else if (!strcmp(argv[i], "--emit-obj"))
            ot = Module::Object;
        else if (!strcmp(argv[i], "--target")) {
+            // FIXME: should remove this way of specifying the target...
            if (++i == argc) usage(1);
-            lDoTarget(argv[i]);
-            targetSet = true;
-        }
-        else if (!strncmp(argv[i], "--target=", 9)) {
-            const char *target = argv[i] + 9;
-            lDoTarget(target);
+            target = argv[i];
        }
+        else if (!strncmp(argv[i], "--target=", 9))
+            target = argv[i] + 9;
        else if (!strncmp(argv[i], "--math-lib=", 11)) {
            const char *lib = argv[i] + 11;
            if (!strcmp(lib, "default"))
@@ -253,7 +240,16 @@ int main(int Argc, char *Argv[]) {
        }
        else if (!strncmp(argv[i], "--opt=", 6)) {
            const char *opt = argv[i] + 6;
-            if (!strcmp(opt, "disable-blended-masked-stores"))
+            if (!strcmp(opt, "fast-math"))
+                g->opt.fastMath = true;
+            else if (!strcmp(opt, "fast-masked-vload"))
+                g->opt.fastMaskedVload = true;
+            else if (!strcmp(opt, "disable-loop-unroll"))
+                g->opt.unrollLoops = false;
+
+            // These are only used for performance tests of specific
+            // optimizations
+            else if (!strcmp(opt, "disable-blended-masked-stores"))
                g->opt.disableBlendedMaskedStores = true;
            else if (!strcmp(opt, "disable-coherent-control-flow"))
                g->opt.disableCoherentControlFlow = true;
@@ -278,14 +274,19 @@ int main(int Argc, char *Argv[]) {
        }
        else if (!strcmp(argv[i], "--wno-perf") || !strcmp(argv[i], "-wno-perf"))
            g->emitPerfWarnings = false;
-        else if (!strcmp(argv[i], "-o") || !strcmp(argv[i], "--outfile")) {
+        else if (!strcmp(argv[i], "-o")) {
            if (++i == argc) usage(1);
            outFileName = argv[i];
        }
-        else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--header-outfile")) {
+        else if (!strncmp(argv[i], "--outfile=", 10))
+            outFileName = argv[i] + strlen("--outfile=");
+        else if (!strcmp(argv[i], "-h")) {
            if (++i == argc) usage(1);
            headerFileName = argv[i];
        }
+        else if (!strncmp(argv[i], "--header-outfile=", 17)) {
+            headerFileName = argv[i] + strlen("--header-outfile=");
+        }
        else if (!strcmp(argv[i], "-O0")) {
            g->opt.level = 0;
            optSet = true;
@@ -301,6 +302,8 @@ int main(int Argc, char *Argv[]) {
            g->includeStdlib = false;
        else if (!strcmp(argv[i], "--nocpp"))
            g->runCPP = false;
+        else if (!strcmp(argv[i], "--pic"))
+            generatePIC = true;
        else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version")) {
            printf("Intel(r) SPMD Program Compiler (ispc) build %s (%s)\n", 
                   BUILD_DATE, BUILD_VERSION);
@@ -322,10 +325,8 @@ int main(int Argc, char *Argv[]) {
    if (debugSet && !optSet)
        g->opt.level = 0;

-    // Make SSE2 the default target on atom unless the target has been set
-    // explicitly.
-    if (!targetSet && (g->target.cpu == "atom"))
-        lDoTarget("sse2");
+    if (!Target::GetTarget(arch, cpu, target, generatePIC, &g->target))
+        usage(1);

    m = new Module(file);
    if (m->CompileFile() == 0) {
--- a/module.cpp
+++ b/module.cpp
@@ -72,23 +72,16 @@
 #include <llvm/Support/FormattedStream.h>
 #include <llvm/Support/FileUtilities.h>
 #include <llvm/Target/TargetMachine.h>
-#include <llvm/Target/TargetRegistry.h>
-#include <llvm/Target/TargetSelect.h>
 #include <llvm/Target/TargetOptions.h>
 #include <llvm/Target/TargetData.h>
-#include <llvm/Target/SubtargetFeature.h>
 #include <llvm/PassManager.h>
 #include <llvm/Analysis/Verifier.h>
 #include <llvm/Support/CFG.h>
 #include <clang/Frontend/CompilerInstance.h>
 #include <clang/Frontend/Utils.h>
 #include <clang/Basic/TargetInfo.h>
-#ifndef LLVM_2_8
 #include <llvm/Support/ToolOutputFile.h>
 #include <llvm/Support/Host.h>
-#else // !LLVM_2_8
-#include <llvm/System/Host.h>
-#endif // LLVM_2_8
 #include <llvm/Assembly/PrintModulePass.h>
 #include <llvm/Support/raw_ostream.h>
 #include <llvm/Bitcode/ReaderWriter.h>
@@ -107,14 +100,13 @@ Module::Module(const char *fn) {
    symbolTable = new SymbolTable;
    module = new llvm::Module(filename ? filename : "<stdin>", *g->ctx);

-#ifndef LLVM_2_8
+    module->setTargetTriple(g->target.GetTripleString());
+
    if (g->generateDebuggingSymbols)
        diBuilder = new llvm::DIBuilder(*module);
    else
        diBuilder = NULL;
-#endif // LLVM_2_8

-#ifndef LLVM_2_8
    // If we're generating debugging symbols, let the DIBuilder know that
    // we're starting a new compilation unit.
    if (diBuilder != NULL) {
@@ -140,7 +132,6 @@ Module::Module(const char *fn) {
                                         0 /* run time version */);
        }
    }
-#endif // LLVM_2_8
 }


@@ -154,6 +145,9 @@ extern void yy_delete_buffer(YY_BUFFER_STATE);

 int
 Module::CompileFile() {
+    if (g->opt.fastMath == true)
+        llvm::UnsafeFPMath = true;
+
    // FIXME: it'd be nice to do this in the Module constructor, but this
    // function ends up calling into routines that expect the global
    // variable 'm' to be initialized and available (which it isn't until
@@ -458,6 +452,10 @@ Module::AddGlobal(DeclSpecs *ds, Declarator *decl) {
    // declarations, typedefs, and global variables declarations /
    // definitions.  Figure out what we've got and take care of it.

+    if (ds == NULL || decl == NULL)
+        // Error happened earlier during parsing
+        return;
+
    if (decl->isFunction) {
        // function declaration
        const Type *t = decl->GetType(ds);
@@ -558,7 +556,6 @@ Module::AddGlobal(DeclSpecs *ds, Declarator *decl) {
                                                         decl->sym->name.c_str());
        m->symbolTable->AddVariable(decl->sym);

-#ifndef LLVM_2_8
        if (diBuilder && (ds->storageClass != SC_EXTERN)) {
            llvm::DIFile file = decl->pos.GetDIFile();
            diBuilder->createGlobalVariable(decl->sym->name, 
@@ -568,7 +565,6 @@ Module::AddGlobal(DeclSpecs *ds, Declarator *decl) {
                                            (ds->storageClass == SC_STATIC),
                                            decl->sym->storagePtr);
        }
-#endif // LLVM_2_8
    }
 }

@@ -606,6 +602,7 @@ lCopyInTaskParameter(int i, llvm::Value *structArgPtr, Declarator *decl,
    // memory
    llvm::Value *ptrval = ctx->LoadInst(ptr, NULL, sym->name.c_str());
    ctx->StoreInst(ptrval, sym->storagePtr);
+    ctx->EmitFunctionParameterDebugInfo(sym);
 }


@@ -662,6 +659,11 @@ lEmitFunctionCode(FunctionEmitContext *ctx, llvm::Function *function,
        // the code to free that memory, now that we've copied the
        // parameter values out of the structure.
        ctx->EmitFree(structParamPtr);
+#else
+        // We also do this for AVX... (See discussion in
+        // FunctionEmitContext::LaunchInst().)
+        if (g->target.isa == Target::AVX)
+            ctx->EmitFree(structParamPtr);
 #endif // ISPC_IS_WINDOWS
    }
    else {
@@ -700,8 +702,18 @@ lEmitFunctionCode(FunctionEmitContext *ctx, llvm::Function *function,

    // Finally, we can generate code for the function
    if (code != NULL) {
+        int costEstimate = code->EstimateCost();
        bool checkMask = (ft->isTask == true) || 
-            (function->hasFnAttr(llvm::Attribute::AlwaysInline) == false);
+            ((function->hasFnAttr(llvm::Attribute::AlwaysInline) == false) &&
+             costEstimate > CHECK_MASK_AT_FUNCTION_START_COST);
+        Debug(code->pos, "Estimated cost for function \"%s\" = %d\n", 
+              funSym->name.c_str(), costEstimate);
+        // If the body of the function is non-trivial, then we wrap the
+        // entire thing around a varying "cif (true)" test in order to reap
+        // the side-effect benefit of checking to see if the execution mask
+        // is all on and thence having a specialized code path for that
+        // case.  If this is a simple function, then this isn't worth the
+        // code bloat / overhead.
        if (checkMask) {
            bool allTrue[ISPC_MAX_NVEC];
            for (int i = 0; i < g->target.vectorWidth; ++i)
@@ -849,6 +861,11 @@ Module::AddFunction(DeclSpecs *ds, Declarator *decl, Stmt *code) {

 bool
 Module::WriteOutput(OutputType outputType, const char *outFileName) {
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+    if (diBuilder != NULL && outputType != Header)
+        diBuilder->finalize();
+#endif // LLVM_3_0
+
    // First, issue a warning if the output file suffix and the type of
    // file being created seem to mismatch.  This can help catch missing
    // command-line arguments specifying the output file type.
@@ -909,12 +926,7 @@ Module::WriteOutput(OutputType outputType, const char *outFileName) {
            return true;
        }
        else {
-#ifdef LLVM_2_8
-            fprintf(stderr, "Direct object file emission not supported in this build.\n");
-            return false;
-#else
            return writeObjectFileOrAssembly(outputType, outFileName);
-#endif // LLVM_2_8
        }
    }
 }
@@ -922,79 +934,7 @@ Module::WriteOutput(OutputType outputType, const char *outFileName) {

 bool
 Module::writeObjectFileOrAssembly(OutputType outputType, const char *outFileName) {
-    llvm::InitializeAllTargets();
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
-    llvm::InitializeAllTargetMCs();
-#endif
-    llvm::InitializeAllAsmPrinters();
-    llvm::InitializeAllAsmParsers();
-
-    llvm::Triple triple(module->getTargetTriple());
-    if (triple.getTriple().empty())
-        triple.setTriple(llvm::sys::getHostTriple());
-
-    const llvm::Target *target = NULL;
-    if (g->target.arch != "") {
-        // If the user specified a target architecture, see if it's a known
-        // one; print an error with the valid ones otherwise.
-        for (llvm::TargetRegistry::iterator iter = llvm::TargetRegistry::begin();
-             iter != llvm::TargetRegistry::end(); ++iter) {
-            if (g->target.arch == iter->getName()) {
-                target = &*iter;
-                break;
-            }
-        }
-        if (!target) {
-            fprintf(stderr, "Invalid target \"%s\"\nOptions: ", 
-                    g->target.arch.c_str());
-            llvm::TargetRegistry::iterator iter;
-            for (iter = llvm::TargetRegistry::begin();
-                 iter != llvm::TargetRegistry::end(); ++iter)
-                fprintf(stderr, "%s ", iter->getName());
-            fprintf(stderr, "\n");
-            return false;
-        }
-
-        llvm::Triple::ArchType archType = 
-            llvm::Triple::getArchTypeForLLVMName(g->target.arch);
-        if (archType != llvm::Triple::UnknownArch)
-            triple.setArch(archType);
-    }
-    else {
-        // Otherwise get the target either based on the host or the
-        // module's target, if it has been set there.
-        std::string error;
-        target = llvm::TargetRegistry::lookupTarget(triple.getTriple(), error);
-        if (!target) {
-            fprintf(stderr, "Unable to select target for module: %s\n", 
-                    error.c_str());
-            return false;
-        }
-    }
-
-    std::string featuresString;
-    llvm::TargetMachine *targetMachine = NULL;
-#if defined LLVM_3_0svn || defined LLVM_3_0
-    if (g->target.isa == Target::AVX)
-        featuresString = "+avx";
-    targetMachine = target->createTargetMachine(triple.getTriple(), g->target.cpu,
-                                                featuresString);
-#else
-    if (g->target.cpu.size()) {
-        llvm::SubtargetFeatures features;
-        features.setCPU(g->target.cpu);
-        featuresString = features.getString();
-    }
-
-    targetMachine = target->createTargetMachine(triple.getTriple(), 
-                                                featuresString);
-#endif
-    if (targetMachine == NULL) {
-        fprintf(stderr, "Unable to create target machine for target \"%s\"!",
-                triple.str().c_str());
-        return false;
-    }
-    targetMachine->setAsmVerbosityDefault(true);
+    llvm::TargetMachine *targetMachine = g->target.GetTargetMachine();

    // Figure out if we're generating object file or assembly output, and
    // set binary output for object files
@@ -1021,9 +961,8 @@ Module::writeObjectFileOrAssembly(OutputType outputType, const char *outFileName
        (g->opt.level > 0) ? llvm::CodeGenOpt::Aggressive : llvm::CodeGenOpt::None;

    if (targetMachine->addPassesToEmitFile(pm, fos, fileType, optLevel)) {
-        fprintf(stderr, "Fatal error adding passes to emit object file for "
-                "target %s!\n", triple.str().c_str());
-        return false;
+        fprintf(stderr, "Fatal error adding passes to emit object file!\n");
+        exit(1);
    }

    // Finally, run the passes to emit the object file/assembly
@@ -1189,6 +1128,12 @@ lEmitVectorTypedefs(const std::vector<const VectorType *> &types, FILE *file) {
    for (unsigned int i = 0; i < types.size(); ++i) {
        std::string baseDecl;
        const VectorType *vt = types[i]->GetAsNonConstType();
+        if (!vt->IsUniformType())
+            // Varying stuff shouldn't be visibile to / used by the
+            // application, so at least make it not simple to access it by
+            // not declaring the type here...
+            continue;
+
        int size = vt->GetElementCount();

        baseDecl = vt->GetBaseType()->GetCDeclaration("");
@@ -1361,6 +1306,7 @@ Module::writeHeader(const char *fn) {
    default:
        FATAL("Unhandled target in header emission");
    }
+    fprintf(f, "#define ISPC_TARGET_VECTOR_WIDTH %d\n", g->target.vectorWidth);

    fprintf(f, "#ifdef __cplusplus\nnamespace ispc {\n#endif // __cplusplus\n\n");

@@ -1397,14 +1343,6 @@ Module::writeHeader(const char *fn) {
    lEmitEnumDecls(exportedEnumTypes, f);
    lEmitStructDecls(exportedStructTypes, f);

-    // emit externs for globals
-    if (externGlobals.size() > 0) {
-        fprintf(f, "///////////////////////////////////////////////////////////////////////////\n");
-        fprintf(f, "// Globals declared \"extern\" from ispc code\n");
-        fprintf(f, "///////////////////////////////////////////////////////////////////////////\n");
-        lPrintExternGlobals(f, externGlobals);
-    }
-
    // emit function declarations for exported stuff...
    if (exportedFuncs.size() > 0) {
        fprintf(f, "\n");
@@ -1426,6 +1364,15 @@ Module::writeHeader(const char *fn) {
    // end namespace
    fprintf(f, "\n#ifdef __cplusplus\n}\n#endif // __cplusplus\n");

+    // and only now emit externs for globals, outside of the ispc namespace
+    if (externGlobals.size() > 0) {
+        fprintf(f, "\n");
+        fprintf(f, "///////////////////////////////////////////////////////////////////////////\n");
+        fprintf(f, "// Globals declared \"extern\" from ispc code\n");
+        fprintf(f, "///////////////////////////////////////////////////////////////////////////\n");
+        lPrintExternGlobals(f, externGlobals);
+    }
+
    // end guard
    fprintf(f, "\n#endif // %s\n", guard.c_str());

--- a/module.h
+++ b/module.h
@@ -91,11 +91,8 @@ public:
    /** llvm Module object into which globals and functions are added. */
    llvm::Module *module; 

-#ifndef LLVM_2_8
-    /** The diBuilder manages generating debugging information (only
-        supported in LLVM 2.9 and beyond...) */
+    /** The diBuilder manages generating debugging information */
    llvm::DIBuilder *diBuilder;
-#endif

    GatherBuffer *gatherBuffer;

--- a/opt.cpp
+++ b/opt.cpp
@@ -55,13 +55,12 @@
 #include <llvm/Instructions.h>
 #include <llvm/Intrinsics.h>
 #include <llvm/Constants.h>
-#ifndef LLVM_2_8
-    #include <llvm/Target/TargetLibraryInfo.h>
-    #ifdef LLVM_2_9
-        #include <llvm/Support/StandardPasses.h>
-    #else
-        #include <llvm/Support/PassManagerBuilder.h>
-    #endif // LLVM_2_9
+#include <llvm/Analysis/ConstantFolding.h>
+#include <llvm/Target/TargetLibraryInfo.h>
+#ifdef LLVM_2_9
+    #include <llvm/Support/StandardPasses.h>
+#else
+    #include <llvm/Transforms/IPO/PassManagerBuilder.h>
 #endif // LLVM_2_8
 #include <llvm/ADT/Triple.h>
 #include <llvm/Transforms/Scalar.h>
@@ -69,13 +68,18 @@
 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
 #include <llvm/Target/TargetOptions.h>
 #include <llvm/Target/TargetData.h>
+#include <llvm/Target/TargetMachine.h>
 #include <llvm/Analysis/Verifier.h>
 #include <llvm/Support/raw_ostream.h>
-#ifndef LLVM_2_8
 #include <llvm/Analysis/DIBuilder.h>
-#endif
 #include <llvm/Analysis/DebugInfo.h>
 #include <llvm/Support/Dwarf.h>
+#ifdef ISPC_IS_LINUX
+  #include <alloca.h>
+#elif defined(ISPC_IS_WINDOWS)
+  #include <malloc.h>
+  #define alloca _alloca
+#endif // ISPC_IS_WINDOWS

 static llvm::Pass *CreateIntrinsicsOptPass();
 static llvm::Pass *CreateGatherScatterFlattenPass();
@@ -178,19 +182,22 @@ Optimize(llvm::Module *module, int optLevel) {
    llvm::PassManager optPM;
    llvm::FunctionPassManager funcPM(module);

-#ifndef LLVM_2_8
    llvm::TargetLibraryInfo *targetLibraryInfo =
        new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple()));
    optPM.add(targetLibraryInfo);
-#endif
    optPM.add(new llvm::TargetData(module));

+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+    optPM.add(llvm::createIndVarSimplifyPass());
+#endif
+
    if (optLevel == 0) {
        // This is more or less the minimum set of optimizations that we
        // need to do to generate code that will actually run.  (We can't
        // run absolutely no optimizations, since the front-end needs us to
        // take the various __pseudo_* functions it has emitted and turn
        // them into something that can actually execute.
+        optPM.add(llvm::createPromoteMemoryToRegisterPass());
        optPM.add(CreateGatherScatterFlattenPass());
        optPM.add(CreateLowerGatherScatterPass());
        optPM.add(CreateLowerMaskedStorePass());
@@ -211,7 +218,6 @@ Optimize(llvm::Module *module, int optLevel) {
        // only later in the optimization process as things like constant
        // propagation have done their thing, and then when they do kick
        // in, they can often open up new opportunities for optimization...
-#ifndef LLVM_2_8
        llvm::PassRegistry *registry = llvm::PassRegistry::getPassRegistry();
        llvm::initializeCore(*registry);
        llvm::initializeScalarOpts(*registry);
@@ -222,7 +228,7 @@ Optimize(llvm::Module *module, int optLevel) {
        llvm::initializeInstCombine(*registry);
        llvm::initializeInstrumentation(*registry);
        llvm::initializeTarget(*registry);
-#endif
+
        // Early optimizations to try to reduce the total amount of code to
        // work with if we can
        optPM.add(CreateGatherScatterFlattenPass());
@@ -279,13 +285,11 @@ Optimize(llvm::Module *module, int optLevel) {
        optPM.add(llvm::createConstantPropagationPass());
        optPM.add(CreateIntrinsicsOptPass());

-#if defined(LLVM_2_8)
-        optPM.add(CreateIsCompileTimeConstantPass(true));
-#elif defined(LLVM_2_9)
+#if defined(LLVM_2_9)
        llvm::createStandardModulePasses(&optPM, 3, 
                                         false /* opt size */,
                                         true /* unit at a time */, 
-                                         false /* unroll loops */,
+                                         g->opt.unrollLoops,
                                         true /* simplify lib calls */,
                                         false /* may have exceptions */,
                                         llvm::createFunctionInliningPass());
@@ -300,7 +304,7 @@ Optimize(llvm::Module *module, int optLevel) {
        llvm::createStandardModulePasses(&optPM, 3, 
                                         false /* opt size */,
                                         true /* unit at a time */, 
-                                         false /* unroll loops */,
+                                         g->opt.unrollLoops,
                                         true /* simplify lib calls */,
                                         false /* may have exceptions */,
                                         llvm::createFunctionInliningPass());
@@ -309,6 +313,8 @@ Optimize(llvm::Module *module, int optLevel) {
        llvm::PassManagerBuilder builder;
        builder.OptLevel = 3;
        builder.Inliner = llvm::createFunctionInliningPass();
+        if (g->opt.unrollLoops == false)
+            builder.DisableUnrollLoops = true;
        builder.populateFunctionPassManager(funcPM);
        builder.populateModulePassManager(optPM);
        optPM.add(CreateIsCompileTimeConstantPass(true));
@@ -421,8 +427,11 @@ IntrinsicsOpt::IntrinsicsOpt()
    blendInstructions.push_back(BlendInstruction(
        llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse41_blendvps),
        0xf, 0, 1, 2));
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
    blendInstructions.push_back(BlendInstruction(
-        m->module->getFunction("llvm.x86.avx.blendvps"), 0xff, 0, 1, 2));
+        llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_blendv_ps_256),
+        0xff, 0, 1, 2));
+#endif
 }


@@ -469,8 +478,18 @@ lGetMask(llvm::Value *factor) {
    else if (llvm::isa<llvm::ConstantAggregateZero>(factor))
        return 0;
    else {
+#if 0
+        llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(factor);
+        if (ce != NULL) {
+            llvm::TargetMachine *targetMachine = g->target.GetTargetMachine();
+            const llvm::TargetData *td = targetMachine->getTargetData();
+            llvm::Constant *c = llvm::ConstantFoldConstantExpression(ce, td);
+            c->dump();
+            factor = c;
+        }
        // else we should be able to handle it above...
        assert(!llvm::isa<llvm::Constant>(factor));
+#endif
        return -1;
    }
 }
@@ -608,9 +627,10 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
                                          llvm::PointerType::get(returnType, 0), 
                                          "ptr2vec", callInst);
                lCopyMetadata(castPtr, callInst);
+                int align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
                llvm::Instruction *loadInst = 
                    new llvm::LoadInst(castPtr, "load", false /* not volatile */,
-                                       0 /* align */, (llvm::Instruction *)NULL);
+                                       align, (llvm::Instruction *)NULL);
                lCopyMetadata(loadInst, callInst);
                llvm::ReplaceInstWithInst(callInst, loadInst);
                modifiedAny = true;
@@ -630,17 +650,21 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
            }
            else if (mask == 0xff) {
                // all lanes storing, so replace with a regular store
-                llvm::Value *rvalue = callInst->getArgOperand(1);
+                llvm::Value *rvalue = callInst->getArgOperand(2);
                llvm::Type *storeType = rvalue->getType();
                llvm::Value *castPtr = 
                    new llvm::BitCastInst(callInst->getArgOperand(0),
                                          llvm::PointerType::get(storeType, 0), 
                                          "ptr2vec", callInst);
                lCopyMetadata(castPtr, callInst);
-                llvm::Instruction *storeInst = 
+
+                llvm::StoreInst *storeInst = 
                    new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
+                int align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
+                storeInst->setAlignment(align);
                lCopyMetadata(storeInst, callInst);
                llvm::ReplaceInstWithInst(callInst, storeInst);
+
                modifiedAny = true;
                goto restart;
            }
@@ -1416,15 +1440,12 @@ LowerMaskedStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
        llvm::Value *rvalue  = callInst->getArgOperand(1);
        llvm::Value *mask = callInst->getArgOperand(2);

-        // On SSE, we need to choose between doing the load + blend + store
-        // trick, or serializing the masked store.  On targets with a
-        // native masked store instruction, the implementations of
-        // __masked_store_blend_* should be the same as __masked_store_*,
-        // so this doesn't matter.  On SSE, blending is generally more
-        // efficient and is always safe to do on stack-allocated values.(?)
-        bool doBlend = lIsStackVariablePointer(lvalue);
-        if (g->target.isa == Target::SSE4 || g->target.isa == Target::SSE2)
-            doBlend |= !g->opt.disableBlendedMaskedStores;
+        // We need to choose between doing the load + blend + store trick,
+        // or serializing the masked store.  Even on targets with a native
+        // masked store instruction, this is preferable since it lets us
+        // keep values in registers rather than going out to the stack.
+        bool doBlend = (!g->opt.disableBlendedMaskedStores ||
+                        lIsStackVariablePointer(lvalue));

        // Generate the call to the appropriate masked store function and
        // replace the __pseudo_* one with it.
@@ -1502,8 +1523,8 @@ static void lPrintVector(const char *info, llvm::Value *elements[ISPC_MAX_NVEC])


 /** Given an LLVM vector in vec, return a 'scalarized' version of the
-    vector in the provided offsets[] array.  For example, if the vector
-    value passed in is:  
+    vector in the provided scalarizedVector[] array.  For example, if the
+    vector value passed in is:

    add <4 x i32> %a_smear, <4 x i32> <4, 8, 12, 16>,

@@ -1524,28 +1545,39 @@ static void lPrintVector(const char *info, llvm::Value *elements[ISPC_MAX_NVEC])
    @param vec               Vector to be scalarized
    @param scalarizedVector  Array in which to store the individual vector 
                             elements
+    @param vectorLength      Number of elements in the given vector. (The
+                             passed scalarizedVector array must be at least
+                             this length as well.)
    @returns                 True if the vector was successfully scalarized and
                             the values in offsets[] are valid; false otherwise
 */
 static bool
-lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC]) {
+lScalarizeVector(llvm::Value *vec, llvm::Value **scalarizedVector,
+                 int vectorLength) {
    // First initialize the values of scalarizedVector[] to NULL.
-    for (int i = 0; i < g->target.vectorWidth; ++i)
+    for (int i = 0; i < vectorLength; ++i)
        scalarizedVector[i] = NULL;
+    
+    // It may be ok for the vector to be an undef vector; these come up for
+    // example in shufflevector instructions.  As long as elements of the
+    // undef vector aren't referenced by the shuffle indices, this is fine.
+    if (llvm::isa<llvm::UndefValue>(vec))
+        return true;

    // ConstantVectors are easy; just pull out the individual constant
    // element values
    llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(vec);
    if (cv != NULL) {
-        for (int i = 0; i < g->target.vectorWidth; ++i)
+        for (int i = 0; i < vectorLength; ++i)
            scalarizedVector[i] = cv->getOperand(i);
        return true;
    }

    // It's also easy if it's just a vector of all zeros
-    llvm::ConstantAggregateZero *caz = llvm::dyn_cast<llvm::ConstantAggregateZero>(vec);
-    if (caz) {
-        for (int i = 0; i < g->target.vectorWidth; ++i)
+    llvm::ConstantAggregateZero *caz = 
+        llvm::dyn_cast<llvm::ConstantAggregateZero>(vec);
+    if (caz != NULL) {
+        for (int i = 0; i < vectorLength; ++i)
            scalarizedVector[i] = LLVMInt32(0);
        return true;
    }
@@ -1557,13 +1589,16 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
        // scalar values we return from here are synthesized with scalar
        // versions of the original vector binary operator
        llvm::Instruction::BinaryOps opcode = bo->getOpcode();
-        llvm::Value *v0[ISPC_MAX_NVEC], *v1[ISPC_MAX_NVEC];
+        llvm::Value **v0 = 
+            (llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));
+        llvm::Value **v1 = 
+            (llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));

-        if (!lScalarizeVector(bo->getOperand(0), v0) || 
-            !lScalarizeVector(bo->getOperand(1), v1))
+        if (!lScalarizeVector(bo->getOperand(0), v0, vectorLength) || 
+            !lScalarizeVector(bo->getOperand(1), v1, vectorLength))
            return false;

-        for (int i = 0; i < g->target.vectorWidth; ++i) {
+        for (int i = 0; i < vectorLength; ++i) {
            scalarizedVector[i] = 
                llvm::BinaryOperator::Create(opcode, v0[i], v1[i], "flat_bop", bo);
            lCopyMetadata(scalarizedVector[i], bo);
@@ -1588,7 +1623,7 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
        // vaue in scalarizedVector[] based on the value being inserted.
        while (ie != NULL) {
            uint64_t iOffset = lGetIntValue(ie->getOperand(2));
-            assert((int)iOffset < g->target.vectorWidth);
+            assert((int)iOffset < vectorLength);
            assert(scalarizedVector[iOffset] == NULL);

            scalarizedVector[iOffset] = ie->getOperand(1);
@@ -1602,15 +1637,17 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
    }

    llvm::CastInst *ci = llvm::dyn_cast<llvm::CastInst>(vec);
-    if (ci) {
+    if (ci != NULL) {
        // Casts are similar to BinaryOperators in that we attempt to
        // scalarize the vector being cast and if successful, we apply
        // equivalent scalar cast operators to each of the values in the
        // scalarized vector.
        llvm::Instruction::CastOps op = ci->getOpcode();

-        llvm::Value *scalarizedTarget[ISPC_MAX_NVEC];
-        if (!lScalarizeVector(ci->getOperand(0), scalarizedTarget))
+        llvm::Value **scalarizedTarget = 
+            (llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));
+        if (!lScalarizeVector(ci->getOperand(0), scalarizedTarget,
+                              vectorLength))
            return false;

        LLVM_TYPE_CONST llvm::Type *destType = ci->getDestTy();
@@ -1619,7 +1656,7 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
        assert(vectorDestType != NULL);
        LLVM_TYPE_CONST llvm::Type *elementType = vectorDestType->getElementType();

-        for (int i = 0; i < g->target.vectorWidth; ++i) {
+        for (int i = 0; i < vectorLength; ++i) {
            scalarizedVector[i] = 
                llvm::CastInst::Create(op, scalarizedTarget[i], elementType,
                                       "cast", ci);
@@ -1629,16 +1666,11 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
    }

    llvm::ShuffleVectorInst *svi = llvm::dyn_cast<llvm::ShuffleVectorInst>(vec);
-    if (svi) {
-        // Note that the code for shufflevector instructions is untested.
-        // (We haven't yet had a case where it needs to run).  Therefore,
-        // an assert at the bottom of this routien will hit the first time
-        // it runs as a reminder that this needs to be tested further.
-
+    if (svi != NULL) {
        LLVM_TYPE_CONST llvm::VectorType *svInstType = 
            llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(svi->getType());
        assert(svInstType != NULL);
-        assert((int)svInstType->getNumElements() == g->target.vectorWidth);
+        assert((int)svInstType->getNumElements() == vectorLength);

        // Scalarize the two vectors being shuffled.  First figure out how
        // big they are.
@@ -1653,58 +1685,90 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
        int n0 = vectorType0->getNumElements();
        int n1 = vectorType1->getNumElements();

-        // FIXME: It's actually totally legitimate for these two to have
-        // different sizes; the final result just needs to have the native
-        // vector width.  To handle this, not only do we need to
-        // potentially dynamically allocate space for the arrays passed
-        // into lScalarizeVector, but we need to change the rest of its
-        // implementation to not key off g->target.vectorWidth everywhere
-        // to get the sizes of the arrays to iterate over, etc.
-        assert(n0 == g->target.vectorWidth && n1 == g->target.vectorWidth);
-
        // Go ahead and scalarize the two input vectors now.
-        // FIXME: it's ok if some or all of the values of these two vectors
-        // have undef values, so long as we don't try to access undef
-        // values with the vector indices provided to the instruction.
-        // Should fix lScalarizeVector so that it doesn't return false in
-        // this case and just leaves the elements of the arrays with undef
-        // values as NULL.
-        llvm::Value *v0[ISPC_MAX_NVEC], *v1[ISPC_MAX_NVEC];
-        if (!lScalarizeVector(svi->getOperand(0), v0) ||
-            !lScalarizeVector(svi->getOperand(1), v1))
+        llvm::Value **v0 = (llvm::Value **)alloca(n0 * sizeof(llvm::Value *));
+        llvm::Value **v1 = (llvm::Value **)alloca(n1 * sizeof(llvm::Value *));
+
+        if (!lScalarizeVector(svi->getOperand(0), v0, n0) ||
+            !lScalarizeVector(svi->getOperand(1), v1, n1))
            return false;

-        llvm::ConstantVector *shuffleIndicesVector = 
-            llvm::dyn_cast<llvm::ConstantVector>(svi->getOperand(2));
-        // I think this has to be a ConstantVector.  If this ever hits,
-        // we'll dig into what we got instead and figure out how to handle
-        // that...
-        assert(shuffleIndicesVector != NULL);
-
-        // Get the integer indices for each element of the returned vector
-        llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> shuffleIndices;
-        shuffleIndicesVector->getVectorElements(shuffleIndices);
-        assert((int)shuffleIndices.size() == g->target.vectorWidth);
-
-        // And loop over the indices, setting the i'th element of the
-        // result vector with the source vector element that corresponds to
-        // the i'th shuffle index value.
-        for (unsigned int i = 0; i < shuffleIndices.size(); ++i) {
-            if (!llvm::isa<llvm::ConstantInt>(shuffleIndices[i]))
-                // I'm not sure when this case would ever happen, though..
-                return false;
-            int offset = (int)lGetIntValue(shuffleIndices[i]);
-            assert(offset >= 0 && offset < n0+n1);
-
-            if (offset < n0)
-                // Offsets from 0 to n0-1 index into the first vector
-                scalarizedVector[i] = v0[offset];
-            else
-                // And offsets from n0 to (n0+n1-1) index into the second
-                // vector
-                scalarizedVector[i] = v1[offset - n0];
+        llvm::ConstantAggregateZero *caz = 
+            llvm::dyn_cast<llvm::ConstantAggregateZero>(svi->getOperand(2));
+        if (caz != NULL) {
+            for (int i = 0; i < vectorLength; ++i)
+                scalarizedVector[i] = v0[0];
+        }
+        else {
+            llvm::ConstantVector *shuffleIndicesVector = 
+                llvm::dyn_cast<llvm::ConstantVector>(svi->getOperand(2));
+            // I think this has to be a ConstantVector.  If this ever hits,
+            // we'll dig into what we got instead and figure out how to handle
+            // that...
+            assert(shuffleIndicesVector != NULL);
+
+            // Get the integer indices for each element of the returned vector
+            llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> shuffleIndices;
+            shuffleIndicesVector->getVectorElements(shuffleIndices);
+            assert((int)shuffleIndices.size() == vectorLength);
+
+            // And loop over the indices, setting the i'th element of the
+            // result vector with the source vector element that corresponds to
+            // the i'th shuffle index value.
+            for (unsigned int i = 0; i < shuffleIndices.size(); ++i) {
+                // Non-ConstantInt indices (e.g. undef) aren't handled; assert if we see one.
+                assert(llvm::isa<llvm::ConstantInt>(shuffleIndices[i]));
+
+                int offset = (int)lGetIntValue(shuffleIndices[i]);
+                assert(offset >= 0 && offset < n0+n1);
+
+                if (offset < n0)
+                    // Offsets from 0 to n0-1 index into the first vector
+                    scalarizedVector[i] = v0[offset];
+                else
+                    // And offsets from n0 to (n0+n1-1) index into the second
+                    // vector
+                    scalarizedVector[i] = v1[offset - n0];
+            }
+        }
+        return true;
+    }
+
+    llvm::LoadInst *li = llvm::dyn_cast<llvm::LoadInst>(vec);
+    if (li != NULL) {
+        llvm::Value *baseAddr = li->getOperand(0);
+        llvm::Value *baseInt = new llvm::PtrToIntInst(baseAddr, LLVMTypes::Int64Type,
+                                                      "base2int", li);
+        lCopyMetadata(baseInt, li);
+
+        LLVM_TYPE_CONST llvm::PointerType *ptrType = 
+            llvm::dyn_cast<llvm::PointerType>(baseAddr->getType());
+        assert(ptrType != NULL);
+        LLVM_TYPE_CONST llvm::VectorType *vecType = 
+            llvm::dyn_cast<llvm::VectorType>(ptrType->getElementType());
+        assert(vecType != NULL);
+        LLVM_TYPE_CONST llvm::Type *elementType = vecType->getElementType();
+        uint64_t elementSize;
+        bool sizeKnown = lSizeOfIfKnown(elementType, &elementSize);
+        assert(sizeKnown == true);
+
+        LLVM_TYPE_CONST llvm::Type *eltPtrType = llvm::PointerType::get(elementType, 0);
+
+        for (int i = 0; i < vectorLength; ++i) {
+            llvm::Value *intPtrOffset = 
+                llvm::BinaryOperator::Create(llvm::Instruction::Add, baseInt,
+                                             LLVMInt64(i * elementSize), "baseoffset",
+                                             li);
+            lCopyMetadata(intPtrOffset, li);
+            llvm::Value *scalarLoadPtr = 
+                new llvm::IntToPtrInst(intPtrOffset, eltPtrType, "int2ptr", li);
+            lCopyMetadata(scalarLoadPtr, li);
+
+            llvm::Instruction *scalarLoad = 
+                new llvm::LoadInst(scalarLoadPtr, "loadelt", li);
+            lCopyMetadata(scalarLoad, li);
+            scalarizedVector[i] = scalarLoad;
        }
-        FATAL("the above code is untested so far; check now that it's actually running");
        return true;
    }

@@ -2116,11 +2180,18 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
        if (ce && ce->getOpcode() == llvm::Instruction::BitCast)
            base = ce->getOperand(0);

-        // Try to out the offsets; the i'th element of the offsetElements
-        // array should be an i32 with the value of the offset for the i'th
-        // vector lane.  This may fail; if so, just give up.
+        // Try to find out the offsets; the i'th element of the
+        // offsetElements array should be an i32 with the value of the
+        // offset for the i'th vector lane.  This may fail; if so, just
+        // give up.
+        llvm::Value *vecValue = callInst->getArgOperand(1);
+        LLVM_TYPE_CONST llvm::VectorType *vt = 
+            llvm::dyn_cast<llvm::VectorType>(vecValue->getType());
+        assert(vt != NULL);
+        int vecLength = vt->getNumElements();
+        assert(vecLength == g->target.vectorWidth);
        llvm::Value *offsetElements[ISPC_MAX_NVEC];
-        if (!lScalarizeVector(callInst->getArgOperand(1), offsetElements))
+        if (!lScalarizeVector(vecValue, offsetElements, vecLength))
            continue;

        llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 2 : 3);
@@ -2497,7 +2568,7 @@ llvm::RegisterPass<MakeInternalFuncsStaticPass>
 bool
 MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
    const char *names[] = {
-        "__do_print",
+        "__do_print", "__fast_masked_vload",
        "__gather_base_offsets_i8", "__gather_base_offsets_i16",
        "__gather_base_offsets_i32", "__gather_base_offsets_i64",
        "__gather_elt_8", "__gather_elt_16", 
--- a/run_tests.py
+++ b/run_tests.py
@@ -0,0 +1,215 @@
+#!/usr/bin/python
+
+# test-running driver for ispc
+
+# TODO: windows support (mostly should be calling CL.exe rather than gcc
+#   for static linking?)
+
+from optparse import OptionParser
+import multiprocessing
+from ctypes import c_int
+import os
+import sys
+import glob
+import re
+import signal
+import random
+import string
+import mutex
+import subprocess
+
+parser = OptionParser()
+parser.add_option("-r", "--random-shuffle", dest="random", help="Randomly order tests",
+                  default=False, action="store_true")
+parser.add_option("-s", "--static-exe", dest="static_exe", 
+                  help="Create and run a regular executable for each test (rather than using the LLVM JIT).",
+                  default=False, action="store_true")
+parser.add_option('-t', '--target', dest='target',
+                  help='Set compilation target (sse2, sse4, sse4x2, avx, avx-x2)',
+                  default="sse4")
+parser.add_option('-a', '--arch', dest='arch',
+                  help='Set architecture (x86, x86-64)',
+                  default="x86-64")
+parser.add_option('-o', '--no-opt', dest='no_opt', help='Disable optimization',
+                  default=False, action="store_true")
+
+(options, args) = parser.parse_args()
+
+# if no specific test files are specified, run all of the tests in tests/
+# and failing_tests/
+if len(args) == 0:
+    files = glob.glob("tests/*ispc") + glob.glob("failing_tests/*ispc")
+else:
+    files = args
+
+# randomly shuffle the tests if asked to do so
+if (options.random):
+    random.seed()
+    random.shuffle(files)
+
+# counters: the total number of tests, and how many have finished so far
+total_tests = 0
+finished_tests_counter = multiprocessing.Value(c_int)
+
+# We'd like to use the Lock class from the multiprocessing package to
+# serialize accesses to finished_tests_counter.  Unfortunately, the version of
+# python that ships with OSX 10.5 has this bug:
+# http://bugs.python.org/issue5261.  Therefore, we use the (deprecated but
+# still available) mutex class.
+#finished_tests_counter_lock = multiprocessing.Lock()
+finished_tests_mutex = mutex.mutex()
+
+# utility routine to print an update on the number of tests that have been
+# finished.  Must be called with the mutex held; it unlocks the mutex when done.
+def update_progress(fn):
+    # record one more finished test and redraw the single-line status display
+    finished_tests_counter.value += 1
+    status = " Done %d / %d [%s]" % (finished_tests_counter.value, total_tests, fn)
+    # pad with 30 spaces to wipe leftovers of a longer previous status, and end
+    # with '\r' so the next status overwrites this one
+    sys.stdout.write(status + (' ' * 30) + '\r')
+    sys.stdout.flush()
+    # release the mutex that was acquired in order to call us
+    finished_tests_mutex.unlock()
+
+fnull = open(os.devnull, 'w')
+
+# run the commands in cmd_list in order, stopping at the first one that fails;
+# returns True if the pass/fail outcome is not what expect_failure predicted
+def run_cmds(cmd_list, filename, expect_failure):
+    for cmd in cmd_list:
+        if expect_failure:
+            status = subprocess.call(cmd, shell = True, stdout = fnull, stderr = fnull)
+        else:
+            status = os.system(cmd)
+        failed = (status != 0)
+        if failed:
+            break
+    surprise = (failed != bool(expect_failure))
+    if surprise:
+        print "Test %s %s                 " % (filename, "unexpectedly passed" if expect_failure else "failed")
+    return surprise
+
+
+# Worker loop: pull test filenames from the given queue and run them until the
+# 'STOP' sentinel arrives, then exit with the number of surprising results.
+# One copy of this function runs in each worker process.
+def run_tasks_from_queue(queue):
+    error_count = 0
+    while True:
+        filename = queue.get()
+        if (filename == 'STOP'):
+            sys.exit(error_count)
+
+        # do we expect this test to fail?
+        should_fail = (filename.find("failing_") != -1)
+
+        if options.static_exe == True:
+            # if the user wants us to build a static executable to run for
+            # this test, we need to figure out the signature of the test
+            # function that this test has.
+            sig2def = { "f_v(" : 0, "f_f(" : 1, "f_fu(" : 2, "f_fi(" : 3,
+                        "f_du(" : 4, "f_duf(" : 5, "f_di(" : 6 }
+            file = open(filename, 'r')
+            match = -1
+            for line in file:
+                # look for lines with 'export'...
+                if line.find("export") == -1:
+                    continue
+                # one of them should have a function with one of the
+                # declarations in sig2def
+                for pattern, ident in sig2def.items():
+                    if line.find(pattern) != -1:
+                        match = ident
+                        break
+            file.close()
+            if match == -1:
+                print "Fatal error: unable to find function signature in test %s" % filename
+                error_count += 1
+            else:
+                obj_name = "%s.o" % filename
+                exe_name = "%s.run" % filename
+                ispc_cmd = "ispc --woff %s -o %s --arch=%s --target=%s" % \
+                    (filename, obj_name, options.arch, options.target)
+                if options.no_opt:
+                    ispc_cmd += " -O0"
+                if options.arch == 'x86':
+                    gcc_arch = '-m32'
+                else:
+                    gcc_arch = '-m64'
+                gcc_cmd = "g++ -Wl,-no_pie %s test_static.cpp -DTEST_SIG=%d %s.o -o %s" % \
+                    (gcc_arch, match, filename, exe_name)
+                if should_fail:
+                    gcc_cmd += " -DEXPECT_FAILURE"
+
+                # compile the ispc code, make the executable, and run it...
+                error_count += run_cmds([ispc_cmd, gcc_cmd, exe_name], filename, should_fail)
+
+                # clean up; either file may be missing if an earlier step failed
+                for tmp_name in (exe_name, obj_name):
+                    try:
+                        os.unlink(tmp_name)
+                    except OSError:
+                        pass
+        else:
+            # otherwise we'll use ispc_test + the LLVM JIT to run the test
+            bitcode_file = "%s.bc" % filename
+            compile_cmd = "ispc --woff --emit-llvm %s --target=%s -o %s" % \
+                (filename, options.target, bitcode_file)
+            if options.no_opt:
+                compile_cmd += " -O0"
+            test_cmd = "ispc_test %s" % bitcode_file
+
+            error_count += run_cmds([compile_cmd, test_cmd], filename, should_fail)
+
+            try:
+                os.unlink(bitcode_file)
+            except OSError:
+                pass
+
+        # If not for http://bugs.python.org/issue5261 on OSX, we'd like to do this:
+        #with finished_tests_counter_lock:
+            #update_progress(filename)
+        # but instead we do this...
+        finished_tests_mutex.lock(update_progress, filename)
+
+
+task_threads = []
+# SIGINT handler: terminate every worker in task_threads, then exit with status 1
+def sigint(signum, frame):
+    for worker in task_threads:
+        worker.terminate()
+    sys.exit(1)
+
+if __name__ == '__main__':
+    nthreads = multiprocessing.cpu_count()
+    total_tests = len(files)
+    print "Found %d CPUs. Running %d tests." % (nthreads, total_tests)
+
+    # put each of the test filenames into a queue
+    q = multiprocessing.Queue()
+    for fn in files:
+        q.put(fn)
+    for x in range(nthreads):
+        q.put('STOP')
+
+    # need to catch sigint so that we can terminate all of the tasks if
+    # we're interrupted
+    signal.signal(signal.SIGINT, sigint)
+
+    # launch jobs to run tests
+    for x in range(nthreads):
+        t = multiprocessing.Process(target=run_tasks_from_queue, args=(q,))
+        task_threads.append(t)
+        t.start()
+
+    # wait for them all to finish, then exit with the failure count (0 if all is
+    # ok), capped at 255 since exit statuses only carry 8 bits (256 would read as 0)
+    error_count = 0
+    for t in task_threads:
+        t.join()
+        error_count += t.exitcode
+    print
+    if error_count > 0:
+        print "%d / %d tests FAILED!" % (error_count, total_tests)
+    sys.exit(min(error_count, 255))
--- a/run_tests.sh
+++ b/run_tests.sh
@@ -23,6 +23,12 @@ EOF
    esac
 done

+ISPC_ARCH=x86-64
+if [[ $OS == "Windows_NT" ]]; then
+  ISPC_ARCH=x86
+fi
+ISPC_ARGS="--target=$target --arch=$ISPC_ARCH -O2 --woff"
+
 shift $(( $OPTIND - 1 ))
 if [[ "$1" > 0 ]]; then
    while [[ "$1" > 0 ]]; do
@@ -31,7 +37,7 @@ if [[ "$1" > 0 ]]; then
        echo Running test $i

        bc=${i%%ispc}bc
-        ispc -O2 $i -woff -o $bc --emit-llvm --target=$target
+        ispc $ISPC_ARGS $i -o $bc --emit-llvm
        if [[ $? != 0 ]]; then
            surprises=1
            echo Test $i FAILED ispc compile
@@ -55,7 +61,7 @@ else
        fi
        (( counter++ ))
        bc=${i%%ispc}bc
-        ispc -O2 $i -woff -o $bc --emit-llvm --target=$target
+        ispc $ISPC_ARGS $i -o $bc --emit-llvm 
        if [[ $? != 0 ]]; then
            surprises=1
            echo Test $i FAILED ispc compile
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -315,6 +315,39 @@ static inline uniform int lanemask() {
    return __movmsk(__mask);
 }

+///////////////////////////////////////////////////////////////////////////
+// Prefetching
+
+#define PREFETCHES(NAME, TYPE)                                  \
+static inline void prefetch_l1(const reference TYPE ptr) {      \
+    __prefetch_read_1_##NAME##_refsconst(ptr);                  \
+}                                                               \
+static inline void prefetch_l2(const reference TYPE ptr) {      \
+    __prefetch_read_2_##NAME##_refsconst(ptr);                  \
+}                                                               \
+static inline void prefetch_l3(const reference TYPE ptr) {      \
+    __prefetch_read_3_##NAME##_refsconst(ptr);                  \
+}                                                               \
+ static inline void prefetch_nt(const reference TYPE ptr) {     \
+     __prefetch_read_nt_##NAME##_refsconst(ptr);                \
+}
+
+PREFETCHES(uniform_int8, uniform int8)
+PREFETCHES(uniform_int16, uniform int16)
+PREFETCHES(uniform_int32, uniform int32)
+PREFETCHES(uniform_int64, uniform int64)
+PREFETCHES(uniform_float, uniform float)
+PREFETCHES(uniform_double, uniform double)
+
+PREFETCHES(varying_int8, int8)
+PREFETCHES(varying_int16, int16)
+PREFETCHES(varying_int32, int32)
+PREFETCHES(varying_int64, int64)
+PREFETCHES(varying_float, float)
+PREFETCHES(varying_double, double)
+
+#undef PREFETCHES
+
 ///////////////////////////////////////////////////////////////////////////
 // Horizontal ops / reductions

@@ -438,6 +471,78 @@ static inline uniform unsigned int64 reduce_max(unsigned int64 v) {
    return __reduce_max_uint64(__mask ? v : 0);
 }

+#define REDUCE_EQUAL(TYPE, FUNCTYPE)                               \
+static inline uniform bool reduce_equal(TYPE v) {                  \
+    uniform TYPE unusedValue;                                      \
+    return __reduce_equal_##FUNCTYPE(v, unusedValue, (int32)__mask); \
+}                                                                  \
+static inline uniform bool reduce_equal(TYPE v, reference uniform TYPE value) { \
+    return __reduce_equal_##FUNCTYPE(v, value, (int32)__mask);       \
+}
+
+REDUCE_EQUAL(int32, int32)
+REDUCE_EQUAL(unsigned int32, int32)
+REDUCE_EQUAL(float, float)
+REDUCE_EQUAL(int64, int64)
+REDUCE_EQUAL(unsigned int64, int64)
+REDUCE_EQUAL(double, double)
+
+static int32 exclusive_scan_add(int32 v) {
+    return __exclusive_scan_add_i32(v, (int32)__mask);
+}
+
+static unsigned int32 exclusive_scan_add(unsigned int32 v) {
+    return __exclusive_scan_add_i32(v, __mask);
+}
+
+static float exclusive_scan_add(float v) {
+    return __exclusive_scan_add_float(v, __mask);
+}
+
+static int64 exclusive_scan_add(int64 v) {
+    return __exclusive_scan_add_i64(v, (int32)__mask);
+}
+
+static unsigned int64 exclusive_scan_add(unsigned int64 v) {
+    return __exclusive_scan_add_i64(v, __mask);
+}
+
+static double exclusive_scan_add(double v) {
+    return __exclusive_scan_add_double(v, __mask);
+}
+
+static int32 exclusive_scan_and(int32 v) {
+    return __exclusive_scan_and_i32(v, (int32)__mask);
+}
+
+static unsigned int32 exclusive_scan_and(unsigned int32 v) {
+    return __exclusive_scan_and_i32(v, __mask);
+}
+
+static int64 exclusive_scan_and(int64 v) {
+    return __exclusive_scan_and_i64(v, (int32)__mask);
+}
+
+static unsigned int64 exclusive_scan_and(unsigned int64 v) {
+    return __exclusive_scan_and_i64(v, __mask);
+}
+
+static int32 exclusive_scan_or(int32 v) {
+    return __exclusive_scan_or_i32(v, (int32)__mask);
+}
+
+static unsigned int32 exclusive_scan_or(unsigned int32 v) {
+    return __exclusive_scan_or_i32(v, __mask);
+}
+
+static int64 exclusive_scan_or(int64 v) {
+    return __exclusive_scan_or_i64(v, (int32)__mask);
+}
+
+static unsigned int64 exclusive_scan_or(unsigned int64 v) {
+    return __exclusive_scan_or_i64(v, __mask);
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // packed load, store

@@ -470,57 +575,71 @@ static inline void memory_barrier() {
    __memory_barrier();
 }

-#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB)                                 \
+#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE)                        \
 static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \
    memory_barrier();                                                   \
-    TA ret = __atomic_##OPB##_##TB##_global(ref, value, __mask);  \
+    TA ret = __atomic_##OPB##_##TB##_global(ref, value, (MASKTYPE)__mask); \
    memory_barrier();                                                   \
    return ret;                                                         \
 }

-DEFINE_ATOMIC_OP(int32,int32,add,add)
-DEFINE_ATOMIC_OP(int32,int32,subtract,sub)
-DEFINE_ATOMIC_OP(int32,int32,min,min)
-DEFINE_ATOMIC_OP(int32,int32,max,max)
-DEFINE_ATOMIC_OP(int32,int32,and,and)
-DEFINE_ATOMIC_OP(int32,int32,or,or)
-DEFINE_ATOMIC_OP(int32,int32,xor,xor)
-DEFINE_ATOMIC_OP(int32,int32,swap,swap)
+#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB)                          \
+static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \
+    uniform TA oneval = reduce_##OPA(value);                            \
+    TA ret;                                                             \
+    if (lanemask() != 0) {                                              \
+        memory_barrier();                                               \
+        ret = __atomic_##OPB##_##TB##_global(ref, oneval, __mask);      \
+        memory_barrier();                                               \
+    }                                                                   \
+    return ret;                                                         \
+}
+
+DEFINE_ATOMIC_OP(int32,int32,add,add,int32)
+DEFINE_ATOMIC_OP(int32,int32,subtract,sub,int32)
+DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min)
+DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max)
+DEFINE_ATOMIC_OP(int32,int32,and,and,int32)
+DEFINE_ATOMIC_OP(int32,int32,or,or,int32)
+DEFINE_ATOMIC_OP(int32,int32,xor,xor,int32)
+DEFINE_ATOMIC_OP(int32,int32,swap,swap,int32)

 // For everything but atomic min and max, we can use the same
 // implementations for unsigned as for signed.
-DEFINE_ATOMIC_OP(unsigned int32,int32,add,add)
-DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub)
-DEFINE_ATOMIC_OP(unsigned int32,uint32,min,umin)
-DEFINE_ATOMIC_OP(unsigned int32,uint32,max,umax)
-DEFINE_ATOMIC_OP(unsigned int32,int32,and,and)
-DEFINE_ATOMIC_OP(unsigned int32,int32,or,or)
-DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor)
-DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap)
+DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,int32)
+DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,int32)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax)
+DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,int32)
+DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,int32)
+DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,int32)
+DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,int32)

-DEFINE_ATOMIC_OP(float,float,swap,swap)
+DEFINE_ATOMIC_OP(float,float,swap,swap,int32)

-DEFINE_ATOMIC_OP(int64,int64,add,add)
-DEFINE_ATOMIC_OP(int64,int64,subtract,sub)
-DEFINE_ATOMIC_OP(int64,int64,min,min)
-DEFINE_ATOMIC_OP(int64,int64,max,max)
-DEFINE_ATOMIC_OP(int64,int64,and,and)
-DEFINE_ATOMIC_OP(int64,int64,or,or)
-DEFINE_ATOMIC_OP(int64,int64,xor,xor)
-DEFINE_ATOMIC_OP(int64,int64,swap,swap)
+DEFINE_ATOMIC_OP(int64,int64,add,add,int64)
+DEFINE_ATOMIC_OP(int64,int64,subtract,sub,int64)
+DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min)
+DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max)
+DEFINE_ATOMIC_OP(int64,int64,and,and,int64)
+DEFINE_ATOMIC_OP(int64,int64,or,or,int64)
+DEFINE_ATOMIC_OP(int64,int64,xor,xor,int64)
+DEFINE_ATOMIC_OP(int64,int64,swap,swap,int32)

 // For everything but atomic min and max, we can use the same
 // implementations for unsigned as for signed.
-DEFINE_ATOMIC_OP(unsigned int64,int64,add,add)
-DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub)
-DEFINE_ATOMIC_OP(unsigned int64,uint64,min,umin)
-DEFINE_ATOMIC_OP(unsigned int64,uint64,max,umax)
-DEFINE_ATOMIC_OP(unsigned int64,int64,and,and)
-DEFINE_ATOMIC_OP(unsigned int64,int64,or,or)
-DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor)
-DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap)
+DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,int64)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax)
+DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,int64)
+DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,int32)

-DEFINE_ATOMIC_OP(double,double,swap,swap)
+DEFINE_ATOMIC_OP(double,double,swap,swap,int32)
+
+#undef DEFINE_ATOMIC_OP

 #define ATOMIC_DECL_CMPXCHG(TA, TB)                                        \
 static inline TA atomic_compare_exchange_global(                           \
@@ -538,6 +657,8 @@ ATOMIC_DECL_CMPXCHG(int64, int64)
 ATOMIC_DECL_CMPXCHG(unsigned int64, int64)
 ATOMIC_DECL_CMPXCHG(double, double)

+#undef ATOMIC_DECL_CMPXCHG
+
 ///////////////////////////////////////////////////////////////////////////
 // Floating-Point Math

@@ -2600,6 +2721,80 @@ static inline int16 float_to_half(float f) {
 }


+// Fast half -> float: repositions the sign/exponent/mantissa fields and rebiases the exponent.
+static inline uniform float half_to_float_fast(uniform unsigned int16 h) {
+    uniform unsigned int32 hs = h & (int32)0x8000u;  // Pick off sign bit
+    uniform unsigned int32 he = h & (int32)0x7C00u;  // Pick off exponent bits
+    uniform unsigned int32 hm = h & (int32)0x03FFu;  // Pick off mantissa bits
+    // NOTE: exponent fields of 0 and 31 are not special-cased; the rebias is unconditional.
+    // Sign: shift from bit 15 up to bit 31
+    uniform unsigned int32 xs = ((unsigned int32) hs) << 16; 
+    // Exponent: unbias the halfp, then bias the single
+    uniform int32 xes = ((int32) (he >> 10)) - 15 + 127; 
+    // Exponent: move into place above the 23 mantissa bits
+    uniform unsigned int32 xe = (unsigned int32) (xes << 23);
+    // Mantissa: shift the 10 bits up to the top of the 23-bit field
+    uniform unsigned int32 xm = ((unsigned int32) hm) << 13; 
+    return floatbits(xs | xe | xm);
+}
+
+// Fast half -> float: repositions the sign/exponent/mantissa fields and rebiases the exponent.
+static inline float half_to_float_fast(unsigned int16 h) {
+    unsigned int32 hs = h & (int32)0x8000u;  // Pick off sign bit
+    unsigned int32 he = h & (int32)0x7C00u;  // Pick off exponent bits
+    unsigned int32 hm = h & (int32)0x03FFu;  // Pick off mantissa bits
+    // NOTE: exponent fields of 0 and 31 are not special-cased; the rebias is unconditional.
+    // Sign: shift from bit 15 up to bit 31
+    unsigned int32 xs = ((unsigned int32) hs) << 16; 
+    // Exponent: unbias the halfp, then bias the single
+    int32 xes = ((int32) (he >> 10)) - 15 + 127; 
+    // Exponent: move into place above the 23 mantissa bits
+    unsigned int32 xe = (unsigned int32) (xes << 23);
+    // Mantissa: shift the 10 bits up to the top of the 23-bit field
+    unsigned int32 xm = ((unsigned int32) hm) << 13; 
+    return floatbits(xs | xe | xm);
+}
+
+// Fast float -> half: rebiases the exponent and keeps the top 10 mantissa bits, with rounding.
+static inline uniform int16 float_to_half_fast(uniform float f) {
+    uniform int32 x = intbits(f);
+    uniform unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
+    uniform unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
+    uniform unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
+    uniform unsigned int32 hs = (xs >> 16); // Sign bit
+    // Exponent: unbias the single, then bias the halfp (the result is not range-checked)
+    uniform int32 hes = ((int)(xe >> 23)) - 127 + 15; 
+    uniform unsigned int32 he = (hes << 10); // Exponent
+    uniform int32 hm = (xm >> 13); // Mantissa
+    uniform int32 ret = (hs | he | hm);
+    // Bit 12 is the highest mantissa bit discarded by the >> 13 above
+    if (xm & 0x00001000u) // Check for rounding
+        // Round, might overflow to inf, this is OK
+        ret += 1u; 
+
+    return (int16)ret;
+}
+
+// Fast float -> half: rebiases the exponent and keeps the top 10 mantissa bits, with rounding.
+static inline int16 float_to_half_fast(float f) {
+    int32 x = intbits(f);
+    unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
+    unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
+    unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
+    unsigned int32 hs = (xs >> 16); // Sign bit
+    // Exponent: unbias the single, then bias the halfp (the result is not range-checked)
+    int32 hes = ((int)(xe >> 23)) - 127 + 15; 
+    unsigned int32 he = (hes << 10); // Exponent
+    int32 hm = (xm >> 13); // Mantissa
+    int32 ret = (hs | he | hm);
+    // Bit 12 is the highest mantissa bit discarded by the >> 13 above
+    if (xm & 0x00001000u) // Check for rounding
+        // Round, might overflow to inf, this is OK
+        ret += 1u; 
+
+    return (int16)ret;
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // RNG stuff

@@ -2624,7 +2819,9 @@ static inline unsigned int random(reference RNGState state)

 static inline float frandom(reference RNGState state)
 {
-    return ((int)(random(state) & ((1<<24)-1))) / (float)(1 << 24);
+    unsigned int irand = random(state);
+    irand &= (1<<23)-1;
+    return floatbits(0x3F800000 | irand)-1.0f;
 }

 static inline uniform unsigned int __seed4(reference RNGState state, 
@@ -2665,6 +2862,12 @@ static inline void seed_rng(reference uniform RNGState state, uniform unsigned i
    seed = __seed4(state, 0, seed);
    if (programCount == 8)
        __seed4(state, 4, seed ^ 0xbeeff00d);
+    if (programCount == 16) {
+        __seed4(state, 4,  seed ^ 0xbeeff00d);
+        __seed4(state, 8,  ((seed & 0xffff) << 16) | (seed >> 16));
+        __seed4(state, 12, (((seed & 0xff) << 24) | ((seed & 0xff00)  << 8) |
+                            ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24));
+    }
 }

 static inline void fastmath() {
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -107,6 +107,12 @@ ExprStmt::Print(int indent) const {
 }


+int
+ExprStmt::EstimateCost() const {
+    return expr ? expr->EstimateCost() : 0;
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 // DeclStmt

@@ -399,12 +405,25 @@ DeclStmt::Print(int indent) const {
 }


+int
+DeclStmt::EstimateCost() const {
+    int cost = 0;
+    for (unsigned int i = 0; i < declaration->declarators.size(); ++i)
+        if (declaration->declarators[i]->initExpr)
+            cost += declaration->declarators[i]->initExpr->EstimateCost();
+    return cost;
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 // IfStmt

-IfStmt::IfStmt(Expr *t, Stmt *ts, Stmt *fs, bool doUnif, SourcePos p) 
+IfStmt::IfStmt(Expr *t, Stmt *ts, Stmt *fs, bool checkCoherence, SourcePos p) 
    : Stmt(p), test(t), trueStmts(ts), falseStmts(fs), 
-      doCoherentCheck(doUnif && !g->opt.disableCoherentControlFlow) {
+      doAllCheck(checkCoherence &&
+                 !g->opt.disableCoherentControlFlow),
+      doAnyCheck(test->GetType() != NULL &&
+                 test->GetType()->IsVaryingType()) {
 }


@@ -436,62 +455,46 @@ IfStmt::EmitCode(FunctionEmitContext *ctx) const {

    ctx->SetDebugPos(pos);
    bool isUniform = testType->IsUniformType();
+
+    llvm::Value *testValue = test->GetValue(ctx);
+    if (testValue == NULL)
+        return;
+
    if (isUniform) {
        ctx->StartUniformIf(ctx->GetMask());
-        if (doCoherentCheck)
-            Warning(test->pos, "Uniform condition supplied to cif statement.");
+        if (doAllCheck)
+            Warning(test->pos, "Uniform condition supplied to \"cif\" statement.");

        // 'If' statements with uniform conditions are relatively
        // straightforward.  We evaluate the condition and then jump to
        // either the 'then' or 'else' clause depending on its value.
-        llvm::Value *vtest = test->GetValue(ctx);
-        if (vtest != NULL) {
-            llvm::BasicBlock *bthen = ctx->CreateBasicBlock("if_then");
-            llvm::BasicBlock *belse = ctx->CreateBasicBlock("if_else");
-            llvm::BasicBlock *bexit = ctx->CreateBasicBlock("if_exit");
+        llvm::BasicBlock *bthen = ctx->CreateBasicBlock("if_then");
+        llvm::BasicBlock *belse = ctx->CreateBasicBlock("if_else");
+        llvm::BasicBlock *bexit = ctx->CreateBasicBlock("if_exit");

-            // Jump to the appropriate basic block based on the value of
-            // the 'if' test
-            ctx->BranchInst(bthen, belse, vtest);
+        // Jump to the appropriate basic block based on the value of
+        // the 'if' test
+        ctx->BranchInst(bthen, belse, testValue);

-            // Emit code for the 'true' case
-            ctx->SetCurrentBasicBlock(bthen);
-            lEmitIfStatements(ctx, trueStmts, "true");
-            if (ctx->GetCurrentBasicBlock()) 
-                ctx->BranchInst(bexit);
+        // Emit code for the 'true' case
+        ctx->SetCurrentBasicBlock(bthen);
+        lEmitIfStatements(ctx, trueStmts, "true");
+        if (ctx->GetCurrentBasicBlock()) 
+            ctx->BranchInst(bexit);

-            // Emit code for the 'false' case
-            ctx->SetCurrentBasicBlock(belse);
-            lEmitIfStatements(ctx, falseStmts, "false");
-            if (ctx->GetCurrentBasicBlock())
-                ctx->BranchInst(bexit);
+        // Emit code for the 'false' case
+        ctx->SetCurrentBasicBlock(belse);
+        lEmitIfStatements(ctx, falseStmts, "false");
+        if (ctx->GetCurrentBasicBlock())
+            ctx->BranchInst(bexit);

-            // Set the active basic block to the newly-created exit block
-            // so that subsequent emitted code starts there.
-            ctx->SetCurrentBasicBlock(bexit);
-        }
+        // Set the active basic block to the newly-created exit block
+        // so that subsequent emitted code starts there.
+        ctx->SetCurrentBasicBlock(bexit);
        ctx->EndIf();
    }
-    else {
-        // Code for 'If' statemnts with 'varying' conditions can be
-        // generated in two ways; one takes some care to see if all of the
-        // active program instances want to follow only the 'true' or
-        // 'false' cases, and the other always runs both cases but sets the
-        // mask appropriately.  The first case is handled by the
-        // IfStmt::emitCoherentTests() call, and the second is handled by
-        // IfStmt::emitMaskedTrueAndFalse().
-        llvm::Value *testValue = test->GetValue(ctx);
-        if (testValue) {
-            if (doCoherentCheck) 
-                emitCoherentTests(ctx, testValue);
-            else {
-                llvm::Value *oldMask = ctx->GetMask();
-                ctx->StartVaryingIf(oldMask);
-                emitMaskedTrueAndFalse(ctx, oldMask, testValue);
-                ctx->EndIf();
-            }
-        }
-    }
+    else
+        emitVaryingIf(ctx, testValue);
 }


@@ -535,9 +538,17 @@ Stmt *IfStmt::TypeCheck() {
 }


+int
+IfStmt::EstimateCost() const {
+    // Sum of the test's cost and the costs of both arms; any may be NULL.
+    int cost = test ? test->EstimateCost() : 0;
+    cost += trueStmts ? trueStmts->EstimateCost() : 0;
+    return cost + (falseStmts ? falseStmts->EstimateCost() : 0);
+}
+
 void
 IfStmt::Print(int indent) const {
-    printf("%*cIf Stmt %s", indent, ' ', doCoherentCheck ? "DO COHERENT CHECK" : "");
+    printf("%*cIf Stmt %s", indent, ' ', doAllCheck ? "DO ALL CHECK" : "");
    pos.Print();
    printf("\n%*cTest: ", indent+4, ' ');
    test->Print();
@@ -554,7 +565,7 @@ IfStmt::Print(int indent) const {


 /** Emit code to run both the true and false statements for the if test,
-    with the mask set appropriately before runnign each one. 
+    with the mask set appropriately before running each one. 
 */
 void
 IfStmt::emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask, 
@@ -574,11 +585,185 @@ IfStmt::emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask,
 }


+/** Similar to the Stmt variant of this function, this conservatively
+    checks to see if it's safe to run the code for the given Expr even if
+    the mask is 'all off'.  A NULL Expr is reported as not safe.
+ */
+static bool
+lSafeToRunWithAllLanesOff(Expr *expr) {
+    if (expr == NULL)
+        return false;
+
+    UnaryExpr *ue;
+    if ((ue = dynamic_cast<UnaryExpr *>(expr)) != NULL)
+        return lSafeToRunWithAllLanesOff(ue->expr);
+
+    BinaryExpr *be;
+    if ((be = dynamic_cast<BinaryExpr *>(expr)) != NULL)
+        return (lSafeToRunWithAllLanesOff(be->arg0) &&
+                lSafeToRunWithAllLanesOff(be->arg1));
+
+    AssignExpr *ae;
+    if ((ae = dynamic_cast<AssignExpr *>(expr)) != NULL)
+        return (lSafeToRunWithAllLanesOff(ae->lvalue) &&
+                lSafeToRunWithAllLanesOff(ae->rvalue));
+
+    SelectExpr *se;
+    if ((se = dynamic_cast<SelectExpr *>(expr)) != NULL)
+        return (lSafeToRunWithAllLanesOff(se->test) &&
+                lSafeToRunWithAllLanesOff(se->expr1) &&
+                lSafeToRunWithAllLanesOff(se->expr2));
+
+    ExprList *el;
+    if ((el = dynamic_cast<ExprList *>(expr)) != NULL) {
+        for (unsigned int i = 0; i < el->exprs.size(); ++i)
+            if (!lSafeToRunWithAllLanesOff(el->exprs[i]))
+                return false;
+        return true;
+    }
+
+    FunctionCallExpr *fce;
+    if ((fce = dynamic_cast<FunctionCallExpr *>(expr)) != NULL)
+        // FIXME: If we could somehow determine that the function being
+        // called was safe (and all of the args Exprs were safe), then it'd
+        // be nice to be able to return true here.  (Consider a call to
+        // e.g. floatbits() in the stdlib.)  Unfortunately for now we just
+        // have to be conservative.
+        return false;
+
+    IndexExpr *ie;
+    if ((ie = dynamic_cast<IndexExpr *>(expr)) != NULL) {
+        // If we can determine at compile time the size of the array/vector
+        // and if the indices are compile-time constants, then we may be
+        // able to safely run this under a predicated if statement--as long
+        // as the expression being indexed (e.g. a[i] in a[i][0]) is safe.
+        if (!lSafeToRunWithAllLanesOff(ie->arrayOrVector))
+            return false;
+
+        const Type *type = ie->arrayOrVector->GetType();
+        ConstExpr *ce = dynamic_cast<ConstExpr *>(ie->index);
+        if (type == NULL || ce == NULL)
+            return false;
+        if (dynamic_cast<const ReferenceType *>(type) != NULL)
+            type = type->GetReferenceTarget();
+
+        const SequentialType *seqType =
+            dynamic_cast<const SequentialType *>(type);
+        if (seqType == NULL || seqType->GetElementCount() == 0)
+            // Not a sequential type, or an unsized array; we can't be sure
+            return false;
+        int nElements = seqType->GetElementCount();
+
+        int32_t indices[ISPC_MAX_NVEC];
+        int count = ce->AsInt32(indices);
+        for (int i = 0; i < count; ++i)
+            if (indices[i] < 0 || indices[i] >= nElements)
+                return false;
+
+        // All indices are in-bounds
+        return true;
+    }
+
+    MemberExpr *me;
+    if ((me = dynamic_cast<MemberExpr *>(expr)) != NULL)
+        return lSafeToRunWithAllLanesOff(me->expr);
+
+    if (dynamic_cast<ConstExpr *>(expr) != NULL)
+        return true;
+
+    TypeCastExpr *tce;
+    if ((tce = dynamic_cast<TypeCastExpr *>(expr)) != NULL)
+        return lSafeToRunWithAllLanesOff(tce->expr);
+
+    ReferenceExpr *re;
+    if ((re = dynamic_cast<ReferenceExpr *>(expr)) != NULL)
+        return lSafeToRunWithAllLanesOff(re->expr);
+
+    DereferenceExpr *dre;
+    if ((dre = dynamic_cast<DereferenceExpr *>(expr)) != NULL)
+        return lSafeToRunWithAllLanesOff(dre->expr);
+
+    if (dynamic_cast<SymbolExpr *>(expr) != NULL ||
+        dynamic_cast<FunctionSymbolExpr *>(expr) != NULL ||
+        dynamic_cast<SyncExpr *>(expr) != NULL)
+        return true;
+
+    FATAL("Unknown Expr type in lSafeToRunWithAllLanesOff()");
+    return false;
+}
+
+
+/** Conservatively tests whether it's safe to run the code for the given
+    statement even if the mask is all off, by recursively checking it and
+    the expressions inside of it.  A NULL statement is trivially safe.
+ */
+static bool
+lSafeToRunWithAllLanesOff(Stmt *stmt) {
+    if (stmt == NULL)
+        return true;
+
+    ExprStmt *es;
+    if ((es = dynamic_cast<ExprStmt *>(stmt)) != NULL)
+        return lSafeToRunWithAllLanesOff(es->expr);
+
+    DeclStmt *ds;
+    if ((ds = dynamic_cast<DeclStmt *>(stmt)) != NULL) {
+        // A declarator with no initializer runs no code, so skip those.
+        for (unsigned int i = 0; i < ds->declaration->declarators.size(); ++i)
+            if (ds->declaration->declarators[i]->initExpr != NULL &&
+                !lSafeToRunWithAllLanesOff(ds->declaration->declarators[i]->initExpr))
+                return false;
+        return true;
+    }
+
+    IfStmt *is;
+    if ((is = dynamic_cast<IfStmt *>(stmt)) != NULL)
+        return (lSafeToRunWithAllLanesOff(is->test) &&
+                lSafeToRunWithAllLanesOff(is->trueStmts) &&
+                lSafeToRunWithAllLanesOff(is->falseStmts));
+
+    DoStmt *dos;
+    if ((dos = dynamic_cast<DoStmt *>(stmt)) != NULL)
+        return (lSafeToRunWithAllLanesOff(dos->testExpr) &&
+                lSafeToRunWithAllLanesOff(dos->bodyStmts));
+
+    ForStmt *fs;
+    if ((fs = dynamic_cast<ForStmt *>(stmt)) != NULL)
+        return (lSafeToRunWithAllLanesOff(fs->init) &&
+                lSafeToRunWithAllLanesOff(fs->test) &&
+                lSafeToRunWithAllLanesOff(fs->step) &&
+                lSafeToRunWithAllLanesOff(fs->stmts));
+
+    if (dynamic_cast<BreakStmt *>(stmt) != NULL ||
+        dynamic_cast<ContinueStmt *>(stmt) != NULL)
+        return true;
+
+    ReturnStmt *rs;
+    if ((rs = dynamic_cast<ReturnStmt *>(stmt)) != NULL)
+        return lSafeToRunWithAllLanesOff(rs->val);
+
+    StmtList *sl;
+    if ((sl = dynamic_cast<StmtList *>(stmt)) != NULL) {
+        const std::vector<Stmt *> &sls = sl->GetStatements();
+        for (unsigned int i = 0; i < sls.size(); ++i)
+            if (!lSafeToRunWithAllLanesOff(sls[i]))
+                return false;
+        return true;
+    }
+
+    PrintStmt *ps;
+    if ((ps = dynamic_cast<PrintStmt *>(stmt)) != NULL)
+        return lSafeToRunWithAllLanesOff(ps->values);
+
+    FATAL("Unexpected stmt type in lSafeToRunWithAllLanesOff()");
+    return false;
+}
+
 /** Emit code for an if test that checks the mask and the test values and
    tries to be smart about jumping over code that doesn't need to be run.
 */
 void
-IfStmt::emitCoherentTests(FunctionEmitContext *ctx, llvm::Value *ltest) const {
+IfStmt::emitVaryingIf(FunctionEmitContext *ctx, llvm::Value *ltest) const {
    llvm::Value *oldMask = ctx->GetMask();
    if (oldMask == LLVMMaskAllOn) {
        // We can tell that the mask is on statically at compile time; just
@@ -587,7 +772,7 @@ IfStmt::emitCoherentTests(FunctionEmitContext *ctx, llvm::Value *ltest) const {
        emitMaskAllOn(ctx, ltest, bDone);
        ctx->SetCurrentBasicBlock(bDone);
    }
-    else {
+    else if (doAllCheck) {
        // We can't tell if the mask going into the if is all on at the
        // compile time.  Emit code to check for this and then either run
        // the code for the 'all on' or the 'mixed' case depending on the
@@ -619,6 +804,43 @@ IfStmt::emitCoherentTests(FunctionEmitContext *ctx, llvm::Value *ltest) const {
        // paths above jump to when they're done.
        ctx->SetCurrentBasicBlock(bDone);
    }
+    else if (trueStmts != NULL || falseStmts != NULL) {
+        // If there is nothing that is potentially unsafe to run with all
+        // lanes off in the true and false statements and if the total
+        // complexity of those two is relatively simple, then we'll go
+        // ahead and emit straightline code that runs both sides, updating
+        // the mask accordingly.  This is useful for efficiently compiling
+        // things like:
+        //
+        // if (foo) x = 0;
+        // else     ++x;
+        //
+        // Where the overhead of checking if any of the program instances wants
+        // to run one side or the other is more than the actual computation.
+        // The lSafeToRunWithAllLanesOff() checks to make sure that we don't do this
+        // for potentially dangerous code like:
+        //
+        // if (index < count) array[index] = 0;
+        //
+        // where our use of blend for conditional assignments doesn't check
+        // for the 'all lanes off' case.
+        if (lSafeToRunWithAllLanesOff(trueStmts) &&
+            lSafeToRunWithAllLanesOff(falseStmts) &&
+            (((trueStmts ? trueStmts->EstimateCost() : 0) + 
+              (falseStmts ? falseStmts->EstimateCost() : 0)) < 
+             PREDICATE_SAFE_IF_STATEMENT_COST)) {
+            ctx->StartVaryingIf(oldMask);
+            emitMaskedTrueAndFalse(ctx, oldMask, ltest);
+            assert(ctx->GetCurrentBasicBlock());
+            ctx->EndIf();
+        }
+        else {
+            assert(doAnyCheck);
+            llvm::BasicBlock *bDone = ctx->CreateBasicBlock("if_done");
+            emitMaskMixed(ctx, oldMask, ltest, bDone);
+            ctx->SetCurrentBasicBlock(bDone);
+        }
+    }
 }


@@ -677,69 +899,50 @@ IfStmt::emitMaskAllOn(FunctionEmitContext *ctx, llvm::Value *ltest,
 }


-/** Emits code that checks to see if for all of the lanes where the mask is
-    on, the test has the value true.
- */
-static llvm::Value *
-lTestMatchesMask(FunctionEmitContext *ctx, llvm::Value *test, llvm::Value *mask) {
-    llvm::Value *testAndMask = ctx->BinaryOperator(llvm::Instruction::And, test,
-                                                   mask, "test&mask");
-    return ctx->MasksAllEqual(testAndMask, mask);
-}
-
-
 /** Emit code for an 'if' test where the lane mask is known to be mixed
    on/off going into it.
 */
 void
 IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask, 
                       llvm::Value *ltest, llvm::BasicBlock *bDone) const {
-    // First, see if, for all of the lanes where the mask is on, if the
-    // value of the test is on.  (i.e. (test&mask) == mask).  In this case,
-    // we only need to run the 'true' case code, since the lanes where the
-    // test was false aren't supposed to be running here anyway.
-     llvm::Value *testAllEqual = lTestMatchesMask(ctx, ltest, oldMask);
-    llvm::BasicBlock *bTestAll = ctx->CreateBasicBlock("cif_mixed_test_all");
-    llvm::BasicBlock *bTestAnyCheck = ctx->CreateBasicBlock("cif_mixed_test_any_check");
-    ctx->BranchInst(bTestAll, bTestAnyCheck, testAllEqual);
+    ctx->StartVaryingIf(oldMask);
+    // Created on demand so that no unused block remains if trueStmts is NULL
+    llvm::BasicBlock *bNext = NULL;
+    if (trueStmts != NULL) {
+        bNext = ctx->CreateBasicBlock("safe_if_after_true");
+        llvm::BasicBlock *bRunTrue = ctx->CreateBasicBlock("safe_if_run_true");
+        ctx->MaskAnd(oldMask, ltest);

-    // Emit code for the (test&mask)==mask case.  Not only do we only need
-    // to emit code for the true statements, but we don't need to modify
-    // the mask's value; it's already correct.
-    ctx->SetCurrentBasicBlock(bTestAll);
-    ctx->StartVaryingIf(ctx->GetMask());
-    lEmitIfStatements(ctx, trueStmts, "cif: all running lanes want just true stmts");
-    assert(ctx->GetCurrentBasicBlock());
-    ctx->EndIf();
+        // If no program instance wants the 'true' block, jump ahead to bNext.
+        llvm::Value *maskAnyQ = ctx->Any(ctx->GetMask());
+        ctx->BranchInst(bRunTrue, bNext, maskAnyQ);
+
+        // Emit statements for true
+        ctx->SetCurrentBasicBlock(bRunTrue);
+        lEmitIfStatements(ctx, trueStmts, "if: expr mixed, true statements");
+        assert(ctx->GetCurrentBasicBlock());
+        ctx->BranchInst(bNext);
+        ctx->SetCurrentBasicBlock(bNext);
+    }
+    if (falseStmts != NULL) {
+        llvm::BasicBlock *bRunFalse = ctx->CreateBasicBlock("safe_if_run_false");
+        bNext = ctx->CreateBasicBlock("safe_if_after_false");
+        ctx->MaskAndNot(oldMask, ltest);
+
+        // Similarly, skip the 'false' block if no instance wants to run it.
+        llvm::Value *maskAnyQ = ctx->Any(ctx->GetMask());
+        ctx->BranchInst(bRunFalse, bNext, maskAnyQ);
+
+        // Emit code for false
+        ctx->SetCurrentBasicBlock(bRunFalse);
+        lEmitIfStatements(ctx, falseStmts, "if: expr mixed, false statements");
+        assert(ctx->GetCurrentBasicBlock());
+        ctx->BranchInst(bNext);
+        ctx->SetCurrentBasicBlock(bNext);
+    }
     ctx->BranchInst(bDone);
-
-    // Next, see if the active lanes only need to run the false case--i.e. if
-    // (~test & mask) == mask.
-    ctx->SetCurrentBasicBlock(bTestAnyCheck);
-    llvm::Value *notTest = ctx->BinaryOperator(llvm::Instruction::Xor, LLVMMaskAllOn,
-                                               ltest, "~test");
-    llvm::Value *notMatchesMask = lTestMatchesMask(ctx, notTest, oldMask);
-    llvm::BasicBlock *bTestAllNot = ctx->CreateBasicBlock("cif_mixed_test_none");
-    llvm::BasicBlock *bTestMixed = ctx->CreateBasicBlock("cif_mixed_test_mixed");
-    ctx->BranchInst(bTestAllNot, bTestMixed, notMatchesMask);
-
-    // Emit code for the (~test & mask) == mask case.  We only need the
-    // 'false' statements and again don't need to modify the value of the
-    // mask.
-    ctx->SetCurrentBasicBlock(bTestAllNot);
-    ctx->StartVaryingIf(ctx->GetMask());
-    lEmitIfStatements(ctx, falseStmts, "cif: all running lanes want just false stmts");
-    assert(ctx->GetCurrentBasicBlock());
+    ctx->SetCurrentBasicBlock(bDone);
     ctx->EndIf();
-    ctx->BranchInst(bDone);
-
-    // It's mixed; we need to run both the true and false cases and also do
-    // mask update stuff.
-    ctx->SetCurrentBasicBlock(bTestMixed);
-    ctx->StartVaryingIf(ctx->GetMask());
-    emitMaskedTrueAndFalse(ctx, oldMask, ltest);
-    ctx->EndIf();
-    ctx->BranchInst(bDone);
 }


@@ -955,6 +1158,13 @@ DoStmt::TypeCheck() {
 }


+int
+DoStmt::EstimateCost() const {
+    // Sum of the costs of the loop test and the loop body; either may be NULL.
+    int cost = testExpr ? testExpr->EstimateCost() : 0;
+    return cost + (bodyStmts ? bodyStmts->EstimateCost() : 0);
+}
+
 void
 DoStmt::Print(int indent) const {
    printf("%*cDo Stmt", indent, ' ');
@@ -1162,6 +1372,20 @@ ForStmt::TypeCheck() {
 }


+int
+ForStmt::EstimateCost() const {
+    // No test, or a test whose type is unavailable (NULL): use the fallback.
+    const Type *testType = test ? test->GetType() : NULL;
+    bool uniformTest = testType ? testType->IsUniformType() :
+        (!g->opt.disableUniformControlFlow &&
+         !lHasVaryingBreakOrContinue(stmts));
+    return ((init ? init->EstimateCost() : 0) +
+            (test ? test->EstimateCost() : 0) +
+            (step ? step->EstimateCost() : 0) +
+            (stmts ? stmts->EstimateCost() : 0) +
+            (uniformTest ? COST_UNIFORM_LOOP : COST_VARYING_LOOP));
+}
+
 void
 ForStmt::Print(int indent) const {
    printf("%*cFor Stmt", indent, ' ');
@@ -1216,6 +1440,13 @@ BreakStmt::TypeCheck() {
 }


+/** Cost of the break, which depends on whether it does a coherence check. */
+int
+BreakStmt::EstimateCost() const {
+    if (doCoherenceCheck) return COST_COHERENT_BREAK_CONTINE;
+    return COST_REGULAR_BREAK_CONTINUE;
+}
+
 void
 BreakStmt::Print(int indent) const {
    printf("%*c%sBreak Stmt", indent, ' ', doCoherenceCheck ? "Coherent " : "");
@@ -1254,6 +1485,13 @@ ContinueStmt::TypeCheck() {
 }


+/** Cost of the continue, which depends on whether it does a coherence check. */
+int
+ContinueStmt::EstimateCost() const {
+    if (doCoherenceCheck) return COST_COHERENT_BREAK_CONTINE;
+    return COST_REGULAR_BREAK_CONTINUE;
+}
+
 void
 ContinueStmt::Print(int indent) const {
    printf("%*c%sContinue Stmt", indent, ' ', doCoherenceCheck ? "Coherent " : "");
@@ -1300,6 +1538,12 @@ ReturnStmt::TypeCheck() {
 }


+/** Fixed cost of returning plus the cost of the returned value, if any. */
+int
+ReturnStmt::EstimateCost() const {
+    return (val != NULL) ? COST_RETURN + val->EstimateCost() : COST_RETURN;
+}
+
 void
 ReturnStmt::Print(int indent) const {
    printf("%*c%sReturn Stmt", indent, ' ', doCoherenceCheck ? "Coherent " : "");
@@ -1345,6 +1589,16 @@ StmtList::TypeCheck() {
 }


+/** Total estimated cost of the non-NULL statements in the list. */
+int
+StmtList::EstimateCost() const {
+    int total = 0;
+    for (unsigned int i = 0; i < stmts.size(); ++i)
+        total += stmts[i] ? stmts[i]->EstimateCost() : 0;
+    return total;
+}
+
+
 void
 StmtList::Print(int indent) const {
    printf("%*cStmt List", indent, ' ');
@@ -1464,8 +1718,11 @@ PrintStmt::EmitCode(FunctionEmitContext *ctx) const {
    llvm::Value *args[5];
    std::string argTypes;

-    if (values == NULL)
-        args[4] = NULL;
+    if (values == NULL) {
+        LLVM_TYPE_CONST llvm::Type *ptrPtrType = 
+            llvm::PointerType::get(LLVMTypes::VoidPointerType, 0);
+        args[4] = llvm::Constant::getNullValue(ptrPtrType);
+    }
    else {
        // Get the values passed to the print() statement evaluated and
        // stored in memory so that we set up the array of pointers to them
@@ -1542,3 +1799,11 @@ PrintStmt::TypeCheck() {
        values = values->TypeCheck();
    return this;
 }
+
+
+/** Charged as a function call plus the cost of the printed values, if any. */
+int
+PrintStmt::EstimateCost() const {
+    int valueCost = values ? values->EstimateCost() : 0;
+    return COST_FUNCALL + valueCost;
+}
--- a/stmt.h
+++ b/stmt.h
@@ -75,8 +75,8 @@ public:

    Stmt *Optimize();
    Stmt *TypeCheck();
+    int EstimateCost() const;

-private:
    Expr *expr;
 };

@@ -92,8 +92,8 @@ public:

    Stmt *Optimize();
    Stmt *TypeCheck();
+    int EstimateCost() const;

-private:
    Declaration *declaration;
 };

@@ -103,13 +103,14 @@ private:
 class IfStmt : public Stmt {
 public:
    IfStmt(Expr *testExpr, Stmt *trueStmts, Stmt *falseStmts,
-           bool doCoherentCheck, SourcePos pos);
+           bool doAllCheck, SourcePos pos);

    void EmitCode(FunctionEmitContext *ctx) const;
    void Print(int indent) const;

    Stmt *Optimize();
    Stmt *TypeCheck();
+    int EstimateCost() const;

    // @todo these are only public for lHasVaryingBreakOrContinue(); would
    // be nice to clean that up...
@@ -125,11 +126,12 @@ private:
        source and thus, if the emitted code should check to see if all
        active program instances want to follow just one of the 'true' or
        'false' blocks. */
-    const bool doCoherentCheck;
+    const bool doAllCheck;
+    const bool doAnyCheck;

    void emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask, 
                                llvm::Value *test) const;
-    void emitCoherentTests(FunctionEmitContext *ctx, llvm::Value *test) const;
+    void emitVaryingIf(FunctionEmitContext *ctx, llvm::Value *test) const;
    void emitMaskAllOn(FunctionEmitContext *ctx,
                       llvm::Value *test, llvm::BasicBlock *bDone) const;
    void emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask, 
@@ -150,8 +152,8 @@ public:

    Stmt *Optimize();
    Stmt *TypeCheck();
+    int EstimateCost() const;

-private:
    Expr *testExpr;
    Stmt *bodyStmts;
    const bool doCoherentCheck;
@@ -171,8 +173,8 @@ public:

    Stmt *Optimize();
    Stmt *TypeCheck();
+    int EstimateCost() const;

-private:
    /** 'for' statment initializer; may be NULL, indicating no intitializer */
    Stmt *init;
    /** expression that returns a value indicating whether the loop should
@@ -198,6 +200,7 @@ public:

    Stmt *Optimize();
    Stmt *TypeCheck();
+    int EstimateCost() const;

 private:
    /** This indicates whether the generated code will check to see if no
@@ -219,6 +222,7 @@ public:

    Stmt *Optimize();
    Stmt *TypeCheck();
+    int EstimateCost() const;

 private:
    /** This indicates whether the generated code will check to see if no
@@ -240,8 +244,8 @@ public:

    Stmt *Optimize();
    Stmt *TypeCheck();
+    int EstimateCost() const;

-private:
    Expr *val;
    /** This indicates whether the generated code will check to see if no
        more program instances are currently running after the return, in
@@ -262,6 +266,7 @@ public:

    Stmt *Optimize();
    Stmt *TypeCheck();
+    int EstimateCost() const;

    void Add(Stmt *s) { if (s) stmts.push_back(s); }
    const std::vector<Stmt *> &GetStatements() { return stmts; }
@@ -289,8 +294,8 @@ public:

    Stmt *Optimize();
    Stmt *TypeCheck();
+    int EstimateCost() const;

-private:
    /** Format string for the print() statement. */
    const std::string format;
    /** This holds the arguments passed to the print() statement.  If more
--- a/test_static.cpp
+++ b/test_static.cpp
@@ -0,0 +1,163 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#if defined(_WIN32) || defined(_WIN64)
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef ISPC_IS_LINUX
+#include <malloc.h>
+#endif
+extern "C" {  // C-linkage declarations used by main() and defined below
+    extern int width();  // called first by main() to size the comparison
+    extern void f_v(float *result);  // TEST_SIG == 0
+    extern void f_f(float *result, float *a);  // TEST_SIG == 1
+    extern void f_fu(float *result, float *a, float b);  // TEST_SIG == 2
+    extern void f_fi(float *result, float *a, int *b);  // TEST_SIG == 3
+    extern void f_du(float *result, double *a, double b);  // TEST_SIG == 4
+    extern void f_duf(float *result, double *a, float b);  // TEST_SIG == 5
+    extern void f_di(float *result, double *a, int *b);  // TEST_SIG == 6
+    extern void result(float *val);  // fills in the expected values
+    
+    void ISPCLaunch(void *f, void *d);  // defined in this file
+    void ISPCSync();  // defined in this file
+    void *ISPCMalloc(int64_t size, int32_t alignment);  // defined in this file
+    void ISPCFree(void *ptr);  // defined in this file
+}
+
+void ISPCLaunch(void *f, void *d) {  // runs the task immediately, in place
+    typedef void (*TaskFuncType)(void *, int, int);
+    TaskFuncType func = (TaskFuncType)f;  // 'f' is treated as a task function
+    func(d, 0, 1);  // presumably (data, thread index 0, thread count 1) -- TODO confirm
+}
+
+void ISPCSync() {  // nothing to wait for: ISPCLaunch() above runs synchronously
+}
+
+// Returns 'size' bytes aligned to 'alignment'; release with ISPCFree().
+void *ISPCMalloc(int64_t size, int32_t alignment) {
+#ifdef ISPC_IS_WINDOWS
+    return _aligned_malloc(size, alignment);
+#endif
+#ifdef ISPC_IS_LINUX
+    return memalign(alignment, size);
+#endif
+#ifdef ISPC_IS_APPLE
+    // Padding below is 1..alignment bytes, plus a slot for malloc()'s pointer.
+    void *mem = malloc(size + alignment + sizeof(void*));
+    if (mem == NULL) return NULL;
+    char *amem = ((char*)mem) + sizeof(void*);
+    amem += alignment - (reinterpret_cast<uint64_t>(amem) & (alignment - 1));
+    ((void**)amem)[-1] = mem;
+    return amem;
+#endif
+}
+
+// Releases ISPCMalloc() memory (on OS X, via the pointer stored before it).
+void ISPCFree(void *ptr) {
+#ifdef ISPC_IS_WINDOWS
+    _aligned_free(ptr);
+#endif
+#ifdef ISPC_IS_LINUX
+    free(ptr);
+#endif
+#ifdef ISPC_IS_APPLE
+    if (ptr != NULL) free(((void**)ptr)[-1]);
+#endif
+}
+
+int main(int argc, char *argv[]) {  // runs one test entry point, checks it against result()
+    int w = width();  // number of leading array elements that get compared
+    assert(w <= 16);  // the arrays below only hold 16 elements
+
+    float returned_result[16];
+    memset(returned_result, 0, 16*sizeof(float));
+    float vfloat[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
+    double vdouble[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
+    int vint[16] = { 2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32 };
+    int vint2[16] = { 5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20};
+    float b = 5.;
+
+#if (TEST_SIG == 0)  // TEST_SIG selects which entry point signature is called
+    f_v(returned_result);
+#elif (TEST_SIG == 1)
+    f_f(returned_result, vfloat);
+#elif (TEST_SIG == 2)
+    f_fu(returned_result, vfloat, b);
+#elif (TEST_SIG == 3)
+    f_fi(returned_result, vfloat, vint);
+#elif (TEST_SIG == 4)
+    f_du(returned_result, vdouble, 5.);
+#elif (TEST_SIG == 5)
+    f_duf(returned_result, vdouble, 5.f);
+#elif (TEST_SIG == 6)
+    f_di(returned_result, vdouble, vint2);
+#else
+#error "Unknown or unset TEST_SIG value"
+#endif    
+
+    float expected_result[16];
+    memset(expected_result, 0, 16*sizeof(float));
+    result(expected_result);  // expected values come from the test itself
+
+    int errors = 0;
+    for (int i = 0; i < w; ++i) {  // only the first 'w' elements are compared
+        if (returned_result[i] != expected_result[i]) {
+#ifdef EXPECT_FAILURE
+            // bingo, failed
+            return 1;
+#else
+            printf("%s: value %d disagrees: returned %f [%a], expected %f [%a]\n",
+                   argv[0], i, returned_result[i], returned_result[i], 
+                   expected_result[i], expected_result[i]);
+            ++errors;
+#endif // EXPECT_FAILURE
+        }
+    }
+
+#ifdef EXPECT_FAILURE
+    // Don't expect to get here
+    return 0;
+#else
+    return errors > 0;  // exit status 1 if any compared element disagreed
+#endif
+}
--- a/tests/array-1.ispc
+++ b/tests/array-1.ispc
@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }



-static float x[2][1];
+static float x[1][2];

 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
--- a/tests/array-scatter-vary.ispc
+++ b/tests/array-scatter-vary.ispc
@@ -13,7 +13,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 }

    
-export void result(uniform float RET[4]) { 
+export void result(uniform float RET[]) { 
    RET[programIndex] = 0;
    RET[3] = 4;
    RET[4] = 5;
--- a/tests/atomics-10.ispc
+++ b/tests/atomics-10.ispc
@@ -0,0 +1,16 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s = 0;  // global that the atomic below updates
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {  // test body
+    float a = aFOO[programIndex]; 
+    float b = 0;
+    if (programIndex < 2)  // only program instances 0 and 1 do the add
+        b = atomic_add_global(s, 1);
+    RET[programIndex] = s;  // each instance writes s to its RET slot
+}
+
+export void result(uniform float RET[]) {  // expected RET: 2 in every slot
+    RET[programIndex] = 2;
+}
--- a/tests/atomics-11.ispc
+++ b/tests/atomics-11.ispc
@@ -0,0 +1,20 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s = 0;  // global that the atomic below updates
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {  // test body
+    float a = aFOO[programIndex]; 
+    float b = 0;
+    if (programIndex & 1)  // only odd-numbered program instances do the add
+        b = atomic_add_global(s, programIndex);
+    RET[programIndex] = s;  // each instance writes s to its RET slot
+}
+
+export void result(uniform float RET[]) {  // expected RET: sum of the odd indices
+    uniform int sum = 0;
+    for (uniform int i = 0; i < programCount; ++i)
+        if (i & 1)
+            sum += i;
+    RET[programIndex] = sum;
+}
--- a/tests/atomics-12.ispc
+++ b/tests/atomics-12.ispc
@@ -0,0 +1,20 @@
+
+export uniform int width() { return programCount; } // returns programCount
+
+uniform unsigned int32 s = 0; // global bit set updated by the atomic in f_f
+
+export void f_f(uniform float RET[], uniform float aFOO[]) { // test: masked atomic_or_global with one bit per instance
+    float a = aFOO[programIndex]; // read but never used below
+    float b = 0; // receives the atomic's return value; never read below
+    if (programIndex & 1) // odd programIndex values only
+        b = atomic_or_global(s, (1 << programIndex));
+    RET[programIndex] = s; // every instance stores the resulting value of s
+}
+
+export void result(uniform float RET[]) { // reference output; presumably compared with f_f's RET by the harness
+    uniform int sum = 0;
+    for (uniform int i = 0; i < programCount; ++i)
+        if (i & 1) // mirrors the mask in f_f: odd indices only
+            sum += (1 << i); // bits are distinct, so adding them equals OR-ing them
+    RET[programIndex] = sum;
+}
--- a/tests/atomics-13.ispc
+++ b/tests/atomics-13.ispc
@@ -0,0 +1,16 @@
+
+export uniform int width() { return programCount; } // returns programCount
+
+uniform unsigned int32 s = 0; // global bit set updated by the atomic in f_f
+
+export void f_f(uniform float RET[], uniform float aFOO[]) { // test: checks the values atomic_or_global hands back, not s itself
+    float a = aFOO[programIndex]; // read but never used below
+    float b = 0; // stays 0 for instances that skip the atomic
+    if (programIndex & 1) // odd programIndex values only
+        b = atomic_or_global(s, (1 << programIndex));
+    RET[programIndex] = popcnt(reduce_max((int32)b)); // bit count of reduce_max over the returned values
+}
+
+export void result(uniform float RET[]) { // reference output; presumably compared with f_f's RET by the harness
+    RET[programIndex] = (programCount/2) - 1; // NOTE(review): presumably the largest returned value holds the bits of all odd instances but one - confirm
+}
--- a/tests/atomics-14.ispc
+++ b/tests/atomics-14.ispc
@@ -0,0 +1,20 @@
+
+export uniform int width() { return programCount; } // returns programCount
+
+uniform unsigned int64 s = 0xffffffffff000000; // 64-bit global: upper 40 bits set, low 24 bits clear
+
+export void f_f(uniform float RET[], uniform float aFOO[]) { // test: masked atomic_or_global on a 64-bit global
+    float a = aFOO[programIndex]; // read but never used below
+    float b = 0; // receives the atomic's return value; never read below
+    if (programIndex & 1) // odd programIndex values only
+        b = atomic_or_global(s, (1 << programIndex));
+    RET[programIndex] = (s>>20); // store s shifted right by 20 bits
+}
+
+export void result(uniform float RET[]) { // reference output; presumably compared with f_f's RET by the harness
+    uniform int sum = 0;
+    for (uniform int i = 0; i < programCount; ++i)
+        if (i & 1) // mirrors the mask in f_f: odd indices only
+            sum += (1 << i); // bits are distinct, so adding them equals OR-ing them
+    RET[programIndex] = ((unsigned int64)(0xffffffffff000000 | sum)) >> 20; // same initial constant, OR-ed bits and shift as f_f
+}
--- a/tests/atomics-7.ispc
+++ b/tests/atomics-7.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; } // returns programCount
+
+uniform int32 s = 0; // signed global targeted by the atomic in f_f
+
+export void f_f(uniform float RET[], uniform float aFOO[]) { // test: unmasked atomic_min_global
+    int32 a = aFOO[programIndex]; // float input converted to int32
+    float b = atomic_min_global(s, a); // value returned by the atomic, kept as float
+    RET[programIndex] = reduce_min(b); // reduce_min over the returned values, stored by every instance
+}
+
+export void result(uniform float RET[]) { // reference output; presumably compared with f_f's RET by the harness
+    RET[programIndex] = reduce_min(programIndex); // presumably the smallest programIndex - confirm reduce_min semantics
+}
--- a/tests/atomics-8.ispc
+++ b/tests/atomics-8.ispc
@@ -0,0 +1,16 @@
+
+export uniform int width() { return programCount; } // returns programCount
+
+uniform int32 s = 0; // signed global targeted by the atomic in f_f
+
+export void f_f(uniform float RET[], uniform float aFOO[]) { // test: masked atomic_max_global
+    int32 a = aFOO[programIndex]; // float input converted to int32
+    int32 b = 0; // receives the atomic's return value; never read below
+    if (programIndex & 1) // odd programIndex values only
+        b = atomic_max_global(s, a);
+    RET[programIndex] = s; // every instance stores the resulting value of s
+}
+
+export void result(uniform float RET[]) { // reference output; presumably compared with f_f's RET by the harness
+    RET[programIndex] = programCount; // NOTE(review): assumes aFOO[i] == i + 1, so the last odd instance supplies programCount - confirm
+}
--- a/tests/atomics-9.ispc
+++ b/tests/atomics-9.ispc
@@ -0,0 +1,16 @@
+
+export uniform int width() { return programCount; } // returns programCount
+
+uniform unsigned int32 s = 0; // global counter updated by the atomic in f_f
+
+export void f_f(uniform float RET[], uniform float aFOO[]) { // test: checks the values atomic_add_global hands back, not s itself
+    float a = aFOO[programIndex]; // read but never used below
+    float b = 0; // stays 0 for instances that skip the atomic
+    if (programIndex < 2) // only instances with programIndex 0 or 1 take part
+        b = atomic_add_global(s, 1);
+    RET[programIndex] = reduce_add(b); // reduce_add over the returned values, stored by every instance
+}
+
+export void result(uniform float RET[]) { // reference output; presumably compared with f_f's RET by the harness
+    RET[programIndex] = 1; // NOTE(review): presumably the two participants get back 0 and 1, summing to 1 - confirm
+}
--- a/tests/cwhile-test-60.ispc
+++ b/tests/cwhile-test-60.ispc
@@ -11,5 +11,5 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {


 export void result(uniform float RET[]) {
-    RET[programIndex] = 10;
+    RET[programIndex] = max(10, 1 + programIndex); // larger of 10 and programIndex + 1, so it exceeds 10 once programIndex >= 10
 }
--- a/tests/exclusive-scan-add-1.ispc
+++ b/tests/exclusive-scan-add-1.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; } // returns programCount
+
+export void f_f(uniform float RET[], uniform float aFOO[]) { // test: exclusive_scan_add of programIndex, no mask
+    RET[programIndex] = exclusive_scan_add(programIndex); // aFOO is not used by this test
+}
+
+export void result(uniform float RET[]) { // reference output; presumably compared with f_f's RET by the harness
+    uniform int result[] = { 0, 0, 1, 3, 6, 10, 15, 21, 28, // entry i is 0 + 1 + ... + (i - 1)
+                             36, 45, 55, 66, 78, 91, 105, 120 }; // 17 entries in total
+    RET[programIndex] = result[programIndex]; // each instance picks its own table entry
+}
--- a/tests/exclusive-scan-add-10.ispc
+++ b/tests/exclusive-scan-add-10.ispc
@@ -0,0 +1,20 @@
+
+export uniform int width() { return programCount; } // returns programCount
+
+export void f_f(uniform float RET[], uniform float aFOO[]) { // test: exclusive_scan_add on unsigned int64 under an odd-only mask
+    RET[programIndex] = -1; // default for instances that skip the scan
+    unsigned int64 a = aFOO[programIndex]; // float input converted to unsigned int64
+    if (programIndex & 1) { // odd programIndex values only
+        RET[programIndex] = exclusive_scan_add(a);
+    }
+}
+
+
+export void result(uniform float RET[]) { // reference output; presumably compared with f_f's RET by the harness
+    uniform int result[16] = { 0, 0, 0, 2, 0, 6, 0, 12, // only odd entries are read below; even entries are placeholders
+                               0, 20, 0, 30, 0, 42, 0, 56 }; // NOTE(review): odd entries assume aFOO[i] == i + 1 - confirm
+    if (programIndex & 1) // mirrors the mask in f_f
+        RET[programIndex] = result[programIndex]; 
+    else
+        RET[programIndex] = -1; // matches the default written by f_f
+}
--- a/tests/exclusive-scan-add-2.ispc
+++ b/tests/exclusive-scan-add-2.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; } // returns programCount
+
+export void f_f(uniform float RET[], uniform float aFOO[]) { // test: exclusive_scan_add of the float input, no mask
+    RET[programIndex] = exclusive_scan_add(aFOO[programIndex]);
+}
+
+export void result(uniform float RET[]) { // reference output; presumably compared with f_f's RET by the harness
+    uniform int result[] = { 0, 1, 3, 6, 10, 15, 21, 28, // entry i is i * (i + 1) / 2
+                             36, 45, 55, 66, 78, 91, 105, 120, 136 }; // NOTE(review): assumes aFOO[i] == i + 1 - confirm
+    RET[programIndex] = result[programIndex]; // each instance picks its own table entry
+}
--- a/tests/exclusive-scan-add-3.ispc
+++ b/tests/exclusive-scan-add-3.ispc
@@ -0,0 +1,17 @@
+
+export uniform int width() { return programCount; } // returns programCount
+
+export void f_f(uniform float RET[], uniform float aFOO[]) { // test: exclusive_scan_add of floats under a data-dependent mask
+    RET[programIndex] = -1; // default for instances that skip the scan
+    float a = aFOO[programIndex];
+    if (a <= 2) // only instances whose input is at most 2 take part
+        RET[programIndex] = exclusive_scan_add(a);
+}
+
+export void result(uniform float RET[]) { // reference output; presumably compared with f_f's RET by the harness
+    uniform int result[] = { 0, 1, 3, 0, 0, 0, 0, 0, // only entries 0 and 1 are read below
+                             0, 0, 0, 0, 0, 0, 0, 0 };
+    RET[programIndex] = -1; // matches the default written by f_f
+    if (programIndex <= 1) // NOTE(review): assumes a <= 2 holds exactly for instances 0 and 1 - confirm
+        RET[programIndex] = result[programIndex]; 
+}
--- a/tests/exclusive-scan-add-4.ispc
+++ b/tests/exclusive-scan-add-4.ispc
@@ -0,0 +1,17 @@
+
+export uniform int width() { return programCount; } // returns programCount
+
+export void f_f(uniform float RET[], uniform float aFOO[]) { // NOTE(review): this file's content is identical to tests/exclusive-scan-add-3.ispc
+    RET[programIndex] = -1; // default for instances that skip the scan
+    float a = aFOO[programIndex];
+    if (a <= 2) // only instances whose input is at most 2 take part
+        RET[programIndex] = exclusive_scan_add(a);
+}
+
+export void result(uniform float RET[]) { // reference output; presumably compared with f_f's RET by the harness
+    uniform int result[] = { 0, 1, 3, 0, 0, 0, 0, 0, // only entries 0 and 1 are read below
+                             0, 0, 0, 0, 0, 0, 0, 0 };
+    RET[programIndex] = -1; // matches the default written by f_f
+    if (programIndex <= 1) // NOTE(review): assumes a <= 2 holds exactly for instances 0 and 1 - confirm
+        RET[programIndex] = result[programIndex]; 
+}
--- a/tests/exclusive-scan-add-5.ispc
+++ b/tests/exclusive-scan-add-5.ispc
@@ -0,0 +1,20 @@
+
+export uniform int width() { return programCount; } // returns programCount
+
+export void f_f(uniform float RET[], uniform float aFOO[]) { // test: exclusive_scan_add of floats under an odd-only mask
+    RET[programIndex] = -1; // default for instances that skip the scan
+    float a = aFOO[programIndex];
+    if (programIndex & 1) { // odd programIndex values only
+        RET[programIndex] = exclusive_scan_add(a);
+    }
+}
+
+
+export void result(uniform float RET[]) { // reference output; presumably compared with f_f's RET by the harness
+    uniform int result[16] = { 0, 0, 0, 2, 0, 6, 0, 12, // only odd entries are read below; even entries are placeholders
+                               0, 20, 0, 30, 0, 42, 0, 56 }; // NOTE(review): odd entries assume aFOO[i] == i + 1 - confirm
+    if (programIndex & 1) // mirrors the mask in f_f
+        RET[programIndex] = result[programIndex]; 
+    else
+        RET[programIndex] = -1; // matches the default written by f_f
+}
--- a/tests/exclusive-scan-add-6.ispc
+++ b/tests/exclusive-scan-add-6.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; } // returns programCount
+
+export void f_f(uniform float RET[], uniform float aFOO[]) { // test: exclusive_scan_add of programIndex cast to float, no mask
+    RET[programIndex] = exclusive_scan_add((float)programIndex); // aFOO is not used by this test
+}
+
+export void result(uniform float RET[]) { // reference output; presumably compared with f_f's RET by the harness
+    uniform int result[] = { 0, 0, 1, 3, 6, 10, 15, 21, 28, // entry i is 0 + 1 + ... + (i - 1)
+                             36, 45, 55, 66, 78, 91, 105, 120 }; // 17 entries in total
+    RET[programIndex] = result[programIndex]; // each instance picks its own table entry
+}
--- a/tests/exclusive-scan-add-7.ispc
+++ b/tests/exclusive-scan-add-7.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; } // returns programCount
+
+export void f_f(uniform float RET[], uniform float aFOO[]) { // test: exclusive_scan_add of the input cast to double, no mask
+    RET[programIndex] = exclusive_scan_add((double)aFOO[programIndex]);
+}
+
+export void result(uniform float RET[]) { // reference output; presumably compared with f_f's RET by the harness
+    uniform int result[] = { 0, 1, 3, 6, 10, 15, 21, 28, // entry i is i * (i + 1) / 2
+                             36, 45, 55, 66, 78, 91, 105, 120, 136 }; // NOTE(review): assumes aFOO[i] == i + 1 - confirm
+    RET[programIndex] = result[programIndex]; // each instance picks its own table entry
+}
--- a/tests/exclusive-scan-add-8.ispc
+++ b/tests/exclusive-scan-add-8.ispc
@@ -0,0 +1,17 @@
+
+export uniform int width() { return programCount; } // returns programCount
+
+export void f_f(uniform float RET[], uniform float aFOO[]) { // test: exclusive_scan_add on int64 under a data-dependent mask
+    RET[programIndex] = -1; // default for instances that skip the scan
+    int64 a = aFOO[programIndex]; // float input converted to int64
+    if (a <= 2) // only instances whose input is at most 2 take part
+        RET[programIndex] = exclusive_scan_add(a);
+}
+
+export void result(uniform float RET[]) { // reference output; presumably compared with f_f's RET by the harness
+    uniform int result[] = { 0, 1, 3, 0, 0, 0, 0, 0, // only entries 0 and 1 are read below
+                             0, 0, 0, 0, 0, 0, 0, 0 };
+    RET[programIndex] = -1; // matches the default written by f_f
+    if (programIndex <= 1) // NOTE(review): assumes a <= 2 holds exactly for instances 0 and 1 - confirm
+        RET[programIndex] = result[programIndex]; 
+}
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Matt Pharr	8ad28a3f6f	update doxygen, release notes for 1.0.8 release	2011-09-19 15:22:25 -07:00
Matt Pharr	9921b8e530	Predicated 'if' statement performance improvements. Go back to running both sides of 'if' statements with masking and without branching if we can determine that the code is relatively simple (as per the simple cost model), and is safe to run even if the mask is 'all off'. This gives a bit of a performance improvement for some of the examples (most notably, the ray tracer), and is the code that one wants generated in this case anyhow.	2011-09-19 09:54:09 -07:00
Matt Pharr	9052d4b10b	Linux build fixes	2011-09-17 13:42:46 -07:00
Matt Pharr	2405dae8e6	Use malloc() to get space for task arguments when compiling to AVX. This is to work around the LLVM bug/limitation discussed in LLVM bug 10841 (http://llvm.org/bugs/show_bug.cgi?id=10841).	2011-09-17 13:38:51 -07:00
Matt Pharr	3607f3e045	Remove support for building with LLVM 2.8. Fixes issue #66 . Both 2.9 and top-of-tree generate substantially better code than LLVM 2.8 did, so it's not worth fixing the 2.8 build.	2011-09-17 13:18:59 -07:00
Matt Pharr	de84acfa5d	On OSX with LLVM 2.9, always generate position-independent code. Fixes Issue #99.	2011-09-17 13:03:51 -07:00
Matt Pharr	a501ab1aa6	Fix parenthesization bugs in cost estimates. Also added the debugging print that helped find these issues. Revert inlining some functions in examples	2011-09-16 19:07:07 -07:00
Matt Pharr	cdc850f98c	Inline some functions in examples	2011-09-16 17:02:21 -07:00
Matt Pharr	ca87579f23	Add a very simple cost model to estimate runtime cost of running code. This is currently only used to decide whether it's worth doing an "are all lanes running" check at the start of functions--for small functions, it's not worth the overhead. The cost is estimated relatively early in compilation (e.g. before we know if an array access is a scatter/gather or not, before constant folding, etc.), so there are many known shortcomings.	2011-09-16 15:09:17 -07:00
Matt Pharr	38fc13d1ab	Remove now unused function.	2011-09-16 14:21:13 -07:00
Matt Pharr	cf9d9f717e	Logic simplification to 'mixed true/false' case for coherent ifs. Use the approach from `173632f446` here as well.	2011-09-16 14:10:55 -07:00
Matt Pharr	173632f446	Generate more efficient code for regular varying 'if' statements. For the case where we have a regular (i.e. non-'cif') 'if' statement, the generated code just simply checks to see if any program instance is running before running the corresponding statements. This is a lighter-weight check than IfStmt::emitMaskMixed() was performing.	2011-09-16 12:03:42 -07:00
Matt Pharr	1dedd88132	Improve implementation of 'are both masks equal' check for AVX. Previously, we did a vector equal compare and then a movmsk, the result of which we checked to see if it was on for all lanes. Because masks are vectors of i32s, under AVX, the vector equal compare required two 4-wide SSE compares and some shuffling. Now, we do a movmsk of both masks first and then a scalar equality comparison of those two values, which seems to generate overall better code.	2011-09-15 06:25:02 -07:00
Matt Pharr	0848c2cc19	Actually make all 'if' statements check for 'all off' mask. Contrary to claims in `0c2048385`, that checkin didn't include the changes to not run if/else blocks if none of the program instances wanted to be running them. This checkin fixes that and thus actually fixes issue #74.	2011-09-13 19:48:04 -07:00
Matt Pharr	e2a88d491f	Mark the internal __fast_masked_vload function as static	2011-09-13 15:43:48 -07:00
Matt Pharr	30f9dcd4f5	Unroll loops by default, add --opt=disable-loop-unroll to disable. Issue #78.	2011-09-13 15:37:18 -07:00
Matt Pharr	0c344b6755	Fix Linux build of mandelbrot_tasks example	2011-09-13 15:17:30 -07:00
Matt Pharr	6734021520	Issue warning when compile-time constant out-of-bounds array index is used. Issue #98. Also fixes two examples that had bugs of this type that this warning uncovered!	2011-09-13 14:42:20 -07:00
Matt Pharr	dd153d3c5c	Handle more instruction types when flattening offset vectors. Generalize the lScalarizeVector() utility routine (used in determining when we can change gathers/scatters into vector loads/stores, respectively) to handle vector shuffles and vector loads. This fixes issue #79, which provided a case where a gather was being performed even though a vector load was possible.	2011-09-13 09:43:56 -07:00
Matt Pharr	9ca7541d52	Remove check for any program instances running before function calls. Given the change in `0c20483853`, this is no longer necessary, since we know that one instance will always be running if we're executing a given block of code.	2011-09-13 06:26:16 -07:00
Matt Pharr	0c20483853	Make all "if" statements "coherent" ifs. Workaround for issue #74 . Using blend to do masked stores is unsafe if all of the lanes are off: it may read from or write to invalid memory. For now, this workaround transforms all 'if' statements into coherent 'if's, ensuring that an instruction only runs if at least one program instance wants to be running it. One nice thing about this change is that a number of implementations of various builtins can be simplified, since they no longer need to confirm that at least one program instance is running. It might be nice to re-enable regular if statements in a future checkin, but we'd want to make sure they don't have any masked loads or blended masked stores in their statement lists. There isn't a performance impact for any of the examples with this change, so it's unclear if this is important. Note that this only impacts 'if' statements with a varying condition.	2011-09-12 16:25:08 -07:00
Matt Pharr	9d4ff1bc06	Fix alignment in usage message	2011-09-12 15:06:41 -07:00
Matt Pharr	83f22f1939	Add experimental --fast-masked-vload flag for SSE.	2011-09-12 12:29:33 -07:00
Matt Pharr	6375ed9224	AVX: Fix bug with misdeclaration of blend intrinsic. This was preventing the "convert an all-on blend to one of the operand values" optimization from kicking on in AVX.	2011-09-12 06:42:38 -07:00
Matt Pharr	cf23cf9ef4	Fix typo in user guide. Issue #96	2011-09-12 05:24:32 -07:00
Matt Pharr	1147b53dcd	Add #define with target vector width in emitted headers	2011-09-09 09:33:56 -07:00
Matt Pharr	4cf831a651	When --fast-math is enabled, tell LLVM about it, too.	2011-09-09 09:32:59 -07:00
Matt Pharr	785d8a29d3	Run mem2reg pass even when doing -O0 compiles	2011-09-09 09:24:43 -07:00
Matt Pharr	46d2bad231	Fix malformed program crash	2011-09-09 09:24:43 -07:00
Matt Pharr	32da8e11b4	Fix crash with varying global vector types when emitting header file.	2011-09-09 09:16:59 -07:00
Matt Pharr	5dedb6f836	Add --scale command line argument to mandelbrot and rt examples. This applies a floating-point scale factor to the image resolution; it's useful for experiments with many-core systems where the base image resolution may not give enough work for good load-balancing with tasks.	2011-09-07 20:07:51 -07:00
Matt Pharr	2ea6d249d5	Fix mapping to 8, 16 program instances in AO bench example. With this, we now compute a correct image with AVX.	2011-09-07 11:34:24 -07:00
Matt Pharr	c86128e8ee	AVX: go back to using blend (vs. masked store) when possible. All of the masked store calls were inhibiting putting values into registers, which in turn led to a lot of unnecessary stack traffic. This approach seems to give better code in the end.	2011-09-07 11:26:49 -07:00
Matt Pharr	375f1cb8e8	Make octaves and octaves loop uniform in noise example	2011-09-07 10:34:23 -07:00
Matt Pharr	3ca7b6b078	Remove MCJIT stuff from ispc_test (fix Linux build)	2011-09-07 09:44:27 -07:00
Matt Pharr	effe901890	Add task-parallel version of aobench	2011-09-07 05:43:21 -07:00
Matt Pharr	4f451bd041	More AVX fixes Fix RNG state initialization for 16-wide targets Fix a number of bugs in reduce_add builtin implementations for AVX. Fix some tests that had incorrect expected results for the 16-wide case.	2011-09-06 15:53:11 -07:00
Matt Pharr	c76ef7b174	Add command-line option to specify position-independent codegen	2011-09-06 11:12:43 -07:00
Matt Pharr	743d82e935	Various documentation updates.	2011-09-06 09:51:02 -07:00
Matt Pharr	18546e9c6d	Add option to disable optimizations to test running script	2011-09-04 18:09:00 -07:00
Matt Pharr	f24ab16b91	Release notes, doxygen update for 1.0.7 release.	2011-09-03 07:33:39 -07:00
Matt Pharr	766b34683c	Fix Windows build	2011-09-03 07:23:16 -07:00
Matt Pharr	b5bfa43e92	Fix error with float suffixes	2011-09-02 13:09:25 -07:00
Matt Pharr	99221f7d17	Fix a few places in examples where C reference implementation had a double-precision fp constant undesirably causing computation to be done in double precision. Makes C scalar versions of the options pricing models, rt, and aobench 3-5% faster. Makes scalar version of noise about 15% faster. Others are unchanged.	2011-09-01 16:31:22 -07:00
Matt Pharr	eb7913f1dd	AVX: fix alignment when changing masked load to regular load. Also added some debugging/tracing stuff (commented out). Commented out iffy assert that was hitting for avx stuff.	2011-09-01 15:45:49 -07:00
Matt Pharr	08cad7a665	AVX bugfixes	2011-09-01 14:23:10 -07:00
Matt Pharr	9cd92facbd	Fix test: was incorrectly failing for 8-wide targets	2011-09-01 05:03:49 -07:00
Matt Pharr	85063f493c	Revert attempt to be clever about which LLVM libraries to link in--just link all of them. (This was causing build problems for some folks.)	2011-09-01 05:02:44 -07:00
Matt Pharr	f65a20c700	AVX bugfix: when replacing 'all on' masked store with a store, the rvalue is operand 2, not operand 1 (which is the mask!)	2011-08-31 18:06:29 -07:00
Matt Pharr	e144724979	Improve performance of global atomics, taking advantage of associativity. For associative atomic ops (add, and, or, xor), we can take advantage of their associativity to do just a single hardware atomic instruction, rather than one for each of the running program instances (as the previous implementation did.) The basic approach is to locally compute a reduction across the active program instances with the given op and to then issue a single HW atomic with that reduced value as the operand. We then take the old value that was stored in the location that is returned from the HW atomic op and use that to compute the values to return to each of the program instances (conceptually representing the cumulative effect of each of the preceding program instances having performed their atomic operation.) Issue #56.	2011-08-31 05:35:01 -07:00
Matt Pharr	96a297c747	Small improvements to help output	2011-08-30 14:48:22 -07:00
Matt Pharr	67e00b97c6	Fix incorrect assertions in ConstExpr constructors	2011-08-30 11:08:53 -07:00
Matt Pharr	a94cabc692	Modify stencil example to do separate runs with and without task parallelism.	2011-08-30 05:08:21 -07:00
Matt Pharr	ad9e66650d	AVX bugfix with alignment for store instructions. When replacing 'all on' masked store with regular store, set alignment to be the vector element alignment, not the alignment for a whole vector. (i.e. 4 or 8 byte alignment, not 32 or 64).	2011-08-29 16:58:48 -07:00
Matt Pharr	6de494cfdb	Fix AVX bug introduced in `4ab982bc16`	2011-08-29 16:50:59 -07:00
Matt Pharr	58e34ba4ae	Add new test-driver script, run_tests.py. Old run_tests.sh still lives (for now). Changes include: - Tests are run in parallel across all of the available CPU cores - Option to create a statically-linked executable for each test (rather than using the LLVM JIT). This is in particular useful for AVX, which doesn't have good JIT support yet. - Static executables also makes it possible to test x86, not just x86-64, codegen. - Fixed a number of tests in failing_tests, which were actually failing due to the fact that the expected function signature of tests had changed.	2011-08-29 14:15:09 -07:00
Matt Pharr	33feeffe5d	Update timing header so it works with C code	2011-08-29 11:23:43 -07:00
Matt Pharr	d0db46aac5	Use logical shift right op for shifts of unsigned ints. Fixes issue #88 .	2011-08-29 10:32:26 -07:00
Matt Pharr	da76396c75	Fix typo in SSE2 attributes string.	2011-08-27 08:59:25 -07:00
Matt Pharr	bbf3fb6307	Disable popcnt on SSE4 targets--should only enable if system CPU supports it	2011-08-27 04:09:55 -07:00
Matt Pharr	4ab982bc16	Various AVX fixes (found by inspection). Emit calls to masked_store, not masked_store_blend, when handling masked stores emitted by the frontend. Fix bug in binary8to16 macro in builtins.m4 Fix bug in 16-wide version of __reduce_add_float Remove blend function implementations for masked_store_blend for AVX; just forward those on to the corresponding real masked store functions.	2011-08-26 12:58:02 -07:00
Matt Pharr	34301e09f5	Fix incorrect comment in builtins definitions files. (And all of the places it was cut and pasted to. :-( ).	2011-08-26 10:44:46 -07:00
Matt Pharr	84e586e767	Commit correct atomics tests	2011-08-26 10:43:30 -07:00
Matt Pharr	72a2f5d2f4	Make SSE2 __popcnt_int64 return i64 to be consistent with other targets	2011-08-26 10:42:12 -07:00
Matt Pharr	606cbab0d4	Performance improvements for global min/max atomics. Issue #57 . Compute a "local" min/max across the active program instances and then do a single atomic memory op. Added a few tests to exercise global min/max atomics (which were previously untested!)	2011-08-26 10:35:24 -07:00
Matt Pharr	54ec56c81d	Clean up and centralize LLVM target initialization	2011-08-26 10:15:33 -07:00
Matt Pharr	a322398c62	When emitting header files, put 'extern' declarations of globals used in ispc code outside of the ispc namespace. Fixes issue #64.	2011-08-26 10:03:06 -07:00
Matt Pharr	f22b3a25bd	Update command-line processing and usage string now that we have a preprocessor on Windows. We had been prohibiting Windows users from providing #definitions on the command line, which is the wrong thing to do ever since we switched to using the clang preprocessor.	2011-08-26 09:58:08 -07:00
Matt Pharr	b67498766e	Big rewrite / improvement of target handling. If no CPU is specified, use the host CPU type, not just a default of "nehalem". Provide better features strings to the LLVM target machinery. -> Thus ensuring that LLVM doesn't generate SSE>2 instructions for the SSE2 target (Fixes issue #82). -> Slight code improvements from using cmovs in generated code now Use the llvm popcnt intrinsic for the SSE2 target now (it now generates code that doesn't call the popcnt instruction now that we properly tell LLVM which instructions are and aren't available for SSE2.)	2011-08-26 09:54:45 -07:00
Matt Pharr	c340ff3893	Fixes to build with LLVM ToT	2011-08-25 08:53:56 +01:00
Matt Pharr	b0f59777d4	Silly bug: don't pass NULL to the print() stmt when we want a llvm::Value * that has the value NULL. (This was causing crashes with print() statements with no additional values to be printed.)	2011-08-25 07:48:13 +01:00
Matt Pharr	e14208f489	Update to call DIBuilder::finalize() with LLVM 3.0	2011-08-24 22:28:20 +01:00
Matt Pharr	7756265503	Add double-pumped AVX target (i.e., run 16-wide). Not yet tested.	2011-08-20 11:28:22 +01:00
Matt Pharr	f841b775c3	Small bugfixes in AVX builtins	2011-08-20 09:09:55 +01:00
Matt Pharr	8c921544a0	fix broken test	2011-08-18 20:40:50 +01:00
Matt Pharr	fe54f1ad8e	Fixes to build with latest LLVM ToT	2011-08-18 08:34:49 +01:00
Matt Pharr	74c2c8ae07	Linux build fixes	2011-08-17 07:08:44 -07:00
Matt Pharr	87ec7aa10d	release notes, housekeeping for 1.0.6 release	2011-08-17 14:55:21 +01:00
Matt Pharr	206c851146	Various improvements to example task systems in examples/. - Only have a single copy of all of the tasks_*.cpp sample implementations, stored in examples/. - Reduce dynamic storage allocation and locking in task launch code paths. - Don't have a hard limit of the number of tasks that can be launched on Windows (fix issue #85).	2011-08-17 14:31:45 +01:00
Matt Pharr	60bdf1ef8a	Modify rt example to also do a set of runs with tasks + SPMD together.	2011-08-17 13:14:32 +01:00
Matt Pharr	d7662b3eb9	Use reduce_equal() in volume rendering example to avoid some gathers. Modified this example to use reduce_equal() to see if all of the program instances want to load the 8 sample values around the same voxel. When this is the case, we can just do 8 scalar loads, rather than needing to do a fully general gather. Once this check fails, it isn't done again, since it's not likely to start succeeding in the future. This gives a ~10% speedup with the low-res data set, and basically no performance difference with the high-res one. (It makes sense that the lower-resolution the voxel sampling, the longer all of the rays will stay in the same set of voxels.)	2011-08-17 12:37:07 +01:00
Matt Pharr	ecaa57c7c6	Add volume rendering example. (~2.3x speedup from SIMD vs serial code.)	2011-08-17 12:05:37 +01:00
Matt Pharr	fce183c244	Merge branch 'master' of github.com:ispc/ispc	2011-08-17 10:32:49 +01:00
Matt Pharr	7a92f8b3f9	Add MSVC build support for stencil example	2011-08-17 02:28:49 -07:00
Matt Pharr	96af08e789	Print notices about image files being written	2011-08-16 06:31:26 +01:00
Matt Pharr	cb29c10660	Fix tests on Windows: need arch=x86 since ispc_test.exe is a 32-bit app	2011-08-15 08:25:08 -07:00
Matt Pharr	04c93043d6	Target handling fixes. Set the Module's target appropriately when it's first created. Compile separate 32 and 64 bit versions of the builtins-c bitcode and load the appropriate one based on the target we're compiling for.	2011-08-15 16:03:50 +01:00
Matt Pharr	46037c7a11	Merge branch 'master' of github.com:ispc/ispc	2011-08-15 12:44:38 +01:00
Matt Pharr	c570108026	Fix linux build of stencil example	2011-08-15 04:44:17 -07:00
Matt Pharr	230a0fadea	Attempt to generate debug info for task parameters.	2011-08-15 12:31:56 +01:00
Matt Pharr	87cf05e0d2	Improve performance of 64-bit reduce_equal implementations. Just pulling out the elements and doing a set of scalar equality tests is the best approach for those (nearly 2x better than the rotate and vector equality check that we use for 32-bit stuff).	2011-08-14 07:39:05 +01:00
Matt Pharr	ff608eef71	Change reduce_equal to return false if no instances are executing	2011-08-14 07:11:45 +01:00
Matt Pharr	f868a63064	Add support for scan operations across program instances (add, and, or).	2011-08-13 20:11:41 +01:00
Matt Pharr	c74116aa24	Fix crasher with malformed program	2011-08-12 07:47:17 +01:00
Matt Pharr	8c534d4d74	Add reduce_equal() function to standard library.	2011-08-10 15:55:55 -07:00
Matt Pharr	d821a11c7c	Fix min/max for integer types with AVX.	2011-08-04 06:24:20 -07:00
Matt Pharr	8a138eeb5a	vim syntax highlighting for ispc from <andreas.wendleder@googlemail.com>	2011-08-04 05:49:28 -07:00
Matt Pharr	137ea7bde6	Rename semaphore filename to be more generic	2011-08-04 05:28:00 -07:00
Matt Pharr	e05b3981d9	Add stencil example	2011-08-03 13:49:02 -07:00
Matt Pharr	a5a133ccce	Do more iterations of RNG test to let result converge to bounds.	2011-08-03 13:44:49 -07:00
Matt Pharr	0ac4f7b620	Add various prefetch functions to the standard library.	2011-08-03 13:31:45 -07:00
Matt Pharr	467f1e71d7	Add fast versions of the float<-->half conversion routines in the stdlib. These get slightly wrong results for zero and the denorms and also don't handle the Inf/NaN stuff correctly, but are much more efficient than the full versions of these routines.	2011-08-03 15:58:42 +01:00
Matt Pharr	a2996ed5d9	More efficient implementation of frandom() in stdlib	2011-08-03 14:28:06 +01:00
Matt Pharr	7d7dd2b204	Merge branch 'master' of github.com:ispc/ispc	2011-08-01 12:16:33 +01:00
Matt Pharr	9ee6f86c73	Fix Windows build of ispc_test	2011-08-01 04:05:37 -07:00