Release notes and doxygen bump for v1.0.10

Added deferred shading workload
Added updated task launch implementation that now tracks task groups.
2011-09-30 15:09:19 -07:00 · 2011-09-30 15:09:04 -07:00 · 2011-09-30 11:20:53 -07:00 · 2011-09-30 11:11:52 -07:00 · 2011-09-29 16:19:59 -07:00 · 2011-09-29 13:35:50 -07:00
131 changed files with 8123 additions and 2248 deletions
--- a/16
+++ b/16
@@ -10,7 +10,12 @@ CLANG_LIBS = -lclangFrontend -lclangDriver \
             -lclangSerialization -lclangParse -lclangSema \
             -lclangAnalysis -lclangAST -lclangLex -lclangBasic
-LLVM_LIBS=$(shell llvm-config --ldflags --libs) -lpthread -ldl
+ISPC_LIBS=$(CLANG_LIBS) \
 	$(shell llvm-config --ldflags --libs) \
 	-lpthread -ldl
 ISPC_TEST_LIBS=$(shell llvm-config --ldflags --libs) \
 	-lpthread -ldl
 LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
 LLVM_VERSION=$(shell llvm-config --version | sed s/\\./_/)
 LLVM_VERSION_DEF=-DLLVM_$(LLVM_VERSION)
@@ -44,7 +49,8 @@ CXX_SRC=builtins.cpp ctx.cpp decl.cpp expr.cpp ispc.cpp \
 	util.cpp
 HEADERS=builtins.h ctx.h decl.h expr.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
-BUILTINS_SRC=builtins-avx.ll builtins-sse2.ll builtins-sse4.ll builtins-sse4x2.ll
+BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll \
 	builtins-sse4.ll builtins-sse4x2.ll
 BISON_SRC=parse.yy
 FLEX_SRC=lex.ll
@@ -79,11 +85,11 @@ doxygen:
 ispc: print_llvm_src dirs $(OBJS)
 	@echo Creating ispc executable
-	@$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(CLANG_LIBS) $(LLVM_LIBS)
+	@$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)
 ispc_test: dirs ispc_test.cpp
 	@echo Creating ispc_test executable
-	@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $@ ispc_test.cpp $(LLVM_LIBS)
+	@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $@ ispc_test.cpp $(ISPC_TEST_LIBS)
 objs/%.o: %.cpp
 	@echo Compiling $<
@@ -105,7 +111,7 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
 	@echo Compiling $<
 	@$(CXX) $(CXXFLAGS) -o $@ -c $<
-objs/builtins-%.cpp: builtins-%.ll builtins.m4 builtins-sse.ll
+objs/builtins-%.cpp: builtins-%.ll builtins.m4 builtins-sse.ll builtins-avx-common.ll
 	@echo Creating C++ source from builtin definitions file $<
 	@m4 -DLLVM_VERSION=$(LLVM_VERSION) builtins.m4 $< | ./bitcode2cpp.py $< > $@
--- a/README.txt
+++ b/README.txt
@@ -15,8 +15,8 @@ code.
 ispc is an open source compiler under the BSD license; see the file
 LICENSE.txt.  ispc supports Windows, Mac, and Linux, with both x86 and
-x86-64 targets. It currently supports the SSE2 and SSE4 instruction sets,
+x86-64 targets.  It currently supports the SSE2, SSE4, and AVX instruction
-though support for AVX should be available soon.
+sets.
 For more information and examples, as well as a wiki and the bug database,
 see the ispc distribution site, http://ispc.github.com.
--- a/bitcode2cpp.py
+++ b/bitcode2cpp.py
@@ -4,6 +4,8 @@ import sys
 import string
 import re
 import subprocess
 import platform
 import os
 length=0
@@ -14,8 +16,12 @@ target = re.sub("\.ll$", "", target)
 target = re.sub("\.c$", "", target)
 target = re.sub("-", "_", target)
 llvm_as="llvm-as"
 if platform.system() == 'Windows' or string.find(platform.system(), "CYGWIN_NT") != -1:
    llvm_as = os.getenv("LLVM_INSTALL_DIR").replace("\\", "/") + "/bin/" + llvm_as
 try:
-    as_out=subprocess.Popen([ "llvm-as", "-", "-o", "-"], stdout=subprocess.PIPE)
+    as_out=subprocess.Popen([llvm_as, "-", "-o", "-"], stdout=subprocess.PIPE)
 except IOError:
    print >> sys.stderr, "Couldn't open " + src
    sys.exit(1)
--- a/builtins-avx-common.ll
+++ b/builtins-avx-common.ll
@@ -0,0 +1,278 @@
 ;;  Copyright (c) 2010-2011, Intel Corporation
 ;;  All rights reserved.
 ;;
 ;;  Redistribution and use in source and binary forms, with or without
 ;;  modification, are permitted provided that the following conditions are
 ;;  met:
 ;;
 ;;    * Redistributions of source code must retain the above copyright
 ;;      notice, this list of conditions and the following disclaimer.
 ;;
 ;;    * Redistributions in binary form must reproduce the above copyright
 ;;      notice, this list of conditions and the following disclaimer in the
 ;;      documentation and/or other materials provided with the distribution.
 ;;
 ;;    * Neither the name of Intel Corporation nor the names of its
 ;;      contributors may be used to endorse or promote products derived from
 ;;      this software without specific prior written permission.
 ;;
 ;;
 ;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 ;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 ;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 ;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
 ;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 ;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 ;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 ;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 ;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 ;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; *** Untested *** AVX target implementation.
 ;;
 ;; The LLVM AVX code generator is incomplete, so the ispc AVX target
 ;; hasn't yet been tested.  There is therefore a higher-than-normal
 ;; chance that there are bugs in the code in this file.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
 declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
 ; Reciprocal of a single float: take the rcp.ss estimate computed on lane 0
 ; and sharpen it with one Newton-Raphson step,
 ;   est * (2. - x * est)
 define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
  ; place the argument in lane 0; the other lanes are left undefined
  %packed = insertelement <4 x float> undef, float %0, i32 0
  %estvec = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %packed)
  %est = extractelement <4 x float> %estvec, i32 0
  ; Newton-Raphson refinement of the estimate
  %x_est = fmul float %0, %est
  %correction = fsub float 2., %x_est
  %refined = fmul float %est, %correction
  ret float %refined
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding floats
 declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
 ; Round a single float with roundss.
 ;
 ; Immediate 8 = round mode nearest 0b00 | don't signal precision
 ; exceptions 0b1000.
 ;
 ; The roundss intrinsic takes two vector operands; the docs for
 ;
 ;   __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
 ;
 ; describe the return value as
 ;
 ;   r0 = RND(b0)
 ;   r1 = a1
 ;   r2 = a2
 ;   r3 = a3
 ;
 ; Only r0 is needed here, so it doesn't matter what is passed as a and the
 ; same register is supplied for both operands.
 define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
  %lane0 = insertelement <4 x float> undef, float %0, i32 0
  %rounded = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %lane0, <4 x float> %lane0, i32 8)
  %result = extractelement <4 x float> %rounded, i32 0
  ret float %result
 }
 define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
  ; Scalar floor through roundss with immediate 9:
  ; round down 0b01 | don't signal precision exceptions 0b1000 -> 0b1001.
  ; Both vector operands are the same register; see the round_ss intrinsic
  ; discussion above.
  %lane0 = insertelement <4 x float> undef, float %0, i32 0
  %floored = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %lane0, <4 x float> %lane0, i32 9)
  %result = extractelement <4 x float> %floored, i32 0
  ret float %result
 }
 define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
  ; Scalar ceil through roundss with immediate 10:
  ; round up 0b10 | don't signal precision exceptions 0b1000 -> 0b1010.
  ; Both vector operands are the same register; see the round_ss intrinsic
  ; discussion above.
  %lane0 = insertelement <4 x float> undef, float %0, i32 0
  %raised = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %lane0, <4 x float> %lane0, i32 10)
  %result = extractelement <4 x float> %raised, i32 0
  ret float %result
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding doubles
 declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
 define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
  ; Scalar double rounding through roundsd with immediate 8, the same
  ; immediate the float version uses for round mode nearest without
  ; precision exceptions.  The argument sits in lane 0 of both operands.
  %lane0 = insertelement <2 x double> undef, double %0, i32 0
  %rounded = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %lane0, <2 x double> %lane0, i32 8)
  %result = extractelement <2 x double> %rounded, i32 0
  ret double %result
 }
 define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
  ; Scalar double floor through roundsd with immediate 9:
  ; round down 0b01 | don't signal precision exceptions 0b1000 -> 0b1001.
  ; See the round_ss intrinsic discussion above for the operand layout.
  %lane0 = insertelement <2 x double> undef, double %0, i32 0
  %floored = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %lane0, <2 x double> %lane0, i32 9)
  %result = extractelement <2 x double> %floored, i32 0
  ret double %result
 }
 define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
  ; Scalar double ceil through roundsd with immediate 10:
  ; round up 0b10 | don't signal precision exceptions 0b1000 -> 0b1010.
  ; See the round_ss intrinsic discussion above for the operand layout.
  %lane0 = insertelement <2 x double> undef, double %0, i32 0
  %raised = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %lane0, <2 x double> %lane0, i32 10)
  %result = extractelement <2 x double> %raised, i32 0
  ret double %result
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rsqrt
 declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
 ; Reciprocal square root of a single float: the rsqrt.ss estimate computed
 ; on lane 0 is refined as
 ;   0.5 * est * (3. - (x * est) * est)
 define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
  ; place the argument in lane 0 and fetch the estimate
  %packed = insertelement <4 x float> undef, float %0, i32 0
  %estvec = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %packed)
  %est = extractelement <4 x float> %estvec, i32 0
  ; refinement step
  %x_est = fmul float %0, %est
  %x_est_est = fmul float %x_est, %est
  %three_minus = fsub float 3., %x_est_est
  %est_scaled = fmul float %est, %three_minus
  %refined = fmul float 0.5, %est_scaled
  ret float %refined
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; sqrt
 declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
 ; Square root of a single float through the sqrt.ss intrinsic.
 ; NOTE(review): the m4 helper invoked on the first body line is defined in
 ; another file; it is expected to apply the 4-wide intrinsic to the scalar
 ; argument and bind the scalar result to %ret -- confirm against the macro
 ; definitions.
 define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
  sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
  ret float %ret
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; fastmath
 declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
 declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
 ; Read the MXCSR register, set the DAZ and FTZ bits, and write it back.
 define internal void @__fastmath() nounwind alwaysinline {
  ; stack slot used to move the register contents through memory
  %csr_slot = alloca i32
  %csr_slot8 = bitcast i32 * %csr_slot to i8 *
  call void @llvm.x86.sse.stmxcsr(i8 * %csr_slot8)
  %csr_old = load i32 * %csr_slot
  ; DAZ is 64 and FTZ is 32768; together 32832
  %csr_new = or i32 %csr_old, 32832
  store i32 %csr_new, i32 * %csr_slot
  call void @llvm.x86.sse.ldmxcsr(i8 * %csr_slot8)
  ret void
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float min/max
 declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
 declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
 ; Scalar float maximum of the two arguments, computed with max.ss.
 ; NOTE(review): the m4 helper in the body is defined in another file; it is
 ; expected to run the 4-wide intrinsic on the two scalars and bind the
 ; scalar result to %ret -- confirm.
 define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
  ret float %ret
 }
 ; Scalar float minimum of the two arguments, computed with min.ss in the
 ; same way as the maximum above.
 define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
  ret float %ret
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int min/max
 declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
 declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
 ; Scalar int32 minimum of the two arguments, computed with the 4-wide
 ; pminsd intrinsic.
 ; NOTE(review): the m4 helper in the body is defined in another file; it is
 ; expected to bind the scalar result to %ret -- confirm.
 define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
  ret i32 %ret
 }
 ; Scalar int32 maximum of the two arguments, computed with the 4-wide
 ; pmaxsd intrinsic in the same way.
 define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
  ret i32 %ret
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unsigned int min/max
 declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
 declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
 ; Scalar unsigned 32-bit minimum of the two arguments, computed with the
 ; 4-wide pminud intrinsic.
 ; NOTE(review): the m4 helper in the body is defined in another file; it is
 ; expected to bind the scalar result to %ret -- confirm.
 define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
  ret i32 %ret
 }
 ; Scalar unsigned 32-bit maximum of the two arguments, computed with the
 ; 4-wide pmaxud intrinsic in the same way.
 define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
  ret i32 %ret
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops
 declare i32 @llvm.ctpop.i32(i32) nounwind readnone
 ; Population count of a 32-bit value via the ctpop intrinsic.
 define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
  %nbits = call i32 @llvm.ctpop.i32(i32 %0)
  ret i32 %nbits
 }
 declare i64 @llvm.ctpop.i64(i64) nounwind readnone
 ; Population count of a 64-bit value via the ctpop intrinsic.
 define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
  %nbits = call i64 @llvm.ctpop.i64(i64 %0)
  ret i64 %nbits
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; double precision sqrt
 declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
 ; Square root of a single double through the sqrt.sd intrinsic.  The scalar
 ; double-precision sqrt intrinsic belongs to the sse2 namespace, like the
 ; sse2 min.sd and max.sd intrinsics this file uses for doubles; under the
 ; sse prefix the name is not a known intrinsic.
 ; NOTE(review): the m4 helper in the body is defined in another file; it is
 ; expected to run the 2-wide intrinsic on the scalar argument and bind the
 ; scalar result to %ret -- confirm.
 define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
  sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
  ret double %ret
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; double precision min/max
 declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
 declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
 ; Scalar double minimum of the two arguments, computed with min.sd.
 ; NOTE(review): the m4 helper in the body is defined in another file; it is
 ; expected to run the 2-wide intrinsic on the two scalars and bind the
 ; scalar result to %ret -- confirm.
 define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
  ret double %ret
 }
 ; Scalar double maximum of the two arguments, computed with max.sd in the
 ; same way.
 define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
  ret double %ret
 }
--- a/builtins-avx-x2.ll
+++ b/builtins-avx-x2.ll
@@ -0,0 +1,665 @@
 ;;  Copyright (c) 2010-2011, Intel Corporation
 ;;  All rights reserved.
 ;;
 ;;  Redistribution and use in source and binary forms, with or without
 ;;  modification, are permitted provided that the following conditions are
 ;;  met:
 ;;
 ;;    * Redistributions of source code must retain the above copyright
 ;;      notice, this list of conditions and the following disclaimer.
 ;;
 ;;    * Redistributions in binary form must reproduce the above copyright
 ;;      notice, this list of conditions and the following disclaimer in the
 ;;      documentation and/or other materials provided with the distribution.
 ;;
 ;;    * Neither the name of Intel Corporation nor the names of its
 ;;      contributors may be used to endorse or promote products derived from
 ;;      this software without specific prior written permission.
 ;;
 ;;
 ;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 ;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 ;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 ;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
 ;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 ;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 ;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 ;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 ;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 ;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; *** Untested *** AVX target implementation.
 ;;
 ;; The LLVM AVX code generator is incomplete, so the ispc AVX target
 ;; hasn't yet been tested.  There is therefore a higher-than-normal
 ;; chance that there are bugs in the code in this file.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Basic 16-wide definitions
 ; Instantiate the generic m4-provided definitions for a width of 16, then
 ; pull in the routines from the common AVX file.
 ; NOTE(review): what each of these helpers expands to is defined in another
 ; file and is not visible here.
 stdlib_core(16)
 packed_load_and_store(16)
 scans(16)
 int64minmax(16)
 include(`builtins-avx-common.ll')
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
 declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
 ; 16-wide reciprocal: an estimate obtained from the 8-wide rcp.ps.256
 ; intrinsic, followed by one Newton-Raphson step applied to every lane.
 ; NOTE(review): the m4 helper on the first code line of the body is defined
 ; in another file; it is expected to apply the 8-wide intrinsic to both
 ; halves of %0 and bind the 16-wide result to %call -- confirm.
 define internal <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
  ;  float iv = __rcp_v(v);
  ;  return iv * (2. - v * iv);
  unary8to16(call, float, @llvm.x86.avx.rcp.ps.256, %0)
  ; do one N-R iteration
  %v_iv = fmul <16 x float> %0, %call
  %two_minus = fsub <16 x float> <float 2., float 2., float 2., float 2.,
                                  float 2., float 2., float 2., float 2.,
                                  float 2., float 2., float 2., float 2.,
                                  float 2., float 2., float 2., float 2.>, %v_iv  
  %iv_mul = fmul <16 x float> %call, %two_minus
  ret <16 x float> %iv_mul
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding floats
 declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
 ; 16-wide float round, floor and ceil.  Each body is a single m4 helper call
 ; that receives the input vector and the rounding immediate.
 ; NOTE(review): the helper is defined in another file; since the bodies
 ; contain no ret of their own, it presumably emits the 8-wide round.ps.256
 ; calls and the return -- confirm.
 define internal <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
  round8to16(%0, 8)
 }
 define internal <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 -> 0b1001 = 9
  round8to16(%0, 9)
 }
 define internal <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 -> 0b1010 = 10
  round8to16(%0, 10)
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding doubles
 declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
 ; 16-wide double round, floor and ceil, using the same immediates as the
 ; float versions: 8 for nearest, 9 for down, 10 for up.
 ; NOTE(review): the m4 helper in each body is defined in another file; since
 ; the bodies contain no ret of their own, it presumably emits the 4-wide
 ; round.pd.256 calls and the return -- confirm.
 define internal <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline {
  round4to16double(%0, 8)
 }
 define internal <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline {
  round4to16double(%0, 9)
 }
 define internal <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline {
  round4to16double(%0, 10)
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rsqrt
 declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
 ; 16-wide reciprocal square root: an estimate obtained from the 8-wide
 ; rsqrt.ps.256 intrinsic, refined per lane with the formula quoted in the
 ; body.
 ; NOTE(review): the m4 helper on the first code line of the body is defined
 ; in another file; it is expected to apply the 8-wide intrinsic to both
 ; halves of %v and bind the 16-wide result to %is -- confirm.
 define internal <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
  ;  float is = __rsqrt_v(v);
  unary8to16(is, float, @llvm.x86.avx.rsqrt.ps.256, %v)
  ;  return 0.5 * is * (3. - (v * is) * is);
  %v_is = fmul <16 x float> %v, %is
  %v_is_is = fmul <16 x float> %v_is, %is
  %three_sub = fsub <16 x float> <float 3., float 3., float 3., float 3.,
                                  float 3., float 3., float 3., float 3.,
                                  float 3., float 3., float 3., float 3.,
                                  float 3., float 3., float 3., float 3.>, %v_is_is
  %is_mul = fmul <16 x float> %is, %three_sub
  %half_scale = fmul <16 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
                                   float 0.5, float 0.5, float 0.5, float 0.5,
                                   float 0.5, float 0.5, float 0.5, float 0.5,
                                   float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
  ret <16 x float> %half_scale
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; sqrt
 declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
 ; 16-wide float square root built from the 8-wide sqrt.ps.256 intrinsic.
 ; NOTE(review): the m4 helper in the body is defined in another file; it is
 ; expected to apply the 8-wide intrinsic to both halves of %0 and bind the
 ; 16-wide result to %call -- confirm.
 define internal <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline {
  unary8to16(call, float, @llvm.x86.avx.sqrt.ps.256, %0)
  ret <16 x float> %call
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; svml
 ; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
 ; or, use the macro to call the 4-wide ones 4x with our 16-wide
 ; vectors...
 ; Until then these are declarations only, taking and returning 16-wide
 ; float vectors; the sincos entry returns through its two pointer arguments.
 declare <16 x float> @__svml_sin(<16 x float>)
 declare <16 x float> @__svml_cos(<16 x float>)
 declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *)
 declare <16 x float> @__svml_tan(<16 x float>)
 declare <16 x float> @__svml_atan(<16 x float>)
 declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>)
 declare <16 x float> @__svml_exp(<16 x float>)
 declare <16 x float> @__svml_log(<16 x float>)
 declare <16 x float> @__svml_pow(<16 x float>, <16 x float>)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float min/max
 declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
 declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
 ; 16-wide per-lane float maximum built from the 8-wide max.ps.256 intrinsic.
 ; NOTE(review): the m4 helper in the body is defined in another file; it is
 ; expected to apply the 8-wide intrinsic to matching halves of %0 and %1
 ; and bind the 16-wide result to %call -- confirm.
 define internal <16 x float> @__max_varying_float(<16 x float>,
                                                   <16 x float>) nounwind readonly alwaysinline {
  binary8to16(call, float, @llvm.x86.avx.max.ps.256, %0, %1)
  ret <16 x float> %call
 }
 ; 16-wide per-lane float minimum built from the 8-wide min.ps.256 intrinsic
 ; in the same way.
 define internal <16 x float> @__min_varying_float(<16 x float>,
                                                   <16 x float>) nounwind readonly alwaysinline {
  binary8to16(call, float, @llvm.x86.avx.min.ps.256, %0, %1)
  ret <16 x float> %call
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int min/max
 ; 16-wide per-lane int32 minimum built from the 4-wide pminsd intrinsic,
 ; which is declared in the common AVX file pulled in near the top.
 ; NOTE(review): the m4 helper in the body is defined in another file; it is
 ; expected to apply the 4-wide intrinsic to matching quarters of %0 and %1
 ; and bind the 16-wide result to %ret -- confirm.
 define internal <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
  ret <16 x i32> %ret
 }
 ; 16-wide per-lane int32 maximum built from the 4-wide pmaxsd intrinsic in
 ; the same way.
 define internal <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
  ret <16 x i32> %ret
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unsigned int min/max
 ; 16-wide per-lane unsigned 32-bit minimum built from the 4-wide pminud
 ; intrinsic, which is declared in the common AVX file pulled in near the top.
 ; NOTE(review): the m4 helper in the body is defined in another file; it is
 ; expected to apply the 4-wide intrinsic to matching quarters of %0 and %1
 ; and bind the 16-wide result to %ret -- confirm.
 define internal <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
  ret <16 x i32> %ret
 }
 ; 16-wide per-lane unsigned 32-bit maximum built from the 4-wide pmaxud
 ; intrinsic in the same way.
 define internal <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
  ret <16 x i32> %ret
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops
 declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
 ; Collapse a 16-lane i32 mask into one i32: movmsk.ps.256 is applied to each
 ; 8-lane half, and the result for the upper half is shifted left by 8 and
 ; OR-ed with the result for the lower half.
 define internal i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
  %asfloat = bitcast <16 x i32> %0 to <16 x float>
  ; lanes 0-7 and lanes 8-15 of the mask
  %lo_half = shufflevector <16 x float> %asfloat, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %hi_half = shufflevector <16 x float> %asfloat, <16 x float> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %lo_bits = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %lo_half) nounwind readnone
  %hi_bits = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %hi_half) nounwind readnone
  ; combine the two partial results
  %hi_shifted = shl i32 %hi_bits, 8
  %bits = or i32 %hi_shifted, %lo_bits
  ret i32 %bits
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal float ops
 declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
 ; Sum of the 16 floats in the argument.  The two 8-wide halves go through
 ; three rounds of hadd.ps.256, and elements 0 and 4 of the last result are
 ; added to form the return value.
 ; NOTE(review): reading elements 0 and 4 presumably picks up one partial sum
 ; per 128-bit half of the register -- confirm against the hadd.ps.256
 ; semantics.
 define internal float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
  ; lanes 0-7 and lanes 8-15 of the input
  %lo_half = shufflevector <16 x float> %0, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %hi_half = shufflevector <16 x float> %0, <16 x float> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; three rounds of horizontal adds
  %round1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %lo_half, <8 x float> %hi_half)
  %round2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %round1, <8 x float> %round1)
  %round3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %round2, <8 x float> %round2)
  ; add the two remaining partial sums
  %part_a = extractelement <8 x float> %round3, i32 0
  %part_b = extractelement <8 x float> %round3, i32 4
  %total = fadd float %part_a, %part_b
  ret float %total
 }
 ; Minimum over the 16 float lanes of the argument.  The m4 helper in the
 ; body is handed the element type plus the varying and uniform float
 ; minimum functions.
 ; NOTE(review): the helper is defined in another file; since the body
 ; contains no ret of its own, it presumably emits the whole reduction
 ; including the return -- confirm.
 define internal float @__reduce_min_float(<16 x float>) nounwind readnone alwaysinline {
  reduce16(float, @__min_varying_float, @__min_uniform_float)
 }
 ; Maximum over the 16 float lanes of the argument, built the same way from
 ; the varying and uniform float maximum functions.
 define internal float @__reduce_max_float(<16 x float>) nounwind readnone alwaysinline {
  reduce16(float, @__max_varying_float, @__max_uniform_float)
 }
 ; Instantiate the m4-provided equality-reduction definitions for a width of
 ; 16.  NOTE(review): the expansion is defined in another file.
 reduce_equal(16)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal int32 ops
 ; Per-lane sum of two 16-wide i32 vectors; passed to the add reduction below.
 define internal <16 x i32> @__add_varying_int32(<16 x i32>,
                                                 <16 x i32>) nounwind readnone alwaysinline {
  %lanesum = add <16 x i32> %0, %1
  ret <16 x i32> %lanesum
 }
 ; Sum of two i32 scalars; passed to the add reduction below.
 define internal i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
  %total = add i32 %0, %1
  ret i32 %total
 }
 ; Sum, minimum and maximum over the 16 i32 lanes of the argument.  Each body
 ; hands the m4 helper the element type plus the matching varying and uniform
 ; int32 operation.
 ; NOTE(review): the helper is defined in another file; since the bodies
 ; contain no ret of their own, it presumably emits the whole reduction
 ; including the return -- confirm.
 define internal i32 @__reduce_add_int32(<16 x i32>) nounwind readnone alwaysinline {
  reduce16(i32, @__add_varying_int32, @__add_uniform_int32)
 }
 define internal i32 @__reduce_min_int32(<16 x i32>) nounwind readnone alwaysinline {
  reduce16(i32, @__min_varying_int32, @__min_uniform_int32)
 }
 define internal i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
  reduce16(i32, @__max_varying_int32, @__max_uniform_int32)
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;; horizontal uint32 ops
 ; Add reduction for unsigned 32-bit lanes; it simply forwards to the signed
 ; int32 add reduction.
 define internal i32 @__reduce_add_uint32(<16 x i32> %v) nounwind readnone alwaysinline {
  %total = call i32 @__reduce_add_int32(<16 x i32> %v)
  ret i32 %total
 }
 ; Minimum and maximum over the 16 unsigned 32-bit lanes of the argument,
 ; built from the varying and uniform uint32 operations.
 ; NOTE(review): the m4 helper in each body is defined in another file; since
 ; the bodies contain no ret of their own, it presumably emits the whole
 ; reduction including the return -- confirm.
 define internal i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone alwaysinline {
  reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32)
 }
 define internal i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinline {
  reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32)
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal double ops
 declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
 ; Sum of the 16 doubles in the argument.  The four 4-wide quarters are added
 ; pairwise, the two results go through two rounds of hadd.pd.256, and
 ; elements 0 and 2 of the last result are added to form the return value.
 ; NOTE(review): reading elements 0 and 2 presumably picks up one partial sum
 ; per 128-bit half of the register -- confirm against the hadd.pd.256
 ; semantics.
 define internal double @__reduce_add_double(<16 x double>) nounwind readonly alwaysinline {
  ; carve the input into four 4-wide quarters
  %q0 = shufflevector <16 x double> %0, <16 x double> undef,
         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %q1 = shufflevector <16 x double> %0, <16 x double> undef,
         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %q2 = shufflevector <16 x double> %0, <16 x double> undef,
         <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %q3 = shufflevector <16 x double> %0, <16 x double> undef,
         <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  ; per-lane adds bring 16 values down to 8
  %q01 = fadd <4 x double> %q0, %q1
  %q23 = fadd <4 x double> %q2, %q3
  ; two rounds of horizontal adds
  %round1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %q01, <4 x double> %q23)
  %round2 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %round1, <4 x double> %round1)
  ; add the two remaining partial sums
  %part_a = extractelement <4 x double> %round2, i32 0
  %part_b = extractelement <4 x double> %round2, i32 2
  %total = fadd double %part_a, %part_b
  ret double %total
 }
 ; Minimum and maximum over the 16 double lanes of the argument, built from
 ; the varying and uniform double operations named in each body.
 ; NOTE(review): the m4 helper in each body is defined in another file; since
 ; the bodies contain no ret of their own, it presumably emits the whole
 ; reduction including the return -- confirm.
 define internal double @__reduce_min_double(<16 x double>) nounwind readnone alwaysinline {
  reduce16(double, @__min_varying_double, @__min_uniform_double)
 }
 define internal double @__reduce_max_double(<16 x double>) nounwind readnone alwaysinline {
  reduce16(double, @__max_varying_double, @__max_uniform_double)
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal int64 ops
 ; Per-lane sum of two 16-wide i64 vectors; passed to the add reduction below.
 define internal <16 x i64> @__add_varying_int64(<16 x i64>,
                                                 <16 x i64>) nounwind readnone alwaysinline {
  %lanesum = add <16 x i64> %0, %1
  ret <16 x i64> %lanesum
 }
 ; Sum of two i64 scalars; passed to the add reduction below.
 define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
  %total = add i64 %0, %1
  ret i64 %total
 }
 ; Sum, minimum and maximum over the 16 i64 lanes of the argument.  Each body
 ; hands the m4 helper the element type plus the matching varying and uniform
 ; int64 operation.
 ; NOTE(review): the helper is defined in another file; since the bodies
 ; contain no ret of their own, it presumably emits the whole reduction
 ; including the return -- confirm.
 define internal i64 @__reduce_add_int64(<16 x i64>) nounwind readnone alwaysinline {
  reduce16(i64, @__add_varying_int64, @__add_uniform_int64)
 }
 define internal i64 @__reduce_min_int64(<16 x i64>) nounwind readnone alwaysinline {
  reduce16(i64, @__min_varying_int64, @__min_uniform_int64)
 }
 define internal i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
  reduce16(i64, @__max_varying_int64, @__max_uniform_int64)
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;; horizontal uint64 ops
 ; Add reduction for unsigned 64-bit lanes; it simply forwards to the signed
 ; int64 add reduction.
 define internal i64 @__reduce_add_uint64(<16 x i64> %v) nounwind readnone alwaysinline {
  %total = call i64 @__reduce_add_int64(<16 x i64> %v)
  ret i64 %total
 }
 ; Minimum and maximum over the 16 unsigned 64-bit lanes of the argument,
 ; built from the varying and uniform uint64 operations named in each body.
 ; NOTE(review): the m4 helper in each body is defined in another file; since
 ; the bodies contain no ret of their own, it presumably emits the whole
 ; reduction including the return -- confirm.
 define internal i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone alwaysinline {
  reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
 }
 define internal i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
  reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts
 ; Load-and-broadcast definitions for every element size come from an m4
 ; helper instantiated at width 16.
 ; NOTE(review): the expansions are defined in another file.
 load_and_broadcast(16, i8, 8)
 load_and_broadcast(16, i16, 16)
 load_and_broadcast(16, i32, 32)
 load_and_broadcast(16, i64, 64)
 ; no masked load instruction for i8 and i16 types??
 ; so the 8-bit and 16-bit masked loads also come from an m4 helper, while
 ; the 32-bit and 64-bit ones are written out by hand below
 load_masked(16, i8,  8,  1)
 load_masked(16, i16, 16, 2)
 declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
 declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
 ; Masked load of 16 32-bit values starting at the given address, performed
 ; as two 8-wide maskload.ps.256 calls whose results are concatenated and
 ; returned as i32 lanes.
 define <16 x i32> @__load_masked_32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
  %fmask = bitcast <16 x i32> %mask to <16 x float>
  ; mask lanes 0-7 and 8-15
  %fmask_lo = shufflevector <16 x float> %fmask, <16 x float> undef,
     <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %fmask_hi = shufflevector <16 x float> %fmask, <16 x float> undef,
     <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; the second group of 8 values starts 8x4 = 32 bytes past the base address
  %addr_hi = getelementptr i8 * %0, i32 32
  %data_lo = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %fmask_lo)
  %data_hi = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %addr_hi, <8 x float> %fmask_hi)
  ; concatenate both halves and reinterpret as integers
  %joined = shufflevector <8 x float> %data_lo, <8 x float> %data_hi,
     <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                 i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %asint = bitcast <16 x float> %joined to <16 x i32>
  ret <16 x i32> %asint
 }
 ; Masked load of 16 64-bit values starting at the given address, performed
 ; as four 4-wide maskload.pd.256 calls whose results are concatenated and
 ; returned as i64 lanes.
 define <16 x i64> @__load_masked_64(i8 *, <16 x i32> %mask) nounwind alwaysinline {
  ; every 32-bit mask element is duplicated so that, after the bitcast to
  ; doubles, it covers one whole 64-bit lane
  %pair0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
  %pair1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
     <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
  %pair2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
     <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
  %pair3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
     <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
  %dmask0 = bitcast <8 x i32> %pair0 to <4 x double>
  %dmask1 = bitcast <8 x i32> %pair1 to <4 x double>
  %dmask2 = bitcast <8 x i32> %pair2 to <4 x double>
  %dmask3 = bitcast <8 x i32> %pair3 to <4 x double>
  ; each group of 4 doubles occupies 32 bytes, so the groups start at byte
  ; offsets 0, 32, 64 and 96
  %addr1 = getelementptr i8 * %0, i32 32
  %addr2 = getelementptr i8 * %0, i32 64
  %addr3 = getelementptr i8 * %0, i32 96
  %quad0 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %dmask0)
  %quad1 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %addr1, <4 x double> %dmask1)
  %quad2 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %addr2, <4 x double> %dmask2)
  %quad3 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %addr3, <4 x double> %dmask3)
  ; concatenate 4+4 -> 8, 4+4 -> 8, then 8+8 -> 16
  %lower8 = shufflevector <4 x double> %quad0, <4 x double> %quad1,
      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %upper8 = shufflevector <4 x double> %quad2, <4 x double> %quad3,
      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %all16 = shufflevector <8 x double> %lower8, <8 x double> %upper8,
      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %asint = bitcast <16 x double> %all16 to <16 x i64>
  ret <16 x i64> %asint
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store
 ; FIXME: there is no AVX instruction for these, but we could be clever
 ; by packing the bits down and setting the last 3/4 or half, respectively,
 ; of the mask to zero...  Not sure if this would be a win in the end
 gen_masked_store(16, i8, 8)
 gen_masked_store(16, i16, 16)
 ; note that mask is the 2nd parameter, not the 3rd one!!
 declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
 declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
 define void @__masked_store_32(<16 x i32>* nocapture, <16 x i32>,
                                <16 x i32>) nounwind alwaysinline {
  ; Masked store of sixteen 32-bit values %1 to %0 under mask %2, issued as
  ; two 8-wide AVX masked float stores covering bytes 0-31 and 32-63.
  ; The maskstore intrinsic takes the mask before the value.
  %dst = bitcast <16 x i32> * %0 to i8 *
  %data = bitcast <16 x i32> %1 to <16 x float>
  %lanes = bitcast <16 x i32> %2 to <16 x float>
  ; low half: elements 0..7
  %data_lo = shufflevector <16 x float> %data, <16 x float> undef,
        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %lanes_lo = shufflevector <16 x float> %lanes, <16 x float> undef,
        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; high half: elements 8..15
  %data_hi = shufflevector <16 x float> %data, <16 x float> undef,
        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %lanes_hi = shufflevector <16 x float> %lanes, <16 x float> undef,
        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  call void @llvm.x86.avx.maskstore.ps.256(i8 * %dst, <8 x float> %lanes_lo, <8 x float> %data_lo)
  %dst_hi = getelementptr i8 * %dst, i32 32
  call void @llvm.x86.avx.maskstore.ps.256(i8 * %dst_hi, <8 x float> %lanes_hi, <8 x float> %data_hi)
  ret void
 }
 define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>,
                                <16 x i32> %mask) nounwind alwaysinline {
  ; Masked store of sixteen 64-bit values %1 to %0, issued as four 4-wide
  ; double-precision AVX masked stores of 32 bytes each.  Every 32-bit mask
  ; lane is duplicated so that it spans one 64-bit element.
  %dst = bitcast <16 x i64> * %0 to i8 *
  %data = bitcast <16 x i64> %1 to <16 x double>
  ; -- elements 0..3
  %lanes0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
  %lanes0pd = bitcast <8 x i32> %lanes0 to <4 x double>
  %data0 = shufflevector <16 x double> %data, <16 x double> undef,
     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; -- elements 4..7
  %lanes1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
     <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
  %lanes1pd = bitcast <8 x i32> %lanes1 to <4 x double>
  %data1 = shufflevector <16 x double> %data, <16 x double> undef,
     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ; -- elements 8..11
  %lanes2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
     <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
  %lanes2pd = bitcast <8 x i32> %lanes2 to <4 x double>
  %data2 = shufflevector <16 x double> %data, <16 x double> undef,
     <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  ; -- elements 12..15
  %lanes3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
     <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
  %lanes3pd = bitcast <8 x i32> %lanes3 to <4 x double>
  %data3 = shufflevector <16 x double> %data, <16 x double> undef,
     <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  ; the stores themselves, at byte offsets 0, 32, 64 and 96 (mask before value)
  call void @llvm.x86.avx.maskstore.pd.256(i8 * %dst, <4 x double> %lanes0pd, <4 x double> %data0)
  %dst1 = getelementptr i8 * %dst, i32 32
  call void @llvm.x86.avx.maskstore.pd.256(i8 * %dst1, <4 x double> %lanes1pd, <4 x double> %data1)
  %dst2 = getelementptr i8 * %dst, i32 64
  call void @llvm.x86.avx.maskstore.pd.256(i8 * %dst2, <4 x double> %lanes2pd, <4 x double> %data2)
  %dst3 = getelementptr i8 * %dst, i32 96
  call void @llvm.x86.avx.maskstore.pd.256(i8 * %dst3, <4 x double> %lanes3pd, <4 x double> %data3)
  ret void
 }
 masked_store_blend_8_16_by_16()
 declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
                                                <8 x float>) nounwind readnone
 define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>,
                                      <16 x i32>) nounwind alwaysinline {
  ; Blend-based masked store of sixteen 32-bit values: read the current
  ; contents of %0, blend them with the new values %1 under mask %2, and
  ; write the whole vector back.  The blend runs on two 8-wide float halves
  ; because the AVX blendv.ps.256 intrinsic is 8 wide.
  %prev = load <16 x i32>* %0, align 4
  %prev_f = bitcast <16 x i32> %prev to <16 x float>
  %next_f = bitcast <16 x i32> %1 to <16 x float>
  %sel_f = bitcast <16 x i32> %2 to <16 x float>
  ; low half: lanes 0..7
  %prev_lo = shufflevector <16 x float> %prev_f, <16 x float> undef,
        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %next_lo = shufflevector <16 x float> %next_f, <16 x float> undef,
        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %sel_lo = shufflevector <16 x float> %sel_f, <16 x float> undef,
        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %mix_lo = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %prev_lo,
                                                         <8 x float> %next_lo,
                                                         <8 x float> %sel_lo)
  ; high half: lanes 8..15
  %prev_hi = shufflevector <16 x float> %prev_f, <16 x float> undef,
        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %next_hi = shufflevector <16 x float> %next_f, <16 x float> undef,
        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %sel_hi = shufflevector <16 x float> %sel_f, <16 x float> undef,
        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %mix_hi = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %prev_hi,
                                                         <8 x float> %next_hi,
                                                         <8 x float> %sel_hi)
  ; stitch the halves back together and store
  %mix = shufflevector <8 x float> %mix_lo, <8 x float> %mix_hi,
    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %mix_i32 = bitcast <16 x float> %mix to <16 x i32>
  store <16 x i32> %mix_i32, <16 x i32>* %0, align 4
  ret void
 }
 declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
                                                 <4 x double>) nounwind readnone
 define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64, 
                                      <16 x i32> %mask) nounwind alwaysinline {
  ; Blend-based masked store of sixteen 64-bit values: read the current
  ; contents of %ptr, blend them with %newi64 under %mask, and write the
  ; whole vector back.  The blend runs on four 4-wide double quarters
  ; because the AVX blendv.pd.256 intrinsic is 4 wide.
  %oldValue = load <16 x i64>* %ptr, align 8
  %old = bitcast <16 x i64> %oldValue to <16 x double>
  ; split the old values into quarters
  %old0d = shufflevector <16 x double> %old, <16 x double> undef,
     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %old1d = shufflevector <16 x double> %old, <16 x double> undef,
     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %old2d = shufflevector <16 x double> %old, <16 x double> undef,
     <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %old3d = shufflevector <16 x double> %old, <16 x double> undef,
     <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  ; split the new values into quarters
  %new = bitcast <16 x i64> %newi64 to <16 x double>
  %new0d = shufflevector <16 x double> %new, <16 x double> undef,
     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %new1d = shufflevector <16 x double> %new, <16 x double> undef,
     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %new2d = shufflevector <16 x double> %new, <16 x double> undef,
     <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %new3d = shufflevector <16 x double> %new, <16 x double> undef,
     <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  ; duplicate every 32-bit mask lane so it spans one 64-bit element
  %mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
  %mask1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
     <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
  %mask2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
     <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
  %mask3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
     <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
  %mask1d = bitcast <8 x i32> %mask1 to <4 x double>
  %mask2d = bitcast <8 x i32> %mask2 to <4 x double>
  %mask3d = bitcast <8 x i32> %mask3 to <4 x double>
  ; per-quarter blend of old and new under the widened mask
  %result0d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old0d,
                                 <4 x double> %new0d, <4 x double> %mask0d)
  %result1d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old1d,
                                 <4 x double> %new1d, <4 x double> %mask1d)
  %result2d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old2d,
                                 <4 x double> %new2d, <4 x double> %mask2d)
  %result3d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old3d,
                                 <4 x double> %new3d, <4 x double> %mask3d)
  ; concatenate 4+4 -> 8, 4+4 -> 8, then 8+8 -> 16
  %result01 = shufflevector <4 x double> %result0d, <4 x double> %result1d,
           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %result23 = shufflevector <4 x double> %result2d, <4 x double> %result3d,
           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %result = shufflevector <8 x double> %result01, <8 x double> %result23,
           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %result64 = bitcast <16 x double> %result to <16 x i64>
  ; Use the same 8-byte alignment as the load above.  A store with no align
  ; is assumed to have the ABI alignment of the vector type, which is a
  ; stronger guarantee than the load makes about this pointer.
  store <16 x i64> %result64, <16 x i64> * %ptr, align 8
  ret void
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter
 gen_gather(16, i8)
 gen_gather(16, i16)
 gen_gather(16, i32)
 gen_gather(16, i64)
 gen_scatter(16, i8)
 gen_scatter(16, i16)
 gen_scatter(16, i32)
 gen_scatter(16, i64)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; double precision sqrt
 declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
 define internal <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alwaysinline {
  ; 16-wide double-precision square root built on the 4-wide AVX sqrt.pd.256
  ; intrinsic.  The widening macro invoked below is defined outside this
  ; view; presumably it applies the intrinsic to each 4-element quarter of
  ; %0 and binds the recombined result to %ret -- confirm in its definition.
  unary4to16(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
  ret <16 x double> %ret
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; double precision min/max
 declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
 declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
 define internal <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
  ; 16-wide double-precision minimum built on the 4-wide AVX min.pd.256
  ; intrinsic.  The widening macro invoked below is defined outside this
  ; view; presumably it applies the intrinsic to matching quarters of %0 and
  ; %1 and binds the recombined result to %ret -- confirm in its definition.
  binary4to16(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
  ret <16 x double> %ret
 }
 define internal <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
  ; 16-wide double-precision maximum built on the 4-wide AVX max.pd.256
  ; intrinsic.  The widening macro invoked below is defined outside this
  ; view; presumably it applies the intrinsic to matching quarters of %0 and
  ; %1 and binds the recombined result to %ret -- confirm in its definition.
  binary4to16(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
  ret <16 x double> %ret
 }
--- a/builtins-avx.ll
+++ b/builtins-avx.ll
@@ -44,11 +44,12 @@ packed_load_and_store(8)
 scans(8)
 int64minmax(8)
 include(`builtins-avx-common.ll')
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
 declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
 declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
 define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ;  float iv = __rcp_v(v);
@@ -63,25 +64,10 @@ define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly
  ret <8 x float> %iv_mul
 }
 define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
  ; Scalar reciprocal: take the rcpss estimate r of 1/x from lane 0, then
  ; refine it with one Newton-Raphson step, r * (2 - x * r).
  %x_vec = insertelement <4 x float> undef, float %0, i32 0
  %est_vec = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %x_vec)
  %est = extractelement <4 x float> %est_vec, i32 0
  ; Newton-Raphson refinement
  %x_est = fmul float %0, %est
  %correction = fsub float 2., %x_est
  %refined = fmul float %est, %correction
  ret float %refined
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding floats
 declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
 declare <4 x float> @llvm.x86.sse.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
 define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
@@ -89,111 +75,43 @@ define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonl
  ret <8 x float> %call
 }
 define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
  ; Scalar round-to-nearest via roundss.  Immediate 8 = rounding mode
  ; nearest 0b00 with the do-not-signal-precision-exceptions bit 0b1000.
  ;
  ; The roundss intrinsic takes two vector operands; per the docs for
  ;   __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
  ; the result is
  ;   r0 = RND(b0)
  ;   r1 = a1
  ;   r2 = a2
  ;   r3 = a3
  ; Only r0 is used here, so the contents of a are irrelevant and the same
  ; register is passed for both operands.
  %src = insertelement <4 x float> undef, float %0, i32 0
  %rounded_vec = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %src, <4 x float> %src, i32 8)
  %rounded = extractelement <4 x float> %rounded_vec, i32 0
  ret float %rounded
 }
 define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
-  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 9)
  ret <8 x float> %call
 }
 define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
  ; Scalar floor via roundss; the same value is passed for both vector
  ; operands because only lane 0 of the result is used.
  ; Immediate 9 = round down 0b01 with the do-not-signal-precision-exceptions
  ; bit 0b1000, i.e. 0b1001.
  %src = insertelement <4 x float> undef, float %0, i32 0
  %floored_vec = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %src, <4 x float> %src, i32 9)
  %floored = extractelement <4 x float> %floored_vec, i32 0
  ret float %floored
 }
 define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 10)
  ret <8 x float> %call
 }
 define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
  ; Scalar ceil via roundss; the same value is passed for both vector
  ; operands because only lane 0 of the result is used.
  ; Immediate 10 = round up 0b10 with the do-not-signal-precision-exceptions
  ; bit 0b1000, i.e. 0b1010.
  %src = insertelement <4 x float> undef, float %0, i32 0
  %ceiled_vec = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %src, <4 x float> %src, i32 10)
  %ceiled = extractelement <4 x float> %ceiled_vec, i32 0
  ret float %ceiled
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding doubles
 declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
 declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
 define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
  ; 8-wide double rounding with immediate 8: round to nearest 0b00 with the
  ; do-not-signal-precision-exceptions bit 0b1000.
  ; There is no explicit ret here; presumably the macro invoked below emits
  ; the 4-wide round calls and the return -- confirm in its definition.
  round4to8double(%0, 8)
 }
 define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
  ; Scalar double round-to-nearest via roundsd with immediate 8; the same
  ; value is passed for both vector operands and only lane 0 is extracted.
  %src = insertelement <2 x double> undef, double %0, i32 0
  %rounded_vec = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %src, <2 x double> %src, i32 8)
  %rounded = extractelement <2 x double> %rounded_vec, i32 0
  ret double %rounded
 }
 define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
  ; 8-wide double floor.
  ; roundpd, round down 0b01 | do not signal precision exceptions 0b1000 -> 0b1001 = 9
  ; There is no explicit ret here; presumably the macro invoked below emits
  ; the 4-wide round calls and the return -- confirm in its definition.
  round4to8double(%0, 9)
 }
 define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
  ; Scalar double floor via roundsd; the same value is passed for both
  ; vector operands because only lane 0 of the result is used.
  ; Immediate 9 = round down 0b01 with the do-not-signal-precision-exceptions
  ; bit 0b1000, i.e. 0b1001.
  %src = insertelement <2 x double> undef, double %0, i32 0
  %floored_vec = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %src, <2 x double> %src, i32 9)
  %floored = extractelement <2 x double> %floored_vec, i32 0
  ret double %floored
 }
 define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
  ; 8-wide double ceil.
  ; roundpd, round up 0b10 | do not signal precision exceptions 0b1000 -> 0b1010 = 10
  ; There is no explicit ret here; presumably the macro invoked below emits
  ; the 4-wide round calls and the return -- confirm in its definition.
  round4to8double(%0, 10)
 }
 define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
  ; Scalar double ceil via roundsd; the same value is passed for both
  ; vector operands because only lane 0 of the result is used.
  ; Immediate 10 = round up 0b10 with the do-not-signal-precision-exceptions
  ; bit 0b1000, i.e. 0b1010.
  %src = insertelement <2 x double> undef, double %0, i32 0
  %ceiled_vec = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %src, <2 x double> %src, i32 10)
  %ceiled = extractelement <2 x double> %ceiled_vec, i32 0
  ret double %ceiled
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rsqrt
 declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
 declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
 define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
  ;  float is = __rsqrt_v(v);
@@ -201,64 +119,24 @@ define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind read
  ;  return 0.5 * is * (3. - (v * is) * is);
  %v_is = fmul <8 x float> %v, %is
  %v_is_is = fmul <8 x float> %v_is, %is
-  %three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3., float 3., float 3., float 3., float 3.>, %v_is_is
+  %three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3.,
                                 float 3., float 3., float 3., float 3.>, %v_is_is
  %is_mul = fmul <8 x float> %is, %three_sub
-  %half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
+  %half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
                                  float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
  ret <8 x float> %half_scale
 }
 define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
  ; Scalar reciprocal square root: take the rsqrtss estimate from lane 0,
  ; then refine it with one Newton-Raphson step.
  %x_vec = insertelement <4 x float> undef, float %0, i32 0
  %est_vec = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %x_vec)
  %est = extractelement <4 x float> %est_vec, i32 0
  ; refined = 0.5 * est * (3. - (x * est) * est)
  %x_est = fmul float %0, %est
  %x_est_est = fmul float %x_est, %est
  %three_minus = fsub float 3., %x_est_est
  %est_scaled = fmul float %est, %three_minus
  %refined = fmul float 0.5, %est_scaled
  ret float %refined
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; sqrt
 declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
 declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
 define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ; Lane-wise square root of an 8-wide float vector via the AVX sqrt.ps.256 intrinsic.
  %roots = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %0)
  ret <8 x float> %roots
 }
 define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
  ; Scalar float square root through the 4-wide SSE sqrt.ss intrinsic.
  ; The scalar-wrapper macro invoked below is defined outside this view;
  ; presumably it places %0 in lane 0 of a vector, calls the intrinsic and
  ; extracts lane 0 into %ret -- confirm in its definition.
  sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
  ret float %ret
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; fastmath
 declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
 declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
 define internal void @__fastmath() nounwind alwaysinline {
  ; Read the MXCSR register through a stack slot, set the DAZ and FTZ
  ; bits, and write the register back.
  %slot = alloca i32
  %slot_i8 = bitcast i32 * %slot to i8 *
  call void @llvm.x86.sse.stmxcsr(i8 * %slot_i8)
  %mxcsr = load i32 *%slot
  ; DAZ is 64 and FTZ is 32768; together 32832
  %mxcsr_fast = or i32 %mxcsr, 32832
  store i32 %mxcsr_fast, i32 *%slot
  call void @llvm.x86.sse.ldmxcsr(i8 * %slot_i8)
  ret void
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; svml
@@ -280,9 +158,7 @@ declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
 ;; float min/max
 declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
 declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
 declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
 declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
 define internal <8 x float> @__max_varying_float(<8 x float>,
                                                 <8 x float>) nounwind readonly alwaysinline {
@@ -290,94 +166,43 @@ define internal <8 x float> @__max_varying_float(<8 x float>,
  ret <8 x float> %call
 }
 define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
  ; Scalar float maximum through the 4-wide SSE max.ss intrinsic.
  ; The scalar-wrapper macro invoked below is defined outside this view;
  ; presumably it places %0 and %1 in lane 0 of vectors, calls the intrinsic
  ; and extracts lane 0 into %ret -- confirm in its definition.
  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
  ret float %ret
 }
 define internal <8 x float> @__min_varying_float(<8 x float>,
                                                  <8 x float>) nounwind readonly alwaysinline {
  ; Lane-wise minimum of two 8-wide float vectors via the AVX min.ps.256 intrinsic.
  %lanewise_min = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %0, <8 x float> %1)
  ret <8 x float> %lanewise_min
 }
 define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
  ; Scalar float minimum through the 4-wide SSE min.ss intrinsic.
  ; The scalar-wrapper macro invoked below is defined outside this view;
  ; presumably it places %0 and %1 in lane 0 of vectors, calls the intrinsic
  ; and extracts lane 0 into %ret -- confirm in its definition.
  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
  ret float %ret
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int min/max
 ; no 8-wide integer stuff in avx1... 
 declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
 declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
 define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  ; 8-wide signed 32-bit minimum built on the 4-wide SSE4.1 pminsd intrinsic.
  ; The widening macro invoked below is defined outside this view;
  ; presumably it applies the intrinsic to each 4-element half of %0 and %1
  ; and binds the recombined result to %ret -- confirm in its definition.
  binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
  ret <8 x i32> %ret
 }
 define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  ; Scalar signed 32-bit minimum through the 4-wide SSE4.1 pminsd intrinsic.
  ; The scalar-wrapper macro invoked below is defined outside this view;
  ; presumably it places %0 and %1 in lane 0 of vectors, calls the intrinsic
  ; and extracts lane 0 into %ret -- confirm in its definition.
  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
  ret i32 %ret
 }
 define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  ; 8-wide signed 32-bit maximum built on the 4-wide SSE4.1 pmaxsd intrinsic.
  ; The widening macro invoked below is defined outside this view;
  ; presumably it applies the intrinsic to each 4-element half of %0 and %1
  ; and binds the recombined result to %ret -- confirm in its definition.
  binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
  ret <8 x i32> %ret
 }
 define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  ; Scalar signed 32-bit maximum through the 4-wide SSE4.1 pmaxsd intrinsic.
  ; The scalar-wrapper macro invoked below is defined outside this view;
  ; presumably it places %0 and %1 in lane 0 of vectors, calls the intrinsic
  ; and extracts lane 0 into %ret -- confirm in its definition.
  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
  ret i32 %ret
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unsigned int min/max
 declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
 declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
 define internal <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  ; 8-wide unsigned 32-bit minimum built on the 4-wide SSE4.1 pminud intrinsic.
  ; The widening macro invoked below is defined outside this view;
  ; presumably it applies the intrinsic to each 4-element half of %0 and %1
  ; and binds the recombined result to %ret -- confirm in its definition.
  binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
  ret <8 x i32> %ret
 }
 define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  ; Scalar unsigned 32-bit minimum through the 4-wide SSE4.1 pminud intrinsic.
  ; The scalar-wrapper macro invoked below is defined outside this view;
  ; presumably it places %0 and %1 in lane 0 of vectors, calls the intrinsic
  ; and extracts lane 0 into %ret -- confirm in its definition.
  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
  ret i32 %ret
 }
 define internal <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  ; 8-wide unsigned 32-bit maximum built on the 4-wide SSE4.1 pmaxud intrinsic.
  ; The widening macro invoked below is defined outside this view;
  ; presumably it applies the intrinsic to each 4-element half of %0 and %1
  ; and binds the recombined result to %ret -- confirm in its definition.
  binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
  ret <8 x i32> %ret
 }
 define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  ; Scalar unsigned 32-bit maximum through the 4-wide SSE4.1 pmaxud intrinsic.
  ; The scalar-wrapper macro invoked below is defined outside this view;
  ; presumably it places %0 and %1 in lane 0 of vectors, calls the intrinsic
  ; and extracts lane 0 into %ret -- confirm in its definition.
  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
  ret i32 %ret
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops
 declare i32 @llvm.ctpop.i32(i32) nounwind readnone
 define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
  ; Number of set bits in the 32-bit value %0, via the generic ctpop intrinsic.
  %bits_set = call i32 @llvm.ctpop.i32(i32 %0)
  ret i32 %bits_set
 }
 declare i64 @llvm.ctpop.i64(i64) nounwind readnone
 define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
  ; Number of set bits in the 64-bit value %0, via the generic ctpop intrinsic.
  %bits_set = call i64 @llvm.ctpop.i64(i64 %0)
  ret i64 %bits_set
 }
 declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
 define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
@@ -471,9 +296,10 @@ define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwa
                      <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
  %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
-  %scalar1 = extractelement <4 x double> %sum0, i32 0
+  %final0 = extractelement <4 x double> %sum1, i32 0
-  %scalar2 = extractelement <4 x double> %sum1, i32 1
+  %final1 = extractelement <4 x double> %sum1, i32 2
-  %sum = fadd double %scalar1, %scalar2
+  %sum = fadd double %final0, %final1
  ret double %sum
 }
@@ -623,12 +449,13 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
  ret void
 }
 masked_store_blend_8_16_by_8()
 declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
                                                <8 x float>) nounwind readnone
 define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>, 
                                     <8 x i32>) nounwind alwaysinline {
  %mask_as_float = bitcast <8 x i32> %2 to <8 x float>
@@ -694,6 +521,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
  ret void
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter
@@ -711,43 +539,26 @@ gen_scatter(8, i64)
 ;; double precision sqrt
 declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
 declare <2 x double> @llvm.x86.sse.sqrt.sd(<2 x double>) nounwind readnone
 define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
  ; 8-wide double-precision square root built on the 4-wide AVX sqrt.pd.256
  ; intrinsic.  The widening macro invoked below is defined outside this
  ; view; presumably it applies the intrinsic to each 4-element half of %0
  ; and binds the recombined result to %ret -- confirm in its definition.
  unary4to8(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
  ret <8 x double> %ret
 }
 define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
  ; Scalar double square root through the 2-wide SSE sqrt.sd intrinsic.
  ; The scalar-wrapper macro invoked below is defined outside this view;
  ; presumably it places %0 in lane 0 of a vector, calls the intrinsic and
  ; extracts lane 0 into %ret -- confirm in its definition.
  sse_unary_scalar(ret, 2, double, @llvm.x86.sse.sqrt.sd, %0)
  ret double %ret
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; double precision min/max
 declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
 declare <2 x double> @llvm.x86.sse.max.sd(<2 x double>, <2 x double>) nounwind readnone
 declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
 declare <2 x double> @llvm.x86.sse.min.sd(<2 x double>, <2 x double>) nounwind readnone
 define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
  ; 8-wide double-precision minimum built on the 4-wide AVX min.pd.256
  ; intrinsic.  The widening macro invoked below is defined outside this
  ; view; presumably it applies the intrinsic to matching halves of %0 and
  ; %1 and binds the recombined result to %ret -- confirm in its definition.
  binary4to8(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
  ret <8 x double> %ret
 }
 define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
  ; Scalar double minimum through the 2-wide SSE min.sd intrinsic.
  ; The scalar-wrapper macro invoked below is defined outside this view;
  ; presumably it places %0 and %1 in lane 0 of vectors, calls the intrinsic
  ; and extracts lane 0 into %ret -- confirm in its definition.
  sse_binary_scalar(ret, 2, double, @llvm.x86.sse.min.sd, %0, %1)
  ret double %ret
 }
 define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
  ; 8-wide double-precision maximum built on the 4-wide AVX max.pd.256
  ; intrinsic.  The widening macro invoked below is defined outside this
  ; view; presumably it applies the intrinsic to matching halves of %0 and
  ; %1 and binds the recombined result to %ret -- confirm in its definition.
  binary4to8(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
  ret <8 x double> %ret
 }
 define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
  ; Scalar double maximum through the 2-wide SSE max.sd intrinsic.
  ; The scalar-wrapper macro invoked below is defined outside this view;
  ; presumably it places %0 and %1 in lane 0 of vectors, calls the intrinsic
  ; and extracts lane 0 into %ret -- confirm in its definition.
  sse_binary_scalar(ret, 2, double, @llvm.x86.sse.max.sd, %0, %1)
  ret double %ret
 }
--- a/builtins-c.c
+++ b/builtins-c.c
@@ -51,6 +51,10 @@
  */
 #ifndef _MSC_VER
 #include <unistd.h>
 #endif // !_MSC_VER
 #include <stdint.h>
 #include <stdio.h>
 #include <stdarg.h>
@@ -139,3 +143,28 @@ void __do_print(const char *format, const char *types, int width, int mask,
    }
    fflush(stdout);
 }
 /* Returns the number of processors reported by the operating system.
  *
  * On Windows this is the dwNumberOfProcessors field filled in by
  * GetSystemInfo().  Elsewhere it is sysconf(_SC_NPROCESSORS_ONLN); if that
  * call fails (it returns -1) or reports a non-positive count, 1 is returned
  * so that callers never see a zero or negative core count.
  */
 int __num_cores() {
 #ifdef _MSC_VER
     // This is quite a hack.  Including all of windows.h to get this definition
     // pulls in a bunch of stuff that leads to undefined symbols at link time.
     // So we don't #include <windows.h> but instead have the equivalent declarations
     // here.  Presumably this struct declaration won't be changing in the future
     // anyway...
     struct SYSTEM_INFO {
         int pad0[2];
         void *pad1[2];
         int *pad2;
         int dwNumberOfProcessors;
         int pad3[3];
     };
     struct SYSTEM_INFO sysInfo;
     extern void __stdcall GetSystemInfo(struct SYSTEM_INFO *);
     GetSystemInfo(&sysInfo);
     return sysInfo.dwNumberOfProcessors;
 #else
     // sysconf() returns a long and signals failure with -1; never hand a
     // non-positive processor count back to the caller.
     long count = sysconf(_SC_NPROCESSORS_ONLN);
     if (count < 1)
         return 1;
     return (int)count;
 #endif // !_MSC_VER
 }
--- a/builtins-sse2.ll
+++ b/builtins-sse2.ll
@@ -277,41 +277,17 @@ define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinli
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions
-; FIXME: this is very inefficient, loops over all 32 bits...
+declare i32 @llvm.ctpop.i32(i32)
-
+declare i64 @llvm.ctpop.i64(i64)
 ; we could use the LLVM intrinsic declare i32 @llvm.ctpop.i32(i32),
 ; although that currently ends up generating a POPCNT instruction even
 ; if we give --target=sse2 on the command line.  We probably need to
 ; pipe through the 'sse2' request to LLVM via the 'features' string
 ; at codegen time...  (If e.g. --cpu=penryn is also passed along, then
 ; it does generate non-POPCNT code and in particular better code than
 ; the below does.)
 define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
-entry:
+  %val = call i32 @llvm.ctpop.i32(i32 %0)
-  br label %loop
+  ret i32 %val
 loop:
  %count = phi i32 [ 0, %entry ], [ %newcount, %loop ]
  %val = phi i32 [ %0, %entry ], [ %newval, %loop ]
  %delta = and i32 %val, 1
  %newcount = add i32 %count, %delta
  %newval = lshr i32 %val, 1
  %done = icmp eq i32 %newval, 0
  br i1 %done, label %exit, label %loop
 exit:
  ret i32 %newcount
 }
-define internal i32 @__popcnt_int64(i64) nounwind readnone alwaysinline {
+define internal i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
-  %vec = bitcast i64 %0 to <2 x i32>
+  %val = call i64 @llvm.ctpop.i64(i64 %0)
-  %v0 = extractelement <2 x i32> %vec, i32 0
+  ret i64 %val
  %v1 = extractelement <2 x i32> %vec, i32 1
  %c0 = call i32 @__popcnt_int32(i32 %v0)
  %c1 = call i32 @__popcnt_int32(i32 %v1)
  %sum = add i32 %c0, %c1
  ret i32 %sum
 }
--- a/builtins-sse4.ll
+++ b/builtins-sse4.ll
@@ -77,7 +77,7 @@ define internal float @__round_uniform_float(float) nounwind readonly alwaysinli
 }
 define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
-  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
  ret <4 x float> %call
 }
@@ -85,14 +85,14 @@ define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonl
 define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
  ; see above for round_ss intrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
-  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
 }
 define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
  ret <4 x float> %call
 }
@@ -100,7 +100,7 @@ define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly
 define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
  ; see above for round_ss intrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
@@ -124,28 +124,28 @@ define internal double @__round_uniform_double(double) nounwind readonly alwaysi
 }
 define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
-  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  round2to4double(%0, 9)
 }
 define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
  ; see above for round_ss intrinsic discussion...
  %xi = insertelement <2 x double> undef, double %0, i32 0
-  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
  %rs = extractelement <2 x double> %xr, i32 0
  ret double %rs
 }
 define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
-  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  round2to4double(%0, 10)
 }
 define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
  ; see above for round_ss intrinsic discussion...
  %xi = insertelement <2 x double> undef, double %0, i32 0
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
  %rs = extractelement <2 x double> %xr, i32 0
  ret double %rs
--- a/builtins-sse4x2.ll
+++ b/builtins-sse4x2.ll
@@ -498,28 +498,28 @@ define internal float @__round_uniform_float(float) nounwind readonly alwaysinli
 }
 define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
-  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  round4to8(%0, 9)
 }
 define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
  ; see above for round_ss intrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
-  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
 }
 define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  round4to8(%0, 10)
 }
 define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
  ; see above for round_ss intrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
@@ -543,28 +543,28 @@ define internal double @__round_uniform_double(double) nounwind readonly alwaysi
 }
 define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
-  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  round2to8double(%0, 9)
 }
 define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
  ; see above for round_ss intrinsic discussion...
  %xi = insertelement <2 x double> undef, double %0, i32 0
-  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
  %rs = extractelement <2 x double> %xr, i32 0
  ret double %rs
 }
 define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
-  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  round2to8double(%0, 10)
 }
 define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
  ; see above for round_ss intrinsic discussion...
  %xi = insertelement <2 x double> undef, double %0, i32 0
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
  %rs = extractelement <2 x double> %xr, i32 0
  ret double %rs
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -55,7 +55,7 @@
 #include <llvm/Intrinsics.h>
 #include <llvm/Linker.h>
 #include <llvm/Target/TargetMachine.h>
-#include <llvm/Target/SubtargetFeature.h>
+#include <llvm/ADT/Triple.h>
 #include <llvm/Support/MemoryBuffer.h>
 #include <llvm/Bitcode/ReaderWriter.h>
@@ -389,6 +389,27 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
 }
 /** Fills in the body of an already-declared LLVM function so that it
     simply returns the given integer constant, and records a matching
     ispc symbol for it.

     @param name         Name of the function; it is looked up in the module
                         with getFunction() and must already be declared
                         there (asserted below).
     @param val          Value returned by the generated function, emitted
                         via LLVMInt32().
     @param module       LLVM module that holds the function declaration.
     @param symbolTable  Symbol table that receives the new symbol.
  */
 static void
 lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
                       SymbolTable *symbolTable) {
    // The ispc-side type: no parameters, uniform int32 return value.
    std::vector<const Type *> args;
    FunctionType *ft = new FunctionType(AtomicType::UniformInt32, args, SourcePos());
    Symbol *sym = new Symbol(name, SourcePos(), ft);
    sym->isStatic = true;
    llvm::Function *func = module->getFunction(name);
    assert(func != NULL); // it should be declared already...
    func->addFnAttr(llvm::Attribute::AlwaysInline);
    // The whole function body is a single "entry" block holding one
    // return instruction with the constant.
    llvm::BasicBlock *bblock = llvm::BasicBlock::Create(*g->ctx, "entry", func, 0);
    llvm::ReturnInst::Create(*g->ctx, LLVMInt32(val), bblock);
    sym->function = func;
    // NOTE(review): the symbol has a function type but is registered with
    // AddVariable() -- confirm this is intended.
    symbolTable->AddVariable(sym);
 }
 static void
 lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
    Symbol *pidx = new Symbol("programIndex", SourcePos(), 
@@ -454,11 +475,23 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
        }
        break;
    case Target::AVX:
        switch (g->target.vectorWidth) {
        case 8:
            extern unsigned char builtins_bitcode_avx[];
            extern int builtins_bitcode_avx_length;
            lAddBitcode(builtins_bitcode_avx, builtins_bitcode_avx_length, module, 
                        symbolTable);
            break;
        case 16:
            extern unsigned char builtins_bitcode_avx_x2[];
            extern int builtins_bitcode_avx_x2_length;
            lAddBitcode(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
                        module,  symbolTable);
            break;
        default:
            FATAL("logic error in DefineStdlib");
        }
        break;
    default:
        FATAL("logic error");
    }
@@ -480,6 +513,8 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
                       symbolTable);
    lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module,
                       symbolTable);
    lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
                           symbolTable);
    if (includeStdlibISPC) {
        // If the user wants the standard library to be included, parse the
--- a/builtins.m4
+++ b/builtins.m4
@@ -111,6 +111,32 @@ define(`reduce8', `
 '
 )
 ;; Reduce a 16-wide vector to a single scalar value by repeatedly combining
 ;; the upper part of the remaining elements with the lower part.
 ;; $1: scalar type of the vector elements (and of the final result)
 ;; $2: 16-wide binary vector function used for the vector combining steps
 ;; $3: scalar binary function used to combine the last two elements
 ;; The vector operand is read from %0, and the expansion ends with a ret of
 ;; the scalar result.
 define(`reduce16', `
  ; move elements 8-15 into lanes 0-7 and combine them with the original vector
  %v1 = shufflevector <16 x $1> %0, <16 x $1> undef,
        <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
                    i32 undef, i32 undef, i32 undef, i32 undef,
                    i32 undef, i32 undef, i32 undef, i32 undef>
  %m1 = call <16 x $1> $2(<16 x $1> %v1, <16 x $1> %0)
  ; move elements 4-7 of the partial result into lanes 0-3 and combine again
  %v2 = shufflevector <16 x $1> %m1, <16 x $1> undef,
        <16 x i32> <i32 4, i32 5, i32 6, i32 7,
                    i32 undef, i32 undef, i32 undef, i32 undef,
                    i32 undef, i32 undef, i32 undef, i32 undef,
                    i32 undef, i32 undef, i32 undef, i32 undef>
  %m2 = call <16 x $1> $2(<16 x $1> %v2, <16 x $1> %m1)
  ; move elements 2-3 into lanes 0-1 and combine once more
  %v3 = shufflevector <16 x $1> %m2, <16 x $1> undef,
        <16 x i32> <i32 2, i32 3, i32 undef, i32 undef,
                    i32 undef, i32 undef, i32 undef, i32 undef,
                    i32 undef, i32 undef, i32 undef, i32 undef,
                    i32 undef, i32 undef, i32 undef, i32 undef>
  %m3 = call <16 x $1> $2(<16 x $1> %v3, <16 x $1> %m2)
  ; the last two partial results sit in lanes 0 and 1; finish with the scalar function
  %m3a = extractelement <16 x $1> %m3, i32 0
  %m3b = extractelement <16 x $1> %m3, i32 1
  %m = call $1 $3($1 %m3a, $1 %m3b)
  ret $1 %m
 '
 )
 ;; Do a reduction over an 8-wide vector, using a vector reduction function
 ;; that only takes 4-wide vectors
 ;; $1: type of final scalar result
@@ -211,6 +237,45 @@ define(`unary4to8', `
 '
 )
 ;; Maps a 4-wide unary function to a 16-wide vector operand: the operand is
 ;; split into four 4-wide pieces, the function is applied to each piece, and
 ;; the four results are concatenated back into a 16-wide vector.
 ;; $1: name of variable into which the final result should go
 ;; $2: scalar type of the vector elements
 ;; $3: 4-wide unary vector function to apply
 ;; $4: 16-wide operand value
 define(`unary4to16', `
  %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0)
  %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1)
  %$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %v$1_2 = call <4 x $2> $3(<4 x $2> %$1_2)
  %$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  %v$1_3 = call <4 x $2> $3(<4 x $2> %$1_3)
  ; join the four 4-wide results pairwise into two 8-wide vectors and then
  ; into the final 16-wide result
  %$1a = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, 
           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %$1b = shufflevector <4 x $2> %v$1_2, <4 x $2> %v$1_3, 
           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %$1 = shufflevector <8 x $2> %$1a, <8 x $2> %$1b,
           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 '
 )
 ;; Maps an 8-wide unary function to a 16-wide vector operand: the operand is
 ;; split into its lower and upper 8-wide halves, the function is applied to
 ;; each half, and the two results are concatenated into a 16-wide vector.
 ;; $1: name of variable into which the final result should go
 ;; $2: scalar type of the vector elements
 ;; $3: 8-wide unary vector function to apply
 ;; $4: 16-wide operand value
 define(`unary8to16', `
  %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef,
             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %v$1_0 = call <8 x $2> $3(<8 x $2> %$1_0)
  %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef,
             <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %v$1_1 = call <8 x $2> $3(<8 x $2> %$1_1)
  %$1 = shufflevector <8 x $2> %v$1_0, <8 x $2> %v$1_1, 
           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 '
 )
 ;; And along the lines of `binary2to4', this maps a 4-wide binary function to
 ;; two 8-wide vector operands
 ;; $1: name of variable into which the final result should go
@@ -231,6 +296,57 @@ define(`binary4to8', `
 '
 )
 ;; Maps an 8-wide binary function to two 16-wide vector operands: each
 ;; operand is split into lower and upper 8-wide halves, the function is
 ;; applied to the matching halves, and the two results are concatenated.
 ;; $1: name of variable into which the final result should go
 ;; $2: scalar type of the vector elements
 ;; $3: 8-wide binary vector function to apply
 ;; $4: first 16-wide operand value
 ;; $5: second 16-wide operand value
 define(`binary8to16', `
 %$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 %$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 %v$1_0 = call <8 x $2> $3(<8 x $2> %$1_0a, <8 x $2> %$1_0b)
 %$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 %$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 %v$1_1 = call <8 x $2> $3(<8 x $2> %$1_1a, <8 x $2> %$1_1b)
 %$1 = shufflevector <8 x $2> %v$1_0, <8 x $2> %v$1_1, 
         <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                     i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 '
 )
 ;; Maps a 4-wide binary function to two 16-wide vector operands: each
 ;; operand is split into four 4-wide pieces, the function is applied to the
 ;; matching pieces, and the four results are concatenated into a 16-wide
 ;; vector.
 ;; $1: name of variable into which the final result should go
 ;; $2: scalar type of the vector elements
 ;; $3: 4-wide binary vector function to apply
 ;; $4: first 16-wide operand value
 ;; $5: second 16-wide operand value
 define(`binary4to16', `
 %$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 %$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 %r$1_0 = call <4 x $2> $3(<4 x $2> %$1_0a, <4 x $2> %$1_0b) 
 %$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 %$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef,
          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 %r$1_1 = call <4 x $2> $3(<4 x $2> %$1_1a, <4 x $2> %$1_1b) 
 %$1_2a = shufflevector <16 x $2> $4, <16 x $2> undef,
          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 %$1_2b = shufflevector <16 x $2> $5, <16 x $2> undef,
          <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 %r$1_2 = call <4 x $2> $3(<4 x $2> %$1_2a, <4 x $2> %$1_2b) 
 %$1_3a = shufflevector <16 x $2> $4, <16 x $2> undef,
          <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 %$1_3b = shufflevector <16 x $2> $5, <16 x $2> undef,
          <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 %r$1_3 = call <4 x $2> $3(<4 x $2> %$1_3a, <4 x $2> %$1_3b)
 ; join the four 4-wide results pairwise into two 8-wide vectors and then
 ; into the final 16-wide result
 %r$1_01 = shufflevector <4 x $2> %r$1_0, <4 x $2> %r$1_1, 
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 %r$1_23 = shufflevector <4 x $2> %r$1_2, <4 x $2> %r$1_3, 
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 %$1 = shufflevector <8 x $2> %r$1_01, <8 x $2> %r$1_23, 
          <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                      i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ')
 ;; Maps a 2-wide unary function to an 8-wide vector operand, returning an 
 ;; 8-wide vector result
@@ -306,6 +422,20 @@ ret <8 x float> %ret
 '
 )
 ;; Rounds a 16-wide float vector by splitting it into two 8-wide halves,
 ;; calling @llvm.x86.avx.round.ps.256 on each half, and concatenating the
 ;; results.  The expansion ends with a ret of the 16-wide result.
 ;; $1: 16-wide float operand value
 ;; $2: integer constant passed as the i32 argument of the round intrinsic
 define(`round8to16', `
 %v0 = shufflevector <16 x float> $1, <16 x float> undef,
        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 %v1 = shufflevector <16 x float> $1, <16 x float> undef,
        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 %r0 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %v0, i32 $2)
 %r1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %v1, i32 $2)
 %ret = shufflevector <8 x float> %r0, <8 x float> %r1, 
         <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                     i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <16 x float> %ret
 '
 )
 define(`round4to8double', `
 %v0 = shufflevector <8 x double> $1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 %v1 = shufflevector <8 x double> $1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -349,6 +479,30 @@ ret <8 x double> %ret
 '
 )
 ;; Rounds a 16-wide double vector by splitting it into four 4-wide pieces,
 ;; calling @llvm.x86.avx.round.pd.256 on each piece, and concatenating the
 ;; results.  The expansion ends with a ret of the 16-wide result.
 ;; $1: 16-wide double operand value
 ;; $2: integer constant passed as the i32 argument of the round intrinsic
 define(`round4to16double', `
 %v0 = shufflevector <16 x double> $1, <16 x double> undef,
         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 %v1 = shufflevector <16 x double> $1, <16 x double> undef,
         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 %v2 = shufflevector <16 x double> $1, <16 x double> undef,
         <4 x i32> <i32 8, i32 9, i32 10, i32 11>
 %v3 = shufflevector <16 x double> $1, <16 x double> undef,
         <4 x i32> <i32 12, i32 13, i32 14, i32 15>
 %r0 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v0, i32 $2)
 %r1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v1, i32 $2)
 %r2 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v2, i32 $2)
 %r3 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v3, i32 $2)
 ; join the four 4-wide results pairwise into two 8-wide vectors and then
 ; into the final 16-wide result
 %ret0 = shufflevector <4 x double> %r0, <4 x double> %r1, 
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 %ret1 = shufflevector <4 x double> %r2, <4 x double> %r3, 
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 %ret = shufflevector <8 x double> %ret0, <8 x double> %ret1,
          <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                      i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <16 x double> %ret
 '
 )
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; forloop macro
@@ -468,12 +622,91 @@ forloop(i, 1, eval($1-1), `
 }
 ')
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; global_atomic
+;; global_atomic_associative
 ;; More efficient implementation for atomics that are associative (e.g.,
 ;; add, and, ...).  If a basic implementation would do something like:
 ;; result0 = atomic_op(ptr, val0)
 ;; result1 = atomic_op(ptr, val1)
 ;; ..
 ;; Then instead we can do:
 ;; tmp = (val0 op val1 op ...)
 ;; result0 = atomic_op(ptr, tmp)
 ;; result1 = (result0 op val0)
 ;; ..
 ;; And more efficiently compute the same result
 ;;
 ;; Takes five parameters:
 ;; $1: vector width of the target
 ;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
 ;;     (add, sub...)
 ;; $3: return type of the LLVM atomic (e.g. i32)
 ;; $4: return type of the LLVM atomic type, in ispc naming parlance (e.g. int32)
 ;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...)
 define(`global_atomic_associative', `
 define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
                                                 <$1 x i32> %m) nounwind alwaysinline {
  ; first, for any lanes where the mask is off, compute a vector where those lanes
  ; hold the identity value..
  ; for the bit tricks below, we need the mask to be sign extended to be
  ; the size of the element type.
  ifelse($3, `i64', `%mask = sext <$1 x i32> %m to <$1 x i64>')
  ifelse($3, `i32', `
     ; silly workaround to do %mask = %m, which is not possible directly..
     %maskmem = alloca <$1 x i32>
     store <$1 x i32> %m, <$1 x i32> * %maskmem
     %mask = load <$1 x i32> * %maskmem'
  )
  ; zero out any lanes that are off
  %valoff = and <$1 x $3> %val, %mask
  ; compute an identity vector that is zero in on lanes and has the identity value
  ; in the off lanes
  ; (the identity value is first splatted across all lanes and the on lanes
  ; are then cleared with the inverted mask)
  %idv1 = bitcast $3 $5 to <1 x $3>
  %idvec = shufflevector <1 x $3> %idv1, <1 x $3> undef,
     <$1 x i32> < forloop(i, 1, eval($1-1), `i32 0, ') i32 0 >
  %notmask = xor <$1 x $3> %mask, < forloop(i, 1, eval($1-1), `$3 -1, ') $3 -1 >
  %idoff = and <$1 x $3> %idvec, %notmask
  ; and compute the merged vector that holds the identity in the off lanes
  %valp = or <$1 x $3> %valoff, %idoff
  ; now compute the local reduction (val0 op val1 op ... )--initialize
  ; %eltvec so that the 0th element is the identity, the first is val0,
  ; the second is (val0 op val1), ..
  ; NOTE(review): the reduction applies the operation itself to the lane values,
  ; so for sub it yields val0 - val1 - ...; confirm that this is what is intended.
  %red0 = extractelement <$1 x $3> %valp, i32 0
  %eltvec0 = insertelement <$1 x $3> undef, $3 $5, i32 0
  forloop(i, 1, eval($1-1), `
  %elt`'i = extractelement <$1 x $3> %valp, i32 i
  %red`'i = $2 $3 %red`'eval(i-1), %elt`'i
  %eltvec`'i = insertelement <$1 x $3> %eltvec`'eval(i-1), $3 %red`'eval(i-1), i32 i')
  ; make the atomic call, passing it the final reduced value
  %final0 = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %red`'eval($1-1))
  ; now go back and compute the values to be returned for each program 
  ; instance--this just involves smearing the old value returned from the
  ; actual atomic call across the vector and applying the vector op to the
  ; %eltvec vector computed above..
  %finalv1 = bitcast $3 %final0 to <1 x $3>
  %final_base = shufflevector <1 x $3> %finalv1, <1 x $3> undef,
     <$1 x i32> < forloop(i, 1, eval($1-1), `i32 0, ') i32 0 >
  %r = $2 <$1 x $3> %final_base, %eltvec`'eval($1-1)
  ret <$1 x $3> %r
 }
 ')
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; global_atomic_uniform
 ;; Defines the implementation of a function that handles the mapping from
-;; an ispc atomic function to the underlying LLVM intrinsics.  Specifically,
+;; an ispc atomic function to the underlying LLVM intrinsics.  This variant
-;; the function handles loooping over the active lanes, calling the underlying
+;; just calls the atomic once, for the given uniform value
 ;; scalar atomic intrinsic for each one, and assembling the vector result.
 ;;
 ;; Takes four parameters:
 ;; $1: vector width of the target
@@ -482,23 +715,14 @@ forloop(i, 1, eval($1-1), `
 ;; $3: return type of the LLVM atomic (e.g. i32)
 ;; $4: return type of the LLVM atomic type, in ispc naming parlance (e.g. int32)
-define(`global_atomic', `
+define(`global_atomic_uniform', `
 declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta)
-define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
+define internal $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val,
                                          <$1 x i32> %mask) nounwind alwaysinline {
-  %rptr = alloca <$1 x $3>
+  %r = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %val)
-  %rptr32 = bitcast <$1 x $3> * %rptr to $3 *
+  ret $3 %r
  per_lane($1, <$1 x i32> %mask, `
   %v_LANE_ID = extractelement <$1 x $3> %val, i32 LANE
   %r_LANE_ID = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %v_LANE_ID)
   %rp_LANE_ID = getelementptr $3 * %rptr32, i32 LANE
   store $3 %r_LANE_ID, $3 * %rp_LANE_ID')
  %r = load <$1 x $3> * %rptr
  ret <$1 x $3> %r
 }
 ')
@@ -508,9 +732,10 @@ define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
 ;; $2: llvm type of the vector elements (e.g. i32)
 ;; $3: ispc type of the elements (e.g. int32)
-define(`global_swap', `
+declare i32 @llvm.atomic.swap.i32.p0i32(i32 * %ptr, i32 %val)
 declare i64 @llvm.atomic.swap.i64.p0i64(i64 * %ptr, i64 %val)
-declare $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val)
+define(`global_swap', `
 define internal <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val,
                                                   <$1 x i32> %mask) nounwind alwaysinline {
@@ -526,6 +751,12 @@ define internal <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val,
  %r = load <$1 x $2> * %rptr
  ret <$1 x $2> %r
 }
 define internal $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val,
                                                    <$1 x i32> %mask) nounwind alwaysinline {
 %r = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val)
 ret $2 %r
 }
 ')
@@ -555,6 +786,12 @@ define internal <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $
  %r = load <$1 x $2> * %rptr
  ret <$1 x $2> %r
 }
 define internal $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
                               $2 %val, <$1 x i32> %mask) nounwind alwaysinline {
  %r = call $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp, $2 %val)
  ret $2 %r
 }
 ')
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -595,10 +832,11 @@ define internal void @__prefetch_read_nt_$1($2 *) alwaysinline {
 define(`stdlib_core', `
-declare i8* @ISPCMalloc(i64, i32) nounwind
+declare i32 @__fast_masked_vload()
-declare i8* @ISPCFree(i8*) nounwind
+
-declare void @ISPCLaunch(i8*, i8*) nounwind
+declare i8* @ISPCAlloc(i8**, i64, i32) nounwind
-declare void @ISPCSync() nounwind
+declare void @ISPCLaunch(i8**, i8*, i8*, i32) nounwind
 declare void @ISPCSync(i8*) nounwind
 declare void @ISPCInstrument(i8*, i8*, i32, i32) nounwind
 declare i1 @__is_compile_time_constant_mask(<$1 x i32> %mask)
@@ -965,25 +1203,35 @@ define internal void @__memory_barrier() nounwind readnone alwaysinline {
  ret void
 }
-global_atomic($1, add, i32, int32)
+global_atomic_associative($1, add, i32, int32, 0)
-global_atomic($1, sub, i32, int32)
+global_atomic_associative($1, sub, i32, int32, 0)
-global_atomic($1, and, i32, int32)
+global_atomic_associative($1, and, i32, int32, -1)
-global_atomic($1, or, i32, int32)
+global_atomic_associative($1, or, i32, int32, 0)
-global_atomic($1, xor, i32, int32)
+global_atomic_associative($1, xor, i32, int32, 0)
-global_atomic($1, min, i32, int32)
+global_atomic_uniform($1, add, i32, int32)
-global_atomic($1, max, i32, int32)
+global_atomic_uniform($1, sub, i32, int32)
-global_atomic($1, umin, i32, uint32)
+global_atomic_uniform($1, and, i32, int32)
-global_atomic($1, umax, i32, uint32)
+global_atomic_uniform($1, or, i32, int32)
 global_atomic_uniform($1, xor, i32, int32)
 global_atomic_uniform($1, min, i32, int32)
 global_atomic_uniform($1, max, i32, int32)
 global_atomic_uniform($1, umin, i32, uint32)
 global_atomic_uniform($1, umax, i32, uint32)
-global_atomic($1, add, i64, int64)
+global_atomic_associative($1, add, i64, int64, 0)
-global_atomic($1, sub, i64, int64)
+global_atomic_associative($1, sub, i64, int64, 0)
-global_atomic($1, and, i64, int64)
+global_atomic_associative($1, and, i64, int64, -1)
-global_atomic($1, or, i64, int64)
+global_atomic_associative($1, or, i64, int64, 0)
-global_atomic($1, xor, i64, int64)
+global_atomic_associative($1, xor, i64, int64, 0)
-global_atomic($1, min, i64, int64)
+global_atomic_uniform($1, add, i64, int64)
-global_atomic($1, max, i64, int64)
+global_atomic_uniform($1, sub, i64, int64)
-global_atomic($1, umin, i64, uint64)
+global_atomic_uniform($1, and, i64, int64)
-global_atomic($1, umax, i64, uint64)
+global_atomic_uniform($1, or, i64, int64)
 global_atomic_uniform($1, xor, i64, int64)
 global_atomic_uniform($1, min, i64, int64)
 global_atomic_uniform($1, max, i64, int64)
 global_atomic_uniform($1, umin, i64, uint64)
 global_atomic_uniform($1, umax, i64, uint64)
 global_swap($1, i32, int32)
 global_swap($1, i64, int64)
@@ -1006,6 +1254,24 @@ define internal <$1 x double> @__atomic_swap_double_global(double * %ptr, <$1 x
  ret <$1 x double> %ret
 }
 ; Atomic swap of a single float value: the pointer and the value are
 ; reinterpreted as 32-bit integers, the int32 version is called with the
 ; same mask, and the returned bits are reinterpreted as a float.
 define internal float @__atomic_swap_uniform_float_global(float * %ptr, float %val,
                                                   <$1 x i32> %mask) nounwind alwaysinline {
  %iptr = bitcast float * %ptr to i32 *
  %ival = bitcast float %val to i32
  %iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival, <$1 x i32> %mask)
  %ret = bitcast i32 %iret to float
  ret float %ret
 }
 ; Atomic swap of a single double value: the pointer and the value are
 ; reinterpreted as 64-bit integers, the int64 version is called with the
 ; same mask, and the returned bits are reinterpreted as a double.
 define internal double @__atomic_swap_uniform_double_global(double * %ptr, double %val,
                                                   <$1 x i32> %mask) nounwind alwaysinline {
  %iptr = bitcast double * %ptr to i64 *
  %ival = bitcast double %val to i64
  %iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival, <$1 x i32> %mask)
  %ret = bitcast i64 %iret to double
  ret double %ret
 }
 global_atomic_exchange($1, i32, int32)
 global_atomic_exchange($1, i64, int64)
@@ -1030,6 +1296,29 @@ define internal <$1 x double> @__atomic_compare_exchange_double_global(double *
  %ret = bitcast <$1 x i64> %iret to <$1 x double>
  ret <$1 x double> %ret
 }
 ; Atomic compare-and-exchange of a single float value: the pointer, the
 ; comparison value and the new value are reinterpreted as 32-bit integers,
 ; the int32 version is called with the same mask, and the returned bits are
 ; reinterpreted as a float.
 define internal float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp, float %val,
                                                   <$1 x i32> %mask) nounwind alwaysinline {
  %iptr = bitcast float * %ptr to i32 *
  %icmp = bitcast float %cmp to i32
  %ival = bitcast float %val to i32
  %iret = call i32 @__atomic_compare_exchange_uniform_int32_global(i32 * %iptr, i32 %icmp,
                                                                   i32 %ival, <$1 x i32> %mask)
  %ret = bitcast i32 %iret to float
  ret float %ret
 }
 ; Atomic compare-and-exchange of a single double value: the pointer, the
 ; comparison value and the new value are reinterpreted as 64-bit integers,
 ; the int64 version is called with the same mask, and the returned bits are
 ; reinterpreted as a double.
 define internal double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp,
                                            double %val, <$1 x i32> %mask) nounwind alwaysinline {
  %iptr = bitcast double * %ptr to i64 *
  %icmp = bitcast double %cmp to i64
  %ival = bitcast double %val to i64
  %iret = call i64 @__atomic_compare_exchange_uniform_int64_global(i64 * %iptr, i64 %icmp,
                                                                   i64 %ival, <$1 x i32> %mask)
  %ret = bitcast i64 %iret to double
  ret double %ret
 }
 ')
@@ -1088,12 +1377,6 @@ i64minmax($1,max,uint64,ugt)
 define(`load_and_broadcast', `
 define <$1 x $2> @__load_and_broadcast_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
  ; must not load if the mask is all off; the address may be invalid
  %mm = call i32 @__movmsk(<$1 x i32> %mask)
  %any_on = icmp ne i32 %mm, 0
  br i1 %any_on, label %load, label %skip
 load:
  %ptr = bitcast i8 * %0 to $2 *
  %val = load $2 * %ptr
@@ -1101,9 +1384,6 @@ load:
  forloop(i, 1, eval($1-1), `
  %ret`'i = insertelement <$1 x $2> %ret`'eval(i-1), $2 %val, i32 i')
  ret <$1 x $2> %ret`'eval($1-1)
 skip:
  ret <$1 x $2> undef
 }
 ')
@@ -1119,14 +1399,20 @@ define(`load_masked', `
 define <$1 x $2> @__load_masked_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
 entry:
  %mm = call i32 @__movmsk(<$1 x i32> %mask)
  ; if the first lane and the last lane are on, then it is safe to do a vector load
  ; of the whole thing--what the lanes in the middle want turns out to not matter...
  %mm_and = and i32 %mm, eval(1 | (1<<($1-1)))
  %can_vload = icmp eq i32 %mm_and, eval(1 | (1<<($1-1)))
  %fast32 = call i32 @__fast_masked_vload()
  %fast_i1 = trunc i32 %fast32 to i1
  %can_vload_maybe_fast = or i1 %fast_i1, %can_vload
  ; if we are not able to do a single vload, we will accumulate lanes in this memory..
  %retptr = alloca <$1 x $2>
  %retptr32 = bitcast <$1 x $2> * %retptr to $2 *
-  br i1 %can_vload, label %load, label %loop
+  br i1 %can_vload_maybe_fast, label %load, label %loop
 load: 
  %ptr = bitcast i8 * %0 to <$1 x $2> *
@@ -1261,6 +1547,46 @@ define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>,
 ')
 ;; Defines the 8-bit and 16-bit masked blend store functions for 16-wide
 ;; vectors.  Each function loads the old vector from memory, computes
 ;; (new AND mask) OR (old AND NOT mask) on the whole vector viewed as one
 ;; wide integer, and stores the result back.  The macro takes no parameters.
 ;; NOTE(review): the i32 mask lanes are truncated to the element width, so
 ;; this presumably relies on every mask lane being all ones or all zeros --
 ;; confirm against the callers.
 define(`masked_store_blend_8_16_by_16', `
 define void @__masked_store_blend_8(<16 x i8>* nocapture, <16 x i8>,
                                    <16 x i32>) nounwind alwaysinline {
  %old = load <16 x i8> * %0
  %old128 = bitcast <16 x i8> %old to i128
  %new128 = bitcast <16 x i8> %1 to i128
  ; narrow the mask to one byte per lane so it lines up with the i8 elements
  %mask8 = trunc <16 x i32> %2 to <16 x i8>
  %mask128 = bitcast <16 x i8> %mask8 to i128
  %notmask128 = xor i128 %mask128, -1
  %newmasked = and i128 %new128, %mask128
  %oldmasked = and i128 %old128, %notmask128
  %result = or i128 %newmasked, %oldmasked
  %resultvec = bitcast i128 %result to <16 x i8>
  store <16 x i8> %resultvec, <16 x i8> * %0
  ret void
 }
 define void @__masked_store_blend_16(<16 x i16>* nocapture, <16 x i16>,
                                     <16 x i32>) nounwind alwaysinline {
  %old = load <16 x i16> * %0
  %old256 = bitcast <16 x i16> %old to i256
  %new256 = bitcast <16 x i16> %1 to i256
  ; narrow the mask to 16 bits per lane so it lines up with the i16 elements
  %mask16 = trunc <16 x i32> %2 to <16 x i16>
  %mask256 = bitcast <16 x i16> %mask16 to i256
  %notmask256 = xor i256 %mask256, -1
  %newmasked = and i256 %new256, %mask256
  %oldmasked = and i256 %old256, %notmask256
  %result = or i256 %newmasked, %oldmasked
  %resultvec = bitcast i256 %result to <16 x i16>
  store <16 x i16> %resultvec, <16 x i16> * %0
  ret void
 }
 ')
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; packed load and store functions
 ;;
@@ -1288,7 +1614,7 @@ entry:
 known_mask:
  %allon = icmp eq i32 %mask, eval((1 << $1) -1)
-  br i1 %allon, label %all_on, label %not_all_on
+  br i1 %allon, label %all_on, label %unknown_mask
 all_on:
  ;; everyone wants to load, so just load an entire vector width in a single
@@ -1298,14 +1624,6 @@ all_on:
  store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr, align 4
  ret i32 $1
 not_all_on:
  %alloff = icmp eq i32 %mask, 0
  br i1 %alloff, label %all_off, label %unknown_mask
 all_off:
  ;; no one wants to load
  ret i32 0
 unknown_mask:
  br label %loop
@@ -1352,20 +1670,13 @@ entry:
 known_mask:
  %allon = icmp eq i32 %mask, eval((1 << $1) -1)
-  br i1 %allon, label %all_on, label %not_all_on
+  br i1 %allon, label %all_on, label %unknown_mask
 all_on:
  %vecptr = bitcast i32 *%startptr to <$1 x i32> *
  store <$1 x i32> %vals, <$1 x i32> * %vecptr, align 4
  ret i32 $1
 not_all_on:
  %alloff = icmp eq i32 %mask, 0
  br i1 %alloff, label %all_off, label %unknown_mask
 all_off:
  ret i32 0
 unknown_mask:
  br label %loop
@@ -1415,14 +1726,6 @@ entry:
   br i1 %allon, label %check_neighbors, label %domixed
 domixed:
  ; the mask is mixed on/off.  First see if the lanes are all off
  %alloff = icmp eq i32 %mm, 0
  br i1 %alloff, label %doalloff, label %actuallymixed
 doalloff:
  ret i1 false  ;; this seems safest
 actuallymixed: 
  ; First, figure out which lane is the first active one
  %first = call i32 @llvm.cttz.i32(i32 %mm)
  %baseval = extractelement <$1 x $2> %v, i32 %first
@@ -1445,7 +1748,7 @@ actuallymixed:
  br label %check_neighbors
 check_neighbors:
-  %vec = phi <$1 x $2> [ %blendvec, %actuallymixed ], [ %v, %entry ]
+  %vec = phi <$1 x $2> [ %blendvec, %domixed ], [ %v, %entry ]
  ifelse($6, `32', `
  ; For 32-bit elements, we rotate once and compare with the vector, which ends 
  ; up comparing each element to its neighbor on the right.  Then see if
@@ -1577,7 +1880,7 @@ pl_known_mask:
  ;; the mask is known at compile time; see if it is something we can
  ;; handle more efficiently
  %pl_is_allon = icmp eq i32 %pl_mask, eval((1<<$1)-1)
-  br i1 %pl_is_allon, label %pl_all_on, label %pl_not_all_on
+  br i1 %pl_is_allon, label %pl_all_on, label %pl_unknown_mask
 pl_all_on:
  ;; the mask is all on--just expand the code for each lane sequentially
@@ -1585,19 +1888,14 @@ pl_all_on:
          `patsubst(`$3', `ID\|LANE', i)')
  br label %pl_done
-pl_not_all_on:
+pl_unknown_mask:
-  ;; not all on--see if it is all off or mixed
+  ;; we just run the general case, though we could
  ;; for the mixed case, we just run the general case, though we could
  ;; try to be smart and just emit the code based on what it actually is,
  ;; for example by emitting the code straight-line without a loop and doing 
  ;; the lane tests explicitly, leaving later optimization passes to eliminate
  ;; the stuff that is definitely not needed.  Not clear if we will frequently 
  ;; encounter a mask that is known at compile-time but is not either all on or
  ;; all off...
  %pl_alloff = icmp eq i32 %pl_mask, 0
  br i1 %pl_alloff, label %pl_done, label %pl_unknown_mask
 pl_unknown_mask:
  br label %pl_loop
 pl_loop:
@@ -1653,20 +1951,6 @@ define internal <$1 x $2> @__gather_elt_$2(i8 * %ptr, <$1 x i32> %offsets, <$1 x
 define <$1 x $2> @__gather_base_offsets_$2(i8 * %ptr, <$1 x i32> %offsets,
                                           <$1 x i32> %vecmask) nounwind readonly alwaysinline {
 entry:
  %mask = call i32 @__movmsk(<$1 x i32> %vecmask)
  %maskKnown = call i1 @__is_compile_time_constant_mask(<$1 x i32> %vecmask)
  br i1 %maskKnown, label %known_mask, label %unknown_mask
 known_mask:
  %alloff = icmp eq i32 %mask, 0
  br i1 %alloff, label %gather_all_off, label %unknown_mask
 gather_all_off:
  ret <$1 x $2> undef
 unknown_mask:
  ; We can be clever and avoid the per-lane stuff for gathers if we are willing
  ; to require that the 0th element of the array being gathered from is always
  ; legal to read from (and we do indeed require that, given the benefits!) 
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -144,6 +144,11 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
    returnedLanesPtr = AllocaInst(LLVMTypes::MaskType, "returned_lanes_memory");
    StoreInst(LLVMMaskAllOff, returnedLanesPtr);
    launchedTasks = false;
    launchGroupHandlePtr = AllocaInst(LLVMTypes::VoidPointerType, "launch_group_handle");
    StoreInst(llvm::Constant::getNullValue(LLVMTypes::VoidPointerType), 
              launchGroupHandlePtr);
    if (!returnType || returnType == AtomicType::Void)
        returnValuePtr = NULL;
    else {
@@ -153,7 +158,6 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
        StoreInst(llvm::Constant::getNullValue(ftype), returnValuePtr);
    }
 #ifndef LLVM_2_8
    if (m->diBuilder) {
        /* If debugging is enabled, tell the debug information emission
           code about this new function */
@@ -174,16 +178,12 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
        /* And start a scope representing the initial function scope */
        StartScope();
    }
 #endif // LLVM_2_8
    launchedTasks = false;
    // connect the function's mask memory to the __mask symbol
    Symbol *maskSymbol = m->symbolTable->LookupVariable("__mask");
    assert(maskSymbol != NULL);
    maskSymbol->storagePtr = maskPtr;
 #ifndef LLVM_2_8
    // add debugging info for __mask, programIndex, ...
    if (m->diBuilder) {
        maskSymbol->pos = funcStartPos;
@@ -208,15 +208,12 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
                                           true /* static */,
                                           programCountSymbol->storagePtr);
    }
 #endif
 }
 FunctionEmitContext::~FunctionEmitContext() {
    // Sanity check: no control-flow bookkeeping entries may be left over
    // once code generation for the function has finished.
    assert(controlFlowInfo.size() == 0);
 #ifndef LLVM_2_8
    // Sanity check on the debug scope stack: exactly one scope should
    // remain when debug info is being emitted, and none otherwise.
    if (m->diBuilder)
        assert(debugScopes.size() == 1);
    else
        assert(debugScopes.size() == 0);
 #endif
 }
@@ -704,6 +701,7 @@ FunctionEmitContext::LaneMask(llvm::Value *v) {
 llvm::Value *
 FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
 #if 0
    // Compare the two masks to get a vector of i1s
    llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
                               v1, v2, "v1==v2");
@@ -711,6 +709,12 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
    cmp = I1VecToBoolVec(cmp);
    // And see if it's all on
    return All(cmp);
 #else
    llvm::Value *mm1 = LaneMask(v1);
    llvm::Value *mm2 = LaneMask(v2);
    return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mm1, mm2,
                   "v1==v2");
 #endif
 }
@@ -758,7 +762,7 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) {
 llvm::Value *
-FunctionEmitContext::EmitMalloc(LLVM_TYPE_CONST llvm::Type *ty, int align) {
+FunctionEmitContext::SizeOf(LLVM_TYPE_CONST llvm::Type *ty) {
    // Emit code to compute the size of the given type using a GEP with a
    // NULL base pointer, indexing one element of the given type, and
    // casting the resulting 'pointer' to an int giving its size.
@@ -775,24 +779,7 @@ FunctionEmitContext::EmitMalloc(LLVM_TYPE_CONST llvm::Type *ty, int align) {
 #endif
    AddDebugPos(poffset);
    llvm::Value *sizeOf = PtrToIntInst(poffset, LLVMTypes::Int64Type, "offset_int");
-
+    return sizeOf;
    // And given the size, call the malloc function
    llvm::Function *fmalloc = m->module->getFunction("ISPCMalloc");
    assert(fmalloc != NULL);
    llvm::Value *mem = CallInst(fmalloc, sizeOf, LLVMInt32(align), 
                                "raw_argmem");
    // Cast the void * back to the result pointer type
    return BitCastInst(mem, ptrType, "mem_bitcast");
 }
 void
 FunctionEmitContext::EmitFree(llvm::Value *ptr) {
    // Emits a call to the ISPCFree function declared in the module,
    // passing it the given pointer cast to the void pointer type.
    llvm::Value *rawPtr = 
        BitCastInst(ptr, LLVMTypes::VoidPointerType, "argmemfree");

    llvm::Function *freeFunc = m->module->getFunction("ISPCFree");
    assert(freeFunc != NULL);

    CallInst(freeFunc, rawPtr);
 }
@@ -850,7 +837,6 @@ FunctionEmitContext::GetDebugPos() const {
 void
 FunctionEmitContext::AddDebugPos(llvm::Value *value, const SourcePos *pos, 
                                 llvm::DIScope *scope) {
 #ifndef LLVM_2_8
    llvm::Instruction *inst = llvm::dyn_cast<llvm::Instruction>(value);
    if (inst != NULL && m->diBuilder) {
        SourcePos p = pos ? *pos : currentPos;
@@ -861,13 +847,11 @@ FunctionEmitContext::AddDebugPos(llvm::Value *value, const SourcePos *pos,
            inst->setDebugLoc(llvm::DebugLoc::get(p.first_line, p.first_column, 
                                                  scope ? *scope : GetDIScope()));
    }
 #endif
 }
 void
 FunctionEmitContext::StartScope() {
 #ifndef LLVM_2_8
    if (m->diBuilder != NULL) {
        llvm::DIScope parentScope;
        if (debugScopes.size() > 0)
@@ -881,18 +865,15 @@ FunctionEmitContext::StartScope() {
                                             currentPos.first_column);
        debugScopes.push_back(lexicalBlock);
    }
 #endif
 }
 void
 FunctionEmitContext::EndScope() {
 #ifndef LLVM_2_8
    // Nothing to do unless debug info is being generated.
    if (m->diBuilder == NULL)
        return;

    // Pop the innermost debug scope; there must be one to pop.
    assert(debugScopes.size() > 0);
    debugScopes.pop_back();
 #endif
 }
@@ -905,7 +886,6 @@ FunctionEmitContext::GetDIScope() const {
 void
 FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) {
 #ifndef LLVM_2_8
    if (m->diBuilder == NULL)
        return;
@@ -921,13 +901,11 @@ FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) {
    llvm::Instruction *declareInst = 
        m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
    AddDebugPos(declareInst, &sym->pos, &scope);
 #endif
 }
 void
 FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) {
 #ifndef LLVM_2_8
    if (m->diBuilder == NULL)
        return;
@@ -943,7 +921,6 @@ FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) {
    llvm::Instruction *declareInst = 
        m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
    AddDebugPos(declareInst, &sym->pos, &scope);
 #endif
 }
@@ -1501,27 +1478,15 @@ FunctionEmitContext::gather(llvm::Value *lvalue, const Type *type,
 void
 FunctionEmitContext::addGSMetadata(llvm::Instruction *inst, SourcePos pos) {
    // Attach the source position (file name, line, column) to the given
    // instruction as three separate metadata entries.
    llvm::Value *fileVal = llvm::MDString::get(*g->ctx, pos.name);
    llvm::Value *lineVal = LLVMInt32(pos.first_line);
    llvm::Value *columnVal = LLVMInt32(pos.first_column);

    // With LLVM 2.8, MDNode::get() is called with a pointer to the value
    // and a count of one; otherwise the value is passed directly.
 #ifdef LLVM_2_8
    llvm::MDNode *fileMD = llvm::MDNode::get(*g->ctx, &fileVal, 1);
    llvm::MDNode *lineMD = llvm::MDNode::get(*g->ctx, &lineVal, 1);
    llvm::MDNode *columnMD = llvm::MDNode::get(*g->ctx, &columnVal, 1);
 #else
    llvm::MDNode *fileMD = llvm::MDNode::get(*g->ctx, fileVal);
    llvm::MDNode *lineMD = llvm::MDNode::get(*g->ctx, lineVal);
    llvm::MDNode *columnMD = llvm::MDNode::get(*g->ctx, columnVal);
 #endif

    inst->setMetadata("filename", fileMD);
    inst->setMetadata("line", lineMD);
    inst->setMetadata("column", columnMD);
 }
@@ -1838,9 +1803,9 @@ llvm::PHINode *
 FunctionEmitContext::PhiNode(LLVM_TYPE_CONST llvm::Type *type, int count, 
                             const char *name) {
    llvm::PHINode *pn = llvm::PHINode::Create(type, 
-#if !defined(LLVM_2_8) && !defined(LLVM_2_9)
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
                                              count, 
-#endif // !LLVM_2_8 && !LLVM_2_9
+#endif // LLVM_3_0
                                              name ? name : "phi", bblock);
    AddDebugPos(pn);
    return pn;
@@ -1933,15 +1898,9 @@ FunctionEmitContext::CallInst(llvm::Function *func, llvm::Value *arg0,
 llvm::Instruction *
 FunctionEmitContext::ReturnInst() {
-    if (launchedTasks) {
+    if (launchedTasks)
-        // Automatically add a sync call at the end of any function that
+        // Add a sync call at the end of any function that launched tasks
-        // launched tasks
+        SyncInst();
        SourcePos noPos;
        noPos.name = "__auto_sync";
        ExprStmt *es = new ExprStmt(new SyncExpr(noPos), noPos);
        es->EmitCode(this); 
        delete es;
    }
    llvm::Instruction *rinst = NULL;
    if (returnValuePtr != NULL) {
@@ -1964,7 +1923,8 @@ FunctionEmitContext::ReturnInst() {
 llvm::Instruction *
 FunctionEmitContext::LaunchInst(llvm::Function *callee, 
-                                std::vector<llvm::Value *> &argVals) {
+                                std::vector<llvm::Value *> &argVals,
                                llvm::Value *launchCount) {
    if (callee == NULL) {
        assert(m->errorCount > 0);
        return NULL;
@@ -1981,20 +1941,15 @@ FunctionEmitContext::LaunchInst(llvm::Function *callee,
        static_cast<LLVM_TYPE_CONST llvm::StructType *>(pt->getElementType());
    assert(argStructType->getNumElements() == argVals.size() + 1);
    llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
    assert(falloc != NULL);
    int align = 4 * RoundUpPow2(g->target.nativeVectorWidth);
-#ifdef ISPC_IS_WINDOWS
+    std::vector<llvm::Value *> allocArgs;
-    // Use malloc() to allocate storage on Windows, since the stack is
+    allocArgs.push_back(launchGroupHandlePtr);
-    // generally not big enough there to do enough allocations for lots of
+    allocArgs.push_back(SizeOf(argStructType));
-    // tasks and then things crash horribly...
+    allocArgs.push_back(LLVMInt32(align));
-    llvm::Value *argmem = EmitMalloc(argStructType, align);
+    llvm::Value *voidmem = CallInst(falloc, allocArgs, "args_ptr");
-#else
+    llvm::Value *argmem = BitCastInst(voidmem, pt);
    // Use alloca for space for the task args on OSX And Linux.  KEY
    // DETAIL: pass false to the call of FunctionEmitContext::AllocaInst so
    // that the alloca doesn't happen just once at the top of the function,
    // but happens each time the enclosing basic block executes.
    llvm::Value *argmem = AllocaInst(argStructType, "argmem", align, false);
 #endif // ISPC_IS_WINDOWS
    llvm::Value *voidmem = BitCastInst(argmem, LLVMTypes::VoidPointerType);
    // Copy the values of the parameters into the appropriate place in
    // the argument block
@@ -2016,5 +1971,32 @@ FunctionEmitContext::LaunchInst(llvm::Function *callee,
    llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
    llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
    assert(flaunch != NULL);
-    return CallInst(flaunch, fptr, voidmem, "");
+    std::vector<llvm::Value *> args;
    args.push_back(launchGroupHandlePtr);
    args.push_back(fptr);
    args.push_back(voidmem);
    args.push_back(launchCount);
    return CallInst(flaunch, args, "");
 }
 void
 FunctionEmitContext::SyncInst() {
    llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr, NULL);
    llvm::Value *nullPtrValue = llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);
    llvm::Value *nonNull = CmpInst(llvm::Instruction::ICmp,
                                   llvm::CmpInst::ICMP_NE,
                                   launchGroupHandle, nullPtrValue);
    llvm::BasicBlock *bSync = CreateBasicBlock("call_sync");
    llvm::BasicBlock *bPostSync = CreateBasicBlock("post_sync");
    BranchInst(bSync, bPostSync, nonNull);
    SetCurrentBasicBlock(bSync);
    llvm::Function *fsync = m->module->getFunction("ISPCSync");
    if (fsync == NULL)
        FATAL("Couldn't find ISPCSync declaration?!");
    CallInst(fsync, launchGroupHandle, "");
    BranchInst(bPostSync);
    SetCurrentBasicBlock(bPostSync);
 }
--- a/ctx.h
+++ b/ctx.h
@@ -210,15 +210,8 @@ public:
        i32. */
    llvm::Value *I1VecToBoolVec(llvm::Value *b);
-    /** Emit code to call the user-supplied ISPCMalloc function to
+    /** Returns the size of the given type. */
-        allocate space for an object of thee given type.  Returns the
+    llvm::Value *SizeOf(LLVM_TYPE_CONST llvm::Type *ty);
        pointer value returned by the ISPCMalloc call. */
    llvm::Value *EmitMalloc(LLVM_TYPE_CONST llvm::Type *ty, int align = 0);
    /** Emit code to call the user-supplied ISPCFree function, passing it
        the given pointer to storage previously allocated by an
        EmitMalloc() call. */
    void EmitFree(llvm::Value *ptr);
    /** If the user has asked to compile the program with instrumentation,
        this inserts a callback to the user-supplied instrumentation
@@ -399,7 +392,10 @@ public:
    /** Launch an asynchronous task to run the given function, passing it
        the given argument values. */
    llvm::Instruction *LaunchInst(llvm::Function *callee, 
-                                  std::vector<llvm::Value *> &argVals);
+                                  std::vector<llvm::Value *> &argVals,
                                  llvm::Value *launchCount);
    void SyncInst();
    llvm::Instruction *ReturnInst();
    /** @} */
@@ -489,6 +485,11 @@ private:
    /** True if a 'launch' statement has been encountered in the function. */
    bool launchedTasks;
    /** This is a pointer to a void * that is passed to the ISPCLaunch(),
        ISPCAlloc(), and ISPCSync() routines as a handle to the group of
        tasks launched from the current function. */
    llvm::Value *launchGroupHandlePtr;
    llvm::Value *pointerVectorToVoidPointers(llvm::Value *value);
    static void addGSMetadata(llvm::Instruction *inst, SourcePos pos);
    bool ifsInLoopAllUniform() const;
--- a/decl.cpp
+++ b/decl.cpp
@@ -237,7 +237,7 @@ Declarator::GetType(DeclSpecs *ds) const {
                    sprintf(buf, "__anon_parameter_%d", i);
                    sym = new Symbol(buf, pos);
                    Declarator *declarator = new Declarator(sym, sym->pos);
-                    sym->type = declarator->GetType(ds);
+                    sym->type = declarator->GetType(d->declSpecs);
                    d->declarators.push_back(declarator);
                }
                else {
--- a/docs/ReleaseNotes.txt
+++ b/docs/ReleaseNotes.txt
@@ -1,3 +1,88 @@
 === v1.0.10 === (30 September 2011)
 This release features an extensive new example showing the application of
 ispc to a deferred shading algorithm for scenes with thousands of lights
 (examples/deferred).  This is an implementation of the algorithm that Johan
 Andersson described at SIGGRAPH 2009 and was implemented by Andrew
 Lauritzen and Jefferson Montgomery.  The basic idea is that a pre-rendered
 G-buffer is partitioned into tiles, and in each tile, the set of lights
 that contribute to the tile is computed.  The pixels in the tile are
 then shaded using those light sources. (See slides 19-29 of
 http://s09.idav.ucdavis.edu/talks/04-JAndersson-ParallelFrostbite-Siggraph09.pdf
 for more details on the algorithm.)
 The mechanism for launching tasks from ispc code has been generalized to
 allow multiple tasks to be launched with a single launch call (see
 http://ispc.github.com/ispc.html#task-parallelism-language-syntax for more
 information.)
 A few new functions have been added to the standard library: num_cores()
 returns the number of cores in the system's CPU, and variants of all of the
 atomic operators that take 'uniform' values as parameters have been added.
 === v1.0.9 === (26 September 2011)
 The binary release of v1.0.9 is the first that supports AVX code
 generation.  Two targets are provided: "avx", which runs with a
 programCount of 8, and "avx-x2" which runs 16 program instances
 simultaneously.  (This binary is also built using the in-progress LLVM 3.0
 development libraries, while previous ones have been built with the
 released 2.9 version of LLVM.)
 This release has no other significant changes beyond a number of small
 bugfixes (https://github.com/ispc/ispc/issues/100,
 https://github.com/ispc/ispc/issues/101, https://github.com/ispc/ispc/issues/103.)
 === v1.0.8 === (19 September 2011)
 A number of improvements have been made to handling of 'if' statements in
 the language:
  - A bug was fixed where invalid memory could be incorrectly accessed even
    if none of the running program instances wanted to execute the
    corresponding instructions (https://github.com/ispc/ispc/issues/74).
  - The code generated for 'if' statements is a bit simpler and thus more
    efficient.
 There is now a '--pic' command-line argument that causes position-independent
 code to be generated (Linux and OSX only).
 A number of additional performance improvements:
  - Loops are now unrolled by default; the --opt=disable-loop-unroll
    command-line argument can be used to disable this behavior.
    (https://github.com/ispc/ispc/issues/78)
  - A few more cases where gathers/scatters could be determined at compile
    time to actually access contiguous locations have been added.
    (https://github.com/ispc/ispc/issues/79)
 Finally, warnings are now issued (if possible) when it can be determined
 at compile-time that an out-of-bounds array index is being used.
 (https://github.com/ispc/ispc/issues/98).
 === v1.0.7 === (3 September 2011)
 The various atomic_*_global() standard library functions are generally
 substantially more efficient.  They all previously issued one hardware
 atomic instruction for each running program instance but now locally
 compute a reduction over the operands and issue a single hardware atomic,
 giving the same effect and results in the end (issue #57).
 CPU/ISA target handling has been substantially improved.  If no CPU is
 specified, the host CPU type is used, not just a default of "nehalem".  A
 number of bugs were fixed that ensure that LLVM doesn't generate SSE>2
 instructions when using the SSE2 target (fixes issue #82).
 Shift rights of unsigned integer types use a logical shift right
 instruction now, not an arithmetic shift right (fixed issue #88).
 When emitting header files, 'extern' declarations of globals used in ispc
 code are now outside of the ispc namespace.  Fixes issue #64.
 The stencil example has been modified to do runs with and without
 parallelism.
 Many other small bugfixes and improvements.
 === v1.0.6 === (17 August 2011)
 Some additional cross-program instance operations have been added to the
--- a/docs/ispc.txt
+++ b/docs/ispc.txt
@@ -33,6 +33,17 @@ The main goals behind ``ispc`` are to:
 number of non-trivial workloads that aren't handled well by other
 compilation approaches (e.g. loop auto-vectorization.)
 **We are very interested in your feedback and comments about ispc and
 in hearing your experiences using the system.  We are especially interested
 in hearing if you try using ispc but see results that are not as you
 were expecting or hoping for.** We encourage you to send a note with your
 experiences or comments to the `ispc-users`_ mailing list or to file bug or
 feature requests with the ``ispc`` `bug tracker`_. (Thanks!)
 .. _ispc-users: http://groups.google.com/group/ispc-users
 .. _bug tracker: https://github.com/ispc/ispc/issues?state=open
 Contents:
 * `Recent Changes to ISPC`_
@@ -69,7 +80,8 @@ Contents:
  + `Program Instance Convergence`_
  + `Data Races`_
  + `Uniform Variables and Varying Control Flow`_
-  + `Task Parallelism in ISPC`_
+  + `Task Parallelism: Language Syntax`_
  + `Task Parallelism: Runtime Requirements`_
 * `The ISPC Standard Library`_
@@ -80,6 +92,7 @@ Contents:
  + `Conversions To and From Half-Precision Floats`_
  + `Atomic Operations and Memory Fences`_
  + `Prefetches`_
  + `System Information`_
  + `Low-Level Bits`_
 * `Interoperability with the Application`_
@@ -102,6 +115,8 @@ Contents:
  + `Small Performance Tricks`_
  + `Instrumenting Your ISPC Programs`_
  + `Using Scan Operations For Variable Output`_
  + `Application-Supplied Execution Masks`_
  + `Explicit Vector Programming With Uniform Short Vector Types`_
 * `Disclaimer and Legal Information`_
@@ -824,8 +839,8 @@ by default.  If a function is declared with a ``static`` qualifier, then it
 is only visible in the file in which it was declared.
 Any function that can be launched with the ``launch`` construct in ``ispc``
-must have a ``task`` qualifier; see `Task Parallelism in ISPC`_ for more
+must have a ``task`` qualifier; see `Task Parallelism: Language Syntax`_
-discussion of launching tasks in ``ispc``.
+for more discussion of launching tasks in ``ispc``.
 Functions that are intended to be called from C/C++ application code must
 have the ``export`` qualifier.  This causes them to have regular C linkage
@@ -926,8 +941,9 @@ execution model is critical for writing efficient and correct programs in
 ``ispc`` supports both task parallelism to parallelize across multiple
 cores and SPMD parallelism to parallelize across the SIMD vector lanes on a
-single core.  This section focuses on SPMD parallelism.  See the section
+single core.  This section focuses on SPMD parallelism.  See the sections
-`Task Parallelism in ISPC`_ for discussion of task parallelism in ``ispc``.
+`Task Parallelism: Language Syntax`_ and `Task Parallelism: Runtime
 Requirements`_ for discussion of task parallelism in ``ispc``.
 The SPMD-on-SIMD Execution Model
 --------------------------------
@@ -1174,7 +1190,7 @@ This code implicitly assumes that ``programCount`` evenly divides
 ::
    for (uniform int i = 0; i < count; i += programCount) {
-        if (i + programIndex < programCount) {
+        if (i + programIndex < count) {
            float d = data[i + programIndex];
            ...
@@ -1370,112 +1386,190 @@ be modified in the above code even if *none* of the program instances
 evaluated a true value for the test, given the ``ispc`` execution model.
-Task Parallelism in ISPC
+Task Parallelism: Language Syntax
------------------------
+---------------------------------
 One option for combining task-parallelism with ``ispc`` is to just use
 regular task parallelism in the C/C++ application code (be it through
-Intel® Cilk(tm), Intel® Thread Building Blocks or another task system,
+Intel® Cilk(tm), Intel® Thread Building Blocks or another task system), and
-etc.), and for tasks to use ``ispc`` for SPMD parallelism across the vector
+for tasks to use ``ispc`` for SPMD parallelism across the vector lanes as
-lanes as appropriate.  Alternatively, ``ispc`` also has some support for
+appropriate.  Alternatively, ``ispc`` also has support for launching tasks
-launching tasks from ``ispc`` code.  The approach is similar to Intel®
+from ``ispc`` code.  The approach is similar to Intel® Cilk's task launch
-Cilk's task launch feature.  (See the ``examples/mandelbrot_tasks`` example
+feature.  (See the ``examples/mandelbrot_tasks`` example to see it used in
-to see it used in a non-trivial example.)
+a small example.)
-Any function that is launched as a task must be declared with the ``task``
+First, any function that is launched as a task must be declared with the
-qualifier:
+``task`` qualifier:
 ::
-    task void func(uniform float a[], uniform int start) {
+    task void func(uniform float a[], uniform int index) {
-        ....
+        ...
        a[index] = ....
    }
 Tasks must return ``void``; a compile time error is issued if a
 non-``void`` task is defined.
-Given a task, one can then write code that launches tasks as follows:
+Given a task definition, there are two ways to write code that launches
 tasks, using the ``launch`` construct.  First, one task can be launched at
 a time, with parameters passed to the task to help it determine what part
 of the overall computation it's responsible for:
 ::
    for (uniform int i = 0; i < 100; ++i)
-        launch < func(a, i); >
+        launch < func(a, i) >;
 Note the ``launch`` keyword and the brackets around the function call.
 This code launches 100 tasks, each of which presumably does some
-computation keyed off of given the value ``i``.  In general, one should
+computation that is keyed off of the given value ``i``.  In general, one
-launch many more tasks than there are processors in the system to
+should launch many more tasks than there are processors in the system to
 ensure good load-balancing, but not so many that the overhead of scheduling
 and running tasks dominates the computation.
-Program execution continues asynchronously after task launch; thus, the
+Alternatively, a number of tasks may be launched from a single ``launch``
-function shouldn't access values being generated by the tasks without
+statement.  We might instead write the above example with a single
-synchronization.  A function uses a ``sync`` statement to wait for all
+``launch`` like this:
 launched tasks to finish:
 ::
-    for (uniform int i = 0; i < 100; ++i)
+    launch[100] < func2(a) >;
-        launch < func(a, i); >
+
 Here, an integer value (not necessarily a compile-time constant) is
 provided to the ``launch`` keyword in square brackets; this number of tasks
 will be enqueued to be run asynchronously.  Within each of the tasks, two
 special built-in variables are available--``taskIndex`` and ``taskCount``.
 The first, ``taskIndex``, ranges from zero to one less than the number of
 tasks provided to ``launch``, and ``taskCount`` equals the number of launched
 tasks.  Thus, we might use ``taskIndex`` in the implementation of ``func2``
 to determine which array element to process.
 ::
    task void func2(uniform float a[]) {
        ...
        a[taskIndex] = ...
    }
 Program execution continues asynchronously after a ``launch`` statement;
 thus, a function shouldn't access values being generated by the tasks it
 has launched within the function without synchronization.  If results are
 needed before function return, a function can use a ``sync`` statement to
 wait for all launched tasks to finish:
 ::
    launch[100] < func2(a) >;
    sync;
    // now safe to use computed values in a[]...
-Alternatively, any function that launches tasks has an implicit ``sync``
+Alternatively, any function that launches tasks has an automatically-added
-before it returns, so that functions that call a function that launches
+``sync`` statement before it returns, so that functions that call a
-tasks don't have to worry about outstanding asynchronous computation.
+function that launches tasks don't have to worry about outstanding
 asynchronous computation from that function.
 Inside functions with the ``task`` qualifier, two additional built-in
-variables are provided: ``threadIndex`` and ``threadCount``.
+variables are provided in addition to ``taskIndex`` and ``taskCount``:
-``threadCount`` gives the total number of hardware threads that have been
+``threadIndex`` and ``threadCount``.  ``threadCount`` gives the total
-launched by the task system.  ``threadIndex`` provides an index between
+number of hardware threads that have been launched by the task system.
-zero and ``threadCount-1`` that gives a unique index that corresponds to
+``threadIndex`` provides an index between zero and ``threadCount-1`` that
-the hardware thread that is executing the current task.  The
+gives a unique index that corresponds to the hardware thread that is
-``threadIndex`` can be used for accessing data that is private to the
+executing the current task.  The ``threadIndex`` can be used for accessing
-current thread and thus doesn't require synchronization to access under
+data that is private to the current thread and thus doesn't require
-parallel execution.
+synchronization to access under parallel execution.
 Task Parallelism: Runtime Requirements
 --------------------------------------
 If you use the task launch feature in ``ispc``, you must provide C/C++
-implementations of two functions and link them into your final executable
+implementations of three specific functions that manage launching and
-file.  Although these functions may be implemented in either language, they
+synchronizing parallel tasks; these functions must be linked into your
-must have "C" linkage (i.e. their prototypes must be declared inside an
+executable.  Although these functions may be implemented in any
-``extern "C"`` block if they are defined in C++.)
+language, they must have "C" linkage (i.e. their prototypes must be
 declared inside an ``extern "C"`` block if they are defined in C++.)
 By using user-supplied versions of these functions, ``ispc`` programs can
 easily interoperate with software systems that have existing task systems
 for managing parallelism.  If you're using ``ispc`` with a system that
 isn't otherwise multi-threaded and don't want to write custom
 implementations of them, you can use the implementations of these functions
 provided in the ``examples/tasksys.cpp`` file in the ``ispc``
 distributions.
 If you are implementing your own task system, the remainder of this section
 discusses the requirements for these calls.  You will also likely want to
 review the example task systems in ``examples/tasksys.cpp`` for reference.
 If you are not implementing your own task system, you can skip reading the
 remainder of this section.
 Here are the declarations of the three functions that must be provided to
 manage tasks in ``ispc``:
 ::
-    void ISPCLaunch(void *funcptr, void *data);
+    void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
-    void ISPCSync();
+    void ISPCLaunch(void **handlePtr, void *f, void *data, int count);
    void ISPCSync(void *handle);
-On Windows, two additional functions must be provided to dynamically
+All three of these functions take an opaque handle (or a pointer to an
-allocate and free memory to store the arguments passed to tasks.  (On OSX
+opaque handle) as their first parameter.  This handle allows the task
-and Linux, the stack provides memory for task arguments; on Windows, the
+system runtime to distinguish between calls to these functions from
-stack is generally not large enough to do this for large numbers of tasks.)
+different functions in ``ispc`` code.  In this way, the task system
 implementation can efficiently wait for completion on just the tasks
 launched from a single function.
 The first time one of ``ISPCLaunch()`` or ``ISPCAlloc()`` is called in an
 ``ispc`` function, the ``void *`` pointed to by the ``handlePtr`` parameter
 will be ``NULL``.  The implementations of these functions should then
 initialize ``*handlePtr`` to a unique handle value of some sort.  (For
 example, it might allocate a small structure to record which tasks were
 launched by the current function.)  In subsequent calls to these functions
 in the emitted ``ispc`` code, the same value for ``handlePtr`` will be
 passed in, such that loading from ``*handlePtr`` will retrieve the value
 stored in the first call.
 At function exit (or at an explicit ``sync`` statement), a call to
 ``ISPCSync()`` will be generated if ``*handlePtr`` is non-``NULL``.
 Therefore, the handle value is passed directly to ``ISPCSync()``, rather
 than a pointer to it, as in the other functions.
 The ``ISPCAlloc()`` function is used to allocate small blocks of memory to
 store parameters passed to tasks.  It should return a pointer to memory
 with the given size and alignment.  Note that there is no explicit
 ``ISPCFree()`` call; instead, all memory allocated within an ``ispc``
 function should be freed when ``ISPCSync()`` is called.
 ``ISPCLaunch()`` is called to launch one or more asynchronous
 tasks.  Each ``launch`` statement in ``ispc`` code causes a call to
 ``ISPCLaunch()`` to be emitted in the generated code.  The three parameters
 after the handle pointer to this function are relatively straightforward;
 the ``void *f`` parameter holds a pointer to a function to call to run the
 work for this task, ``data`` holds a pointer to data to pass to this
 function, and ``count`` is the number of instances of this function to
 enqueue for asynchronous execution.  (In other words, ``count`` corresponds
 to the value ``n`` in a multiple-task launch statement like ``launch[n]``.)
 The signature of the provided function pointer ``f`` is
 ::
-    void *ISPCMalloc(int64_t size, int32_t alignment);
+    void (*TaskFuncPtr)(void *data, int threadIndex, int threadCount,
-    void ISPCFree(void *ptr);
+                        int taskIndex, int taskCount)
-These are called by the task launch code generated by the ``ispc``
+When this function pointer is called by one of the hardware threads managed
-compiler; the first is called to launch to launch a task and the second is
+by the task system, the ``data`` pointer passed to ``ISPCLaunch()`` should
-called to wait for, respectively.  (Factoring them out in this way
+be passed to it for its first parameter; ``threadCount`` gives the total
-allows ``ispc`` to inter-operate with the application's task system, if
+number of hardware threads that have been spawned to run tasks and
-any, rather than having a separate one of its own.)  To run a particular
+``threadIndex`` should be an integer index between zero and ``threadCount-1``
-task, the task system should cast the function pointer to a ``void (*)(void
+uniquely identifying the hardware thread that is running the task.  (These
-*, int, int)`` function pointer and then call it with the provided ``void
+values can be used to index into thread-local storage.)
 *`` data and then an index for the current hardware thread and the total
 number of hardware threads the task system has launched--in other words:
 ::
    typedef void (*TaskFuncType)(void *, int, int);
    TaskFuncType tft = (TaskFuncType)(funcptr);
    tft(data, threadIndex, threadCount);
 A number of sample task system implementations are provided with ``ispc``; 
 see the files ``tasks_concrt.cpp``, ``tasks_gcd.cpp`` and
 ``tasks_pthreads.cpp`` in the ``examples/mandelbrot_tasks`` directory of
 the ``ispc`` distribution.
 The value of ``taskCount`` should be the number of tasks launched in the
 ``launch`` statement that caused the call to ``ISPCLaunch()`` and each of
 the calls to this function should be given a unique value of ``taskIndex``
 between zero and ``taskCount-1``, to distinguish which of the instances
 of the set of launched tasks is running.
 The ISPC Standard Library
 =========================
@@ -2020,12 +2114,12 @@ end.)
 One thing to note is that the value being added to here is a
 ``uniform`` integer, while the increment amount and the return value are
-``varying``.  In other words, the semantics are that each running program
+``varying``.  In other words, the semantics of this call are that each
-instance individually issues the atomic operation with its own ``delta``
+running program instance individually issues the atomic operation with its
-value and gets the previous value of ``val`` back in return.  The atomics
+own ``delta`` value and gets the previous value of ``val`` back in return.
-for the running program instances may be issued in arbitrary order; it's
+The atomics for the running program instances may be issued in arbitrary
-not guaranteed that they will be issued in ``programIndex`` order, for
+order; it's not guaranteed that they will be issued in ``programIndex``
-example.
+order, for example.
 Here are the declarations of the ``int32`` variants of these functions.
 There are also ``int64`` equivalents as well as variants that take
@@ -2043,17 +2137,44 @@ function can be used with ``float`` and ``double`` types as well.)
  int32 atomic_xor_global(reference uniform int32 val, int32 value)
  int32 atomic_swap_global(reference uniform int32 val, int32 newval)
-There is also an atomic "compare and exchange" function; it atomically
+There are also variants of these functions that take ``uniform`` values for
-compares the value in "val" to "compare"--if they match, it assigns
+the operand and return a ``uniform`` result:
 "newval" to "val".  In either case, the old value of "val" is returned.
 (As with the other atomic operations, there are also ``unsigned`` and
 64-bit variants of this function.  Furthermore, there are ``float`` and
 ``double`` variants as well.)
 ::
  uniform int32 atomic_add_global(reference uniform int32 val,
                                  uniform int32 value)
  uniform int32 atomic_subtract_global(reference uniform int32 val,
                                       uniform int32 value)
  uniform int32 atomic_min_global(reference uniform int32 val,
                                  uniform int32 value)
  uniform int32 atomic_max_global(reference uniform int32 val,
                                  uniform int32 value)
  uniform int32 atomic_and_global(reference uniform int32 val,
                                  uniform int32 value)
  uniform int32 atomic_or_global(reference uniform int32 val,
                                  uniform int32 value)
  uniform int32 atomic_xor_global(reference uniform int32 val,
                                  uniform int32 value)
  uniform int32 atomic_swap_global(reference uniform int32 val,
                                   uniform int32 newval)
 There are also atomic swap and "compare and exchange" functions.
 Compare and exchange atomically compares the value in "val" to
 "compare"--if they match, it assigns "newval" to "val".  In either case,
 the old value of "val" is returned.  (As with the other atomic operations,
 there are also ``unsigned`` and 64-bit variants of this function.
 Furthermore, there are ``float`` and ``double`` variants as well.)
 ::
  int32 atomic_swap_global(reference uniform int32 val, int32 new)
  uniform int32 atomic_swap_global(reference uniform int32 val,
                                   uniform int32 new)
  int32 atomic_compare_exchange_global(reference uniform int32 val,
                                       int32 compare, int32 newval)
  uniform int32 atomic_compare_exchange_global(reference uniform int32 val,
                                  uniform int32 compare, uniform int32 newval)
 ``ispc`` also has a standard library routine that inserts a memory barrier
 into the code; it ensures that all memory reads and writes prior to be
@@ -2102,6 +2223,20 @@ These functions are available for all of the basic types in the
 language--``int8``, ``int16``, ``int32``, ``float``, and so forth.
 System Information
 ------------------
 A routine is available to find the number of CPU cores available in the
 system:
 ::
    int num_cores()
 This value can be useful for adapting the granularity of parallel task
 decomposition depending on the number of processors in the system.
 Low-Level Bits
 --------------
@@ -2209,14 +2344,14 @@ Both the ``foo`` and ``bar`` global variables can be accessed on each
 side.
 ``ispc`` code can also call back to C/C++.  On the ``ispc`` side, any
-application functions to be called must be declared with the ``export "C"``
+application functions to be called must be declared with the ``extern "C"``
 qualifier.
 ::
   extern "C" void foo(uniform float f, uniform float g);
-Unlike in C++, ``export "C"`` doesn't take braces to delineate
+Unlike in C++, ``extern "C"`` doesn't take braces to delineate
 multiple functions to be declared; thus, multiple C functions to be called
 from ``ispc`` must be declared as follows:
@@ -2843,6 +2978,91 @@ values to ``outArray[1]`` and ``outArray[2]``, and so forth.  The
 ``reduce_add`` call at the end returns the total number of values that the
 program instances have written to the array.
 Application-Supplied Execution Masks
 ------------------------------------
 Recall that when execution transitions from the application code to an
 ``ispc`` function, all of the program instances are initially executing.
 In some cases, it may be desired that only some of them are running, based on
 a data-dependent condition computed in the application program.  This
 situation can easily be handled via an additional parameter from the
 application.
 As a simple example, consider a case where the application code has an
 array of ``float`` values and we'd like the ``ispc`` code to update
 just specific values in that array, where which of those values to be
 updated has been determined by the application.  In C++ code, we might
 have:
 ::
    int count = ...;
    float *array = new float[count];
    bool *shouldUpdate = new bool[count];
    // initialize array and shouldUpdate
    ispc_func(array, shouldUpdate, count);
 Then, the ``ispc`` code could process this update as:
 ::
    export void ispc_func(uniform float array[], uniform bool update[],
                          uniform int count) {
        for (uniform int i = 0; i < count; i += programCount) {
            cif (update[i+programIndex] == true)
                // update array[i+programIndex]...
        }
    }
 (In this case a "coherent" if statement is likely to be worthwhile if the
 ``update`` array will tend to have sections that are either all-true or
 all-false.)
 Explicit Vector Programming With Uniform Short Vector Types
 -----------------------------------------------------------
 The typical model for programming in ``ispc`` is an *implicit* parallel
 model, where one writes a program that is apparently doing scalar
 computation on values and the program is then vectorized to run in parallel
 across the SIMD lanes of a processor.  However, ``ispc`` also has some
 support for explicit vector unit programming, where the vectorization is
 explicit.  Some computations may be more effectively described in the
 explicit model rather than the implicit model.
 This support is provided via ``uniform`` instances of short vectors 
 (as were introduced in the `Short Vector Types`_ section).  Specifically, 
 if this short program
 ::
    export uniform float<8> madd(uniform float<8> a, 
                                 uniform float<8> b, uniform float<8> c) {
        return a + b * c;
    }
 is compiled with the AVX target, ``ispc`` generates the following assembly:
 ::
    _madd:
 	vmulps	%ymm2, %ymm1, %ymm1
 	vaddps	%ymm0, %ymm1, %ymm0
 	ret
 (And similarly, if compiled with a 4-wide SSE target, two ``mulps`` and two
 ``addps`` instructions are generated, and so forth.)
 Note that ``ispc`` doesn't currently support control-flow based on
 ``uniform`` short vector types; it is thus not possible to write code like:
 ::
    export uniform int<8> count(uniform float<8> a, uniform float<8> b) {
        uniform int<8> sum = 0;
        while (a++ < b)
            ++sum;
    }
 Disclaimer and Legal Information
 ================================
--- a/doxygen.cfg
+++ b/doxygen.cfg
@@ -31,7 +31,7 @@ PROJECT_NAME           = "Intel SPMD Program Compiler"
 # This could be handy for archiving the generated documentation or
 # if some version control system is used.
-PROJECT_NUMBER         = 1.0.6
+PROJECT_NUMBER         = 1.0.10
 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
 # base path where the generated documentation will be put.
--- a/examples/README.txt
+++ b/examples/README.txt
@@ -14,6 +14,7 @@ the runtimes and the speedup delivered by ispc.  It may be instructive to
 do a side-by-side diff of the C++ and ispc implementations of these
 algorithms to learn more about writing ispc code.
 AOBench
 =======
@@ -27,6 +28,7 @@ It executes the program for the given number of iterations, rendering an
 (xres x yres) image each time and measuring the computation time with both
 serial and ispc implementations.
 AOBench_Instrumented
 ====================
@@ -40,12 +42,47 @@ is provided in the instrument.cpp file.
 *** Note: on Linux, this example currently hits an assertion in LLVM during
 *** compilation
 Deferred
 ========
 This example shows an extensive example of using ispc for efficient
 deferred shading of scenes with thousands of lights; it's an implementation
 of the algorithm that Johan Andersson described at SIGGRAPH 2009,
 implemented by Andrew Lauritzen and Jefferson Montgomery.  The basic idea
 is that a pre-rendered G-buffer is partitioned into tiles, and in each
 tile, the set of lights that contribute to the tile is first computed.
 Then, the pixels in the tile are shaded using just those light
 sources. (See slides 19-29 of
 http://s09.idav.ucdavis.edu/talks/04-JAndersson-ParallelFrostbite-Siggraph09.pdf
 for more details on the algorithm.)
 This directory includes three implementations of the algorithm:
 - An ispc implementation that first does a static partitioning of the
  screen into tiles to parallelize across the CPU cores.  Within each tile
  ispc kernels provide highly efficient implementations of the light
  culling and shading calculations.
 - A "best practices" serial C++ implementation.  This implementation does a
  dynamic partitioning of the screen, refining tiles with significant Z
  depth complexity (these tiles often have a large number of lights that
  affect them).  Within each final tile, the pixels are shaded using
  regular C++ code.
 - If the Cilk extensions are available in your compiler, an ispc
  implementation that uses Cilk will also be built.
  (See http://software.intel.com/en-us/articles/intel-cilk-plus/).  Like 
  the "best practices" serial implementation, this version does dynamic
  tile partitioning for better load balancing and then uses ispc for the
  light culling and shading.
 Mandelbrot
 ==========
 Mandelbrot set generation.  This example is extensively documented at the
 http://ispc.github.com/example.html page.
 Mandelbrot_tasks
 ================
@@ -58,6 +95,7 @@ using tasks with ispc, no task system is mandated; the user is free to plug
 in any task system they want, for ease of interoperating with existing task
 systems.
 Noise
 =====
@@ -71,6 +109,7 @@ Options
 This program implements both the Black-Scholes and Binomial options pricing
 models in both ispc and regular serial C++ code.
 RT
 ==
@@ -87,6 +126,7 @@ and triangle intersection code from pbrt; see the pbrt source code and/or
 "Physically Based Rendering" book for more about the basic algorithmic
 details.
 Simple
 ======
@@ -94,6 +134,7 @@ This is a simple "hello world" type program that shows a ~10 line
 application program calling out to a ~5 line ispc program to do a simple
 computation.
 Volume
 ======
--- a/examples/aobench/Makefile
+++ b/examples/aobench/Makefile
@@ -1,8 +1,14 @@
-CXX=g++ -m64
+ARCH = $(shell uname)
-CXXFLAGS=-Iobjs/ -O3 -Wall
+
 TASK_CXX=../tasksys.cpp
 TASK_LIB=-lpthread
 TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
 CXX=g++
 CXXFLAGS=-Iobjs/ -O3 -Wall -m64
 ISPC=ispc
-ISPCFLAGS=-O2 --fast-math --arch=x86-64
+ISPCFLAGS=-O2 --target=sse4 --arch=x86-64
 default: ao
@@ -14,12 +20,15 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ ao
-ao: dirs objs/ao.o objs/ao_serial.o objs/ao_ispc.o
+ao: dirs objs/ao.o objs/ao_serial.o objs/ao_ispc.o $(TASK_OBJ)
-	$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/ao_serial.o -lm -lpthread
+	$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/ao_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
 objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@
 objs/%.o: ../%.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@
 objs/ao.o: objs/ao_ispc.h 
 objs/%_ispc.h objs/%_ispc.o: %.ispc
--- a/examples/aobench/ao.cpp
+++ b/examples/aobench/ao.cpp
@@ -173,10 +173,30 @@ int main(int argc, char **argv)
    }
    // Report results and save image
-    printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n", minTimeISPC, 
+    printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n", 
-           width, height);
+           minTimeISPC, width, height);
    savePPM("ao-ispc.ppm", width, height); 
    //
    // Run the ispc + tasks path, test_iterations times, and report the
    // minimum time for any of them.
    //
    double minTimeISPCTasks = 1e30;
    for (unsigned int i = 0; i < test_iterations; i++) {
        memset((void *)fimg, 0, sizeof(float) * width * height * 3);
        assert(NSUBSAMPLES == 2);
        reset_and_start_timer();
        ao_ispc_tasks(width, height, NSUBSAMPLES, fimg);
        double t = get_elapsed_mcycles();
        minTimeISPCTasks = std::min(minTimeISPCTasks, t);
    }
    // Report results and save image
    printf("[aobench ispc + tasks]:\t\t[%.3f] M cycles (%d x %d image)\n", 
           minTimeISPCTasks, width, height);
    savePPM("ao-ispc-tasks.ppm", width, height); 
    //
    // Run the serial path, again test_iteration times, and report the
    // minimum time.
@@ -193,7 +213,8 @@ int main(int argc, char **argv)
    // Report more results, save another image...
    printf("[aobench serial]:\t\t[%.3f] M cycles (%d x %d image)\n", minTimeSerial, 
           width, height);
-    printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minTimeSerial / minTimeISPC);
+    printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", 
           minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
    savePPM("ao-serial.ppm", width, height); 
    return 0;
--- a/examples/aobench/ao.ispc
+++ b/examples/aobench/ao.ispc
@@ -203,8 +203,9 @@ ambient_occlusion(reference Isect isect, reference Plane plane,
 /* Compute the image for the scanlines from [y0,y1), for an overall image
   of width w and height h.
 */
-void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h, 
+static void ao_scanlines(uniform int y0, uniform int y1, uniform int w, 
-                  uniform int nsubsamples, reference uniform float image[]) {
+                         uniform int h,  uniform int nsubsamples, 
                         reference uniform float image[]) {
    static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
    static Sphere spheres[3] = {
        { { -2.0f, 0.0f, -3.5f }, 0.5f },
@@ -231,6 +232,9 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
    // direction we do per iteration and ny the number in y.
    uniform int nx = 1, ny = 1;
    // FIXME: We actually need ny to be 1 regardless of the decomposition,
    // since the task decomposition is one scanline high.
    if (programCount == 8) {
        // Do two pixels at once in the x direction
        nx = 2;
@@ -239,19 +243,21 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
            ++du;
    }
    else if (programCount == 16) {
-        // Two at once in both x and y
+        nx = 4;
-        nx = ny = 2;
+        ny = 1;
-        if ((programIndex >= 4 && programIndex < 8) || programIndex >= 12)
+        if (programIndex >= 4 && programIndex < 8)
            ++du;
-        if (programIndex >= 8)  
+        if (programIndex >= 8 && programIndex < 12)
-            ++dv;
+            du += 2;
        if (programIndex >= 12)
            du += 3;
    }
    // Now loop over all of the pixels, stepping in x and y as calculated
    // above.  (Assumes that ny divides y and nx divides x...)
    for (uniform int y = y0; y < y1; y += ny) {
        for (uniform int x = 0; x < w; x += nx)  {
-            // Figur out x,y pixel in NDC
+            // Figure out x,y pixel in NDC
            float px =  (x + du - (w / 2.0f)) / (w / 2.0f);
            float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
            float ret = 0.f;
@@ -293,7 +299,7 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
            // offset to the first pixel in the image
            uniform int offset = 3 * (y * w + x);
-            for (uniform int p = 0; p < programCount; p += 4, ++offset) {
+            for (uniform int p = 0; p < programCount; p += 4, offset += 3) {
                // Get the four sample values for this pixel
                uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
                    retArray[p+3];
@@ -315,3 +321,15 @@ export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
                    uniform float image[]) {
    ao_scanlines(0, h, w, h, nsubsamples, image);
 }
 /* Task entry point: each launched instance renders exactly one scanline,
    the one whose row number equals this instance's taskIndex. */
 static void task ao_task(uniform int width, uniform int height, 
                          uniform int nsubsamples, uniform float image[]) {
     // Half-open scanline range [firstRow, firstRow+1) handled by this task
     uniform int firstRow = taskIndex;
     ao_scanlines(firstRow, firstRow + 1, width, height, nsubsamples, image);
 }
 /* Exported entry point for the task-parallel version: launches h
    instances of ao_task, i.e. one task per scanline of the w x h image.
    No explicit sync statement is issued in this function. */
 export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, 
                           uniform float image[]) {
     // launch[h]: taskIndex in ao_task selects the scanline to render
     launch[h] < ao_task(w, h, nsubsamples, image) >;
 }
--- a/examples/aobench/ao_serial.cpp
+++ b/examples/aobench/ao_serial.cpp
@@ -140,7 +140,7 @@ ray_plane_intersect(Isect &isect, Ray &ray,
    float d = -dot(plane.p, plane.n);
    float v = dot(ray.dir, plane.n);
-    if (fabsf(v) < 1.0e-17) 
+    if (fabsf(v) < 1.0e-17f) 
        return;
    else {
        float t = -(dot(ray.org, plane.n) + d) / v;
@@ -183,11 +183,11 @@ orthoBasis(vec basis[3], const vec &n) {
    basis[2] = n;
    basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
-    if ((n.x < 0.6) && (n.x > -0.6)) {
+    if ((n.x < 0.6f) && (n.x > -0.6f)) {
        basis[1].x = 1.0;
-    } else if ((n.y < 0.6) && (n.y > -0.6)) {
+    } else if ((n.y < 0.6f) && (n.y > -0.6f)) {
        basis[1].y = 1.0;
-    } else if ((n.z < 0.6) && (n.z > -0.6)) {
+    } else if ((n.z < 0.6f) && (n.z > -0.6f)) {
        basis[1].z = 1.0;
    } else {
        basis[1].x = 1.0;
@@ -224,7 +224,7 @@ ambient_occlusion(Isect &isect, Plane &plane,
            float phi   = 2.0f * M_PI * drand48();
            float x = cosf(phi) * theta;
            float y = sinf(phi) * theta;
-            float z = sqrtf(1.0 - theta * theta);
+            float z = sqrtf(1.0f - theta * theta);
            // local . global
            float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
@@ -236,14 +236,14 @@ ambient_occlusion(Isect &isect, Plane &plane,
            ray.dir.y = ry;
            ray.dir.z = rz;
-            occIsect.t   = 1.0e+17;
+            occIsect.t   = 1.0e+17f;
            occIsect.hit = 0;
            for (int snum = 0; snum < 3; ++snum)
                ray_sphere_intersect(occIsect, ray, spheres[snum]); 
            ray_plane_intersect (occIsect, ray, plane); 
-            if (occIsect.hit) occlusion += 1.0;
+            if (occIsect.hit) occlusion += 1.f;
        }
    }
@@ -280,10 +280,10 @@ static void ao_scanlines(int y0, int y1, int w, int h, int nsubsamples,
                    ray.dir.x = px;
                    ray.dir.y = py;
-                    ray.dir.z = -1.0;
+                    ray.dir.z = -1.0f;
                    vnormalize(ray.dir);
-                    isect.t   = 1.0e+17;
+                    isect.t   = 1.0e+17f;
                    isect.hit = 0;
                    for (int snum = 0; snum < 3; ++snum)
--- a/examples/aobench/aobench.vcxproj
+++ b/examples/aobench/aobench.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -21,6 +21,7 @@
  <ItemGroup>
    <ClCompile Include="ao.cpp" />
    <ClCompile Include="ao_serial.cpp" />
    <ClCompile Include="../tasksys.cpp" />
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="ao.ispc">
--- a/examples/aobench_instrumented/Makefile
+++ b/examples/aobench_instrumented/Makefile
@@ -2,7 +2,7 @@
 CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -g3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --fast-math --instrument --arch=x86-64
+ISPCFLAGS=-O2 --instrument --arch=x86-64
 default: ao
--- a/examples/aobench_instrumented/aobench_instrumented.vcxproj
+++ b/examples/aobench_instrumented/aobench_instrumented.vcxproj
--- a/examples/deferred/Makefile
+++ b/examples/deferred/Makefile
@@ -0,0 +1,42 @@
 # Build rules for the deferred_shading example.
 # ARCH holds the output of `uname`; it selects the task system source below.
 ARCH = $(shell uname)
 # Default task system source and the library it needs at link time.
 TASK_CXX=../tasks_pthreads.cpp
 TASK_LIB=-lpthread
 # On Darwin, use ../tasks_gcd.cpp instead; no extra library is linked.
 ifeq ($(ARCH), Darwin)
  TASK_CXX=../tasks_gcd.cpp
  TASK_LIB=
 endif
 # Map the task system source (../foo.cpp) to its object file (objs/foo.o).
 TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
 CXX=g++
 CXXFLAGS=-Iobjs/ -O3 -Wall -m64
 ISPC=ispc
 ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64 --math-lib=fast
 # Object files linked into the executable (the task system object is added separately).
 OBJS=objs/main.o objs/common.o objs/kernels_ispc.o objs/dynamic_c.o objs/dynamic_cilk.o
 default: deferred_shading
 .PHONY: dirs clean
 # Keep the generated ispc header even though it is produced by a pattern rule.
 .PRECIOUS: objs/kernels_ispc.h
 # Create the output directory for objects and generated headers.
 dirs:
 	/bin/mkdir -p objs/
 clean:
 	/bin/rm -rf objs *~ deferred_shading
 # Link the example from its objects plus the selected task system object.
 deferred_shading: dirs $(OBJS) $(TASK_OBJ)
 	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) $(TASK_OBJ) -lm $(TASK_LIB)
 # C++ sources in this directory; they depend on the ispc-generated header.
 objs/%.o: %.cpp objs/kernels_ispc.h deferred.h
 	$(CXX) $< $(CXXFLAGS) -c -o $@
 # C++ sources from the parent directory (e.g. the task system).
 objs/%.o: ../%.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@
 # Compile an .ispc file into both an object file and a C/C++ header.
 objs/%_ispc.h objs/%_ispc.o: %.ispc
 	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/deferred/common.cpp
+++ b/examples/deferred/common.cpp
@@ -0,0 +1,209 @@
 /*
  Copyright (c) 2011, Intel Corporation
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */
 #ifdef _MSC_VER
 #define _CRT_SECURE_NO_WARNINGS
 #define ISPC_IS_WINDOWS
 #elif defined(__linux__)
 #define ISPC_IS_LINUX
 #elif defined(__APPLE__)
 #define ISPC_IS_APPLE
 #endif
 #include <fcntl.h>
 #include <float.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/types.h>
 #include <stdint.h>
 #include <algorithm>
 #include <assert.h>
 #include <vector>
 #ifdef ISPC_IS_WINDOWS
  #define WIN32_LEAN_AND_MEAN
  #include <windows.h>
 #endif
 #ifdef ISPC_IS_LINUX
  #include <malloc.h>
 #endif
 #include "deferred.h"
 #include "../timing.h"
 ///////////////////////////////////////////////////////////////////////////
 // Allocate `size` bytes at an address that is a multiple of `alignment`.
 // `alignment` is assumed to be a positive power of two (the fallback path
 // masks with alignment-1).  Returns NULL if the underlying allocation
 // fails.  Memory obtained here must be released with lAlignedFree().
 static void *
 lAlignedMalloc(int64_t size, int32_t alignment) {
 #if defined(ISPC_IS_WINDOWS)
     return _aligned_malloc(size, alignment);
 #elif defined(ISPC_IS_LINUX)
     return memalign(alignment, size);
 #else
     // OS X, and any platform not covered above: over-allocate, round the
     // address up to the requested alignment, and stash the pointer that
     // malloc() returned in the slot just below the aligned block so that
     // lAlignedFree() can recover it.
     const uintptr_t mask = (uintptr_t)alignment - 1;
     void *mem = malloc((size_t)size + mask + sizeof(void *));
     if (mem == NULL)
         return NULL;
     // Round up (rather than always advancing) so that an already-aligned
     // address is left in place; this keeps the aligned block within the
     // size + (alignment-1) + sizeof(void*) bytes that were allocated.
     uintptr_t addr = ((uintptr_t)mem + sizeof(void *) + mask) & ~mask;
     ((void **)addr)[-1] = mem;
     return (void *)addr;
 #endif
 }
 // Release memory obtained from lAlignedMalloc().  Passing NULL is a no-op
 // on the fallback path, where the original pointer has to be read back
 // from just below `ptr`.
 static void
 lAlignedFree(void *ptr) {
 #if defined(ISPC_IS_WINDOWS)
     _aligned_free(ptr);
 #elif defined(ISPC_IS_LINUX)
     free(ptr);
 #else
     if (ptr != NULL)
         free(((void **)ptr)[-1]);
 #endif
 }
 // Set up a width x height framebuffer: records the pixel count and
 // allocates one ALIGNMENT_BYTES-aligned plane of nPixels bytes for each of
 // the r, g and b channels.
 Framebuffer::Framebuffer(int width, int height) {
     this->nPixels = width * height;
     this->r = static_cast<uint8_t *>(lAlignedMalloc(this->nPixels, ALIGNMENT_BYTES));
     this->g = static_cast<uint8_t *>(lAlignedMalloc(this->nPixels, ALIGNMENT_BYTES));
     this->b = static_cast<uint8_t *>(lAlignedMalloc(this->nPixels, ALIGNMENT_BYTES));
 }
 // Release the three channel planes, in r, g, b order.
 Framebuffer::~Framebuffer() {
     void *planes[3] = { r, g, b };
     for (int c = 0; c < 3; ++c)
         lAlignedFree(planes[c]);
 }
 // Zero all nPixels bytes of each of the r, g and b planes.
 void
 Framebuffer::clear() {
     void *planes[3] = { r, g, b };
     for (int c = 0; c < 3; ++c)
         memset(planes[c], 0, nPixels);
 }
 // Load an input data set from the binary file at `path`.
 //
 // The file consists of an ispc::InputHeader followed by a data chunk of
 // header.inputDataChunkSize bytes.  The chunk is read into aligned memory
 // and the pointers in input->arrays are set to the offsets into it given
 // by header.inputDataArrayOffsets[].
 //
 // Returns a newly allocated InputData on success.  Returns NULL if the
 // file cannot be opened, the chunk cannot be allocated, or either read
 // comes up short; on those paths the file is closed and everything
 // allocated here is released.
 InputData *
 CreateInputDataFromFile(const char *path) {
     FILE *in = fopen(path, "rb");
     if (!in) return 0;
     InputData *input = new InputData;
     // Load header
     if (fread(&input->header, sizeof(ispc::InputHeader), 1, in) != 1) {
         fprintf(stderr, "Premature EOF reading file \"%s\"\n", path);
         fclose(in);
         delete input;
         return NULL;
     }
     // Load data chunk and update pointers
     input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize, 
                                              ALIGNMENT_BYTES);
     if (input->chunk == NULL) {
         fprintf(stderr, "Unable to allocate memory for data from file \"%s\"\n",
                 path);
         fclose(in);
         delete input;
         return NULL;
     }
     if (fread(input->chunk, input->header.inputDataChunkSize, 1, in) != 1) {
         fprintf(stderr, "Premature EOF reading file \"%s\"\n", path);
         fclose(in);
         lAlignedFree(input->chunk);
         delete input;
         return NULL;
     }
     // The file contents are fully in memory from here on.
     fclose(in);
     input->arrays.zBuffer =
         (float *)&input->chunk[input->header.inputDataArrayOffsets[idaZBuffer]];
     input->arrays.normalEncoded_x =
         (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_x]];
     input->arrays.normalEncoded_y =
         (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_y]];
     input->arrays.specularAmount =
         (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularAmount]];
     input->arrays.specularPower =
         (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularPower]];
     input->arrays.albedo_x =
         (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_x]];
     input->arrays.albedo_y =
         (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_y]];
     input->arrays.albedo_z =
         (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_z]];
     input->arrays.lightPositionView_x =
         (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_x]];
     input->arrays.lightPositionView_y =
         (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_y]];
     input->arrays.lightPositionView_z =
         (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_z]];
     input->arrays.lightAttenuationBegin =
         (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationBegin]];
     input->arrays.lightColor_x =
         (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_x]];
     input->arrays.lightColor_y =
         (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_y]];
     input->arrays.lightColor_z =
         (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_z]];
     input->arrays.lightAttenuationEnd =
         (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationEnd]];
     return input;
 }
 // Release the aligned data chunk held by `input`.  A NULL `input` (which
 // is what CreateInputDataFromFile() returns on failure) is ignored.
 void DeleteInputData(InputData *input)
 {
     if (input == NULL)
         return;
     lAlignedFree(input->chunk);
     // NOTE(review): the InputData object itself is allocated with `new` in
     // CreateInputDataFromFile() but is not deleted here -- confirm whether
     // callers are expected to delete it.
 }
 // Write the framebuffer to `filename` as a binary PPM ("P6") image.
 //
 // The image dimensions come from input->header.framebufferWidth/Height.
 // The separate r, g and b planes of `framebuffer` are interleaved into a
 // temporary RGB buffer before being written.  On failure (file cannot be
 // opened, buffer cannot be allocated, or the write comes up short) a
 // message is printed to stderr; the file and buffer are always released.
 void WriteFrame(const char *filename, const InputData *input,
                 const Framebuffer &framebuffer) {
     FILE *out = fopen(filename, "wb");
     if (out == NULL) {
         fprintf(stderr, "Unable to open file \"%s\" for writing\n", filename);
         return;
     }
     // Deswizzle and copy to RGB output
     // Doesn't need to be fast... only happens once
     size_t imageBytes = 3 * input->header.framebufferWidth * 
         input->header.framebufferHeight;
     uint8_t* framebufferAOS = (uint8_t *)lAlignedMalloc(imageBytes, ALIGNMENT_BYTES);
     if (framebufferAOS == NULL) {
         fprintf(stderr, "Unable to allocate memory to write file \"%s\"\n",
                 filename);
         fclose(out);
         return;
     }
     // Every byte of the buffer is assigned by this loop, so it does not
     // need to be cleared first.
     for (int i = 0; i < input->header.framebufferWidth * 
                         input->header.framebufferHeight; ++i) {
         framebufferAOS[3 * i + 0] = framebuffer.r[i];
         framebufferAOS[3 * i + 1] = framebuffer.g[i];
         framebufferAOS[3 * i + 2] = framebuffer.b[i];
     }
     // Write out simple PPM file
     fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth, 
             input->header.framebufferHeight);
     if (fwrite(framebufferAOS, imageBytes, 1, out) != 1)
         fprintf(stderr, "Error writing image data to file \"%s\"\n", filename);
     // Closing the stream flushes the buffered data and releases the handle.
     fclose(out);
     lAlignedFree(framebufferAOS);
 }
--- a/examples/deferred/data/pp1280x720.bin
+++ b/examples/deferred/data/pp1280x720.bin
--- a/examples/deferred/data/pp1920x1200.bin
+++ b/examples/deferred/data/pp1920x1200.bin
--- a/examples/deferred/deferred.h
+++ b/examples/deferred/deferred.h
@@ -0,0 +1,108 @@
 /*
  Copyright (c) 2011, Intel Corporation
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */
 #ifndef DEFERRED_H
 #define DEFERRED_H
 // Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
 #define MIN_TILE_WIDTH 16
 #define MIN_TILE_HEIGHT 16
 #define MAX_LIGHTS 1024
 // Identifies each data array stored in the input data chunk.  The loader
 // uses these values to index header.inputDataArrayOffsets[], which gives
 // the byte offset of the corresponding array within the chunk.
 enum InputDataArraysEnum {
    idaZBuffer = 0,
    idaNormalEncoded_x,
    idaNormalEncoded_y,
    idaSpecularAmount,
    idaSpecularPower,
    idaAlbedo_x,
    idaAlbedo_y,
    idaAlbedo_z,
    idaLightPositionView_x,
    idaLightPositionView_y,
    idaLightPositionView_z,
    idaLightAttenuationBegin,
    idaLightColor_x,
    idaLightColor_y,
    idaLightColor_z,
    idaLightAttenuationEnd,
    // Number of arrays above; not an array identifier itself.
    idaNum
 };
 #ifndef ISPC
 #include <stdint.h>
 #include "kernels_ispc.h"
 #define ALIGNMENT_BYTES 64
 #define MAX_LIGHTS 1024
 #define VISUALIZE_LIGHT_COUNT 0
 // Scene input as returned by CreateInputDataFromFile().
 struct InputData
 {
    // Includes the framebuffer dimensions and the per-array offsets table.
    ispc::InputHeader header;
    // Typed pointers to the individual arrays; they point into 'chunk'.
    ispc::InputDataArrays arrays;
    // Raw data block backing 'arrays'; released by DeleteInputData().
    uint8_t *chunk;
 };
 // 8-bit RGB framebuffer with each channel stored in its own array
 // (r, g, b), indexed by pixel.
 // NOTE(review): the arrays are presumably allocated by the constructor and
 // released by the destructor -- confirm against the implementation.
 struct Framebuffer {
    Framebuffer(int width, int height);
    ~Framebuffer();
    void clear();
    uint8_t *r, *g, *b;
 private:
    int nPixels;
    // Not copyable: both copy operations are declared private.  The
    // assignment operator must take a const reference; a pointer parameter
    // would not suppress the compiler-generated copy assignment.
    Framebuffer(const Framebuffer &);
    Framebuffer &operator=(const Framebuffer &);
 };
 InputData *CreateInputDataFromFile(const char *path);
 void DeleteInputData(InputData *input);
 void WriteFrame(const char *filename, const InputData *input,
                const Framebuffer &framebuffer);
 void InitDynamicC(InputData *input);
 void InitDynamicCilk(InputData *input);
 void DispatchDynamicC(InputData *input, Framebuffer *framebuffer);
 void DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer);
 #endif // !ISPC
 #endif // DEFERRED_H
--- a/examples/deferred/deferred_shading.vcxproj
+++ b/examples/deferred/deferred_shading.vcxproj
@@ -0,0 +1,170 @@
 <?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
      <Configuration>Debug</Configuration>
      <Platform>Win32</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Debug|x64">
      <Configuration>Debug</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Release|Win32">
      <Configuration>Release</Configuration>
      <Platform>Win32</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Release|x64">
      <Configuration>Release</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
  </ItemGroup>
  <PropertyGroup Label="Globals">
    <ProjectGuid>{87f53c53-957e-4e91-878a-bc27828fb9eb}</ProjectGuid>
    <Keyword>Win32Proj</Keyword>
    <!-- Root namespace named after this project (was a leftover from the mandelbrot example). -->
    <RootNamespace>deferred_shading</RootNamespace>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
      <PrecompiledHeader>
      </PrecompiledHeader>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <ClCompile>
      <PrecompiledHeader>
      </PrecompiledHeader>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <PrecompiledHeader>
      </PrecompiledHeader>
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <PrecompiledHeader>
      </PrecompiledHeader>
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
    </Link>
  </ItemDefinitionGroup>
  <ItemGroup>
    <ClCompile Include="common.cpp" />
    <ClCompile Include="dynamic_c.cpp" />
    <ClCompile Include="dynamic_cilk.cpp" />
    <ClCompile Include="main.cpp" />
    <ClCompile Include="../tasks_concrt.cpp" />
  </ItemGroup>
  <ItemGroup>
    <!-- Custom build step for kernels.ispc: runs the ispc compiler to produce
         kernels.obj and the generated header kernels_ispc.h.  Every
         configuration uses -O2 and the sse4x2 target; the Win32
         configurations additionally pass the arch=x86 option. -->
    <CustomBuild Include="kernels.ispc">
      <FileType>Document</FileType>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
 </Command>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
 </Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
 </Command>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
 </Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
    </CustomBuild>
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
 </Project>
--- a/examples/deferred/dynamic_c.cpp
+++ b/examples/deferred/dynamic_c.cpp
@@ -0,0 +1,871 @@
 /*
  Copyright (c) 2011, Intel Corporation
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */
 #include "deferred.h"
 #include "kernels_ispc.h"
 #include <algorithm>
 #include <stdint.h>
 #include <assert.h>
 #include <math.h>
 #include <string.h>
 #ifdef _MSC_VER
 #define ISPC_IS_WINDOWS
 #elif defined(__linux__)
 #define ISPC_IS_LINUX
 #elif defined(__APPLE__)
 #define ISPC_IS_APPLE
 #endif
 #ifdef ISPC_IS_LINUX
 #include <malloc.h>
 #endif // ISPC_IS_LINUX
 // Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
 #define MIN_TILE_WIDTH 16
 #define MIN_TILE_HEIGHT 16
 #define DYNAMIC_TREE_LEVELS 5
 // If this is set to 1 then the result will be identical to the static version
 #define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
 // Allocates |size| bytes at an address that is a multiple of |alignment|
 // and returns it, or a null pointer if the underlying allocation fails.
 // The result must be released with lAlignedFree().
 //
 // Windows and Linux use the platform's aligned allocator.  The Apple path
 // over-allocates with malloc(), rounds the pointer up to the requested
 // alignment and stores the original malloc() pointer in the void* slot
 // immediately before the returned address so that it can be recovered when
 // freeing.  The rounding arithmetic assumes |alignment| is a power of two.
 //
 // NOTE: if none of the ISPC_IS_* platform macros is defined, no branch is
 // compiled and the function has no return statement.
 static void *
 lAlignedMalloc(int64_t size, int32_t alignment) {
 #if defined(ISPC_IS_WINDOWS)
    return _aligned_malloc(size, alignment);
 #elif defined(ISPC_IS_LINUX)
    return memalign(alignment, size);
 #elif defined(ISPC_IS_APPLE)
    void *mem = malloc(size + (alignment-1) + sizeof(void*));
    if (!mem)
        return 0;
    // Leave room for the bookkeeping pointer in front of the user block.
    char *amem = ((char*)mem) + sizeof(void*);
    // Round up to the next aligned address.  When amem is already aligned it
    // is left unchanged: advancing by a whole |alignment| in that case would
    // push the end of the user block one byte past the allocation.
    uint64_t misalignment = reinterpret_cast<uint64_t>(amem) &
        (uint64_t)(alignment - 1);
    if (misalignment != 0)
        amem += (uint64_t)alignment - misalignment;
    ((void**)amem)[-1] = mem;
    return amem;
 #endif
 }
 // Releases memory obtained from lAlignedMalloc().  Passing a null pointer
 // is a no-op on every platform.
 //
 // On the Apple path the pointer originally returned by malloc() is read
 // back from the void* slot stored just before |ptr|.
 static void
 lAlignedFree(void *ptr) {
 #if defined(ISPC_IS_WINDOWS)
    _aligned_free(ptr);
 #elif defined(ISPC_IS_LINUX)
    free(ptr);
 #elif defined(ISPC_IS_APPLE)
    // Guard against null: there is no bookkeeping slot in front of it.
    if (!ptr)
        return;
    free(((void**)ptr)[-1]);
 #endif
 }
 // Computes the minimum and maximum view-space Z over the pixel rectangle
 // [tileStartX, tileEndX) x [tileStartY, tileEndY) of the depth buffer.
 //
 // Each depth value z is unprojected as cameraProj_43 / (z - cameraProj_33).
 // Only values in [cameraNear, cameraFar) contribute.  If no pixel
 // qualifies, *minZ is cameraFar and *maxZ is cameraNear.
 static void
 ComputeZBounds(int tileStartX, int tileEndX,
               int tileStartY, int tileEndY,
               // G-buffer data
               float zBuffer[],
               int gBufferWidth,
               // Camera data
               float cameraProj_33, float cameraProj_43,
               float cameraNear, float cameraFar,
               // Output
               float *minZ, float *maxZ)
 {
    // Start with an inverted (empty) range.
    float nearest = cameraFar;
    float farthest = cameraNear;
    for (int row = tileStartY; row < tileEndY; ++row) {
        const int rowBase = row * gBufferWidth;
        for (int col = tileStartX; col < tileEndX; ++col) {
            const float depth = zBuffer[rowBase + col];
            const float viewZ = cameraProj_43 / (depth - cameraProj_33);
            // Skip skybox/background or otherwise invalid pixels
            const bool inRange = (viewZ < cameraFar) && (viewZ >= cameraNear);
            if (!inRange)
                continue;
            if (viewZ < nearest)
                nearest = viewZ;
            if (farthest < viewZ)
                farthest = viewZ;
        }
    }
    *minZ = nearest;
    *maxZ = farthest;
 }
 // Computes the view-space Z bounds for every tile in tile row |tileY|.
 // Tile tileX covers pixel columns [tileX*tileWidth, (tileX+1)*tileWidth)
 // and rows [tileY*tileHeight, (tileY+1)*tileHeight); its bounds are
 // written to minZArray[tileX] and maxZArray[tileX].
 // numTilesY is accepted but not used here.
 static void
 ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight,
                  int numTilesX, int numTilesY,
                  // G-buffer data
                  float zBuffer[],
                  int gBufferWidth,
                  // Camera data
                  float cameraProj_33, float cameraProj_43,
                  float cameraNear, float cameraFar,
                  // Output
                  float minZArray[],
                  float maxZArray[])
 {
    // The vertical extent is the same for every tile in the row.
    const int rowStartY = tileY * tileHeight;
    const int rowEndY = rowStartY + tileHeight;
    for (int tileX = 0; tileX < numTilesX; ++tileX) {
        const int colStartX = tileX * tileWidth;
        const int colEndX = colStartX + tileWidth;
        float tileMinZ, tileMaxZ;
        ComputeZBounds(colStartX, colEndX, rowStartY, rowEndY,
                       zBuffer, gBufferWidth,
                       cameraProj_33, cameraProj_43, cameraNear, cameraFar,
                       &tileMinZ, &tileMaxZ);
        minZArray[tileX] = tileMinZ;
        maxZArray[tileX] = tileMaxZ;
    }
 }
 // Multi-level table of per-tile minimum / maximum view-space Z values.
 //
 // Level 0 holds one min/max pair per base tile (tileWidth x tileHeight
 // pixels).  Each higher level has half as many tiles in each dimension
 // (rounded up), and each of its entries is the min/max over the up-to-2x2
 // block of tiles it covers in the level below.
 //
 // The object owns the arrays it allocates and releases them in its
 // destructor, so it is deliberately not copyable.
 class MinMaxZTree
 {
 public:
    // Currently (min) tile dimensions must divide gBuffer dimensions evenly
    // Levels must be small enough that neither dimension goes below one tile
    MinMaxZTree(
        int tileWidth, int tileHeight, int levels,
        int gBufferWidth, int gBufferHeight)
        : mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
    {
        mNumTilesX = gBufferWidth / mTileWidth;
        mNumTilesY = gBufferHeight / mTileHeight;
        // Allocate arrays
        mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
        mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
        for (int i = 0; i < mLevels; ++i) {
            int x = NumTilesX(i);
            int y = NumTilesY(i);
            assert(x > 0);
            assert(y > 0);
            // NOTE: If the following two asserts fire it probably means that
            // the base tile dimensions do not evenly divide the G-buffer dimensions
            assert(x * (mTileWidth << i) >= gBufferWidth);
            assert(y * (mTileHeight << i) >= gBufferHeight);
            mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
            mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
        }
    }
    // Recomputes every level from the given depth buffer.  Level 0 is filled
    // one tile row at a time with ComputeZBoundsRow(); each further level is
    // then reduced from the level directly below it.
    void Update(float *zBuffer, int gBufferPitchInElements,
        float cameraProj_33, float cameraProj_43,
        float cameraNear, float cameraFar)
    {
        for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
            ComputeZBoundsRow(tileY, mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
                              zBuffer, gBufferPitchInElements,
                              cameraProj_33, cameraProj_43, cameraNear, cameraFar,
                              mMinZArrays[0] + (tileY * mNumTilesX),
                              mMaxZArrays[0] + (tileY * mNumTilesX));
        }
        // Generate other levels
        for (int level = 1; level < mLevels; ++level) {
            int destTilesX = NumTilesX(level);
            int destTilesY = NumTilesY(level);
            int srcLevel = level - 1;
            int srcTilesX = NumTilesX(srcLevel);
            int srcTilesY = NumTilesY(srcLevel);
            for (int y = 0; y < destTilesY; ++y) {
                for (int x = 0; x < destTilesX; ++x) {
                    // Top-left tile of the 2x2 source block; always in range.
                    int srcX = x << 1;
                    int srcY = y << 1;
                    // NOTE: Ugly branches to deal with non-multiple dimensions at some levels
                    // TODO: SSE branchless min/max is probably better...
                    float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
                    float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
                    if (srcX + 1 < srcTilesX) {
                        // Right neighbour
                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX + 
                                                                    (srcX + 1)]);
                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
                                                                    (srcX + 1)]);
                        if (srcY + 1 < srcTilesY) {
                            // Diagonal neighbour
                            minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
                                                                        (srcX + 1)]);
                            maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
                                                                        (srcX + 1)]);
                        }
                    }
                    if (srcY + 1 < srcTilesY) {
                        // Neighbour below
                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
                                                                    (srcX    )]);
                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
                                                                    (srcX    )]);
                    }
                    mMinZArrays[level][y * destTilesX + x] = minZ;
                    mMaxZArrays[level][y * destTilesX + x] = maxZ;
                }
            }
        }
    }
    ~MinMaxZTree() {
        for (int i = 0; i < mLevels; ++i) {
            lAlignedFree(mMinZArrays[i]);
            lAlignedFree(mMaxZArrays[i]);
        }
        lAlignedFree(mMinZArrays);
        lAlignedFree(mMaxZArrays); 
    }
    int Levels() const { return mLevels; }
    // These round UP ((n + 2^level - 1) >> level is a ceiling division by
    // 2^level), so beware that the last tile for a given level may not be
    // completely full
    int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
    int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
    // Tile dimensions in pixels at the given level (doubling per level).
    int TileWidth(int level = 0) const { return (mTileWidth << level); }
    int TileHeight(int level = 0) const { return (mTileHeight << level); }
    // Bounds for tile (tileX, tileY) of the given level; no range checking.
    float MinZ(int level, int tileX, int tileY) const {
        return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
    }
    float MaxZ(int level, int tileX, int tileY) const {
        return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
    }
 private:
    // Not copyable: a copy would share the arrays below and free them twice.
    // Declared but intentionally not defined.
    MinMaxZTree(const MinMaxZTree &);
    MinMaxZTree &operator=(const MinMaxZTree &);
    int mTileWidth;
    int mTileHeight;
    int mLevels;
    // Tile counts at level 0
    int mNumTilesX;
    int mNumTilesY;
    // One array for each "level" in the tree
    float **mMinZArrays;
    float **mMaxZArrays;
 };
 static MinMaxZTree *gMinMaxZTree = 0;
 // Creates the global min/max Z tree for the framebuffer dimensions given
 // in input->header, using MIN_TILE_WIDTH x MIN_TILE_HEIGHT base tiles and
 // DYNAMIC_TREE_LEVELS levels.  May be called more than once: a tree left
 // over from a previous call is destroyed instead of being leaked.
 void InitDynamicC(InputData *input) {
    // Build the replacement first so the global never points at a deleted
    // object if construction fails.
    MinMaxZTree *tree =
        new MinMaxZTree(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
                        input->header.framebufferWidth, 
                        input->header.framebufferHeight);
    delete gMinMaxZTree;
    gMinMaxZTree = tree;
 }
 // numLights need not be a multiple of programCount here, but the input and output arrays
 // should be able to handle programCount-sized load/stores.
 // Distributes a tile's light list over its four subtiles.
 //
 // The tile is split at pixel (tileMidX, tileMidY).  Subtiles are numbered
 // 0..3 in the order 00, 10, 01, 11: bit 0 of the subtile number selects the
 // side of the vertical split plane, bit 1 the side of the horizontal one.
 // A light is kept for a subtile when it is within its attenuation range of
 // the subtile's [subtileMinZ, subtileMaxZ] interval and is not entirely on
 // the other side of either split plane.
 //
 // Surviving light indices for subtile s are written consecutively starting
 // at subtileIndices[s * subtileIndicesPitch]; the number written is
 // returned in subtileNumLights[s].
 static void
 SplitTileMinMax(
    int tileMidX, int tileMidY,
    // Subtile data (00, 10, 01, 11)
    float subtileMinZ[],
    float subtileMaxZ[],
    // G-buffer data
    int gBufferWidth, int gBufferHeight,
    // Camera data
    float cameraProj_11, float cameraProj_22,
    // Light Data
    int lightIndices[],
    int numLights,
    float light_positionView_x_array[],
    float light_positionView_y_array[],
    float light_positionView_z_array[],
    float light_attenuationEnd_array[],
    // Outputs
    int subtileIndices[],
    int subtileIndicesPitch,
    int subtileNumLights[]
    )
 {
    const int kNumSubtiles = 4;
    const float halfWidth = 0.5f * (float)gBufferWidth;
    const float halfHeight = 0.5f * (float)gBufferHeight;
    // Split planes through the tile midpoint: entry 0 is the vertical plane
    // (x/z components), entry 1 the horizontal plane (y/z components).
    float planeXY[2] = { -(cameraProj_11 * halfWidth),
                          (cameraProj_22 * halfHeight) };
    float planeZ[2] = { tileMidX - halfWidth,
                        tileMidY - halfHeight };
    for (int axis = 0; axis < 2; ++axis) {
        // Scale each plane to unit length
        float invLength = 1.f / sqrtf(planeXY[axis] * planeXY[axis] + 
                                      planeZ[axis] * planeZ[axis]);
        planeXY[axis] *= invLength;
        planeZ[axis] *= invLength;
    }
    // Next write position within each subtile's output region
    int writePos[kNumSubtiles];
    for (int s = 0; s < kNumSubtiles; ++s)
        writePos[s] = s * subtileIndicesPitch;
    for (int n = 0; n < numLights; ++n) {
        const int lightIndex = lightIndices[n];
        const float lightX = light_positionView_x_array[lightIndex];
        const float lightY = light_positionView_y_array[lightIndex];
        const float lightZ = light_positionView_z_array[lightIndex];
        const float radius = light_attenuationEnd_array[lightIndex];
        const float negRadius = -radius;
        // Test the light against each subtile's z bounds
        bool keep[kNumSubtiles];
        for (int s = 0; s < kNumSubtiles; ++s) {
            keep[s] = (lightZ - subtileMinZ[s] >= negRadius) &&
                (subtileMaxZ[s] - lightZ >= negRadius);
        }
        // Signed distances from the light to the two split planes
        const float dx = lightZ * planeZ[0] + 
            lightX * planeXY[0];
        const float dy = lightZ * planeZ[1] +
            lightY * planeXY[1];
        if (fabsf(dx) > radius) {
            // Entirely on one side of the vertical plane
            const bool positiveX = dx > 0.0f;
            for (int s = 0; s < kNumSubtiles; ++s) {
                const bool xBit = (s & 1) != 0;
                keep[s] = keep[s] && (xBit ? !positiveX : positiveX);
            }
        }
        if (fabsf(dy) > radius) {
            // Entirely on one side of the horizontal plane
            const bool positiveY = dy > 0.0f;
            for (int s = 0; s < kNumSubtiles; ++s) {
                const bool yBit = (s & 2) != 0;
                keep[s] = keep[s] && (yBit ? !positiveY : positiveY);
            }
        }
        for (int s = 0; s < kNumSubtiles; ++s) {
            if (keep[s])
                subtileIndices[writePos[s]++] = lightIndex;
        }
    }
    // Convert final write positions back into per-subtile counts
    for (int s = 0; s < kNumSubtiles; ++s)
        subtileNumLights[s] = writePos[s] - s * subtileIndicesPitch;
 }
 // Dot product of the 3-vectors (x, y, z) and (a, b, c).
 static inline float
 dot3(float x, float y, float z, float a, float b, float c) {
    const float result = x*a + y*b + z*c;
    return result;
 }
 // Scales the vector (x, y, z) to unit length and stores the result in
 // (ox, oy, oz).  A zero-length input is not special-cased.
 static inline void
 normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
    const float lengthSquared = x*x + y*y + z*z;
    const float invLength = 1.f / sqrtf(lengthSquared);
    ox = x * invLength;
    oy = y * invLength;
    oz = z * invLength;
 }
 // Converts an 8-bit unsigned normalized value to a float by scaling it
 // with 1/255 (0 maps to 0.0).
 static inline float
 Unorm8ToFloat32(uint8_t u) {
    const float value = (float)u;
    return value * (1.0f / 255.0f);
 }
 // Converts a float to an 8-bit unsigned normalized value: the input is
 // scaled by 255 and truncated toward zero.
 //
 // Inputs whose scaled value falls outside [0, 255] (including NaN) are
 // saturated to 0 or 255; converting such a value directly to uint8_t
 // would be undefined behavior.  In-range inputs give the same result as a
 // plain cast.
 static inline uint8_t
 Float32ToUnorm8(float f) {
    const float scaled = f * 255.0f;
    // Written as !(scaled > 0) so that NaN also maps to 0.
    if (!(scaled > 0.0f))
        return 0;
    if (scaled >= 255.0f)
        return 255;
    return (uint8_t)scaled;
 }
 // Converts a 16-bit half-precision bit pattern to a float by rearranging
 // its sign, exponent and mantissa fields into a 32-bit float bit pattern.
 //
 // "Fast" means the exponent is always re-biased and nothing is
 // special-cased: a zero exponent field (zero and denormal halfs) and an
 // all-ones exponent field (infinity/NaN) are treated like normal numbers,
 // e.g. an input of 0 yields 2^-15 rather than 0.
 static inline float half_to_float_fast(uint16_t h) {
    uint32_t hs = h & (int32_t)0x8000u;  // Pick off sign bit
    uint32_t he = h & (int32_t)0x7C00u;  // Pick off exponent bits
    uint32_t hm = h & (int32_t)0x03FFu;  // Pick off mantissa bits
    // sign: bit 15 of the half becomes bit 31 of the float
    uint32_t xs = ((uint32_t) hs) << 16; 
    // Exponent: unbias the halfp, then bias the single
    int32_t xes = ((int32_t) (he >> 10)) - 15 + 127; 
    // Exponent
    uint32_t xe = (uint32_t) (xes << 23);
    // Mantissa: 10 bits widened to 23 bits
    uint32_t xm = ((uint32_t) hm) << 13; 
    uint32_t bits = (xs | xe | xm);
    // Copy the bit pattern into a float with memcpy; reading it through a
    // float* obtained from a uint32_t* would violate strict aliasing.
    float result;
    memcpy(&result, &bits, sizeof(result));
    return result;
 }
 static void
 ShadeTileC(
    int32_t tileStartX, int32_t tileEndX,
    int32_t tileStartY, int32_t tileEndY,
    int32_t gBufferWidth, int32_t gBufferHeight,
    const ispc::InputDataArrays &inputData,
    // Camera data
    float cameraProj_11, float cameraProj_22,
    float cameraProj_33, float cameraProj_43,
    // Light list
    int32_t tileLightIndices[],
    int32_t tileNumLights,
    // UI
    bool visualizeLightCount,
    // Output
    uint8_t framebuffer_r[],
    uint8_t framebuffer_g[],
    uint8_t framebuffer_b[]
    )
 {
    if (tileNumLights == 0 || visualizeLightCount) {
        uint8_t c = (uint8_t)(std::min(tileNumLights << 2, 255));
        for (int32_t y = tileStartY; y < tileEndY; ++y) {
            for (int32_t x = tileStartX; x < tileEndX; ++x) {
                int32_t framebufferIndex = (y * gBufferWidth + x);
                framebuffer_r[framebufferIndex] = c;
                framebuffer_g[framebufferIndex] = c;
                framebuffer_b[framebufferIndex] = c;
            }
        }
    } else {
        float twoOverGBufferWidth = 2.0f / gBufferWidth;
        float twoOverGBufferHeight = 2.0f / gBufferHeight;
        for (int32_t y = tileStartY; y < tileEndY; ++y) {
            float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
            for (int32_t x = tileStartX; x < tileEndX; ++x) {
                int32_t gBufferOffset = y * gBufferWidth + x;
                // Reconstruct position and (negative) view vector from G-buffer
                float surface_positionView_x, surface_positionView_y, surface_positionView_z;
                float Vneg_x, Vneg_y, Vneg_z;
                float z = inputData.zBuffer[gBufferOffset];
                // Compute screen/clip-space position
                // NOTE: Mind DX11 viewport transform and pixel center!
                float positionScreen_x = (0.5f + (float)(x)) * 
                    twoOverGBufferWidth - 1.0f;
                // Unproject depth buffer Z value into view space
                surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
                surface_positionView_x = positionScreen_x * surface_positionView_z / 
                    cameraProj_11;
                surface_positionView_y = positionScreen_y * surface_positionView_z / 
                    cameraProj_22;
                // We actually end up with a vector pointing *at* the
                // surface (i.e. the negative view vector)
                normalize3(surface_positionView_x, surface_positionView_y, 
                           surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
                // Reconstruct normal from G-buffer
                float surface_normal_x, surface_normal_y, surface_normal_z;
                float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
                float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
                float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
                float m = sqrtf(4.0f * f - 1.0f);
                surface_normal_x = m * (4.0f * normal_x - 2.0f);
                surface_normal_y = m * (4.0f * normal_y - 2.0f);
                surface_normal_z = 3.0f - 8.0f * f;
                // Load other G-buffer parameters
                float surface_specularAmount = 
                    half_to_float_fast(inputData.specularAmount[gBufferOffset]);
                float surface_specularPower  = 
                    half_to_float_fast(inputData.specularPower[gBufferOffset]);
                float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
                float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
                float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
                float lit_x = 0.0f;
                float lit_y = 0.0f;
                float lit_z = 0.0f;
                for (int32_t tileLightIndex = 0; tileLightIndex < tileNumLights; 
                     ++tileLightIndex) {
                    int32_t lightIndex = tileLightIndices[tileLightIndex];
                    // Gather light data relevant to initial culling
                    float light_positionView_x = 
                        inputData.lightPositionView_x[lightIndex];
                    float light_positionView_y = 
                        inputData.lightPositionView_y[lightIndex];
                    float light_positionView_z = 
                        inputData.lightPositionView_z[lightIndex];
                    float light_attenuationEnd = 
                        inputData.lightAttenuationEnd[lightIndex];
                    // Compute light vector
                    float L_x = light_positionView_x - surface_positionView_x;
                    float L_y = light_positionView_y - surface_positionView_y;
                    float L_z = light_positionView_z - surface_positionView_z;
                    float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
                    // Clip at end of attenuation
                    float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
                    if (distanceToLight2 < light_attenutaionEnd2) {                    
                        float distanceToLight = sqrtf(distanceToLight2);
                        float distanceToLightRcp = 1.f / distanceToLight;
                        L_x *= distanceToLightRcp;
                        L_y *= distanceToLightRcp;
                        L_z *= distanceToLightRcp;
                        // Start computing brdf
                        float NdotL = dot3(surface_normal_x, surface_normal_y, 
                                           surface_normal_z, L_x, L_y, L_z);
                        // Clip back facing
                        if (NdotL > 0.0f) {
                            float light_attenuationBegin = 
                                inputData.lightAttenuationBegin[lightIndex];
                            // Light distance attenuation (linstep)
                            float lightRange = (light_attenuationEnd - light_attenuationBegin);
                            float falloffPosition = (light_attenuationEnd - distanceToLight);
                            float attenuation = std::min(falloffPosition / lightRange, 1.0f);
                            float H_x = (L_x - Vneg_x);
                            float H_y = (L_y - Vneg_y);
                            float H_z = (L_z - Vneg_z);
                            normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
                            float NdotH = dot3(surface_normal_x, surface_normal_y, 
                                               surface_normal_z, H_x, H_y, H_z);
                            NdotH = std::max(NdotH, 0.0f);
                            float specular = powf(NdotH, surface_specularPower);
                            float specularNorm = (surface_specularPower + 2.0f) * 
                                (1.0f / 8.0f);
                            float specularContrib = surface_specularAmount * 
                                specularNorm * specular;
                            float k = attenuation * NdotL * (1.0f + specularContrib);
                            float light_color_x = inputData.lightColor_x[lightIndex];
                            float light_color_y = inputData.lightColor_y[lightIndex];
                            float light_color_z = inputData.lightColor_z[lightIndex];
                            float lightContrib_x = surface_albedo_x * light_color_x;
                            float lightContrib_y = surface_albedo_y * light_color_y;
                            float lightContrib_z = surface_albedo_z * light_color_z;
                            lit_x += lightContrib_x * k;
                            lit_y += lightContrib_y * k;
                            lit_z += lightContrib_z * k;
                        }
                    }
                }
                // Gamma correct
                float gamma = 1.0 / 2.2f;
                lit_x = powf(std::min(std::max(lit_x, 0.0f), 1.0f), gamma);
                lit_y = powf(std::min(std::max(lit_y, 0.0f), 1.0f), gamma);
                lit_z = powf(std::min(std::max(lit_z, 0.0f), 1.0f), gamma);
                framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
                framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
                framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
            }
        }
    }
 }
 // Recursively shades one tile of the global min/max Z tree.
 //
 // (tileX, tileY) addresses a tile at `level`, and lightIndices[0..numLights)
 // is the light list that was culled for it by the caller.  The tile is
 // shaded directly with ShadeTileC() when level is 0, when fewer than
 // DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE lights remain, or when the tile lies
 // outside the tree; otherwise the light list is split among the four
 // subtiles one level down and each of them is processed recursively.
 void
 ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
                         int *lightIndices, int numLights,
                         Framebuffer *framebuffer) {
    const MinMaxZTree *minMaxZTree = gMinMaxZTree;
    // The recursion below also visits subtiles that do not exist in the tree
    // (see rightTileExists/bottomTileExists).  Such a tile must never be
    // subdivided: doing so would read the min/max Z arrays with out-of-range
    // tile coordinates.  Treat it as a leaf instead.
    const bool tileInTree = (tileX < minMaxZTree->NumTilesX(level)) &&
                            (tileY < minMaxZTree->NumTilesY(level));
    // If we have few enough lights or this is the base case (last level),
    // shade this full tile directly
    if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE || !tileInTree) {
        int width = minMaxZTree->TileWidth(level);
        int height = minMaxZTree->TileHeight(level);
        int startX = tileX * width;
        int startY = tileY * height;
        int endX = std::min(input->header.framebufferWidth, startX + width);
        int endY = std::min(input->header.framebufferHeight, startY + height);
        // Skip entirely offscreen tiles
        if (endX > startX && endY > startY) {
            ShadeTileC(startX, endX, startY, endY,
                       input->header.framebufferWidth, input->header.framebufferHeight,
                       input->arrays,
                       input->header.cameraProj[0][0], input->header.cameraProj[1][1],
                       input->header.cameraProj[2][2], input->header.cameraProj[3][2],
                       lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
                       framebuffer->r, framebuffer->g, framebuffer->b);
        }
    }
    else {
        // Otherwise, subdivide and 4-way recurse using X and Y splitting planes
        // Move down a level in the tree
        --level;
        tileX <<= 1;
        tileY <<= 1;
        int width = minMaxZTree->TileWidth(level);
        int height = minMaxZTree->TileHeight(level);
        // Work out splitting coords
        int midX = (tileX + 1) * width;
        int midY = (tileY + 1) * height;
        // Read subtile min/max data
        // NOTE: We must be sure to handle out-of-bounds access here since
        // sometimes we'll only have 1 or 2 subtiles for non-pow-2
        // framebuffer sizes.
        bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level));
        bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level));
        // NOTE: Order is 00, 10, 01, 11
        // Set defaults up to cull all lights if the tile doesn't exist (offscreen)
        float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
                         input->header.cameraFar, input->header.cameraFar};
        float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
                         input->header.cameraNear, input->header.cameraNear};
        minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
        maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY);
        if (rightTileExists) {
            minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY);
            maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY);
            if (bottomTileExists) {
                minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1);
                maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1);
            }
        }
        if (bottomTileExists) {
            minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1);
            maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1);
        }
        // Cull lights into subtile lists (one aligned list per subtile)
 #ifdef ISPC_IS_WINDOWS
        __declspec(align(ALIGNMENT_BYTES)) int subtileLightIndices[4][MAX_LIGHTS];
 #else
        int subtileLightIndices[4][MAX_LIGHTS]
            __attribute__ ((aligned(ALIGNMENT_BYTES)));
 #endif
        int subtileNumLights[4];
        SplitTileMinMax(midX, midY, minZ, maxZ,
            input->header.framebufferWidth, input->header.framebufferHeight,
            input->header.cameraProj[0][0], input->header.cameraProj[1][1],
            lightIndices, numLights, input->arrays.lightPositionView_x,
            input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
            input->arrays.lightAttenuationEnd,
            subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
        // Recurse into subtiles
        ShadeDynamicTileRecurse(input, level, tileX    , tileY,
                                subtileLightIndices[0], subtileNumLights[0],
                                framebuffer);
        ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
                                subtileLightIndices[1], subtileNumLights[1],
                                framebuffer);
        ShadeDynamicTileRecurse(input, level, tileX    , tileY + 1,
                                subtileLightIndices[2], subtileNumLights[2],
                                framebuffer);
        ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1,
                                subtileLightIndices[3], subtileNumLights[3],
                                framebuffer);
    }
 }
 // Culls the first numLights lights against the frustum of one screen tile
 // and writes the indices of the survivors, in ascending order, to
 // tileLightIndices.  A light survives when its signed distance to the
 // minZ/maxZ planes and to each of the four normalized side planes built
 // from the tile edges is at least -light_attenuationEnd.  Returns the
 // number of indices written.
 static int
 IntersectLightsWithTileMinMax(
    int tileStartX, int tileEndX,
    int tileStartY, int tileEndY,
    // Tile data
    float minZ,
    float maxZ,
    // G-buffer data
    int gBufferWidth, int gBufferHeight,
    // Camera data
    float cameraProj_11, float cameraProj_22,
    // Light Data
    int numLights,
    float light_positionView_x_array[],
    float light_positionView_y_array[],
    float light_positionView_z_array[],
    float light_attenuationEnd_array[],
    // Output
    int tileLightIndices[]
    )
 {
    const float halfWidth = 0.5f * (float)gBufferWidth;
    const float halfHeight = 0.5f * (float)gBufferHeight;
    // Side planes: entries 0 and 1 are tested against the light's x
    // coordinate, entries 2 and 3 against its y coordinate.  Each plane is
    // stored as an (xy, z) coefficient pair.
    float planeXY[4] = { -(cameraProj_11 * halfWidth),
                         (cameraProj_11 * halfWidth),
                         (cameraProj_22 * halfHeight),
                         -(cameraProj_22 * halfHeight) };
    float planeZ[4] = { tileEndX - halfWidth,
                        -tileStartX + halfWidth,
                        tileEndY - halfHeight,
                        -tileStartY + halfHeight };
    // Normalize so that the dot products below are true distances
    for (int p = 0; p < 4; ++p) {
        float invLength = 1.f / sqrtf(planeXY[p] * planeXY[p] +
                                      planeZ[p] * planeZ[p]);
        planeXY[p] *= invLength;
        planeZ[p] *= invLength;
    }
    int keptCount = 0;
    for (int lightIndex = 0; lightIndex < numLights; ++lightIndex) {
        const float posZ = light_positionView_z_array[lightIndex];
        const float negRadius = -light_attenuationEnd_array[lightIndex];
        // Depth range first; x/y are only loaded for lights that pass it
        const float aboveMin = posZ - minZ;
        const float belowMax = maxZ - posZ;
        if (!(aboveMin >= negRadius) || !(belowMax >= negRadius))
            continue;
        const float posX = light_positionView_x_array[lightIndex];
        const float posY = light_positionView_y_array[lightIndex];
        bool inside = true;
        for (int p = 0; p < 4 && inside; ++p) {
            const float lateral = (p < 2) ? posX : posY;
            const float distance = posZ * planeZ[p] + lateral * planeXY[p];
            inside = (distance >= negRadius);
        }
        // Pack and store intersecting lights
        if (inside)
            tileLightIndices[keptCount++] = lightIndex;
    }
    return keptCount;
 }
 // Shades one root tile: runs the full 6-plane light cull for the tile via
 // IntersectLightsWithTileMinMax() and then starts the recursive
 // subdivision with the resulting light list.
 void
 ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
                  Framebuffer *framebuffer) {
    const MinMaxZTree *tree = gMinMaxZTree;
    // Tile size and Z bounds at this level
    const int tileW = tree->TileWidth(level);
    const int tileH = tree->TileHeight(level);
    const float tileMinZ = tree->MinZ(level, tileX, tileY);
    const float tileMaxZ = tree->MaxZ(level, tileX, tileY);
    // Pixel rectangle of the tile, clamped to the framebuffer
    const int x0 = tileX * tileW;
    const int y0 = tileY * tileH;
    const int x1 = std::min(input->header.framebufferWidth, x0 + tileW);
    const int y1 = std::min(input->header.framebufferHeight, y0 + tileH);
    // Aligned list that receives the indices of the lights surviving the cull
 #ifdef ISPC_IS_WINDOWS
    __declspec(align(ALIGNMENT_BYTES)) int rootLights[MAX_LIGHTS];
 #else
    int rootLights[MAX_LIGHTS] __attribute__ ((aligned(ALIGNMENT_BYTES)));
 #endif
    const int rootLightCount = IntersectLightsWithTileMinMax(
        x0, x1, y0, y1, tileMinZ, tileMaxZ,
        input->header.framebufferWidth, input->header.framebufferHeight,
        input->header.cameraProj[0][0], input->header.cameraProj[1][1],
        MAX_LIGHTS, input->arrays.lightPositionView_x,
        input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
        input->arrays.lightAttenuationEnd, rootLights);
    // Hand the culled list to the recursive shading of this tile
    ShadeDynamicTileRecurse(input, level, tileX, tileY, rootLights,
                            rootLightCount, framebuffer);
 }
 // Shades the whole frame with the serial C++ path: refreshes the global
 // min/max Z tree from the current Z buffer, then shades every tile of the
 // root level (Levels() - 1) in row-major order.
 void
 DispatchDynamicC(InputData *input, Framebuffer *framebuffer)
 {
    MinMaxZTree *tree = gMinMaxZTree;
    // Refresh the min/max Z tree for this frame
    tree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
        input->header.cameraProj[2][2], input->header.cameraProj[3][2],
        input->header.cameraNear, input->header.cameraFar);
    const int topLevel = tree->Levels() - 1;
    const int tilesAcross = tree->NumTilesX(topLevel);
    const int tilesDown = tree->NumTilesY(topLevel);
    for (int row = 0; row < tilesDown; ++row) {
        for (int col = 0; col < tilesAcross; ++col) {
            ShadeDynamicTile(input, topLevel, col, row, framebuffer);
        }
    }
 }
--- a/examples/deferred/dynamic_cilk.cpp
+++ b/examples/deferred/dynamic_cilk.cpp
@@ -0,0 +1,398 @@
 /*
  Copyright (c) 2011, Intel Corporation
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */
 #ifdef __cilkplusplus
 #include "deferred.h"
 #include "kernels_ispc.h"
 #include <algorithm>
 #include <assert.h>
 #include <stdint.h>
 #include <stdlib.h>
 #ifdef _MSC_VER
 #define ISPC_IS_WINDOWS
 #elif defined(__linux__)
 #define ISPC_IS_LINUX
 #elif defined(__APPLE__)
 #define ISPC_IS_APPLE
 #endif
 #ifdef ISPC_IS_LINUX
 #include <malloc.h>
 #endif // ISPC_IS_LINUX
 // Currently tile widths must be a multiple of SIMD width (e.g. 8 for ispc sse4x2)!
 #define MIN_TILE_WIDTH 16
 #define MIN_TILE_HEIGHT 16
 #define DYNAMIC_TREE_LEVELS 5
 // If this is set to 1 then the result will be identical to the static version
 #define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
 // Allocates `size` bytes whose address is a multiple of `alignment`, which
 // must be a power of two.  Returns a null pointer if the allocation fails.
 // The result must be released with lAlignedFree(), never with free().
 static void *
 lAlignedMalloc(int64_t size, int32_t alignment) {
 #if defined(ISPC_IS_WINDOWS)
    return _aligned_malloc(size, alignment);
 #elif defined(ISPC_IS_LINUX)
    return memalign(alignment, size);
 #else
    // Portable fallback (OS X and any other platform): over-allocate, round
    // the address up to the requested alignment and stash the pointer that
    // malloc() returned in the slot just below the aligned block so that
    // lAlignedFree() can recover it.
    uintptr_t align = (uintptr_t)alignment;
    if (align < sizeof(void *))
        align = sizeof(void *);  // keeps the stashed pointer itself aligned
    void *mem = malloc(size + (align - 1) + sizeof(void *));
    if (mem == 0)
        return 0;
    uintptr_t addr = reinterpret_cast<uintptr_t>(mem) + sizeof(void *);
    // Round up; an already-aligned address stays put, so the block never
    // extends past the (align - 1) bytes of slack reserved above.
    addr = (addr + (align - 1)) & ~(align - 1);
    void *amem = reinterpret_cast<void *>(addr);
    ((void **)amem)[-1] = mem;
    return amem;
 #endif
 }
 // Releases memory obtained from lAlignedMalloc().  A null pointer is
 // ignored.
 static void
 lAlignedFree(void *ptr) {
 #if defined(ISPC_IS_WINDOWS)
    _aligned_free(ptr);
 #elif defined(ISPC_IS_LINUX)
    free(ptr);
 #else
    // Matches the fallback in lAlignedMalloc(): the pointer returned by
    // malloc() is stored immediately below the aligned block.
    if (ptr != 0)
        free(((void **)ptr)[-1]);
 #endif
 }
 // Hierarchy ("mip tree") of per-tile minimum/maximum Z values.
 //
 // Level 0 stores one min/max pair per tileWidth x tileHeight tile, filled
 // in by ispc::ComputeZBoundsRow().  Every tile of level i+1 combines the
 // (up to) 2x2 block of level-i tiles beneath it, so tile dimensions double
 // with each level.  The arrays are owned by the object and released in the
 // destructor; copying is disabled because a member-wise copy would free
 // them twice.
 class MinMaxZTreeCilk
 {
 public:
    // Currently (min) tile dimensions must divide gBuffer dimensions evenly
    // Levels must be small enough that neither dimension goes below one tile
    MinMaxZTreeCilk(
        int tileWidth, int tileHeight, int levels,
        int gBufferWidth, int gBufferHeight)
        : mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
    {
        mNumTilesX = gBufferWidth / mTileWidth;
        mNumTilesY = gBufferHeight / mTileHeight;
        // Allocate the per-level pointer tables
        mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
        mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
        assert(mMinZArrays != 0);
        assert(mMaxZArrays != 0);
        for (int i = 0; i < mLevels; ++i) {
            int x = NumTilesX(i);
            int y = NumTilesY(i);
            assert(x > 0);
            assert(y > 0);
            // NOTE: If the following two asserts fire it probably means that
            // the base tile dimensions do not evenly divide the G-buffer dimensions
            assert(x * (mTileWidth << i) >= gBufferWidth);
            assert(y * (mTileHeight << i) >= gBufferHeight);
            mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
            mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
            assert(mMinZArrays[i] != 0);
            assert(mMaxZArrays[i] != 0);
        }
    }
    // Recomputes every level of the tree from zBuffer.  Level 0 is produced
    // one tile row at a time by ispc::ComputeZBoundsRow(); each higher level
    // is the min/max reduction of the level below it.
    void Update(float *zBuffer, int gBufferPitchInElements,
        float cameraProj_33, float cameraProj_43,
        float cameraNear, float cameraFar)
    {
        // Compute level 0 in parallel.  The outer loop over tile rows lives
        // here (rather than in ispc) since we use Cilk for the parallelism
        _Cilk_for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
            ispc::ComputeZBoundsRow(tileY,
                mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
                zBuffer, gBufferPitchInElements,
                cameraProj_33, cameraProj_43, cameraNear, cameraFar,
                mMinZArrays[0] + (tileY * mNumTilesX),
                mMaxZArrays[0] + (tileY * mNumTilesX));
        }
        // Generate other levels
        // NOTE: We currently don't use ispc here since it's sort of an
        // awkward gather-based reduction.  Using SSE odd pack/unpack
        // instructions might actually work here when we need to optimize
        for (int level = 1; level < mLevels; ++level) {
            int destTilesX = NumTilesX(level);
            int destTilesY = NumTilesY(level);
            int srcLevel = level - 1;
            int srcTilesX = NumTilesX(srcLevel);
            int srcTilesY = NumTilesY(srcLevel);
            _Cilk_for (int y = 0; y < destTilesY; ++y) {
                for (int x = 0; x < destTilesX; ++x) {
                    int srcX = x << 1;
                    int srcY = y << 1;
                    // NOTE: Ugly branches to deal with non-multiple dimensions at some levels
                    // TODO: SSE branchless min/max is probably better...
                    float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
                    float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
                    if (srcX + 1 < srcTilesX) {
                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX +
                                                                    (srcX + 1)]);
                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
                                                                    (srcX + 1)]);
                        if (srcY + 1 < srcTilesY) {
                            minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
                                                                        (srcX + 1)]);
                            maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
                                                                        (srcX + 1)]);
                        }
                    }
                    if (srcY + 1 < srcTilesY) {
                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
                                                                    (srcX    )]);
                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
                                                                    (srcX    )]);
                    }
                    mMinZArrays[level][y * destTilesX + x] = minZ;
                    mMaxZArrays[level][y * destTilesX + x] = maxZ;
                }
            }
        }
    }
    ~MinMaxZTreeCilk() {
        for (int i = 0; i < mLevels; ++i) {
            lAlignedFree(mMinZArrays[i]);
            lAlignedFree(mMaxZArrays[i]);
        }
        lAlignedFree(mMinZArrays);
        lAlignedFree(mMaxZArrays);
    }
    int Levels() const { return mLevels; }
    // These round UP, so beware that the last tile for a given level may not be completely full
    // TODO: Verify this...
    int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
    int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
    // Tile dimensions in pixels at the given level (doubling per level)
    int TileWidth(int level = 0) const { return (mTileWidth << level); }
    int TileHeight(int level = 0) const { return (mTileHeight << level); }
    // Stored bounds of tile (tileX, tileY) at `level`; no range checking is
    // done, so the coordinates must be below NumTilesX/NumTilesY(level)
    float MinZ(int level, int tileX, int tileY) const {
        return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
    }
    float MaxZ(int level, int tileX, int tileY) const {
        return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
    }
 private:
    // Declared but intentionally not defined: the class owns raw arrays, so
    // copying is disallowed
    MinMaxZTreeCilk(const MinMaxZTreeCilk &);
    MinMaxZTreeCilk &operator=(const MinMaxZTreeCilk &);
    int mTileWidth;
    int mTileHeight;
    int mLevels;
    // Number of level-0 tiles in each dimension
    int mNumTilesX;
    int mNumTilesY;
    // One array for each "level" in the tree
    float **mMinZArrays;
    float **mMaxZArrays;
 };
 // Min/max Z tree shared by the Cilk dispatch path; created by
 // InitDynamicCilk().
 static MinMaxZTreeCilk *gMinMaxZTreeCilk = 0;
 // (Re)creates the global min/max Z tree for the framebuffer dimensions in
 // input->header.  A tree left over from an earlier call is destroyed first
 // so that repeated initialization does not leak it.
 void InitDynamicCilk(InputData *input) {
    delete gMinMaxZTreeCilk;  // deleting a null pointer is a no-op
    gMinMaxZTreeCilk = 0;     // never left dangling if the allocation below throws
    gMinMaxZTreeCilk =
        new MinMaxZTreeCilk(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
                            input->header.framebufferWidth,
                            input->header.framebufferHeight);
 }
 // Recursively shades one tile of the global Cilk min/max Z tree.
 //
 // (tileX, tileY) addresses a tile at `level`, and lightIndices[0..numLights)
 // is the light list that was culled for it by the caller.  The tile is
 // shaded directly with ispc::ShadeTile() when level is 0, when fewer than
 // DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE lights remain, or when the tile lies
 // outside the tree; otherwise the light list is split among the four
 // subtiles one level down, three of which are spawned as Cilk tasks while
 // the fourth is processed by the calling strand.
 static void
 ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY,
                         int *lightIndices, int numLights,
                         Framebuffer *framebuffer) {
    const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
    // The recursion below also visits subtiles that do not exist in the tree
    // (see rightTileExists/bottomTileExists).  Such a tile must never be
    // subdivided: doing so would read the min/max Z arrays with out-of-range
    // tile coordinates.  Treat it as a leaf instead.
    const bool tileInTree = (tileX < minMaxZTree->NumTilesX(level)) &&
                            (tileY < minMaxZTree->NumTilesY(level));
    // If we have few enough lights or this is the base case (last level),
    // shade this full tile directly
    if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE || !tileInTree) {
        int width = minMaxZTree->TileWidth(level);
        int height = minMaxZTree->TileHeight(level);
        int startX = tileX * width;
        int startY = tileY * height;
        int endX = std::min(input->header.framebufferWidth, startX + width);
        int endY = std::min(input->header.framebufferHeight, startY + height);
        // Skip entirely offscreen tiles
        if (endX > startX && endY > startY) {
            ispc::ShadeTile(
                startX, endX, startY, endY,
                input->header.framebufferWidth, input->header.framebufferHeight,
                &input->arrays,
                input->header.cameraProj[0][0], input->header.cameraProj[1][1],
                input->header.cameraProj[2][2], input->header.cameraProj[3][2],
                lightIndices, numLights, VISUALIZE_LIGHT_COUNT,
                framebuffer->r, framebuffer->g, framebuffer->b);
        }
    }
    else {
        // Otherwise, subdivide and 4-way recurse using X and Y splitting planes
        // Move down a level in the tree
        --level;
        tileX <<= 1;
        tileY <<= 1;
        int width = minMaxZTree->TileWidth(level);
        int height = minMaxZTree->TileHeight(level);
        // Work out splitting coords
        int midX = (tileX + 1) * width;
        int midY = (tileY + 1) * height;
        // Read subtile min/max data
        // NOTE: We must be sure to handle out-of-bounds access here since
        // sometimes we'll only have 1 or 2 subtiles for non-pow-2
        // framebuffer sizes.
        bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level));
        bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level));
        // NOTE: Order is 00, 10, 01, 11
        // Set defaults up to cull all lights if the tile doesn't exist (offscreen)
        float minZ[4] = {input->header.cameraFar, input->header.cameraFar,
                         input->header.cameraFar, input->header.cameraFar};
        float maxZ[4] = {input->header.cameraNear, input->header.cameraNear,
                         input->header.cameraNear, input->header.cameraNear};
        minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
        maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY);
        if (rightTileExists) {
            minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY);
            maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY);
            if (bottomTileExists) {
                minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1);
                maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1);
            }
        }
        if (bottomTileExists) {
            minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1);
            maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1);
        }
        // Cull lights into subtile lists (one aligned list per subtile)
 #ifdef ISPC_IS_WINDOWS
        __declspec(align(ALIGNMENT_BYTES)) int subtileLightIndices[4][MAX_LIGHTS];
 #else
        int subtileLightIndices[4][MAX_LIGHTS]
            __attribute__ ((aligned(ALIGNMENT_BYTES)));
 #endif
        int subtileNumLights[4];
        ispc::SplitTileMinMax(midX, midY, minZ, maxZ,
            input->header.framebufferWidth, input->header.framebufferHeight,
            input->header.cameraProj[0][0], input->header.cameraProj[1][1],
            lightIndices, numLights, input->arrays.lightPositionView_x,
            input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
            input->arrays.lightAttenuationEnd,
            subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
        // Recurse into subtiles: three are spawned, the last one runs on
        // the calling strand
        _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX    , tileY,
                                            subtileLightIndices[0], subtileNumLights[0],
                                            framebuffer);
        _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
                                            subtileLightIndices[1], subtileNumLights[1],
                                            framebuffer);
        _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX    , tileY + 1,
                                            subtileLightIndices[2], subtileNumLights[2],
                                            framebuffer);
        ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1,
                                subtileLightIndices[3], subtileNumLights[3],
                                framebuffer);
    }
 }
 // Shades one root tile: runs the full 6-plane light cull for the tile via
 // ispc::IntersectLightsWithTileMinMax() and then starts the recursive
 // subdivision with the resulting light list.
 static void
 ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
                  Framebuffer *framebuffer) {
    const MinMaxZTreeCilk *tree = gMinMaxZTreeCilk;
    // Tile size and Z bounds at this level
    const int tileW = tree->TileWidth(level);
    const int tileH = tree->TileHeight(level);
    const float tileMinZ = tree->MinZ(level, tileX, tileY);
    const float tileMaxZ = tree->MaxZ(level, tileX, tileY);
    // Pixel rectangle of the tile, clamped to the framebuffer
    const int x0 = tileX * tileW;
    const int y0 = tileY * tileH;
    const int x1 = std::min(input->header.framebufferWidth, x0 + tileW);
    const int y1 = std::min(input->header.framebufferHeight, y0 + tileH);
    // Aligned list that receives the indices of the lights surviving the cull
 #ifdef ISPC_IS_WINDOWS
    __declspec(align(ALIGNMENT_BYTES)) int rootLights[MAX_LIGHTS];
 #else
    int rootLights[MAX_LIGHTS] __attribute__ ((aligned(ALIGNMENT_BYTES)));
 #endif
    const int rootLightCount = ispc::IntersectLightsWithTileMinMax(
        x0, x1, y0, y1, tileMinZ, tileMaxZ,
        input->header.framebufferWidth, input->header.framebufferHeight,
        input->header.cameraProj[0][0], input->header.cameraProj[1][1],
        MAX_LIGHTS, input->arrays.lightPositionView_x,
        input->arrays.lightPositionView_y, input->arrays.lightPositionView_z,
        input->arrays.lightAttenuationEnd, rootLights);
    // Hand the culled list to the recursive shading of this tile
    ShadeDynamicTileRecurse(input, level, tileX, tileY, rootLights,
                            rootLightCount, framebuffer);
 }
 // Shades the whole frame with the Cilk path: refreshes the global min/max Z
 // tree from the current Z buffer, then processes all tiles of the root
 // level (Levels() - 1) with a _Cilk_for loop.
 void
 DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer)
 {
    MinMaxZTreeCilk *tree = gMinMaxZTreeCilk;
    // Refresh the min/max Z tree for this frame
    tree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
        input->header.cameraProj[2][2], input->header.cameraProj[3][2],
        input->header.cameraNear, input->header.cameraFar);
    // The "root" tiles are the units of parallel work.  Ideally there are
    // at least enough of them to fill the machine; the number of levels in
    // the mip tree is static at the moment, but it might make sense to
    // derive it from the width of the machine.
    const int topLevel = tree->Levels() - 1;
    const int tilesAcross = tree->NumTilesX(topLevel);
    const int tileCount = tilesAcross * tree->NumTilesY(topLevel);
    _Cilk_for (int t = 0; t < tileCount; ++t) {
        // Row-major decomposition of the flat tile index
        const int row = t / tilesAcross;
        const int col = t % tilesAcross;
        ShadeDynamicTile(input, topLevel, col, row, framebuffer);
    }
 }
 #endif // __cilkplusplus
--- a/examples/deferred/kernels.ispc
+++ b/examples/deferred/kernels.ispc
@@ -0,0 +1,717 @@
 /*
  Copyright (c) 2010-2011, Intel Corporation
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */
 #include "deferred.h"
 // Structure-of-arrays view of the scene data consumed by the kernels in
 // this file: per-pixel G-buffer channels followed by per-light attributes.
 // NOTE(review): the member order presumably has to match the host-side
 // declaration in deferred.h / the C++ loader -- confirm before reordering.
 struct InputDataArrays
 {
    // --- Per-pixel G-buffer channels; the kernels index these with
    // --- (y * gBufferWidth + x).
    uniform float zBuffer[]; // depth value; kernels unproject it to view-space Z
    uniform unsigned int16 normalEncoded_x[]; // half-float bits, decoded with half_to_float_fast()
    uniform unsigned int16 normalEncoded_y[]; // half-float bits, decoded with half_to_float_fast()
    uniform unsigned int16 specularAmount[]; // half-float bits, decoded with half_to_float_fast()
    uniform unsigned int16 specularPower[]; // half-float bits, decoded with half_to_float_fast()
    uniform unsigned int8 albedo_x[]; // unorm8, converted with Unorm8ToFloat32()
    uniform unsigned int8 albedo_y[]; // unorm8, converted with Unorm8ToFloat32()
    uniform unsigned int8 albedo_z[]; // unorm8, converted with Unorm8ToFloat32()
    // --- Per-light attributes; the kernels index these with a light index.
    uniform float lightPositionView_x[]; // light position (name suggests view space)
    uniform float lightPositionView_y[];
    uniform float lightPositionView_z[];
    uniform float lightAttenuationBegin[]; // distance used as the start of the linear falloff
    uniform float lightColor_x[];
    uniform float lightColor_y[];
    uniform float lightColor_z[];
    uniform float lightAttenuationEnd[]; // distance beyond which the light contributes nothing
 };
 // Per-frame constants that accompany InputDataArrays.
 // NOTE(review): presumably mirrors the header of the host-side input file
 // format -- confirm against deferred.h before changing the layout.
 struct InputHeader
 {
    // Projection matrix; RenderTile reads entries [0][0], [1][1], [2][2]
    // and [3][2] from it.
    uniform float cameraProj[4][4];
    // Near/far limits used to reject invalid depth samples in ComputeZBounds.
    uniform float cameraNear;
    uniform float cameraFar;
    // Framebuffer / G-buffer dimensions in pixels.
    uniform int32 framebufferWidth;
    uniform int32 framebufferHeight;
    // The remaining members are not read by the kernels in this file.
    uniform int32 numLights;
    uniform int32 inputDataChunkSize;
    // idaNum is not defined in this file; presumably it comes from deferred.h.
    uniform int32 inputDataArrayOffsets[idaNum];
 };
 // Empty exported function that takes an InputHeader and does nothing.
 // NOTE(review): presumably exists only so that the InputHeader struct is
 // emitted into the ispc-generated header for the C++ side -- confirm.
 export void foo(reference InputHeader h) { }
 ///////////////////////////////////////////////////////////////////////////
 // Common utility routines
 // Dot product of the 3-vectors (x, y, z) and (a, b, c).
 static inline float
 dot3(float x, float y, float z, float a, float b, float c) {
    // Accumulate left to right so the summation order is (x*a + y*b) + z*c.
    float sum = x*a;
    sum += y*b;
    sum += z*c;
    return sum;
 }
 // Scales (x, y, z) to unit length and writes the result to (ox, oy, oz).
 // The inputs are passed by value, so the outputs may be the same variables
 // as the inputs.  The reciprocal length comes from rsqrt(); a zero-length
 // input is not special-cased.
 static inline void
 normalize3(float x, float y, float z, reference float ox, 
            reference float oy, reference float oz) {
    float lengthSquared = x*x + y*y + z*z;
    float invLength = rsqrt(lengthSquared);
    ox = x * invLength;
    oy = y * invLength;
    oz = z * invLength;
 }
 // Maps an 8-bit unsigned-normalized value (0..255) onto the float range
 // 0..1 by multiplying with 1/255.
 static inline float
 Unorm8ToFloat32(unsigned int8 u) {
    float asFloat = (float)u;
    return asFloat * (1.0f / 255.0f);
 }
 // Converts a float to an 8-bit unsigned-normalized value by scaling with
 // 255 and casting.  No clamping or rounding is applied here; the visible
 // caller clamps its input to [0, 1] before calling.
 static inline unsigned int8
 Float32ToUnorm8(float f) {
    float scaled = f * 255.0f;
    return (unsigned int8)scaled;
 }
 // Finds the smallest and largest view-space Z among the valid pixels of
 // the tile [tileStartX, tileEndX) x [tileStartY, tileEndY).
 // The tile width must be a multiple of programCount (SIMD size), because
 // each row is consumed programCount pixels at a time without a tail case.
 // If the tile holds no valid pixel the outputs stay at their seeds, i.e.
 // minZ == cameraFar and maxZ == cameraNear.
 static void
 ComputeZBounds(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    // G-buffer data
    uniform float zBuffer[],
    uniform int32 gBufferWidth,
    // Camera data
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Output
    reference uniform float minZ,
    reference uniform float maxZ
    )
 {
    // Per-lane running bounds, seeded "inside out" so that the first valid
    // sample replaces them.
    float runningMin = cameraFar;
    float runningMax = cameraNear;
    for (uniform int32 row = tileStartY; row < tileEndY; ++row) {
        uniform int32 rowOffset = row * gBufferWidth;
        for (uniform int32 col = tileStartX; col < tileEndX; col += programCount) {
            // Unproject the stored depth value into view space
            float depth = zBuffer[(rowOffset + col) + programIndex];
            float zView = cameraProj_43 / (depth - cameraProj_33);
            // Skybox/background or otherwise invalid pixels fall outside
            // [cameraNear, cameraFar) and must not widen the bounds
            bool usable = (zView >= cameraNear) && (zView < cameraFar);
            if (usable) {
                runningMin = min(runningMin, zView);
                runningMax = max(runningMax, zView);
            }
        }
    }
    // Collapse the per-lane bounds into a single uniform result
    minZ = reduce_min(runningMin);
    maxZ = reduce_max(runningMax);
 }
 // Builds the list of lights whose attenuation sphere may touch one screen
 // tile, given the tile's view-space depth range [minZ, maxZ].
 //
 // Each light is tested against the two depth planes and then against the
 // four side planes of the tile's frustum; indices of surviving lights are
 // packed contiguously into tileLightIndices and their count is returned.
 //
 // tile width must be a multiple of programCount (SIMD size)
 // numLights must currently be a multiple of programCount (SIMD size),
 // since the light arrays are read programCount entries at a time without
 // a tail case.
 export uniform int32
 IntersectLightsWithTileMinMax(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    // Tile data
    uniform float minZ,
    uniform float maxZ,
    // G-buffer data
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    // Light Data
    uniform int32 numLights,
    uniform float light_positionView_x_array[],
    uniform float light_positionView_y_array[],
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Output
    reference uniform int32 tileLightIndices[]
    )
 {
    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
    // Parallelize across frustum planes.
    // We really only have four side planes here, but write the code to
    // handle programCount > 4 robustly
    uniform float frustumPlanes_xy[programCount];
    uniform float frustumPlanes_z[programCount];
    // TODO: If programIndex < 4 here? Don't care about masking off the
    // rest but if interleaving ("x2" modes) the other lanes should ideally
    // not be emitted...
    {
        // Lanes 0..3 are overwritten by the insert() calls below.  The
        // remaining lanes are never consumed, but they still flow through
        // the rsqrt/multiply/store sequence, so give them a well-defined
        // unit plane (xy = 1, z = 0) instead of leaving them uninitialized.
        // This one is totally constant over the whole screen... worth pulling it up at all?
        float frustumPlanes_xy_v = 1.0f;
        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1,  (cameraProj_11 * gBufferScale_x));
        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 2,  (cameraProj_22 * gBufferScale_y));
        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 3, -(cameraProj_22 * gBufferScale_y));
        float frustumPlanes_z_v = 0.0f;
        frustumPlanes_z_v = insert(frustumPlanes_z_v, 0,  tileEndX - gBufferScale_x);
        frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, -tileStartX + gBufferScale_x);
        frustumPlanes_z_v = insert(frustumPlanes_z_v, 2,  tileEndY - gBufferScale_y);
        frustumPlanes_z_v = insert(frustumPlanes_z_v, 3, -tileStartY + gBufferScale_y);
        // Normalize
        float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v + 
                           frustumPlanes_z_v * frustumPlanes_z_v);
        frustumPlanes_xy_v *= norm;
        frustumPlanes_z_v *= norm;
        // Save out for uniform use later
        frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
        frustumPlanes_z[programIndex] = frustumPlanes_z_v;
    }
    uniform int32 tileNumLights = 0;
    for (uniform int32 baseLightIndex = 0; baseLightIndex < numLights; 
         baseLightIndex += programCount) {
        // Each program instance tests a different light of this batch
        int32 lightIndex = baseLightIndex + programIndex;
        float light_positionView_z = light_positionView_z_array[lightIndex];
        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
        float light_attenuationEndNeg = -light_attenuationEnd;
        // Depth planes: signed distance must be no less than -radius
        float d = light_positionView_z - minZ;
        bool inFrustum = (d >= light_attenuationEndNeg);
        d = maxZ - light_positionView_z;
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
        // This seems better than cif(!inFrustum) ccontinue; here since we
        // don't actually need to mask the rest of this function - this is
        // just a greedy early-out.  Could also structure all of this as
        // nested if() statements, but this a bit easier to read
        if (!any(inFrustum)) 
            continue;
        float light_positionView_x = light_positionView_x_array[lightIndex];
        float light_positionView_y = light_positionView_y_array[lightIndex];
        // Side planes 0 and 1 constrain x, planes 2 and 3 constrain y
        d = light_positionView_z * frustumPlanes_z[0] + 
            light_positionView_x * frustumPlanes_xy[0];
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
        d = light_positionView_z * frustumPlanes_z[1] + 
            light_positionView_x * frustumPlanes_xy[1];
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
        d = light_positionView_z * frustumPlanes_z[2] + 
            light_positionView_y * frustumPlanes_xy[2];
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
        d = light_positionView_z * frustumPlanes_z[3] + 
            light_positionView_y * frustumPlanes_xy[3];
        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
        // Pack and store intersecting lights
        cif (inFrustum) {
            tileNumLights += packed_store_active(tileLightIndices, tileNumLights, 
                                                 lightIndex);
        }
    }
    return tileNumLights;
 }
 // Culls the lights against one screen tile: first derives the tile's
 // view-space depth range from the Z buffer with ComputeZBounds(), then
 // runs IntersectLightsWithTileMinMax() with that range.  Returns the
 // number of light indices written to tileLightIndices.
 //
 // tile width must be a multiple of programCount (SIMD size)
 // numLights must currently be a multiple of programCount (SIMD size)
 static uniform int32
 IntersectLightsWithTile(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    // G-buffer data
    uniform float zBuffer[],
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Light Data
    uniform int32 numLights,
    uniform float light_positionView_x_array[],
    uniform float light_positionView_y_array[],
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Output
    reference uniform int32 tileLightIndices[]
    )
 {
    uniform float minZ, maxZ;
    ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY,
        zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, cameraNear, cameraFar,
        minZ, maxZ);
    // Forward the caller-supplied light count rather than a hard-coded
    // MAX_LIGHTS, so that the numLights parameter is actually honored.
    uniform int32 tileNumLights = IntersectLightsWithTileMinMax(
        tileStartX, tileEndX, tileStartY, tileEndY, minZ, maxZ,
        gBufferWidth, gBufferHeight, cameraProj_11, cameraProj_22,
        numLights, light_positionView_x_array, light_positionView_y_array, 
        light_positionView_z_array, light_attenuationEnd_array,
        tileLightIndices);
    return tileNumLights;
 }
 // Shades every pixel of the tile [tileStartX, tileEndX) x
 // [tileStartY, tileEndY) using the lights listed in tileLightIndices and
 // writes 8-bit results to framebuffer_r/g/b.
 //
 // Two modes:
 //  - If the tile has no lights, or visualizeLightCount is set, the tile is
 //    filled with a gray level of min(tileNumLights << 2, 255).
 //  - Otherwise each pixel's view-space position and normal are rebuilt from
 //    the G-buffer, the contribution of every listed light is accumulated,
 //    and the clamped sum is gamma corrected (exponent 1/2.2) before being
 //    stored.
 //
 // tile width must be a multiple of programCount (SIMD size)
 export void
 ShadeTile(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    reference uniform InputDataArrays inputData,
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    uniform float cameraProj_33, uniform float cameraProj_43,
    // Light list
    reference uniform int32 tileLightIndices[],
    uniform int32 tileNumLights,
    // UI
    uniform bool visualizeLightCount,
    // Output
    reference uniform unsigned int8 framebuffer_r[],
    reference uniform unsigned int8 framebuffer_g[],
    reference uniform unsigned int8 framebuffer_b[]
    )
 {
    if (tileNumLights == 0 || visualizeLightCount) {
        // Flat fill: gray level grows by 4 per light and saturates at 255
        uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
        for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
            for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
                int32 framebufferIndex = (y * gBufferWidth + x) + programIndex;
                framebuffer_r[framebufferIndex] = c;
                framebuffer_g[framebufferIndex] = c;
                framebuffer_b[framebufferIndex] = c;
            }
        }
    } else {
        uniform float twoOverGBufferWidth = 2.0f / gBufferWidth;
        uniform float twoOverGBufferHeight = 2.0f / gBufferHeight;
        for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
            // Pixel-center row mapped to [-1, 1] with the Y axis flipped
            uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
            for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
                // Each program instance handles one of programCount
                // horizontally adjacent pixels
                uniform int32 gBufferOffsetBase = y * gBufferWidth + x;
                int32 gBufferOffset = gBufferOffsetBase + programIndex;
                // Reconstruct position and (negative) view vector from G-buffer
                float surface_positionView_x, surface_positionView_y, surface_positionView_z;
                float Vneg_x, Vneg_y, Vneg_z;
                float z = inputData.zBuffer[gBufferOffset];
                // Compute screen/clip-space position
                // NOTE: Mind DX11 viewport transform and pixel center!
                float positionScreen_x = (0.5f + (float)(x + programIndex)) * 
                    twoOverGBufferWidth - 1.0f;
                // Unproject depth buffer Z value into view space
                surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
                surface_positionView_x = positionScreen_x * surface_positionView_z / 
                    cameraProj_11;
                surface_positionView_y = positionScreen_y * surface_positionView_z / 
                    cameraProj_22;
                // We actually end up with a vector pointing *at* the
                // surface (i.e. the negative view vector)
                normalize3(surface_positionView_x, surface_positionView_y, 
                           surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
                // Reconstruct normal from G-buffer
                // (two half-float components are expanded into a 3-vector)
                float surface_normal_x, surface_normal_y, surface_normal_z;
                float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
                float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
                float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
                float m = sqrt(4.0f * f - 1.0f);
                surface_normal_x = m * (4.0f * normal_x - 2.0f);
                surface_normal_y = m * (4.0f * normal_y - 2.0f);
                surface_normal_z = 3.0f - 8.0f * f;
                // Load other G-buffer parameters
                float surface_specularAmount = 
                    half_to_float_fast(inputData.specularAmount[gBufferOffset]);
                float surface_specularPower  = 
                    half_to_float_fast(inputData.specularPower[gBufferOffset]);
                float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
                float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
                float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
                // Accumulated lighting for this pixel
                float lit_x = 0.0f;
                float lit_y = 0.0f;
                float lit_z = 0.0f;
                for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights; 
                     ++tileLightIndex) {
                    uniform int32 lightIndex = tileLightIndices[tileLightIndex];
                    // Gather light data relevant to initial culling
                    uniform float light_positionView_x = 
                        inputData.lightPositionView_x[lightIndex];
                    uniform float light_positionView_y = 
                        inputData.lightPositionView_y[lightIndex];
                    uniform float light_positionView_z = 
                        inputData.lightPositionView_z[lightIndex];
                    uniform float light_attenuationEnd = 
                        inputData.lightAttenuationEnd[lightIndex];
                    // Compute light vector
                    float L_x = light_positionView_x - surface_positionView_x;
                    float L_y = light_positionView_y - surface_positionView_y;
                    float L_z = light_positionView_z - surface_positionView_z;
                    float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
                    // Clip at end of attenuation
                    // (compared in squared form so the sqrt is only paid
                    // for pixels inside the light's range)
                    float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
                    cif (distanceToLight2 < light_attenutaionEnd2) {                    
                        float distanceToLight = sqrt(distanceToLight2);
                        // HLSL "rcp" is allowed to be fairly inaccurate
                        float distanceToLightRcp = rcp(distanceToLight);
                        L_x *= distanceToLightRcp;
                        L_y *= distanceToLightRcp;
                        L_z *= distanceToLightRcp;
                        // Start computing brdf
                        float NdotL = dot3(surface_normal_x, surface_normal_y, 
                                           surface_normal_z, L_x, L_y, L_z);
                        // Clip back facing
                        cif (NdotL > 0.0f) {
                            uniform float light_attenuationBegin = 
                                inputData.lightAttenuationBegin[lightIndex];
                            // Light distance attenuation (linstep)
                            float lightRange = (light_attenuationEnd - light_attenuationBegin);
                            float falloffPosition = (light_attenuationEnd - distanceToLight);
                            float attenuation = min(falloffPosition / lightRange, 1.0f);
                            // Half vector between the light direction and
                            // the direction towards the viewer (-Vneg)
                            float H_x = (L_x - Vneg_x);
                            float H_y = (L_y - Vneg_y);
                            float H_z = (L_z - Vneg_z);
                            normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
                            float NdotH = dot3(surface_normal_x, surface_normal_y, 
                                               surface_normal_z, H_x, H_y, H_z);
                            NdotH = max(NdotH, 0.0f);
                            float specular = pow(NdotH, surface_specularPower);
                            float specularNorm = (surface_specularPower + 2.0f) * 
                                (1.0f / 8.0f);
                            float specularContrib = surface_specularAmount * 
                                specularNorm * specular;
                            // Combined scale for this light: attenuation,
                            // diffuse term and specular boost
                            float k = attenuation * NdotL * (1.0f + specularContrib);
                            uniform float light_color_x = inputData.lightColor_x[lightIndex];
                            uniform float light_color_y = inputData.lightColor_y[lightIndex];
                            uniform float light_color_z = inputData.lightColor_z[lightIndex];
                            float lightContrib_x = surface_albedo_x * light_color_x;
                            float lightContrib_y = surface_albedo_y * light_color_y;
                            float lightContrib_z = surface_albedo_z * light_color_z;
                            lit_x += lightContrib_x * k;
                            lit_y += lightContrib_y * k;
                            lit_z += lightContrib_z * k;
                        }
                    }
                }
                // Gamma correct
                // These pows are pretty slow right now, but we can do
                // something faster if really necessary to squeeze every
                // last bit of performance out of it
                float gamma = 1.0 / 2.2f;
                lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
                lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
                lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
                // Values are already clamped to [0, 1], so the unorm8
                // conversion cannot overflow
                framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
                framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
                framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
            }
        }
    }
 }
 ///////////////////////////////////////////////////////////////////////////
 // Static decomposition
 // Task body for the static decomposition: culls the lights against tile
 // number g (row-major over a num_groups_x wide grid of MIN_TILE_WIDTH x
 // MIN_TILE_HEIGHT tiles) and then shades that tile.
 //
 // The tile extents are clamped to the framebuffer, because RenderStatic
 // rounds the tile counts up and the last row/column of tiles may therefore
 // hang over the edge.  The framebuffer width must still be a multiple of
 // programCount, as required by the per-tile routines.
 // num_groups_y is accepted for symmetry but not needed here.
 task void
 RenderTile(uniform int g, uniform int num_groups_x, uniform int num_groups_y,
           reference uniform InputHeader inputHeader,
           reference uniform InputDataArrays inputData,
           uniform int visualizeLightCount,
           // Output
           reference uniform unsigned int8 framebuffer_r[],
           reference uniform unsigned int8 framebuffer_g[],
           reference uniform unsigned int8 framebuffer_b[]) {
    uniform int framebufferWidth = inputHeader.framebufferWidth;
    uniform int framebufferHeight = inputHeader.framebufferHeight;
    // Unpack the linear task index into tile coordinates
    uniform int32 group_y = g / num_groups_x;
    uniform int32 group_x = g % num_groups_x;
    uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH;
    uniform int32 tile_start_y = group_y * MIN_TILE_HEIGHT;
    // Clamp so that partial edge tiles never touch pixels outside the
    // framebuffer (no effect when the size is a multiple of the tile size)
    uniform int32 tile_end_x = min(tile_start_x + MIN_TILE_WIDTH, framebufferWidth);
    uniform int32 tile_end_y = min(tile_start_y + MIN_TILE_HEIGHT, framebufferHeight);
    uniform int sTileNumLights = 0;
    uniform int sTileLightIndices[MAX_LIGHTS];  // Light list for the tile
    // Note the naming: these locals use 0-based matrix indices, while the
    // callee parameters they feed (cameraProj_11 .. cameraProj_43) are
    // 1-based names for the same entries.
    uniform float cameraProj_00 = inputHeader.cameraProj[0][0];
    uniform float cameraProj_11 = inputHeader.cameraProj[1][1];
    uniform float cameraProj_22 = inputHeader.cameraProj[2][2];
    uniform float cameraProj_32 = inputHeader.cameraProj[3][2];
    // Light intersection
    sTileNumLights = 
        IntersectLightsWithTile(tile_start_x, tile_end_x, 
                                tile_start_y, tile_end_y,
                                framebufferWidth, framebufferHeight,
                                inputData.zBuffer,
                                cameraProj_00, cameraProj_11,
                                cameraProj_22, cameraProj_32,
                                inputHeader.cameraNear, inputHeader.cameraFar,
                                MAX_LIGHTS,
                                inputData.lightPositionView_x, 
                                inputData.lightPositionView_y, 
                                inputData.lightPositionView_z, 
                                inputData.lightAttenuationEnd,
                                sTileLightIndices);
    ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
              framebufferWidth, framebufferHeight, inputData,
              cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
              sTileLightIndices, sTileNumLights, visualizeLightCount, 
              framebuffer_r, framebuffer_g, framebuffer_b);
 }
 // Entry point for the static decomposition: covers the framebuffer with a
 // grid of MIN_TILE_WIDTH x MIN_TILE_HEIGHT tiles and launches one
 // RenderTile task per tile.
 export void
 RenderStatic(reference uniform InputHeader inputHeader,
              reference uniform InputDataArrays inputData,
              uniform int visualizeLightCount,
              // Output
              reference uniform unsigned int8 framebuffer_r[],
              reference uniform unsigned int8 framebuffer_g[],
              reference uniform unsigned int8 framebuffer_b[]) {
    // Round up so that partially covered tiles at the edges are included
    uniform int tilesAcross = 
        (inputHeader.framebufferWidth + MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
    uniform int tilesDown = 
        (inputHeader.framebufferHeight + MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
    uniform int tileCount = tilesAcross * tilesDown;
    // Tiles are numbered in row-major order; RenderTile decodes the index
    uniform int tile = 0;
    while (tile < tileCount) {
        launch < RenderTile(tile, tilesAcross, tilesDown,
                            inputHeader, inputData, visualizeLightCount,
                            framebuffer_r, framebuffer_g, framebuffer_b) >;
        ++tile;
    }
 }
 ///////////////////////////////////////////////////////////////////////////
 // Routines for dynamic decomposition path
 // Computes the view-space depth bounds of every tile in tile row tileY and
 // stores them in minZArray[tileX] / maxZArray[tileX] for tileX in
 // [0, numTilesX).  numTilesY is not used by this routine.
 // tile width must be a multiple of programCount (SIMD size)
 export void
 ComputeZBoundsRow(
    uniform int32 tileY,
    uniform int32 tileWidth, uniform int32 tileHeight,
    uniform int32 numTilesX, uniform int32 numTilesY,
    // G-buffer data
    uniform float zBuffer[],
    uniform int32 gBufferWidth,
    // Camera data
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Output
    reference uniform float minZArray[],
    reference uniform float maxZArray[]
    )
 {
    // The vertical pixel extent is shared by all tiles of the row
    uniform int32 rowStartY = tileY * tileHeight;
    uniform int32 rowEndY = rowStartY + tileHeight;
    for (uniform int32 tileX = 0; tileX < numTilesX; ++tileX) {
        uniform int32 tileStartX = tileX * tileWidth;
        uniform float tileMinZ, tileMaxZ;
        ComputeZBounds(
            tileStartX, tileStartX + tileWidth,
            rowStartY, rowEndY,
            zBuffer, gBufferWidth,
            cameraProj_33, cameraProj_43, cameraNear, cameraFar,
            tileMinZ, tileMaxZ);
        minZArray[tileX] = tileMinZ;
        maxZArray[tileX] = tileMaxZ;
    }
 }
 // Distributes a tile's light list over its four subtiles (00, 10, 01, 11).
 //
 // Every light in lightIndices[0 .. numLights) is tested against each
 // subtile's depth range and against the two planes that split the tile at
 // (tileMidX, tileMidY).  A light's index is appended to the list of every
 // subtile it may touch; subtile k's list starts at
 // subtileIndices[k * subtileIndicesPitch] and its length is written to
 // subtileNumLights[k].
 //
 // numLights need not be a multiple of programCount here, but the input and output arrays
 // should be able to handle programCount-sized load/stores.
 export void
 SplitTileMinMax(
    uniform int32 tileMidX, uniform int32 tileMidY,
    // Subtile data (00, 10, 01, 11)
    uniform float subtileMinZ[],
    uniform float subtileMaxZ[],
    // G-buffer data
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    // Light Data
    reference uniform int32 lightIndices[],
    uniform int32 numLights,
    uniform float light_positionView_x_array[],
    uniform float light_positionView_y_array[],
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Outputs
    // TODO: ISPC doesn't currently like multidimensional arrays so we'll do the
    // indexing math ourselves
    reference uniform int32 subtileIndices[],
    uniform int32 subtileIndicesPitch,
    reference uniform int32 subtileNumLights[]
    )
 {
    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
    // Parallelize across frustum planes
    // Only have 2 frustum split planes here so may not be worth it, but
    // we'll do it for now for consistency
    uniform float frustumPlanes_xy[programCount];
    uniform float frustumPlanes_z[programCount];
    // Lanes 0 and 1 are overwritten by the insert() calls below.  The other
    // lanes are never consumed, but they still flow through the
    // rsqrt/multiply/store sequence, so give them a well-defined unit plane
    // (xy = 1, z = 0) instead of leaving them uninitialized.
    // This one is totally constant over the whole screen... worth pulling it up at all?
    float frustumPlanes_xy_v = 1.0f;
    frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
    frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1,  (cameraProj_22 * gBufferScale_y));
    float frustumPlanes_z_v = 0.0f;
    frustumPlanes_z_v = insert(frustumPlanes_z_v, 0, tileMidX - gBufferScale_x);
    frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, tileMidY - gBufferScale_y);
    // Normalize
    float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v + 
                       frustumPlanes_z_v * frustumPlanes_z_v);
    frustumPlanes_xy_v *= norm;
    frustumPlanes_z_v *= norm;
    // Save out for uniform use later
    frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
    frustumPlanes_z[programIndex] = frustumPlanes_z_v;
    // Initialize
    // Write cursors into subtileIndices, one per subtile; each starts at
    // the beginning of its own pitch-sized segment.
    uniform int32 subtileLightOffset[4];
    subtileLightOffset[0] = 0 * subtileIndicesPitch;
    subtileLightOffset[1] = 1 * subtileIndicesPitch;
    subtileLightOffset[2] = 2 * subtileIndicesPitch;
    subtileLightOffset[3] = 3 * subtileIndicesPitch;
    for (int32 i = programIndex; i < numLights; i += programCount) {
        // TODO: ISPC says gather required here when it actually
        // isn't... this could be fixed this by nesting an if() within a
        // uniform loop, but I'm not totally sure if that's a win
        // overall. For now we'll just eat the perf cost for cleanliness
        // since the below are real gathers anyways.
        int32 lightIndex = lightIndices[i];
        float light_positionView_x = light_positionView_x_array[lightIndex];
        float light_positionView_y = light_positionView_y_array[lightIndex];
        float light_positionView_z = light_positionView_z_array[lightIndex];
        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
        float light_attenuationEndNeg = -light_attenuationEnd;
        // Test lights against subtile z bounds
        bool inFrustum[4];
        inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
            (subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
        inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) && 
            (subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg);
        inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) && 
            (subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg);
        inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) && 
            (subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg);
        // Signed distances of the light to the vertical (dx) and
        // horizontal (dy) split planes
        float dx = light_positionView_z * frustumPlanes_z[0] + 
            light_positionView_x * frustumPlanes_xy[0];
        float dy = light_positionView_z * frustumPlanes_z[1] +
            light_positionView_y * frustumPlanes_xy[1];
        // If the light lies farther from a split plane than its range, it
        // sits entirely on one side: drop the two subtiles on the far side.
        cif (abs(dx) > light_attenuationEnd) {
            bool positiveX = dx > 0.0f;
            inFrustum[0] = inFrustum[0] &&  positiveX;    // 00 subtile
            inFrustum[1] = inFrustum[1] && !positiveX;    // 10 subtile
            inFrustum[2] = inFrustum[2] &&  positiveX;    // 01 subtile
            inFrustum[3] = inFrustum[3] && !positiveX;    // 11 subtile
        }
        cif (abs(dy) > light_attenuationEnd) {
            bool positiveY = dy > 0.0f;
            inFrustum[0] = inFrustum[0] &&  positiveY;    // 00 subtile
            inFrustum[1] = inFrustum[1] &&  positiveY;    // 10 subtile
            inFrustum[2] = inFrustum[2] && !positiveY;    // 01 subtile
            inFrustum[3] = inFrustum[3] && !positiveY;    // 11 subtile
        }
        // Pack and store intersecting lights
        // TODO: Experiment with a loop here instead
        cif (inFrustum[0])
            subtileLightOffset[0] += packed_store_active(subtileIndices, 
                                                         subtileLightOffset[0], 
                                                         lightIndex);
        cif (inFrustum[1])
            subtileLightOffset[1] += packed_store_active(subtileIndices, 
                                                         subtileLightOffset[1], 
                                                         lightIndex);
        cif (inFrustum[2])
            subtileLightOffset[2] += packed_store_active(subtileIndices, 
                                                         subtileLightOffset[2], 
                                                         lightIndex);
        cif (inFrustum[3])
            subtileLightOffset[3] += packed_store_active(subtileIndices, 
                                                         subtileLightOffset[3], 
                                                         lightIndex);
    }
    // Convert the final cursors back into per-subtile light counts
    subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch;
    subtileNumLights[1] = subtileLightOffset[1] - 1 * subtileIndicesPitch;
    subtileNumLights[2] = subtileLightOffset[2] - 2 * subtileIndicesPitch;
    subtileNumLights[3] = subtileLightOffset[3] - 3 * subtileIndicesPitch;
 }
--- a/examples/deferred/main.cpp
+++ b/examples/deferred/main.cpp
@@ -0,0 +1,137 @@
 /*
  Copyright (c) 2011, Intel Corporation
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */
 #ifdef _MSC_VER
 #define ISPC_IS_WINDOWS
 #define NOMINMAX
 #elif defined(__linux__)
 #define ISPC_IS_LINUX
 #elif defined(__APPLE__)
 #define ISPC_IS_APPLE
 #endif
 #include <fcntl.h>
 #include <float.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/types.h>
 #include <stdint.h>
 #include <algorithm>
 #include <assert.h>
 #include <vector>
 #ifdef ISPC_IS_WINDOWS
  #define WIN32_LEAN_AND_MEAN
  #include <windows.h>
 #endif
 #include "deferred.h"
 #include "kernels_ispc.h"
 #include "../timing.h"
 ///////////////////////////////////////////////////////////////////////////
 int main(int argc, char** argv) {
    if (argc != 2) {
        printf("usage: deferred_shading <input_file>\n");
        return 1;
    }
    InputData *input = CreateInputDataFromFile(argv[1]);
    if (!input) {
        printf("Failed to load input file \"%s\"!\n", argv[1]);
        return 1;
    }
    Framebuffer framebuffer(input->header.framebufferWidth,
                            input->header.framebufferHeight);
    InitDynamicC(input);
 #ifdef __cilkplusplus
    InitDynamicCilk(input);
 #endif // __cilkplusplus
    int nframes = 5;
    double ispcCycles = 1e30;
    for (int i = 0; i < 5; ++i) {
        framebuffer.clear();
        reset_and_start_timer();
        for (int j = 0; j < nframes; ++j)
            ispc::RenderStatic(&input->header, &input->arrays, 
                               VISUALIZE_LIGHT_COUNT,
                               framebuffer.r, framebuffer.g, framebuffer.b);
        double mcycles = get_elapsed_mcycles() / nframes;
        ispcCycles = std::min(ispcCycles, mcycles);
    }
    printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render "
           "%d x %d image\n", ispcCycles,
           input->header.framebufferWidth, input->header.framebufferHeight);
    WriteFrame("deferred-ispc-static.ppm", input, framebuffer);
    double serialCycles = 1e30;
    for (int i = 0; i < 5; ++i) {
        framebuffer.clear();
        reset_and_start_timer();
        for (int j = 0; j < nframes; ++j)
            DispatchDynamicC(input, &framebuffer);
        double mcycles = get_elapsed_mcycles() / nframes;
        serialCycles = std::min(serialCycles, mcycles);
    }
    printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles\n", 
           serialCycles);
    WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer);
 #ifdef __cilkplusplus
    double dynamicCilkCycles = 1e30;
    for (int i = 0; i < 5; ++i) {
        framebuffer.clear();
        reset_and_start_timer();
        for (int j = 0; j < nframes; ++j)
            DispatchDynamicCilk(input, &framebuffer);
        double mcycles = get_elapsed_mcycles() / nframes;
        dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles);
    }
    printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles\n", 
           dynamicCilkCycles);
    WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer);
    printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n", 
           serialCycles/ispcCycles, serialCycles/dynamicCilkCycles);
 #else
    printf("\t\t\t\t(%.2fx speedup from ISPC)\n", serialCycles/ispcCycles);
 #endif // __cilkplusplus
    DeleteInputData(input);
    return 0;
 }
--- a/examples/examples.sln
+++ b/examples/examples.sln
@@ -18,8 +18,11 @@ EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "noise", "noise\noise.vcxproj", "{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}"
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "volume", "volume_rendering\volume.vcxproj", "{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}"
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "stencil", "stencil\stencil.vcxproj", "{2EF070A1-F62F-4E6A-944B-88D140945C3C}"
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "deferred_shading", "deferred\deferred_shading.vcxproj", "{87F53C53-957E-4E91-878A-BC27828FB9EB}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Win32 = Debug|Win32
@@ -108,6 +111,14 @@ Global
 		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|Win32.Build.0 = Release|Win32
 		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|x64.ActiveCfg = Release|x64
 		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|x64.Build.0 = Release|x64
 		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|Win32.ActiveCfg = Debug|Win32
 		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|Win32.Build.0 = Debug|Win32
 		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|x64.ActiveCfg = Debug|x64
 		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|x64.Build.0 = Debug|x64
 		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.ActiveCfg = Release|Win32
 		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.Build.0 = Release|Win32
 		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.ActiveCfg = Release|x64
 		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.Build.0 = Release|x64
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
--- a/examples/mandelbrot/mandelbrot.vcxproj
+++ b/examples/mandelbrot/mandelbrot.vcxproj
--- a/examples/mandelbrot/mandelbrot_serial.cpp
+++ b/examples/mandelbrot/mandelbrot_serial.cpp
@@ -36,7 +36,7 @@ static int mandel(float c_re, float c_im, int count) {
    float z_re = c_re, z_im = c_im;
    int i;
    for (i = 0; i < count; ++i) {
-        if (z_re * z_re + z_im * z_im > 4.)
+        if (z_re * z_re + z_im * z_im > 4.f)
            break;
        float new_re = z_re*z_re - z_im*z_im;
--- a/examples/mandelbrot_tasks/Makefile
+++ b/examples/mandelbrot_tasks/Makefile
@@ -1,14 +1,8 @@
 ARCH = $(shell uname)
-TASK_CXX=../tasks_pthreads.cpp
+TASK_CXX=../tasksys.cpp
 TASK_LIB=-lpthread
 ifeq ($(ARCH), Darwin)
  TASK_CXX=../tasks_gcd.cpp
  TASK_LIB=
 endif
 TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
 CXX=g++
--- a/examples/mandelbrot_tasks/mandelbrot.cpp
+++ b/examples/mandelbrot_tasks/mandelbrot.cpp
@@ -40,6 +40,7 @@
 #include <stdio.h>
 #include <algorithm>
 #include <string.h>
 #include "../timing.h"
 #include "../cpuid.h"
 #include "mandelbrot_ispc.h"
@@ -99,8 +100,12 @@ ensureTargetISAIsSupported() {
    }
 }
 // Write the accepted command-line syntax to stderr, then terminate the
 // process with exit status 1.  Does not return.
 static void usage() {
    fputs("usage: mandelbrot [--scale=<factor>]\n", stderr);
    exit(1);
 }
-int main() {
+int main(int argc, char *argv[]) {
    unsigned int width = 1536;
    unsigned int height = 1024;
    float x0 = -2;
@@ -108,6 +113,25 @@ int main() {
    float y0 = -1;
    float y1 = 1;
    if (argc == 1)
        ;
    else if (argc == 2) {
        if (strncmp(argv[1], "--scale=", 8) == 0) {
            float scale = atof(argv[1] + 8);
            if (scale == 0.f)
                usage();
            width *= scale;
            height *= scale;
            // round up to multiples of 16
            width = (width + 0xf) & ~0xf;
            height = (height + 0xf) & ~0xf;
        }
        else 
            usage();
    }
    else
        usage();
    ensureTargetISAIsSupported();
    int maxIterations = 512;
@@ -119,6 +143,9 @@ int main() {
    //
    double minISPC = 1e30;
    for (int i = 0; i < 3; ++i) {
        // Clear out the buffer
        for (unsigned int i = 0; i < width * height; ++i)
            buf[i] = 0;
        reset_and_start_timer();
        mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
        double dt = get_elapsed_mcycles();
@@ -128,9 +155,6 @@ int main() {
    printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC);
    writePPM(buf, width, height, "mandelbrot-ispc.ppm");
    // Clear out the buffer
    for (unsigned int i = 0; i < width * height; ++i)
        buf[i] = 0;
    // 
    // And run the serial implementation 3 times, again reporting the
@@ -138,6 +162,9 @@ int main() {
    //
    double minSerial = 1e30;
    for (int i = 0; i < 3; ++i) {
        // Clear out the buffer
        for (unsigned int i = 0; i < width * height; ++i)
            buf[i] = 0;
        reset_and_start_timer();
        mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf);
        double dt = get_elapsed_mcycles();
--- a/examples/mandelbrot_tasks/mandelbrot.ispc
+++ b/examples/mandelbrot_tasks/mandelbrot.ispc
@@ -53,11 +53,14 @@ mandel(float c_re, float c_im, int count) {
   [ystart,yend).
 */
 task void
-mandelbrot_scanlines(uniform int ystart, uniform int yend,
+mandelbrot_scanlines(uniform int ybase, uniform int span,
                     uniform float x0, uniform float dx, 
                     uniform float y0, uniform float dy,
                     uniform int width, uniform int maxIterations,
                     reference uniform int output[]) {
    uniform int ystart = ybase + taskIndex * span;
    uniform int yend = ystart + span;
    for (uniform int j = ystart; j < yend; ++j) {
        for (uniform int i = 0; i < width; i += programCount) {
            float x = x0 + (programIndex + i) * dx;
@@ -70,6 +73,20 @@ mandelbrot_scanlines(uniform int ystart, uniform int yend,
 }
 /* Task that handles one horizontal band of the image.  The [0,height)
    scanline range is divided among the taskCount instances of this task;
    each instance then launches one mandelbrot_scanlines task per 'span'
    scanlines of its own band.
  */
 task void
 mandelbrot_chunk(uniform float x0, uniform float dx,
                 uniform float y0, uniform float dy,
                 uniform int width, uniform int height,
                 uniform int maxIterations, reference uniform int output[]) {
    // Multiply before dividing so that the band boundaries cover every
    // scanline even when height is not a multiple of taskCount: band 0
    // starts at 0 and the last band ends exactly at height.  Computing
    // taskIndex * (height/taskCount) instead would leave the trailing
    // (height % taskCount) scanlines unrendered.
    uniform int ystart = (taskIndex * height) / taskCount;
    uniform int yend = ((taskIndex+1) * height) / taskCount;
    uniform int span = 1;
    // A band is empty when height < taskCount; launch nothing in that case.
    uniform int nSpans = (yend-ystart)/span;
    if (nSpans > 0)
        launch[nSpans] < mandelbrot_scanlines(ystart, span, x0, dx, y0, dy,
                                              width, maxIterations, output) >;
 }
 export void
 mandelbrot_ispc(uniform float x0, uniform float y0, 
                uniform float x1, uniform float y1,
@@ -78,9 +95,6 @@ mandelbrot_ispc(uniform float x0, uniform float y0,
    uniform float dx = (x1 - x0) / width;
    uniform float dy = (y1 - y0) / height;
-    /* Launch task to compute results for spans of 'span' scanlines. */
+    launch[32] < mandelbrot_chunk(x0, dx, y0, dy, width, height,
    uniform int span = 2;
    for (uniform int j = 0; j < height; j += span)
        launch < mandelbrot_scanlines(j, j+span, x0, dx, y0, dy, width,
                                  maxIterations, output) >;
 }
--- a/examples/mandelbrot_tasks/mandelbrot_serial.cpp
+++ b/examples/mandelbrot_tasks/mandelbrot_serial.cpp
@@ -36,7 +36,7 @@ static int mandel(float c_re, float c_im, int count) {
    float z_re = c_re, z_im = c_im;
    int i;
    for (i = 0; i < count; ++i) {
-        if (z_re * z_re + z_im * z_im > 4.)
+        if (z_re * z_re + z_im * z_im > 4.f)
            break;
        float new_re = z_re*z_re - z_im*z_im;
--- a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
+++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
@@ -143,7 +143,7 @@
  <ItemGroup>
    <ClCompile Include="mandelbrot.cpp" />
    <ClCompile Include="mandelbrot_serial.cpp" />
-    <ClCompile Include="../tasks_concrt.cpp" />
+    <ClCompile Include="../tasksys.cpp" />
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="mandelbrot.ispc">
--- a/examples/noise/noise.ispc
+++ b/examples/noise/noise.ispc
@@ -131,11 +131,11 @@ static float Noise(float x, float y, float z) {
 }
-static float Turbulence(float x, float y, float z, int octaves) {
+static float Turbulence(float x, float y, float z, uniform int octaves) {
    float omega = 0.6;
    float sum = 0., lambda = 1., o = 1.;
-    for (int i = 0; i < octaves; ++i) {
+    for (uniform int i = 0; i < octaves; ++i) {
        sum += abs(o * Noise(lambda * x, lambda * y, lambda * z));
        lambda *= 1.99f;
        o *= omega;
--- a/examples/noise/noise.vcxproj
+++ b/examples/noise/noise.vcxproj
--- a/examples/noise/noise_serial.cpp
+++ b/examples/noise/noise_serial.cpp
@@ -104,7 +104,7 @@ inline float NoiseWeight(float t) {
 inline float Lerp(float t, float low, float high) {
-    return (1. - t) * low + t * high;
+    return (1.f - t) * low + t * high;
 }
@@ -147,7 +147,7 @@ static float Turbulence(float x, float y, float z, int octaves) {
        lambda *= 1.99f;
        o *= omega;
    }
-    return sum * 0.5;
+    return sum * 0.5f;
 }
@@ -163,7 +163,7 @@ void noise_serial(float x0, float y0, float x1, float y1,
            float y = y0 + j * dy;
            int index = (j * width + i);
-            output[index] = Turbulence(x, y, 0.6, 8);
+            output[index] = Turbulence(x, y, 0.6f, 8);
        }
    }
 }
--- a/examples/options/options.vcxproj
+++ b/examples/options/options.vcxproj
--- a/examples/options/options_serial.cpp
+++ b/examples/options/options_serial.cpp
@@ -47,7 +47,7 @@ static inline float
 CND(float X) {
    float L = fabsf(X);
-    float k = 1.0 / (1.0 + 0.2316419 * L);
+    float k = 1.f / (1.f + 0.2316419f * L);
    float k2 = k*k;
    float k3 = k2*k;
    float k4 = k2*k2;
@@ -59,7 +59,7 @@ CND(float X) {
    w *= invSqrt2Pi * expf(-L * L * .5f);
    if (X > 0.f)
-        w = 1.0 - w;
+        w = 1.f - w;
    return w;
 }
@@ -94,7 +94,7 @@ binomial_put_serial(float Sa[], float Xa[], float Ta[],
        float dt = T / BINOMIAL_NUM;
        float u = expf(v * sqrtf(dt));
-        float d = 1. / u;
+        float d = 1.f / u;
        float disc = expf(r * dt);
        float Pu = (disc - d) / (u - d);
--- a/examples/rt/Makefile
+++ b/examples/rt/Makefile
@@ -1,14 +1,8 @@
 ARCH = $(shell uname)
-TASK_CXX=../tasks_pthreads.cpp
+TASK_CXX=../tasksys.cpp
 TASK_LIB=-lpthread
 ifeq ($(ARCH), Darwin)
  TASK_CXX=../tasks_gcd.cpp
  TASK_LIB=
 endif
 TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
 CXX=g++
--- a/examples/rt/rt.cpp
+++ b/examples/rt/rt.cpp
@@ -42,6 +42,7 @@
 #include <math.h>
 #include <algorithm>
 #include <assert.h>
 #include <string.h>
 #include <sys/types.h>
 #include "../timing.h"
 #include "../cpuid.h"
@@ -51,7 +52,8 @@ using namespace ispc;
 typedef unsigned int uint;
-extern void raytrace_serial(int width, int height, const float raster2camera[4][4], 
+extern void raytrace_serial(int width, int height, int baseWidth, int baseHeight,
                            const float raster2camera[4][4], 
                            const float camera2world[4][4], float image[],
                            int id[], const LinearBVHNode nodes[],
                            const Triangle triangles[]);
@@ -126,11 +128,28 @@ ensureTargetISAIsSupported() {
 }
-int main(int argc, char *argv[]) {
+static void usage() {
-    if (argc != 2) {
+    fprintf(stderr, "rt [--scale=<factor>] <scene name base>\n");
        fprintf(stderr, "usage: rt <filename base>\n");
    exit(1);
 }
 int main(int argc, char *argv[]) {
    float scale = 1.f;
    const char *filename = NULL;
    for (int i = 1; i < argc; ++i) {
        if (strncmp(argv[i], "--scale=", 8) == 0) {
            scale = atof(argv[i] + 8);
            if (scale == 0.f)
                usage();
        }
        else if (filename != NULL)
            usage();
        else
            filename = argv[i];
    }
    if (filename == NULL)
        usage();
    ensureTargetISAIsSupported();
@@ -144,10 +163,10 @@ int main(int argc, char *argv[]) {
    // Read the camera specification information from the camera file
    //
    char fnbuf[1024];
-    sprintf(fnbuf, "%s.camera", argv[1]);
+    sprintf(fnbuf, "%s.camera", filename);
    FILE *f = fopen(fnbuf, "rb");
    if (!f) {
-        perror(argv[1]);
+        perror(fnbuf);
        return 1;
    }
@@ -155,20 +174,20 @@ int main(int argc, char *argv[]) {
    // Nothing fancy, and trouble if we run on a big-endian system, just
    // fread in the bits
    //
-    int width, height;
+    int baseWidth, baseHeight;
    float camera2world[4][4], raster2camera[4][4];
-    READ(width, 1);
+    READ(baseWidth, 1);
-    READ(height, 1);
+    READ(baseHeight, 1);
    READ(camera2world[0][0], 16);
    READ(raster2camera[0][0], 16);
    //
    // Read in the serialized BVH 
    //
-    sprintf(fnbuf, "%s.bvh", argv[1]);
+    sprintf(fnbuf, "%s.bvh", filename);
    f = fopen(fnbuf, "rb");
    if (!f) {
-        perror(argv[2]);
+        perror(fnbuf);
        return 1;
    }
@@ -215,10 +234,10 @@ int main(int argc, char *argv[]) {
    }
    fclose(f);
-    // round image resolution up to multiple of 4 to make things easy for
+    // round image resolution up to multiple of 16 to make things easy for
    // the code that assigns pixels to ispc program instances
-    height = (height + 3) & ~3;
+    int height = (int(baseHeight * scale) + 0xf) & ~0xf;
-    width = (width + 3) & ~3;
+    int width = (int(baseWidth * scale) + 0xf) & ~0xf;
    // allocate images; one to hold hit object ids, one to hold depth to
    // the first interseciton
@@ -231,8 +250,8 @@ int main(int argc, char *argv[]) {
    double minTimeISPC = 1e30;
    for (int i = 0; i < 3; ++i) {
        reset_and_start_timer();
-        raytrace_ispc(width, height, raster2camera, camera2world, 
+        raytrace_ispc(width, height, baseWidth, baseHeight, raster2camera, 
-                      image, id, nodes, triangles);
+                      camera2world, image, id, nodes, triangles);
        double dt = get_elapsed_mcycles();
        minTimeISPC = std::min(dt, minTimeISPC);
    }
@@ -250,8 +269,8 @@ int main(int argc, char *argv[]) {
    double minTimeISPCtasks = 1e30;
    for (int i = 0; i < 3; ++i) {
        reset_and_start_timer();
-        raytrace_ispc_tasks(width, height, raster2camera, camera2world, 
+        raytrace_ispc_tasks(width, height, baseWidth, baseHeight, raster2camera,
-                            image, id, nodes, triangles);
+                            camera2world, image, id, nodes, triangles);
        double dt = get_elapsed_mcycles();
        minTimeISPCtasks = std::min(dt, minTimeISPCtasks);
    }
@@ -270,8 +289,8 @@ int main(int argc, char *argv[]) {
    double minTimeSerial = 1e30;
    for (int i = 0; i < 3; ++i) {
        reset_and_start_timer();
-        raytrace_serial(width, height, raster2camera, camera2world, 
+        raytrace_serial(width, height, baseWidth, baseHeight, raster2camera, 
-                        image, id, nodes, triangles);
+                        camera2world, image, id, nodes, triangles);
        double dt = get_elapsed_mcycles();
        minTimeSerial = std::min(dt, minTimeSerial);
    }
--- a/examples/rt/rt.ispc
+++ b/examples/rt/rt.ispc
@@ -227,12 +227,17 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
 static void raytrace_tile(uniform int x0, uniform int x1,
-                          uniform int y0, uniform int y1, uniform int width,
+                          uniform int y0, uniform int y1, 
                          uniform int width, uniform int height,
                          uniform int baseWidth, uniform int baseHeight,
                          const uniform float raster2camera[4][4], 
                          const uniform float camera2world[4][4],
                          uniform float image[], uniform int id[],
                          const LinearBVHNode nodes[],
                          const Triangle triangles[]) {
    uniform float widthScale = (float)(baseWidth) / (float)(width);
    uniform float heightScale = (float)(baseHeight) / (float)(height);
    static const uniform float udx[16] = { 0, 1, 0, 1, 2, 3, 2, 3, 
                                           0, 1, 0, 1, 2, 3, 2, 3 };
    static const uniform float udy[16] = { 0, 0, 1, 1, 0, 0, 1, 1, 
@@ -252,7 +257,8 @@ static void raytrace_tile(uniform int x0, uniform int x1,
                const float dy = udy[o * programCount + programIndex];
                Ray ray;
-                generateRay(raster2camera, camera2world, x+dx, y+dy, ray);
+                generateRay(raster2camera, camera2world, (x+dx)*widthScale,
                            (y+dy)*heightScale, ray);
                BVHIntersect(nodes, triangles, ray);
                int offset = (y + (int)dy) * width + (x + (int)dx);
@@ -265,42 +271,51 @@ static void raytrace_tile(uniform int x0, uniform int x1,
 export void raytrace_ispc(uniform int width, uniform int height,
                          uniform int baseWidth, uniform int baseHeight,
                          const uniform float raster2camera[4][4], 
                          const uniform float camera2world[4][4],
                          uniform float image[], uniform int id[],
                          const LinearBVHNode nodes[],
                          const Triangle triangles[]) {
-    raytrace_tile(0, width, 0, height, width, raster2camera, camera2world, image,
+    raytrace_tile(0, width, 0, height, width, height, baseWidth, baseHeight,
                  raster2camera, camera2world, image,
                  id, nodes, triangles);
 }
-task void raytrace_tile_task(uniform int x0, uniform int x1,
+task void raytrace_tile_task(uniform int y0, uniform int y1, 
-                             uniform int y0, uniform int y1, uniform int width,
+                             uniform int width, uniform int height,
                             uniform int baseWidth, uniform int baseHeight,
                             const uniform float raster2camera[4][4], 
                             const uniform float camera2world[4][4],
                             uniform float image[], uniform int id[],
                             const LinearBVHNode nodes[],
                             const Triangle triangles[]) {
-    raytrace_tile(x0, x1, y0, y1, width, raster2camera, camera2world, image,
+    uniform int dx = 16; // must match dx below
    uniform int xTasks = (width + (dx-1)) / dx;
    uniform int x0 = (taskIndex % xTasks) * dx;
    uniform int x1 = x0 + dx;
    x1 = min(x1, width);
    raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight, 
                  raster2camera, camera2world, image,
                  id, nodes, triangles);
 }
 export void raytrace_ispc_tasks(uniform int width, uniform int height,
                                uniform int baseWidth, uniform int baseHeight,
                                const uniform float raster2camera[4][4], 
                                const uniform float camera2world[4][4],
                                uniform float image[], uniform int id[],
                                const LinearBVHNode nodes[],
                                const Triangle triangles[]) {
    uniform int dx = 16, dy = 16;
    uniform int nTasks = (width + (dx-1)) / dx;
    for (uniform int y = 0; y < height; y += dy) {
        uniform int y1 = min(y + dy, height);
-        for (uniform int x = 0; x < width; x += dx) {
+        launch[nTasks] < raytrace_tile_task(y, y1, width, height, baseWidth,
-            uniform int x1 = min(x + dx, width);
+                                            baseHeight, raster2camera, camera2world, 
-            launch < raytrace_tile_task(x, x1, y, y1, width, raster2camera, 
+                                            image, id, nodes, triangles) >;
                                        camera2world, image, id, nodes,
                                        triangles) >;
         }
    }
 }
--- a/examples/rt/rt.vcxproj
+++ b/examples/rt/rt.vcxproj
@@ -164,7 +164,7 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
  <ItemGroup>
    <ClCompile Include="rt.cpp" />
    <ClCompile Include="rt_serial.cpp" />
-    <ClCompile Include="../tasks_concrt.cpp" />
+    <ClCompile Include="../tasksys.cpp" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
--- a/examples/rt/rt_serial.cpp
+++ b/examples/rt/rt_serial.cpp
@@ -258,17 +258,21 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
 }
-void raytrace_serial(int width, int height,
+void raytrace_serial(int width, int height, int baseWidth, int baseHeight,
                     const float raster2camera[4][4], 
                     const float camera2world[4][4],
                     float image[],
                     int id[],
                     const LinearBVHNode nodes[],
                     const Triangle triangles[]) {
    float widthScale = float(baseWidth) / float(width);
    float heightScale = float(baseHeight) / float(height);
    for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x) {
                Ray ray;
-                generateRay(raster2camera, camera2world, x, y, ray);
+                generateRay(raster2camera, camera2world, x * widthScale,
                            y * heightScale, ray);
                BVHIntersect(nodes, triangles, ray);
                int offset = y * width + x;
--- a/examples/simple/simple.vcxproj
+++ b/examples/simple/simple.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -28,7 +28,7 @@
 ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
 </Command>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-ispc -O2 %(Filename).ispco %(Filename).obj -h %(Filename)_ispc.h
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
 </Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
--- a/examples/stencil/Makefile
+++ b/examples/stencil/Makefile
@@ -1,14 +1,8 @@
 ARCH = $(shell uname)
-TASK_CXX=../tasks_pthreads.cpp
+TASK_CXX=../tasksys.cpp
 TASK_LIB=-lpthread
 ifeq ($(ARCH), Darwin)
  TASK_CXX=../tasks_gcd.cpp
  TASK_LIB=
 endif
 TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
 CXX=g++
--- a/examples/stencil/stencil.cpp
+++ b/examples/stencil/stencil.cpp
@@ -116,20 +116,38 @@ int main() {
    InitData(Nx, Ny, Nz, Aispc, vsq);
    //
-    // Compute the image using the ispc implementation; report the minimum
+    // Compute the image using the ispc implementation on one core; report
-    // time of three runs.
+    // the minimum time of three runs.
    //
-    double minISPC = 1e30;
+    double minTimeISPC = 1e30;
    for (int i = 0; i < 3; ++i) {
        reset_and_start_timer();
        loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
                          width, Nz - width, Nx, Ny, Nz, coeff, vsq,
                          Aispc[0], Aispc[1]);
        double dt = get_elapsed_mcycles();
-        minISPC = std::min(minISPC, dt);
+        minTimeISPC = std::min(minTimeISPC, dt);
    }
-    printf("[stencil ispc]:\t\t\t[%.3f] million cycles\n", minISPC);
+    printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC);
    InitData(Nx, Ny, Nz, Aispc, vsq);
    //
    // Compute the image using the ispc implementation with tasks; report
    // the minimum time of three runs.
    //
    double minTimeISPCTasks = 1e30;
    for (int i = 0; i < 3; ++i) {
        reset_and_start_timer();
        loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
                                width, Nz - width, Nx, Ny, Nz, coeff, vsq,
                                Aispc[0], Aispc[1]);
        double dt = get_elapsed_mcycles();
        minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
    }
    printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks);
    InitData(Nx, Ny, Nz, Aserial, vsq);
@@ -137,19 +155,20 @@ int main() {
    // And run the serial implementation 3 times, again reporting the
    // minimum time.
    //
-    double minSerial = 1e30;
+    double minTimeSerial = 1e30;
    for (int i = 0; i < 3; ++i) {
        reset_and_start_timer();
        loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width,
                            width, Nz - width, Nx, Ny, Nz, coeff, vsq,
                            Aserial[0], Aserial[1]);
        double dt = get_elapsed_mcycles();
-        minSerial = std::min(minSerial, dt);
+        minTimeSerial = std::min(minTimeSerial, dt);
    }
-    printf("[stencil serial]:\t\t[%.3f] millon cycles\n", minSerial);
+    printf("[stencil serial]:\t\t[%.3f] millon cycles\n", minTimeSerial);
-    printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC);
+    printf("\t\t\t\t(%.2fx speedup from ISPC, %.2f from ISPC + tasks)\n", 
           minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
    // Check for agreement
    int offset = 0;
--- a/examples/stencil/stencil.ispc
+++ b/examples/stencil/stencil.ispc
@@ -32,7 +32,7 @@
 */
-static task void
+static void
 stencil_step(uniform int x0, uniform int x1,
             uniform int y0, uniform int y1,
             uniform int z0, uniform int z1,
@@ -67,7 +67,19 @@ stencil_step(uniform int x0, uniform int x1,
 }
-export void loop_stencil_ispc(uniform int t0, uniform int t1, 
+static task void
 stencil_step_task(uniform int x0, uniform int x1,
                   uniform int y0, uniform int y1,
                   uniform int z0, uniform int z1,
                   uniform int Nx, uniform int Ny, uniform int Nz,
                   uniform const float coef[4], uniform const float vsq[],
                   uniform const float Ain[], uniform float Aout[]) {
    // Task entry point that exists only so stencil_step can be used with
    // 'launch': every argument is forwarded unchanged, in the same order,
    // to the plain (non-task) stencil_step function.
    stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, Ain, Aout);
 }
 export void
 loop_stencil_ispc_tasks(uniform int t0, uniform int t1, 
                        uniform int x0, uniform int x1,
                        uniform int y0, uniform int y1,
                        uniform int z0, uniform int z1,
@@ -83,14 +95,35 @@ export void loop_stencil_ispc(uniform int t0, uniform int t1,
        uniform int dz = 1;
        for (uniform int z = z0; z < z1; z += dz) {
            if ((t & 1) == 0)
-                launch < stencil_step(x0, x1, y0, y1, z, z+dz, Nx, Ny, Nz, coef, vsq, 
+                launch < stencil_step_task(x0, x1, y0, y1, z, z+dz, Nx, Ny, Nz, 
-                                      Aeven, Aodd) >;
+                                           coef, vsq, Aeven, Aodd) >;
            else
-                launch < stencil_step(x0, x1, y0, y1, z, z+dz, Nx, Ny, Nz, coef, vsq, 
+                launch < stencil_step_task(x0, x1, y0, y1, z, z+dz, Nx, Ny, Nz, 
-                                      Aodd, Aeven) >;
+                                           coef, vsq, Aodd, Aeven) >;
        }
        // We need to wait for all of the launched tasks to finish before
        // starting the next iteration.
        sync;
    }
 }
 export void
 loop_stencil_ispc(uniform int t0, uniform int t1, 
                  uniform int x0, uniform int x1,
                  uniform int y0, uniform int y1,
                  uniform int z0, uniform int z1,
                  uniform int Nx, uniform int Ny, uniform int Nz,
                  uniform const float coef[4], 
                  uniform const float vsq[],
                  uniform float Aeven[], uniform float Aodd[])
 {
    // Task-free driver: runs time steps t0..t1-1 one after another on the
    // calling thread, each covering the full [z0,z1) range.  The two
    // buffers swap positions in the stencil_step() call on every step
    // (presumably input then output, matching stencil_step_task's
    // Ain/Aout order).
    for (uniform int t = t0; t < t1; ++t) {
        if ((t & 1) != 0) {
            // Odd step: Aodd is passed first, Aeven second
            stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
                         Aodd, Aeven);
        }
        else {
            // Even step: Aeven is passed first, Aodd second
            stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
                         Aeven, Aodd);
        }
    }
 }
--- a/examples/stencil/stencil.vcxproj
+++ b/examples/stencil/stencil.vcxproj
@@ -164,7 +164,7 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
  <ItemGroup>
    <ClCompile Include="stencil.cpp" />
    <ClCompile Include="stencil_serial.cpp" />
-    <ClCompile Include="../tasks_concrt.cpp" />
+    <ClCompile Include="../tasksys.cpp" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
--- a/examples/taskinfo.h
+++ b/examples/taskinfo.h
@@ -1,180 +0,0 @@
 /*
  Copyright (c) 2011, Intel Corporation
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */
 #ifndef TASKINFO_H
 #define TASKINFO_H 1
 // Map predefined compiler/OS macros onto a single ISPC_IS_* platform flag.
 // Windows is keyed off the MSVC compiler macro (_MSC_VER); if none of the
 // three cases matches, no ISPC_IS_* flag is defined at all.
 #ifdef _MSC_VER
 #define ISPC_IS_WINDOWS
 #elif defined(__linux__)
 #define ISPC_IS_LINUX
 #elif defined(__APPLE__)
 #define ISPC_IS_APPLE
 #endif
 #ifdef ISPC_IS_WINDOWS
 #define NOMINMAX
 #include <windows.h>
 #include <concrt.h>
 using namespace Concurrency;
 #endif // ISPC_IS_WINDOWS
 // ISPC_POINTER_BYTES: size of a pointer on the target, in bytes.
 //
 // Prefer the compiler-provided __SIZEOF_POINTER__ when it exists; it is
 // authoritative.  Otherwise fall back to architecture macros, testing the
 // 64-bit ones FIRST: _WIN32 is defined for both 32- and 64-bit Windows
 // targets, so checking it before _WIN64 would misreport 64-bit Windows
 // as having 4-byte pointers.
 #if defined(__SIZEOF_POINTER__)
  #if (__SIZEOF_POINTER__ == 8)
   #define ISPC_POINTER_BYTES 8
  #elif (__SIZEOF_POINTER__ == 4)
   #define ISPC_POINTER_BYTES 4
  #else
   #error "Pointer size unknown!"
  #endif
 #elif defined(_WIN64) || defined(__x86_64__) || defined(__amd64__)
  #define ISPC_POINTER_BYTES 8
 #elif defined(_WIN32) || defined(__i386__)
  #define ISPC_POINTER_BYTES 4
 #else
  #error "Pointer size unknown!"
 #endif // __SIZEOF_POINTER__
 #include <stdint.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <assert.h>
 // Per-task record stored in the task queue: what to call and with what
 // argument.
 typedef struct TaskInfo {
    void *func;   // task entry point; cast to TaskFuncType before it is called
    void *data;   // opaque argument handed to func as its first parameter
 #if defined(ISPC_IS_WINDOWS)
    event taskEvent;   // set by the ConcRT task runner on completion; waited on in ISPCSync()
 #endif
 } TaskInfo;
 #ifndef ISPC_IS_WINDOWS
 // Atomic 32-bit compare-and-swap built from x86 "lock cmpxchgl" (GCC
 // inline assembly).  oldValue is loaded into eax (the "0" constraint ties
 // it to the "=a" output); if *v equals it, newValue is stored to *v.
 // Returns the contents of eax after the instruction, i.e. the value *v
 // held before the operation -- equal to oldValue exactly when the swap
 // happened.  An mfence is issued before returning.
 static int32_t 
 lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) {
    int32_t result;
    __asm__ __volatile__("lock\ncmpxchgl %2,%1"
                          : "=a"(result), "=m"(*v)
                          : "q"(newValue), "0"(oldValue)
                          : "memory");
    // NOTE(review): *v is declared as an output-only ("=m") operand although
    // cmpxchg also reads it; the "memory" clobber appears to be what keeps
    // this safe -- confirm.
    __asm__ __volatile__("mfence":::"memory");
    return result;
 }
 #endif // !ISPC_IS_WINDOWS
 // Atomic pointer-sized compare-and-swap.
 //
 // If *v == oldValue, stores newValue into *v.  Always returns the value
 // that *v held before the operation, so the swap succeeded exactly when
 // the return value equals oldValue.
 //
 // The non-Windows path uses the compiler's __sync_val_compare_and_swap
 // builtin (a full memory barrier) rather than hand-written assembly: the
 // previous 32-bit variant used "cmpxchgd", which is not a valid x86
 // mnemonic, and the builtin also picks the right operand width itself.
 static void *
 lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
 #ifdef ISPC_IS_WINDOWS
    return InterlockedCompareExchangePointer(v, newValue, oldValue);
 #else
    // Note the builtin's argument order: (ptr, expected, desired).
    return __sync_val_compare_and_swap(v, oldValue, newValue);
 #endif // ISPC_IS_WINDOWS
 }
 #ifndef ISPC_IS_WINDOWS
 // Atomic 32-bit fetch-and-add using x86 "lock xaddl": adds delta to *v
 // and returns the value *v held BEFORE the addition (xadd leaves the old
 // memory value in the register operand, which is bound to origValue).
 static int32_t 
 lAtomicAdd32(volatile int32_t *v, int32_t delta) {
    // Do atomic add with gcc x86 inline assembly
    int32_t origValue;
    __asm__ __volatile__("lock\n"
                         "xaddl %0,%1"
                         : "=r"(origValue), "=m"(*v) : "0"(delta)
                         : "memory");
    return origValue;
 }
 #endif
 // The task queue is a table of up to MAX_TASK_QUEUE_CHUNKS chunks, each
 // holding TASK_QUEUE_CHUNK_SIZE (= 2^LOG_TASK_QUEUE_CHUNK_SIZE) TaskInfo
 // entries; MAX_LAUNCHED_TASKS is the resulting total capacity.
 #define LOG_TASK_QUEUE_CHUNK_SIZE 13
 #define MAX_TASK_QUEUE_CHUNKS 1024
 #define TASK_QUEUE_CHUNK_SIZE (1<<LOG_TASK_QUEUE_CHUNK_SIZE)
 #define MAX_LAUNCHED_TASKS (MAX_TASK_QUEUE_CHUNKS * TASK_QUEUE_CHUNK_SIZE)
 // Signature TaskInfo::func is cast to when a task runs; the callers in
 // this code base pass (data, threadIndex, threadCount).
 typedef void (*TaskFuncType)(void *, int, int);
 // Index of the next unused TaskInfo slot: incremented atomically by
 // lGetTaskInfo() and set back to zero by lResetTaskInfo().
 #ifdef ISPC_IS_WINDOWS
 static volatile LONG nextTaskInfoCoordinate;
 #else
 static volatile int nextTaskInfoCoordinate;
 #endif
 // Chunk table; entries are filled in by lInitTaskInfo() / lGetTaskInfo().
 static TaskInfo *taskInfo[MAX_TASK_QUEUE_CHUNKS];
 static inline void
 lInitTaskInfo() {
    taskInfo[0] = new TaskInfo[TASK_QUEUE_CHUNK_SIZE];
 }
 // Reserves the next free TaskInfo slot and returns a pointer to it.
 //
 // The slot number is claimed with an atomic increment of
 // nextTaskInfoCoordinate, then split into a chunk index (high bits) and
 // an offset within the chunk (low LOG_TASK_QUEUE_CHUNK_SIZE bits).  The
 // chunk is allocated on first use; when two threads race to allocate the
 // same chunk, the compare-and-swap picks one winner and the loser
 // releases its allocation.  Exits the process if the queue is full.
 static inline TaskInfo *
 lGetTaskInfo() {
 #ifdef ISPC_IS_WINDOWS
    // InterlockedAdd returns the NEW value; subtract one to get our slot.
    int myCoord = InterlockedAdd(&nextTaskInfoCoordinate, 1)-1;
 #else
    // lAtomicAdd32 returns the value before the add, i.e. our slot.
    int myCoord = lAtomicAdd32(&nextTaskInfoCoordinate, 1);
 #endif
    int index = (myCoord >> LOG_TASK_QUEUE_CHUNK_SIZE);
    int offset = myCoord & (TASK_QUEUE_CHUNK_SIZE-1);
    // Use >= rather than ==: several threads can pass the limit before any
    // of them exits, and each of them must be stopped before indexing past
    // the end of taskInfo[].
    if (index >= MAX_TASK_QUEUE_CHUNKS) {
        fprintf(stderr, "A total of %d tasks have been launched--the simple "
                "built-in task system can handle no more. Exiting.", myCoord);
        exit(1);
    }
    if (taskInfo[index] == NULL) {
        TaskInfo *newChunk = new TaskInfo[TASK_QUEUE_CHUNK_SIZE];
        if (lAtomicCompareAndSwapPointer((void **)&taskInfo[index], newChunk, 
                                         NULL) != NULL) {
            // failure--someone else got it, but that's cool
            assert(taskInfo[index] != NULL);
            // The chunk came from new[], so it must be released with
            // delete[] (free() would be undefined behavior and would skip
            // the TaskInfo destructors).
            delete[] newChunk;
        }
    }
    return &taskInfo[index][offset];
 }
 // Marks the task queue as empty by rewinding the next-slot counter to
 // zero.  The TaskInfo chunks themselves are left allocated.
 static inline void
 lResetTaskInfo() {
    nextTaskInfoCoordinate = 0;
 }
 #endif // TASKINFO_H
--- a/examples/tasks_concrt.cpp
+++ b/examples/tasks_concrt.cpp
@@ -1,104 +0,0 @@
 /*
  Copyright (c) 2011, Intel Corporation
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */
 #include "taskinfo.h"
 /* Simple task system implementation for ispc based on Microsoft's
   Concurrency Runtime. */
 #include <windows.h>
 #include <concrt.h>
 using namespace Concurrency;
 #include <stdint.h>
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <algorithm>
 // ispc expects these functions to have C linkage / not be mangled
 extern "C" { 
    void ISPCLaunch(void *f, void *data);
    void ISPCSync();
    void *ISPCMalloc(int64_t size, int32_t alignment);
    void ISPCFree(void *ptr);
 }
 // Entry point handed to the ConcRT scheduler: runs the task described by
 // the TaskInfo passed in 'param' and then signals its completion event.
 void __cdecl
 lRunTask(LPVOID param) {
    TaskInfo *task = (TaskInfo *)param;
    TaskFuncType entry = (TaskFuncType)task->func;
    // FIXME: like the GCD implementation for OS X, this is passing bogus
    // values for the threadIndex and threadCount builtins, which in turn
    // will cause bugs in code that uses those.
    const int threadIndex = 0;
    const int threadCount = 1;
    // Run the task itself...
    entry(task->data, threadIndex, threadCount);
    // ...and let ISPCSync() know that it has finished.
    task->taskEvent.set();
 }
 void
 ISPCLaunch(void *func, void *data) {
    TaskInfo *ti = lGetTaskInfo();
    ti->func = (TaskFuncType)func;
    ti->data = data;
 	ti->taskEvent.reset();
    CurrentScheduler::ScheduleTask(lRunTask, ti);
 }
 void ISPCSync() {
    for (int i = 0; i < nextTaskInfoCoordinate; ++i) {
 		int index = (i >> LOG_TASK_QUEUE_CHUNK_SIZE);
 		int offset = i & (TASK_QUEUE_CHUNK_SIZE-1);
 		taskInfo[index][offset].taskEvent.wait();
 		taskInfo[index][offset].taskEvent.reset();
    }
    lResetTaskInfo();
 }
 // Allocation entry point: returns 'size' bytes aligned to 'alignment',
 // obtained from _aligned_malloc().  Release with ISPCFree().
 void *ISPCMalloc(int64_t size, int32_t alignment) {
    return _aligned_malloc(size, alignment);
 }
 // Releases memory obtained from ISPCMalloc() (via _aligned_free()).
 void ISPCFree(void *ptr) {
    _aligned_free(ptr);
 }
--- a/examples/tasks_gcd.cpp
+++ b/examples/tasks_gcd.cpp
@@ -1,99 +0,0 @@
 /*
  Copyright (c) 2011, Intel Corporation
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */
 #include "taskinfo.h"
 /* A simple task system for ispc programs based on Apple's Grand Central
   Dispatch. */
 #include <dispatch/dispatch.h>
 #include <stdio.h>
 static int initialized = 0;
 static volatile int32_t lock = 0;
 static dispatch_queue_t gcdQueue;
 static dispatch_group_t gcdGroup;
 // ispc expects these functions to have C linkage / not be mangled
 extern "C" { 
    void ISPCLaunch(void *f, void *data);
    void ISPCSync();
 }
 static void
 lRunTask(void *ti) {
    TaskInfo *taskInfo = (TaskInfo *)ti;
    // FIXME: these are bogus values; may cause bugs in code that depends
    // on them having unique values in different threads.
    int threadIndex = 0;
    int threadCount = 1;
    TaskFuncType func = (TaskFuncType)(taskInfo->func);
    // Actually run the task
    func(taskInfo->data, threadIndex, threadCount);
 }
 // Launch entry point (GCD flavor): performs one-time initialization if
 // needed, then records the task in a TaskInfo slot and submits it
 // asynchronously to gcdQueue as a member of gcdGroup.
 void ISPCLaunch(void *func, void *data) {
    if (!initialized) {
        // One-time setup guarded by a spin lock: 'lock' is taken by
        // atomically swapping 0 -> 1, and 'initialized' is re-checked once
        // the lock is held so that only one caller runs the setup.
        while (1) {
            if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {
                if (!initialized) {
                    gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
                    gcdGroup = dispatch_group_create();
                    lInitTaskInfo();
                    // Fence so the setup above is issued before the flag
                    // is set.
                    __asm__ __volatile__("mfence":::"memory");
                    initialized = 1;
                }
                // Release the spin lock
                lock = 0;
                break;
            }
        }
    }
    TaskInfo *ti = lGetTaskInfo();
    ti->func = func;
    ti->data = data;
    dispatch_group_async_f(gcdGroup, gcdQueue, ti, lRunTask);
 }
 // Sync entry point (GCD flavor).  Does nothing if no task has ever been
 // launched; otherwise blocks until the dispatch group has drained and
 // then empties the TaskInfo queue.
 void ISPCSync() {
    if (initialized) {
        // Wait for all of the tasks in the group to complete
        dispatch_group_wait(gcdGroup, DISPATCH_TIME_FOREVER);
        lResetTaskInfo();
    }
 }
--- a/examples/tasks_pthreads.cpp
+++ b/examples/tasks_pthreads.cpp
@@ -1,294 +0,0 @@
 /*
  Copyright (c) 2011, Intel Corporation
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */
 #include "taskinfo.h"
 #include <pthread.h>
 #include <semaphore.h>
 #include <string.h>
 #include <unistd.h>
 #include <assert.h>
 #include <stdio.h>
 #include <fcntl.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/param.h>
 #include <sys/sysctl.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <errno.h>
 static int initialized = 0;
 static volatile int32_t lock = 0;
 static int nThreads;
 static pthread_t *threads;
 static pthread_mutex_t taskQueueMutex;
 static int nextTaskToRun;
 static sem_t *workerSemaphore;
 static uint32_t numUnfinishedTasks;
 static pthread_mutex_t tasksRunningConditionMutex;
 static pthread_cond_t tasksRunningCondition;
 // ispc expects these functions to have C linkage / not be mangled
 extern "C" { 
    void ISPCLaunch(void *f, void *data);
    void ISPCSync();
 }
 static void *lTaskEntry(void *arg);
 /** Returns the number of processors currently online, as reported by
     sysconf(_SC_NPROCESSORS_ONLN).
 */
 static int
 lNumCPUCores() {
    const long online = sysconf(_SC_NPROCESSORS_ONLN);
    return online;
 }
 // One-time setup of the pthreads task system: creates the task-queue
 // mutex, the named worker semaphore, the "tasks running" condition
 // variable and its mutex, and finally one worker thread per CPU core.
 // Any failure prints a message to stderr and exits the process.
 static void
 lTasksInit() {
    nThreads = lNumCPUCores();
    // sysconf() can fail and report -1; always run at least one worker.
    if (nThreads < 1)
        nThreads = 1;
    threads = (pthread_t *)malloc(nThreads * sizeof(pthread_t));
    if (threads == NULL) {
        fprintf(stderr, "Error allocating thread array: %s\n", strerror(errno));
        exit(1);
    }
    int err;
    if ((err = pthread_mutex_init(&taskQueueMutex, NULL)) != 0) {
        fprintf(stderr, "Error creating mutex: %s\n", strerror(err));
        exit(1);
    }
    char name[32];
    snprintf(name, sizeof(name), "ispc_task.%d", (int)getpid());
    workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0);
    // sem_open() signals failure with SEM_FAILED (which is not NULL on
    // every platform) and reports the reason through errno, not through a
    // return code.
    if (workerSemaphore == SEM_FAILED) {
        fprintf(stderr, "Error creating semaphore: %s\n", strerror(errno));
        exit(1);
    }
    if ((err = pthread_cond_init(&tasksRunningCondition, NULL)) != 0) {
        fprintf(stderr, "Error creating condition variable: %s\n", strerror(err));
        exit(1);
    }
    if ((err = pthread_mutex_init(&tasksRunningConditionMutex, NULL)) != 0) {
        fprintf(stderr, "Error creating mutex: %s\n", strerror(err));
        exit(1);
    }
    for (int i = 0; i < nThreads; ++i) {
        // The thread index travels in the pointer argument; go through
        // intptr_t so the int <-> pointer conversion is well-defined when
        // pointers are wider than int.
        err = pthread_create(&threads[i], NULL, &lTaskEntry, (void *)(intptr_t)i);
        if (err != 0) {
            fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err));
            exit(1);
        }
    }
 }
 // Launch entry point (pthreads flavor): initializes the task system on
 // first use, appends the task (f, d) to the queue, bumps the count of
 // unfinished tasks and posts the worker semaphore so that a worker
 // thread wakes up to run it.
 void
 ISPCLaunch(void *f, void *d) {
    int err;
    if (!initialized) {
        // One-time setup guarded by a spin lock: 'lock' is taken by
        // atomically swapping 0 -> 1 and 'initialized' is re-checked once
        // the lock is held.
        while (1) {
            if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {
                if (!initialized) {
                    lTasksInit();
                    __asm__ __volatile__("mfence":::"memory");
                    initialized = 1;
                }
                lock = 0;
                break;
            }
        }
    }
    //
    // Acquire mutex, add task
    //
    if ((err = pthread_mutex_lock(&taskQueueMutex)) != 0) {
        fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
        exit(1);
    }
    // Need a mutex here to ensure we get this filled in before a worker
    // grabs it and starts running...
    TaskInfo *ti = lGetTaskInfo();
    ti->func = f;
    ti->data = d;
    if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
        fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
        exit(1);
    }
    //
    // Update count of number of tasks left to run
    //
    if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) {
        fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
        exit(1);
    }
    // FIXME: is this redundant with nextTaskInfoCoordinate?
    ++numUnfinishedTasks;
    if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) {
        fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
        exit(1);
    }
    //
    // Post to the worker semaphore to wake up worker threads that are
    // sleeping waiting for tasks to show up
    //
    // Unlike the pthread_* calls, sem_post() returns -1 on failure and
    // puts the error code in errno.
    if (sem_post(workerSemaphore) != 0) {
        fprintf(stderr, "Error from sem_post: %s\n", strerror(errno));
        exit(1);
    }
 }
 // Body of each worker thread.  'arg' carries the worker's index (packed
 // into the pointer value by lTasksInit()).  The worker loops forever:
 // wait on the semaphore, pop the next task from the queue under
 // taskQueueMutex, run it, then decrement numUnfinishedTasks and signal
 // tasksRunningCondition when the count reaches zero.
 static void *
 lTaskEntry(void *arg) {
    // Recover the index via intptr_t; a direct pointer-to-int cast loses
    // precision (and is rejected by some compilers) on 64-bit targets.
    int threadIndex = (int)(intptr_t)arg;
    int threadCount = nThreads;
    TaskFuncType func;
    while (1) {
        int err;
        // sem_wait() returns -1 and sets errno on failure.  Being
        // interrupted by a signal (EINTR) is not an error; just retry.
        if (sem_wait(workerSemaphore) != 0) {
            if (errno == EINTR)
                continue;
            fprintf(stderr, "Error from sem_wait: %s\n", strerror(errno));
            exit(1);
        }
        //
        // Acquire mutex, get task
        //
        if ((err = pthread_mutex_lock(&taskQueueMutex)) != 0) {
            fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
            exit(1);
        }
        if (nextTaskToRun == nextTaskInfoCoordinate) {
            //
            // Task queue is empty, go back and wait on the semaphore
            //
            if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
                fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
                exit(1);
            }
            continue;
        }
        int runCoord = nextTaskToRun++;
        int index = (runCoord >> LOG_TASK_QUEUE_CHUNK_SIZE);
        int offset = runCoord & (TASK_QUEUE_CHUNK_SIZE-1);
        TaskInfo *myTask = &taskInfo[index][offset];
        if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
            fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
            exit(1);
        }
        //
        // Do work for _myTask_
        //
        func = (TaskFuncType)myTask->func;
        func(myTask->data, threadIndex, threadCount);
        //
        // Decrement the number of unfinished tasks counter
        //
        if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) {
            fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
            exit(1);
        }
        // FIXME: can this be a comparison of (nextTaskToRun == nextTaskInfoCoordinate)?
        // (I don't think so--think there is a race...)
        int unfinished = --numUnfinishedTasks;
        if (unfinished == 0) {
            //
            // Signal the "no more tasks are running" condition if all of
            // them are done.
            //
            if ((err = pthread_cond_signal(&tasksRunningCondition)) != 0) {
                fprintf(stderr, "Error from pthread_cond_signal: %s\n", strerror(err));
                exit(1);
            }
        }
        if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) {
            fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
            exit(1);
        }
    }
    // Not reached: the loop above never terminates.
    pthread_exit(NULL);
    return 0;
 }
 // Sync entry point (pthreads flavor): blocks until numUnfinishedTasks
 // drops to zero, then rewinds the task queue so the slots can be reused.
 void ISPCSync() {
    int err;
    if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) {
        fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
        exit(1);
    }
    // As long as there are tasks running, wait on the condition variable;
    // doing so causes this thread to go to sleep until someone signals on
    // the tasksRunningCondition condition variable.
    while (numUnfinishedTasks > 0) {
        if ((err = pthread_cond_wait(&tasksRunningCondition, 
                                     &tasksRunningConditionMutex)) != 0) {
            fprintf(stderr, "Error from pthread_cond_wait: %s\n", strerror(err));
            exit(1);
        }
    }
    lResetTaskInfo();
    nextTaskToRun = 0;
    // The mutex is held here on every path: it was locked explicitly at
    // the top of this function, and pthread_cond_wait() re-acquires it
    // before returning.  So the unlock below is always matched, including
    // when the loop body never ran because no tasks were outstanding.
    if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) {
        fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
        exit(1);
    }
 }
--- a/examples/tasksys.cpp
+++ b/examples/tasksys.cpp
@@ -0,0 +1,868 @@
 /*
  Copyright (c) 2011, Intel Corporation
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */
 /*
  This file implements simple task systems that provide the three
  entrypoints used by ispc-generated code to handle 'launch' and 'sync'
  statements in ispc programs.  See the section "Task Parallelism: Language
  Syntax" in the ispc documentation for information about using task
  parallelism in ispc programs, and see the section "Task Parallelism:
  Runtime Requirements" for information about the task-related entrypoints
  that are implemented here.
  There are three task systems in this file: one built using Microsoft's
  Concurrency Runtime, one built with Apple's Grand Central Dispatch, and
  one built on top of bare pthreads.
 */
 // Select the platform flag (ISPC_IS_*) and the task system backend
 // (ISPC_USE_*) from the compiler's predefined macros: ConcRT on Windows,
 // pthreads on Linux and OS X.  On any other platform neither kind of
 // flag is defined.
 #if defined(_WIN32) || defined(_WIN64)
  #define ISPC_IS_WINDOWS
  #define ISPC_USE_CONCRT
 #elif defined(__linux__)
  #define ISPC_IS_LINUX
  #define ISPC_USE_PTHREADS
 #elif defined(__APPLE__)
  #define ISPC_IS_APPLE
  // pthreads is noticeably more efficient than GCD on OSX
  #define ISPC_USE_PTHREADS
  //#define ISPC_USE_GCD
 #endif
 #define DBG(x) 
 #ifdef ISPC_IS_WINDOWS
  #define NOMINMAX
  #include <windows.h>
 #endif // ISPC_IS_WINDOWS
 #ifdef ISPC_USE_CONCRT
  #include <concrt.h>
  using namespace Concurrency;
 #endif // ISPC_USE_CONCRT
 #ifdef ISPC_USE_GCD
  #include <dispatch/dispatch.h>
  #include <pthread.h>
 #endif // ISPC_USE_GCD
 #ifdef ISPC_USE_PTHREADS
  #include <pthread.h>
  #include <semaphore.h>
  #include <unistd.h>
  #include <fcntl.h>
  #include <errno.h>
  #include <sys/types.h>
  #include <sys/stat.h>
  #include <sys/param.h>
  #include <sys/sysctl.h>
  #include <vector>
  #include <algorithm>
 #endif // ISPC_USE_PTHREADS
 #ifdef ISPC_IS_LINUX
  #include <malloc.h>
 #endif // ISPC_IS_LINUX
 #include <stdio.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <assert.h>
 #include <string.h>
 #include <algorithm>
 // Signature of ispc-generated 'task' functions
 typedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount,
                             int taskIndex, int taskCount);
 // Small structure used to hold the data for each task
 struct TaskInfo {
    TaskFuncType func;          // task entry point
    void *data;                 // opaque argument block; presumably passed to func as 'data'
    int taskIndex, taskCount;   // presumably this task's index and the size of its launch -- confirm against Launch()
 #if defined(ISPC_IS_WINDOWS)
    event taskEvent;            // ConcRT event object (Windows builds only)
 #endif
 };
 ///////////////////////////////////////////////////////////////////////////
 // TaskGroupBase
 // TaskInfo storage is split into up to MAX_TASK_QUEUE_CHUNKS chunks of
 // TASK_QUEUE_CHUNK_SIZE (= 2^LOG_TASK_QUEUE_CHUNK_SIZE) entries each;
 // MAX_LAUNCHED_TASKS is the resulting per-group capacity.
 #define LOG_TASK_QUEUE_CHUNK_SIZE 12
 #define MAX_TASK_QUEUE_CHUNKS 8
 #define TASK_QUEUE_CHUNK_SIZE (1<<LOG_TASK_QUEUE_CHUNK_SIZE)
 #define MAX_LAUNCHED_TASKS (MAX_TASK_QUEUE_CHUNKS * TASK_QUEUE_CHUNK_SIZE)
 // Number of slots in TaskGroupBase::memBuffers[] (slot 0 is the inline
 // buffer, the rest are heap allocations made on demand).
 #define NUM_MEM_BUFFERS 16
 class TaskGroup;
 /** The TaskGroupBase structure provides common functionality for "task
    groups"; a task group is the set of tasks launched from within a single
    ispc function.  When the function is ready to return, it waits for all
    of the tasks in its task group to finish before it actually returns.

    It owns two resources: a chunked table of TaskInfo records and a small
    bump allocator that serves memory requests out of a list of buffers.
 */
 class TaskGroupBase {
 public:
    /** Rewinds the TaskInfo index and the allocator position to their
        starting values; nothing is deallocated. */
    void Reset();
    /** Reserves 'count' consecutive TaskInfo indices and returns the
        first one. */
    int AllocTaskInfo(int count);
    /** Returns the TaskInfo for 'index', allocating its chunk on first
        use. */
    TaskInfo *GetTaskInfo(int index);
    /** Returns 'size' bytes aligned to 'alignment' from the group's
        buffers. */
    void *AllocMemory(int64_t size, int32_t alignment);
 protected:
    // Construction and destruction are reserved for derived classes.
    TaskGroupBase();
    ~TaskGroupBase();
    // Next TaskInfo index AllocTaskInfo() will hand out.
    int nextTaskInfoIndex;
 private:
    /* We allocate blocks of TASK_QUEUE_CHUNK_SIZE TaskInfo structures as
       needed by the calling function.  We hold up to MAX_TASK_QUEUE_CHUNKS
       of these (and then exit at runtime if more than this many tasks are
       launched.)
     */
    TaskInfo *taskInfo[MAX_TASK_QUEUE_CHUNKS];
    /* We also allocate chunks of memory to service ISPCAlloc() calls.  The
       memBuffers[] array holds pointers to this memory.  The first element
       of this array is initialized to point to mem and then any subsequent
       elements required are initialized with dynamic allocation.
     */
    // Buffer currently being carved up, and the byte offset of its first
    // free byte.
    int curMemBuffer, curMemBufferOffset;
    // Capacity in bytes of each entry of memBuffers[].
    int memBufferSize[NUM_MEM_BUFFERS];
    char *memBuffers[NUM_MEM_BUFFERS];
    // Inline storage backing memBuffers[0].
    char mem[256];
 };
 /** Sets up an empty task group: no TaskInfo chunks yet, and an allocator
    whose only buffer is the inline 'mem' array. */
 inline TaskGroupBase::TaskGroupBase() { 
    // TaskInfo chunks are allocated on first use by GetTaskInfo()
    for (int chunk = 0; chunk < MAX_TASK_QUEUE_CHUNKS; ++chunk)
        taskInfo[chunk] = NULL;

    // Slot 0 aliases the inline array; the remaining slots start out
    // unallocated.
    memBuffers[0] = mem;
    memBufferSize[0] = sizeof(mem) / sizeof(mem[0]);
    int buf = 1;
    while (buf < NUM_MEM_BUFFERS) {
        memBuffers[buf] = NULL;
        memBufferSize[buf] = 0;
        ++buf;
    }

    // Both cursors start at the beginning.
    curMemBufferOffset = 0;
    curMemBuffer = 0; 
    nextTaskInfoIndex = 0; 
 }
 /** Releases everything the group allocated on the heap: the overflow
    memory buffers and the TaskInfo chunks. */
 inline TaskGroupBase::~TaskGroupBase() {
    // Note: don't delete memBuffers[0], since it points to the start of
    // the "mem" member!
    for (int i = 1; i < NUM_MEM_BUFFERS; ++i)
        delete[] memBuffers[i];
    // The TaskInfo chunks are created with new[] in GetTaskInfo() and
    // would otherwise be leaked.  Unused slots are NULL, which delete[]
    // accepts.
    for (int i = 0; i < MAX_TASK_QUEUE_CHUNKS; ++i)
        delete[] taskInfo[i];
 }
 /** Returns the group to its just-constructed cursor state so it can be
    reused.  Only the three cursors are rewound; no memory is freed. */
 inline void
 TaskGroupBase::Reset() {
    // Allocator: back to the start of buffer 0
    curMemBufferOffset = 0;
    curMemBuffer = 0; 
    // TaskInfo table: next launch starts again at index 0
    nextTaskInfoIndex = 0; 
 }
 /** Reserves 'count' consecutive TaskInfo indices.
    @return the first index of the reserved range. */
 inline int
 TaskGroupBase::AllocTaskInfo(int count) {
    const int firstIndex = nextTaskInfoIndex;
    nextTaskInfoIndex = firstIndex + count;
    return firstIndex;
 }
 /** Returns a pointer to the TaskInfo record for 'index'.

    The index is split into a chunk number (high bits) and an offset
    within that chunk (low LOG_TASK_QUEUE_CHUNK_SIZE bits); the chunk is
    allocated the first time any of its entries is requested.  If the index
    lies outside the table, an error is printed and the process exits.
 */
 inline TaskInfo *
 TaskGroupBase::GetTaskInfo(int index) {
    int chunk = (index >> LOG_TASK_QUEUE_CHUNK_SIZE);
    int offset = index & (TASK_QUEUE_CHUNK_SIZE-1);
    // Range-check with >= (and reject negatives): AllocTaskInfo() advances
    // by an arbitrary count, so an index can land more than one chunk past
    // the end, which an equality test would let through.
    if (index < 0 || chunk >= MAX_TASK_QUEUE_CHUNKS) {
        fprintf(stderr, "A total of %d tasks have been launched from the "
                "current function--the simple built-in task system can handle "
                "no more. You can increase the values of MAX_TASK_QUEUE_CHUNKS "
                "and LOG_TASK_QUEUE_CHUNK_SIZE to work around this limitation.  "
                "Sorry!  Exiting.\n", index);
        exit(1);
    }
    if (taskInfo[chunk] == NULL)
        taskInfo[chunk] = new TaskInfo[TASK_QUEUE_CHUNK_SIZE];
    return &taskInfo[chunk][offset];
 }
 /** Bump-allocates 'size' bytes aligned to 'alignment' (assumed to be a
    power of two, since it is used as a bit mask) from the group's buffers.

    The request is first tried against the current buffer.  If it does not
    fit, the allocator moves on to the next buffer slot.  A buffer left in
    that slot by an earlier cycle (i.e. from before the last Reset()) is
    reused when it is large enough, and freed and replaced otherwise; the
    original code overwrote the pointer and leaked the old buffer.  Running
    out of buffer slots prints an error and exits instead of relying on an
    assert that disappears in release builds.

    @return pointer to the allocated storage (never NULL).
 */
 inline void *
 TaskGroupBase::AllocMemory(int64_t size, int32_t alignment) {
    for (;;) {
        char *basePtr = memBuffers[curMemBuffer];
        // Do the rounding in uintptr_t so pointers are never squeezed
        // through a signed or narrower integer type.
        uintptr_t iptr = (uintptr_t)(basePtr + curMemBufferOffset);
        iptr = (iptr + (alignment-1)) & ~(uintptr_t)(alignment-1);
        // Offset of the first byte past this allocation; kept in 64 bits
        // so that a large 'size' cannot wrap before the comparison.
        int64_t newOffset = (int64_t)(iptr - (uintptr_t)basePtr) + size;
        if (newOffset < memBufferSize[curMemBuffer]) {
            curMemBufferOffset = int(newOffset);
            return (char *)iptr;
        }

        // Doesn't fit: advance to the next buffer slot.
        if (curMemBuffer + 1 >= NUM_MEM_BUFFERS) {
            fprintf(stderr, "Task group memory allocator is out of buffers "
                    "(NUM_MEM_BUFFERS = %d).  Exiting.\n", NUM_MEM_BUFFERS);
            exit(1);
        }
        ++curMemBuffer;
        curMemBufferOffset = 0;

        // size+alignment bytes always suffice, whatever the base address.
        if (memBuffers[curMemBuffer] == NULL ||
            memBufferSize[curMemBuffer] < size + alignment) {
            int allocSize = 1 << (12 + curMemBuffer);
            allocSize = std::max(int(size+alignment), allocSize);
            // Slots >= 1 only ever hold new[]'d memory (slot 0 is the
            // inline array and is never reached here).
            delete[] memBuffers[curMemBuffer];
            memBuffers[curMemBuffer] = NULL;
            memBufferSize[curMemBuffer] = 0;
            memBuffers[curMemBuffer] = new char[allocSize];
            memBufferSize[curMemBuffer] = allocSize;
        }
        // Loop around and retry against the (new or reused) buffer.
    }
 }
 ///////////////////////////////////////////////////////////////////////////
 // Atomics and the like
 #ifndef ISPC_IS_WINDOWS
 // Issues an x86 "mfence" instruction.  The "memory" clobber on the asm
 // statement additionally tells the compiler that memory may have changed,
 // so it will not cache values in registers or reorder memory accesses
 // across this call.  Only compiled for non-Windows builds.
 static inline void
 lMemFence() {
    __asm__ __volatile__("mfence":::"memory");
 }
 #endif // !ISPC_IS_WINDOWS
 // ISPC_POINTER_BYTES is the size of a pointer, in bytes, on the target.
 // Note that _WIN32 is defined by the compiler for both 32-bit and 64-bit
 // Windows targets, so it only indicates a 4-byte pointer when _WIN64 is
 // not also defined.
 #if (__SIZEOF_POINTER__ == 4) || defined(__i386__) || (defined(_WIN32) && !defined(_WIN64))
 #define ISPC_POINTER_BYTES 4
 #elif (__SIZEOF_POINTER__ == 8) || defined(__x86_64__) || defined(__amd64__) || defined(_WIN64)
 #define ISPC_POINTER_BYTES 8
 #else
 #error "Pointer size unknown!"
 #endif // __SIZEOF_POINTER__
 // Atomically compares *v with oldValue and, if they are equal, stores
 // newValue into *v.  Returns the value that *v held before the operation,
 // so the swap happened if and only if the return value equals oldValue.
 static void *
 lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
 #ifdef ISPC_IS_WINDOWS
    return InterlockedCompareExchangePointer(v, newValue, oldValue);
 #else
    // Use the compiler's atomic builtin rather than hand-written inline
    // assembly: the previous 32-bit path used "cmpxchgd", which is not a
    // valid AT&T mnemonic, and both paths declared *v as a write-only
    // ("=m") operand although cmpxchg reads it.  The builtin picks the
    // right operand size for the pointer width and acts as a full memory
    // barrier, so no separate fence is needed.
    return __sync_val_compare_and_swap(v, oldValue, newValue);
 #endif // ISPC_IS_WINDOWS
 }
 #ifndef ISPC_IS_WINDOWS
 // Atomically compares *v with oldValue and, if they are equal, stores
 // newValue into *v.  Returns the value that *v held before the operation,
 // so the swap happened if and only if the return value equals oldValue.
 //
 // Implemented with the compiler's atomic builtin, which is a full memory
 // barrier; the earlier inline assembly declared *v as a write-only ("=m")
 // operand even though cmpxchg also reads it.
 static int32_t 
 lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) {
    return __sync_val_compare_and_swap(v, oldValue, newValue);
 }
 #endif // !ISPC_IS_WINDOWS
 ///////////////////////////////////////////////////////////////////////////
 #ifdef ISPC_USE_CONCRT
 // With ConcRT, we don't need to extend TaskGroupBase at all.
 // Only the two backend-specific entry points are declared here; no extra
 // per-group state is added.
 class TaskGroup : public TaskGroupBase {
 public:
    // Schedules the 'count' tasks whose TaskInfo indices start at
    // 'baseIndex'.
    void Launch(int baseIndex, int count);
    // Blocks until the tasks launched from this group have finished.
    void Sync();
 };
 #endif // ISPC_USE_CONCRT
 #ifdef ISPC_USE_GCD
 /* With Grand Central Dispatch, we associate a GCD dispatch group with each
   task group.  (We'll later wait on this dispatch group when we need to
   wait on all of the tasks in the group to finish.)

   The dispatch group is created in the constructor and released in the
   destructor, so deleting a TaskGroup (as FreeTaskGroup() does when its
   free list is full) no longer leaks the GCD object.  TaskGroup objects
   are not meant to be copied: a copy would release the same dispatch
   group twice.
 */
 class TaskGroup : public TaskGroupBase {
 public:
    TaskGroup() {
        gcdGroup = dispatch_group_create();
    }
    ~TaskGroup() {
        // dispatch_group_create() may return NULL on failure, and
        // dispatch_release() must not be called with NULL.
        if (gcdGroup != NULL)
            dispatch_release(gcdGroup);
    }
    // Submits the 'count' tasks whose TaskInfo indices start at
    // 'baseIndex' to the global GCD queue as members of gcdGroup.
    void Launch(int baseIndex, int count);
    // Blocks until every task submitted to gcdGroup has finished.
    void Sync();
 private:
    dispatch_group_t gcdGroup;
 };
 #endif // ISPC_USE_GCD
 #ifdef ISPC_USE_PTHREADS
 // Entry point of the worker threads; declared here so that it can be made
 // a friend of TaskGroup below.
 static void *lTaskEntry(void *arg);
 /* With pthreads, each task group keeps a list of the indices of its tasks
   that have not been started yet, plus a count of the tasks that have
   been launched but have not finished running.
 */
 class TaskGroup : public TaskGroupBase {
 public:
    TaskGroup() {
        numUnfinishedTasks = 0;
        waitingTasks.reserve(128);
        inActiveList = false;
    }
    // Prepares the group for reuse.  The group must no longer be on the
    // global active list when this is called.
    void Reset() {
        TaskGroupBase::Reset();
        numUnfinishedTasks = 0;
        assert(inActiveList == false);
        lMemFence();
    }
    // Queues the 'count' tasks whose TaskInfo indices start at
    // 'baseIndex' and wakes up worker threads to run them.
    void Launch(int baseIndex, int count);
    // Returns once all tasks launched from this group have finished; the
    // calling thread runs queued tasks itself while it waits.
    void Sync();
 private:
    friend void *lTaskEntry(void *arg);
    // Number of launched tasks that have not finished yet; modified with
    // lAtomicAdd().
    int32_t numUnfinishedTasks;
    // NOTE(review): presumably padding to separate the counter from the
    // members below -- confirm the intent.
    int32_t pad[3];
    // Indices of tasks that have been launched but not yet picked up by
    // any thread.  Accessed with taskSysMutex held.
    std::vector<int> waitingTasks;
    // True while this group is in the global activeTaskGroups list.
    bool inActiveList;
 };
 #endif // ISPC_USE_PTHREADS
 ///////////////////////////////////////////////////////////////////////////
 // Grand Central Dispatch
 #ifdef ISPC_USE_GCD
 /* A simple task system for ispc programs based on Apple's Grand Central
   Dispatch. */
 static dispatch_queue_t gcdQueue;
 static volatile int32_t lock = 0;
 // One-time initialization of the GCD-based task system: looks up the
 // global default-priority dispatch queue and stores it in gcdQueue.
 // Safe to call repeatedly; after the first successful call it returns
 // immediately.
 static void
 InitTaskSystem() {
    // Fast path: already initialized.
    if (gcdQueue != NULL)
        return;
    // Spin until we acquire 'lock' (0 -> 1 via compare-and-swap).
    while (1) {
        if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {
            // Re-check under the lock, since another thread may have done
            // the initialization while we were waiting.
            if (gcdQueue == NULL) {
                gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
                assert(gcdQueue != NULL);
                lMemFence();
            }
            // Release the lock.
            lock = 0;
            break;
        }
    }
 }
 static void
 lRunTask(void *ti) {
    TaskInfo *taskInfo = (TaskInfo *)ti;
    // FIXME: these are bogus values; may cause bugs in code that depends
    // on them having unique values in different threads.
    int threadIndex = 0;
    int threadCount = 1;
    // Actually run the task
    taskInfo->func(taskInfo->data, threadIndex, threadCount, 
                   taskInfo->taskIndex, taskInfo->taskCount);
 }
 inline void
 TaskGroup::Launch(int baseIndex, int count) {
    for (int i = 0; i < count; ++i) {
        TaskInfo *ti = GetTaskInfo(baseIndex + i);
        dispatch_group_async_f(gcdGroup, gcdQueue, ti, lRunTask);
    }
 }
 // Blocks, with no timeout, until every task that was submitted to this
 // group's dispatch group has finished.
 inline void
 TaskGroup::Sync() {
    dispatch_group_wait(gcdGroup, DISPATCH_TIME_FOREVER);
 }
 #endif // ISPC_USE_GCD
 ///////////////////////////////////////////////////////////////////////////
 // Concurrency Runtime
 #ifdef ISPC_USE_CONCRT
 // Initialization hook called before the first task group is allocated.
 // It is intentionally empty for the Concurrency Runtime backend.
 static void
 InitTaskSystem() {
    // No initialization needed
 }
 // ConcRT work function: 'param' is the TaskInfo describing the task to
 // run.
 static void __cdecl
 lRunTask(LPVOID param) {
    // FIXME: like the GCD implementation for OS X, this is passing bogus
    // values for the threadIndex and threadCount builtins, which in turn
    // will cause bugs in code that uses those.
    const int bogusThreadIndex = 0;
    const int bogusThreadCount = 1;

    // Run the task itself...
    TaskInfo *info = static_cast<TaskInfo *>(param);
    info->func(info->data, bogusThreadIndex, bogusThreadCount,
               info->taskIndex, info->taskCount);

    // ...and then set its event so that TaskGroup::Sync() can see that it
    // has completed.
    info->taskEvent.set();
 }
 inline void
 TaskGroup::Launch(int baseIndex, int count) {
    for (int i = 0; i < count; ++i)
        CurrentScheduler::ScheduleTask(lRunTask, GetTaskInfo(baseIndex + i));
 }
 inline void
 TaskGroup::Sync() {
    for (int i = 0; i < nextTaskInfoIndex; ++i) {
        TaskInfo *ti = GetTaskInfo(i);
        ti->taskEvent.wait();
        ti->taskEvent.reset();
    }
 }
 #endif // ISPC_USE_CONCRT
 ///////////////////////////////////////////////////////////////////////////
 // pthreads
 #ifdef ISPC_USE_PTHREADS
 static volatile int32_t lock = 0;
 static int nThreads;
 static pthread_t *threads = NULL;
 static pthread_mutex_t taskSysMutex;
 static std::vector<TaskGroup *> activeTaskGroups;
 static sem_t *workerSemaphore;
 // Atomically adds 'delta' to *v and returns the value *v held before the
 // addition.
 //
 // Implemented with the compiler's atomic builtin, which is a full memory
 // barrier; the earlier inline assembly declared *v as a write-only ("=m")
 // operand even though xadd also reads it.
 static inline int32_t 
 lAtomicAdd(int32_t *v, int32_t delta) {
    return __sync_fetch_and_add(v, delta);
 }
 // Main loop of each worker thread.  'arg' carries the thread's index.
 // The thread sleeps on workerSemaphore until work is posted, takes one
 // task from the most recently added active task group while holding
 // taskSysMutex, runs it with the mutex released, and then decrements the
 // owning group's count of unfinished tasks.  The loop never exits; any
 // error from the synchronization primitives terminates the process.
 static void *
 lTaskEntry(void *arg) {
    int threadIndex = (int)((int64_t)arg);
    int threadCount = nThreads;
    while (1) {
        int err;
        //
        // Wait on the semaphore until we're woken up due to the arrival of
        // more work.
        //
        // Note: unlike the pthread_* calls below, sem_wait() returns -1
        // and reports the reason through errno, so the message has to be
        // produced from errno (perror) rather than from the return value.
        //
        if (sem_wait(workerSemaphore) != 0) {
            perror("Error from sem_wait");
            exit(1);
        }
        //
        // Acquire the mutex
        //
        if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) {
            fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
            exit(1);
        }
        if (activeTaskGroups.size() == 0) {
            //
            // Task queue is empty, go back and wait on the semaphore
            //
            if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
                fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
                exit(1);
            }
            continue;
        }
        //
        // Get the last task group on the active list and the last task
        // from its waiting tasks list.
        //
        TaskGroup *tg = activeTaskGroups.back();
        assert(tg->waitingTasks.size() > 0);
        int taskNumber = tg->waitingTasks.back();
        tg->waitingTasks.pop_back();
        if (tg->waitingTasks.size() == 0) {
            // We just took the last task from this task group, so remove
            // it from the active list.
            activeTaskGroups.pop_back();
            tg->inActiveList = false;
        }
        if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
            fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
            exit(1);
        }
        //
        // And now actually run the task
        //
        DBG(fprintf(stderr, "running task %d from group %p\n", taskNumber, tg));
        TaskInfo *myTask = tg->GetTaskInfo(taskNumber);
        myTask->func(myTask->data, threadIndex, threadCount, myTask->taskIndex,
                     myTask->taskCount);
        //
        // Decrement the "number of unfinished tasks" counter in the task
        // group.
        //
        lMemFence();
        lAtomicAdd(&tg->numUnfinishedTasks, -1);
    }
    pthread_exit(NULL);
    return 0;
 }
 // One-time initialization of the pthreads-based task system: creates the
 // global mutex and the worker semaphore and starts the worker threads.
 // Safe to call repeatedly and from multiple threads; 'threads' being
 // non-NULL is the "already initialized" flag, so it is only assigned once
 // everything else has been set up.
 static void
 InitTaskSystem() {
    if (threads == NULL) {
        // Spin until we acquire 'lock' (0 -> 1 via compare-and-swap).
        while (1) {
            if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {
                // Re-check under the lock: another thread may have
                // finished the initialization while we were waiting.
                if (threads == NULL) {
                    // We launch one fewer thread than there are cores,
                    // since the main thread here will also grab jobs from
                    // the task queue itself.
                    nThreads = sysconf(_SC_NPROCESSORS_ONLN) - 1;
                    // sysconf() returns -1 on error; never use a negative
                    // thread count.
                    if (nThreads < 0)
                        nThreads = 0;

                    int err;
                    if ((err = pthread_mutex_init(&taskSysMutex, NULL)) != 0) {
                        fprintf(stderr, "Error creating mutex: %s\n", strerror(err));
                        exit(1);
                    }

                    char name[32];
                    sprintf(name, "ispc_task.%d", (int)getpid());
                    workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0);
                    // sem_open() signals failure with SEM_FAILED (not
                    // NULL) and reports the reason through errno.
                    if (workerSemaphore == SEM_FAILED) {
                        perror("Error creating semaphore");
                        exit(1);
                    }
                    // Remove the name right away so that the semaphore
                    // does not outlive the process; the open handle stays
                    // valid until the process closes it or exits.
                    sem_unlink(name);

                    activeTaskGroups.reserve(64);

                    // Always allocate at least one element: malloc(0) may
                    // return NULL, which would look like "not yet
                    // initialized" on the next call.
                    int nAlloc = (nThreads > 0) ? nThreads : 1;
                    pthread_t *newThreads = 
                        (pthread_t *)malloc(nAlloc * sizeof(pthread_t));
                    if (newThreads == NULL) {
                        fprintf(stderr, "Error allocating memory for %d thread "
                                "handles\n", nAlloc);
                        exit(1);
                    }
                    for (int i = 0; i < nThreads; ++i) {
                        // Go through intptr_t so that the int -> pointer
                        // conversion is well-formed on 64-bit targets.
                        err = pthread_create(&newThreads[i], NULL, &lTaskEntry, 
                                             (void *)(intptr_t)i);
                        if (err != 0) {
                            fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err));
                            exit(1);
                        }
                    }

                    // Publish 'threads' last, after everything above is
                    // in memory, so that a thread taking the lock-free
                    // fast path at the top never sees a half-initialized
                    // task system.
                    lMemFence();
                    threads = newThreads;
                }
                // Make sure all of the above goes to memory before we
                // clear the lock.
                lMemFence();
                lock = 0;
                break;
            }
        }
    }
 }
 // Queues 'count' tasks, whose TaskInfo indices start at 'baseCoord', for
 // execution: the indices are appended to this group's waiting list, the
 // group is put on the global active list if necessary, the group's
 // unfinished-task counter is increased, and the worker semaphore is
 // posted once per task to wake up worker threads.  Any error from the
 // synchronization primitives terminates the process.
 inline void
 TaskGroup::Launch(int baseCoord, int count) {
    //
    // Acquire mutex, add task
    //
    int err;
    if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) {
        fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
        exit(1);
    }
    // Add the corresponding set of tasks to the waiting-to-be-run list for
    // this task group.
    //
    // FIXME: it's a little ugly to hold a global mutex for this when we
    // only need to make sure no one else is accessing this task group's
    // waitingTasks list.  (But a small experiment in switching to a
    // per-TaskGroup mutex showed worse performance!)
    for (int i = 0; i < count; ++i)
        waitingTasks.push_back(baseCoord + i);
    // Add the task group to the global active list if it isn't there
    // already.
    if (inActiveList == false) {
        activeTaskGroups.push_back(this);
        inActiveList = true;
    }
    if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
        fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
        exit(1);
    }
    //
    // Update the count of the number of tasks left to run in this task
    // group.
    //
    lMemFence();
    lAtomicAdd(&numUnfinishedTasks, count);
    //
    // Post to the worker semaphore to wake up worker threads that are
    // sleeping waiting for tasks to show up
    //
    // Note: unlike the pthread_* calls above, sem_post() returns -1 and
    // reports the reason through errno, so the message has to be produced
    // from errno (perror) rather than from the return value.
    //
    for (int i = 0; i < count; ++i)
        if (sem_post(workerSemaphore) != 0) {
            perror("Error from sem_post");
            exit(1);
        }
 }
 // Returns once every task launched from this group has finished.  While
 // waiting, the calling thread helps out: it first runs not-yet-started
 // tasks from this group and, when there are none left, tasks from the
 // most recently added active group.  If there is nothing at all to run,
 // it yields and re-checks the counter.
 inline void
 TaskGroup::Sync() {
    DBG(fprintf(stderr, "syncing %p - %d unfinished\n", (void *)this, 
                numUnfinishedTasks));
    while (numUnfinishedTasks > 0) {
        // All of the tasks in this group aren't finished yet.  We'll try
        // to help out here since we don't have anything else to do...
        DBG(fprintf(stderr, "while syncing %p - %d unfinished\n", (void *)this, 
                    numUnfinishedTasks));
        //
        // Acquire the global task system mutex to grab a task to work on
        //
        int err;
        if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) {
            fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
            exit(1);
        }
        TaskInfo *myTask = NULL;
        // The group that owns the task we end up running; its counter is
        // the one to decrement afterwards.
        TaskGroup *runtg = this;
        if (waitingTasks.size() > 0) {
            int taskNumber = waitingTasks.back();
            waitingTasks.pop_back();
            if (waitingTasks.size() == 0) {
                // There's nothing left to start running from this group,
                // so remove it from the active task list.
                activeTaskGroups.erase(std::find(activeTaskGroups.begin(),
                                                 activeTaskGroups.end(), this));
                inActiveList = false;
            }
            myTask = GetTaskInfo(taskNumber);
            DBG(fprintf(stderr, "running task %d from group %p in sync\n", 
                        taskNumber, (void *)this));
        }
        else {
            // Other threads are already working on all of the tasks in
            // this group, so we can't help out by running one ourself.
            // We'll try to run one from another group to make ourselves
            // useful here.
            if (activeTaskGroups.size() == 0) {
                // No active task groups left--there's nothing for us to do.
                if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
                    fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
                    exit(1);
                }
                // FIXME: We basically end up busy-waiting here, which is
                // extra wasteful in a world with hyperthreading.  It would
                // be much better to put this thread to sleep on a
                // condition variable that was signaled when the last task
                // in this group was finished.
                sleep(0);
                continue;
            }
            // Get a task to run from another task group.
            runtg = activeTaskGroups.back();
            assert(runtg->waitingTasks.size() > 0);
            int taskNumber = runtg->waitingTasks.back();
            runtg->waitingTasks.pop_back();
            if (runtg->waitingTasks.size() == 0) {
                // There's nothing left to start running from this group,
                // so remove it from the active task list.
                activeTaskGroups.pop_back();
                runtg->inActiveList = false;
            }
            myTask = runtg->GetTaskInfo(taskNumber);
            DBG(fprintf(stderr, "running task %d from other group %p in sync\n", 
                        taskNumber, runtg));
        }
        if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
            fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
            exit(1);
        }
        //
        // Do work for _myTask_
        //
        // FIXME: bogus values for thread index/thread count here as well..
        myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount);
        //
        // Decrement the number of unfinished tasks counter
        //
        lMemFence();
        lAtomicAdd(&runtg->numUnfinishedTasks, -1);
    }
    DBG(fprintf(stderr, "sync for %p done!\n", (void *)this));
 }
 #endif // ISPC_USE_PTHREADS
 ///////////////////////////////////////////////////////////////////////////
 #define MAX_FREE_TASK_GROUPS 64
 static TaskGroup *freeTaskGroups[MAX_FREE_TASK_GROUPS];
 // Returns a TaskGroup for the caller's exclusive use.  A previously freed
 // group is taken from the freeTaskGroups[] array when one can be claimed;
 // otherwise a new one is allocated.
 static inline TaskGroup *
 AllocTaskGroup() {
    for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) {
        TaskGroup *tg = freeTaskGroups[i];
        if (tg != NULL) {
            // Try to claim the group we just saw by swapping NULL into
            // its slot.  The compare-and-swap returns the slot's previous
            // contents, so we own 'tg' only if that is exactly 'tg'.  Any
            // other result -- NULL, or a different group that another
            // thread has stored there in the meantime -- means that the
            // swap did not happen and that the slot's contents are not
            // ours to take.
            void *ptr = lAtomicCompareAndSwapPointer((void **)(&freeTaskGroups[i]), NULL, tg);
            if (ptr == tg)
                return tg;
        }
    }
    // Nothing could be claimed from the free list.
    return new TaskGroup;
 }
 // Resets 'tg' and parks it in the first empty slot of freeTaskGroups[]
 // so that it can be reused; if no slot can be claimed, the group is
 // deleted.
 static inline void
 FreeTaskGroup(TaskGroup *tg) {
    tg->Reset();

    for (int slot = 0; slot < MAX_FREE_TASK_GROUPS; ++slot) {
        // Skip slots that are visibly occupied.
        if (freeTaskGroups[slot] != NULL)
            continue;
        // The slot looked empty: try to install tg.  A NULL return means
        // that the slot was still empty and now holds tg.
        void *previous = 
            lAtomicCompareAndSwapPointer((void **)&freeTaskGroups[slot], tg, NULL);
        if (previous == NULL)
            return;
    }

    delete tg;
 }
 ///////////////////////////////////////////////////////////////////////////
 // ispc expects these functions to have C linkage / not be mangled
 extern "C" { 
    void ISPCLaunch(void **handlePtr, void *f, void *data, int count);
    void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
    void ISPCSync(void *handle);
 }
 void
 ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count) {
    TaskGroup *taskGroup;
    if (*taskGroupPtr == NULL) {
        InitTaskSystem();
        taskGroup = AllocTaskGroup();
        *taskGroupPtr = taskGroup;
    }
    else
        taskGroup = (TaskGroup *)(*taskGroupPtr);
    int baseIndex = taskGroup->AllocTaskInfo(count);
    for (int i = 0; i < count; ++i) {
        TaskInfo *ti = taskGroup->GetTaskInfo(baseIndex+i);
        ti->func = (TaskFuncType)func;
        ti->data = data;
        ti->taskIndex = i;
        ti->taskCount = count;
    }
    taskGroup->Launch(baseIndex, count);
 }
 void
 ISPCSync(void *h) {
    TaskGroup *taskGroup = (TaskGroup *)h;
    if (taskGroup != NULL) {
        taskGroup->Sync();
        FreeTaskGroup(taskGroup);
    }
 }
 void *
 ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment) {
    TaskGroup *taskGroup;
    if (*taskGroupPtr == NULL) {
        InitTaskSystem();
        taskGroup = AllocTaskGroup();
        *taskGroupPtr = taskGroup;
    }
    else
        taskGroup = (TaskGroup *)(*taskGroupPtr);
    return taskGroup->AllocMemory(size, alignment);
 }
--- a/examples/timing.h
+++ b/examples/timing.h
@@ -38,7 +38,9 @@
 #include <windows.h>
 #define rdtsc __rdtsc
 #else
 #ifdef __cplusplus
 extern "C" {
 #endif /* __cplusplus */
    __inline__ uint64_t rdtsc() {
        uint32_t low, high;
        __asm__ __volatile__ (
@@ -48,7 +50,9 @@ extern "C" {
                              "rdtsc" : "=a" (low), "=d" (high));
        return (uint64_t)high << 32 | low;
    }
 #ifdef __cplusplus
 }
 #endif /* __cplusplus */
 #endif            
 static uint64_t start, end;
--- a/examples/volume_rendering/Makefile
+++ b/examples/volume_rendering/Makefile
@@ -1,14 +1,8 @@
 ARCH = $(shell uname)
-TASK_CXX=../tasks_pthreads.cpp
+TASK_CXX=../tasksys.cpp
 TASK_LIB=-lpthread
 ifeq ($(ARCH), Darwin)
  TASK_CXX=../tasks_gcd.cpp
  TASK_LIB=
 endif
 TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
 CXX=g++
--- a/examples/volume_rendering/volume.ispc
+++ b/examples/volume_rendering/volume.ispc
@@ -343,11 +343,20 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,
 task void
-volume_task(uniform int x0, uniform int y0, uniform int x1,
+volume_task(uniform float density[], uniform int nVoxels[3], 
            uniform int y1, uniform float density[], uniform int nVoxels[3], 
            const uniform float raster2camera[4][4],
            const uniform float camera2world[4][4], 
            uniform int width, uniform int height, uniform float image[]) {
    // Each task renders one (dx,dy)-sized tile; taskIndex enumerates the
    // tiles in row-major order, xbuckets tiles per row.
    uniform int dx = 8, dy = 8; // must match value in volume_ispc_tasks
    uniform int xbuckets = (width + (dx-1)) / dx;
    uniform int ybuckets = (height + (dy-1)) / dy;
    uniform int x0 = (taskIndex % xbuckets) * dx;
    // The row is the index divided by the number of tiles per row
    // (xbuckets); dividing by ybuckets is only right for square images.
    uniform int y0 = (taskIndex / xbuckets) * dy;
    uniform int x1 = x0 + dx, y1 = y0 + dy;
    x1 = min(x1, width);
    y1 = min(y1, height);
    volume_tile(x0, y0, x1, y1, density, nVoxels, raster2camera,
                 camera2world, width, height, image);
 }
@@ -370,9 +379,7 @@ volume_ispc_tasks(uniform float density[], uniform int nVoxels[3],
                  uniform int width, uniform int height, uniform float image[]) {
    // Launch tasks to work on (dx,dy)-sized tiles of the image
    uniform int dx = 8, dy = 8;
-    for (uniform int y = 0; y < height; y += dy)
+    uniform int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy);
-        for (uniform int x = 0; x < width; x += dx)
+    launch[nTasks] < volume_task(density, nVoxels, raster2camera, camera2world, 
-            launch < volume_task(x, y, x+dx, y+dy, density, nVoxels, 
+                                 width, height, image) >;
                                 raster2camera, camera2world, width, height, 
                                 image) >;
 }
--- a/examples/volume_rendering/volume.vcxproj
+++ b/examples/volume_rendering/volume.vcxproj
@@ -143,7 +143,7 @@
  <ItemGroup>
    <ClCompile Include="volume.cpp" />
    <ClCompile Include="volume_serial.cpp" />
-    <ClCompile Include="../tasks_concrt.cpp" />
+    <ClCompile Include="../tasksys.cpp" />
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="volume.ispc">
--- a/examples/volume_rendering/volume_serial.cpp
+++ b/examples/volume_rendering/volume_serial.cpp
@@ -104,7 +104,7 @@ Inside(float3 p, float3 pMin, float3 pMax) {
 static bool
 IntersectP(const Ray &ray, float3 pMin, float3 pMax, float *hit0, float *hit1) {
-    float t0 = -1e30, t1 = 1e30;
+    float t0 = -1e30f, t1 = 1e30f;
    float3 tNear = (pMin - ray.origin) / ray.dir;
    float3 tFar  = (pMax - ray.origin) / ray.dir;
@@ -213,7 +213,7 @@ transmittance(float3 p0, float3 p1, float3 pMin,
    float tau = 0;
    float rayLength = sqrtf(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
                            ray.dir.z * ray.dir.z);
-    float stepDist = 0.2;
+    float stepDist = 0.2f;
    float stepT = stepDist / rayLength;
    float t = rayT0;
@@ -239,8 +239,8 @@ distanceSquared(float3 a, float3 b) {
 static float 
 raymarch(float density[], int nVoxels[3], const Ray &ray) {
    float rayT0, rayT1;
-    float3 pMin(.3, -.2, .3), pMax(1.8, 2.3, 1.8);
+    float3 pMin(.3f, -.2f, .3f), pMax(1.8f, 2.3f, 1.8f);
-    float3 lightPos(-1, 4, 1.5);
+    float3 lightPos(-1.f, 4.f, 1.5f);
    if (!IntersectP(ray, pMin, pMax, &rayT0, &rayT1))
        return 0.;
@@ -249,10 +249,10 @@ raymarch(float density[], int nVoxels[3], const Ray &ray) {
    // Parameters that define the volume scattering characteristics and
    // sampling rate for raymarching
-    float Le = .25;            // Emission coefficient
+    float Le = .25f;           // Emission coefficient
    float sigma_a = 10;        // Absorption coefficient
    float sigma_s = 10;        // Scattering coefficient
-    float stepDist = 0.025;    // Ray step amount
+    float stepDist = 0.025f;   // Ray step amount
    float lightIntensity = 40; // Light source intensity
    float tau = 0.f;  // accumulated beam transmittance
@@ -269,7 +269,7 @@ raymarch(float density[], int nVoxels[3], const Ray &ray) {
        // terminate once attenuation is high
        float atten = expf(-tau);
-        if (atten < .005)
+        if (atten < .005f)
            break;
        // direct lighting
--- a/expr.cpp
+++ b/expr.cpp
@@ -741,6 +741,12 @@ UnaryExpr::TypeCheck() {
 }
 // Cost estimate for a unary expression: the operand's cost (zero if the
 // operand is missing) plus one simple arithmetic/logic operation.
 int
 UnaryExpr::EstimateCost() const {
    int operandCost = 0;
    if (expr)
        operandCost = expr->EstimateCost();
    return operandCost + COST_SIMPLE_ARITH_LOGIC_OP;
 }
 void
 UnaryExpr::Print() const {
    if (!expr || !GetType())
@@ -799,11 +805,17 @@ lOpString(BinaryExpr::Op op) {
 */
 static llvm::Value *
 lEmitBinaryBitOp(BinaryExpr::Op op, llvm::Value *arg0Val,
-                 llvm::Value *arg1Val, FunctionEmitContext *ctx) {
+                 llvm::Value *arg1Val, bool isUnsigned,
                 FunctionEmitContext *ctx) {
    llvm::Instruction::BinaryOps inst;
    switch (op) {
    case BinaryExpr::Shl:    inst = llvm::Instruction::Shl;  break;
-    case BinaryExpr::Shr:    inst = llvm::Instruction::AShr; break; 
+    case BinaryExpr::Shr:
        if (isUnsigned)
            inst = llvm::Instruction::LShr; 
        else
            inst = llvm::Instruction::AShr; 
        break; 
    case BinaryExpr::BitAnd: inst = llvm::Instruction::And;  break;
    case BinaryExpr::BitXor: inst = llvm::Instruction::Xor;  break;
    case BinaryExpr::BitOr:  inst = llvm::Instruction::Or;   break;
@@ -949,7 +961,8 @@ BinaryExpr::GetValue(FunctionEmitContext *ctx) const {
            dynamic_cast<ConstExpr *>(arg1) == NULL)
            PerformanceWarning(pos, "Shift right is extremely inefficient for "
                               "varying shift amounts.");
-        return lEmitBinaryBitOp(op, e0Val, e1Val, ctx);
+        return lEmitBinaryBitOp(op, e0Val, e1Val, 
                                arg0->GetType()->IsUnsignedType(), ctx);
    }
    case LogicalAnd:
        return ctx->BinaryOperator(llvm::Instruction::And, e0Val, e1Val,
@@ -1176,10 +1189,10 @@ BinaryExpr::Optimize() {
                    m->symbolTable->LookupFunction("rcp");
                if (rcpFuns != NULL) {
                    assert(rcpFuns->size() == 2);
-                    Expr *rcpSymExpr = new FunctionSymbolExpr(rcpFuns, pos);
+                    Expr *rcpSymExpr = new FunctionSymbolExpr("rcp", rcpFuns, pos);
                    ExprList *args = new ExprList(arg1, arg1->pos);
                    Expr *rcpCall = new FunctionCallExpr(rcpSymExpr, args, 
-                                                         arg1->pos, false);
+                                                         arg1->pos);
                    rcpCall = rcpCall->TypeCheck();
                    if (rcpCall == NULL)
                        return NULL;
@@ -1292,6 +1305,17 @@ BinaryExpr::TypeCheck() {
    if (type0 == NULL || type1 == NULL)
        return NULL;
    if (dynamic_cast<const ReferenceType *>(type0) != NULL) {
        arg0 = new DereferenceExpr(arg0, arg0->pos);
        type0 = arg0->GetType();
        assert(type0 != NULL);
    }
    if (dynamic_cast<const ReferenceType *>(type1) != NULL) {
        arg1 = new DereferenceExpr(arg1, arg1->pos);
        type1 = arg1->GetType();
        assert(type1 != NULL);
    }
    switch (op) {
    case Shl:
    case Shr:
@@ -1438,6 +1462,15 @@ BinaryExpr::TypeCheck() {
 }
 // Cost estimate for a binary expression: the cost of each operand that is
 // present, plus the cost of the operation itself.  Division and modulus
 // are charged as complex operations, everything else as simple ones.
 int
 BinaryExpr::EstimateCost() const {
    int cost = 0;
    if (arg0)
        cost += arg0->EstimateCost();
    if (arg1)
        cost += arg1->EstimateCost();

    if (op == Div || op == Mod)
        cost += COST_COMPLEX_ARITH_OP;
    else
        cost += COST_SIMPLE_ARITH_LOGIC_OP;
    return cost;
 }
 void
 BinaryExpr::Print() const {
    if (!arg0 || !arg1 || !GetType())
@@ -1533,7 +1566,8 @@ lEmitOpAssign(AssignExpr::Op op, Expr *arg0, Expr *arg1, const Type *type,
    case AssignExpr::AndAssign:
    case AssignExpr::XorAssign:
    case AssignExpr::OrAssign:
-        newValue = lEmitBinaryBitOp(basicop, oldLHS, rvalue, ctx);
+        newValue = lEmitBinaryBitOp(basicop, oldLHS, rvalue, 
                                    arg0->GetType()->IsUnsignedType(), ctx);
        break;
    default:
        FATAL("logic error in lEmitOpAssign");
@@ -1688,6 +1722,20 @@ AssignExpr::TypeCheck() {
 }
 // Cost estimate for an assignment: the cost of each side that is present
 // plus the assignment itself.  Compound assignments additionally pay for
 // their arithmetic operation -- a complex one for /= and %=, a simple one
 // otherwise.
 int
 AssignExpr::EstimateCost() const {
    int cost = COST_ASSIGN;
    if (lvalue)
        cost += lvalue->EstimateCost();
    if (rvalue)
        cost += rvalue->EstimateCost();

    if (op != Assign) {
        const bool isComplex = (op == DivAssign || op == ModAssign);
        cost += isComplex ? COST_COMPLEX_ARITH_OP : COST_SIMPLE_ARITH_LOGIC_OP;
    }
    return cost;
 }
 void
 AssignExpr::Print() const {
    if (!lvalue || !rvalue || !GetType())
@@ -1936,6 +1984,12 @@ SelectExpr::TypeCheck() {
 }
 // Cost estimate for a select expression.  This returns the fixed
 // COST_SELECT value only; the costs of the test and of the two value
 // expressions are not added in here.
 int
 SelectExpr::EstimateCost() const {
    return COST_SELECT;
 }
 void
 SelectExpr::Print() const {
    if (!test || !expr1 || !expr2 || !GetType())
@@ -2159,7 +2213,7 @@ FunctionCallExpr::tryResolve(bool (*matchFunc)(Expr *, const Type *)) {
 void
-FunctionCallExpr::resolveFunctionOverloads() {
+FunctionCallExpr::resolveFunctionOverloads(bool exactMatchOnly) {
    FunctionSymbolExpr *fse = dynamic_cast<FunctionSymbolExpr *>(func);
    if (!fse) 
        // error will be issued later if not calling an actual function
@@ -2173,6 +2227,7 @@ FunctionCallExpr::resolveFunctionOverloads() {
    if (tryResolve(lExactMatch))
        return;
    if (!exactMatchOnly) {
        // Try to find a single match ignoring references
        if (tryResolve(lMatchIgnoringReferences))
            return;
@@ -2193,73 +2248,34 @@ FunctionCallExpr::resolveFunctionOverloads() {
        // Last chance: try to find a match via arbitrary type conversion.
        if (tryResolve(lMatchWithTypeConv))
            return;
    }
    // failure :-(
    const char *funName = fse->candidateFunctions->front()->name.c_str();
-    Error(pos, "Unable to find matching overload for call to function \"%s\".",
+    Error(pos, "Unable to find matching overload for call to function \"%s\"%s.",
-          funName);
+          funName, exactMatchOnly ? " only considering exact matches" : "");
    fprintf(stderr, "Candidates are:\n");
    lPrintFunctionOverloads(*fse->candidateFunctions);
    lPrintPassedTypes(funName, args->exprs);
 }
-FunctionCallExpr::FunctionCallExpr(Expr *f, ExprList *a, SourcePos p, bool il) 
+FunctionCallExpr::FunctionCallExpr(Expr *f, ExprList *a, SourcePos p, 
-    : Expr(p) {
+                                   bool il, Expr *lce) 
    : Expr(p), isLaunch(il) {
    func = f;
    args = a;
-    isLaunch = il;
+    launchCountExpr = lce;
-    resolveFunctionOverloads();
+    FunctionSymbolExpr *fse = dynamic_cast<FunctionSymbolExpr *>(func);
-}
+    // Functions with names that start with "__" should only be various
-
+    // builtins.  For those, we'll demand an exact match, since we'll
-
+    // expect whichever function in stdlib.ispc is calling out to one of
-/** Starting from the function initialFunction, we're calling into
+    // those to be matching the argument types exactly; this is to be a bit
-    calledFunc.  The question is: is this a recursive call back to
+    // extra safe to be sure that the expected builtin is in fact being
-    initialFunc?  If it definitely is or if it may be, then return true.
+    // called.
-    Return false if it definitely is not.
+    bool exactMatchOnly = (fse != NULL) && (fse->name.substr(0,2) == "__");
- */
+    resolveFunctionOverloads(exactMatchOnly);
 // Conservatively determines whether calling calledFunc may lead back to
 // initialFunc.  Returns true if it definitely does or if that cannot be
 // ruled out, and false only if it definitely does not.  seenFuncs holds
 // the functions that have already been examined, so that each function
 // is walked at most once.
 static bool
 lMayBeRecursiveCall(llvm::Function *calledFunc, 
                     llvm::Function *initialFunc,
                     std::set<llvm::Function *> &seenFuncs) {
    // Easy case: intrinsics aren't going to call functions themselves
    if (calledFunc->isIntrinsic())
        return false;
    std::string name = calledFunc->getName();
    if (name.size() > 2 && name[0] == '_' && name[1] == '_')
        // builtin stdlib function; none of these are recursive...
        return false;
    if (calledFunc->isDeclaration())
        // There's no visibility into what the called function does without
        // a definition, so we have to be conservative
        return true;
    if (calledFunc == initialFunc)
        // hello recursive call
        return true;
    // Otherwise iterate over all of the instructions in the function.  If
    // any of them is a function call then check recursively..
    llvm::inst_iterator iter;
    for (iter = llvm::inst_begin(calledFunc); 
         iter != llvm::inst_end(calledFunc); ++iter) {
        llvm::Instruction *inst = &*iter;
        llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst);
        if (ci != NULL) {
            llvm::Function *nextCalledFunc = ci->getCalledFunction();
            if (nextCalledFunc == NULL)
                // getCalledFunction() returns NULL for an indirect call.
                // We can't tell what is being called, so be conservative
                // (and don't dereference the NULL pointer below).
                return true;
            // Don't repeatedly test functions we've seen before 
            if (seenFuncs.find(nextCalledFunc) == seenFuncs.end()) {
                seenFuncs.insert(nextCalledFunc);
                if (lMayBeRecursiveCall(nextCalledFunc, initialFunc, 
                                        seenFuncs))
                    return true;
            }
        }
    }
    return false;
 }
@@ -2383,47 +2399,18 @@ FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const {
        }
    }
    // We sometimes need to check to see if the mask is all off here;
    // specifically, if the mask is all off and we call a recursive
    // function, then we will probably have an unsesirable infinite loop.
    ctx->SetDebugPos(pos);
    llvm::BasicBlock *bDoCall = ctx->CreateBasicBlock("funcall_mask_ok");
    llvm::BasicBlock *bSkip = ctx->CreateBasicBlock("funcall_mask_off");
    llvm::BasicBlock *bAfter = ctx->CreateBasicBlock("after_funcall");
    llvm::Function *currentFunc = ctx->GetCurrentBasicBlock()->getParent();
    // If we need to check the mask (it may be a recursive call, possibly
    // transitively), or we're launching a task, which is expensive and
    // thus probably always worth checking, then use the mask to choose
    // whether to go to the bDoCallBlock or the bSkip block
    std::set<llvm::Function *> seenFuncs;
    seenFuncs.insert(currentFunc);
    if (ft->isTask || lMayBeRecursiveCall(callee, currentFunc, seenFuncs)) {
        Debug(pos, "Checking mask before function call \"%s\".", funSym->name.c_str());
        ctx->BranchIfMaskAny(bDoCall, bSkip);
    }
    else
        // If we don't need to check the mask, then always to the call;
        // just jump to bDoCall
        ctx->BranchInst(bDoCall);
    // And the bSkip block just jumps immediately to bAfter.  So why do we
    // need it?  So the phi node below can easily tell what paths are
    // going into it
    ctx->SetCurrentBasicBlock(bSkip);
    ctx->BranchInst(bAfter);
    // Emit the code to do the function call
    ctx->SetCurrentBasicBlock(bDoCall);
    llvm::Value *retVal = NULL;
    ctx->SetDebugPos(pos);
-    if (ft->isTask)
+    if (ft->isTask) {
-        ctx->LaunchInst(callee, argVals);
+        assert(launchCountExpr != NULL);
        llvm::Value *launchCount = launchCountExpr->GetValue(ctx);
        if (launchCount != NULL)
            ctx->LaunchInst(callee, argVals, launchCount);
    }
    else {
        // Most of the time, the mask is passed as the last argument.  this
-        // isn't the case for things like SSE intrinsics and extern "C"
+        // isn't the case for things like intrinsics, builtins, and extern
-        // functions from the application.
+        // "C" functions from the application.
        assert(callargs.size() + 1 == callee->arg_size() ||
               callargs.size() == callee->arg_size());
@@ -2450,22 +2437,10 @@ FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const {
        }
    }
    // And jump out to the 'after funciton call' basic block
    ctx->BranchInst(bAfter);
    ctx->SetCurrentBasicBlock(bAfter);
    if (isVoidFunc)
        return NULL;
-
+    else
-    // The return value for the non-void case is either undefined or the
+        return retVal;
    // function return value, depending on whether we actually ran the code
    // path that called the function or not.
    LLVM_TYPE_CONST llvm::Type *lrType = ft->GetReturnType()->LLVMType(g->ctx);
    llvm::PHINode *ret = ctx->PhiNode(lrType, 2, "fun_ret");
    assert(retVal != NULL);
    ret->addIncoming(llvm::UndefValue::get(lrType), bSkip);
    ret->addIncoming(retVal, bDoCall);
    return ret;
 }
@@ -2507,10 +2482,21 @@ FunctionCallExpr::TypeCheck() {
                    if (!isLaunch)
                        Error(pos, "\"launch\" expression needed to call function "
                              "with \"task\" qualifier.");
                    if (!launchCountExpr)
                        return NULL;
                    launchCountExpr = 
                        launchCountExpr->TypeConv(AtomicType::UniformInt32,
                                                  "task launch count");
                    if (!launchCountExpr)
                        return NULL;
                }
-                else if (isLaunch)
+                else {
                    if (isLaunch)
                        Error(pos, "\"launch\" expression illegal with non-\"task\"-"
                              "qualified function.");
                    assert(launchCountExpr == NULL);
                }
            }
            else
                Error(pos, "Valid function name must be used for function call.");
@@ -2526,6 +2512,13 @@ FunctionCallExpr::TypeCheck() {
 }
 int
 FunctionCallExpr::EstimateCost() const {
    return ((args ? args->EstimateCost() : 0) +
            (isLaunch ? COST_TASK_LAUNCH : COST_FUNCALL));
 }
 void
 FunctionCallExpr::Print() const {
    if (!func || !args || !GetType())
@@ -2614,7 +2607,7 @@ ExprList::GetConstant(const Type *type) const {
    }
    if (dynamic_cast<const StructType *>(type) != NULL) {
-#if defined(LLVM_2_8) || defined(LLVM_2_9)
+#if defined(LLVM_2_9)
        return llvm::ConstantStruct::get(*g->ctx, cv, false);
 #else
        LLVM_TYPE_CONST llvm::StructType *llvmStructType =
@@ -2637,6 +2630,17 @@ ExprList::GetConstant(const Type *type) const {
 }
 int
 ExprList::EstimateCost() const {
    int cost = 0;
    for (unsigned int i = 0; i < exprs.size(); ++i) {
        if (exprs[i] != NULL)
            cost += exprs[i]->EstimateCost();
    }
    return cost;
 }
 void
 ExprList::Print() const {
    printf("expr list (");
@@ -2767,6 +2771,22 @@ IndexExpr::GetLValue(FunctionEmitContext *ctx) const {
    if (!basePtr)
        return NULL;
    // If the array index is a compile time constant, check to see if it
    // may lead to an out-of-bounds access.
    ConstExpr *ce = dynamic_cast<ConstExpr *>(index);
    const SequentialType *seqType = dynamic_cast<const SequentialType *>(type);
    assert(seqType != NULL);
    int nElements = seqType->GetElementCount();
    if (ce != NULL && nElements > 0) {
        int32_t indices[ISPC_MAX_NVEC];
        int count = ce->AsInt32(indices);
        for (int i = 0; i < count; ++i) {
            if (indices[i] < 0 || indices[i] >= nElements)
                Warning(index->pos, "Array index \"%d\" may be out of bounds for "
                        "\"%d\" element array.", indices[i], nElements);
        }
    }
    basePtr = lCastUniformVectorBasePtr(basePtr, ctx);
    ctx->SetDebugPos(pos);
@@ -2819,6 +2839,16 @@ IndexExpr::TypeCheck() {
 }
 int
 IndexExpr::EstimateCost() const {
    // be pessimistic
    if (index && index->GetType()->IsVaryingType())
        return COST_GATHER;
    else
        return COST_LOAD;
 }
 void
 IndexExpr::Print() const {
    if (!arrayOrVector || !index || !GetType())
@@ -3118,6 +3148,7 @@ MemberExpr::create(Expr *e, const char *id, SourcePos p, SourcePos idpos) {
    return new MemberExpr(e, id, p, idpos);
 }
 MemberExpr::MemberExpr(Expr *e, const char *id, SourcePos p, SourcePos idpos) 
    : Expr(p), identifierPos(idpos) {
    expr = e;
@@ -3214,6 +3245,14 @@ MemberExpr::Optimize() {
 }
 int
 MemberExpr::EstimateCost() const {
    // Member access is currently always charged as one simple
    // arithmetic/logic operation, regardless of the expression involved.
    // FIXME: return gather cost when we can tell a gather is going to be
    // needed
    return COST_SIMPLE_ARITH_LOGIC_OP;
 }
 void
 MemberExpr::Print() const {
    if (!expr || !GetType())
@@ -3281,7 +3320,7 @@ ConstExpr::ConstExpr(const Type *t, uint8_t u, SourcePos p)
  : Expr(p) {
    type = t;
    type = type->GetAsConstType();
-    assert(type == AtomicType::UniformUInt8);
+    assert(type == AtomicType::UniformConstUInt8);
    uint8Val[0] = u;
 }
@@ -3321,7 +3360,7 @@ ConstExpr::ConstExpr(const Type *t, uint16_t u, SourcePos p)
  : Expr(p) {
    type = t;
    type = type->GetAsConstType();
-    assert(type == AtomicType::UniformUInt16);
+    assert(type == AtomicType::UniformConstUInt16);
    uint16Val[0] = u;
 }
@@ -3424,7 +3463,7 @@ ConstExpr::ConstExpr(const Type *t, uint64_t u, SourcePos p)
  : Expr(p) {
    type = t;
    type = type->GetAsConstType();
-    assert(type == AtomicType::UniformUInt64);
+    assert(type == AtomicType::UniformConstUInt64);
    uint64Val[0] = u;
 }
@@ -4009,6 +4048,12 @@ ConstExpr::TypeCheck() {
 }
 int
 ConstExpr::EstimateCost() const {
    // A constant expression adds nothing to the cost estimate.
    return 0;
 }
 void
 ConstExpr::Print() const {
    printf("[%s] (", GetType()->GetString().c_str());
@@ -4095,7 +4140,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
        case AtomicType::TYPE_BOOL:
            if (fromType->IsVaryingType() && 
                LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
-                // If we have a bool vector of i32 element,s first truncate
+                // If we have a bool vector of i32 elements, first truncate
                // down to a single bit
                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
            // And then do an unisgned int->float cast
@@ -4155,9 +4200,6 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
        case AtomicType::TYPE_UINT16:
        case AtomicType::TYPE_UINT32:
        case AtomicType::TYPE_UINT64:
            if (fromType->IsVaryingType())
                PerformanceWarning(pos, "Conversion from unsigned int64 to float is slow. "
                                   "Use \"int64\" if possible");
            cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int
                                 exprVal, targetType, "uint2double");
            break;
@@ -4929,6 +4971,13 @@ TypeCastExpr::Optimize() {
 }
 int
 TypeCastExpr::EstimateCost() const {
    // Every cast is currently charged the "simple" cast cost; the cost of
    // the expression being cast is not included here.
    // FIXME: return COST_TYPECAST_COMPLEX when appropriate
    return COST_TYPECAST_SIMPLE;
 }
 void
 TypeCastExpr::Print() const {
    printf("[%s] type cast (", GetType()->GetString().c_str());
@@ -4994,6 +5043,12 @@ ReferenceExpr::TypeCheck() {
 }
 int
 ReferenceExpr::EstimateCost() const {
    // Taking a reference adds nothing to the cost estimate.
    return 0;
 }
 void
 ReferenceExpr::Print() const {
    if (expr == NULL || GetType() == NULL)
@@ -5072,6 +5127,12 @@ DereferenceExpr::Optimize() {
 }
 int
 DereferenceExpr::EstimateCost() const {
    // Every dereference is charged the fixed COST_DEREF constant.
    return COST_DEREF;
 }
 void
 DereferenceExpr::Print() const {
    if (expr == NULL || GetType() == NULL)
@@ -5143,6 +5204,15 @@ SymbolExpr::Optimize() {
 }
 int
 SymbolExpr::EstimateCost() const {
    // A symbol with a recorded constant value costs nothing; any other
    // symbol is charged as a load.
    return (symbol->constValue == NULL) ? COST_LOAD : 0;
 }
 void
 SymbolExpr::Print() const {
    if (symbol == NULL || GetType() == NULL)
@@ -5157,9 +5227,11 @@ SymbolExpr::Print() const {
 ///////////////////////////////////////////////////////////////////////////
 // FunctionSymbolExpr
-FunctionSymbolExpr::FunctionSymbolExpr(std::vector<Symbol *> *candidates,
+FunctionSymbolExpr::FunctionSymbolExpr(const char *n,
                                       std::vector<Symbol *> *candidates,
                                       SourcePos p) 
  : Expr(p) {
    name = n;
    matchingFunc = NULL;
    candidateFunctions = candidates;
 }
@@ -5196,6 +5268,12 @@ FunctionSymbolExpr::Optimize() {
 }
 int
 FunctionSymbolExpr::EstimateCost() const {
    // Naming a function adds nothing to the cost estimate here; the call
    // itself is charged by FunctionCallExpr::EstimateCost().
    return 0;
 }
 void
 FunctionSymbolExpr::Print() const {
    if (!matchingFunc || !GetType())
@@ -5219,14 +5297,14 @@ SyncExpr::GetType() const {
 llvm::Value *
 SyncExpr::GetValue(FunctionEmitContext *ctx) const {
    ctx->SetDebugPos(pos);
-    std::vector<llvm::Value *> noArg;
+    ctx->SyncInst();
    llvm::Function *fsync = m->module->getFunction("ISPCSync");
    if (fsync == NULL) {
        FATAL("Couldn't find ISPCSync declaration?!");
    return NULL;
-    }
+}
-    return ctx->CallInst(fsync, noArg, "");
+
 int
 SyncExpr::EstimateCost() const {
    // Every sync expression is charged the fixed COST_SYNC constant.
    return COST_SYNC;
 }
--- a/expr.h
+++ b/expr.h
@@ -121,8 +121,8 @@ public:
    void Print() const;
    Expr *Optimize();
    Expr *TypeCheck();
    int EstimateCost() const;
 private:
    const Op op;
    Expr *expr;
 };
@@ -164,8 +164,8 @@ public:
    Expr *Optimize();
    Expr *TypeCheck();
    int EstimateCost() const;
 private:
    const Op op;
    Expr *arg0, *arg1;
 };
@@ -196,8 +196,8 @@ public:
    Expr *Optimize();
    Expr *TypeCheck();
    int EstimateCost() const;
 private:
    const Op op;
    Expr *lvalue, *rvalue;
 };
@@ -217,8 +217,8 @@ public:
    Expr *Optimize();
    Expr *TypeCheck();
    int EstimateCost() const;
 private:
    Expr *test, *expr1, *expr2;
 };
@@ -240,6 +240,7 @@ public:
    llvm::Constant *GetConstant(const Type *type) const;
    ExprList *Optimize();
    ExprList *TypeCheck();
    int EstimateCost() const;
    std::vector<Expr *> exprs;
 };
@@ -249,7 +250,8 @@ public:
 */
 class FunctionCallExpr : public Expr {
 public:
-    FunctionCallExpr(Expr *func, ExprList *args, SourcePos p, bool isLaunch);
+    FunctionCallExpr(Expr *func, ExprList *args, SourcePos p, 
                     bool isLaunch = false, Expr *launchCountExpr = NULL);
    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
    const Type *GetType() const;
@@ -257,13 +259,15 @@ public:
    Expr *Optimize();
    Expr *TypeCheck();
    int EstimateCost() const;
 private:
    Expr *func;
    ExprList *args;
    bool isLaunch;
    Expr *launchCountExpr;
-    void resolveFunctionOverloads();
+private:
    void resolveFunctionOverloads(bool exactMatchOnly);
    bool tryResolve(bool (*matchFunc)(Expr *, const Type *));
 };
@@ -285,8 +289,8 @@ public:
    Expr *Optimize();
    Expr *TypeCheck();
    int EstimateCost() const;
 private:
    Expr *arrayOrVector, *index;
 };
@@ -303,16 +307,17 @@ public:
    MemberExpr(Expr *expr, const char *identifier, SourcePos pos, 
               SourcePos identifierPos);
-    virtual llvm::Value *GetValue(FunctionEmitContext *ctx) const;
+    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
-    virtual llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
+    llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
-    virtual const Type *GetType() const;
+    const Type *GetType() const;
-    virtual Symbol *GetBaseSymbol() const;
+    Symbol *GetBaseSymbol() const;
-    virtual void Print() const;
+    void Print() const;
-    virtual Expr *Optimize();
+    Expr *Optimize();
-    virtual Expr *TypeCheck();
+    Expr *TypeCheck();
    int EstimateCost() const;
    virtual int getElementNumber() const;
 protected:
    std::string getCandidateNearMatches() const;
    Expr *expr;
@@ -392,6 +397,7 @@ public:
    Expr *TypeCheck();
    Expr *Optimize();
    int EstimateCost() const;
    /** Return the ConstExpr's values as booleans, doing type conversion
        from the actual type if needed.  If forceVarying is true, then type
@@ -495,8 +501,8 @@ public:
    void Print() const;
    Expr *TypeCheck();
    Expr *Optimize();
    int EstimateCost() const;
 private:
    const Type *type;
    Expr *expr;
 };
@@ -514,8 +520,8 @@ public:
    void Print() const;
    Expr *TypeCheck();
    Expr *Optimize();
    int EstimateCost() const;
 private:
    Expr *expr;
 };
@@ -533,8 +539,8 @@ public:
    void Print() const;
    Expr *TypeCheck();
    Expr *Optimize();
    int EstimateCost() const;
 private:
    Expr *expr;
 };
@@ -551,6 +557,7 @@ public:
    Expr *TypeCheck();
    Expr *Optimize();
    void Print() const;
    int EstimateCost() const;
 private:
    Symbol *symbol;
@@ -562,7 +569,7 @@ private:
 */    
 class FunctionSymbolExpr : public Expr {
 public:
-    FunctionSymbolExpr(std::vector<Symbol *> *candidateFunctions, 
+    FunctionSymbolExpr(const char *name, std::vector<Symbol *> *candidateFunctions,
                       SourcePos pos);
    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
@@ -571,10 +578,14 @@ public:
    Expr *TypeCheck();
    Expr *Optimize();
    void Print() const;
    int EstimateCost() const;
 private:
    friend class FunctionCallExpr;
    /** Name of the function that is being called. */
    std::string name;
    /** All of the functions with the name given in the function call;
        there may be more then one, in which case we need to resolve which
        overload is the best match. */
@@ -597,6 +608,7 @@ public:
    Expr *TypeCheck();
    Expr *Optimize();
    void Print() const;
    int EstimateCost() const;
 };
 #endif // ISPC_EXPR_H
--- a/failing_tests/masked-scatter-vector.ispc
+++ b/failing_tests/masked-scatter-vector.ispc
@@ -14,7 +14,7 @@ export void f_fu(uniform float ret[], uniform float aa[], uniform float b) {
    varying int3 vv = array[a];
    ++vv.y;
    array[a] = vv;
-    print("fin %\n", array[programIndex].y);
+//CO    print("fin %\n", array[programIndex].y);
    ret[programIndex] = array[programIndex].y;
 }
--- a/failing_tests/max-uint-1.ispc
+++ b/failing_tests/max-uint-1.ispc
@@ -1,19 +1,14 @@
-static float float4(uniform float a, uniform float b, uniform float c, 
+
-                    uniform float d) {
+export uniform int width() { return programCount; }
-    float ret = 0;
+
-    for (uniform int i = 0; i < programCount; i += 4) {
+export void f_f(uniform float r[], uniform float a[]) {
-        ret = insert(ret, i + 0, a);
+    unsigned int i = (unsigned int)a[programIndex];
-        ret = insert(ret, i + 1, b);
+    r[programIndex] = max((unsigned int)2, i);
        ret = insert(ret, i + 2, c);
        ret = insert(ret, i + 3, d);
    }
    return ret;
 }
-export float f_f(float a) {
+export void result(uniform float r[]) { 
-    unsigned int i = (unsigned int)a;
+    r[programIndex] = 1+programIndex;
-    return max((unsigned int)2, i);
+    r[0] = 2;
 }
 export float result() { return float4(2,2,3,4); }
--- a/failing_tests/max-uint.ispc
+++ b/failing_tests/max-uint.ispc
@@ -1,8 +1,10 @@
-export float f_f(float a) {
+export uniform int width() { return programCount; }
-    unsigned int i = (unsigned int)a;
+
-    return max((unsigned int)10, i);
+export void f_f(uniform float result[], uniform float aa[]) {
    unsigned int i = (unsigned int)aa[programIndex];
    result[programIndex] = max((unsigned int)100, i);
 }
-export float result() { return 10; }
+export void result(uniform float r[]) { r[programIndex] = 100; }
--- a/failing_tests/min-uint-1.ispc
+++ b/failing_tests/min-uint-1.ispc
@@ -1,19 +1,14 @@
-static float float4(uniform float a, uniform float b, uniform float c, 
+
-                    uniform float d) {
+export uniform int width() { return programCount; }
-    float ret = 0;
+
-    for (uniform int i = 0; i < programCount; i += 4) {
+export void f_f(uniform float result[], uniform float aa[]) {
-        ret = insert(ret, i + 0, a);
+    unsigned int i = (unsigned int)aa[programIndex];
-        ret = insert(ret, i + 1, b);
+    result[programIndex] = min((unsigned int)2, i);
        ret = insert(ret, i + 2, c);
        ret = insert(ret, i + 3, d);
    }
    return ret;
 }
-export float f_f(float a) {
+export void result(uniform float r[]) { 
-    unsigned int i = (unsigned int)a;
+    r[programIndex] = 2;
-    return min((unsigned int)2, i);
+    r[0] = 1;
 }
 export float result() { return float4(1,2,2,2); }
--- a/failing_tests/min-uint-2.ispc
+++ b/failing_tests/min-uint-2.ispc
@@ -1,19 +1,13 @@
-static float float4(uniform float a, uniform float b, uniform float c, 
+
-                    uniform float d) {
+export uniform int width() { return programCount; }
-    float ret = 0;
+
-    for (uniform int i = 0; i < programCount; i += 4) {
+export void f_f(uniform float r[], uniform float a[]) {
-        ret = insert(ret, i + 0, a);
+    unsigned int i = (unsigned int)a[programIndex];
-        ret = insert(ret, i + 1, b);
+    r[programIndex] =  min((unsigned int)20, i);
        ret = insert(ret, i + 2, c);
        ret = insert(ret, i + 3, d);
    }
    return ret;
 }
-export float f_f(float a) {
+export void result(uniform float r[]) { 
-    unsigned int i = (unsigned int)a;
+    r[programIndex] = 1+programIndex;
    return min((unsigned int)20, i);
 }
 export float result() { return float4(1,2,3,4); }
--- a/failing_tests/struct-array-assign.ispc
+++ b/failing_tests/struct-array-assign.ispc
@@ -1,11 +0,0 @@
 // Single-field struct used by the function below.
 struct Foo {
    float f;
 };
 // Copies element i of the struct array f into a local Foo and returns its
 // float member.  The uniform parameter j is not used by the body.
 export float foo(Foo f[], int i, uniform int j) {
    Foo x = f[i];
    return x.f;
 }
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -42,14 +42,25 @@
 #ifdef ISPC_IS_WINDOWS
 #include <windows.h>
 #include <direct.h>
 #define strcasecmp stricmp
 #endif
 #include <llvm/LLVMContext.h>
 #include <llvm/Module.h>
 #ifndef LLVM_2_8
 #include <llvm/Analysis/DIBuilder.h>
 #endif
 #include <llvm/Analysis/DebugInfo.h>
 #include <llvm/Support/Dwarf.h>
 #include <llvm/Target/TargetMachine.h>
 #include <llvm/Target/TargetOptions.h>
 #include <llvm/Target/TargetData.h>
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn)
  #include <llvm/Support/TargetRegistry.h>
  #include <llvm/Support/TargetSelect.h>
 #else
  #include <llvm/Target/TargetRegistry.h>
  #include <llvm/Target/TargetSelect.h>
  #include <llvm/Target/SubtargetFeature.h>
 #endif
 #include <llvm/Support/Host.h>
 Globals *g;
 Module *m;
@@ -57,21 +68,198 @@ Module *m;
 ///////////////////////////////////////////////////////////////////////////
 // Target
-Target::Target() {
+bool
 Target::GetTarget(const char *arch, const char *cpu, const char *isa,
                   bool pic, Target *t) {
    // Fills in *t for the requested architecture / CPU / ISA.  Any of
    // arch, cpu and isa may be NULL, in which case a default is chosen:
    // the host CPU (or "generic" if it can't be determined), an ISA
    // derived from the CPU name, and the "x86-64" architecture.  Returns
    // true on success; on an unknown architecture or ISA an error is
    // printed to stderr and false is returned.
    //
    // The host CPU name must outlive every use of the cpu pointer below,
    // so the string that owns it lives at function scope rather than
    // inside the if-block (where cpu would be left dangling).
    std::string hostCPU;
    if (cpu == NULL) {
        hostCPU = llvm::sys::getHostCPUName();
        if (hostCPU.size() > 0)
            cpu = hostCPU.c_str();
        else {
            fprintf(stderr, "Warning: unable to determine host CPU!\n");
            cpu = "generic";
        }
    }
    t->cpu = cpu;
    if (isa == NULL) {
        if (!strcasecmp(cpu, "atom"))
            isa = "sse2";
    // Note: the version macro is LLVM_3_0svn (no underscore before "svn"),
    // matching the other version checks in this file.
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn)
        else if (!strcasecmp(cpu, "sandybridge") ||
                 !strcasecmp(cpu, "corei7-avx"))
            isa = "avx";
 #endif // LLVM_3_0
        else
            isa = "sse4";
    }
    if (arch == NULL)
        arch = "x86-64";
-    cpu = "nehalem";
+
-    is32bit = false;
+    bool error = false;
-    isa = SSE4;
+
-    nativeVectorWidth = 4;
+    t->generatePIC = pic;
-    vectorWidth = 4;
+
    // Make sure the target architecture is a known one; print an error
    // with the valid ones otherwise.
    t->target = NULL;
    for (llvm::TargetRegistry::iterator iter = llvm::TargetRegistry::begin();
         iter != llvm::TargetRegistry::end(); ++iter) {
        if (std::string(arch) == iter->getName()) {
            t->target = &*iter;
            break;
        }
    }
    if (t->target == NULL) {
        fprintf(stderr, "Invalid architecture \"%s\"\nOptions: ", arch);
        llvm::TargetRegistry::iterator iter;
        for (iter = llvm::TargetRegistry::begin();
             iter != llvm::TargetRegistry::end(); ++iter)
            fprintf(stderr, "%s ", iter->getName());
        fprintf(stderr, "\n");
        error = true;
    }
    else {
        t->arch = arch;
    }
    // Translate the ISA name into the enumerant, the native and compiled
    // vector widths, and the feature string handed to the LLVM backend.
    if (!strcasecmp(isa, "sse2")) {
        t->isa = Target::SSE2;
        t->nativeVectorWidth = 4;
        t->vectorWidth = 4;
        t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
    }
    else if (!strcasecmp(isa, "sse4")) {
        t->isa = Target::SSE4;
        t->nativeVectorWidth = 4;
        t->vectorWidth = 4;
        t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
    }
    else if (!strcasecmp(isa, "sse4x2")) {
        t->isa = Target::SSE4;
        t->nativeVectorWidth = 4;
        t->vectorWidth = 8;
        t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
    }
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn)
    else if (!strcasecmp(isa, "avx")) {
        t->isa = Target::AVX;
        t->nativeVectorWidth = 8;
        t->vectorWidth = 8;
        t->attributes = "+avx,+popcnt,+cmov";
    }
    else if (!strcasecmp(isa, "avx-x2")) {
        t->isa = Target::AVX;
        t->nativeVectorWidth = 8;
        t->vectorWidth = 16;
        t->attributes = "+avx,+popcnt,+cmov";
    }
 #endif // LLVM 3.0
    else {
        fprintf(stderr, "Target ISA \"%s\" is unknown.  Choices are: %s\n", 
                isa, SupportedTargetISAs());
        error = true;
    }
    // Only query the target machine once everything above checked out;
    // GetTargetMachine() relies on t->target being valid.
    if (!error) {
        llvm::TargetMachine *targetMachine = t->GetTargetMachine();
        const llvm::TargetData *targetData = targetMachine->getTargetData();
        t->is32bit = (targetData->getPointerSize() == 4);
    }
    return !error;
 }
 const char *
 Target::SupportedTargetCPUs() {
    // Comma-delimited list of the CPU names accepted for the target; the
    // exact contents depend on the LLVM version being built against.
    // The 3.0 development-branch macro is LLVM_3_0svn (no underscore
    // before "svn"), as used by the other version checks in this file.
    return "atom, barcelona, core2, corei7, "
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn)
        "corei7-avx, "
 #endif
        "istanbul, nocona, penryn, "
 #ifdef LLVM_2_9
        "sandybridge, "
 #endif
        "westmere";
 }
 const char *
 Target::SupportedTargetArchs() {
    // Comma-delimited list of the architecture names reported to the
    // user as supported.
    return "x86, x86-64";
 }
 const char *
 Target::SupportedTargetISAs() {
    // Comma-delimited list of the ISA names accepted by GetTarget().  The
    // AVX entries are only listed when building against LLVM 3.0; the
    // development-branch macro is LLVM_3_0svn (no underscore before
    // "svn"), matching the guard around the AVX cases in GetTarget().
    return "sse2, sse4, sse4x2"
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn)
        ", avx, avx-x2"
 #endif
        ;
 }
 std::string
 Target::GetTripleString() const {
    llvm::Triple triple;
    // Start with the host triple as the default
    triple.setTriple(llvm::sys::getHostTriple());
    // And override the arch in the host triple based on what the user
    // specified.  Here we need to deal with the fact that LLVM uses one
    // naming convention for targets TargetRegistry, but wants some
    // slightly different ones for the triple.  TODO: is there a way to
    // have it do this remapping, which would presumably be a bit less
    // error prone?
    if (arch == "x86")
        triple.setArchName("i386");
    else if (arch == "x86-64")
        triple.setArchName("x86_64");
    else
        triple.setArchName(arch);
    return triple.str();
 }
 llvm::TargetMachine *
 Target::GetTargetMachine() const {
    // Creates an LLVM TargetMachine for this target from its triple, CPU
    // name and feature attributes.  A new object is created on each call.
    // NOTE(review): nothing here releases the returned object; ownership
    // presumably passes to the caller -- confirm.
    std::string triple = GetTripleString();
    // Position-independent code only if it was requested for this target.
    llvm::Reloc::Model relocModel = generatePIC ? llvm::Reloc::PIC_ : 
                                                  llvm::Reloc::Default;
 #if defined(LLVM_3_0svn) || defined(LLVM_3_0)
    // LLVM 3.0: CPU, features and relocation model are passed separately
    // at creation time.
    std::string featuresString = attributes;
    llvm::TargetMachine *targetMachine = 
        target->createTargetMachine(triple, cpu, featuresString, relocModel);
 #else
 #ifdef ISPC_IS_APPLE
    // On Apple builds PIC is forced on regardless of generatePIC.
    relocModel = llvm::Reloc::PIC_;
 #endif // ISPC_IS_APPLE
    // Older LLVM: the CPU name is passed as the first entry of the
    // features string, and the relocation model is set after creation.
    std::string featuresString = cpu + std::string(",") + attributes;
    llvm::TargetMachine *targetMachine = 
        target->createTargetMachine(triple, featuresString);
 #ifndef ISPC_IS_WINDOWS
    targetMachine->setRelocationModel(relocModel);
 #endif // !ISPC_IS_WINDOWS
 #endif
    assert(targetMachine != NULL);
    targetMachine->setAsmVerbosityDefault(true);
    return targetMachine;
 }
 ///////////////////////////////////////////////////////////////////////////
 // Opt
 Opt::Opt() {
    level = 1;
    fastMath = false;
    fastMaskedVload = false;
    unrollLoops = true;
    disableBlendedMaskedStores = false;
    disableCoherentControlFlow = false;
    disableUniformControlFlow = false;
@@ -121,13 +309,9 @@ SourcePos::SourcePos(const char *n, int l, int c) {
 }
 // Returns the debug-info file descriptor for this source position.
 llvm::DIFile SourcePos::GetDIFile() const {
 #ifdef LLVM_2_8
    // LLVM 2.8 build: just return a default-constructed DIFile.
    return llvm::DIFile();
 #else
    // Split this position's file name into directory and file name parts
    // (given the current directory recorded in the globals) and have the
    // module's DIBuilder create the file entry.
    std::string directory, filename;
    GetDirectoryAndFileName(g->currentDirectory, name, &directory, &filename);
    return m->diBuilder->createFile(filename, directory);
 #endif // LLVM_2_8
 }
--- a/ispc.h
+++ b/ispc.h
@@ -69,6 +69,8 @@ namespace llvm {
    class FunctionType;
    class LLVMContext;
    class Module;
    class Target;
    class TargetMachine;
    class Type;
    class Value;
 }
@@ -146,6 +148,8 @@ public:
        pointer in place of the original ASTNode *. */
    virtual ASTNode *TypeCheck() = 0;
    virtual int EstimateCost() const = 0;
    /** All AST nodes must track the file position where they are
        defined. */
    const SourcePos pos;
@@ -156,7 +160,34 @@ public:
    This structure defines a compilation target for the ispc compiler.
 */
 struct Target {
-    Target();
+    /** Initializes the given Target pointer for a target of the given
        name, if the name is a known target.  Returns true if the
        target was initialized and false if the name is unknown. */
    static bool GetTarget(const char *arch, const char *cpu, const char *isa,
                          bool pic, Target *);
    /** Returns a comma-delimited string giving the names of the currently
        supported target ISAs. */
    static const char *SupportedTargetISAs();
    /** Returns a comma-delimited string giving the names of the currently
        supported target CPUs. */
    static const char *SupportedTargetCPUs();
    /** Returns a comma-delimited string giving the names of the currently
        supported target architectures. */
    static const char *SupportedTargetArchs();
    /** Returns a triple string specifying the target architecture, vendor,
        and environment. */
    std::string GetTripleString() const;
    /** Returns the LLVM TargetMachine object corresponding to this
        target. */
    llvm::TargetMachine *GetTargetMachine() const;
    /** llvm Target object representing this target. */
    const llvm::Target *target;
    /** Enumerator giving the instruction sets that the compiler can
        target. */
@@ -174,6 +205,9 @@ struct Target {
    /** Target CPU. (e.g. "corei7", "corei7-avx", ..) */
    std::string cpu;
    /** Target-specific attributes to pass along to the LLVM backend */
    std::string attributes;
    /** Native vector width of the vector instruction set.  Note that this
        value is directly derived from the ISA Being used (e.g. it's 4 for
        SSE, 8 for AVX, etc.) */
@@ -183,8 +217,12 @@ struct Target {
        integer multiple of the native vector width, for example if we're
        "doubling up" and compiling 8-wide on a 4-wide SSE system. */
    int vectorWidth;
    /** Indicates whether position independent code should be generated. */
    bool generatePIC;
 };
 /** @brief Structure that collects optimization options
    This structure collects all of the options related to optimization of
@@ -202,6 +240,16 @@ struct Opt {
        should be performed.  This is false by default. */
    bool fastMath;
    /** Indicates whether an vector load should be issued for masked loads
        on platforms that don't have a native masked vector load.  (This may
        lead to accessing memory up to programCount-1 elements past the end of
        arrays, so is unsafe in general.) */
    bool fastMaskedVload;
    /** Indicates when loops should be unrolled (when doing so seems like
        it will make sense. */
    bool unrollLoops;
    /** On targets that don't have a masked store instruction but do have a
        blending instruction, by default, we simulate masked stores by
        loading the old value, blending, and storing the result.  This can
@@ -319,6 +367,29 @@ struct Globals {
    std::vector<std::string> cppArgs;
 };
 // Integer constants used for cost estimation.  The COST_* values are
 // what the EstimateCost() methods of the AST nodes return or add up.
 // NOTE(review): units are relative, not cycle counts, as far as can be
 // told from here -- confirm.
 enum {
    COST_ASSIGN = 1,
    // (sic) "CONTINE": the identifier is spelled this way; renaming it
    // would require updating every user.
    COST_COHERENT_BREAK_CONTINE = 4,
    COST_COMPLEX_ARITH_OP = 4,
    COST_DEREF = 4,
    COST_FUNCALL = 4,
    COST_GATHER = 8,
    COST_LOAD = 2,
    COST_REGULAR_BREAK_CONTINUE = 2,
    COST_RETURN = 4,
    COST_SELECT = 4,
    COST_SIMPLE_ARITH_LOGIC_OP = 1,
    COST_SYNC = 32,
    COST_TASK_LAUNCH = 16,
    COST_TYPECAST_COMPLEX = 4,
    COST_TYPECAST_SIMPLE = 1,
    COST_UNIFORM_LOOP = 4,
    COST_VARYING_LOOP = 6,
    // The two entries below are named differently from the per-node
    // costs above; presumably they are thresholds that estimated costs
    // are compared against -- confirm at their use sites.
    CHECK_MASK_AT_FUNCTION_START_COST = 16,
    PREDICATE_SAFE_IF_STATEMENT_COST = 6,
 };
 extern Globals *g;
 extern Module *m;
--- a/ispc.vcxproj
+++ b/ispc.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -16,6 +16,7 @@
    <ClCompile Include="decl.cpp" />
    <ClCompile Include="expr.cpp" />
    <ClCompile Include="gen-bitcode-avx.cpp" />
    <ClCompile Include="gen-bitcode-avx-x2.cpp" />
    <ClCompile Include="gen-bitcode-c-32.cpp" />
    <ClCompile Include="gen-bitcode-c-64.cpp" />
    <ClCompile Include="gen-bitcode-sse2.cpp" />
@@ -30,12 +31,14 @@
    <ClCompile Include="opt.cpp" />
    <ClCompile Include="parse.cc" />
    <CustomBuild Include="builtins-c.c">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c &gt; gen-bitcode-c-32.cpp</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c &gt; gen-bitcode-c-32.cpp;
 %LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c &gt; gen-bitcode-c-64.cpp</Command>
      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clang builtins-c.c</Message>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c &gt; gen-bitcode-c-32.cpp</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c &gt; gen-bitcode-c-32.cpp;
 %LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c &gt; gen-bitcode-c-64.cpp</Command>
      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clang builtins-c.c</Message>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-c.cpp</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-c-32.cpp;gen-bitcode-c-64.cpp</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-c.cpp</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-c-32.cpp;gen-bitcode-c-64.cpp</Outputs>
    </CustomBuild>
    <ClCompile Include="stmt.cpp" />
    <ClCompile Include="sym.cpp" />
@@ -60,9 +63,9 @@
  <ItemGroup>
    <CustomBuild Include="stdlib.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py &gt; gen-stdlib.cpp</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py &gt; gen-stdlib.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-stdlib.cpp</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py &gt; gen-stdlib.cpp</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py &gt; gen-stdlib.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-stdlib.cpp</Outputs>
      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-stdlib.cpp</Message>
      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-stdlib.cpp</Message>
@@ -120,6 +123,19 @@
      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="builtins-avx-x2.ll">
      <FileType>Document</FileType>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-avx-x2.ll | python bitcode2cpp.py builtins-avx-x2.ll &gt; gen-bitcode-avx-x2.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx-x2.cpp</Outputs>
      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-avx-x2.ll | python bitcode2cpp.py builtins-avx-x2.ll &gt; gen-bitcode-avx-x2.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx-x2.cpp</Outputs>
      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx-x2.cpp</Message>
      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx-x2.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="lex.ll">
      <FileType>Document</FileType>
@@ -180,7 +196,7 @@
      <PrecompiledHeader>NotUsing</PrecompiledHeader>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>NOMINMAX;LLVM_2_9</PreprocessorDefinitions>
+      <PreprocessorDefinitions>NOMINMAX;LLVM_3_0</PreprocessorDefinitions>
      <AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
      <DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
    </ClCompile>
@@ -188,7 +204,7 @@
      <SubSystem>Console</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
-      <AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMDebugInfo.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCDisassembler.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
@@ -198,7 +214,7 @@
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>NOMINMAX;LLVM_2_9</PreprocessorDefinitions>
+      <PreprocessorDefinitions>NOMINMAX;LLVM_3_0</PreprocessorDefinitions>
      <AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
      <DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
    </ClCompile>
@@ -208,7 +224,7 @@
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
      <AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
-      <AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMDebugInfo.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCDisassembler.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
--- a/ispc_test.cpp
+++ b/ispc_test.cpp
@@ -33,12 +33,25 @@
 #define _CRT_SECURE_NO_WARNINGS
 #if defined(_WIN32) || defined(_WIN64)
 #define ISPC_IS_WINDOWS
 #elif defined(__linux__)
 #define ISPC_IS_LINUX
 #elif defined(__APPLE__)
 #define ISPC_IS_APPLE
 #endif
 #ifdef ISPC_IS_WINDOWS
 #define NOMINMAX
 #include <windows.h>
 #endif
 #include <stdio.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <memory.h>
 #ifdef ISPC_IS_LINUX
 #include <malloc.h>
 #endif
 #ifdef ISPC_HAVE_SVML
 #include <xmmintrin.h>
@@ -61,8 +74,14 @@ extern "C" {
 #include <llvm/DerivedTypes.h>
 #include <llvm/Instructions.h>
 #include <llvm/ExecutionEngine/ExecutionEngine.h>
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn)
  #include <llvm/Support/TargetRegistry.h>
  #include <llvm/Support/TargetSelect.h>
 #else
  #include <llvm/Target/TargetRegistry.h>
  #include <llvm/Target/TargetSelect.h>
 #endif
 #include <llvm/ExecutionEngine/JIT.h>
 #include <llvm/Target/TargetSelect.h>
 #include <llvm/Target/TargetOptions.h>
 #include <llvm/Target/TargetData.h>
 #include <llvm/Transforms/Scalar.h>
@@ -74,42 +93,53 @@ extern "C" {
 #include <llvm/Support/raw_ostream.h>
 #include <llvm/Bitcode/ReaderWriter.h>
 #include <llvm/Support/MemoryBuffer.h>
 #ifndef LLVM_2_8
 #include <llvm/Support/system_error.h>
-#endif
+
 bool shouldFail = false;
 extern "C" { 
-    void ISPCLaunch(void *, void *);
+    void ISPCLaunch(void **, void *, void *, int32_t);
-    void ISPCSync();
+    void ISPCSync(void *);
-    void *ISPCMalloc(int64_t size, int32_t alignment);
+    void *ISPCAlloc(void **, int64_t size, int32_t alignment);
    void ISPCFree(void *ptr);
 }
-void ISPCLaunch(void *func, void *data) {
+void ISPCLaunch(void **handle, void *func, void *data, int32_t count) {
-    typedef void (*TaskFuncType)(void *, int, int);
+    *handle = (void *)0xdeadbeef;
    typedef void (*TaskFuncType)(void *, int, int, int, int);
    TaskFuncType tft = (TaskFuncType)(func);
-    tft(data, 0, 1);
+    for (int i = 0; i < count; ++i)
        tft(data, 0, 1, i, count);
 }
-void ISPCSync() {
+void ISPCSync(void *) {
 }
 void *ISPCAlloc(void **handle, int64_t size, int32_t alignment) {
    *handle = (void *)0xdeadbeef;
    // leak time!
 #ifdef ISPC_IS_WINDOWS
 void *ISPCMalloc(int64_t size, int32_t alignment) {
    return _aligned_malloc(size, alignment);
 }
 void ISPCFree(void *ptr) {
    _aligned_free(ptr);
 }
 #endif
 #ifdef ISPC_IS_LINUX
    return memalign(alignment, size);
 #endif
 #ifdef ISPC_IS_APPLE
    void *mem = malloc(size + (alignment-1) + sizeof(void*));
    char *amem = ((char*)mem) + sizeof(void*);
    amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
                                        (alignment - 1)));
    ((void**)amem)[-1] = mem;
    return amem;
 #endif
 }
 static void usage(int ret) {
    fprintf(stderr, "usage: ispc_test\n");
    fprintf(stderr, "\t[-h/--help]\tprint help\n");
    fprintf(stderr, "\t[-f]\t\tindicates that test is expected to fail\n");
    fprintf(stderr, "\t<files>\n");
    exit(ret);
 }
@@ -135,17 +165,6 @@ double Log(double x) { return log(x); }
 static bool lRunTest(const char *fn) {
    llvm::LLVMContext *ctx = new llvm::LLVMContext;
 #ifdef LLVM_2_8
    std::string err;
    llvm::MemoryBuffer *buf = llvm::MemoryBuffer::getFileOrSTDIN(fn, &err);
    if (!buf) {
        fprintf(stderr, "Unable to open file \"%s\": %s\n", fn, err.c_str());
        delete ctx;
        return false;
    }
    std::string bcErr;
    llvm::Module *module = llvm::ParseBitcodeFile(buf, *ctx, &bcErr);
 #else
    llvm::OwningPtr<llvm::MemoryBuffer> buf;
    llvm::error_code err = llvm::MemoryBuffer::getFileOrSTDIN(fn, buf);
    if (err) {
@@ -155,7 +174,6 @@ static bool lRunTest(const char *fn) {
    }
    std::string bcErr;
    llvm::Module *module = llvm::ParseBitcodeFile(buf.get(), *ctx, &bcErr);
 #endif
    if (!module) {
        fprintf(stderr, "Bitcode reader failed for \"%s\": %s\n", fn, bcErr.c_str());
@@ -164,7 +182,21 @@ static bool lRunTest(const char *fn) {
    }
    std::string eeError;
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn)
    llvm::EngineBuilder engineBuilder(module);
    engineBuilder.setErrorStr(&eeError);
    engineBuilder.setEngineKind(llvm::EngineKind::JIT);
 #if 0
    std::vector<std::string> attributes;
    if (target != NULL && !strcmp(target, "avx"))
        attributes.push_back("+avx");
    engineBuilder.setMAttrs(attributes);
    engineBuilder.setUseMCJIT(true);
 #endif
    llvm::ExecutionEngine *ee = engineBuilder.create();
 #else
    llvm::ExecutionEngine *ee = llvm::ExecutionEngine::createJIT(module, &eeError);
 #endif
    if (!ee) {
        fprintf(stderr, "Unable to create ExecutionEngine: %s\n", eeError.c_str());
        return false;
@@ -176,10 +208,7 @@ static bool lRunTest(const char *fn) {
        ee->addGlobalMapping(func, (void *)FUNC)
    DO_FUNC(ISPCLaunch, "ISPCLaunch");
    DO_FUNC(ISPCSync, "ISPCSync");
-#ifdef ISPC_IS_WINDOWS
+    DO_FUNC(ISPCAlloc, "ISPCAlloc");
    DO_FUNC(ISPCMalloc, "ISPCMalloc");
    DO_FUNC(ISPCFree, "ISPCFree");
 #endif // ISPC_IS_WINDOWS
    DO_FUNC(putchar, "putchar");
    DO_FUNC(printf, "printf");
    DO_FUNC(fflush, "fflush");
@@ -246,7 +275,6 @@ static bool lRunTest(const char *fn) {
    float result[16];
    for (int i = 0; i < 16; ++i)
        result[i] = 0;
    bool ok = true;
    if (foundResult) {
        typedef void (*PFN)(float *);
        PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
@@ -303,15 +331,15 @@ static bool lRunTest(const char *fn) {
    }
    else {
        fprintf(stderr, "Unable to find runnable function in file \"%s\"\n", fn);
-        ok = false;
+        return false;
    }
    // see if we got the right result
-    if (ok) {
+    bool resultsMatch = true;
    if (foundResult) {
        for (int i = 0; i < width; ++i)
            if (returned[i] != result[i]) {
-                    ok = false;
+                resultsMatch = false;
                fprintf(stderr, "Test \"%s\" RETURNED %d: %g / %a EXPECTED %g / %a\n",
                        fn, i, returned[i], returned[i], result[i], result[i]);
            }
@@ -321,32 +349,31 @@ static bool lRunTest(const char *fn) {
            fprintf(stderr, "Test \"%s\" returned %d: %g / %a\n",
                    fn, i, returned[i], returned[i]);
    }
-    }
+    if (foundResult && shouldFail && resultsMatch)
        fprintf(stderr, "Test %s unexpectedly passed\n", fn);
    delete ee;
    delete ctx;
-    return ok && foundResult;
+    return foundResult && resultsMatch;
 }
 int main(int argc, char *argv[]) {
    llvm::InitializeNativeTarget();
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn)
    LLVMLinkInJIT();
 #endif
-    std::vector<const char *> files;
+    const char *filename = NULL;
    for (int i = 1; i < argc; ++i) {
        if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
            usage(0);
        if (!strcmp(argv[i], "-f"))
            shouldFail = true;
        else
-            files.push_back(argv[i]);
+            filename = argv[i];
    }
-    int passes = 0, fails = 0;
+    return (lRunTest(filename) == true) ? 0 : 1;
    for (unsigned int i = 0; i < files.size(); ++i) {
        if (lRunTest(files[i])) ++passes;
        else ++fails;
    }
    if (fails > 0)
        fprintf(stderr, "%d/%d tests passed\n", passes, passes+fails);
    return fails > 0;
 }
--- a/ispc_test.vcxproj
+++ b/ispc_test.vcxproj
@@ -52,14 +52,14 @@
      </PrecompiledHeader>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>ISPC_IS_WINDOWS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>LLVM_3_0;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)/lib</AdditionalLibraryDirectories>
-      <AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Desc.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
@@ -70,7 +70,7 @@
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>ISPC_IS_WINDOWS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>LLVM_3_0;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
@@ -79,7 +79,7 @@
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
      <AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)/lib</AdditionalLibraryDirectories>
-      <AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Desc.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
--- a/main.cpp
+++ b/main.cpp
@@ -40,10 +40,14 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <llvm/Support/PrettyStackTrace.h>
 #ifdef LLVM_2_8
 #include <llvm/System/Signals.h>
 #else
 #include <llvm/Support/Signals.h>
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn)
  #include <llvm/Support/TargetRegistry.h>
  #include <llvm/Support/TargetSelect.h>
 #else
  #include <llvm/Target/TargetRegistry.h>
  #include <llvm/Target/TargetSelect.h>
  #include <llvm/Target/SubtargetFeature.h>
 #endif
 #ifdef ISPC_IS_WINDOWS
@@ -53,36 +57,36 @@
 #endif // ISPC_IS_WINDOWS
 static void usage(int ret) {
-    printf("This is the Intel(r) SPMD Program Compiler (ispc), build %s (%s)\n\n", BUILD_DATE, BUILD_VERSION);
+    printf("This is the Intel(r) SPMD Program Compiler (ispc), build %s (%s)\n\n", 
           BUILD_DATE, BUILD_VERSION);
    printf("usage: ispc\n");
-    printf("    [--arch={x86,x86-64}]\t\tSelect target architecture\n");
+    printf("    [--arch={%s}]\t\tSelect target architecture\n", 
           Target::SupportedTargetArchs());
    printf("    [--cpu=<cpu>]\t\t\tSelect target CPU type\n");
-    printf("         (atom, barcelona, core2, corei7, corei7-avx, istanbul, nocona,\n");
+    printf("         <cpu>={%s}\n", Target::SupportedTargetCPUs());
-    printf("          penryn, westmere)\n");
+    printf("    [-D<foo>]\t\t\t\t#define given value when running preprocessor\n");
 #ifndef ISPC_IS_WINDOWS
    printf("    [-D<foo>]\t\t\t\t#define value when running preprocessor\n");
 #endif
    printf("    [--debug]\t\t\t\tPrint information useful for debugging ispc\n");
    printf("    [--emit-asm]\t\t\tGenerate assembly language file as output\n");
    printf("    [--emit-llvm]\t\t\tEmit LLVM bitode file as output\n");
-    printf("    [--emit-obj]\t\t\tGenerate object file file as output\n");
+    printf("    [--emit-obj]\t\t\tGenerate object file file as output (default)\n");
    printf("    [--fast-math]\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n");
    printf("    [-g]\t\t\t\tGenerate debugging information\n");
    printf("    [--help]\t\t\t\tPrint help\n");
-    printf("    [-h] <name>\t\t\t\tOutput filename for header\n");
+    printf("    [-h <name>/--header-outfile=<name>]\tOutput filename for header\n");
    printf("    [--instrument]\t\t\tEmit instrumentation to gather performance data\n");
    printf("    [--math-lib=<option>]\t\tSelect math library\n");
    printf("        default\t\t\t\tUse ispc's built-in math functions\n");
    printf("        fast\t\t\t\tUse high-performance but lower-accuracy math functions\n");
-    printf("        svml\t\t\t\tUse the Intel SVML math libraries\n");
+    printf("        svml\t\t\t\tUse the Intel(r) SVML math libraries\n");
    printf("        system\t\t\t\tUse the system's math library (*may be quite slow*)\n");
    printf("    [--nostdlib]\t\t\tDon't make the ispc standard library available\n");
 #ifndef ISPC_IS_WINDOWS
    printf("    [--nocpp]\t\t\t\tDon't run the C preprocessor\n");
-#endif
+    printf("    [-o <name>/--outfile=<name>]\tOutput filename (may be \"-\" for standard output)\n");
-    printf("    [-o/--outfile] <name>\t\tOutput filename for bitcode (may be \"-\" for standard output)\n");
+    printf("    [-O0/-O1]\t\t\t\tSet optimization level (-O1 is default)\n");
    printf("    [-O0/-O1]\t\t\t\tSet optimization level\n");
    printf("    [--opt=<option>]\t\t\tSet optimization option\n");
    printf("        disable-loop-unroll\t\tDisable loop unrolling.\n");
    printf("        fast-masked-vload\t\tFaster masked vector loads on SSE (may go past end of array)\n");
    printf("        fast-math\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n");
 #if 0
    printf("        disable-blended-masked-stores\t\tScalarize masked stores on SSE (vs. using vblendps)\n");
    printf("        disable-coherent-control-flow\t\tDisable coherent control flow optimizations\n");
    printf("        disable-uniform-control-flow\t\tDisable uniform control flow optimizations\n");
@@ -91,11 +95,11 @@ static void usage(int ret) {
    printf("        disable-gather-scatter-flattening\tDisable flattening when all lanes are on\n");
    printf("        disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n");
    printf("        disable-masked-store-optimizations\tDisable lowering to regular stores when possible\n");
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+#endif
-    printf("    [--target={sse2,sse4,sse4x2,avx}] Select target ISA (SSE4 is default unless compiling for atom; then SSE2 is.)\n");
+#ifndef ISPC_IS_WINDOWS
-#else
+    printf("    [--pic]\t\t\t\tGenerate position-independent code\n");
-    printf("    [--target={sse2,sse4,sse4x2}] Select target ISA (SSE4 is default unless compiling for atom; then SSE2 is.)\n");
+#endif // !ISPC_IS_WINDOWS
-#endif // LLVM 3.0
+    printf("    [--target=<isa>]\t\t\tSelect target ISA. <isa>={%s}\n", Target::SupportedTargetISAs());
    printf("    [--version]\t\t\t\tPrint ispc version\n");
    printf("    [--woff]\t\t\t\tDisable warnings\n");
    printf("    [--wno-perf]\t\t\tDon't issue warnings related to performance-related issues\n");
@@ -103,35 +107,6 @@ static void usage(int ret) {
    exit(ret);
 }
 /** Given a target name string, set initialize the global g->target
    structure appropriately. 
 */
 static void lDoTarget(const char *target) {
    if (!strcasecmp(target, "sse2")) {
        g->target.isa = Target::SSE2;
        g->target.nativeVectorWidth = 4;
        g->target.vectorWidth = 4;
    }
    else if (!strcasecmp(target, "sse4")) {
        g->target.isa = Target::SSE4;
        g->target.nativeVectorWidth = 4;
        g->target.vectorWidth = 4;
    }
    else if (!strcasecmp(target, "sse4x2")) {
        g->target.isa = Target::SSE4;
        g->target.nativeVectorWidth = 4;
        g->target.vectorWidth = 8;
    }
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn)
    else if (!strcasecmp(target, "avx")) {
        g->target.isa = Target::AVX;
        g->target.nativeVectorWidth = 8;
        g->target.vectorWidth = 8;
    }
 #endif // LLVM 3.0
    else
        usage(1);
 }
 /** We take arguments from both the command line as well as from the
@@ -190,6 +165,16 @@ int main(int Argc, char *Argv[]) {
    llvm::sys::PrintStackTraceOnErrorSignal();
    llvm::PrettyStackTraceProgram X(argc, argv);
    // initialize available LLVM targets
    LLVMInitializeX86TargetInfo();
    LLVMInitializeX86Target();
    LLVMInitializeX86AsmPrinter();
    LLVMInitializeX86AsmParser();
    LLVMInitializeX86Disassembler();
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn)
    LLVMInitializeX86TargetMC();
 #endif
    char *file = NULL;
    const char *headerFileName = NULL;
    const char *outFileName = NULL;
@@ -198,28 +183,29 @@ int main(int Argc, char *Argv[]) {
    // as we're parsing below
    g = new Globals;
-    bool debugSet = false, optSet = false, targetSet = false;
+    bool debugSet = false, optSet = false;
    Module::OutputType ot = Module::Object;
    bool generatePIC = false;
    const char *arch = NULL, *cpu = NULL, *target = NULL;
    for (int i = 1; i < argc; ++i) {
        if (!strcmp(argv[i], "--help"))
            usage(0);
-#ifndef ISPC_IS_WINDOWS
+        else if (!strncmp(argv[i], "-D", 2))
        else if (!strncmp(argv[i], "-D", 2)) {
            g->cppArgs.push_back(argv[i]);
-        }
+        else if (!strncmp(argv[i], "--arch=", 7))
-#endif // !ISPC_IS_WINDOWS
+            arch = argv[i] + 7;
        else if (!strncmp(argv[i], "--arch=", 7)) {
            g->target.arch = argv[i] + 7;
            if (g->target.arch == "x86")
                g->target.is32bit = true;
            else if (g->target.arch == "x86-64")
                g->target.is32bit = false;
        }
        else if (!strncmp(argv[i], "--cpu=", 6))
-            g->target.cpu = argv[i] + 6;
+            cpu = argv[i] + 6;
-        else if (!strcmp(argv[i], "--fast-math"))
+        else if (!strcmp(argv[i], "--fast-math")) {
-            g->opt.fastMath = true;
+            fprintf(stderr, "--fast-math option has been renamed to --opt=fast-math!\n");
            usage(1);
        }
        else if (!strcmp(argv[i], "--fast-masked-vload")) {
            fprintf(stderr, "--fast-masked-vload option has been renamed to "
                    "--opt=fast-masked-vload!\n");
            usage(1);
        }
        else if (!strcmp(argv[i], "--debug"))
            g->debugPrint = true;
        else if (!strcmp(argv[i], "--instrument"))
@@ -235,14 +221,12 @@ int main(int Argc, char *Argv[]) {
        else if (!strcmp(argv[i], "--emit-obj"))
            ot = Module::Object;
        else if (!strcmp(argv[i], "--target")) {
            // FIXME: should remove this way of specifying the target...
            if (++i == argc) usage(1);
-            lDoTarget(argv[i]);
+            target = argv[i];
            targetSet = true;
        }
        else if (!strncmp(argv[i], "--target=", 9)) {
            const char *target = argv[i] + 9;
            lDoTarget(target);
        }
        else if (!strncmp(argv[i], "--target=", 9))
            target = argv[i] + 9;
        else if (!strncmp(argv[i], "--math-lib=", 11)) {
            const char *lib = argv[i] + 11;
            if (!strcmp(lib, "default"))
@@ -258,7 +242,16 @@ int main(int Argc, char *Argv[]) {
        }
        else if (!strncmp(argv[i], "--opt=", 6)) {
            const char *opt = argv[i] + 6;
-            if (!strcmp(opt, "disable-blended-masked-stores"))
+            if (!strcmp(opt, "fast-math"))
                g->opt.fastMath = true;
            else if (!strcmp(opt, "fast-masked-vload"))
                g->opt.fastMaskedVload = true;
            else if (!strcmp(opt, "disable-loop-unroll"))
                g->opt.unrollLoops = false;
            // These are only used for performance tests of specific
            // optimizations
            else if (!strcmp(opt, "disable-blended-masked-stores"))
                g->opt.disableBlendedMaskedStores = true;
            else if (!strcmp(opt, "disable-coherent-control-flow"))
                g->opt.disableCoherentControlFlow = true;
@@ -283,14 +276,19 @@ int main(int Argc, char *Argv[]) {
        }
        else if (!strcmp(argv[i], "--wno-perf") || !strcmp(argv[i], "-wno-perf"))
            g->emitPerfWarnings = false;
-        else if (!strcmp(argv[i], "-o") || !strcmp(argv[i], "--outfile")) {
+        else if (!strcmp(argv[i], "-o")) {
            if (++i == argc) usage(1);
            outFileName = argv[i];
        }
-        else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--header-outfile")) {
+        else if (!strcmp(argv[i], "--outfile="))
            outFileName = argv[i] + strlen("--outfile=");
        else if (!strcmp(argv[i], "-h")) {
            if (++i == argc) usage(1);
            headerFileName = argv[i];
        }
        else if (!strcmp(argv[i], "--header-outfile=")) {
            headerFileName = argv[i] + strlen("--header-outfile=");
        }
        else if (!strcmp(argv[i], "-O0")) {
            g->opt.level = 0;
            optSet = true;
@@ -306,6 +304,10 @@ int main(int Argc, char *Argv[]) {
            g->includeStdlib = false;
        else if (!strcmp(argv[i], "--nocpp"))
            g->runCPP = false;
 #ifndef ISPC_IS_WINDOWS
        else if (!strcmp(argv[i], "--pic"))
            generatePIC = true;
 #endif // !ISPC_IS_WINDOWS
        else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version")) {
            printf("Intel(r) SPMD Program Compiler (ispc) build %s (%s)\n", 
                   BUILD_DATE, BUILD_VERSION);
@@ -327,10 +329,8 @@ int main(int Argc, char *Argv[]) {
    if (debugSet && !optSet)
        g->opt.level = 0;
-    // Make SSE2 the default target on atom unless the target has been set
+    if (!Target::GetTarget(arch, cpu, target, generatePIC, &g->target))
-    // explicitly.
+        usage(1);
    if (!targetSet && (g->target.cpu == "atom"))
        lDoTarget("sse2");
    m = new Module(file);
    if (m->CompileFile() == 0) {
--- a/module.cpp
+++ b/module.cpp
@@ -72,23 +72,17 @@
 #include <llvm/Support/FormattedStream.h>
 #include <llvm/Support/FileUtilities.h>
 #include <llvm/Target/TargetMachine.h>
 #include <llvm/Target/TargetRegistry.h>
 #include <llvm/Target/TargetSelect.h>
 #include <llvm/Target/TargetOptions.h>
 #include <llvm/Target/TargetData.h>
 #include <llvm/Target/SubtargetFeature.h>
 #include <llvm/PassManager.h>
 #include <llvm/Analysis/Verifier.h>
 #include <llvm/Support/CFG.h>
 #include <clang/Frontend/CompilerInstance.h>
 #include <clang/Frontend/TextDiagnosticPrinter.h>
 #include <clang/Frontend/Utils.h>
 #include <clang/Basic/TargetInfo.h>
 #ifndef LLVM_2_8
 #include <llvm/Support/ToolOutputFile.h>
 #include <llvm/Support/Host.h>
 #else // !LLVM_2_8
 #include <llvm/System/Host.h>
 #endif // LLVM_2_8
 #include <llvm/Assembly/PrintModulePass.h>
 #include <llvm/Support/raw_ostream.h>
 #include <llvm/Bitcode/ReaderWriter.h>
@@ -107,51 +101,13 @@ Module::Module(const char *fn) {
    symbolTable = new SymbolTable;
    module = new llvm::Module(filename ? filename : "<stdin>", *g->ctx);
-    // initialize target in module
+    module->setTargetTriple(g->target.GetTripleString());
    llvm::InitializeAllTargets();
    llvm::Triple triple;
    // Start with the host triple as the default
    triple.setTriple(llvm::sys::getHostTriple());
    if (g->target.arch != "") {
        // If the user specified a target architecture, see if it's a known
        // one; print an error with the valid ones otherwise.
        const llvm::Target *target = NULL;
        for (llvm::TargetRegistry::iterator iter = llvm::TargetRegistry::begin();
             iter != llvm::TargetRegistry::end(); ++iter) {
            if (g->target.arch == iter->getName()) {
                target = &*iter;
                break;
            }
        }
        if (!target) {
            fprintf(stderr, "Invalid target \"%s\"\nOptions: ", 
                    g->target.arch.c_str());
            llvm::TargetRegistry::iterator iter;
            for (iter = llvm::TargetRegistry::begin();
                 iter != llvm::TargetRegistry::end(); ++iter)
                fprintf(stderr, "%s ", iter->getName());
            fprintf(stderr, "\n");
            exit(1);
        }
        // And override the arch in the host triple
        llvm::Triple::ArchType archType = 
            llvm::Triple::getArchTypeForLLVMName(g->target.arch);
        if (archType != llvm::Triple::UnknownArch)
            triple.setArch(archType);
    }
    module->setTargetTriple(triple.str());
 #ifndef LLVM_2_8
    if (g->generateDebuggingSymbols)
        diBuilder = new llvm::DIBuilder(*module);
    else
        diBuilder = NULL;
 #endif // LLVM_2_8
 #ifndef LLVM_2_8
    // If we're generating debugging symbols, let the DIBuilder know that
    // we're starting a new compilation unit.
    if (diBuilder != NULL) {
@@ -177,7 +133,6 @@ Module::Module(const char *fn) {
                                         0 /* run time version */);
        }
    }
 #endif // LLVM_2_8
 }
@@ -191,6 +146,9 @@ extern void yy_delete_buffer(YY_BUFFER_STATE);
 int
 Module::CompileFile() {
    if (g->opt.fastMath == true)
        llvm::UnsafeFPMath = true;
    // FIXME: it'd be nice to do this in the Module constructor, but this
    // function ends up calling into routines that expect the global
    // variable 'm' to be initialized and available (which it isn't until
@@ -495,6 +453,10 @@ Module::AddGlobal(DeclSpecs *ds, Declarator *decl) {
    // declarations, typedefs, and global variables declarations /
    // definitions.  Figure out what we've got and take care of it.
    if (ds == NULL || decl == NULL)
        // Error happened earlier during parsing
        return;
    if (decl->isFunction) {
        // function declaration
        const Type *t = decl->GetType(ds);
@@ -595,7 +557,6 @@ Module::AddGlobal(DeclSpecs *ds, Declarator *decl) {
                                                         decl->sym->name.c_str());
        m->symbolTable->AddVariable(decl->sym);
 #ifndef LLVM_2_8
        if (diBuilder && (ds->storageClass != SC_EXTERN)) {
            llvm::DIFile file = decl->pos.GetDIFile();
            diBuilder->createGlobalVariable(decl->sym->name, 
@@ -605,7 +566,6 @@ Module::AddGlobal(DeclSpecs *ds, Declarator *decl) {
                                            (ds->storageClass == SC_STATIC),
                                            decl->sym->storagePtr);
        }
 #endif // LLVM_2_8
    }
 }
@@ -667,6 +627,8 @@ lEmitFunctionCode(FunctionEmitContext *ctx, llvm::Function *function,
        llvm::Value *structParamPtr = argIter++;
        llvm::Value *threadIndex = argIter++;
        llvm::Value *threadCount = argIter++;
        llvm::Value *taskIndex = argIter++;
        llvm::Value *taskCount = argIter++;
        // Copy the function parameter values from the structure into local
        // storage
@@ -694,13 +656,17 @@ lEmitFunctionCode(FunctionEmitContext *ctx, llvm::Function *function,
        threadCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "threadCount");
        ctx->StoreInst(threadCount, threadCountSym->storagePtr);
-#ifdef ISPC_IS_WINDOWS
+        // Copy taskIndex and taskCount into stack-allocated storage so
-        // On Windows, we dynamically-allocate space for the task arguments
+        // that their symbols point to something reasonable.
-        // (see FunctionEmitContext::LaunchInst().)  Here is where we emit
+        Symbol *taskIndexSym = m->symbolTable->LookupVariable("taskIndex");
-        // the code to free that memory, now that we've copied the
+        assert(taskIndexSym);
-        // parameter values out of the structure.
+        taskIndexSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex");
-        ctx->EmitFree(structParamPtr);
+        ctx->StoreInst(taskIndex, taskIndexSym->storagePtr);
-#endif // ISPC_IS_WINDOWS
+
        Symbol *taskCountSym = m->symbolTable->LookupVariable("taskCount");
        assert(taskCountSym);
        taskCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount");
        ctx->StoreInst(taskCount, taskCountSym->storagePtr);
    }
    else {
        // Regular, non-task function
@@ -738,8 +704,18 @@ lEmitFunctionCode(FunctionEmitContext *ctx, llvm::Function *function,
    // Finally, we can generate code for the function
    if (code != NULL) {
        int costEstimate = code->EstimateCost();
        bool checkMask = (ft->isTask == true) || 
-            (function->hasFnAttr(llvm::Attribute::AlwaysInline) == false);
+            ((function->hasFnAttr(llvm::Attribute::AlwaysInline) == false) &&
             costEstimate > CHECK_MASK_AT_FUNCTION_START_COST);
        Debug(code->pos, "Estimated cost for function \"%s\" = %d\n", 
              funSym->name.c_str(), costEstimate);
        // If the body of the function is non-trivial, then we wrap the
        // entire thing around a varying "cif (true)" test in order to reap
        // the side-effect benefit of checking to see if the execution mask
        // is all on and thence having a specialized code path for that
        // case.  If this is a simple function, then this isn't worth the
        // code bloat / overhead.
        if (checkMask) {
            bool allTrue[ISPC_MAX_NVEC];
            for (int i = 0; i < g->target.vectorWidth; ++i)
@@ -887,6 +863,11 @@ Module::AddFunction(DeclSpecs *ds, Declarator *decl, Stmt *code) {
 bool
 Module::WriteOutput(OutputType outputType, const char *outFileName) {
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn)
    if (diBuilder != NULL && outputType != Header)
        diBuilder->finalize();
 #endif // LLVM_3_0
    // First, issue a warning if the output file suffix and the type of
    // file being created seem to mismatch.  This can help catch missing
    // command-line arguments specifying the output file type.
@@ -947,12 +928,7 @@ Module::WriteOutput(OutputType outputType, const char *outFileName) {
            return true;
        }
        else {
 #ifdef LLVM_2_8
            fprintf(stderr, "Direct object file emission not supported in this build.\n");
            return false;
 #else
            return writeObjectFileOrAssembly(outputType, outFileName);
 #endif // LLVM_2_8
        }
    }
 }
@@ -960,43 +936,7 @@ Module::WriteOutput(OutputType outputType, const char *outFileName) {
 bool
 Module::writeObjectFileOrAssembly(OutputType outputType, const char *outFileName) {
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+    llvm::TargetMachine *targetMachine = g->target.GetTargetMachine();
    llvm::InitializeAllTargetMCs();
 #endif
    llvm::InitializeAllAsmPrinters();
    llvm::InitializeAllAsmParsers();
    llvm::Triple triple(module->getTargetTriple());
    assert(triple.getTriple().empty() == false);
    const llvm::Target *target = NULL;
    std::string error;
    target = llvm::TargetRegistry::lookupTarget(triple.getTriple(), error);
    assert(target != NULL);
    std::string featuresString;
    llvm::TargetMachine *targetMachine = NULL;
 #if defined LLVM_3_0svn || defined LLVM_3_0
    if (g->target.isa == Target::AVX)
        featuresString = "+avx";
    targetMachine = target->createTargetMachine(triple.getTriple(), g->target.cpu,
                                                featuresString);
 #else
    if (g->target.cpu.size()) {
        llvm::SubtargetFeatures features;
        features.setCPU(g->target.cpu);
        featuresString = features.getString();
    }
    targetMachine = target->createTargetMachine(triple.getTriple(), 
                                                featuresString);
 #endif
    if (targetMachine == NULL) {
        fprintf(stderr, "Unable to create target machine for target \"%s\"!",
                triple.str().c_str());
        return false;
    }
    targetMachine->setAsmVerbosityDefault(true);
    // Figure out if we're generating object file or assembly output, and
    // set binary output for object files
@@ -1005,6 +945,7 @@ Module::writeObjectFileOrAssembly(OutputType outputType, const char *outFileName
    bool binary = (fileType == llvm::TargetMachine::CGFT_ObjectFile);
    unsigned int flags = binary ? llvm::raw_fd_ostream::F_Binary : 0;
    std::string error;
    llvm::tool_output_file *of = new llvm::tool_output_file(outFileName, error, flags);
    if (error.size()) {
        fprintf(stderr, "Error opening output file \"%s\".\n", outFileName);
@@ -1022,9 +963,8 @@ Module::writeObjectFileOrAssembly(OutputType outputType, const char *outFileName
        (g->opt.level > 0) ? llvm::CodeGenOpt::Aggressive : llvm::CodeGenOpt::None;
    if (targetMachine->addPassesToEmitFile(pm, fos, fileType, optLevel)) {
-        fprintf(stderr, "Fatal error adding passes to emit object file for "
+        fprintf(stderr, "Fatal error adding passes to emit object file!");
-                "target %s!\n", triple.str().c_str());
+        exit(1);
        return false;
    }
    // Finally, run the passes to emit the object file/assembly
@@ -1190,6 +1130,12 @@ lEmitVectorTypedefs(const std::vector<const VectorType *> &types, FILE *file) {
    for (unsigned int i = 0; i < types.size(); ++i) {
        std::string baseDecl;
        const VectorType *vt = types[i]->GetAsNonConstType();
        if (!vt->IsUniformType())
            // Varying stuff shouldn't be visible to / used by the
            // application, so at least make it not simple to access it by
            // not declaring the type here...
            continue;
        int size = vt->GetElementCount();
        baseDecl = vt->GetBaseType()->GetCDeclaration("");
@@ -1362,6 +1308,7 @@ Module::writeHeader(const char *fn) {
    default:
        FATAL("Unhandled target in header emission");
    }
    fprintf(f, "#define ISPC_TARGET_VECTOR_WIDTH %d\n", g->target.vectorWidth);
    fprintf(f, "#ifdef __cplusplus\nnamespace ispc {\n#endif // __cplusplus\n\n");
@@ -1398,14 +1345,6 @@ Module::writeHeader(const char *fn) {
    lEmitEnumDecls(exportedEnumTypes, f);
    lEmitStructDecls(exportedStructTypes, f);
    // emit externs for globals
    if (externGlobals.size() > 0) {
        fprintf(f, "///////////////////////////////////////////////////////////////////////////\n");
        fprintf(f, "// Globals declared \"extern\" from ispc code\n");
        fprintf(f, "///////////////////////////////////////////////////////////////////////////\n");
        lPrintExternGlobals(f, externGlobals);
    }
    // emit function declarations for exported stuff...
    if (exportedFuncs.size() > 0) {
        fprintf(f, "\n");
@@ -1427,6 +1366,15 @@ Module::writeHeader(const char *fn) {
    // end namespace
    fprintf(f, "\n#ifdef __cplusplus\n}\n#endif // __cplusplus\n");
    // and only now emit externs for globals, outside of the ispc namespace
    if (externGlobals.size() > 0) {
        fprintf(f, "\n");
        fprintf(f, "///////////////////////////////////////////////////////////////////////////\n");
        fprintf(f, "// Globals declared \"extern\" from ispc code\n");
        fprintf(f, "///////////////////////////////////////////////////////////////////////////\n");
        lPrintExternGlobals(f, externGlobals);
    }
    // end guard
    fprintf(f, "\n#endif // %s\n", guard.c_str());
@@ -1442,23 +1390,26 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre
    std::string error;
    inst.createFileManager();
    inst.createDiagnostics(0, NULL);
    clang::TargetOptions& options = inst.getTargetOpts();
    llvm::raw_fd_ostream stderrRaw(2, false);
    clang::TextDiagnosticPrinter *diagPrinter = 
        new clang::TextDiagnosticPrinter(stderrRaw, clang::DiagnosticOptions());
    inst.createDiagnostics(0, NULL, diagPrinter);
    clang::TargetOptions &options = inst.getTargetOpts();
    llvm::Triple triple(module->getTargetTriple());
    if (triple.getTriple().empty())
        triple.setTriple(llvm::sys::getHostTriple());
    options.Triple = triple.getTriple();
-    clang::TargetInfo* target 
+    clang::TargetInfo *target =
-        = clang::TargetInfo::CreateTargetInfo(inst.getDiagnostics(), options);
+        clang::TargetInfo::CreateTargetInfo(inst.getDiagnostics(), options);
    inst.setTarget(target);
    inst.createSourceManager(inst.getFileManager());
    inst.InitializeSourceManager(infilename);
-    clang::PreprocessorOptions& opts = inst.getPreprocessorOpts();
+    clang::PreprocessorOptions &opts = inst.getPreprocessorOpts();
    //Add defs for ISPC and PI
    opts.addMacroDef("ISPC");
@@ -1471,7 +1422,10 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre
        }
    }    
    inst.createPreprocessor();
    clang::LangOptions langOptions;
    diagPrinter->BeginSourceFile(langOptions, &inst.getPreprocessor());
    clang::DoPrintPreprocessedInput(inst.getPreprocessor(),
                                    ostream, inst.getPreprocessorOutputOpts());
    diagPrinter->EndSourceFile();
 }
--- a/module.h
+++ b/module.h
@@ -91,11 +91,8 @@ public:
    /** llvm Module object into which globals and functions are added. */
    llvm::Module *module; 
-#ifndef LLVM_2_8
+    /** The diBuilder manages generating debugging information */
    /** The diBuilder manages generating debugging information (only
        supported in LLVM 2.9 and beyond...) */
    llvm::DIBuilder *diBuilder;
 #endif
    GatherBuffer *gatherBuffer;
--- a/opt.cpp
+++ b/opt.cpp
@@ -55,13 +55,12 @@
 #include <llvm/Instructions.h>
 #include <llvm/Intrinsics.h>
 #include <llvm/Constants.h>
-#ifndef LLVM_2_8
+#include <llvm/Analysis/ConstantFolding.h>
-    #include <llvm/Target/TargetLibraryInfo.h>
+#include <llvm/Target/TargetLibraryInfo.h>
-    #ifdef LLVM_2_9
+#ifdef LLVM_2_9
    #include <llvm/Support/StandardPasses.h>
-    #else
+#else
-        #include <llvm/Support/PassManagerBuilder.h>
+    #include <llvm/Transforms/IPO/PassManagerBuilder.h>
    #endif // LLVM_2_9
 #endif // LLVM_2_8
 #include <llvm/ADT/Triple.h>
 #include <llvm/Transforms/Scalar.h>
@@ -69,13 +68,18 @@
 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
 #include <llvm/Target/TargetOptions.h>
 #include <llvm/Target/TargetData.h>
 #include <llvm/Target/TargetMachine.h>
 #include <llvm/Analysis/Verifier.h>
 #include <llvm/Support/raw_ostream.h>
 #ifndef LLVM_2_8
 #include <llvm/Analysis/DIBuilder.h>
 #endif
 #include <llvm/Analysis/DebugInfo.h>
 #include <llvm/Support/Dwarf.h>
 #ifdef ISPC_IS_LINUX
  #include <alloca.h>
 #elif defined(ISPC_IS_WINDOWS)
  #include <malloc.h>
  #define alloca _alloca
 #endif // ISPC_IS_WINDOWS
 static llvm::Pass *CreateIntrinsicsOptPass();
 static llvm::Pass *CreateGatherScatterFlattenPass();
@@ -178,19 +182,22 @@ Optimize(llvm::Module *module, int optLevel) {
    llvm::PassManager optPM;
    llvm::FunctionPassManager funcPM(module);
 #ifndef LLVM_2_8
    llvm::TargetLibraryInfo *targetLibraryInfo =
        new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple()));
    optPM.add(targetLibraryInfo);
 #endif
    optPM.add(new llvm::TargetData(module));
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn)
    optPM.add(llvm::createIndVarSimplifyPass());
 #endif
    if (optLevel == 0) {
        // This is more or less the minimum set of optimizations that we
        // need to do to generate code that will actually run.  (We can't
        // run absolutely no optimizations, since the front-end needs us to
        // take the various __pseudo_* functions it has emitted and turn
        // them into something that can actually execute.)
        optPM.add(llvm::createPromoteMemoryToRegisterPass());
        optPM.add(CreateGatherScatterFlattenPass());
        optPM.add(CreateLowerGatherScatterPass());
        optPM.add(CreateLowerMaskedStorePass());
@@ -211,7 +218,6 @@ Optimize(llvm::Module *module, int optLevel) {
        // only later in the optimization process as things like constant
        // propagation have done their thing, and then when they do kick
        // in, they can often open up new opportunities for optimization...
 #ifndef LLVM_2_8
        llvm::PassRegistry *registry = llvm::PassRegistry::getPassRegistry();
        llvm::initializeCore(*registry);
        llvm::initializeScalarOpts(*registry);
@@ -222,7 +228,7 @@ Optimize(llvm::Module *module, int optLevel) {
        llvm::initializeInstCombine(*registry);
        llvm::initializeInstrumentation(*registry);
        llvm::initializeTarget(*registry);
-#endif
+
        // Early optimizations to try to reduce the total amount of code to
        // work with if we can
        optPM.add(CreateGatherScatterFlattenPass());
@@ -279,13 +285,11 @@ Optimize(llvm::Module *module, int optLevel) {
        optPM.add(llvm::createConstantPropagationPass());
        optPM.add(CreateIntrinsicsOptPass());
-#if defined(LLVM_2_8)
+#if defined(LLVM_2_9)
        optPM.add(CreateIsCompileTimeConstantPass(true));
 #elif defined(LLVM_2_9)
        llvm::createStandardModulePasses(&optPM, 3, 
                                         false /* opt size */,
                                         true /* unit at a time */, 
-                                         false /* unroll loops */,
+                                         g->opt.unrollLoops,
                                         true /* simplify lib calls */,
                                         false /* may have exceptions */,
                                         llvm::createFunctionInliningPass());
@@ -300,7 +304,7 @@ Optimize(llvm::Module *module, int optLevel) {
        llvm::createStandardModulePasses(&optPM, 3, 
                                         false /* opt size */,
                                         true /* unit at a time */, 
-                                         false /* unroll loops */,
+                                         g->opt.unrollLoops,
                                         true /* simplify lib calls */,
                                         false /* may have exceptions */,
                                         llvm::createFunctionInliningPass());
@@ -309,6 +313,8 @@ Optimize(llvm::Module *module, int optLevel) {
        llvm::PassManagerBuilder builder;
        builder.OptLevel = 3;
        builder.Inliner = llvm::createFunctionInliningPass();
        if (g->opt.unrollLoops == false)
            builder.DisableUnrollLoops = true;
        builder.populateFunctionPassManager(funcPM);
        builder.populateModulePassManager(optPM);
        optPM.add(CreateIsCompileTimeConstantPass(true));
@@ -421,8 +427,11 @@ IntrinsicsOpt::IntrinsicsOpt()
    blendInstructions.push_back(BlendInstruction(
        llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse41_blendvps),
        0xf, 0, 1, 2));
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn)
    blendInstructions.push_back(BlendInstruction(
-        m->module->getFunction("llvm.x86.avx.blendvps"), 0xff, 0, 1, 2));
+        llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_blendv_ps_256),
        0xff, 0, 1, 2));
 #endif
 }
@@ -469,8 +478,18 @@ lGetMask(llvm::Value *factor) {
    else if (llvm::isa<llvm::ConstantAggregateZero>(factor))
        return 0;
    else {
 #if 0
        llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(factor);
        if (ce != NULL) {
            llvm::TargetMachine *targetMachine = g->target.GetTargetMachine();
            const llvm::TargetData *td = targetMachine->getTargetData();
            llvm::Constant *c = llvm::ConstantFoldConstantExpression(ce, td);
            c->dump();
            factor = c;
        }
        // else we should be able to handle it above...
        assert(!llvm::isa<llvm::Constant>(factor));
 #endif
        return -1;
    }
 }
@@ -608,9 +627,10 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
                                          llvm::PointerType::get(returnType, 0), 
                                          "ptr2vec", callInst);
                lCopyMetadata(castPtr, callInst);
                int align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
                llvm::Instruction *loadInst = 
                    new llvm::LoadInst(castPtr, "load", false /* not volatile */,
-                                       0 /* align */, (llvm::Instruction *)NULL);
+                                       align, (llvm::Instruction *)NULL);
                lCopyMetadata(loadInst, callInst);
                llvm::ReplaceInstWithInst(callInst, loadInst);
                modifiedAny = true;
@@ -630,17 +650,21 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
            }
            else if (mask == 0xff) {
                // all lanes storing, so replace with a regular store
-                llvm::Value *rvalue = callInst->getArgOperand(1);
+                llvm::Value *rvalue = callInst->getArgOperand(2);
                llvm::Type *storeType = rvalue->getType();
                llvm::Value *castPtr = 
                    new llvm::BitCastInst(callInst->getArgOperand(0),
                                          llvm::PointerType::get(storeType, 0), 
                                          "ptr2vec", callInst);
                lCopyMetadata(castPtr, callInst);
-                llvm::Instruction *storeInst = 
+
                llvm::StoreInst *storeInst = 
                    new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
                int align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
                storeInst->setAlignment(align);
                lCopyMetadata(storeInst, callInst);
                llvm::ReplaceInstWithInst(callInst, storeInst);
                modifiedAny = true;
                goto restart;
            }
@@ -1416,15 +1440,12 @@ LowerMaskedStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
        llvm::Value *rvalue  = callInst->getArgOperand(1);
        llvm::Value *mask = callInst->getArgOperand(2);
-        // On SSE, we need to choose between doing the load + blend + store
+        // We need to choose between doing the load + blend + store trick,
-        // trick, or serializing the masked store.  On targets with a
+        // or serializing the masked store.  Even on targets with a native
-        // native masked store instruction, the implementations of
+        // masked store instruction, this is preferable since it lets us
-        // __masked_store_blend_* should be the same as __masked_store_*,
+        // keep values in registers rather than going out to the stack.
-        // so this doesn't matter.  On SSE, blending is generally more
+        bool doBlend = (!g->opt.disableBlendedMaskedStores ||
-        // efficient and is always safe to do on stack-allocated values.(?)
+                        lIsStackVariablePointer(lvalue));
        bool doBlend = lIsStackVariablePointer(lvalue);
        if (g->target.isa == Target::SSE4 || g->target.isa == Target::SSE2)
            doBlend |= !g->opt.disableBlendedMaskedStores;
        // Generate the call to the appropriate masked store function and
        // replace the __pseudo_* one with it.
@@ -1502,8 +1523,8 @@ static void lPrintVector(const char *info, llvm::Value *elements[ISPC_MAX_NVEC])
 /** Given an LLVM vector in vec, return a 'scalarized' version of the
-    vector in the provided offsets[] array.  For example, if the vector
+    vector in the provided scalarizedVector[] array.  For example, if the
-    value passed in is:  
+    vector value passed in is:
    add <4 x i32> %a_smear, <4 x i32> <4, 8, 12, 16>,
@@ -1524,28 +1545,39 @@ static void lPrintVector(const char *info, llvm::Value *elements[ISPC_MAX_NVEC])
    @param vec               Vector to be scalarized
    @param scalarizedVector  Array in which to store the individual vector 
                             elements
    @param vectorLength      Number of elements in the given vector. (The
                             passed scalarizedVector array must also be at least
                             this length as well.)
    @returns                 True if the vector was successfully scalarized and
                             the values in scalarizedVector[] are valid; false otherwise
 */
 static bool
-lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC]) {
+lScalarizeVector(llvm::Value *vec, llvm::Value **scalarizedVector,
                 int vectorLength) {
    // First initialize the values of scalarizedVector[] to NULL.
-    for (int i = 0; i < g->target.vectorWidth; ++i)
+    for (int i = 0; i < vectorLength; ++i)
        scalarizedVector[i] = NULL;
    // It may be ok for the vector to be an undef vector; these come up for
    // example in shufflevector instructions.  As long as elements of the
    // undef vector aren't referenced by the shuffle indices, this is fine.
    if (llvm::isa<llvm::UndefValue>(vec))
        return true;
    // ConstantVectors are easy; just pull out the individual constant
    // element values
    llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(vec);
    if (cv != NULL) {
-        for (int i = 0; i < g->target.vectorWidth; ++i)
+        for (int i = 0; i < vectorLength; ++i)
            scalarizedVector[i] = cv->getOperand(i);
        return true;
    }
    // It's also easy if it's just a vector of all zeros
-    llvm::ConstantAggregateZero *caz = llvm::dyn_cast<llvm::ConstantAggregateZero>(vec);
+    llvm::ConstantAggregateZero *caz = 
-    if (caz) {
+        llvm::dyn_cast<llvm::ConstantAggregateZero>(vec);
-        for (int i = 0; i < g->target.vectorWidth; ++i)
+    if (caz != NULL) {
        for (int i = 0; i < vectorLength; ++i)
            scalarizedVector[i] = LLVMInt32(0);
        return true;
    }
@@ -1557,13 +1589,16 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
        // scalar values we return from here are synthesized with scalar
        // versions of the original vector binary operator
        llvm::Instruction::BinaryOps opcode = bo->getOpcode();
-        llvm::Value *v0[ISPC_MAX_NVEC], *v1[ISPC_MAX_NVEC];
+        llvm::Value **v0 = 
            (llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));
        llvm::Value **v1 = 
            (llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));
-        if (!lScalarizeVector(bo->getOperand(0), v0) || 
+        if (!lScalarizeVector(bo->getOperand(0), v0, vectorLength) || 
-            !lScalarizeVector(bo->getOperand(1), v1))
+            !lScalarizeVector(bo->getOperand(1), v1, vectorLength))
            return false;
-        for (int i = 0; i < g->target.vectorWidth; ++i) {
+        for (int i = 0; i < vectorLength; ++i) {
            scalarizedVector[i] = 
                llvm::BinaryOperator::Create(opcode, v0[i], v1[i], "flat_bop", bo);
            lCopyMetadata(scalarizedVector[i], bo);
@@ -1588,7 +1623,7 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
        // value in scalarizedVector[] based on the value being inserted.
        while (ie != NULL) {
            uint64_t iOffset = lGetIntValue(ie->getOperand(2));
-            assert((int)iOffset < g->target.vectorWidth);
+            assert((int)iOffset < vectorLength);
            assert(scalarizedVector[iOffset] == NULL);
            scalarizedVector[iOffset] = ie->getOperand(1);
@@ -1602,15 +1637,17 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
    }
    llvm::CastInst *ci = llvm::dyn_cast<llvm::CastInst>(vec);
-    if (ci) {
+    if (ci != NULL) {
        // Casts are similar to BinaryOperators in that we attempt to
        // scalarize the vector being cast and if successful, we apply
        // equivalent scalar cast operators to each of the values in the
        // scalarized vector.
        llvm::Instruction::CastOps op = ci->getOpcode();
-        llvm::Value *scalarizedTarget[ISPC_MAX_NVEC];
+        llvm::Value **scalarizedTarget = 
-        if (!lScalarizeVector(ci->getOperand(0), scalarizedTarget))
+            (llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));
        if (!lScalarizeVector(ci->getOperand(0), scalarizedTarget,
                              vectorLength))
            return false;
        LLVM_TYPE_CONST llvm::Type *destType = ci->getDestTy();
@@ -1619,7 +1656,7 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
        assert(vectorDestType != NULL);
        LLVM_TYPE_CONST llvm::Type *elementType = vectorDestType->getElementType();
-        for (int i = 0; i < g->target.vectorWidth; ++i) {
+        for (int i = 0; i < vectorLength; ++i) {
            scalarizedVector[i] = 
                llvm::CastInst::Create(op, scalarizedTarget[i], elementType,
                                       "cast", ci);
@@ -1629,16 +1666,11 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
    }
    llvm::ShuffleVectorInst *svi = llvm::dyn_cast<llvm::ShuffleVectorInst>(vec);
-    if (svi) {
+    if (svi != NULL) {
        // Note that the code for shufflevector instructions is untested.
        // (We haven't yet had a case where it needs to run).  Therefore,
        // an assert at the bottom of this routine will hit the first time
        // it runs as a reminder that this needs to be tested further.
        LLVM_TYPE_CONST llvm::VectorType *svInstType = 
            llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(svi->getType());
        assert(svInstType != NULL);
-        assert((int)svInstType->getNumElements() == g->target.vectorWidth);
+        assert((int)svInstType->getNumElements() == vectorLength);
        // Scalarize the two vectors being shuffled.  First figure out how
        // big they are.
@@ -1653,27 +1685,21 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
        int n0 = vectorType0->getNumElements();
        int n1 = vectorType1->getNumElements();
        // FIXME: It's actually totally legitimate for these two to have
        // different sizes; the final result just needs to have the native
        // vector width.  To handle this, not only do we need to
        // potentially dynamically allocate space for the arrays passed
        // into lScalarizeVector, but we need to change the rest of its
        // implementation to not key off g->target.vectorWidth everywhere
        // to get the sizes of the arrays to iterate over, etc.
        assert(n0 == g->target.vectorWidth && n1 == g->target.vectorWidth);
        // Go ahead and scalarize the two input vectors now.
-        // FIXME: it's ok if some or all of the values of these two vectors
+        llvm::Value **v0 = (llvm::Value **)alloca(n0 * sizeof(llvm::Value *));
-        // have undef values, so long as we don't try to access undef
+        llvm::Value **v1 = (llvm::Value **)alloca(n1 * sizeof(llvm::Value *));
-        // values with the vector indices provided to the instruction.
+
-        // Should fix lScalarizeVector so that it doesn't return false in
+        if (!lScalarizeVector(svi->getOperand(0), v0, n0) ||
-        // this case and just leaves the elements of the arrays with undef
+            !lScalarizeVector(svi->getOperand(1), v1, n1))
        // values as NULL.
        llvm::Value *v0[ISPC_MAX_NVEC], *v1[ISPC_MAX_NVEC];
        if (!lScalarizeVector(svi->getOperand(0), v0) ||
            !lScalarizeVector(svi->getOperand(1), v1))
            return false;
        llvm::ConstantAggregateZero *caz = 
            llvm::dyn_cast<llvm::ConstantAggregateZero>(svi->getOperand(2));
        if (caz != NULL) {
            for (int i = 0; i < vectorLength; ++i)
                scalarizedVector[i] = v0[0];
        }
        else {
            llvm::ConstantVector *shuffleIndicesVector = 
                llvm::dyn_cast<llvm::ConstantVector>(svi->getOperand(2));
            // I think this has to be a ConstantVector.  If this ever hits,
@@ -1684,15 +1710,15 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
            // Get the integer indices for each element of the returned vector
            llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> shuffleIndices;
            shuffleIndicesVector->getVectorElements(shuffleIndices);
-        assert((int)shuffleIndices.size() == g->target.vectorWidth);
+            assert((int)shuffleIndices.size() == vectorLength);
            // And loop over the indices, setting the i'th element of the
            // result vector with the source vector element that corresponds to
            // the i'th shuffle index value.
            for (unsigned int i = 0; i < shuffleIndices.size(); ++i) {
            if (!llvm::isa<llvm::ConstantInt>(shuffleIndices[i]))
                // I'm not sure when this case would ever happen, though..
-                return false;
+                assert(llvm::isa<llvm::ConstantInt>(shuffleIndices[i]));
                int offset = (int)lGetIntValue(shuffleIndices[i]);
                assert(offset >= 0 && offset < n0+n1);
@@ -1704,7 +1730,45 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
                    // vector
                    scalarizedVector[i] = v1[offset - n0];
            }
-        FATAL("the above code is untested so far; check now that it's actually running");
+        }
        return true;
    }
    llvm::LoadInst *li = llvm::dyn_cast<llvm::LoadInst>(vec);
    if (li != NULL) {
        llvm::Value *baseAddr = li->getOperand(0);
        llvm::Value *baseInt = new llvm::PtrToIntInst(baseAddr, LLVMTypes::Int64Type,
                                                      "base2int", li);
        lCopyMetadata(baseInt, li);
        LLVM_TYPE_CONST llvm::PointerType *ptrType = 
            llvm::dyn_cast<llvm::PointerType>(baseAddr->getType());
        assert(ptrType != NULL);
        LLVM_TYPE_CONST llvm::VectorType *vecType = 
            llvm::dyn_cast<llvm::VectorType>(ptrType->getElementType());
        assert(vecType != NULL);
        LLVM_TYPE_CONST llvm::Type *elementType = vecType->getElementType();
        uint64_t elementSize;
        bool sizeKnown = lSizeOfIfKnown(elementType, &elementSize);
        assert(sizeKnown == true);
        LLVM_TYPE_CONST llvm::Type *eltPtrType = llvm::PointerType::get(elementType, 0);
        for (int i = 0; i < vectorLength; ++i) {
            llvm::Value *intPtrOffset = 
                llvm::BinaryOperator::Create(llvm::Instruction::Add, baseInt,
                                             LLVMInt64(i * elementSize), "baseoffset",
                                             li);
            lCopyMetadata(intPtrOffset, li);
            llvm::Value *scalarLoadPtr = 
                new llvm::IntToPtrInst(intPtrOffset, eltPtrType, "int2ptr", li);
            lCopyMetadata(scalarLoadPtr, li);
            llvm::Instruction *scalarLoad = 
                new llvm::LoadInst(scalarLoadPtr, "loadelt", li);
            lCopyMetadata(scalarLoad, li);
            scalarizedVector[i] = scalarLoad;
        }
        return true;
    }
@@ -2116,11 +2180,18 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
        if (ce && ce->getOpcode() == llvm::Instruction::BitCast)
            base = ce->getOperand(0);
-        // Try to out the offsets; the i'th element of the offsetElements
+        // Try to find out the offsets; the i'th element of the
-        // array should be an i32 with the value of the offset for the i'th
+        // offsetElements array should be an i32 with the value of the
-        // vector lane.  This may fail; if so, just give up.
+        // offset for the i'th vector lane.  This may fail; if so, just
        // give up.
        llvm::Value *vecValue = callInst->getArgOperand(1);
        LLVM_TYPE_CONST llvm::VectorType *vt = 
            llvm::dyn_cast<llvm::VectorType>(vecValue->getType());
        assert(vt != NULL);
        int vecLength = vt->getNumElements();
        assert(vecLength == g->target.vectorWidth);
        llvm::Value *offsetElements[ISPC_MAX_NVEC];
-        if (!lScalarizeVector(callInst->getArgOperand(1), offsetElements))
+        if (!lScalarizeVector(vecValue, offsetElements, vecLength))
            continue;
        llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 2 : 3);
@@ -2497,7 +2568,7 @@ llvm::RegisterPass<MakeInternalFuncsStaticPass>
 bool
 MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
    const char *names[] = {
-        "__do_print",
+        "__do_print", "__fast_masked_vload", "__num_cores",
        "__gather_base_offsets_i8", "__gather_base_offsets_i16",
        "__gather_base_offsets_i32", "__gather_base_offsets_i64",
        "__gather_elt_8", "__gather_elt_16", 
--- a/parse.yy
+++ b/parse.yy
@@ -165,7 +165,7 @@ static const char *lParamListTokens[] = {
 %token TOKEN_CBREAK TOKEN_CCONTINUE TOKEN_CRETURN TOKEN_SYNC TOKEN_PRINT
 %type <expr> primary_expression postfix_expression
-%type <expr> unary_expression cast_expression
+%type <expr> unary_expression cast_expression launch_expression
 %type <expr> multiplicative_expression additive_expression shift_expression
 %type <expr> relational_expression equality_expression and_expression
 %type <expr> exclusive_or_expression inclusive_or_expression
@@ -177,6 +177,7 @@ static const char *lParamListTokens[] = {
 %type <stmt> statement labeled_statement compound_statement for_init_statement
 %type <stmt> expression_statement selection_statement iteration_statement
 %type <stmt> jump_statement statement_list declaration_statement print_statement
 %type <stmt> sync_statement
 %type <declaration> declaration parameter_declaration
 %type <declarators> init_declarator_list 
@@ -221,7 +222,7 @@ primary_expression
        else {
            std::vector<Symbol *> *funs = m->symbolTable->LookupFunction(name);
            if (funs)
-                $$ = new FunctionSymbolExpr(funs, @1);
+                $$ = new FunctionSymbolExpr(name, funs, @1);
        }
        if ($$ == NULL) {
            std::vector<std::string> alternates = 
@@ -256,18 +257,32 @@ primary_expression
    | '(' expression ')' { $$ = $2; }
    ;
 /* Task-launch syntax.  Four forms are accepted:
        launch < f(args) >        launch < f() >
        launch[expr] < f(args) >  launch[expr] < f() >
    Every form builds a FunctionCallExpr whose fourth constructor argument
    is true (the ordinary call rules in postfix_expression do not pass it)
    and whose fifth argument is an expression: the bracketed expression
    when one is given, otherwise a uniform int32 constant 1.
    NOTE(review): the fifth argument is presumably the number of task
    instances to launch -- confirm against FunctionCallExpr. */
 launch_expression
    : TOKEN_LAUNCH '<' postfix_expression '(' argument_expression_list ')' '>'
      { 
          // No bracketed expression given: pass a constant 1 instead.
          ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @3);
          $$ = new FunctionCallExpr($3, $5, @3, true, oneExpr);
      }
    | TOKEN_LAUNCH '<' postfix_expression '(' ')' '>'
      {
          // Same as above, with an empty argument list.
          ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @3);
          $$ = new FunctionCallExpr($3, new ExprList(@3), @3, true, oneExpr);
       }
    | TOKEN_LAUNCH '[' expression ']' '<' postfix_expression '(' argument_expression_list ')' '>'
      // $3 is the bracketed expression, $6 the callee, $8 the arguments.
      { $$ = new FunctionCallExpr($6, $8, @6, true, $3); }
    | TOKEN_LAUNCH '[' expression ']' '<' postfix_expression '(' ')' '>'
      // Bracketed expression with an empty argument list.
      { $$ = new FunctionCallExpr($6, new ExprList(@6), @6, true, $3); }
    ;
 postfix_expression
    : primary_expression
    | postfix_expression '[' expression ']'
      { $$ = new IndexExpr($1, $3, @1); }
    | postfix_expression '(' ')'
-      { $$ = new FunctionCallExpr($1, new ExprList(@1), @1, false); }
+      { $$ = new FunctionCallExpr($1, new ExprList(@1), @1); }
    | postfix_expression '(' argument_expression_list ')'
-      { $$ = new FunctionCallExpr($1, $3, @1, false); }
+      { $$ = new FunctionCallExpr($1, $3, @1); }
-    | TOKEN_LAUNCH '<' postfix_expression '(' argument_expression_list ')' '>'
+    | launch_expression
      { $$ = new FunctionCallExpr($3, $5, @3, true); }
    | TOKEN_LAUNCH '<' postfix_expression '(' ')' '>'
      { $$ = new FunctionCallExpr($3, new ExprList(@3), @3, true); }
    | postfix_expression '.' TOKEN_IDENTIFIER
      { $$ = MemberExpr::create($1, yytext, @1, @3); }
 /*    | postfix_expression TOKEN_PTR_OP TOKEN_IDENTIFIER
@@ -436,8 +451,6 @@ assignment_expression
 expression
    : assignment_expression
    | TOKEN_SYNC 
      { $$ = new SyncExpr(@1); }
    | expression ',' assignment_expression
      { $$ = new BinaryExpr(BinaryExpr::Comma, $1, $3, @2); }
    ;
@@ -928,9 +941,13 @@ parameter_list
            builtinTokens.push_back(*token);
            ++token;
        }
        if (strlen(yytext) == 0)
            Error(@1, "Syntax error--premature end of file.");
        else {
            std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
            std::string alts = lGetAlternates(alternates);
            Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
        }
        $$ = NULL;
    }
    ;
@@ -1019,6 +1036,7 @@ statement
    | jump_statement
    | declaration_statement
    | print_statement
    | sync_statement
    | error
    {
        std::vector<std::string> builtinTokens;
@@ -1027,9 +1045,13 @@ statement
            builtinTokens.push_back(*token);
            ++token;
        }
        if (strlen(yytext) == 0)
            Error(@1, "Syntax error--premature end of file.");
        else {
            std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
            std::string alts = lGetAlternates(alternates);
            Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
        }
        $$ = NULL;
    }
    ;
@@ -1155,6 +1177,11 @@ jump_statement
      { $$ = new ReturnStmt($2, true, @1); }
    ;
 /* A bare `sync` token is a statement of its own: it is wrapped as an
    ExprStmt around a SyncExpr, both positioned at the token's location. */
 sync_statement
    : TOKEN_SYNC 
      { $$ = new ExprStmt(new SyncExpr(@1), @1); }
    ;
 print_statement
    : TOKEN_PRINT '(' string_constant ')'
      {
@@ -1177,10 +1204,14 @@ translation_unit
            builtinTokens.push_back(*token);
            ++token;
        }
        if (strlen(yytext) == 0)
            Error(@1, "Syntax error--premature end of file.");
        else {
            std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
            std::string alts = lGetAlternates(alternates);
            Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
        }
    }
    ;
 external_declaration
@@ -1266,6 +1297,12 @@ static void lAddThreadIndexCountToSymbolTable(SourcePos pos) {
    Symbol *threadCountSym = new Symbol("threadCount", pos, AtomicType::UniformConstUInt32);
    m->symbolTable->AddVariable(threadCountSym);
    Symbol *taskIndexSym = new Symbol("taskIndex", pos, AtomicType::UniformConstUInt32);
    m->symbolTable->AddVariable(taskIndexSym);
    Symbol *taskCountSym = new Symbol("taskCount", pos, AtomicType::UniformConstUInt32);
    m->symbolTable->AddVariable(taskCountSym);
 }
--- a/run_tests.py
+++ b/run_tests.py
@@ -0,0 +1,218 @@
 #!/usr/bin/python
 # test-running driver for ispc
 # TODO: windows support (mostly should be calling CL.exe rather than gcc
 #   for static linking?)
 from optparse import OptionParser
 import multiprocessing
 from ctypes import c_int
 import os
 import sys
 import glob
 import re
 import signal
 import random
 import string
 import mutex
 import subprocess
 import platform
 # Command-line interface of the test driver.
 parser = OptionParser()
 parser.add_option("-r", "--random-shuffle", action="store_true", dest="random",
                   default=False, help="Randomly order tests")
 parser.add_option("-s", "--static-exe", action="store_true", dest="static_exe",
                   default=False,
                   help="Create and run a regular executable for each test (rather than using the LLVM JIT).")
 parser.add_option('-t', '--target', dest='target', default="sse4",
                   help='Set compilation target (sse2, sse4, sse4x2, avx, avx-x2)')
 parser.add_option('-a', '--arch', dest='arch', default="x86-64",
                   help='Set architecture (x86, x86-64)')
 parser.add_option('-o', '--no-opt', action="store_true", dest='no_opt',
                   default=False, help='Disable optimization')
 (options, args) = parser.parse_args()

 # Positional arguments name the tests to run; when there are none, every
 # test found in tests/ and failing_tests/ is run.
 files = args if len(args) != 0 else \
    (glob.glob("tests/*ispc") + glob.glob("failing_tests/*ispc"))

 # Optionally run the tests in a random order.
 if options.random:
    random.seed()
    random.shuffle(files)
 # counter
 total_tests = 0
 finished_tests_counter = multiprocessing.Value(c_int)
 # We'd like to use the Lock class from the multiprocessing package to
 # serialize accesses to finished_tests_counter.  Unfortunately, the version of
 # python that ships with OSX 10.5 has this bug:
 # http://bugs.python.org/issue5261.  Therefore, we use the (deprecated but
 # still available) mutex class.
 #finished_tests_counter_lock = multiprocessing.Lock()
 finished_tests_mutex = mutex.mutex()
 # utility routine to print an update on the number of tests that have been
 # finished.  Should be called with the mutex (or lock) held..
 def update_progress(fn):
    """Count one more finished test and redraw the one-line progress display.

    fn is the name of the test that just finished.  This runs as the
    callback handed to finished_tests_mutex.lock(), i.e. with the mutex
    held, and releases the mutex itself before returning.
    """
    finished_tests_counter.value += 1
    status = " Done %d / %d [%s]" % (finished_tests_counter.value, total_tests, fn)
    # Trailing spaces overwrite detritus left over from a longer previous
    # line; the carriage return puts the cursor back at the line start.
    sys.stdout.write(status + ' ' * 30 + '\r')
    sys.stdout.flush()
    finished_tests_mutex.unlock()
 fnull = open(os.devnull, 'w')
 # run the commands in cmd_list
 def run_cmds(cmd_list, filename, expect_failure):
    """Run the shell commands in cmd_list in order, stopping at the first failure.

    cmd_list       -- command strings, each executed through the shell.
    filename       -- name of the test, used only in the report message.
    expect_failure -- True if the test is expected to fail; in that case
                      the commands' stdout and stderr are sent to fnull.

    Returns True if the outcome was a surprise (an expected failure that
    passed, or an expected pass that failed) and False otherwise.  An
    empty cmd_list counts as "nothing failed".
    """
    # Start from "no failure" so that an empty command list is handled
    # rather than leaving 'failed' unbound.
    failed = False
    for cmd in cmd_list:
        if expect_failure:
            # Failure is the expected outcome; don't clutter the terminal
            # with the output that comes with it.
            status = subprocess.call(cmd, shell = True, stdout = fnull, stderr = fnull)
        else:
            status = os.system(cmd)
        if status != 0:
            failed = True
            break

    surprise = (bool(expect_failure) != failed)
    if surprise:
        # Trailing spaces overwrite what is left of the progress line.
        # (Single-argument parenthesized print; same output as the print
        # statement.)
        print("Test %s %s                 " % \
            (filename, "unexpectedly passed" if expect_failure else "failed"))
    return surprise
 # pull tests to run from the given queue and run them.  Multiple copies of
 # this function will be running in parallel across all of the CPU cores of
 # the system.
 def run_tasks_from_queue(queue):
    """Worker loop: pull test filenames from queue and run them.

    Multiple copies of this function run in parallel, one per worker
    process.  The loop ends when the string 'STOP' is pulled from the
    queue; the process then exits with the number of surprising test
    results it saw as its exit status.

    NOTE(review): process exit statuses are typically limited to 8 bits,
    so a worker that sees 256 or more surprises would report a wrapped
    count -- confirm whether that matters to the caller.
    """
    # Maps the start of a test function's declaration to the TEST_SIG value
    # that test_static.cpp is compiled with.
    sig2def = { "f_v(" : 0, "f_f(" : 1, "f_fu(" : 2, "f_fi(" : 3,
                "f_du(" : 4, "f_duf(" : 5, "f_di(" : 6 }

    error_count = 0
    while True:
        filename = queue.get()
        if filename == 'STOP':
            sys.exit(error_count)

        # do we expect this test to fail?
        should_fail = (filename.find("failing_") != -1)

        if options.static_exe:
            # if the user wants us to build a static executable to run for
            # this test, we need to figure out the signature of the test
            # function that this test has.
            match = -1
            with open(filename, 'r') as test_file:
                for line in test_file:
                    # look for lines with 'export'...
                    if line.find("export") == -1:
                        continue
                    # one of them should have a function with one of the
                    # declarations in sig2def
                    for pattern, ident in sig2def.items():
                        if line.find(pattern) != -1:
                            match = ident
                            break

            if match == -1:
                print("Fatal error: unable to find function signature in test %s" % filename)
                error_count += 1
            else:
                obj_name = "%s.o" % filename
                exe_name = "%s.run" % filename
                ispc_cmd = "ispc --woff %s -o %s --arch=%s --target=%s" % \
                    (filename, obj_name, options.arch, options.target)
                if options.no_opt:
                    ispc_cmd += " -O0"
                if options.arch == 'x86':
                    gcc_arch = '-m32'
                else:
                    gcc_arch = '-m64'
                gcc_cmd = "g++ %s test_static.cpp -DTEST_SIG=%d %s -o %s" % \
                    (gcc_arch, match, obj_name, exe_name)
                if platform.system() == 'Darwin':
                    gcc_cmd += ' -Wl,-no_pie'
                if should_fail:
                    gcc_cmd += " -DEXPECT_FAILURE"

                # compile the ispc code, make the executable, and run it...
                error_count += run_cmds([ispc_cmd, gcc_cmd, exe_name], filename, should_fail)

                # clean up after running the test.  Each file is removed
                # independently so that a missing executable (e.g. when
                # the g++ step failed) doesn't leave the object file behind.
                for leftover in (exe_name, obj_name):
                    try:
                        os.unlink(leftover)
                    except OSError:
                        pass
        else:
            # otherwise we'll use ispc_test + the LLVM JIT to run the test
            bitcode_file = "%s.bc" % filename
            compile_cmd = "ispc --woff --emit-llvm %s --target=%s -o %s" % \
                (filename, options.target, bitcode_file)
            if options.no_opt:
                compile_cmd += " -O0"
            test_cmd = "ispc_test %s" % bitcode_file
            error_count += run_cmds([compile_cmd, test_cmd], filename, should_fail)
            try:
                os.unlink(bitcode_file)
            except OSError:
                # nothing to remove if the compile step produced no output
                pass

        # If not for http://bugs.python.org/issue5261 on OSX, we'd like to do this:
        #with finished_tests_counter_lock:
            #update_progress(filename)
        # but instead we do this...
        finished_tests_mutex.lock(update_progress, filename)
 task_threads = []
 def sigint(signum, frame):
    """SIGINT handler: terminate every process in task_threads, then exit with status 1."""
    for worker in task_threads:
        worker.terminate()
    sys.exit(1)
 if __name__ == '__main__':
    nthreads = multiprocessing.cpu_count()
    total_tests = len(files)
    print("Found %d CPUs. Running %d tests." % (nthreads, total_tests))

    # Fill the work queue: every test filename first, followed by one
    # 'STOP' marker per worker so that each worker sees exactly one.
    q = multiprocessing.Queue()
    for test_name in files:
        q.put(test_name)
    for _ in range(nthreads):
        q.put('STOP')

    # Install the SIGINT handler so that an interrupt terminates all of
    # the worker processes.
    signal.signal(signal.SIGINT, sigint)

    # Start one worker process per CPU.
    for _ in range(nthreads):
        worker = multiprocessing.Process(target=run_tasks_from_queue, args=(q,))
        task_threads.append(worker)
        worker.start()

    # Wait for all of the workers; each one's exit code is the number of
    # surprising results it saw, so the sum is 0 when everything went as
    # expected.
    error_count = 0
    for worker in task_threads:
        worker.join()
        error_count += worker.exitcode

    print("")
    if error_count > 0:
        print("%d / %d tests FAILED!" % (error_count, total_tests))
    sys.exit(error_count)
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -369,7 +369,7 @@ static inline uniform float reduce_min(float v) {
 static inline uniform float reduce_max(float v) {
    // For the lanes where the mask is off, replace the given value with
    // negative infinity, so that it doesn't affect the result.
-    const uniform int iflt_neg_max = 0xff800000; // -infinity
+    const int iflt_neg_max = 0xff800000; // -infinity
    // Must use __floatbits_varying_int32, not floatbits(), since with the
    // latter the current mask enters into the returned result...
    return __reduce_max_float(__mask ? v : __floatbits_varying_int32(iflt_neg_max));
@@ -427,7 +427,7 @@ static inline uniform double reduce_min(double v) {
 }
 static inline uniform double reduce_max(double v) {
-    const uniform int64 iflt_neg_max = 0xfff0000000000000; // -infinity
+    const int64 iflt_neg_max = 0xfff0000000000000; // -infinity
    // Must use __doublebits_varying_int64, not doublebits(), since with the
    // latter the current mask enters into the returned result...
    return __reduce_max_double(__mask ? v : __doublebits_varying_int64(iflt_neg_max));
@@ -471,21 +471,21 @@ static inline uniform unsigned int64 reduce_max(unsigned int64 v) {
    return __reduce_max_uint64(__mask ? v : 0);
 }
-#define REDUCE_EQUAL(TYPE, FUNCTYPE)                               \
+#define REDUCE_EQUAL(TYPE, FUNCTYPE, MASKTYPE)                     \
 static inline uniform bool reduce_equal(TYPE v) {                  \
    uniform TYPE unusedValue;                                      \
-    return __reduce_equal_##FUNCTYPE(v, unusedValue, (int32)__mask); \
+    return __reduce_equal_##FUNCTYPE(v, unusedValue, (MASKTYPE)__mask); \
 }                                                                  \
 static inline uniform bool reduce_equal(TYPE v, reference uniform TYPE value) { \
-    return __reduce_equal_##FUNCTYPE(v, value, (int32)__mask);       \
+    return __reduce_equal_##FUNCTYPE(v, value, (MASKTYPE)__mask);       \
 }
-REDUCE_EQUAL(int32, int32)
+REDUCE_EQUAL(int32, int32, int32)
-REDUCE_EQUAL(unsigned int32, int32)
+REDUCE_EQUAL(unsigned int32, int32, unsigned int32)
-REDUCE_EQUAL(float, float)
+REDUCE_EQUAL(float, float, int32)
-REDUCE_EQUAL(int64, int64)
+REDUCE_EQUAL(int64, int64, int32)
-REDUCE_EQUAL(unsigned int64, int64)
+REDUCE_EQUAL(unsigned int64, int64, unsigned int32)
-REDUCE_EQUAL(double, double)
+REDUCE_EQUAL(double, double, int32)
 static int32 exclusive_scan_add(int32 v) {
    return __exclusive_scan_add_i32(v, (int32)__mask);
@@ -549,23 +549,32 @@ static unsigned int64 exclusive_scan_or(unsigned int64 v) {
 static inline uniform int 
 packed_load_active(uniform unsigned int a[], uniform int start,
                   reference unsigned int vals) {
-    return __packed_load_active(a, start, vals, __mask);
+    return __packed_load_active(a, (unsigned int)start, vals,
                                (unsigned int32)__mask);
 }
 static inline uniform int
 packed_store_active(uniform unsigned int a[], uniform int start,
                    unsigned int vals) {
-    return __packed_store_active(a, start, vals, __mask);
+    return __packed_store_active(a, (unsigned int)start, vals,
                                 (unsigned int32)__mask);
 }
 static inline uniform int packed_load_active(uniform int a[], uniform int start,
                                             reference int vals) {
-    return __packed_load_active(a, start, vals, __mask);
+    return __packed_load_active(a, start, vals, (int32)__mask);
 }
 static inline uniform int packed_store_active(uniform int a[], uniform int start,
                                              int vals) {
-    return __packed_store_active(a, start, vals, __mask);
+    return __packed_store_active(a, start, vals, (int32)__mask);
 }
 ///////////////////////////////////////////////////////////////////////////
 // System information
 // Thin wrapper that returns the result of the __num_cores() builtin.
 // NOTE(review): the return type carries no 'uniform' qualifier; confirm
 // whether a varying result is intended here.
 static inline int num_cores() {
    return __num_cores();
 }
 ///////////////////////////////////////////////////////////////////////////
@@ -575,75 +584,108 @@ static inline void memory_barrier() {
    __memory_barrier();
 }
-#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB)                                 \
+#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE)                        \
 static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \
    memory_barrier();                                                   \
-    TA ret = __atomic_##OPB##_##TB##_global(ref, value, __mask);  \
+    TA ret = __atomic_##OPB##_##TB##_global(ref, value, (MASKTYPE)__mask); \
    memory_barrier();                                                   \
    return ret;                                                         \
 }                                                                       \
 static inline uniform TA atomic_##OPA##_global(uniform reference TA ref, \
                                               uniform TA value) {      \
    memory_barrier();                                                   \
    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ref, value, (MASKTYPE)__mask); \
    memory_barrier();                                                   \
    return ret;                                                         \
 }
-DEFINE_ATOMIC_OP(int32,int32,add,add)
+#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB, MASKTYPE)                \
-DEFINE_ATOMIC_OP(int32,int32,subtract,sub)
+static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \
-DEFINE_ATOMIC_OP(int32,int32,min,min)
+    uniform TA oneval = reduce_##OPA(value);                            \
-DEFINE_ATOMIC_OP(int32,int32,max,max)
+    TA ret;                                                             \
-DEFINE_ATOMIC_OP(int32,int32,and,and)
+    if (lanemask() != 0) {                                              \
-DEFINE_ATOMIC_OP(int32,int32,or,or)
+        memory_barrier();                                               \
-DEFINE_ATOMIC_OP(int32,int32,xor,xor)
+        ret = __atomic_##OPB##_uniform_##TB##_global(ref, oneval, (MASKTYPE)__mask); \
-DEFINE_ATOMIC_OP(int32,int32,swap,swap)
+        memory_barrier();                                               \
    }                                                                   \
    return ret;                                                         \
 }                                                                       \
 static inline uniform TA atomic_##OPA##_global(uniform reference TA ref, \
                                               uniform TA value) {      \
    memory_barrier();                                                   \
    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ref, value, (MASKTYPE)__mask); \
    memory_barrier();                                                   \
    return ret;                                                         \
 }
 DEFINE_ATOMIC_OP(int32,int32,add,add,int32)
 DEFINE_ATOMIC_OP(int32,int32,subtract,sub,int32)
 DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,int32)
 DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,int32)
 DEFINE_ATOMIC_OP(int32,int32,and,and,int32)
 DEFINE_ATOMIC_OP(int32,int32,or,or,int32)
 DEFINE_ATOMIC_OP(int32,int32,xor,xor,int32)
 DEFINE_ATOMIC_OP(int32,int32,swap,swap,int32)
 // For everything but atomic min and max, we can use the same
 // implementations for unsigned as for signed.
-DEFINE_ATOMIC_OP(unsigned int32,int32,add,add)
+DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,unsigned int32)
-DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub)
+DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,unsigned int32)
-DEFINE_ATOMIC_OP(unsigned int32,uint32,min,umin)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,unsigned int32)
-DEFINE_ATOMIC_OP(unsigned int32,uint32,max,umax)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,unsigned int32)
-DEFINE_ATOMIC_OP(unsigned int32,int32,and,and)
+DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,unsigned int32)
-DEFINE_ATOMIC_OP(unsigned int32,int32,or,or)
+DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,unsigned int32)
-DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor)
+DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,unsigned int32)
-DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap)
+DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,unsigned int32)
-DEFINE_ATOMIC_OP(float,float,swap,swap)
+DEFINE_ATOMIC_OP(float,float,swap,swap,int32)
-DEFINE_ATOMIC_OP(int64,int64,add,add)
+DEFINE_ATOMIC_OP(int64,int64,add,add,int32)
-DEFINE_ATOMIC_OP(int64,int64,subtract,sub)
+DEFINE_ATOMIC_OP(int64,int64,subtract,sub,int32)
-DEFINE_ATOMIC_OP(int64,int64,min,min)
+DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,int32)
-DEFINE_ATOMIC_OP(int64,int64,max,max)
+DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,int32)
-DEFINE_ATOMIC_OP(int64,int64,and,and)
+DEFINE_ATOMIC_OP(int64,int64,and,and,int32)
-DEFINE_ATOMIC_OP(int64,int64,or,or)
+DEFINE_ATOMIC_OP(int64,int64,or,or,int32)
-DEFINE_ATOMIC_OP(int64,int64,xor,xor)
+DEFINE_ATOMIC_OP(int64,int64,xor,xor,int32)
-DEFINE_ATOMIC_OP(int64,int64,swap,swap)
+DEFINE_ATOMIC_OP(int64,int64,swap,swap,int32)
 // For everything but atomic min and max, we can use the same
 // implementations for unsigned as for signed.
-DEFINE_ATOMIC_OP(unsigned int64,int64,add,add)
+DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,unsigned int32)
-DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub)
+DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,unsigned int32)
-DEFINE_ATOMIC_OP(unsigned int64,uint64,min,umin)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,unsigned int32)
-DEFINE_ATOMIC_OP(unsigned int64,uint64,max,umax)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,unsigned int32)
-DEFINE_ATOMIC_OP(unsigned int64,int64,and,and)
+DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,unsigned int32)
-DEFINE_ATOMIC_OP(unsigned int64,int64,or,or)
+DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,unsigned int32)
-DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor)
+DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,unsigned int32)
-DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap)
+DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,unsigned int32)
-DEFINE_ATOMIC_OP(double,double,swap,swap)
+DEFINE_ATOMIC_OP(double,double,swap,swap,int32)
 #undef DEFINE_ATOMIC_OP
-#define ATOMIC_DECL_CMPXCHG(TA, TB)                                        \
+#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE)                           \
 static inline TA atomic_compare_exchange_global(                           \
         uniform reference TA ref, TA oldval, TA newval) {                 \
    memory_barrier();                                                      \
-    TA ret = __atomic_compare_exchange_##TB##_global(ref, oldval, newval, __mask); \
+    TA ret = __atomic_compare_exchange_##TB##_global(ref, oldval, newval, (MASKTYPE)__mask); \
    memory_barrier();                                                      \
    return ret;                                                            \
 } \
 static inline uniform TA atomic_compare_exchange_global(               \
         uniform reference TA ref, uniform TA oldval, uniform TA newval) {                 \
    memory_barrier();                                                   \
    uniform TA ret = __atomic_compare_exchange_uniform_##TB##_global(ref, oldval, newval, (MASKTYPE)__mask); \
    memory_barrier();                                                   \
    return ret;                                                         \
 }
-ATOMIC_DECL_CMPXCHG(int32, int32)
+ATOMIC_DECL_CMPXCHG(int32, int32, int32)
-ATOMIC_DECL_CMPXCHG(unsigned int32, int32)
+ATOMIC_DECL_CMPXCHG(unsigned int32, int32, unsigned int32)
-ATOMIC_DECL_CMPXCHG(float, float)
+ATOMIC_DECL_CMPXCHG(float, float, int32)
-ATOMIC_DECL_CMPXCHG(int64, int64)
+ATOMIC_DECL_CMPXCHG(int64, int64, int32)
-ATOMIC_DECL_CMPXCHG(unsigned int64, int64)
+ATOMIC_DECL_CMPXCHG(unsigned int64, int64, unsigned int32)
-ATOMIC_DECL_CMPXCHG(double, double)
+ATOMIC_DECL_CMPXCHG(double, double, int32)
 #undef ATOMIC_DECL_CMPXCHG
@@ -2850,6 +2892,12 @@ static inline void seed_rng(reference uniform RNGState state, uniform unsigned i
    seed = __seed4(state, 0, seed);
    if (programCount == 8)
        __seed4(state, 4, seed ^ 0xbeeff00d);
    if (programCount == 16) {
        __seed4(state, 4,  seed ^ 0xbeeff00d);
        __seed4(state, 8,  ((seed & 0xffff) << 16) | (seed >> 16));
        __seed4(state, 12, (((seed & 0xff) << 24) | ((seed & 0xff00)  << 8) |
                            ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24));
    }
 }
 static inline void fastmath() {
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -107,6 +107,12 @@ ExprStmt::Print(int indent) const {
 }
 int
 ExprStmt::EstimateCost() const {
    // A statement that wraps no expression contributes nothing to the
    // estimate; otherwise the cost is exactly that of the expression.
    if (!expr)
        return 0;
    return expr->EstimateCost();
 }
 ///////////////////////////////////////////////////////////////////////////
 // DeclStmt
@@ -399,12 +405,25 @@ DeclStmt::Print(int indent) const {
 }
 int
 DeclStmt::EstimateCost() const {
    int cost = 0;
    for (unsigned int i = 0; i < declaration->declarators.size(); ++i)
        if (declaration->declarators[i]->initExpr)
            cost += declaration->declarators[i]->initExpr->EstimateCost();
    return cost;
 }
 ///////////////////////////////////////////////////////////////////////////
 // IfStmt
-IfStmt::IfStmt(Expr *t, Stmt *ts, Stmt *fs, bool doUnif, SourcePos p) 
+IfStmt::IfStmt(Expr *t, Stmt *ts, Stmt *fs, bool checkCoherence, SourcePos p) 
    : Stmt(p), test(t), trueStmts(ts), falseStmts(fs), 
-      doCoherentCheck(doUnif && !g->opt.disableCoherentControlFlow) {
+      doAllCheck(checkCoherence &&
                 !g->opt.disableCoherentControlFlow),
      doAnyCheck(test->GetType() != NULL &&
                 test->GetType()->IsVaryingType()) {
 }
@@ -436,23 +455,26 @@ IfStmt::EmitCode(FunctionEmitContext *ctx) const {
    ctx->SetDebugPos(pos);
    bool isUniform = testType->IsUniformType();
    llvm::Value *testValue = test->GetValue(ctx);
    if (testValue == NULL)
        return;
    if (isUniform) {
        ctx->StartUniformIf(ctx->GetMask());
-        if (doCoherentCheck)
+        if (doAllCheck)
-            Warning(test->pos, "Uniform condition supplied to cif statement.");
+            Warning(test->pos, "Uniform condition supplied to \"cif\" statement.");
        // 'If' statements with uniform conditions are relatively
        // straightforward.  We evaluate the condition and then jump to
        // either the 'then' or 'else' clause depending on its value.
        llvm::Value *vtest = test->GetValue(ctx);
        if (vtest != NULL) {
        llvm::BasicBlock *bthen = ctx->CreateBasicBlock("if_then");
        llvm::BasicBlock *belse = ctx->CreateBasicBlock("if_else");
        llvm::BasicBlock *bexit = ctx->CreateBasicBlock("if_exit");
        // Jump to the appropriate basic block based on the value of
        // the 'if' test
-            ctx->BranchInst(bthen, belse, vtest);
+        ctx->BranchInst(bthen, belse, testValue);
        // Emit code for the 'true' case
        ctx->SetCurrentBasicBlock(bthen);
@@ -469,29 +491,10 @@ IfStmt::EmitCode(FunctionEmitContext *ctx) const {
        // Set the active basic block to the newly-created exit block
        // so that subsequent emitted code starts there.
        ctx->SetCurrentBasicBlock(bexit);
        }
        ctx->EndIf();
    }
-    else {
+    else
-        // Code for 'If' statemnts with 'varying' conditions can be
+        emitVaryingIf(ctx, testValue);
        // generated in two ways; one takes some care to see if all of the
        // active program instances want to follow only the 'true' or
        // 'false' cases, and the other always runs both cases but sets the
        // mask appropriately.  The first case is handled by the
        // IfStmt::emitCoherentTests() call, and the second is handled by
        // IfStmt::emitMaskedTrueAndFalse().
        llvm::Value *testValue = test->GetValue(ctx);
        if (testValue) {
            if (doCoherentCheck) 
                emitCoherentTests(ctx, testValue);
            else {
                llvm::Value *oldMask = ctx->GetMask();
                ctx->StartVaryingIf(oldMask);
                emitMaskedTrueAndFalse(ctx, oldMask, testValue);
                ctx->EndIf();
            }
        }
    }
 }
@@ -535,9 +538,17 @@ Stmt *IfStmt::TypeCheck() {
 }
 int
 IfStmt::EstimateCost() const {
    // The estimate is the sum of the test expression and both arms; any
    // part that is absent counts as zero.
    int total = 0;
    if (test)
        total += test->EstimateCost();
    if (trueStmts)
        total += trueStmts->EstimateCost();
    if (falseStmts)
        total += falseStmts->EstimateCost();
    return total;
 }
 void
 IfStmt::Print(int indent) const {
-    printf("%*cIf Stmt %s", indent, ' ', doCoherentCheck ? "DO COHERENT CHECK" : "");
+    printf("%*cIf Stmt %s", indent, ' ', doAllCheck ? "DO ALL CHECK" : "");
    pos.Print();
    printf("\n%*cTest: ", indent+4, ' ');
    test->Print();
@@ -554,7 +565,7 @@ IfStmt::Print(int indent) const {
 /** Emit code to run both the true and false statements for the if test,
-    with the mask set appropriately before runnign each one. 
+    with the mask set appropriately before running each one. 
 */
 void
 IfStmt::emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask, 
@@ -574,11 +585,185 @@ IfStmt::emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask,
 }
 /** Similar to the Stmt variant of this function, this conservatively
    checks to see if it's safe to run the code for the given Expr even if
    the mask is 'all off'.

    The expression's concrete type is identified with a chain of
    dynamic_casts and the check recurses into its operands.  Function
    calls, non-constant or out-of-bounds indexing, and NULL expressions
    are all reported as unsafe; an expression type that is not handled
    below is a fatal error.

    @param expr  Expression to examine; NULL yields false.
    @return      true only if every sub-expression is known to be safe.
 */
 static bool
 lSafeToRunWithAllLanesOff(Expr *expr) {
    // Note that, unlike the Stmt overload, a NULL expression is treated
    // as unsafe here.
    if (expr == NULL)
        return false;
    // Composite expressions are safe exactly when all of their operands
    // are safe.
    UnaryExpr *ue;
    if ((ue = dynamic_cast<UnaryExpr *>(expr)) != NULL)
        return lSafeToRunWithAllLanesOff(ue->expr);
    BinaryExpr *be;
    if ((be = dynamic_cast<BinaryExpr *>(expr)) != NULL)
        return (lSafeToRunWithAllLanesOff(be->arg0) &&
                lSafeToRunWithAllLanesOff(be->arg1));
    AssignExpr *ae;
    if ((ae = dynamic_cast<AssignExpr *>(expr)) != NULL)
        return (lSafeToRunWithAllLanesOff(ae->lvalue) &&
                lSafeToRunWithAllLanesOff(ae->rvalue));
    SelectExpr *se;
    if ((se = dynamic_cast<SelectExpr *>(expr)) != NULL)
        return (lSafeToRunWithAllLanesOff(se->test) && 
                lSafeToRunWithAllLanesOff(se->expr1) && 
                lSafeToRunWithAllLanesOff(se->expr2));
    ExprList *el;
    if ((el = dynamic_cast<ExprList *>(expr)) != NULL) {
        // One unsafe entry makes the whole list unsafe; an empty list is
        // safe.
        for (unsigned int i = 0; i < el->exprs.size(); ++i)
            if (!lSafeToRunWithAllLanesOff(el->exprs[i]))
                return false;
        return true;
    }
    FunctionCallExpr *fce;
    if ((fce = dynamic_cast<FunctionCallExpr *>(expr)) != NULL)
        // FIXME: If we could somehow determine that the function being
        // called was safe (and all of the args Exprs were safe), then it'd
        // be nice to be able to return true here.  (Consider a call to
        // e.g. floatbits() in the stdlib.)  Unfortunately for now we just
        // have to be conservative.
        return false;
    IndexExpr *ie;
    if ((ie = dynamic_cast<IndexExpr *>(expr)) != NULL) {
        // If we can determine at compile time the size of the array/vector
        // and if the indices are compile-time constants, then we may be
        // able to safely run this under a predicated if statement.
        if (ie->arrayOrVector == NULL)
            return false;
        const Type *type = ie->arrayOrVector->GetType();
        // A non-constant index (or a NULL one, for which the cast also
        // yields NULL) cannot be bounds-checked here.
        ConstExpr *ce = dynamic_cast<ConstExpr *>(ie->index);
        if (type == NULL || ce == NULL)
            return false;
        // Look through a reference to the type it refers to.
        if (dynamic_cast<const ReferenceType *>(type) != NULL)
            type = type->GetReferenceTarget();
        const SequentialType *seqType = 
            dynamic_cast<const SequentialType *>(type);
        assert(seqType != NULL);
        int nElements = seqType->GetElementCount();
        if (nElements == 0)
            // Unsized array, so we can't be sure
            return false;
        // AsInt32() fills 'indices' and returns how many values it wrote
        // (presumably one per program instance for a varying constant --
        // TODO confirm against ConstExpr).
        int32_t indices[ISPC_MAX_NVEC];
        int count = ce->AsInt32(indices);
        for (int i = 0; i < count; ++i)
            if (indices[i] < 0 || indices[i] >= nElements)
                return false;
        // All indices are in-bounds
        return true;
    }
    MemberExpr *me;
    if ((me = dynamic_cast<MemberExpr *>(expr)) != NULL)
        return lSafeToRunWithAllLanesOff(me->expr);
    // Constants have no operands to examine.
    if (dynamic_cast<ConstExpr *>(expr) != NULL)
        return true;
    TypeCastExpr *tce;
    if ((tce = dynamic_cast<TypeCastExpr *>(expr)) != NULL)
        return lSafeToRunWithAllLanesOff(tce->expr);
    ReferenceExpr *re;
    if ((re = dynamic_cast<ReferenceExpr *>(expr)) != NULL)
        return lSafeToRunWithAllLanesOff(re->expr);
    DereferenceExpr *dre;
    if ((dre = dynamic_cast<DereferenceExpr *>(expr)) != NULL)
        return lSafeToRunWithAllLanesOff(dre->expr);
    // Symbol references, function symbols and sync expressions are
    // accepted without further inspection.
    if (dynamic_cast<SymbolExpr *>(expr) != NULL ||
        dynamic_cast<FunctionSymbolExpr *>(expr) != NULL ||
        dynamic_cast<SyncExpr *>(expr) != NULL)
        return true;
    // Every Expr subclass is expected to be handled above.
    FATAL("Unknown Expr type in lSafeToRunWithAllLanesOff()");
    return false;
 }
 /** Conservative check of whether the code emitted for a statement can be
    run while every lane of the execution mask is off.  The statement's
    concrete type is identified and the question is forwarded to the
    statements and expressions it contains.  A statement type that is not
    recognized below is a fatal error.
 */
 static bool
 lSafeToRunWithAllLanesOff(Stmt *stmt) {
    // An absent statement is reported as safe.
    if (stmt == NULL)
        return true;
    if (ExprStmt *exprStmt = dynamic_cast<ExprStmt *>(stmt))
        return lSafeToRunWithAllLanesOff(exprStmt->expr);
    if (DeclStmt *declStmt = dynamic_cast<DeclStmt *>(stmt)) {
        // Every declarator's initializer must pass the Expr check (which
        // rejects a NULL initializer).
        unsigned int d = 0;
        while (d < declStmt->declaration->declarators.size()) {
            if (!lSafeToRunWithAllLanesOff(declStmt->declaration->declarators[d]->initExpr))
                return false;
            ++d;
        }
        return true;
    }
    if (IfStmt *ifStmt = dynamic_cast<IfStmt *>(stmt)) {
        // The test and both arms are checked in that order, stopping at
        // the first unsafe part.
        if (!lSafeToRunWithAllLanesOff(ifStmt->test))
            return false;
        if (!lSafeToRunWithAllLanesOff(ifStmt->trueStmts))
            return false;
        return lSafeToRunWithAllLanesOff(ifStmt->falseStmts);
    }
    if (DoStmt *doStmt = dynamic_cast<DoStmt *>(stmt)) {
        if (!lSafeToRunWithAllLanesOff(doStmt->testExpr))
            return false;
        return lSafeToRunWithAllLanesOff(doStmt->bodyStmts);
    }
    if (ForStmt *forStmt = dynamic_cast<ForStmt *>(stmt)) {
        if (!lSafeToRunWithAllLanesOff(forStmt->init))
            return false;
        if (!lSafeToRunWithAllLanesOff(forStmt->test))
            return false;
        if (!lSafeToRunWithAllLanesOff(forStmt->step))
            return false;
        return lSafeToRunWithAllLanesOff(forStmt->stmts);
    }
    // 'break' and 'continue' have nothing further to inspect.
    if (dynamic_cast<BreakStmt *>(stmt) != NULL)
        return true;
    if (dynamic_cast<ContinueStmt *>(stmt) != NULL)
        return true;
    if (ReturnStmt *returnStmt = dynamic_cast<ReturnStmt *>(stmt))
        return lSafeToRunWithAllLanesOff(returnStmt->val);
    if (StmtList *stmtList = dynamic_cast<StmtList *>(stmt)) {
        // A list is safe only if each of its statements is.
        const std::vector<Stmt *> &children = stmtList->GetStatements();
        unsigned int s = 0;
        while (s < children.size()) {
            if (!lSafeToRunWithAllLanesOff(children[s]))
                return false;
            ++s;
        }
        return true;
    }
    if (PrintStmt *printStmt = dynamic_cast<PrintStmt *>(stmt))
        return lSafeToRunWithAllLanesOff(printStmt->values);
    FATAL("Unexpected stmt type in lSafeToRunWithAllLanesOff()");
    return false;
 }
 /** Emit code for an if test that checks the mask and the test values and
    tries to be smart about jumping over code that doesn't need to be run.
 */
 void
-IfStmt::emitCoherentTests(FunctionEmitContext *ctx, llvm::Value *ltest) const {
+IfStmt::emitVaryingIf(FunctionEmitContext *ctx, llvm::Value *ltest) const {
    llvm::Value *oldMask = ctx->GetMask();
    if (oldMask == LLVMMaskAllOn) {
        // We can tell that the mask is on statically at compile time; just
@@ -587,7 +772,7 @@ IfStmt::emitCoherentTests(FunctionEmitContext *ctx, llvm::Value *ltest) const {
        emitMaskAllOn(ctx, ltest, bDone);
        ctx->SetCurrentBasicBlock(bDone);
    }
-    else {
+    else if (doAllCheck) {
        // We can't tell if the mask going into the if is all on at the
        // compile time.  Emit code to check for this and then either run
        // the code for the 'all on' or the 'mixed' case depending on the
@@ -619,6 +804,43 @@ IfStmt::emitCoherentTests(FunctionEmitContext *ctx, llvm::Value *ltest) const {
        // paths above jump to when they're done.
        ctx->SetCurrentBasicBlock(bDone);
    }
    else if (trueStmts != NULL || falseStmts != NULL) {
        // If there is nothing that is potentially unsafe to run with all
        // lanes off in the true and false statements and if the total
        // complexity of those two is relatively simple, then we'll go
        // ahead and emit straightline code that runs both sides, updating
        // the mask accordingly.  This is useful for efficiently compiling
        // things like:
        //
        // if (foo) x = 0;
        // else     ++x;
        //
        // Where the overhead of checking if any of the program instances wants
        // to run one side or the other is more than the actual computation.
        // The lSafeToRunWithAllLanesOff() checks to make sure that we don't do this
        // for potentially dangerous code like:
        //
        // if (index < count) array[index] = 0;
        //
        // where our use of blend for conditional assignments doesn't check
        // for the 'all lanes off' case.
        if (lSafeToRunWithAllLanesOff(trueStmts) &&
            lSafeToRunWithAllLanesOff(falseStmts) &&
            (((trueStmts ? trueStmts->EstimateCost() : 0) + 
              (falseStmts ? falseStmts->EstimateCost() : 0)) < 
             PREDICATE_SAFE_IF_STATEMENT_COST)) {
            ctx->StartVaryingIf(oldMask);
            emitMaskedTrueAndFalse(ctx, oldMask, ltest);
            assert(ctx->GetCurrentBasicBlock());
            ctx->EndIf();
        }
        else {
            assert(doAnyCheck);
            llvm::BasicBlock *bDone = ctx->CreateBasicBlock("if_done");
            emitMaskMixed(ctx, oldMask, ltest, bDone);
            ctx->SetCurrentBasicBlock(bDone);
        }
    }
 }
@@ -677,69 +899,50 @@ IfStmt::emitMaskAllOn(FunctionEmitContext *ctx, llvm::Value *ltest,
 }
 /** Emits code that computes (test & mask) and compares the result
    against mask, i.e. whether the test is true in every lane where the
    mask is on.
 */
 static llvm::Value *
 lTestMatchesMask(FunctionEmitContext *ctx, llvm::Value *test, llvm::Value *mask) {
    // Drop the test bits belonging to lanes that are not enabled in mask.
    llvm::Value *maskedTest = 
        ctx->BinaryOperator(llvm::Instruction::And, test, mask, "test&mask");
    // What is left equals the mask itself only if no enabled lane failed
    // the test.
    llvm::Value *allMatch = ctx->MasksAllEqual(maskedTest, mask);
    return allMatch;
 }
 /** Emit code for an 'if' test where the lane mask is known to be mixed
    on/off going into it.
 */
 void
 IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask, 
                      llvm::Value *ltest, llvm::BasicBlock *bDone) const {
-    // First, see if, for all of the lanes where the mask is on, if the
+    ctx->StartVaryingIf(oldMask);
-    // value of the test is on.  (i.e. (test&mask) == mask).  In this case,
+    llvm::BasicBlock *bNext = ctx->CreateBasicBlock("safe_if_after_true");
-    // we only need to run the 'true' case code, since the lanes where the
+    if (trueStmts != NULL) {
-    // test was false aren't supposed to be running here anyway.
+        llvm::BasicBlock *bRunTrue = ctx->CreateBasicBlock("safe_if_run_true");
-     llvm::Value *testAllEqual = lTestMatchesMask(ctx, ltest, oldMask);
+        ctx->MaskAnd(oldMask, ltest);
    llvm::BasicBlock *bTestAll = ctx->CreateBasicBlock("cif_mixed_test_all");
    llvm::BasicBlock *bTestAnyCheck = ctx->CreateBasicBlock("cif_mixed_test_any_check");
    ctx->BranchInst(bTestAll, bTestAnyCheck, testAllEqual);
-    // Emit code for the (test&mask)==mask case.  Not only do we only need
+        // Do any of the program instances want to run the 'true'
-    // to emit code for the true statements, but we don't need to modify
+        // block?  If not, jump ahead to bNext.
-    // the mask's value; it's already correct.
+        llvm::Value *maskAnyQ = ctx->Any(ctx->GetMask());
-    ctx->SetCurrentBasicBlock(bTestAll);
+        ctx->BranchInst(bRunTrue, bNext, maskAnyQ);
-    ctx->StartVaryingIf(ctx->GetMask());
+
-    lEmitIfStatements(ctx, trueStmts, "cif: all running lanes want just true stmts");
+        // Emit statements for true
        ctx->SetCurrentBasicBlock(bRunTrue);
        lEmitIfStatements(ctx, trueStmts, "if: expr mixed, true statements");
        assert(ctx->GetCurrentBasicBlock()); 
-    ctx->EndIf();
+        ctx->BranchInst(bNext);
-    ctx->BranchInst(bDone);
+        ctx->SetCurrentBasicBlock(bNext);
    }
    if (falseStmts != NULL) {
        llvm::BasicBlock *bRunFalse = ctx->CreateBasicBlock("safe_if_run_false");
        bNext = ctx->CreateBasicBlock("safe_if_after_false");
        ctx->MaskAndNot(oldMask, ltest);
-    // Next, see if the active lanes only need to run the false case--i.e. if
+        // Similarly, check to see if any of the instances want to
-    // (~test & mask) == mask.
+        // run the 'false' block...
-    ctx->SetCurrentBasicBlock(bTestAnyCheck);
+        llvm::Value *maskAnyQ = ctx->Any(ctx->GetMask());
-    llvm::Value *notTest = ctx->BinaryOperator(llvm::Instruction::Xor, LLVMMaskAllOn,
+        ctx->BranchInst(bRunFalse, bNext, maskAnyQ);
                                               ltest, "~test");
    llvm::Value *notMatchesMask = lTestMatchesMask(ctx, notTest, oldMask);
    llvm::BasicBlock *bTestAllNot = ctx->CreateBasicBlock("cif_mixed_test_none");
    llvm::BasicBlock *bTestMixed = ctx->CreateBasicBlock("cif_mixed_test_mixed");
    ctx->BranchInst(bTestAllNot, bTestMixed, notMatchesMask);
-    // Emit code for the (~test & mask) == mask case.  We only need the
+        // Emit code for false
-    // 'false' statements and again don't need to modify the value of the
+        ctx->SetCurrentBasicBlock(bRunFalse);
-    // mask.
+        lEmitIfStatements(ctx, falseStmts, "if: expr mixed, false statements");
    ctx->SetCurrentBasicBlock(bTestAllNot);
    ctx->StartVaryingIf(ctx->GetMask());
    lEmitIfStatements(ctx, falseStmts, "cif: all running lanes want just false stmts");
        assert(ctx->GetCurrentBasicBlock());
-    ctx->EndIf();
+        ctx->BranchInst(bNext);
        ctx->SetCurrentBasicBlock(bNext);
    }
    ctx->BranchInst(bDone);
-
+    ctx->SetCurrentBasicBlock(bDone);
    // It's mixed; we need to run both the true and false cases and also do
    // mask update stuff.
    ctx->SetCurrentBasicBlock(bTestMixed);
    ctx->StartVaryingIf(ctx->GetMask());
    emitMaskedTrueAndFalse(ctx, oldMask, ltest);
    ctx->EndIf();
    ctx->BranchInst(bDone);
 }
@@ -955,6 +1158,13 @@ DoStmt::TypeCheck() {
 }
 int
 DoStmt::EstimateCost() const {
    // Cost of the loop test plus the loop body, each counted once; a
    // missing part counts as zero.
    int total = 0;
    if (testExpr)
        total += testExpr->EstimateCost();
    if (bodyStmts)
        total += bodyStmts->EstimateCost();
    return total;
 }
 void
 DoStmt::Print(int indent) const {
    printf("%*cDo Stmt", indent, ' ');
@@ -1162,6 +1372,20 @@ ForStmt::TypeCheck() {
 }
 int
 ForStmt::EstimateCost() const {
    bool uniformTest = test ? test->GetType()->IsUniformType() :
        (!g->opt.disableUniformControlFlow &&
         !lHasVaryingBreakOrContinue(stmts));
    return ((init ? init->EstimateCost() : 0) +
            (test ? test->EstimateCost() : 0) +
            (step ? step->EstimateCost() : 0) +
            (stmts ? stmts->EstimateCost() : 0) +
            (uniformTest ? COST_UNIFORM_LOOP : COST_VARYING_LOOP));
 }
 void
 ForStmt::Print(int indent) const {
    printf("%*cFor Stmt", indent, ' ');
@@ -1216,6 +1440,13 @@ BreakStmt::TypeCheck() {
 }
 int
 BreakStmt::EstimateCost() const {
    // A 'break' that performs the coherence check is charged a different
    // fixed cost than a regular one.
    if (doCoherenceCheck)
        return COST_COHERENT_BREAK_CONTINE;
    return COST_REGULAR_BREAK_CONTINUE;
 }
 void
 BreakStmt::Print(int indent) const {
    printf("%*c%sBreak Stmt", indent, ' ', doCoherenceCheck ? "Coherent " : "");
@@ -1254,6 +1485,13 @@ ContinueStmt::TypeCheck() {
 }
 int
 ContinueStmt::EstimateCost() const {
    // A 'continue' that performs the coherence check is charged a
    // different fixed cost than a regular one.
    if (doCoherenceCheck)
        return COST_COHERENT_BREAK_CONTINE;
    return COST_REGULAR_BREAK_CONTINUE;
 }
 void
 ContinueStmt::Print(int indent) const {
    printf("%*c%sContinue Stmt", indent, ' ', doCoherenceCheck ? "Coherent " : "");
@@ -1300,6 +1538,12 @@ ReturnStmt::TypeCheck() {
 }
 int
 ReturnStmt::EstimateCost() const {
    // Fixed charge for the return itself, plus the cost of evaluating the
    // returned value when there is one.
    int total = COST_RETURN;
    if (val)
        total += val->EstimateCost();
    return total;
 }
 void
 ReturnStmt::Print(int indent) const {
    printf("%*c%sReturn Stmt", indent, ' ', doCoherenceCheck ? "Coherent " : "");
@@ -1345,6 +1589,16 @@ StmtList::TypeCheck() {
 }
 int
 StmtList::EstimateCost() const {
    int cost = 0;
    for (unsigned int i = 0; i < stmts.size(); ++i)
        if (stmts[i])
            cost += stmts[i]->EstimateCost();
    return cost;
 }
 void
 StmtList::Print(int indent) const {
    printf("%*cStmt List", indent, ' ');
@@ -1464,8 +1718,11 @@ PrintStmt::EmitCode(FunctionEmitContext *ctx) const {
    llvm::Value *args[5];
    std::string argTypes;
-    if (values == NULL)
+    if (values == NULL) {
-        args[4] = NULL;
+        LLVM_TYPE_CONST llvm::Type *ptrPtrType = 
            llvm::PointerType::get(LLVMTypes::VoidPointerType, 0);
        args[4] = llvm::Constant::getNullValue(ptrPtrType);
    }
    else {
        // Get the values passed to the print() statement evaluated and
        // stored in memory so that we set up the array of pointers to them
@@ -1542,3 +1799,11 @@ PrintStmt::TypeCheck() {
        values = values->TypeCheck();
    return this;
 }
 int
 PrintStmt::EstimateCost() const {
    // print() is charged like a function call, plus the cost of
    // evaluating its arguments when it has any.
    int total = COST_FUNCALL;
    if (values)
        total += values->EstimateCost();
    return total;
 }
--- a/stmt.h
+++ b/stmt.h
@@ -75,8 +75,8 @@ public:
    Stmt *Optimize();
    Stmt *TypeCheck();
    int EstimateCost() const;
 private:
    Expr *expr;
 };
@@ -92,8 +92,8 @@ public:
    Stmt *Optimize();
    Stmt *TypeCheck();
    int EstimateCost() const;
 private:
    Declaration *declaration;
 };
@@ -103,13 +103,14 @@ private:
 class IfStmt : public Stmt {
 public:
    IfStmt(Expr *testExpr, Stmt *trueStmts, Stmt *falseStmts,
-           bool doCoherentCheck, SourcePos pos);
+           bool doAllCheck, SourcePos pos);
    void EmitCode(FunctionEmitContext *ctx) const;
    void Print(int indent) const;
    Stmt *Optimize();
    Stmt *TypeCheck();
    int EstimateCost() const;
    // @todo these are only public for lHasVaryingBreakOrContinue(); would
    // be nice to clean that up...
@@ -125,11 +126,12 @@ private:
        source and thus, if the emitted code should check to see if all
        active program instances want to follow just one of the 'true' or
        'false' blocks. */
-    const bool doCoherentCheck;
+    const bool doAllCheck;
    const bool doAnyCheck;
    void emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask, 
                                llvm::Value *test) const;
-    void emitCoherentTests(FunctionEmitContext *ctx, llvm::Value *test) const;
+    void emitVaryingIf(FunctionEmitContext *ctx, llvm::Value *test) const;
    void emitMaskAllOn(FunctionEmitContext *ctx,
                       llvm::Value *test, llvm::BasicBlock *bDone) const;
    void emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask, 
@@ -150,8 +152,8 @@ public:
    Stmt *Optimize();
    Stmt *TypeCheck();
    int EstimateCost() const;
 private:
    Expr *testExpr;
    Stmt *bodyStmts;
    const bool doCoherentCheck;
@@ -171,8 +173,8 @@ public:
    Stmt *Optimize();
    Stmt *TypeCheck();
    int EstimateCost() const;
 private:
    /** 'for' statement initializer; may be NULL, indicating no initializer */
    Stmt *init;
    /** expression that returns a value indicating whether the loop should
@@ -198,6 +200,7 @@ public:
    Stmt *Optimize();
    Stmt *TypeCheck();
    int EstimateCost() const;
 private:
    /** This indicates whether the generated code will check to see if no
@@ -219,6 +222,7 @@ public:
    Stmt *Optimize();
    Stmt *TypeCheck();
    int EstimateCost() const;
 private:
    /** This indicates whether the generated code will check to see if no
@@ -240,8 +244,8 @@ public:
    Stmt *Optimize();
    Stmt *TypeCheck();
    int EstimateCost() const;
 private:
    Expr *val;
    /** This indicates whether the generated code will check to see if no
        more program instances are currently running after the return, in
@@ -262,6 +266,7 @@ public:
    Stmt *Optimize();
    Stmt *TypeCheck();
    int EstimateCost() const;
    void Add(Stmt *s) { if (s) stmts.push_back(s); }
    const std::vector<Stmt *> &GetStatements() { return stmts; }
@@ -289,8 +294,8 @@ public:
    Stmt *Optimize();
    Stmt *TypeCheck();
    int EstimateCost() const;
 private:
    /** Format string for the print() statement. */
    const std::string format;
    /** This holds the arguments passed to the print() statement.  If more
--- a/test_static.cpp
+++ b/test_static.cpp
@@ -0,0 +1,154 @@
 /*
  Copyright (c) 2010-2011, Intel Corporation
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */
 #if defined(_WIN32) || defined(_WIN64)
 #define ISPC_IS_WINDOWS
 #elif defined(__linux__)
 #define ISPC_IS_LINUX
 #elif defined(__APPLE__)
 #define ISPC_IS_APPLE
 #endif
 #include <string.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <assert.h>
 #include <stdint.h>
 #ifdef ISPC_IS_LINUX
 #include <malloc.h>
 #endif
 // C-linkage interface between this driver and the test program it is
 // linked against.
 extern "C" {
    // Entry points that are only declared here, not defined in this file
    // (presumably exported by the ispc-compiled test -- the tests/*.ispc
    // files export functions with these names).  main() calls width(),
    // exactly one f_* variant selected by TEST_SIG, and result().
    extern int width();
    extern void f_v(float *result);
    extern void f_f(float *result, float *a);
    extern void f_fu(float *result, float *a, float b);
    extern void f_fi(float *result, float *a, int *b);
    extern void f_du(float *result, double *a, double b);
    extern void f_duf(float *result, double *a, float b);
    extern void f_di(float *result, double *a, int *b);
    extern void result(float *val);
    // Task-system hooks; minimal implementations are defined below in
    // this file.
    void ISPCLaunch(void **handlePtr, void *f, void *d, int);
    void ISPCSync(void *handle);
    void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
 }
 // Minimal task launch for the test driver: the tasks are simply run one
 // after another on the calling thread before this function returns.
 // *handle is set to a dummy non-NULL value first.
 void ISPCLaunch(void **handle, void *f, void *d, int count) {
    typedef void (*TaskFuncType)(void *, int, int, int, int);
    *handle = (void *)0xdeadbeef;
    TaskFuncType task = (TaskFuncType)f;
    // The arguments after the data pointer are (0, 1, taskIndex, count);
    // presumably thread index, thread count, task index and task count --
    // TODO confirm against the task signature ispc generates.
    int taskIndex = 0;
    while (taskIndex < count) {
        task(d, 0, 1, taskIndex, count);
        ++taskIndex;
    }
 }
 // Sync hook for the test driver.  Nothing to wait for: ISPCLaunch() in
 // this file runs every task to completion before it returns, so the
 // handle is ignored.
 void ISPCSync(void *) {
 }
 // Allocation hook for the test driver.  Returns a block of at least
 // 'size' bytes whose address is a multiple of 'alignment' (which must be
 // a power of two), or NULL if the underlying allocation fails.  *handle
 // is set to a dummy non-NULL value.  The memory is never freed; this is
 // acceptable for a short-lived test executable.
 void *ISPCAlloc(void **handle, int64_t size, int32_t alignment) {
    *handle = (void *)0xdeadbeef;
    // and now, we leak...
 #if defined(ISPC_IS_WINDOWS)
    return _aligned_malloc(size, alignment);
 #elif defined(ISPC_IS_LINUX)
    return memalign(alignment, size);
 #else
    // Manual alignment on top of malloc().  Used on OS X and also on any
    // platform that isn't recognized above, so that the function always
    // returns a value.  Over-allocate by (alignment-1) bytes of slack plus
    // room for one pointer that records the original malloc() result just
    // in front of the returned block.
    void *mem = malloc(size + (alignment-1) + sizeof(void*));
    if (mem == NULL)
        return NULL;
    char *amem = ((char*)mem) + sizeof(void*);
    // Advance only by the amount needed to reach the next aligned address.
    // If amem is already aligned it must not be moved: adding a full
    // 'alignment' there would leave only size-1 usable bytes in the block.
    uint64_t misalign = reinterpret_cast<uint64_t>(amem) &
                        (uint64_t)(alignment - 1);
    if (misalign != 0)
        amem = amem + (alignment - misalign);
    ((void**)amem)[-1] = mem;
    return amem;
 #endif
 }
 // Test driver: calls the one f_* entry point selected at compile time by
 // TEST_SIG, calls result() to obtain the expected values, and compares
 // the first width() entries of the two arrays.
 //
 // Exit status without EXPECT_FAILURE: 0 if all values agree, 1 if any
 // differ (each mismatch is printed).  With EXPECT_FAILURE defined: 1 as
 // soon as a mismatch is found and 0 if none is found (presumably the
 // test harness treats 1 as "failed as expected" -- TODO confirm).
 int main(int argc, char *argv[]) {
    int w = width();
    // All buffers below hold 16 entries.
    // NOTE(review): this check disappears when compiled with NDEBUG.
    assert(w <= 16);
    float returned_result[16];
    memset(returned_result, 0, 16*sizeof(float));
    // Fixed input data handed to the function under test.
    float vfloat[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
    double vdouble[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
    int vint[16] = { 2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32 };
    int vint2[16] = { 5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20};
    float b = 5.;
    // TEST_SIG picks which signature the linked test exports; building
    // without a recognized value is a compile-time error.
 #if (TEST_SIG == 0)
    f_v(returned_result);
 #elif (TEST_SIG == 1)
    f_f(returned_result, vfloat);
 #elif (TEST_SIG == 2)
    f_fu(returned_result, vfloat, b);
 #elif (TEST_SIG == 3)
    f_fi(returned_result, vfloat, vint);
 #elif (TEST_SIG == 4)
    f_du(returned_result, vdouble, 5.);
 #elif (TEST_SIG == 5)
    f_duf(returned_result, vdouble, 5.f);
 #elif (TEST_SIG == 6)
    f_di(returned_result, vdouble, vint2);
 #else
 #error "Unknown or unset TEST_SIG value"
 #endif    
    float expected_result[16];
    memset(expected_result, 0, 16*sizeof(float));
    result(expected_result);
    int errors = 0;
    // Only the first w entries are compared, using exact floating-point
    // equality.
    for (int i = 0; i < w; ++i) {
        if (returned_result[i] != expected_result[i]) {
 #ifdef EXPECT_FAILURE
            // bingo, failed
            return 1;
 #else
            // Each value is printed in decimal (%f) and hex-float (%a)
            // form.
            printf("%s: value %d disagrees: returned %f [%a], expected %f [%a]\n",
                   argv[0], i, returned_result[i], returned_result[i], 
                   expected_result[i], expected_result[i]);
            ++errors;
 #endif // EXPECT_FAILURE
        }
    }
 #ifdef EXPECT_FAILURE
    // Don't expect to get here
    return 0;
 #else
    return errors > 0;
 #endif
 }
--- a/tests/array-1.ispc
+++ b/tests/array-1.ispc
@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }
-static float x[2][1];
+static float x[1][2];
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
--- a/tests/array-scatter-vary.ispc
+++ b/tests/array-scatter-vary.ispc
@@ -13,7 +13,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 }
-export void result(uniform float RET[4]) { 
+export void result(uniform float RET[]) { 
    RET[programIndex] = 0;
    RET[3] = 4;
    RET[4] = 5;
--- a/tests/atomics-1.ispc
+++ b/tests/atomics-1.ispc
@@ -5,7 +5,8 @@ uniform unsigned int32 s = 0;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
-    float b = atomic_add_global(s, 1);
+    float delta = 1;
    float b = atomic_add_global(s, delta);
    RET[programIndex] = reduce_add(b);
 }
--- a/tests/atomics-10.ispc
+++ b/tests/atomics-10.ispc
@@ -0,0 +1,17 @@
 // Test: atomic_add_global() executed under a varying 'if' in which only
 // the program instances with programIndex < 2 are active.
 export uniform int width() { return programCount; }
 uniform unsigned int32 s = 0;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
    float b = 0;
    float delta = 1;
    // Only the first two program instances add delta to the global s.
    if (programIndex < 2)
        b = atomic_add_global(s, delta);
    // Every instance reports the final value of s.
    RET[programIndex] = s;
 }
 // Expected: two additions of 1 to s, which starts at 0.
 export void result(uniform float RET[]) {
    RET[programIndex] = 2;
 }
--- a/tests/atomics-11.ispc
+++ b/tests/atomics-11.ispc
@@ -0,0 +1,20 @@
 // Test: atomic_add_global() with a varying operand, executed only by the
 // program instances with an odd programIndex.
 export uniform int width() { return programCount; }
 uniform unsigned int32 s = 0;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
    float b = 0;
    // Each odd-indexed instance adds its own index to the global s.
    if (programIndex & 1)
        b = atomic_add_global(s, programIndex);
    // Every instance reports the final value of s.
    RET[programIndex] = s;
 }
 // Expected: the sum of all odd indices below programCount, computed with
 // a uniform loop.
 export void result(uniform float RET[]) {
    uniform int sum = 0;
    for (uniform int i = 0; i < programCount; ++i)
        if (i & 1)
            sum += i;
    RET[programIndex] = sum;
 }
--- a/tests/atomics-12.ispc
+++ b/tests/atomics-12.ispc
@@ -0,0 +1,20 @@
 // Test: a varying atomic_or_global() executed only by odd-numbered program
 // instances, each OR-ing in the single bit (1 << programIndex).
 // width() reports programCount to the caller.
 export uniform int width() { return programCount; }
 // Shared bit mask targeted by the atomic OR; starts at zero.
 uniform unsigned int32 s = 0;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    // 'a' is loaded from the input but not used afterwards in this test.
    float a = aFOO[programIndex]; 
    float b = 0;
    // Odd-numbered instances set their own bit; even ones skip the OR.
    if (programIndex & 1)
        b = atomic_or_global(s, (1 << programIndex));
    // Every instance stores the value of s read after the conditional OR.
    RET[programIndex] = s;
 }
 // Expected output: the combination of the bits (1 << i) for every odd i below
 // programCount. The bits are distinct, so adding them equals OR-ing them.
 export void result(uniform float RET[]) {
    uniform int sum = 0;
    for (uniform int i = 0; i < programCount; ++i)
        if (i & 1)
            sum += (1 << i);
    RET[programIndex] = sum;
 }
--- a/tests/atomics-13.ispc
+++ b/tests/atomics-13.ispc
@@ -0,0 +1,16 @@
 // Test: checks the values *returned* by a varying atomic_or_global() (rather
 // than the final value of the global) when only odd-numbered program
 // instances take part.
 // width() reports programCount to the caller.
 export uniform int width() { return programCount; }
 // Shared bit mask targeted by the atomic OR; starts at zero.
 uniform unsigned int32 s = 0;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    // 'a' is loaded from the input but not used afterwards in this test.
    float a = aFOO[programIndex]; 
    // b stays 0 for instances that do not execute the atomic.
    float b = 0;
    if (programIndex & 1)
        b = atomic_or_global(s, (1 << programIndex));
    // Take the largest returned value across instances and count its set bits.
    RET[programIndex] = popcnt(reduce_max((int32)b));
 }
 // Expected output: one less than the number of odd-numbered instances.
 // NOTE(review): this presumably relies on atomic_or_global returning, for
 // each instance, the value as if the preceding instances had already applied
 // their OR, so the largest return value lacks only the last instance's own
 // bit -- confirm against the stdlib implementation.
 export void result(uniform float RET[]) {
    RET[programIndex] = (programCount/2) - 1;
 }
--- a/tests/atomics-14.ispc
+++ b/tests/atomics-14.ispc
@@ -0,0 +1,20 @@
 // Test: a varying atomic_or_global() on a 64-bit global with a non-zero
 // initial value, executed only by odd-numbered program instances.
 // width() reports programCount to the caller.
 export uniform int width() { return programCount; }
 // 64-bit shared value targeted by the atomic OR; the upper 40 bits start set
 // and the low 24 bits start clear.
 uniform unsigned int64 s = 0xffffffffff000000;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    // 'a' is loaded from the input but not used afterwards in this test.
    float a = aFOO[programIndex]; 
    float b = 0;
    // Odd-numbered instances OR in their own bit; even ones skip the OR.
    if (programIndex & 1)
        b = atomic_or_global(s, (1 << programIndex));
    // The low 20 bits of s are shifted out before the value is stored.
    RET[programIndex] = (s>>20);
 }
 // Expected output: the initial constant combined with the bits (1 << i) for
 // every odd i below programCount, shifted right by 20 to mirror f_f.
 export void result(uniform float RET[]) {
    uniform int sum = 0;
    for (uniform int i = 0; i < programCount; ++i)
        if (i & 1)
            sum += (1 << i);
    RET[programIndex] = ((unsigned int64)(0xffffffffff000000 | sum)) >> 20;
 }
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Matt Pharr	2f35bc1a0f	Release notes and doxygen bump for v1.0.10	2011-09-30 15:09:19 -07:00
Matt Pharr	1620e0508d	Added deferred shading workload	2011-09-30 15:09:04 -07:00
Matt Pharr	cb7976bbf6	Added updated task launch implementation that now tracks task groups. Within each function that launches tasks, we now can easily track which tasks that function launched, so that the sync at the end of the function can just sync on the tasks launched by that function (not all tasks launched by all functions.) Implementing this led to a rework of the task system API that ispc generates code to call; the example task systems in examples/tasksys.cpp have been updated to conform to this API. (The updated API is also documented in the ispc user's guide.) As part of this, "launch[n]" syntax was added to launch a number of tasks in a single launch statement, rather than requiring a loop over 'n' to launch n tasks. This commit thus fixes issue #84 (enhancement to launch multiple tasks from a single launch statement) as well as issue #105 (recursive task launches were broken).	2011-09-30 11:20:53 -07:00
Matt Pharr	5ee4d7fce8	Add comment	2011-09-30 11:11:52 -07:00
Matt Pharr	8f3e46f67e	Use InterlockedExchangeAdd on Windows	2011-09-29 16:19:59 -07:00
Matt Pharr	9ed07ff2b5	Fix __num_cores() definition on Windows to not cause unresolved symbols	2011-09-29 13:35:50 -07:00
Matt Pharr	32a0a30cf5	Only allow exact matches for function overload resolution for builtins. The intent is that the code in stdlib.ispc that is calling out to the built-ins should match argument types exactly (using explicit casts as needed), just for maximal clarity/safety.	2011-09-28 17:20:31 -07:00
Matt Pharr	6d39d5fc3e	Small cleanups. Add __num_cores() to the list of symbols to remove from the module at the end. Fix declarations of mask type for 64-bit atomics to silence warnings.	2011-09-28 16:26:35 -07:00
Matt Pharr	c999c8a237	Add num_cores() stdlib routine. Issue #102 .	2011-09-28 16:16:58 -07:00
Matt Pharr	aad269fdf4	Added support for 'uniform' global atomics. Issue #93.	2011-09-28 16:06:07 -07:00
Matt Pharr	d45c536c47	Fix Windows debug build of simple example	2011-09-28 14:11:32 -07:00
Matt Pharr	f1b8e5b1bf	Release notes and doxygen bump for 1.0.9 release	2011-09-26 16:21:32 -07:00
Matt Pharr	e7a70b05af	Fix statically-linked tests on Linux	2011-09-26 16:11:45 -07:00
Matt Pharr	cf73286938	More small Windows build fixes. Also switch to LLVM 3.0 libs	2011-09-26 16:07:23 -07:00
Matt Pharr	e6f80c0adc	Remove stale include of MCJIT.h	2011-09-26 16:04:52 -07:00
Matt Pharr	5e31d7b6d0	Windows build: use LLVM_INSTALL_DIR to find clang.exe	2011-09-26 16:04:50 -07:00
Matt Pharr	649f2ad7b7	Update parser to make 'sync' a statement, not an expr.	2011-09-23 20:33:24 -07:00
Matt Pharr	fade1cdf1d	Pretty much all conversions to varying double are slow, so don't bother warning about them.	2011-09-23 16:03:35 -07:00
Matt Pharr	d261105a86	Error/warning reporting improvements. - Don't suggest matches when given an empty string or a single, non-alpha character. - Also fixed the parser to be a bit less confusing when it encounters an unexpected EOF.	2011-09-23 15:51:23 -07:00
Matt Pharr	b3d3e8987b	Provide a properly initialized TextDiagnosticPrinter to clang's preprocessor. Fixes issue #100 (crash when the preprocessor was trying to emit a diagnostic about a mismatched #if/#endif).	2011-09-23 15:50:18 -07:00
Matt Pharr	4e91f3777a	Fix BinaryExpr to handle reference-typed operands. Fixes issue #101.	2011-09-23 15:19:14 -07:00
Matt Pharr	5584240c7f	Fix crash with function declarations with unnamed parameters. Fixes issue #103. Previously, we were inadvertently grabbing the function's return type for the parameter, rather than the actual parameter type.	2011-09-23 15:05:59 -07:00
Matt Pharr	7126a39092	Disable PIC on Windows	2011-09-19 15:32:43 -07:00
Matt Pharr	8ad28a3f6f	update doxygen, release notes for 1.0.8 release	2011-09-19 15:22:25 -07:00
Matt Pharr	9921b8e530	Predicated 'if' statement performance improvements. Go back to running both sides of 'if' statements with masking and without branching if we can determine that the code is relatively simple (as per the simple cost model), and is safe to run even if the mask is 'all off'. This gives a bit of a performance improvement for some of the examples (most notably, the ray tracer), and is the code that one wants generated in this case anyhow.	2011-09-19 09:54:09 -07:00
Matt Pharr	9052d4b10b	Linux build fixes	2011-09-17 13:42:46 -07:00
Matt Pharr	2405dae8e6	Use malloc() to get space for task arguments when compiling to AVX. This is to work around the LLVM bug/limitation discussed in LLVM bug 10841 (http://llvm.org/bugs/show_bug.cgi?id=10841).	2011-09-17 13:38:51 -07:00
Matt Pharr	3607f3e045	Remove support for building with LLVM 2.8. Fixes issue #66 . Both 2.9 and top-of-tree generate substantially better code than LLVM 2.8 did, so it's not worth fixing the 2.8 build.	2011-09-17 13:18:59 -07:00
Matt Pharr	de84acfa5d	On OSX with LLVM 2.9, always generate position-independent code. Fixes Issue #99.	2011-09-17 13:03:51 -07:00
Matt Pharr	a501ab1aa6	Fix parenthesization bugs in cost estimates. Also added the debugging print that helped find these issues. Revert inlining some functions in examples	2011-09-16 19:07:07 -07:00
Matt Pharr	cdc850f98c	Inline some functions in examples	2011-09-16 17:02:21 -07:00
Matt Pharr	ca87579f23	Add a very simple cost model to estimate runtime cost of running code. This is currently only used to decide whether it's worth doing an "are all lanes running" check at the start of functions--for small functions, it's not worth the overhead. The cost is estimated relatively early in compilation (e.g. before we know if an array access is a scatter/gather or not, before constant folding, etc.), so there are many known shortcomings.	2011-09-16 15:09:17 -07:00
Matt Pharr	38fc13d1ab	Remove now unused function.	2011-09-16 14:21:13 -07:00
Matt Pharr	cf9d9f717e	Logic simplification to 'mixed true/false' case for coherent ifs. Use the approach from `173632f446` here as well.	2011-09-16 14:10:55 -07:00
Matt Pharr	173632f446	Generate more efficient code for regular varying 'if' statements. For the case where we have a regular (i.e. non-'cif') 'if' statement, the generated code just simply checks to see if any program instance is running before running the corresponding statements. This is a lighter-weight check than IfStmt::emitMaskMixed() was performing.	2011-09-16 12:03:42 -07:00
Matt Pharr	1dedd88132	Improve implementation of 'are both masks equal' check for AVX. Previously, we did a vector equal compare and then a movmsk, the result of which we checked to see if it was on for all lanes. Because masks are vectors of i32s, under AVX, the vector equal compare required two 4-wide SSE compares and some shuffling. Now, we do a movmsk of both masks first and then a scalar equality comparison of those two values, which seems to generate overall better code.	2011-09-15 06:25:02 -07:00
Matt Pharr	0848c2cc19	Actually make all 'if' statements check for 'all off' mask. Contrary to claims in `0c2048385`, that checkin didn't include the changes to not run if/else blocks if none of the program instances wanted to be running them. This checkin fixes that and thus actually fixes issue #74.	2011-09-13 19:48:04 -07:00
Matt Pharr	e2a88d491f	Mark the internal __fast_masked_vload function as static	2011-09-13 15:43:48 -07:00
Matt Pharr	30f9dcd4f5	Unroll loops by default, add --opt=disable-loop-unroll to disable. Issue #78.	2011-09-13 15:37:18 -07:00
Matt Pharr	0c344b6755	Fix Linux build of mandelbrot_tasks example	2011-09-13 15:17:30 -07:00
Matt Pharr	6734021520	Issue warning when compile-time constant out-of-bounds array index is used. Issue #98. Also fixes two examples that had bugs of this type that this warning uncovered!	2011-09-13 14:42:20 -07:00
Matt Pharr	dd153d3c5c	Handle more instruction types when flattening offset vectors. Generalize the lScalarizeVector() utility routine (used in determining when we can change gathers/scatters into vector loads/stores, respectively) to handle vector shuffles and vector loads. This fixes issue #79, which provided a case where a gather was being performed even though a vector load was possible.	2011-09-13 09:43:56 -07:00
Matt Pharr	9ca7541d52	Remove check for any program instances running before function calls. Given the change in `0c20483853`, this is no longer necessary, since we know that one instance will always be running if we're executing a given block of code.	2011-09-13 06:26:16 -07:00
Matt Pharr	0c20483853	Make all "if" statements "coherent" ifs. Workaround for issue #74 . Using blend to do masked stores is unsafe if all of the lanes are off: it may read from or write to invalid memory. For now, this workaround transforms all 'if' statements into coherent 'if's, ensuring that an instruction only runs if at least one program instance wants to be running it. One nice thing about this change is that a number of implementations of various builtins can be simplified, since they no longer need to confirm that at least one program instance is running. It might be nice to re-enable regular if statements in a future checkin, but we'd want to make sure they don't have any masked loads or blended masked stores in their statement lists. There isn't a performance impact for any of the examples with this change, so it's unclear if this is important. Note that this only impacts 'if' statements with a varying condition.	2011-09-12 16:25:08 -07:00
Matt Pharr	9d4ff1bc06	Fix alignment in usage message	2011-09-12 15:06:41 -07:00
Matt Pharr	83f22f1939	Add experimental --fast-masked-vload flag for SSE.	2011-09-12 12:29:33 -07:00
Matt Pharr	6375ed9224	AVX: Fix bug with misdeclaration of blend intrinsic. This was preventing the "convert an all-on blend to one of the operand values" optimization from kicking on in AVX.	2011-09-12 06:42:38 -07:00
Matt Pharr	cf23cf9ef4	Fix typo in user guide. Issue #96	2011-09-12 05:24:32 -07:00
Matt Pharr	1147b53dcd	Add #define with target vector width in emitted headers	2011-09-09 09:33:56 -07:00
Matt Pharr	4cf831a651	When --fast-math is enabled, tell LLVM about it, too.	2011-09-09 09:32:59 -07:00
Matt Pharr	785d8a29d3	Run mem2reg pass even when doing -O0 compiles	2011-09-09 09:24:43 -07:00
Matt Pharr	46d2bad231	Fix malformed program crash	2011-09-09 09:24:43 -07:00
Matt Pharr	32da8e11b4	Fix crash with varying global vector types when emitting header file.	2011-09-09 09:16:59 -07:00
Matt Pharr	5dedb6f836	Add --scale command line argument to mandelbrot and rt examples. This applies a floating-point scale factor to the image resolution; it's useful for experiments with many-core systems where the base image resolution may not give enough work for good load-balancing with tasks.	2011-09-07 20:07:51 -07:00
Matt Pharr	2ea6d249d5	Fix mapping to 8, 16 program instances in AO bench example. With this, we now compute a correct image with AVX.	2011-09-07 11:34:24 -07:00
Matt Pharr	c86128e8ee	AVX: go back to using blend (vs. masked store) when possible. All of the masked store calls were inhibiting putting values into registers, which in turn led to a lot of unnecessary stack traffic. This approach seems to give better code in the end.	2011-09-07 11:26:49 -07:00
Matt Pharr	375f1cb8e8	Make octaves and octaves loop uniform in noise example	2011-09-07 10:34:23 -07:00
Matt Pharr	3ca7b6b078	Remove MCJIT stuff from ispc_test (fix Linux build)	2011-09-07 09:44:27 -07:00
Matt Pharr	effe901890	Add task-parallel version of aobench	2011-09-07 05:43:21 -07:00
Matt Pharr	4f451bd041	More AVX fixes Fix RNG state initialization for 16-wide targets Fix a number of bugs in reduce_add builtin implementations for AVX. Fix some tests that had incorrect expected results for the 16-wide case.	2011-09-06 15:53:11 -07:00
Matt Pharr	c76ef7b174	Add command-line option to specify position-independent codegen	2011-09-06 11:12:43 -07:00
Matt Pharr	743d82e935	Various documentation updates.	2011-09-06 09:51:02 -07:00
Matt Pharr	18546e9c6d	Add option to disable optimizations to test running script	2011-09-04 18:09:00 -07:00
Matt Pharr	f24ab16b91	Release notes, doxygen update for 1.0.7 release.	2011-09-03 07:33:39 -07:00
Matt Pharr	766b34683c	Fix Windows build	2011-09-03 07:23:16 -07:00
Matt Pharr	b5bfa43e92	Fix error with float suffixes	2011-09-02 13:09:25 -07:00
Matt Pharr	99221f7d17	Fix a few places in examples where C reference implementation had a double-precision fp constant undesirably causing computation to be done in double precision. Makes C scalar versions of the options pricing models, rt, and aobench 3-5% faster. Makes scalar version of noise about 15% faster. Others are unchanged.	2011-09-01 16:31:22 -07:00
Matt Pharr	eb7913f1dd	AVX: fix alignment when changing masked load to regular load. Also added some debugging/tracing stuff (commented out). Commented out iffy assert that was hitting for avx stuff.	2011-09-01 15:45:49 -07:00
Matt Pharr	08cad7a665	AVX bugfixes	2011-09-01 14:23:10 -07:00
Matt Pharr	9cd92facbd	Fix test: was incorrectly failing for 8-wide targets	2011-09-01 05:03:49 -07:00
Matt Pharr	85063f493c	Revert attempt to be clever about which LLVM libraries to link in--just link all of them. (This was causing build problems for some folks.)	2011-09-01 05:02:44 -07:00
Matt Pharr	f65a20c700	AVX bugfix: when replacing 'all on' masked store with a store, the rvalue is operand 2, not operand 1 (which is the mask!)	2011-08-31 18:06:29 -07:00
Matt Pharr	e144724979	Improve performance of global atomics, taking advantage of associativity. For associative atomic ops (add, and, or, xor), we can take advantage of their associativity to do just a single hardware atomic instruction, rather than one for each of the running program instances (as the previous implementation did.) The basic approach is to locally compute a reduction across the active program instances with the given op and to then issue a single HW atomic with that reduced value as the operand. We then take the old value that was stored in the location that is returned from the HW atomic op and use that to compute the values to return to each of the program instances (conceptually representing the cumulative effect of each of the preceding program instances having performed their atomic operation.) Issue #56.	2011-08-31 05:35:01 -07:00
Matt Pharr	96a297c747	Small improvements to help output	2011-08-30 14:48:22 -07:00
Matt Pharr	67e00b97c6	Fix incorrect assertions in ConstExpr constructors	2011-08-30 11:08:53 -07:00
Matt Pharr	a94cabc692	Modify stencil example to do separate runs with and without task parallelism.	2011-08-30 05:08:21 -07:00
Matt Pharr	ad9e66650d	AVX bugfix with alignment for store instructions. When replacing 'all on' masked store with regular store, set alignment to be the vector element alignment, not the alignment for a whole vector. (i.e. 4 or 8 byte alignment, not 32 or 64).	2011-08-29 16:58:48 -07:00
Matt Pharr	6de494cfdb	Fix AVX bug introduced in `4ab982bc16`	2011-08-29 16:50:59 -07:00
Matt Pharr	58e34ba4ae	Add new test-driver script, run_tests.py. Old run_tests.sh still lives (for now). Changes include: - Tests are run in parallel across all of the available CPU cores - Option to create a statically-linked executable for each test (rather than using the LLVM JIT). This is in particular useful for AVX, which doesn't have good JIT support yet. - Static executables also makes it possible to test x86, not just x86-64, codegen. - Fixed a number of tests in failing_tests, which were actually failing due to the fact that the expected function signature of tests had changed.	2011-08-29 14:15:09 -07:00
Matt Pharr	33feeffe5d	Update timing header so it works with C code	2011-08-29 11:23:43 -07:00
Matt Pharr	d0db46aac5	Use logical shift right op for shifts of unsigned ints. Fixes issue #88 .	2011-08-29 10:32:26 -07:00
Matt Pharr	da76396c75	Fix typo in SSE2 attributes string.	2011-08-27 08:59:25 -07:00
Matt Pharr	bbf3fb6307	Disable popcnt on SSE4 targets--should only enable if system CPU supports it	2011-08-27 04:09:55 -07:00
Matt Pharr	4ab982bc16	Various AVX fixes (found by inspection). Emit calls to masked_store, not masked_store_blend, when handling masked stores emitted by the frontend. Fix bug in binary8to16 macro in builtins.m4 Fix bug in 16-wide version of __reduce_add_float Remove blend function implementations for masked_store_blend for AVX; just forward those on to the corresponding real masked store functions.	2011-08-26 12:58:02 -07:00
Matt Pharr	34301e09f5	Fix incorrect comment in builtins definitions files. (And all of the places it was cut and pasted to. :-( ).	2011-08-26 10:44:46 -07:00
Matt Pharr	84e586e767	Commit correct atomics tests	2011-08-26 10:43:30 -07:00
Matt Pharr	72a2f5d2f4	Make SSE2 __popcnt_int64 return i64 to be consistent with other targets	2011-08-26 10:42:12 -07:00
Matt Pharr	606cbab0d4	Performance improvements for global min/max atomics. Issue #57 . Compute a "local" min/max across the active program instances and then do a single atomic memory op. Added a few tests to exercise global min/max atomics (which were previously untested!)	2011-08-26 10:35:24 -07:00
Matt Pharr	54ec56c81d	Clean up and centralize LLVM target initialization	2011-08-26 10:15:33 -07:00
Matt Pharr	a322398c62	When emitting header files, put 'extern' declarations of globals used in ispc code outside of the ispc namespace. Fixes issue #64.	2011-08-26 10:03:06 -07:00
Matt Pharr	f22b3a25bd	Update command-line processing and usage string now that we have a preprocessor on Windows. We had been prohibiting Windows users from providing #definitions on the command line, which is the wrong thing to do ever since we switched to using the clang preprocessor.	2011-08-26 09:58:08 -07:00
Matt Pharr	b67498766e	Big rewrite / improvement of target handling. If no CPU is specified, use the host CPU type, not just a default of "nehalem". Provide better features strings to the LLVM target machinery. -> Thus ensuring that LLVM doesn't generate SSE>2 instructions for the SSE2 target (Fixes issue #82). -> Slight code improvements from using cmovs in generated code now Use the llvm popcnt intrinsic for the SSE2 target now (it now generates code that doesn't call the popcnt instruction now that we properly tell LLVM which instructions are and aren't available for SSE2.)	2011-08-26 09:54:45 -07:00
Matt Pharr	c340ff3893	Fixes to build with LLVM ToT	2011-08-25 08:53:56 +01:00
Matt Pharr	b0f59777d4	Silly bug: don't pass NULL to the print() stmt when we want a llvm::Value * that has the value NULL. (This was causing crashes with print() statements with no additional values to be printed.)	2011-08-25 07:48:13 +01:00
Matt Pharr	e14208f489	Update to call DIBuilder::finalize() with LLVM 3.0	2011-08-24 22:28:20 +01:00
Matt Pharr	7756265503	Add double-pumped AVX target (i.e., run 16-wide). Not yet tested.	2011-08-20 11:28:22 +01:00
Matt Pharr	f841b775c3	Small bugfixes in AVX builtins	2011-08-20 09:09:55 +01:00
Matt Pharr	8c921544a0	fix broken test	2011-08-18 20:40:50 +01:00
Matt Pharr	fe54f1ad8e	Fixes to build with latest LLVM ToT	2011-08-18 08:34:49 +01:00
Matt Pharr	74c2c8ae07	Linux build fixes	2011-08-17 07:08:44 -07:00