update doxygen, release notes for 1.0.8 release

Predicated 'if' statement performance improvements.
Go back to running both sides of 'if' statements with masking and without branching if we can determine that the code is relatively simple (as per the simple cost model), and is safe to run even if the mask is 'all off'. This gives a bit of a performance improvement for some of the examples (most notably, the ray tracer), and is the code that one wants generated in this case anyhow.
2011-09-19 15:22:25 -07:00 · 2011-09-19 09:54:09 -07:00 · 2011-09-17 13:42:46 -07:00 · 2011-09-17 13:38:51 -07:00 · 2011-09-17 13:18:59 -07:00 · 2011-09-17 13:03:51 -07:00
318 changed files with 17731 additions and 4709 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,5 @@ depend
 ispc
 ispc_test
 objs
+docs/doxygen
+docs/ispc.html
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -114,3 +114,30 @@ CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
 SOFTWARE.
+
+---------------------------------------------------------------------------
+
+ispc's code to convert to and from half-precision floats is based on James
+Tursa's code, which is covered by the following license:
+
+Redistribution and use in source and binary forms, with or without 
+modification, are permitted provided that the following conditions are 
+met:
+
+   * Redistributions of source code must retain the above copyright 
+     notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above copyright 
+     notice, this list of conditions and the following disclaimer in 
+     the documentation and/or other materials provided with the distribution
+      
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+POSSIBILITY OF SUCH DAMAGE.
--- a/69
+++ b/69
@@ -2,15 +2,26 @@
 # ispc Makefile
 #

-ARCH = $(shell uname)
+ARCH_OS := $(shell uname)
+ARCH_TYPE := $(shell uname -m)# machine type; the "arch" tool is not installed everywhere

 CLANG=clang
-LLVM_LIBS=$(shell llvm-config --ldflags --libs) -lpthread -ldl
+CLANG_LIBS = -lclangFrontend -lclangDriver \
+             -lclangSerialization -lclangParse -lclangSema \
+             -lclangAnalysis -lclangAST -lclangLex -lclangBasic
+
+ISPC_LIBS=$(CLANG_LIBS) \
+	$(shell llvm-config --ldflags --libs) \
+	-lpthread -ldl
+ISPC_TEST_LIBS=$(shell llvm-config --ldflags --libs) \
+	-lpthread -ldl
+
 LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
-LLVM_VERSION_DEF=-DLLVM_$(shell llvm-config --version | sed s/\\./_/)
+LLVM_VERSION:=$(shell llvm-config --version | sed s/\\./_/)
+LLVM_VERSION_DEF:=-DLLVM_$(LLVM_VERSION)

 BUILD_DATE=$(shell date +%Y%m%d)
-BUILD_VERSION=$(shell git log | head -1)
+BUILD_VERSION:=$(shell git log --abbrev-commit --abbrev=16 | head -1)

 CXX=g++
 CPP=cpp
@@ -18,10 +29,14 @@ CXXFLAGS=-g3 $(LLVM_CXXFLAGS) -I. -Iobjs/ -Wall $(LLVM_VERSION_DEF) \
 	-DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\""

 LDFLAGS=
-ifeq ($(ARCH),Linux)
+ifeq ($(ARCH_OS),Linux)
  # try to link everything statically under Linux (including libstdc++) so
  # that the binaries we generate will be portable across distributions...
-  LDFLAGS=-static -L/usr/lib/gcc/x86_64-linux-gnu/4.4
+  ifeq ($(ARCH_TYPE),x86_64)
+    LDFLAGS=-static -L/usr/lib/gcc/x86_64-linux-gnu/4.4
+  else
+    LDFLAGS=-L/usr/lib/gcc/i686-redhat-linux/4.6.0
+  endif
 endif

 LEX=flex
@@ -34,17 +49,19 @@ CXX_SRC=builtins.cpp ctx.cpp decl.cpp expr.cpp ispc.cpp \
 	util.cpp
 HEADERS=builtins.h ctx.h decl.h expr.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
-STDLIB_SRC=stdlib-avx.ll stdlib-sse2.ll stdlib-sse4.ll stdlib-sse4x2.ll
+BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll \
+	builtins-sse4.ll builtins-sse4x2.ll
 BISON_SRC=parse.yy
 FLEX_SRC=lex.ll

-OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(STDLIB_SRC:.ll=.o) stdlib-c.o stdlib_ispc.o \
-	$(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o))
+OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_SRC:.ll=.o) \
+	builtins-c-32.o builtins-c-64.o stdlib_ispc.o $(BISON_SRC:.yy=.o) \
+	$(FLEX_SRC:.ll=.o))

 default: ispc ispc_test

 .PHONY: dirs clean depend doxygen print_llvm_src
-.PRECIOUS: objs/stdlib-%.cpp
+.PRECIOUS: objs/builtins-%.cpp

 depend: $(CXX_SRC) $(HEADERS)
 	@echo Updating dependencies
@@ -68,11 +85,11 @@ doxygen:

 ispc: print_llvm_src dirs $(OBJS)
 	@echo Creating ispc executable
-	@$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(LLVM_LIBS)
+	@$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(ISPC_LIBS)

 ispc_test: dirs ispc_test.cpp
 	@echo Creating ispc_test executable
-	@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $@ ispc_test.cpp $(LLVM_LIBS)
+	@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $@ ispc_test.cpp $(ISPC_TEST_LIBS)

 objs/%.o: %.cpp
 	@echo Compiling $<
@@ -94,27 +111,33 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
 	@echo Compiling $<
 	@$(CXX) $(CXXFLAGS) -o $@ -c $<

-$(STDLIB_SRC): stdlib.m4
+objs/builtins-%.cpp: builtins-%.ll builtins.m4 builtins-sse.ll builtins-avx-common.ll
+	@echo Creating C++ source from builtin definitions file $<
+	@m4 -DLLVM_VERSION=$(LLVM_VERSION) builtins.m4 $< | ./bitcode2cpp.py $< > $@

-objs/stdlib-%.cpp: stdlib-%.ll
-	@echo Creating C++ source from stdlib file $<
-	@m4 stdlib.m4 $< | ./bitcode2cpp.py $< > $@
-
-objs/stdlib-%.o: objs/stdlib-%.cpp
+objs/builtins-%.o: objs/builtins-%.cpp
 	@echo Compiling $<
 	@$(CXX) $(CXXFLAGS) -o $@ -c $<

-objs/stdlib-c.cpp: stdlib-c.c
-	@echo Creating C++ source from stdlib file $<
-	@$(CLANG) -I /opt/l1om/usr/include/ -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py $< > $@
+objs/builtins-c-32.cpp: builtins-c.c
+	@echo Creating C++ source from builtins definition file $<
+	@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-32.c > $@

-objs/stdlib-c.o: objs/stdlib-c.cpp
+objs/builtins-c-32.o: objs/builtins-c-32.cpp
+	@echo Compiling $<
+	@$(CXX) $(CXXFLAGS) -o $@ -c $<
+
+objs/builtins-c-64.cpp: builtins-c.c
+	@echo Creating C++ source from builtins definition file $<
+	@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-64.c > $@
+
+objs/builtins-c-64.o: objs/builtins-c-64.cpp
 	@echo Compiling $<
 	@$(CXX) $(CXXFLAGS) -o $@ -c $<

 objs/stdlib_ispc.cpp: stdlib.ispc
 	@echo Creating C++ source from $<
-	@$(CPP) -DISPC=1 -DPI=3.1415926536 $< | ./stdlib2cpp.py > $@
+	@$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | ./stdlib2cpp.py > $@

 objs/stdlib_ispc.o: objs/stdlib_ispc.cpp
 	@echo Compiling $<
--- a/bitcode2cpp.py
+++ b/bitcode2cpp.py
@@ -9,7 +9,7 @@ length=0

 src=str(sys.argv[1])

-target = re.sub(".*stdlib-", "", src)
+target = re.sub(".*builtins-", "", src)
 target = re.sub("\.ll$", "", target)
 target = re.sub("\.c$", "", target)
 target = re.sub("-", "_", target)
@@ -20,14 +20,14 @@ except IOError:
    print >> sys.stderr, "Couldn't open " + src
    sys.exit(1)

-print "unsigned char stdlib_bitcode_" + target + "[] = {"
+print "unsigned char builtins_bitcode_" + target + "[] = {"
 for line in as_out.stdout.readlines():
    length = length + len(line)
    for c in line:
        print ord(c)
        print ", "
 print " 0 };\n\n"
-print "int stdlib_bitcode_" + target + "_length = " + str(length) + ";\n"
+print "int builtins_bitcode_" + target + "_length = " + str(length) + ";\n"

 as_out.wait()

--- a/builtins-avx-common.ll
+++ b/builtins-avx-common.ll
@@ -0,0 +1,278 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; *** Untested *** AVX target implementation.
+;;
+;; The LLVM AVX code generator is incomplete, so the ispc AVX target
+;; hasn't yet been tested.  There is therefore a higher-than-normal
+;; chance that there are bugs in the code in this file.
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
+
+define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
+;    uniform float iv = extract(__rcp_u(v), 0);
+;    return iv * (2. - v * iv);
+  %vecval = insertelement <4 x float> undef, float %0, i32 0
+  %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
+  %scall = extractelement <4 x float> %call, i32 0
+
+  ; do one N-R iteration
+  %v_iv = fmul float %0, %scall
+  %two_minus = fsub float 2., %v_iv  
+  %iv_mul = fmul float %scall, %two_minus
+  ret float %iv_mul
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding floats
+
+declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
+
+define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
+  ; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
+  ; the roundss intrinsic is a total mess--docs say:
+  ;
+  ;  __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
+  ;       
+  ;  b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
+  ;  on b0. The higher order 96 bits are copied directly from input parameter a. The
+  ;  return value is described by the following equations:
+  ;
+  ;  r0 = RND(b0)
+  ;  r1 = a1
+  ;  r2 = a2
+  ;  r3 = a3
+  ;
+  ;  It doesn't matter what we pass as a, since we only need the r0 value
+  ;  here.  So we pass the same register for both.
+  %xi = insertelement <4 x float> undef, float %0, i32 0
+  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
+  %rs = extractelement <4 x float> %xr, i32 0
+  ret float %rs
+}
+
+define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
+  ; see above for round_ss intrinsic discussion...
+  %xi = insertelement <4 x float> undef, float %0, i32 0
+  ; roundss, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
+  %rs = extractelement <4 x float> %xr, i32 0
+  ret float %rs
+}
+
+define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
+  ; see above for round_ss intrinsic discussion...
+  %xi = insertelement <4 x float> undef, float %0, i32 0
+  ; roundss, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
+  %rs = extractelement <4 x float> %xr, i32 0
+  ret float %rs
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
+  %xi = insertelement <2 x double> undef, double %0, i32 0   ; only element 0 is meaningful; see the round_ss discussion above
+  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)   ; roundsd, nearest 0b00 | no precision exceptions 0b1000 = 8
+  %rs = extractelement <2 x double> %xr, i32 0
+  ret double %rs
+}
+
+define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
+  ; see above for round_ss intrinsic discussion...
+  %xi = insertelement <2 x double> undef, double %0, i32 0
+  ; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
+  %rs = extractelement <2 x double> %xr, i32 0
+  ret double %rs
+}
+
+define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
+  ; see above for round_ss intrinsic discussion...
+  %xi = insertelement <2 x double> undef, double %0, i32 0
+  ; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
+  %rs = extractelement <2 x double> %xr, i32 0
+  ret double %rs
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
+
+define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
+  ;  uniform float is = extract(__rsqrt_u(v), 0);
+  %v = insertelement <4 x float> undef, float %0, i32 0
+  %vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
+  %is = extractelement <4 x float> %vis, i32 0
+
+  ;  return 0.5 * is * (3. - (v * is) * is);
+  %v_is = fmul float %0, %is
+  %v_is_is = fmul float %v_is, %is
+  %three_sub = fsub float 3., %v_is_is
+  %is_mul = fmul float %is, %three_sub
+  %half_scale = fmul float 0.5, %is_mul
+  ret float %half_scale
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; sqrt
+
+declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
+
+define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
+  sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
+  ret float %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; fastmath
+
+declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
+declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
+
+define internal void @__fastmath() nounwind alwaysinline {
+  %ptr = alloca i32                       ; stack slot used to exchange the register value with the two intrinsics
+  %ptr8 = bitcast i32 * %ptr to i8 *      ; both intrinsics are declared as taking an i8 * argument
+  call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)   ; write the current control/status value into the slot
+  %oldval = load i32 *%ptr
+
+  ; turn on DAZ (64)/FTZ (32768) -> 32832
+  %update = or i32 %oldval, 32832         ; all other bits of the old value are kept
+  store i32 %update, i32 *%ptr
+  call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)   ; load the updated value back from the slot
+  ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
+  ret float %ret
+}
+
+define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
+  ret float %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
+  ret i32 %ret
+}
+
+define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
+  ret i32 %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
+  ret i32 %ret
+}
+
+define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
+  ret i32 %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops
+
+declare i32 @llvm.ctpop.i32(i32) nounwind readnone
+
+define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
+  %call = call i32 @llvm.ctpop.i32(i32 %0)
+  ret i32 %call
+}
+
+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
+
+define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
+  %call = call i64 @llvm.ctpop.i64(i64 %0)
+  ret i64 %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+; the scalar double sqrt intrinsic is an sse2 one, like the sd min and max intrinsics
+define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
+  sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
+  ret double %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
+  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
+  ret double %ret
+}
+
+define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
+  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
+  ret double %ret
+}
--- a/builtins-avx-x2.ll
+++ b/builtins-avx-x2.ll
@@ -0,0 +1,665 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; *** Untested *** AVX target implementation.
+;;
+;; The LLVM AVX code generator is incomplete, so the ispc AVX target
+;; hasn't yet been tested.  There is therefore a higher-than-normal
+;; chance that there are bugs in the code in this file.
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Basic 16-wide definitions
+
+stdlib_core(16)
+packed_load_and_store(16)
+scans(16)
+int64minmax(16)
+
+include(`builtins-avx-common.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
+
+define internal <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
+  ;  float iv = __rcp_v(v);
+  ;  return iv * (2. - v * iv);
+
+  unary8to16(call, float, @llvm.x86.avx.rcp.ps.256, %0)
+  ; do one N-R iteration
+  %v_iv = fmul <16 x float> %0, %call
+  %two_minus = fsub <16 x float> <float 2., float 2., float 2., float 2.,
+                                  float 2., float 2., float 2., float 2.,
+                                  float 2., float 2., float 2., float 2.,
+                                  float 2., float 2., float 2., float 2.>, %v_iv  
+  %iv_mul = fmul <16 x float> %call, %two_minus
+  ret <16 x float> %iv_mul
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding floats
+
+declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
+
+define internal <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
+  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
+  round8to16(%0, 8)
+}
+
+define internal <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  round8to16(%0, 9)
+}
+
+define internal <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  round8to16(%0, 10)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
+
+define internal <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline {
+  round4to16double(%0, 8)
+}
+
+define internal <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline {
+  round4to16double(%0, 9)
+}
+
+define internal <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline {
+  round4to16double(%0, 10)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
+declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
+
+define internal <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
+  ;  float is = __rsqrt_v(v);
+  unary8to16(is, float, @llvm.x86.avx.rsqrt.ps.256, %v)
+  ;  return 0.5 * is * (3. - (v * is) * is);
+  %v_is = fmul <16 x float> %v, %is
+  %v_is_is = fmul <16 x float> %v_is, %is
+  %three_sub = fsub <16 x float> <float 3., float 3., float 3., float 3.,
+                                  float 3., float 3., float 3., float 3.,
+                                  float 3., float 3., float 3., float 3.,
+                                  float 3., float 3., float 3., float 3.>, %v_is_is
+  %is_mul = fmul <16 x float> %is, %three_sub
+  %half_scale = fmul <16 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
+                                   float 0.5, float 0.5, float 0.5, float 0.5,
+                                   float 0.5, float 0.5, float 0.5, float 0.5,
+                                   float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
+  ret <16 x float> %half_scale
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; sqrt
+
+declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
+
+define internal <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline {
+  unary8to16(call, float, @llvm.x86.avx.sqrt.ps.256, %0)
+  ret <16 x float> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; svml
+
+; FIXME: the 16-wide declarations below need either to be wired up to the
+; 8-wide SVML entrypoints, or to go through a macro that calls the 4-wide
+; ones four times on slices of our 16-wide vectors...
+
+declare <16 x float> @__svml_sin(<16 x float>)
+declare <16 x float> @__svml_cos(<16 x float>)
+declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *)
+declare <16 x float> @__svml_tan(<16 x float>)
+declare <16 x float> @__svml_atan(<16 x float>)
+declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>)
+declare <16 x float> @__svml_exp(<16 x float>)
+declare <16 x float> @__svml_log(<16 x float>)
+declare <16 x float> @__svml_pow(<16 x float>, <16 x float>)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
+declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define internal <16 x float> @__max_varying_float(<16 x float>,
+                                                  <16 x float>) nounwind readonly alwaysinline {
+  binary8to16(call, float, @llvm.x86.avx.max.ps.256, %0, %1)
+  ret <16 x float> %call
+}
+
+define internal <16 x float> @__min_varying_float(<16 x float>,
+                                                  <16 x float>) nounwind readonly alwaysinline {
+  binary8to16(call, float, @llvm.x86.avx.min.ps.256, %0, %1)
+  ret <16 x float> %call
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+define internal <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
+  ret <16 x i32> %ret
+}
+
+define internal <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
+  ret <16 x i32> %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+define internal <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
+  ret <16 x i32> %ret
+}
+
+define internal <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+  binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
+  ret <16 x i32> %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops
+
+declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
+
+define internal i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <16 x i32> %0 to <16 x float>   ; the intrinsic is declared on float vectors
+  %mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>   ; elements 0-7
+  %v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
+  %mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
+          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>   ; elements 8-15
+  %v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
+
+  %v1shift = shl i32 %v1, 8     ; place the result for elements 8-15 above the one for elements 0-7
+  %v = or i32 %v1shift, %v0     ; combined value covering all 16 elements
+  ret i32 %v
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal float ops
+
+declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define internal float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
+  %va = shufflevector <16 x float> %0, <16 x float> undef,
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>   ; elements 0-7
+  %vb = shufflevector <16 x float> %0, <16 x float> undef,
+          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>   ; elements 8-15
+  %v1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %va, <8 x float> %vb)   ; first of three pairwise-add rounds
+  %v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1)
+  %v3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v2, <8 x float> %v2)
+  %scalar1 = extractelement <8 x float> %v3, i32 0   ; NOTE(review): presumably the 256-bit hadd works within each 128-bit half,
+  %scalar2 = extractelement <8 x float> %v3, i32 4   ; so elements 0 and 4 each hold one partial sum -- confirm
+  %sum = fadd float %scalar1, %scalar2               ; total over all 16 input elements
+  ret float %sum
+}
+
+
+define internal float @__reduce_min_float(<16 x float>) nounwind readnone alwaysinline {
+  reduce16(float, @__min_varying_float, @__min_uniform_float)
+}
+
+
+define internal float @__reduce_max_float(<16 x float>) nounwind readnone alwaysinline {
+  reduce16(float, @__max_varying_float, @__max_uniform_float)
+}
+
+reduce_equal(16)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int32 ops
+
+define internal <16 x i32> @__add_varying_int32(<16 x i32>,
+                                                <16 x i32>) nounwind readnone alwaysinline {
+  %s = add <16 x i32> %0, %1
+  ret <16 x i32> %s
+}
+
+define internal i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
+  %s = add i32 %0, %1
+  ret i32 %s
+}
+
+define internal i32 @__reduce_add_int32(<16 x i32>) nounwind readnone alwaysinline {
+  reduce16(i32, @__add_varying_int32, @__add_uniform_int32)
+}
+
+
+define internal i32 @__reduce_min_int32(<16 x i32>) nounwind readnone alwaysinline {
+  reduce16(i32, @__min_varying_int32, @__min_uniform_int32)
+}
+
+
+define internal i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
+  reduce16(i32, @__max_varying_int32, @__max_uniform_int32)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; horizontal uint32 ops
+
+define internal i32 @__reduce_add_uint32(<16 x i32> %v) nounwind readnone alwaysinline {
+  %r = call i32 @__reduce_add_int32(<16 x i32> %v)
+  ret i32 %r
+}
+
+define internal i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone alwaysinline {
+  reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32)
+}
+
+
+define internal i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinline {
+  reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal double ops
+
+declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define internal double @__reduce_add_double(<16 x double>) nounwind readonly alwaysinline {
+  %va = shufflevector <16 x double> %0, <16 x double> undef,
+         <4 x i32> <i32 0, i32 1, i32 2, i32 3>        ; elements 0-3
+  %vb = shufflevector <16 x double> %0, <16 x double> undef,
+         <4 x i32> <i32 4, i32 5, i32 6, i32 7>        ; elements 4-7
+  %vc = shufflevector <16 x double> %0, <16 x double> undef,
+         <4 x i32> <i32 8, i32 9, i32 10, i32 11>      ; elements 8-11
+  %vd = shufflevector <16 x double> %0, <16 x double> undef,
+         <4 x i32> <i32 12, i32 13, i32 14, i32 15>    ; elements 12-15
+  %vab = fadd <4 x double> %va, %vb    ; elementwise adds bring 16 values down to 8...
+  %vcd = fadd <4 x double> %vc, %vd
+
+  %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %vab, <4 x double> %vcd)   ; ...then two pairwise-add rounds
+  %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
+  %final0 = extractelement <4 x double> %sum1, i32 0   ; NOTE(review): presumably the 256-bit hadd works within each 128-bit half,
+  %final1 = extractelement <4 x double> %sum1, i32 2   ; so elements 0 and 2 each hold one partial sum -- confirm
+  %sum = fadd double %final0, %final1                  ; total over all 16 input elements
+  ret double %sum
+}
+
+;; Horizontal minimum over a 16-wide double vector.  The body comes from the
+;; 16-wide reduction macro (defined outside this chunk), given the double
+;; min helpers; @__min_varying_double is defined near the end of this file
+;; section, @__min_uniform_double is not in this chunk.
+define internal double @__reduce_min_double(<16 x double>) nounwind readnone alwaysinline {
+  reduce16(double, @__min_varying_double, @__min_uniform_double)
+}
+
+
+;; Horizontal maximum over a 16-wide double vector; same scheme as the
+;; minimum above, using the double max helpers.
+define internal double @__reduce_max_double(<16 x double>) nounwind readnone alwaysinline {
+  reduce16(double, @__max_varying_double, @__max_uniform_double)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int64 ops
+
+;; Element-wise i64 add of two 16-wide vectors.  Passed by name to the
+;; horizontal add reduction below as its vector-combining step.
+define internal <16 x i64> @__add_varying_int64(<16 x i64> %lhs,
+                                                <16 x i64> %rhs) nounwind readnone alwaysinline {
+  %lanewise = add <16 x i64> %lhs, %rhs
+  ret <16 x i64> %lanewise
+}
+
+;; Scalar i64 add; the uniform counterpart of @__add_varying_int64, passed
+;; alongside it to the horizontal add reduction below.
+define internal i64 @__add_uniform_int64(i64 %a, i64 %b) nounwind readnone alwaysinline {
+  %total = add i64 %a, %b
+  ret i64 %total
+}
+
+;; Horizontal add over a 16-wide i64 vector.  The body is produced by the
+;; 16-wide reduction macro (defined outside this chunk), given the element
+;; type plus the vector and scalar i64 add helpers defined above.
+define internal i64 @__reduce_add_int64(<16 x i64>) nounwind readnone alwaysinline {
+  reduce16(i64, @__add_varying_int64, @__add_uniform_int64)
+}
+
+
+;; Horizontal minimum over a 16-wide i64 vector, using the int64 min helpers
+;; (not defined in this chunk; presumably signed, going by their names).
+define internal i64 @__reduce_min_int64(<16 x i64>) nounwind readnone alwaysinline {
+  reduce16(i64, @__min_varying_int64, @__min_uniform_int64)
+}
+
+
+;; Horizontal maximum over a 16-wide i64 vector, using the int64 max
+;; helpers.
+define internal i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
+  reduce16(i64, @__max_varying_int64, @__max_uniform_int64)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; horizontal uint64 ops
+
+;; Unsigned horizontal add.  Integer addition produces the same bits for
+;; signed and unsigned operands, so this just forwards to the int64 version.
+define internal i64 @__reduce_add_uint64(<16 x i64> %lanes) nounwind readnone alwaysinline {
+  %total = call i64 @__reduce_add_int64(<16 x i64> %lanes)
+  ret i64 %total
+}
+
+;; Horizontal minimum over a 16-wide vector of 64-bit values, using the
+;; uint64 min helpers (not defined in this chunk; presumably unsigned
+;; compares, going by their names).
+define internal i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone alwaysinline {
+  reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
+}
+
+
+;; Horizontal maximum counterpart of the function above, using the uint64
+;; max helpers.
+define internal i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
+  reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+; Instantiate the macro-generated 16-wide load+broadcast helpers, one per
+; element type (i8, i16, i32, i64).
+; NOTE(review): the macro bodies are outside this chunk; the arguments look
+; like vector width, element type, element size in bits -- confirm.
+load_and_broadcast(16, i8, 8)
+load_and_broadcast(16, i16, 16)
+load_and_broadcast(16, i32, 32)
+load_and_broadcast(16, i64, 64)
+
+; no masked load instruction for i8 and i16 types??
+; These two therefore come from the generic macro rather than from an AVX
+; intrinsic.  NOTE(review): the last argument looks like the element size
+; in bytes -- confirm against the macro definition.
+load_masked(16, i8,  8,  1)
+load_masked(16, i16, 16, 2)
+
+declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
+declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
+
+;; Masked load of sixteen 32-bit values, done as two 8-wide vmaskmovps
+;; loads.  The integer mask and the result are bitcast through float
+;; because the intrinsic is typed on float vectors.
+define <16 x i32> @__load_masked_32(i8 * %base, <16 x i32> %mask) nounwind alwaysinline {
+  %mask_f = bitcast <16 x i32> %mask to <16 x float>
+
+  ; lanes 0-7, at the base address
+  %mask_lo = shufflevector <16 x float> %mask_f, <16 x float> undef,
+     <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %lanes_lo = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %base, <8 x float> %mask_lo)
+
+  ; lanes 8-15, 8 x 4 = 32 bytes past the base address
+  %mask_hi = shufflevector <16 x float> %mask_f, <16 x float> undef,
+     <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %base_hi = getelementptr i8 * %base, i32 32
+  %lanes_hi = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %base_hi, <8 x float> %mask_hi)
+
+  ; glue the halves back together and return them as integers
+  %joined = shufflevector <8 x float> %lanes_lo, <8 x float> %lanes_hi,
+     <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                 i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %result = bitcast <16 x float> %joined to <16 x i32>
+  ret <16 x i32> %result
+}
+
+
+;; Masked load of sixteen 64-bit values, done as four 4-wide vmaskmovpd
+;; loads placed 32 bytes apart.  Each 32-bit mask lane is duplicated so it
+;; fills a whole 64-bit lane before being bitcast to the double-typed mask
+;; the intrinsic expects.
+define <16 x i64> @__load_masked_64(i8 * %base, <16 x i32> %mask) nounwind alwaysinline {
+  ; quarter 0: lanes 0-3, at the base address
+  %m0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %m0d = bitcast <8 x i32> %m0 to <4 x double>
+  %q0 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %base, <4 x double> %m0d)
+
+  ; quarter 1: lanes 4-7, 4 x 8 = 32 bytes in
+  %m1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  %m1d = bitcast <8 x i32> %m1 to <4 x double>
+  %p1 = getelementptr i8 * %base, i32 32
+  %q1 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %p1, <4 x double> %m1d)
+
+  ; quarter 2: lanes 8-11, 64 bytes in
+  %m2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
+  %m2d = bitcast <8 x i32> %m2 to <4 x double>
+  %p2 = getelementptr i8 * %base, i32 64
+  %q2 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %p2, <4 x double> %m2d)
+
+  ; quarter 3: lanes 12-15, 96 bytes in
+  %m3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+  %m3d = bitcast <8 x i32> %m3 to <4 x double>
+  %p3 = getelementptr i8 * %base, i32 96
+  %q3 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %p3, <4 x double> %m3d)
+
+  ; reassemble 4 + 4 -> 8, 4 + 4 -> 8, then 8 + 8 -> 16
+  %half_lo = shufflevector <4 x double> %q0, <4 x double> %q1,
+      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %half_hi = shufflevector <4 x double> %q2, <4 x double> %q3,
+      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %joined = shufflevector <8 x double> %half_lo, <8 x double> %half_hi,
+      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %result = bitcast <16 x double> %joined to <16 x i64>
+  ret <16 x i64> %result
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+; The 8- and 16-bit masked stores are instantiated from the generic
+; macro-generated implementation (macro body is outside this chunk).
+; FIXME: there is no AVX instruction for these, but we could be clever
+; by packing the bits down and setting the last 3/4 or half, respectively,
+; of the mask to zero...  Not sure if this would be a win in the end
+gen_masked_store(16, i8, 8)
+gen_masked_store(16, i16, 16)
+
+; note that mask is the 2nd parameter, not the 3rd one!!
+declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
+declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
+
+;; Masked store of sixteen 32-bit values, done as two 8-wide vmaskmovps
+;; stores.  Data and mask are bitcast to float because the intrinsic is
+;; typed on float vectors; the intrinsic takes the mask before the data.
+define void @__masked_store_32(<16 x i32>* nocapture %dst, <16 x i32> %newval,
+                               <16 x i32> %mask) nounwind alwaysinline {
+  %base = bitcast <16 x i32> * %dst to i8 *
+  %data_f = bitcast <16 x i32> %newval to <16 x float>
+  %mask_f = bitcast <16 x i32> %mask to <16 x float>
+
+  ; lanes 0-7, at the base address
+  %data_lo = shufflevector <16 x float> %data_f, <16 x float> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %mask_lo = shufflevector <16 x float> %mask_f, <16 x float> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  call void @llvm.x86.avx.maskstore.ps.256(i8 * %base, <8 x float> %mask_lo, <8 x float> %data_lo)
+
+  ; lanes 8-15, 8 x 4 = 32 bytes past the base address
+  %data_hi = shufflevector <16 x float> %data_f, <16 x float> undef,
+        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %mask_hi = shufflevector <16 x float> %mask_f, <16 x float> undef,
+        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %base_hi = getelementptr i8 * %base, i32 32
+  call void @llvm.x86.avx.maskstore.ps.256(i8 * %base_hi, <8 x float> %mask_hi, <8 x float> %data_hi)
+
+  ret void
+}
+
+;; Masked store of sixteen 64-bit values, done as four 4-wide vmaskmovpd
+;; stores placed 32 bytes apart.  Each 32-bit mask lane is duplicated so it
+;; fills a whole 64-bit lane, matching the double-typed data.
+define void @__masked_store_64(<16 x i64>* nocapture %dst, <16 x i64> %newval,
+                               <16 x i32> %mask) nounwind alwaysinline {
+  %base = bitcast <16 x i64> * %dst to i8 *
+  %data = bitcast <16 x i64> %newval to <16 x double>
+
+  ; quarter 0: lanes 0-3, at the base address
+  %m0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %m0d = bitcast <8 x i32> %m0 to <4 x double>
+  %d0 = shufflevector <16 x double> %data, <16 x double> undef,
+     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %base, <4 x double> %m0d, <4 x double> %d0)
+
+  ; quarter 1: lanes 4-7, 32 bytes in
+  %m1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  %m1d = bitcast <8 x i32> %m1 to <4 x double>
+  %d1 = shufflevector <16 x double> %data, <16 x double> undef,
+     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %p1 = getelementptr i8 * %base, i32 32
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %p1, <4 x double> %m1d, <4 x double> %d1)
+
+  ; quarter 2: lanes 8-11, 64 bytes in
+  %m2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
+  %m2d = bitcast <8 x i32> %m2 to <4 x double>
+  %d2 = shufflevector <16 x double> %data, <16 x double> undef,
+     <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %p2 = getelementptr i8 * %base, i32 64
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %p2, <4 x double> %m2d, <4 x double> %d2)
+
+  ; quarter 3: lanes 12-15, 96 bytes in
+  %m3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+  %m3d = bitcast <8 x i32> %m3 to <4 x double>
+  %d3 = shufflevector <16 x double> %data, <16 x double> undef,
+     <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  %p3 = getelementptr i8 * %base, i32 96
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %p3, <4 x double> %m3d, <4 x double> %d3)
+
+  ret void
+}
+
+
+; Macro-generated blend-style masked stores for the 8- and 16-bit element
+; types at 16 lanes (going by the macro name; its body is outside this
+; chunk).  The 32- and 64-bit versions are written out by hand below.
+masked_store_blend_8_16_by_16()
+
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
+                                                <8 x float>) nounwind readnone
+
+;; Blend-style masked store of sixteen 32-bit values: read all sixteen old
+;; values, pick old or new per lane with vblendvps, and write the whole
+;; vector back.  Unlike the maskstore-based version this touches every
+;; lane of memory, including those whose mask is off.
+define void @__masked_store_blend_32(<16 x i32>* nocapture %dst, <16 x i32> %newval,
+                                     <16 x i32> %mask) nounwind alwaysinline {
+  %prev = load <16 x i32>* %dst, align 4
+  %prev_f = bitcast <16 x i32> %prev to <16 x float>
+  %new_f = bitcast <16 x i32> %newval to <16 x float>
+  %mask_f = bitcast <16 x i32> %mask to <16 x float>
+
+  ; lanes 0-7
+  %prev_lo = shufflevector <16 x float> %prev_f, <16 x float> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %new_lo = shufflevector <16 x float> %new_f, <16 x float> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %mask_lo = shufflevector <16 x float> %mask_f, <16 x float> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %blend_lo = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %prev_lo,
+                                                           <8 x float> %new_lo,
+                                                           <8 x float> %mask_lo)
+
+  ; lanes 8-15
+  %prev_hi = shufflevector <16 x float> %prev_f, <16 x float> undef,
+        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %new_hi = shufflevector <16 x float> %new_f, <16 x float> undef,
+        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %mask_hi = shufflevector <16 x float> %mask_f, <16 x float> undef,
+        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %blend_hi = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %prev_hi,
+                                                           <8 x float> %new_hi,
+                                                           <8 x float> %mask_hi)
+
+  ; rejoin the halves and write everything back
+  %merged = shufflevector <8 x float> %blend_lo, <8 x float> %blend_hi,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %merged_i = bitcast <16 x float> %merged to <16 x i32>
+  store <16 x i32> %merged_i, <16 x i32>* %dst, align 4
+  ret void
+}
+
+
+declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
+                                                 <4 x double>) nounwind readnone
+
+;; Blend-style masked store of sixteen 64-bit values: read all sixteen old
+;; values, pick old or new per lane with four 4-wide vblendvpd operations,
+;; and write the whole vector back.
+;;
+;;   %ptr     destination; only assumed to be 8-byte aligned
+;;   %newi64  values to store in the lanes whose mask is on
+;;   %mask    one 32-bit mask per lane
+;;
+;; Both the load and the store carry an explicit "align 8".  Without it the
+;; access takes the ABI alignment of the vector type, which permits aligned
+;; vector moves that fault on memory that is only element-aligned.
+define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64, 
+                                     <16 x i32> %mask) nounwind alwaysinline {
+  ; old contents, split into four 4-wide quarters
+  %oldValue = load <16 x i64>* %ptr, align 8
+  %old = bitcast <16 x i64> %oldValue to <16 x double>
+  %old0d = shufflevector <16 x double> %old, <16 x double> undef,
+     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %old1d = shufflevector <16 x double> %old, <16 x double> undef,
+     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %old2d = shufflevector <16 x double> %old, <16 x double> undef,
+     <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %old3d = shufflevector <16 x double> %old, <16 x double> undef,
+     <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+
+  ; new values, split the same way
+  %new = bitcast <16 x i64> %newi64 to <16 x double>
+  %new0d = shufflevector <16 x double> %new, <16 x double> undef,
+     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %new1d = shufflevector <16 x double> %new, <16 x double> undef,
+     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %new2d = shufflevector <16 x double> %new, <16 x double> undef,
+     <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %new3d = shufflevector <16 x double> %new, <16 x double> undef,
+     <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+
+  ; double up each 32-bit mask lane so it fills a 64-bit lane, then view
+  ; the result as doubles for the blend intrinsic
+  %mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %mask1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  %mask2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
+  %mask3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
+  %mask1d = bitcast <8 x i32> %mask1 to <4 x double>
+  %mask2d = bitcast <8 x i32> %mask2 to <4 x double>
+  %mask3d = bitcast <8 x i32> %mask3 to <4 x double>
+
+  ; per quarter: take the new value where the mask is on, else keep the old
+  %result0d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old0d,
+                                 <4 x double> %new0d, <4 x double> %mask0d)
+  %result1d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old1d,
+                                 <4 x double> %new1d, <4 x double> %mask1d)
+  %result2d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old2d,
+                                 <4 x double> %new2d, <4 x double> %mask2d)
+  %result3d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old3d,
+                                 <4 x double> %new3d, <4 x double> %mask3d)
+
+  ; reassemble 4 + 4 -> 8, 4 + 4 -> 8, then 8 + 8 -> 16
+  %result01 = shufflevector <4 x double> %result0d, <4 x double> %result1d,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %result23 = shufflevector <4 x double> %result2d, <4 x double> %result3d,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+  %result = shufflevector <8 x double> %result01, <8 x double> %result23,
+           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %result64 = bitcast <16 x double> %result to <16 x i64>
+  ; same alignment as the load above: the pointer is only element-aligned
+  store <16 x i64> %result64, <16 x i64> * %ptr, align 8
+  ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather/scatter
+
+; Instantiate the macro-generated 16-wide gather routines, one per element
+; type.  NOTE(review): no AVX intrinsic is named here, so these presumably
+; expand to generic per-lane code -- confirm in the macro definitions,
+; which are outside this chunk.
+gen_gather(16, i8)
+gen_gather(16, i16)
+gen_gather(16, i32)
+gen_gather(16, i64)
+
+; ... and the matching scatter routines.
+gen_scatter(16, i8)
+gen_scatter(16, i16)
+gen_scatter(16, i32)
+gen_scatter(16, i64)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
+
+;; Square root of each lane of a 16-wide double vector.
+;; NOTE(review): the macro on the first body line is defined outside this
+;; chunk; from its name and arguments it applies the 4-wide intrinsic to
+;; each quarter of %0 and binds the reassembled 16-wide value to %ret,
+;; which is what the following line returns -- confirm.
+define internal <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alwaysinline {
+  unary4to16(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
+  ret <16 x double> %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
+declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+;; Lane-wise minimum of two 16-wide double vectors, used by the double
+;; horizontal-min reduction earlier in this file.
+;; NOTE(review): the macro on the first body line is defined outside this
+;; chunk; from its name and arguments it applies the 4-wide intrinsic to
+;; matching quarters of %0 and %1 and binds the 16-wide value to %ret.
+define internal <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
+  binary4to16(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
+  ret <16 x double> %ret
+}
+
+;; Lane-wise maximum of two 16-wide double vectors; same scheme as the
+;; minimum above with the max intrinsic.
+define internal <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
+  binary4to16(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
+  ret <16 x double> %ret
+}
--- a/builtins-avx.ll
+++ b/builtins-avx.ll
@@ -0,0 +1,564 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; *** Untested *** AVX target implementation.
+;;
+;; The LLVM AVX code generator is incomplete, so the ispc AVX target
+;; hasn't yet been tested.  There is therefore a higher-than-normal
+;; chance that there are bugs in the code in this file.
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Basic 8-wide definitions
+
+; Instantiate the target-independent building blocks for 8-wide vectors,
+; then pull in the shared AVX definitions file.
+; NOTE(review): what each of these macros emits is defined outside this
+; chunk; the single argument is presumably the vector width.
+stdlib_core(8)
+packed_load_and_store(8)
+scans(8)
+int64minmax(8)
+
+include(`builtins-avx-common.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
+
+;; Reciprocal of each lane: the hardware estimate refined by one
+;; Newton-Raphson step, est * (2 - v * est).
+define internal <8 x float> @__rcp_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
+  ; initial approximation from vrcpps
+  %est = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %v)
+
+  ; refinement step
+  %v_est = fmul <8 x float> %v, %est
+  %correction = fsub <8 x float> <float 2., float 2., float 2., float 2.,
+                                  float 2., float 2., float 2., float 2.>, %v_est
+  %refined = fmul <8 x float> %est, %correction
+  ret <8 x float> %refined
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding floats
+
+declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
+
+;; Round each lane to the nearest integral value.
+define internal <8 x float> @__round_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
+  ; vroundps immediate: mode nearest (0b00) | suppress precision
+  ; exceptions (0b1000) -> 0b1000 = 8
+  %rounded = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %v, i32 8)
+  ret <8 x float> %rounded
+}
+
+;; Round each lane down, toward negative infinity.
+define internal <8 x float> @__floor_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
+  ; vroundps immediate: mode down (0b01) | suppress precision
+  ; exceptions (0b1000) -> 0b1001 = 9
+  %floored = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %v, i32 9)
+  ret <8 x float> %floored
+}
+
+;; Round each lane up, toward positive infinity.
+define internal <8 x float> @__ceil_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
+  ; vroundps immediate: mode up (0b10) | suppress precision
+  ; exceptions (0b1000) -> 0b1010 = 10
+  %ceiled = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %v, i32 10)
+  ret <8 x float> %ceiled
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
+
+;; The three functions below get their whole body from a macro defined
+;; outside this chunk.  NOTE(review): going by its name it applies the
+;; 4-wide rounding intrinsic to both halves of the 8-wide input with the
+;; given immediate and returns the rejoined value -- confirm.
+define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round mode nearest 0b00 | don't signal precision exceptions 0b1000 -> 0b1000 = 8
+  round4to8double(%0, 8)
+}
+
+define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 -> 0b1001 = 9
+  round4to8double(%0, 9)
+}
+
+
+define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 -> 0b1010 = 10
+  round4to8double(%0, 10)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
+declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
+
+;; Reciprocal square root of each lane: the hardware estimate refined by
+;; one Newton-Raphson step, 0.5 * est * (3 - (v * est) * est).
+define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
+  ; initial approximation from vrsqrtps
+  %est = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %v)
+
+  ; (v * est) * est
+  %v_est = fmul <8 x float> %v, %est
+  %v_est2 = fmul <8 x float> %v_est, %est
+  ; 3 - (v * est) * est
+  %correction = fsub <8 x float> <float 3., float 3., float 3., float 3.,
+                                  float 3., float 3., float 3., float 3.>, %v_est2
+  ; est * correction, then halve
+  %scaled = fmul <8 x float> %est, %correction
+  %refined = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
+                               float 0.5, float 0.5, float 0.5, float 0.5>, %scaled
+  ret <8 x float> %refined
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; sqrt
+
+declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
+
+;; Square root of each lane; a direct wrapper around the 8-wide intrinsic.
+define internal <8 x float> @__sqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
+  %root = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %v)
+  ret <8 x float> %root
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; svml
+
+; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
+; or, use the macro to call the 4-wide ones twice with our 8-wide
+; vectors...
+;
+; Until then these are declarations only: they give the names 8-wide
+; signatures so that references to them parse, but nothing in this chunk
+; defines them.
+
+declare <8 x float> @__svml_sin(<8 x float>)
+declare <8 x float> @__svml_cos(<8 x float>)
+declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *)
+declare <8 x float> @__svml_tan(<8 x float>)
+declare <8 x float> @__svml_atan(<8 x float>)
+declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>)
+declare <8 x float> @__svml_exp(<8 x float>)
+declare <8 x float> @__svml_log(<8 x float>)
+declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
+declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+;; Lane-wise maximum of two 8-wide float vectors; a direct wrapper around
+;; the 8-wide intrinsic, with the operands passed in the same order.
+define internal <8 x float> @__max_varying_float(<8 x float> %a,
+                                                 <8 x float> %b) nounwind readonly alwaysinline {
+  %larger = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a, <8 x float> %b)
+  ret <8 x float> %larger
+}
+
+;; Lane-wise minimum of two 8-wide float vectors; a direct wrapper around
+;; the 8-wide intrinsic, with the operands passed in the same order.
+define internal <8 x float> @__min_varying_float(<8 x float> %a,
+                                                 <8 x float> %b) nounwind readonly alwaysinline {
+  %smaller = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a, <8 x float> %b)
+  ret <8 x float> %smaller
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+;; Lane-wise minimum of two 8-wide i32 vectors via the 4-wide SSE4.1 pminsd
+;; intrinsic.
+;; NOTE(review): the macro on the first body line is defined outside this
+;; chunk; from its name and arguments it applies the 4-wide intrinsic to
+;; matching halves of %0 and %1 and binds the 8-wide value to %ret.  The
+;; intrinsic's declaration is not in this chunk either.
+define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
+  ret <8 x i32> %ret
+}
+
+;; Lane-wise maximum counterpart, via the SSE4.1 pmaxsd intrinsic.
+define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
+  ret <8 x i32> %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+;; Lane-wise minimum of two 8-wide vectors of 32-bit values via the 4-wide
+;; SSE4.1 pminud intrinsic.
+;; NOTE(review): the macro on the first body line is defined outside this
+;; chunk; from its name and arguments it applies the 4-wide intrinsic to
+;; matching halves of %0 and %1 and binds the 8-wide value to %ret.
+define internal <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
+  ret <8 x i32> %ret
+}
+
+;; Lane-wise maximum counterpart, via the SSE4.1 pmaxud intrinsic.
+define internal <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+  binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
+  ret <8 x i32> %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops
+
+declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
+
+;; Collapse an 8-wide i32 mask into a scalar with vmovmskps, which gathers
+;; the top bit of every lane into the low bits of the result.  The bitcast
+;; is needed only because the intrinsic is typed on float vectors.
+define internal i32 @__movmsk(<8 x i32> %mask) nounwind readnone alwaysinline {
+  %mask_f = bitcast <8 x i32> %mask to <8 x float>
+  %bits = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask_f) nounwind readnone
+  ret i32 %bits
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal float ops
+
+declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+;; Sum of the 8 lanes.  vhaddps works within each 128-bit half, so after
+;; two rounds element 0 holds the sum of lanes 0-3 and element 4 the sum
+;; of lanes 4-7; those two are then added as scalars.
+define internal float @__reduce_add_float(<8 x float> %v) nounwind readonly alwaysinline {
+  %pairs = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v, <8 x float> %v)
+  %quads = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %pairs, <8 x float> %pairs)
+  %sum_lo = extractelement <8 x float> %quads, i32 0
+  %sum_hi = extractelement <8 x float> %quads, i32 4
+  %total = fadd float %sum_lo, %sum_hi
+  ret float %total
+}
+
+
+;; Horizontal minimum over an 8-wide float vector.  The body comes from the
+;; 8-wide reduction macro (defined outside this chunk), given the element
+;; type, @__min_varying_float from above and @__min_uniform_float (not in
+;; this chunk).
+define internal float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
+  reduce8(float, @__min_varying_float, @__min_uniform_float)
+}
+
+
+;; Horizontal maximum over an 8-wide float vector; same scheme with the
+;; float max helpers.
+define internal float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
+  reduce8(float, @__max_varying_float, @__max_uniform_float)
+}
+
+; Macro-generated 8-wide routines; NOTE(review): judging by the macro name
+; these test whether all active lanes hold the same value -- confirm in
+; the macro definition, which is outside this chunk.
+reduce_equal(8)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int32 ops
+
+;; Element-wise i32 add of two 8-wide vectors.  Passed by name to the
+;; horizontal add reduction below as its vector-combining step.
+define internal <8 x i32> @__add_varying_int32(<8 x i32> %lhs,
+                                               <8 x i32> %rhs) nounwind readnone alwaysinline {
+  %lanewise = add <8 x i32> %lhs, %rhs
+  ret <8 x i32> %lanewise
+}
+
+;; Scalar i32 add; the uniform counterpart of @__add_varying_int32, passed
+;; alongside it to the horizontal add reduction below.
+define internal i32 @__add_uniform_int32(i32 %a, i32 %b) nounwind readnone alwaysinline {
+  %total = add i32 %a, %b
+  ret i32 %total
+}
+
+;; Horizontal add over an 8-wide i32 vector.  The body is produced by the
+;; 8-wide reduction macro (defined outside this chunk), given the element
+;; type plus the vector and scalar add helpers defined above.
+define internal i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
+  reduce8(i32, @__add_varying_int32, @__add_uniform_int32)
+}
+
+
+;; Horizontal minimum over an 8-wide i32 vector, using
+;; @__min_varying_int32 from above and @__min_uniform_int32 (not in this
+;; chunk).
+define internal i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
+  reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
+}
+
+
+;; Horizontal maximum over an 8-wide i32 vector; same scheme with the max
+;; helpers.
+define internal i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
+  reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; horizontal uint32 ops
+
+;; Unsigned horizontal add.  Integer addition produces the same bits for
+;; signed and unsigned operands, so this just forwards to the int32 version.
+define internal i32 @__reduce_add_uint32(<8 x i32> %lanes) nounwind readnone alwaysinline {
+  %total = call i32 @__reduce_add_int32(<8 x i32> %lanes)
+  ret i32 %total
+}
+
+;; Horizontal minimum over an 8-wide vector of 32-bit values, using
+;; @__min_varying_uint32 from above and @__min_uniform_uint32 (not in this
+;; chunk).
+define internal i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
+  reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
+}
+
+
+;; Horizontal maximum counterpart of the function above.
+define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
+  reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal double ops
+
+declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+;; Horizontal add of an 8-wide double vector.  The two 4-wide halves go
+;; through two rounds of vhaddpd, which leave the two remaining partial
+;; sums in elements 0 and 2; those are added as scalars.
+define internal double @__reduce_add_double(<8 x double> %v) nounwind readonly alwaysinline {
+  ; lower and upper 4-wide halves of the input
+  %half_lo = shufflevector <8 x double> %v, <8 x double> undef,
+                      <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %half_hi = shufflevector <8 x double> %v, <8 x double> undef,
+                      <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+  ; 8 values down to 2 with two horizontal adds
+  %h0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %half_lo, <4 x double> %half_hi)
+  %h1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %h0, <4 x double> %h0)
+  %part_lo = extractelement <4 x double> %h1, i32 0
+  %part_hi = extractelement <4 x double> %h1, i32 2
+  %total = fadd double %part_lo, %part_hi
+
+  ret double %total
+}
+
+;; Horizontal minimum over an 8-wide double vector.  The body comes from
+;; the 8-wide reduction macro (defined outside this chunk), given the
+;; double min helpers, neither of which is defined in this chunk.
+define internal double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline {
+  reduce8(double, @__min_varying_double, @__min_uniform_double)
+}
+
+
+;; Horizontal maximum over an 8-wide double vector; same scheme with the
+;; double max helpers.
+define internal double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline {
+  reduce8(double, @__max_varying_double, @__max_uniform_double)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int64 ops
+
+;; Element-wise i64 add of two 8-wide vectors.  Passed by name to the
+;; horizontal add reduction below as its vector-combining step.
+define internal <8 x i64> @__add_varying_int64(<8 x i64> %lhs,
+                                               <8 x i64> %rhs) nounwind readnone alwaysinline {
+  %lanewise = add <8 x i64> %lhs, %rhs
+  ret <8 x i64> %lanewise
+}
+
+;; Scalar i64 add; the uniform counterpart of @__add_varying_int64, passed
+;; alongside it to the horizontal add reduction below.
+define internal i64 @__add_uniform_int64(i64 %a, i64 %b) nounwind readnone alwaysinline {
+  %total = add i64 %a, %b
+  ret i64 %total
+}
+
+;; Horizontal add over an 8-wide i64 vector.  The body is produced by the
+;; 8-wide reduction macro (defined outside this chunk), given the element
+;; type plus the vector and scalar i64 add helpers defined above.
+define internal i64 @__reduce_add_int64(<8 x i64>) nounwind readnone alwaysinline {
+  reduce8(i64, @__add_varying_int64, @__add_uniform_int64)
+}
+
+
+;; Horizontal minimum over an 8-wide i64 vector, using the int64 min
+;; helpers (not defined in this chunk).
+define internal i64 @__reduce_min_int64(<8 x i64>) nounwind readnone alwaysinline {
+  reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
+}
+
+
+;; Horizontal maximum over an 8-wide i64 vector, using the int64 max
+;; helpers (not defined in this chunk).
+define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
+  reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; horizontal uint64 ops
+
+;; Unsigned horizontal add.  Integer addition produces the same bits for
+;; signed and unsigned operands, so this just forwards to the int64 version.
+define internal i64 @__reduce_add_uint64(<8 x i64> %lanes) nounwind readnone alwaysinline {
+  %total = call i64 @__reduce_add_int64(<8 x i64> %lanes)
+  ret i64 %total
+}
+
+;; Horizontal minimum over an 8-wide vector of 64-bit values, using the
+;; uint64 min helpers (not defined in this chunk; presumably unsigned
+;; compares, going by their names).
+define internal i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
+  reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
+}
+
+
+;; Horizontal maximum counterpart of the function above.
+define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
+  reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+; Instantiate the macro-generated 8-wide load+broadcast helpers, one per
+; element type (i8, i16, i32, i64).
+; NOTE(review): the macro bodies are outside this chunk; the arguments look
+; like vector width, element type, element size in bits -- confirm.
+load_and_broadcast(8, i8, 8)
+load_and_broadcast(8, i16, 16)
+load_and_broadcast(8, i32, 32)
+load_and_broadcast(8, i64, 64)
+
+; no masked load instruction for i8 and i16 types??
+; These two therefore come from the generic macro rather than from an AVX
+; intrinsic.  NOTE(review): the last argument looks like the element size
+; in bytes -- confirm against the macro definition.
+load_masked(8, i8,  8,  1)
+load_masked(8, i16, 16, 2)
+
+declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
+declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
+
+;; Masked load of eight 32-bit values with a single vmaskmovps.  The mask
+;; and the result are bitcast through float because the intrinsic is typed
+;; on float vectors.
+define <8 x i32> @__load_masked_32(i8 * %base, <8 x i32> %mask) nounwind alwaysinline {
+  %mask_f = bitcast <8 x i32> %mask to <8 x float>
+  %lanes_f = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %base, <8 x float> %mask_f)
+  %result = bitcast <8 x float> %lanes_f to <8 x i32>
+  ret <8 x i32> %result
+}
+
+
+;; Masked load of eight 64-bit values, done as two 4-wide vmaskmovpd loads
+;; 32 bytes apart.  Each 32-bit mask lane is duplicated so it fills a whole
+;; 64-bit lane before being bitcast to the double-typed mask the intrinsic
+;; expects.
+define <8 x i64> @__load_masked_64(i8 * %base, <8 x i32> %mask) nounwind alwaysinline {
+  ; half 0: lanes 0-3, at the base address
+  %m0 = shufflevector <8 x i32> %mask, <8 x i32> undef,
+     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %m0d = bitcast <8 x i32> %m0 to <4 x double>
+  %h0 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %base, <4 x double> %m0d)
+
+  ; half 1: lanes 4-7, 4 x 8 = 32 bytes in
+  %m1 = shufflevector <8 x i32> %mask, <8 x i32> undef,
+     <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  %m1d = bitcast <8 x i32> %m1 to <4 x double>
+  %p1 = getelementptr i8 * %base, i32 32
+  %h1 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %p1, <4 x double> %m1d)
+
+  ; rejoin the halves and return them as integers
+  %joined = shufflevector <4 x double> %h0, <4 x double> %h1,
+      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %result = bitcast <8 x double> %joined to <8 x i64>
+  ret <8 x i64> %result
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+; FIXME: there is no AVX instruction for these, but we could be clever
+; by packing the bits down and setting the last 3/4 or half, respectively,
+; of the mask to zero...  Not sure if this would be a win in the end
+gen_masked_store(8, i8, 8)
+gen_masked_store(8, i16, 16)
+
+; note that mask is the 2nd parameter, not the 3rd one!!
+declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
+declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
+
+define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>, 
+                               <8 x i32>) nounwind alwaysinline {  ; args: %0 destination pointer, %1 values to store, %2 mask
+  %ptr = bitcast <8 x i32> * %0 to i8 *
+  %val = bitcast <8 x i32> %1 to <8 x float>  ; the intrinsic is declared with float vectors, so reinterpret value and mask
+  %mask = bitcast <8 x i32> %2 to <8 x float>
+  call void @llvm.x86.avx.maskstore.ps.256(i8 * %ptr, <8 x float> %mask, <8 x float> %val)  ; mask is passed before the value
+  ret void
+}
+
+define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
+                               <8 x i32> %mask) nounwind alwaysinline {  ; args: %0 destination pointer, %1 values to store, %mask 32-bit mask
+  %ptr = bitcast <8 x i64> * %0 to i8 *
+  %val = bitcast <8 x i64> %1 to <8 x double>
+  ; double up each i32 mask element so that it spans one 64-bit value
+  %mask0 = shufflevector <8 x i32> %mask, <8 x i32> undef,
+     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %mask1 = shufflevector <8 x i32> %mask, <8 x i32> undef,
+     <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  ; the pd intrinsic is declared with its mask typed as 4 x double
+  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
+  %mask1d = bitcast <8 x i32> %mask1 to <4 x double>
+  ; split the 8 values into a low half (elements 0-3) and a high half (elements 4-7)
+  %val0 = shufflevector <8 x double> %val, <8 x double> undef,
+     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %val1 = shufflevector <8 x double> %val, <8 x double> undef,
+     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ; store each half; the high half begins 32 bytes (4 x 8) past %ptr
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask0d, <4 x double> %val0)
+  %ptr1 = getelementptr i8 * %ptr, i32 32
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr1, <4 x double> %mask1d, <4 x double> %val1)
+  ret void
+}
+
+
+
+masked_store_blend_8_16_by_8()
+
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
+                                                <8 x float>) nounwind readnone
+
+define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>, 
+                                     <8 x i32>) nounwind alwaysinline {  ; args: %0 pointer, %1 new values, %2 mask
+  %mask_as_float = bitcast <8 x i32> %2 to <8 x float>
+  %oldValue = load <8 x i32>* %0, align 4  ; read the current contents of all 8 elements
+  %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
+  %newAsFloat = bitcast <8 x i32> %1 to <8 x float>
+  %blend = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %oldAsFloat,
+                                                        <8 x float> %newAsFloat,
+                                                        <8 x float> %mask_as_float)  ; operand order: old, new, mask
+  %blendAsInt = bitcast <8 x float> %blend to <8 x i32>
+  store <8 x i32> %blendAsInt, <8 x i32>* %0, align 4  ; unconditional full-width write of the blended result
+  ret void
+}
+
+
+define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new, 
+                                     <8 x i32> %i32mask) nounwind alwaysinline {
+  %oldValue = load <8 x i64>* %ptr, align 8  ; read the current contents of all 8 elements
+  %mask = bitcast <8 x i32> %i32mask to <8 x float>
+
+  ; Do 4x64-bit blends by doing two <8 x float> blends, where the <8 x float> values
+  ; are actually bitcast <4 x i64> values
+  ;
+  ; set up the first four 64-bit values
+  %old01  = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
+                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %old01f = bitcast <4 x i64> %old01 to <8 x float>
+  %new01  = shufflevector <8 x i64> %new, <8 x i64> undef,
+                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %new01f = bitcast <4 x i64> %new01 to <8 x float>
+  ; compute mask--note that the indices are all doubled-up
+  %mask01 = shufflevector <8 x float> %mask, <8 x float> undef,
+                          <8 x i32> <i32 0, i32 0, i32 1, i32 1,
+                                     i32 2, i32 2, i32 3, i32 3>
+  ; and blend them
+  %result01f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old01f,
+                                                            <8 x float> %new01f,
+                                                            <8 x float> %mask01)
+  %result01 = bitcast <8 x float> %result01f to <4 x i64>
+
+  ; and again, for the last four 64-bit values (elements 4-7)
+  %old23  = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
+                          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %old23f = bitcast <4 x i64> %old23 to <8 x float>
+  %new23  = shufflevector <8 x i64> %new, <8 x i64> undef,
+                          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %new23f = bitcast <4 x i64> %new23 to <8 x float>
+  ; compute mask--note that the values are doubled-up...
+  %mask23 = shufflevector <8 x float> %mask, <8 x float> undef,
+                          <8 x i32> <i32 4, i32 4, i32 5, i32 5,
+                                     i32 6, i32 6, i32 7, i32 7>
+  ; and blend them
+  %result23f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old23f,
+                                                            <8 x float> %new23f,
+                                                            <8 x float> %mask23)
+  %result23 = bitcast <8 x float> %result23f to <4 x i64>
+
+  ; reconstruct the final <8 x i64> vector
+  %final = shufflevector <4 x i64> %result01, <4 x i64> %result23,
+                         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                                    i32 4, i32 5, i32 6, i32 7>
+  store <8 x i64> %final, <8 x i64> * %ptr, align 8  ; unconditional full-width write of the blended result
+  ret void
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather/scatter
+
+gen_gather(8, i8)
+gen_gather(8, i16)
+gen_gather(8, i32)
+gen_gather(8, i64)
+
+gen_scatter(8, i8)
+gen_scatter(8, i16)
+gen_scatter(8, i32)
+gen_scatter(8, i64)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
+
+define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {  ; 8-wide double sqrt; NOTE(review): %ret is presumably defined by the m4 helper invoked below, which is handed the 4-wide AVX sqrt intrinsic
+  unary4to8(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
+  ret <8 x double> %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
+declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {  ; 8-wide double min of %0 and %1; NOTE(review): %ret is presumably defined by the m4 helper invoked below, which is handed the 4-wide AVX min intrinsic
+  binary4to8(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
+  ret <8 x double> %ret
+}
+
+define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {  ; 8-wide double max of %0 and %1; NOTE(review): %ret is presumably defined by the m4 helper invoked below, which is handed the 4-wide AVX max intrinsic
+  binary4to8(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
+  ret <8 x double> %ret
+}
+
--- a/builtins-c.c
+++ b/builtins-c.c
@@ -31,7 +31,7 @@
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */

-/** @file stdlib-c.c
+/** @file builtins-c.c
    @brief Standard library function implementations written in C.

    This file provides C implementations of various functions that can be
--- a/builtins-sse.ll
+++ b/builtins-sse.ll
@@ -31,12 +31,12 @@

 ;; This file declares implementations of various stdlib builtins that
 ;; only require SSE version 1 and 2 functionality; this file, in turn
-;; is then included by stdlib-sse2.ll and stdlib-sse4.ll to provide
+;; is then included by builtins-sse2.ll and builtins-sse4.ll to provide
 ;; those definitions for them.

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

-int8_16(4)
+int64minmax(4)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
@@ -124,18 +124,19 @@ define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinlin
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; fast math mode

-declare void @llvm.x86.sse.stmxcsr(i32 *) nounwind
-declare void @llvm.x86.sse.ldmxcsr(i32 *) nounwind
+declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
+declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind

 define internal void @__fastmath() nounwind alwaysinline {
  %ptr = alloca i32
-  call void @llvm.x86.sse.stmxcsr(i32 * %ptr)
+  %ptr8 = bitcast i32 * %ptr to i8 *  ; the stmxcsr and ldmxcsr intrinsics are now declared as taking i8 *, so pass a recast pointer to the same slot
+  call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
  %oldval = load i32 *%ptr

  ; turn on DAZ (64)/FTZ (32768) -> 32832
  %update = or i32 %oldval, 32832
  store i32 %update, i32 *%ptr
-  call void @llvm.x86.sse.ldmxcsr(i32 * %ptr)
+  call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
  ret void
 }

@@ -227,6 +228,54 @@ define internal float @__min_uniform_float(float, float) nounwind readonly alway
  ret float %ret
 }

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
+define internal <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {  ; 4-wide double sqrt; NOTE(review): %ret is presumably defined by the m4 helper invoked below, which is handed the 2-wide SSE2 sqrt intrinsic
+  unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
+  ret <4 x double> %ret
+}
+
+
+define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {  ; scalar double sqrt; hands the scalar (sd) form of the SSE2 intrinsic to the m4 helper below, which presumably defines %ret
+  sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
+  ret double %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define internal <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {  ; 4-wide double min of %0 and %1; NOTE(review): %ret is presumably defined by the m4 helper invoked below, which is handed the 2-wide SSE2 min intrinsic
+  binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
+  ret <4 x double> %ret
+}
+
+
+define internal double @__min_uniform_double(double, double) nounwind readnone {  ; scalar double min of %0 and %1; hands the scalar (sd) form of the SSE2 intrinsic to the m4 helper below, which presumably defines %ret
+  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
+  ret double %ret
+}
+
+
+define internal <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {  ; 4-wide double max of %0 and %1; NOTE(review): %ret is presumably defined by the m4 helper invoked below, which is handed the 2-wide SSE2 max intrinsic
+  binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
+  ret <4 x double> %ret
+}
+
+
+define internal double @__max_uniform_double(double, double) nounwind readnone {  ; scalar double max of %0 and %1; hands the scalar (sd) form of the SSE2 intrinsic to the m4 helper below, which presumably defines %ret
+  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
+  ret double %ret
+}

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions
@@ -279,163 +328,90 @@ define internal i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
 }


+define internal double @__reduce_add_double(<4 x double>) nounwind readnone {  ; returns the sum of the 4 elements of %0
+  %v0 = shufflevector <4 x double> %0, <4 x double> undef,
+                      <2 x i32> <i32 0, i32 1>  ; low half: elements 0 and 1
+  %v1 = shufflevector <4 x double> %0, <4 x double> undef,
+                      <2 x i32> <i32 2, i32 3>  ; high half: elements 2 and 3
+  %sum = fadd <2 x double> %v0, %v1  ; pairwise: (e0 + e2, e1 + e3)
+  %e0 = extractelement <2 x double> %sum, i32 0
+  %e1 = extractelement <2 x double> %sum, i32 1
+  %m = fadd double %e0, %e1  ; final scalar: (e0 + e2) + (e1 + e3)
+  ret double %m
+}
+
+define internal double @__reduce_min_double(<4 x double>) nounwind readnone {  ; NOTE(review): body and ret presumably come from the m4 helper invoked below, which is given the varying and uniform double min helpers
+  reduce4(double, @__min_varying_double, @__min_uniform_double)
+}
+
+define internal double @__reduce_max_double(<4 x double>) nounwind readnone {  ; NOTE(review): body and ret presumably come from the m4 helper invoked below, which is given the varying and uniform double max helpers
+  reduce4(double, @__max_varying_double, @__max_uniform_double)
+}
+
+define internal i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {  ; returns the sum of the 4 elements of %0
+  %v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
+                      <2 x i32> <i32 0, i32 1>  ; low half: elements 0 and 1
+  %v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
+                      <2 x i32> <i32 2, i32 3>  ; high half: elements 2 and 3
+  %sum = add <2 x i64> %v0, %v1  ; pairwise: (e0 + e2, e1 + e3)
+  %e0 = extractelement <2 x i64> %sum, i32 0
+  %e1 = extractelement <2 x i64> %sum, i32 1
+  %m = add i64 %e0, %e1  ; final scalar: (e0 + e2) + (e1 + e3)
+  ret i64 %m
+}
+
+define internal i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {  ; NOTE(review): body and ret presumably come from the m4 helper invoked below, which is given the varying and uniform int64 min helpers
+  reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
+}
+
+define internal i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {  ; NOTE(review): body and ret presumably come from the m4 helper invoked below, which is given the varying and uniform int64 max helpers
+  reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
+}
+
+define internal i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {  ; NOTE(review): body and ret presumably come from the m4 helper invoked below, which is given the varying and uniform uint64 min helpers
+  reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
+}
+
+define internal i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {  ; NOTE(review): body and ret presumably come from the m4 helper invoked below, which is given the varying and uniform uint64 max helpers
+  reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
+}
+
+reduce_equal(4)
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store

-define void @__masked_store_32(<4 x i32>* nocapture, <4 x i32>, <4 x i32>) nounwind alwaysinline {
-  per_lane(4, <4 x i32> %2, `
-      ; compute address for this one
-      %ptr_ID = getelementptr <4 x i32> * %0, i32 0, i32 LANE
-      %storeval_ID = extractelement <4 x i32> %1, i32 LANE
-      store i32 %storeval_ID, i32 * %ptr_ID')
-  ret void
-}
-
-define void @__masked_store_64(<4 x i64>* nocapture, <4 x i64>, <4 x i32>) nounwind alwaysinline {
-  per_lane(4, <4 x i32> %2, `
-      %ptr_ID = getelementptr <4 x i64> * %0, i32 0, i32 LANE
-      %storeval_ID = extractelement <4 x i64> %1, i32 LANE
-      store i64 %storeval_ID, i64 * %ptr_ID')
-  ret void
-}
+masked_store_blend_8_16_by_4()

+gen_masked_store(4, i8, 8)
+gen_masked_store(4, i16, 16)
+gen_masked_store(4, i32, 32)
+gen_masked_store(4, i64, 64)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts

-define <4 x i32> @__load_and_broadcast_32(i8 *, <4 x i32> %mask) nounwind alwaysinline {
-  ; must not load if the mask is all off; the address may be invalid
-  %mm = call i32 @__movmsk(<4 x i32> %mask)
-  %any_on = icmp ne i32 %mm, 0
-  br i1 %any_on, label %load, label %skip
-
-load:
-  %ptr = bitcast i8 * %0 to i32 *
-  %val = load i32 * %ptr
-
-  %ret0 = insertelement <4 x i32> undef, i32 %val, i32 0
-  %ret1 = insertelement <4 x i32> %ret0, i32 %val, i32 1
-  %ret2 = insertelement <4 x i32> %ret1, i32 %val, i32 2
-  %ret3 = insertelement <4 x i32> %ret2, i32 %val, i32 3
-  ret <4 x i32> %ret3
-
-skip:
-  ret <4 x i32> undef
-}
-
-define <4 x i64> @__load_and_broadcast_64(i8 *, <4 x i32> %mask) nounwind alwaysinline {
-  ; must not load if the mask is all off; the address may be invalid
-  %mm = call i32 @__movmsk(<4 x i32> %mask)
-  %any_on = icmp ne i32 %mm, 0
-  br i1 %any_on, label %load, label %skip
-
-load:
-  %ptr = bitcast i8 * %0 to i64 *
-  %val = load i64 * %ptr
-
-  %ret0 = insertelement <4 x i64> undef, i64 %val, i32 0
-  %ret1 = insertelement <4 x i64> %ret0, i64 %val, i32 1
-  %ret2 = insertelement <4 x i64> %ret1, i64 %val, i32 2
-  %ret3 = insertelement <4 x i64> %ret2, i64 %val, i32 3
-  ret <4 x i64> %ret3
-
-skip:
-  ret <4 x i64> undef
-}
-
-define <4 x i32> @__load_masked_32(i8 *, <4 x i32> %mask) nounwind alwaysinline {
-  %mm = call i32 @__movmsk(<4 x i32> %mask)
-  %any_on = icmp ne i32 %mm, 0
-  br i1 %any_on, label %load, label %skip
-
-load: 
-  ; if any mask lane is on, just load all of the values
-  ; FIXME: there is a lurking bug here if we straddle a page boundary, the
-  ; next page is invalid to read, but the mask bits are set so that we
-  ; aren't supposed to be reading those elements...
-  %ptr = bitcast i8 * %0 to <4 x i32> *
-  %val = load <4 x i32> * %ptr, align 4
-  ret <4 x i32> %val
-
-skip:
-  ret <4 x i32> undef
-}
-
-define <4 x i64> @__load_masked_64(i8 *, <4 x i32> %mask) nounwind alwaysinline {
-  %mm = call i32 @__movmsk(<4 x i32> %mask)
-  %any_on = icmp ne i32 %mm, 0
-  br i1 %any_on, label %load, label %skip
-
-load:
-  ; if any mask lane is on, just load all of the values
-  ; FIXME: there is a lurking bug here if we straddle a page boundary, the
-  ; next page is invalid to read, but the mask bits are set so that we
-  ; aren't supposed to be reading those elements...
-  %ptr = bitcast i8 * %0 to <4 x i64> *
-  %val = load <4 x i64> * %ptr, align 8
-  ret <4 x i64> %val
-
-skip:
-  ret <4 x i64> undef
-}
+load_and_broadcast(4, i8, 8)
+load_and_broadcast(4, i16, 16)
+load_and_broadcast(4, i32, 32)
+load_and_broadcast(4, i64, 64)

+load_masked(4, i8,  8,  1)
+load_masked(4, i16, 16, 2)
+load_masked(4, i32, 32, 4)
+load_masked(4, i64, 64, 8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter

 ; define these with the macros from stdlib.m4

+gen_gather(4, i8)
+gen_gather(4, i16)
 gen_gather(4, i32)
 gen_gather(4, i64)
+
+gen_scatter(4, i8)
+gen_scatter(4, i16)
 gen_scatter(4, i32)
 gen_scatter(4, i64)
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; double precision sqrt
-
-declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
-
-define internal <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
-  unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
-  ret <4 x double> %ret
-}
-
-
-define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
-  sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.pd, %0)
-  ret double %ret
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; double precision min/max
-
-declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
-
-define internal <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
-  binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
-  ret <4 x double> %ret
-}
-
-
-define internal double @__min_uniform_double(double, double) nounwind readnone {
-  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.pd, %0, %1)
-  ret double %ret
-}
-
-
-define internal <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
-  binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
-  ret <4 x double> %ret
-}
-
-
-define internal double @__max_uniform_double(double, double) nounwind readnone {
-  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.pd, %0, %1)
-  ret double %ret
-}
--- a/builtins-sse2.ll
+++ b/builtins-sse2.ll
@@ -35,9 +35,10 @@
 ; Define some basics for a 4-wide target
 stdlib_core(4)
 packed_load_and_store(4)
+scans(4)

 ; Include the various definitions of things that only require SSE1 and SSE2
-include(`stdlib-sse.ll')
+include(`builtins-sse.ll')

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding
@@ -152,6 +153,40 @@ define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinlin
  ret float %binop.i
 }

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+declare double @round(double)
+declare double @floor(double)
+declare double @ceil(double)
+
+define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {  ; no rounding intrinsic is used on this target: the m4 helper below is handed the externally declared round and presumably applies it to each element and emits the ret
+  unary1to4(double, @round)
+}
+
+define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {  ; scalar double rounding
+  %r = call double @round(double %0)  ; plain call to the externally declared round
+  ret double %r
+}
+
+define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {  ; no rounding intrinsic is used on this target: the m4 helper below is handed the externally declared floor and presumably applies it to each element and emits the ret
+  unary1to4(double, @floor)
+}
+
+define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {  ; scalar double floor
+  %r = call double @floor(double %0)  ; plain call to the externally declared floor
+  ret double %r
+}
+
+define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {  ; no rounding intrinsic is used on this target: the m4 helper below is handed the externally declared ceil and presumably applies it to each element and emits the ret
+  unary1to4(double, @ceil)
+}
+
+define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {  ; scalar double ceil
+  %r = call double @ceil(double %0)  ; plain call to the externally declared ceil
+  ret double %r
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; min/max

@@ -242,23 +277,17 @@ define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinli
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions

-; FIXME: this is very inefficient, loops over all 32 bits...
+declare i32 @llvm.ctpop.i32(i32)
+declare i64 @llvm.ctpop.i64(i64)

-define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
-entry:
-  br label %loop
+define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {  ; population count of a 32-bit value; replaces the removed bit-by-bit loop with the llvm.ctpop intrinsic
+  %val = call i32 @llvm.ctpop.i32(i32 %0)
+  ret i32 %val
+}

-loop:
-  %count = phi i32 [ 0, %entry ], [ %newcount, %loop ]
-  %val = phi i32 [ %0, %entry ], [ %newval, %loop ]
-  %delta = and i32 %val, 1
-  %newcount = add i32 %count, %delta
-  %newval = lshr i32 %val, 1
-  %done = icmp eq i32 %newval, 0
-  br i1 %done, label %exit, label %loop
-
-exit:
-  ret i32 %newcount
+define internal i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {  ; population count of a 64-bit value via the llvm.ctpop intrinsic
+  %val = call i64 @llvm.ctpop.i64(i64 %0)
+  ret i64 %val
 }


--- a/builtins-sse4.ll
+++ b/builtins-sse4.ll
@@ -35,12 +35,13 @@
 ; Define common 4-wide stuff
 stdlib_core(4)
 packed_load_and_store(4)
+scans(4)

 ; Define the stuff that can be done with base SSE1/SSE2 instructions
-include(`stdlib-sse.ll')
+include(`builtins-sse.ll')

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; math
+;; rounding floats

 declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
 declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
@@ -76,7 +77,7 @@ define internal float @__round_uniform_float(float) nounwind readonly alwaysinli
 }

 define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
-  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 0b1001 = 9
  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
  ret <4 x float> %call
 }
@@ -84,14 +85,14 @@ define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonl
 define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
  ; see above for round_ss instrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
-  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundss, round down 0b01 | don't signal precision exceptions 0b1000 = 0b1001 = 9
  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
 }

 define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 0b1010 = 10
  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
  ret <4 x float> %call
 }
@@ -99,14 +100,59 @@ define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly
 define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
  ; see above for round_ss instrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundss, round up 0b10 | don't signal precision exceptions 0b1000 = 0b1010 = 10
  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
 }

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; integer min/max
+;; rounding doubles
+
+declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {  ; 8 = 0b1000: round to nearest 0b00 | do not signal precision exceptions 0b1000; NOTE(review): the m4 helper below presumably forwards the 8 as the roundpd immediate and emits the ret
+  round2to4double(%0, 8)
+}
+
+define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
+  %xi = insertelement <2 x double> undef, double %0, i32 0  ; place the scalar in element 0; element 1 stays undef
+  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)  ; roundsd, round to nearest 0b00 | do not signal precision exceptions 0b1000 = 8
+  %rs = extractelement <2 x double> %xr, i32 0  ; the rounded scalar is read back from element 0
+  ret double %rs
+}
+
+define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 0b1001 = 9
+  round2to4double(%0, 9)
+}
+
+define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
+  ; see above for round_ss intrinsic discussion...
+  %xi = insertelement <2 x double> undef, double %0, i32 0
+  ; roundsd, round down 0b01 | don't signal precision exceptions 0b1000 = 0b1001 = 9
+  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
+  %rs = extractelement <2 x double> %xr, i32 0
+  ret double %rs
+}
+
+define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 0b1010 = 10
+  round2to4double(%0, 10)
+}
+
+define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
+  ; see above for round_ss intrinsic discussion...
+  %xi = insertelement <2 x double> undef, double %0, i32 0
+  ; roundsd, round up 0b10 | don't signal precision exceptions 0b1000 = 0b1010 = 10
+  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
+  %rs = extractelement <2 x double> %xr, i32 0
+  ret double %rs
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int32 min/max

 declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
 declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
@@ -163,11 +209,18 @@ define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinli

 declare i32 @llvm.ctpop.i32(i32) nounwind readnone

-define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
+define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {  ; population count of a 32-bit value via the llvm.ctpop intrinsic; renamed here to carry an explicit int32 suffix
  %call = call i32 @llvm.ctpop.i32(i32 %0)
  ret i32 %call
 }

+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
+
+define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {  ; population count of a 64-bit value via the llvm.ctpop intrinsic
+  %call = call i64 @llvm.ctpop.i64(i64 %0)
+  ret i64 %call
+}
+
 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone

 define internal float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
@@ -177,7 +230,6 @@ define internal float @__reduce_add_float(<4 x float>) nounwind readonly alwaysi
  ret float %scalar
 }

-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store

--- a/builtins-sse4x2.ll
+++ b/builtins-sse4x2.ll
@@ -38,7 +38,8 @@

 stdlib_core(8)
 packed_load_and_store(8)
-int8_16(8)
+scans(8)
+int64minmax(8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
@@ -127,22 +128,22 @@ define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinlin
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; fast math

-declare void @llvm.x86.sse.stmxcsr(i32 *) nounwind
-declare void @llvm.x86.sse.ldmxcsr(i32 *) nounwind
+declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
+declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind

 define internal void @__fastmath() nounwind alwaysinline {
  %ptr = alloca i32
-  call void @llvm.x86.sse.stmxcsr(i32 * %ptr)
+  %ptr8 = bitcast i32 * %ptr to i8 *  ; the stmxcsr and ldmxcsr intrinsics are now declared as taking i8 *, so pass a recast pointer to the same slot
+  call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
  %oldval = load i32 *%ptr

  ; turn on DAZ (64)/FTZ (32768) -> 32832
  %update = or i32 %oldval, 32832
  store i32 %update, i32 *%ptr
-  call void @llvm.x86.sse.ldmxcsr(i32 * %ptr)
+  call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
  ret void
 }

-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; svml stuff

@@ -258,7 +259,7 @@ define internal float @__min_uniform_float(float, float) nounwind readonly alway
 }

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; int min/max
+;; int32 min/max

 declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
 declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
@@ -380,92 +381,90 @@ define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinli
  reduce8by4(i32, @llvm.x86.sse41.pmaxud, @__max_uniform_uint32)
 }

-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; masked store
-
-define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
-                               <8 x i32>) nounwind alwaysinline {
-  per_lane(8, <8 x i32> %2, `
-      ; compute address for this one
-      %ptr_ID = getelementptr <8 x i32> * %0, i32 0, i32 LANE
-      %storeval_ID = extractelement <8 x i32> %1, i32 LANE
-      store i32 %storeval_ID, i32 * %ptr_ID')
-  ret void
+define internal <4 x double> @__add_varying_double(<4 x double>,
+                                     <4 x double>) nounwind readnone alwaysinline {
+  %r = fadd <4 x double> %0, %1
+  ret <4 x double> %r
 }

-
-define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
-                               <8 x i32>) nounwind alwaysinline {
-  per_lane(8, <8 x i32> %2, `
-      ; compute address for this one
-      %ptr_ID = getelementptr <8 x i64> * %0, i32 0, i32 LANE
-      %storeval_ID = extractelement <8 x i64> %1, i32 LANE
-      store i64 %storeval_ID, i64 * %ptr_ID')
-  ret void
+define internal double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
+  %r = fadd double %0, %1
+  ret double %r
 }

+define internal double @__reduce_add_double(<8 x double>) nounwind readnone {
+  reduce8by4(double, @__add_varying_double, @__add_uniform_double)
+}
+
+define internal double @__reduce_min_double(<8 x double>) nounwind readnone {
+  reduce8(double, @__min_varying_double, @__min_uniform_double)
+}
+
+define internal double @__reduce_max_double(<8 x double>) nounwind readnone {
+  reduce8(double, @__max_varying_double, @__max_uniform_double)
+}
+
+define internal <4 x i64> @__add_varying_int64(<4 x i64>,
+                                               <4 x i64>) nounwind readnone alwaysinline {
+  %r = add <4 x i64> %0, %1
+  ret <4 x i64> %r
+}
+
+define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
+  %r = add i64 %0, %1
+  ret i64 %r
+}
+
+define internal i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
+  reduce8by4(i64, @__add_varying_int64, @__add_uniform_int64)
+}
+
+define internal i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
+  reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
+}
+
+define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
+  reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
+}
+
+define internal i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
+  reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
+}
+
+define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
+  reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
+}
+
+reduce_equal(8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts

-; FIXME: I think this and the next one need to verify that the mask isn't
-; all off before doing the load!!!  (See e.g. stdlib-sse.ll)
+load_and_broadcast(8, i8, 8)
+load_and_broadcast(8, i16, 16)
+load_and_broadcast(8, i32, 32)
+load_and_broadcast(8, i64, 64)

-define <8 x i32> @__load_and_broadcast_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
-  %ptr = bitcast i8 * %0 to i32 *
-  %val = load i32 * %ptr
-
-  %ret0 = insertelement <8 x i32> undef, i32 %val, i32 0
-  %ret1 = insertelement <8 x i32> %ret0, i32 %val, i32 1
-  %ret2 = insertelement <8 x i32> %ret1, i32 %val, i32 2
-  %ret3 = insertelement <8 x i32> %ret2, i32 %val, i32 3
-  %ret4 = insertelement <8 x i32> %ret3, i32 %val, i32 4
-  %ret5 = insertelement <8 x i32> %ret4, i32 %val, i32 5
-  %ret6 = insertelement <8 x i32> %ret5, i32 %val, i32 6
-  %ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7
-  ret <8 x i32> %ret7
-}
-
-
-define <8 x i64> @__load_and_broadcast_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
-  %ptr = bitcast i8 * %0 to i64 *
-  %val = load i64 * %ptr
-
-  %ret0 = insertelement <8 x i64> undef, i64 %val, i32 0
-  %ret1 = insertelement <8 x i64> %ret0, i64 %val, i32 1
-  %ret2 = insertelement <8 x i64> %ret1, i64 %val, i32 2
-  %ret3 = insertelement <8 x i64> %ret2, i64 %val, i32 3
-  %ret4 = insertelement <8 x i64> %ret3, i64 %val, i32 4
-  %ret5 = insertelement <8 x i64> %ret4, i64 %val, i32 5
-  %ret6 = insertelement <8 x i64> %ret5, i64 %val, i32 6
-  %ret7 = insertelement <8 x i64> %ret6, i64 %val, i32 7
-  ret <8 x i64> %ret7
-}
-
-
-define <8 x i32> @__load_masked_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
-  %ptr = bitcast i8 * %0 to <8 x i32> *
-  %val = load <8 x i32> * %ptr, align 4
-  ret <8 x i32> %val
-}
-
-
-define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
-  %ptr = bitcast i8 * %0 to <8 x i64> *
-  %val = load <8 x i64> * %ptr, align 8
-  ret <8 x i64> %val
-}
+load_masked(8, i8,  8,  1)
+load_masked(8, i16, 16, 2)
+load_masked(8, i32, 32, 4)
+load_masked(8, i64, 64, 8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter

+gen_gather(8, i8)
+gen_gather(8, i16)
 gen_gather(8, i32)
 gen_gather(8, i64)
+
+gen_scatter(8, i8)
+gen_scatter(8, i16)
 gen_scatter(8, i32)
 gen_scatter(8, i64)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; math
+;; float rounding

 declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
 declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
@@ -499,43 +498,95 @@ define internal float @__round_uniform_float(float) nounwind readonly alwaysinli
 }

 define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
-  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  round4to8(%0, 9)
 }

 define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
  ; see above for round_ss instrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
-  ; roundps, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
 }

 define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  round4to8(%0, 10)
 }

 define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
  ; see above for round_ss instrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
 }

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
+  round2to8double(%0, 8)
+}
+
+define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
+  %xi = insertelement <2 x double> undef, double %0, i32 0
+  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
+  %rs = extractelement <2 x double> %xr, i32 0
+  ret double %rs
+}
+
+define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  round2to8double(%0, 9)
+}
+
+define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
+  ; see above for round_ss intrinsic discussion...
+  %xi = insertelement <2 x double> undef, double %0, i32 0
+  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
+  %rs = extractelement <2 x double> %xr, i32 0
+  ret double %rs
+}
+
+define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  round2to8double(%0, 10)
+}
+
+define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
+  ; see above for round_ss intrinsic discussion...
+  %xi = insertelement <2 x double> undef, double %0, i32 0
+  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
+  %rs = extractelement <2 x double> %xr, i32 0
+  ret double %rs
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions

 declare i32 @llvm.ctpop.i32(i32) nounwind readnone

-define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
+define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
  %call = call i32 @llvm.ctpop.i32(i32 %0)
  ret i32 %call
 }

+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
+
+define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
+  %call = call i64 @llvm.ctpop.i64(i64 %0)
+  ret i64 %call
+}
+
 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone

 define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
@@ -555,6 +606,13 @@ define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysi
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store

+gen_masked_store(8, i8, 8)
+gen_masked_store(8, i16, 16)
+gen_masked_store(8, i32, 32)
+gen_masked_store(8, i64, 64)
+
+masked_store_blend_8_16_by_8()
+
 declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
                                             <4 x float>) nounwind readnone

--- a/builtins.cpp
+++ b/builtins.cpp
@@ -52,7 +52,10 @@
 #include <llvm/Type.h>
 #include <llvm/DerivedTypes.h>
 #include <llvm/Instructions.h>
+#include <llvm/Intrinsics.h>
 #include <llvm/Linker.h>
+#include <llvm/Target/TargetMachine.h>
+#include <llvm/ADT/Triple.h>
 #include <llvm/Support/MemoryBuffer.h>
 #include <llvm/Bitcode/ReaderWriter.h>

@@ -64,43 +67,88 @@ extern yy_buffer_state *yy_scan_string(const char *);
 /** Given an LLVM type, try to find the equivalent ispc type.  Note that
    this is an under-constrained problem due to LLVM's type representations
    carrying less information than ispc's.  (For example, LLVM doesn't
-    distinguish between signed and unsigned integers in its types.)  
+    distinguish between signed and unsigned integers in its types.)
+
+    Because this function is only used for generating ispc declarations of
+    functions defined in LLVM bitcode in the builtins-*.ll files, in practice
+    we can get enough of what we need for the relevant cases to make things
+    work, partially with the help of the intAsUnsigned parameter, which
+    indicates whether LLVM integer types should be treated as being signed
+    or unsigned.

-    However, because this function is only used for generating ispc
-    declarations of functions defined in LLVM bitcode in the stdlib-*.ll
-    files, in practice we can get enough of what we need for the relevant
-    cases to make things work.
 */
 static const Type *
-lLLVMTypeToISPCType(const llvm::Type *t) {
+lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
    if (t == LLVMTypes::VoidType)
        return AtomicType::Void;
+
+    // uniform
    else if (t == LLVMTypes::BoolType)
        return AtomicType::UniformBool;
+    else if (t == LLVMTypes::Int8Type)
+        return intAsUnsigned ? AtomicType::UniformUInt8 : AtomicType::UniformInt8;
+    else if (t == LLVMTypes::Int16Type)
+        return intAsUnsigned ? AtomicType::UniformUInt16 : AtomicType::UniformInt16;
    else if (t == LLVMTypes::Int32Type)
-        return AtomicType::UniformInt32;
+        return intAsUnsigned ? AtomicType::UniformUInt32 : AtomicType::UniformInt32;
    else if (t == LLVMTypes::FloatType)
        return AtomicType::UniformFloat;
    else if (t == LLVMTypes::DoubleType)
        return AtomicType::UniformDouble;
    else if (t == LLVMTypes::Int64Type)
-        return AtomicType::UniformInt64;
+        return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64;
+
+    // varying
+    else if (t == LLVMTypes::Int8VectorType)
+        return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8;
+    else if (t == LLVMTypes::Int16VectorType)
+        return intAsUnsigned ? AtomicType::VaryingUInt16 : AtomicType::VaryingInt16;
    else if (t == LLVMTypes::Int32VectorType)
-        return AtomicType::VaryingInt32;
+        return intAsUnsigned ? AtomicType::VaryingUInt32 : AtomicType::VaryingInt32;
    else if (t == LLVMTypes::FloatVectorType)
        return AtomicType::VaryingFloat;
    else if (t == LLVMTypes::DoubleVectorType)
        return AtomicType::VaryingDouble;
    else if (t == LLVMTypes::Int64VectorType)
-        return AtomicType::VaryingInt64;
+        return intAsUnsigned ? AtomicType::VaryingUInt64 : AtomicType::VaryingInt64;
+
+    // pointers to uniform
+    else if (t == LLVMTypes::Int8PointerType)
+        return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt8 :
+                                                 AtomicType::UniformInt8, false);
+    else if (t == LLVMTypes::Int16PointerType)
+        return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt16 :
+                                                 AtomicType::UniformInt16, false);
    else if (t == LLVMTypes::Int32PointerType)
-        return new ReferenceType(AtomicType::UniformInt32, false);
+        return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt32 :
+                                                 AtomicType::UniformInt32, false);
+    else if (t == LLVMTypes::Int64PointerType)
+        return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt64 :
+                                                 AtomicType::UniformInt64, false);
    else if (t == LLVMTypes::FloatPointerType)
        return new ReferenceType(AtomicType::UniformFloat, false);
+    else if (t == LLVMTypes::DoublePointerType)
+        return new ReferenceType(AtomicType::UniformDouble, false);
+
+    // pointers to varying
+    else if (t == LLVMTypes::Int8VectorPointerType)
+        return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt8 :
+                                                 AtomicType::VaryingInt8, false);
+    else if (t == LLVMTypes::Int16VectorPointerType)
+        return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt16 :
+                                                 AtomicType::VaryingInt16, false);
    else if (t == LLVMTypes::Int32VectorPointerType)
-        return new ReferenceType(AtomicType::VaryingInt32, false);
+        return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt32 :
+                                                 AtomicType::VaryingInt32, false);
+    else if (t == LLVMTypes::Int64VectorPointerType)
+        return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt64 :
+                                                 AtomicType::VaryingInt64, false);
    else if (t == LLVMTypes::FloatVectorPointerType)
        return new ReferenceType(AtomicType::VaryingFloat, false);
+    else if (t == LLVMTypes::DoubleVectorPointerType)
+        return new ReferenceType(AtomicType::VaryingDouble, false);
+
+    // pointers to arrays
    else if (llvm::isa<const llvm::PointerType>(t)) {
        const llvm::PointerType *pt = llvm::dyn_cast<const llvm::PointerType>(t);

@@ -108,21 +156,43 @@ lLLVMTypeToISPCType(const llvm::Type *t) {
        // create the equivalent ispc type.  Note that it has to be a
        // reference to an array, since ispc passes arrays to functions by
        // reference.
-        //
-        // FIXME: generalize this to do more than uniform int32s (that's
-        // all that's necessary for the stdlib currently.)
        const llvm::ArrayType *at = 
            llvm::dyn_cast<const llvm::ArrayType>(pt->getElementType());
-        if (at && at->getNumElements() == 0 &&
-            at->getElementType() == LLVMTypes::Int32Type)
-            return new ReferenceType(new ArrayType(AtomicType::UniformInt32, 0),
+        if (at != NULL) {
+            const Type *eltType = lLLVMTypeToISPCType(at->getElementType(),
+                                                      intAsUnsigned);
+            if (eltType == NULL)
+                return NULL;
+            return new ReferenceType(new ArrayType(eltType, at->getNumElements()),
                                     false);
+        }
    }

    return NULL;
 }


+/** Build an ispc function Symbol called 'name' for the LLVM function
+    'func' and add it to 'symbolTable'; see body for argument defaults. */
+static void
+lCreateSymbol(const std::string &name, const Type *returnType,
+              const std::vector<const Type *> &argTypes,
+              const llvm::FunctionType *ftype, llvm::Function *func,
+              SymbolTable *symbolTable) {
+    SourcePos stdlibPos;
+    stdlibPos.name = "__stdlib";
+
+    // one NULL argument default per parameter of the LLVM function type
+    std::vector<ConstExpr *> noDefaults(ftype->getNumParams(), (ConstExpr *)NULL);
+    FunctionType *ispcType = new FunctionType(returnType, argTypes, stdlibPos);
+    ispcType->SetArgumentDefaults(noDefaults);
+
+    Symbol *symbol = new Symbol(name, stdlibPos, ispcType);
+    symbol->function = func;
+    symbolTable->AddFunction(symbol);
+}
+
+
 /** Given an LLVM function declaration, synthesize the equivalent ispc
    symbol for the function (if possible).  Returns true on success, false
    on failure.
@@ -135,26 +205,77 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
    const llvm::FunctionType *ftype = func->getFunctionType();
    std::string name = func->getName();

-    const Type *returnType = lLLVMTypeToISPCType(ftype->getReturnType());
-    if (!returnType)
-        // return type not representable in ispc -> not callable from ispc
+    if (name.size() < 3 || name[0] != '_' || name[1] != '_')
        return false;

-    // Iterate over the arguments and try to find their equivalent ispc
-    // types.
-    std::vector<const Type *> argTypes;
-    for (unsigned int i = 0; i < ftype->getNumParams(); ++i) {
-        const llvm::Type *llvmArgType = ftype->getParamType(i);
-        const Type *type = lLLVMTypeToISPCType(llvmArgType);
-        if (type == NULL)
-            return false;
-        argTypes.push_back(type);
+    // An unfortunate hack: we want this builtin function to have the
+    // signature "int __sext_varying_bool(bool)", but the ispc function
+    // symbol creation code below assumes that any LLVM vector of i32s is a
+    // varying int32.  Here, we need that to be interpreted as a varying
+    // bool, so just have a one-off override for that one...
+    if (name == "__sext_varying_bool") {
+        const Type *returnType = AtomicType::VaryingInt32;
+        std::vector<const Type *> argTypes;
+        argTypes.push_back(AtomicType::VaryingBool);
+        std::vector<ConstExpr *> defaults;
+        defaults.push_back(NULL);
+
+        FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
+        funcType->SetArgumentDefaults(defaults);
+
+        Symbol *sym = new Symbol(name, noPos, funcType);
+        sym->function = func;
+        symbolTable->AddFunction(sym);
+        return true;
+    }
+
+    // If the function has any parameters with integer types, we'll make
+    // two Symbols for two overloaded versions of the function, one with
+    // all of the integer types treated as signed integers and one with all
+    // of them treated as unsigned.
+    for (int i = 0; i < 2; ++i) {
+        bool intAsUnsigned = (i == 1);
+
+        const Type *returnType = lLLVMTypeToISPCType(ftype->getReturnType(),
+                                                     intAsUnsigned);
+        if (!returnType)
+            // return type not representable in ispc -> not callable from ispc
+            return false;
+
+        // Iterate over the arguments and try to find their equivalent ispc
+        // types.  Track if any of the arguments has an integer type.
+        bool anyIntArgs = false, anyReferenceArgs = false;
+        std::vector<const Type *> argTypes;
+        for (unsigned int j = 0; j < ftype->getNumParams(); ++j) {
+            const llvm::Type *llvmArgType = ftype->getParamType(j);
+            const Type *type = lLLVMTypeToISPCType(llvmArgType, intAsUnsigned);
+            if (type == NULL)
+                return false;
+            anyIntArgs |= 
+                (Type::Equal(type, lLLVMTypeToISPCType(llvmArgType, !intAsUnsigned)) == false);
+            anyReferenceArgs |= (dynamic_cast<const ReferenceType *>(type) != NULL);
+            argTypes.push_back(type);
+        }
+
+        // Always create the symbol the first time through, in particular
+        // so that we get symbols for things with no integer types!
+        if (i == 0 || anyIntArgs == true)
+            lCreateSymbol(name, returnType, argTypes, ftype, func, symbolTable);
+
+        // If there are any reference types, also make a single "_refsconst"
+        // variant of the symbol with all of them as const references, once
+        // every argument has been converted.  Skip it when the symbol above
+        // was skipped too, so no duplicate overload is registered.
+        if (anyReferenceArgs == true && (i == 0 || anyIntArgs == true)) {
+            for (unsigned int j = 0; j < argTypes.size(); ++j) {
+                if (dynamic_cast<const ReferenceType *>(argTypes[j]) != NULL)
+                    argTypes[j] = argTypes[j]->GetAsConstType();
+            }
+            lCreateSymbol(name + "_refsconst", returnType, argTypes,
+                          ftype, func, symbolTable);
+        }
    }

-    FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
-    Symbol *sym = new Symbol(name, noPos, funcType);
-    sym->function = func;
-    symbolTable->AddFunction(sym);
    return true;
 }

@@ -176,227 +297,32 @@ lAddModuleSymbols(llvm::Module *module, SymbolTable *symbolTable) {
    }
 }

-/** Declare the function symbol 'bool __is_compile_time_constant_mask(mask type)'.  
-    This function will never be defined; it's just a placeholder
-    that will be handled during the optimization process.  See the
-    discussion of the implementation of CompileTimeConstantResolvePass for
-    more details.
- */
-static void
-lDeclareCompileTimeConstant(llvm::Module *module) {
-    SourcePos noPos;
-    noPos.name = "__stdlib";

-    std::vector<const llvm::Type *> argTypes;
-    argTypes.push_back(LLVMTypes::MaskType);
-
-    llvm::FunctionType *fType = 
-        llvm::FunctionType::get(LLVMTypes::BoolType, argTypes, false);
-    llvm::Function *func =
-        llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
-                               "__is_compile_time_constant_mask", module);
-    func->setOnlyReadsMemory(true);
-    func->setDoesNotThrow(true);
-}
-
-
-/** Declare the 'pseudo-gather' functions.  When the ispc front-end needs
-    to perform a gather, it generates a call to one of these functions,
-    which have signatures:
-    
-    varying int32 __pseudo_gather(varying int32 *, mask)
-    varying int64 __pseudo_gather(varying int64 *, mask)
-
-    These functions are never actually implemented; the
-    GatherScatterFlattenOpt optimization pass finds them and then converts
-    them to make calls to the following functions, which represent gathers
-    from a common base pointer with offsets.  This approach allows the
-    front-end to be relatively simple in how it emits address calculation
-    for gathers.
-
-    varying int32 __pseudo_gather_base_offsets_32(uniform int32 *base, 
-                                                  int32 offsets, mask)
-    varying int64 __pseudo_gather_base_offsets_64(uniform int64 *base, 
-                                                  int64 offsets, mask)
-
-    Then, the GSImprovementsPass optimizations finds these and either
-    converts them to native gather functions or converts them to vector
-    loads, if equivalent.
- */
-static void
-lDeclarePseudoGathers(llvm::Module *module) {
-    SourcePos noPos;
-    noPos.name = "__stdlib";
-
-    {
-        std::vector<const llvm::Type *> argTypes;
-        argTypes.push_back(LLVMTypes::VoidPointerVectorType);
-        argTypes.push_back(LLVMTypes::MaskType);
-
-        llvm::FunctionType *fType = 
-            llvm::FunctionType::get(LLVMTypes::Int32VectorType, argTypes, false);
-        llvm::Function *func =
-            llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
-                                   "__pseudo_gather_32", module);
-        func->setOnlyReadsMemory(true);
-        func->setDoesNotThrow(true);
-
-        fType = llvm::FunctionType::get(LLVMTypes::Int64VectorType, argTypes, false);
-        func = llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
-                                      "__pseudo_gather_64", module);
-        func->setOnlyReadsMemory(true);
-        func->setDoesNotThrow(true);
-    }
-
-    {
-        std::vector<const llvm::Type *> argTypes;
-        argTypes.push_back(LLVMTypes::VoidPointerType);
-        argTypes.push_back(LLVMTypes::Int32VectorType);
-        argTypes.push_back(LLVMTypes::MaskType);
-
-        llvm::FunctionType *fType = 
-            llvm::FunctionType::get(LLVMTypes::Int32VectorType, argTypes, false);
-        llvm::Function *func =
-            llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
-                                   "__pseudo_gather_base_offsets_32", module);
-        func->setOnlyReadsMemory(true);
-        func->setDoesNotThrow(true);
-
-        fType = llvm::FunctionType::get(LLVMTypes::Int64VectorType, argTypes, false);
-        func = llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
-                                      "__pseudo_gather_base_offsets_64", module);
-        func->setOnlyReadsMemory(true);
-        func->setDoesNotThrow(true);
-    }
-}
-
-
-/** Similarly to the 'pseudo-gathers' defined by lDeclarePseudoGathers(),
-    we also declare (but never define) pseudo-scatter instructions with
-    signatures:
-
-    void __pseudo_scatter_32(varying int32 *, varying int32 values, mask)
-    void __pseudo_scatter_64(varying int64 *, varying int64 values, mask)
-
-    The GatherScatterFlattenOpt optimization pass also finds these and
-    transforms them to scatters like:
-
-    void __pseudo_scatter_base_offsets_32(uniform int32 *base, 
-                    varying int32 offsets, varying int32 values, mask)
-    void __pseudo_scatter_base_offsets_64(uniform int64 *base, 
-                    varying int62 offsets, varying int64 values, mask)
-
-    And the GSImprovementsPass in turn converts these to actual native
-    scatters or masked stores.  
+/** In many of the builtins-*.ll files, we have declarations of various LLVM
+    intrinsics that are then used in the implementation of various target-
+    specific functions.  This function loops over all of the intrinsic 
+    declarations and makes sure that the signature we have in our .ll file
+    matches the signature of the actual intrinsic.
 */
 static void
-lDeclarePseudoScatters(llvm::Module *module) {
-    SourcePos noPos;
-    noPos.name = "__stdlib";
+lCheckModuleIntrinsics(llvm::Module *module) {
+    llvm::Module::iterator iter;
+    for (iter = module->begin(); iter != module->end(); ++iter) {
+        llvm::Function *func = iter;
+        if (!func->isIntrinsic())
+            continue;

-    {
-        std::vector<const llvm::Type *> argTypes;
-        argTypes.push_back(LLVMTypes::VoidPointerVectorType);
-        argTypes.push_back(LLVMTypes::Int32VectorType);
-        argTypes.push_back(LLVMTypes::MaskType);
-
-        llvm::FunctionType *fType = 
-            llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
-        llvm::Function *func =
-            llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
-                                   "__pseudo_scatter_32", module);
-        func->setDoesNotThrow(true);
-    }
-    {
-        std::vector<const llvm::Type *> argTypes;
-        argTypes.push_back(LLVMTypes::VoidPointerVectorType);
-        argTypes.push_back(LLVMTypes::Int64VectorType);
-        argTypes.push_back(LLVMTypes::MaskType);
-
-        llvm::FunctionType *fType = 
-            llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
-        llvm::Function *func =
-            llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
-                                   "__pseudo_scatter_64", module);
-        func->setDoesNotThrow(true);
-    }
-
-    {
-        std::vector<const llvm::Type *> argTypes;
-        argTypes.push_back(LLVMTypes::VoidPointerType);
-        argTypes.push_back(LLVMTypes::Int32VectorType);
-        argTypes.push_back(LLVMTypes::Int32VectorType);
-        argTypes.push_back(LLVMTypes::MaskType);
-
-        llvm::FunctionType *fType = 
-            llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
-        llvm::Function *func =
-            llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
-                                   "__pseudo_scatter_base_offsets_32", module);
-        func->setDoesNotThrow(true);
-    }
-    {
-        std::vector<const llvm::Type *> argTypes;
-        argTypes.push_back(LLVMTypes::VoidPointerType);
-        argTypes.push_back(LLVMTypes::Int32VectorType);
-        argTypes.push_back(LLVMTypes::Int64VectorType);
-        argTypes.push_back(LLVMTypes::MaskType);
-
-        llvm::FunctionType *fType = 
-            llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
-        llvm::Function *func =
-            llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
-                                   "__pseudo_scatter_base_offsets_64", module);
-        func->setDoesNotThrow(true);
-    }
-}
-
-
-/** This function declares placeholder masked store functions for the
-    front-end to use.
-
-    void __pseudo_masked_store_32(uniform int32 *ptr, varying int32 values, mask)
-    void __pseudo_masked_store_64(uniform int64 *ptr, varying int64 values, mask)
-
-    These in turn are converted to native masked stores or to regular
-    stores (if the mask is all on) by the MaskedStoreOptPass optimization
-    pass.
- */
-static void
-lDeclarePseudoMaskedStore(llvm::Module *module) {
-    SourcePos noPos;
-    noPos.name = "__stdlib";
-
-    {
-    std::vector<const llvm::Type *> argTypes;
-    argTypes.push_back(LLVMTypes::Int32VectorPointerType);
-    argTypes.push_back(LLVMTypes::Int32VectorType);
-    argTypes.push_back(LLVMTypes::MaskType);
-
-    llvm::FunctionType *fType = 
-        llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
-    llvm::Function *func = 
-        llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
-                               "__pseudo_masked_store_32", module);
-    func->setDoesNotThrow(true);
-    func->addFnAttr(llvm::Attribute::AlwaysInline);
-    func->setDoesNotCapture(1, true);
-    }
-
-    {
-    std::vector<const llvm::Type *> argTypes;
-    argTypes.push_back(LLVMTypes::Int64VectorPointerType);
-    argTypes.push_back(LLVMTypes::Int64VectorType);
-    argTypes.push_back(LLVMTypes::MaskType);
-
-    llvm::FunctionType *fType = 
-        llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
-    llvm::Function *func = 
-        llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
-                               "__pseudo_masked_store_64", module);
-    func->setDoesNotThrow(true);
-    func->addFnAttr(llvm::Attribute::AlwaysInline);
-    func->setDoesNotCapture(1, true);
+        const std::string funcName = func->getName().str();
+        // Work around http://llvm.org/bugs/show_bug.cgi?id=10438; only
+        // check the llvm.x86.* intrinsics for now...
+        if (!strncmp(funcName.c_str(), "llvm.x86.", 9)) {
+            llvm::Intrinsic::ID id = (llvm::Intrinsic::ID)func->getIntrinsicID();
+            assert(id != 0);
+            LLVM_TYPE_CONST llvm::Type *intrinsicType = 
+                llvm::Intrinsic::getType(*g->ctx, id);
+            intrinsicType = llvm::PointerType::get(intrinsicType, 0);
+            assert(func->getType() == intrinsicType);
+        }
    }
 }

@@ -420,10 +346,27 @@ lAddBitcode(const unsigned char *bitcode, int length,
    if (!bcModule)
        Error(SourcePos(), "Error parsing stdlib bitcode: %s", bcErr.c_str());
    else {
+        // FIXME: this feels like a bad idea, but the issue is that when we
+        // set the llvm::Module's target triple in the ispc Module::Module
+        // constructor, we start by calling llvm::sys::getHostTriple() (and
+        // then change the arch if needed).  Somehow that ends up giving us
+        // strings like 'x86_64-apple-darwin11.0.0', while the stuff we
+        // compile to bitcode with clang has module triples like
+        // 'i386-apple-macosx10.7.0'.  And then LLVM issues a warning about
+        // linking together modules with incompatible target triples..
+        llvm::Triple mTriple(m->module->getTargetTriple());
+        llvm::Triple bcTriple(bcModule->getTargetTriple());
+        assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
+               mTriple.getArch() == bcTriple.getArch());
+        assert(bcTriple.getVendor() == llvm::Triple::UnknownVendor ||
+               mTriple.getVendor() == bcTriple.getVendor());
+        bcModule->setTargetTriple(mTriple.str());
+
        std::string(linkError);
        if (llvm::Linker::LinkModules(module, bcModule, &linkError))
            Error(SourcePos(), "Error linking stdlib bitcode: %s", linkError.c_str());
        lAddModuleSymbols(module, symbolTable);
+        lCheckModuleIntrinsics(module);
    }
 }

@@ -437,7 +380,7 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
    Symbol *pw = new Symbol(name, SourcePos(), AtomicType::UniformConstInt32);
    pw->isStatic = true;
    pw->constValue = new ConstExpr(pw->type, val, SourcePos());
-    const llvm::Type *ltype = LLVMTypes::Int32Type;
+    LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32Type;
    llvm::Constant *linit = LLVMInt32(val);
    pw->storagePtr = new llvm::GlobalVariable(*module, ltype, true, 
                                              llvm::GlobalValue::InternalLinkage,
@@ -446,6 +389,27 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
 }


+
+/** Gives the module's already-declared function 'name' a body that just
+    returns 'val', and adds a static ispc symbol for it to 'symbolTable'. */
+static void
+lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
+                       SymbolTable *symbolTable) {
+    llvm::Function *constFunc = module->getFunction(name);
+    assert(constFunc != NULL); // it should be declared already...
+    constFunc->addFnAttr(llvm::Attribute::AlwaysInline);
+    llvm::BasicBlock *entry = llvm::BasicBlock::Create(*g->ctx, "entry", constFunc, 0);
+    llvm::ReturnInst::Create(*g->ctx, LLVMInt32(val), entry);
+    std::vector<const Type *> noParams;
+    Symbol *funcSym = new Symbol(name, SourcePos(),
+        new FunctionType(AtomicType::UniformInt32, noParams, SourcePos()));
+    funcSym->isStatic = true;
+    funcSym->function = constFunc;
+    symbolTable->AddVariable(funcSym);
+}
+
+
+
 static void
 lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
    Symbol *pidx = new Symbol("programIndex", SourcePos(), 
@@ -457,7 +421,7 @@ lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
        pi[i] = i;
    pidx->constValue = new ConstExpr(pidx->type, pi, SourcePos());

-    const llvm::Type *ltype = LLVMTypes::Int32VectorType;
+    LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32VectorType;
    llvm::Constant *linit = LLVMInt32Vector(pi);
    pidx->storagePtr = new llvm::GlobalVariable(*module, ltype, true, 
                                                llvm::GlobalValue::InternalLinkage, linit, 
@@ -469,32 +433,41 @@ lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
 void
 DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
             bool includeStdlibISPC) {
-    // Add the definitions from the compiled stdlib-c.c file
-    extern unsigned char stdlib_bitcode_c[];
-    extern int stdlib_bitcode_c_length;
-    lAddBitcode(stdlib_bitcode_c, stdlib_bitcode_c_length, module, symbolTable);
+    // Add the definitions from the compiled builtins-c.c file
+    if (g->target.is32bit) {
+        extern unsigned char builtins_bitcode_c_32[];
+        extern int builtins_bitcode_c_32_length;
+        lAddBitcode(builtins_bitcode_c_32, builtins_bitcode_c_32_length, 
+                    module, symbolTable);
+    }
+    else {
+        extern unsigned char builtins_bitcode_c_64[];
+        extern int builtins_bitcode_c_64_length;
+        lAddBitcode(builtins_bitcode_c_64, builtins_bitcode_c_64_length, 
+                    module, symbolTable);
+    }

    // Next, add the target's custom implementations of the various needed
    // builtin functions (e.g. __masked_store_32(), etc).
    switch (g->target.isa) {
    case Target::SSE2:
-        extern unsigned char stdlib_bitcode_sse2[];
-        extern int stdlib_bitcode_sse2_length;
-        lAddBitcode(stdlib_bitcode_sse2, stdlib_bitcode_sse2_length, module,
+        extern unsigned char builtins_bitcode_sse2[];
+        extern int builtins_bitcode_sse2_length;
+        lAddBitcode(builtins_bitcode_sse2, builtins_bitcode_sse2_length, module,
                    symbolTable);
        break;
    case Target::SSE4:
-        extern unsigned char stdlib_bitcode_sse4[];
-        extern int stdlib_bitcode_sse4_length;
-        extern unsigned char stdlib_bitcode_sse4x2[];
-        extern int stdlib_bitcode_sse4x2_length;
+        extern unsigned char builtins_bitcode_sse4[];
+        extern int builtins_bitcode_sse4_length;
+        extern unsigned char builtins_bitcode_sse4x2[];
+        extern int builtins_bitcode_sse4x2_length;
        switch (g->target.vectorWidth) {
        case 4: 
-            lAddBitcode(stdlib_bitcode_sse4, stdlib_bitcode_sse4_length, 
+            lAddBitcode(builtins_bitcode_sse4, builtins_bitcode_sse4_length, 
                        module, symbolTable);
            break;
        case 8:
-            lAddBitcode(stdlib_bitcode_sse4x2, stdlib_bitcode_sse4x2_length, 
+            lAddBitcode(builtins_bitcode_sse4x2, builtins_bitcode_sse4x2_length, 
                        module, symbolTable);
            break;
        default:
@@ -502,92 +475,27 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
        }
        break;
    case Target::AVX:
-        extern unsigned char stdlib_bitcode_avx[];
-        extern int stdlib_bitcode_avx_length;
-        lAddBitcode(stdlib_bitcode_avx, stdlib_bitcode_avx_length, module, 
-                    symbolTable);
+        switch (g->target.vectorWidth) {
+        case 8:
+            extern unsigned char builtins_bitcode_avx[];
+            extern int builtins_bitcode_avx_length;
+            lAddBitcode(builtins_bitcode_avx, builtins_bitcode_avx_length, module, 
+                        symbolTable);
+            break;
+        case 16:
+            extern unsigned char builtins_bitcode_avx_x2[];
+            extern int builtins_bitcode_avx_x2_length;
+            lAddBitcode(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
+                        module,  symbolTable);
+            break;
+        default:
+            FATAL("logic error in DefineStdlib");
+        }
        break;
    default:
        FATAL("logic error");
    }

-    // Add a declaration of void *ISPCMalloc(int64_t).  The user is
-    // responsible for linking in a definition of this if it's needed by
-    // the compiled program.
-    { std::vector<const llvm::Type *> argTypes;
-        argTypes.push_back(llvm::Type::getInt64Ty(*ctx));
-        llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidPointerType, 
-                                                            argTypes, false);
-        llvm::Function *func = 
-            llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage,
-                                   "ISPCMalloc", module);
-        func->setDoesNotThrow(true);
-    }
-
-    // Add a declaration of void ISPCFree(void *).  The user is
-    // responsible for linking in a definition of this if it's needed by
-    // the compiled program.
-    { std::vector<const llvm::Type *> argTypes;
-        argTypes.push_back(LLVMTypes::VoidPointerType);
-        llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidPointerType, 
-                                                            argTypes, false);
-        llvm::Function *func = 
-            llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage,
-                                   "ISPCFree", module);
-        func->setDoesNotThrow(true);
-    }
-
-    // Add a declaration of void ISPCLaunch(void *funcPtr, void *data).
-    // The user is responsible for linking in a definition of this if it's
-    // needed by the compiled program.
-    { std::vector<const llvm::Type *> argTypes;
-        argTypes.push_back(LLVMTypes::VoidPointerType);
-        argTypes.push_back(LLVMTypes::VoidPointerType);
-        llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidType, 
-                                                            argTypes, false);
-        llvm::Function *func = 
-            llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage,
-                                   "ISPCLaunch", module);
-        func->setDoesNotThrow(true);
-    }
-
-    // Add a declaration of void ISPCSync().  The user is responsible for
-    // linking in a definition of this if it's needed by the compiled
-    // program.
-    { 
-        std::vector<const llvm::Type *> argTypes;
-        llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidType, 
-                                                            argTypes, false);
-        llvm::Function *func = 
-            llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage,
-                                   "ISPCSync", module);
-        func->setDoesNotThrow(true);
-    }
-
-    // Add a declaration of void ISPCInstrument(void *, void *, int, int).
-    // The user is responsible for linking in a definition of this if it's
-    // needed by the compiled program.
-    { 
-        std::vector<const llvm::Type *> argTypes;
-        argTypes.push_back(llvm::PointerType::get(llvm::Type::getInt8Ty(*g->ctx), 0));
-        argTypes.push_back(llvm::PointerType::get(llvm::Type::getInt8Ty(*g->ctx), 0));
-        argTypes.push_back(LLVMTypes::Int32Type);
-        argTypes.push_back(LLVMTypes::Int32Type);
-        llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidType, 
-                                                            argTypes, false);
-        llvm::Function *func = 
-            llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage,
-                                   "ISPCInstrument", module);
-        func->setDoesNotThrow(true);
-    }
-
-    // Declare various placeholder functions that the optimizer will later
-    // find and replace with something more useful.
-    lDeclareCompileTimeConstant(module);
-    lDeclarePseudoGathers(module);
-    lDeclarePseudoScatters(module);
-    lDeclarePseudoMaskedStore(module);
-
    // define the 'programCount' builtin variable
    lDefineConstantInt("programCount", g->target.vectorWidth, module, symbolTable);

@@ -605,13 +513,20 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
                       symbolTable);
    lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module,
                       symbolTable);
+    lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
+                           symbolTable);

    if (includeStdlibISPC) {
        // If the user wants the standard library to be included, parse the
-        // serialized version of the stdlib.ispc file to get its definitions
-        // added.
-        extern const char *stdlib_code;
+        // serialized version of the stdlib.ispc file to get its
+        // definitions added.  Disable emission of performance warnings for
+        // now, since the user doesn't care about any of that in the stdlib
+        // implementation...
+        bool epf = g->emitPerfWarnings;
+        g->emitPerfWarnings = false;
+        extern char stdlib_code[];
        yy_scan_string(stdlib_code);
        yyparse();
+        g->emitPerfWarnings = epf;
    }
 }
--- a/builtins.m4
+++ b/builtins.m4
--- a/contrib/ispc.vim
+++ b/contrib/ispc.vim
@@ -0,0 +1,32 @@
+" Vim syntax file
+" Language:	ISPC
+" Maintainer:	Andreas Wendleder <andreas.wendleder@gmail.com>
+" Last Change:	2011 Aug 3
+
+" Do nothing if a syntax has already been loaded for this buffer
+if exists("b:current_syntax")
+  finish
+endif
+
+" Start from the C syntax, then clear b:current_syntax so it can be set to "ispc" below
+runtime! syntax/c.vim
+unlet b:current_syntax
+
+" ISPC keywords on top of C (uniform/varying are the ISPC rate qualifiers)
+syn keyword	ispcStatement	cbreak ccontinue creturn launch print reference soa sync task
+syn keyword	ispcConditional	cif
+syn keyword	ispcRepeat	cdo cfor cwhile
+syn keyword	ispcBuiltin	programCount programIndex
+syn keyword	ispcType	export uniform varying int8 int16 int32 int64
+
+" Default highlighting; HiLink is a temporary shorthand for ":hi def link"
+command -nargs=+ HiLink hi def link <args>
+HiLink ispcStatement	Statement
+HiLink ispcConditional	Conditional
+HiLink ispcRepeat	Repeat
+HiLink ispcBuiltin	Statement
+HiLink ispcType		Type
+delcommand HiLink
+
+let b:current_syntax = "ispc"
+
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -147,13 +147,12 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
    if (!returnType || returnType == AtomicType::Void)
        returnValuePtr = NULL;
    else {
-        const llvm::Type *ftype = returnType->LLVMType(g->ctx);
+        LLVM_TYPE_CONST llvm::Type *ftype = returnType->LLVMType(g->ctx);
        returnValuePtr = AllocaInst(ftype, "return_value_memory");
        // FIXME: don't do this store???
        StoreInst(llvm::Constant::getNullValue(ftype), returnValuePtr);
    }

-#ifndef LLVM_2_8
    if (m->diBuilder) {
        /* If debugging is enabled, tell the debug information emission
           code about this new function */
@@ -174,7 +173,6 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
        /* And start a scope representing the initial function scope */
        StartScope();
    }
-#endif // LLVM_2_8

    launchedTasks = false;

@@ -183,7 +181,6 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
    assert(maskSymbol != NULL);
    maskSymbol->storagePtr = maskPtr;

-#ifndef LLVM_2_8
    // add debugging info for __mask, programIndex, ...
    if (m->diBuilder) {
        maskSymbol->pos = funcStartPos;
@@ -208,15 +205,12 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
                                           true /* static */,
                                           programCountSymbol->storagePtr);
    }
-#endif
 }


 FunctionEmitContext::~FunctionEmitContext() {
    assert(controlFlowInfo.size() == 0);
-#ifndef LLVM_2_8
    assert(debugScopes.size() == (m->diBuilder ? 1 : 0));
-#endif
 }


@@ -695,7 +689,8 @@ FunctionEmitContext::LaneMask(llvm::Value *v) {
    // Call the target-dependent movmsk function to turn the vector mask
    // into an i32 value
    std::vector<Symbol *> *mm = m->symbolTable->LookupFunction("__movmsk");
-    assert(mm && mm->size() == 1);
+    // There should be one with signed int signature, one unsigned int.
+    assert(mm && mm->size() == 2); 
    llvm::Function *fmm = (*mm)[0]->function;
    return CallInst(fmm, v, "val_movmsk");
 }
@@ -703,6 +698,7 @@ FunctionEmitContext::LaneMask(llvm::Value *v) {

 llvm::Value *
 FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
+#if 0
    // Compare the two masks to get a vector of i1s
    llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
                               v1, v2, "v1==v2");
@@ -710,6 +706,12 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
    cmp = I1VecToBoolVec(cmp);
    // And see if it's all on
    return All(cmp);
+#else
+    llvm::Value *mm1 = LaneMask(v1);
+    llvm::Value *mm2 = LaneMask(v2);
+    return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mm1, mm2,
+                   "v1==v2");
+#endif
 }


@@ -734,11 +736,12 @@ FunctionEmitContext::CreateBasicBlock(const char *name) {

 llvm::Value *
 FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) {
-    const llvm::ArrayType *at = llvm::dyn_cast<const llvm::ArrayType>(b->getType());
+    LLVM_TYPE_CONST llvm::ArrayType *at = 
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(b->getType());
    if (at) {
        // If we're given an array of vectors of i1s, then do the
        // conversion for each of the elements
-        const llvm::Type *boolArrayType = 
+        LLVM_TYPE_CONST llvm::Type *boolArrayType = 
            llvm::ArrayType::get(LLVMTypes::BoolVectorType, at->getNumElements());
        llvm::Value *ret = llvm::UndefValue::get(boolArrayType);

@@ -756,22 +759,29 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) {


 llvm::Value *
-FunctionEmitContext::EmitMalloc(const llvm::Type *ty) {
+FunctionEmitContext::EmitMalloc(LLVM_TYPE_CONST llvm::Type *ty, int align) {
     // Emit code to compute the size of the given type using a GEP with a
     // NULL base pointer, indexing one element of the given type, and
     // casting the resulting 'pointer' to an int giving its size.
-    const llvm::Type *ptrType = llvm::PointerType::get(ty, 0);
+    LLVM_TYPE_CONST llvm::Type *ptrType = llvm::PointerType::get(ty, 0); // 'ty *': used for the size computation and for the returned pointer
     llvm::Value *nullPtr = llvm::Constant::getNullValue(ptrType);
     llvm::Value *index[1] = { LLVMInt32(1) };
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn) // this branch hands the GEP indices over as an llvm::ArrayRef
+    llvm::ArrayRef<llvm::Value *> arrayRef(&index[0], &index[1]);
+    llvm::Value *poffset = llvm::GetElementPtrInst::Create(nullPtr, arrayRef,
+                                                           "offset_ptr", bblock);
+#else // otherwise the indices are passed as a begin/end pointer pair
     llvm::Value *poffset = llvm::GetElementPtrInst::Create(nullPtr, &index[0], &index[1],
                                                            "offset_ptr", bblock);
+#endif // LLVM_3_0 || LLVM_3_0svn
     AddDebugPos(poffset);
-    llvm::Value *sizeOf =  PtrToIntInst(poffset, LLVMTypes::Int64Type, "offset_int");
+    llvm::Value *sizeOf = PtrToIntInst(poffset, LLVMTypes::Int64Type, "offset_int"); // address of element 1 from NULL, as Int64Type

     // And given the size, call the malloc function
     llvm::Function *fmalloc = m->module->getFunction("ISPCMalloc");
     assert(fmalloc != NULL);
-    llvm::Value *mem = CallInst(fmalloc, sizeOf, "raw_argmem");
+    llvm::Value *mem = CallInst(fmalloc, sizeOf, LLVMInt32(align), 
+                                "raw_argmem"); // 'align' is forwarded as the second call argument
     // Cast the void * back to the result pointer type
     return BitCastInst(mem, ptrType, "mem_bitcast");
 }
@@ -795,8 +805,13 @@ lGetStringAsValue(llvm::BasicBlock *bblock, const char *s) {
                                                 llvm::GlobalValue::InternalLinkage,
                                                 sConstant, s);
    llvm::Value *indices[2] = { LLVMInt32(0), LLVMInt32(0) };
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+    llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[2]);
+    return llvm::GetElementPtrInst::Create(sPtr, arrayRef, "sptr", bblock);
+#else
    return llvm::GetElementPtrInst::Create(sPtr, &indices[0], &indices[2],
                                           "sptr", bblock);
+#endif
 }


@@ -836,7 +851,6 @@ FunctionEmitContext::GetDebugPos() const {
 void
 FunctionEmitContext::AddDebugPos(llvm::Value *value, const SourcePos *pos, 
                                 llvm::DIScope *scope) {
-#ifndef LLVM_2_8
    llvm::Instruction *inst = llvm::dyn_cast<llvm::Instruction>(value);
    if (inst != NULL && m->diBuilder) {
        SourcePos p = pos ? *pos : currentPos;
@@ -847,13 +861,11 @@ FunctionEmitContext::AddDebugPos(llvm::Value *value, const SourcePos *pos,
            inst->setDebugLoc(llvm::DebugLoc::get(p.first_line, p.first_column, 
                                                  scope ? *scope : GetDIScope()));
    }
-#endif
 }


 void
 FunctionEmitContext::StartScope() {
-#ifndef LLVM_2_8
    if (m->diBuilder != NULL) {
        llvm::DIScope parentScope;
        if (debugScopes.size() > 0)
@@ -867,18 +879,15 @@ FunctionEmitContext::StartScope() {
                                             currentPos.first_column);
        debugScopes.push_back(lexicalBlock);
    }
-#endif
 }


 void
 FunctionEmitContext::EndScope() {
-#ifndef LLVM_2_8
    if (m->diBuilder != NULL) {
        assert(debugScopes.size() > 0);
        debugScopes.pop_back();
    }
-#endif
 }


@@ -891,7 +900,6 @@ FunctionEmitContext::GetDIScope() const {

 void
 FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) {
-#ifndef LLVM_2_8
    if (m->diBuilder == NULL)
        return;

@@ -907,13 +915,11 @@ FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) {
    llvm::Instruction *declareInst = 
        m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
    AddDebugPos(declareInst, &sym->pos, &scope);
-#endif
 }


 void
 FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) {
-#ifndef LLVM_2_8
    if (m->diBuilder == NULL)
        return;

@@ -929,7 +935,6 @@ FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) {
    llvm::Instruction *declareInst = 
        m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
    AddDebugPos(declareInst, &sym->pos, &scope);
-#endif
 }


@@ -939,15 +944,16 @@ FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) {
    Otherwise return zero.
 */
 static int
-lArrayVectorWidth(const llvm::Type *t) {
-    const llvm::ArrayType *arrayType = llvm::dyn_cast<const llvm::ArrayType>(t);
+lArrayVectorWidth(LLVM_TYPE_CONST llvm::Type *t) {
+    LLVM_TYPE_CONST llvm::ArrayType *arrayType = 
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(t);
    if (arrayType == NULL)
        return 0;

    // We shouldn't be seeing arrays of anything but vectors being passed
    // to things like FunctionEmitContext::BinaryOperator() as operands
-    const llvm::VectorType *vectorElementType = 
-        llvm::dyn_cast<const llvm::VectorType>(arrayType->getElementType());
+    LLVM_TYPE_CONST llvm::VectorType *vectorElementType = 
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(arrayType->getElementType());
    assert(vectorElementType != NULL &&
           (int)vectorElementType->getNumElements() == g->target.vectorWidth);
    return (int)arrayType->getNumElements();
@@ -964,7 +970,7 @@ FunctionEmitContext::BinaryOperator(llvm::Instruction::BinaryOps inst,
    }

    assert(v0->getType() == v1->getType());
-    const llvm::Type *type = v0->getType();
+    LLVM_TYPE_CONST llvm::Type *type = v0->getType();
    int arraySize = lArrayVectorWidth(type);
    if (arraySize == 0) {
        llvm::Instruction *bop = 
@@ -998,7 +1004,7 @@ FunctionEmitContext::NotOperator(llvm::Value *v, const char *name) {
    // Similarly to BinaryOperator, do the operation on all the elements of
    // the array if we're given an array type; otherwise just do the
    // regular llvm operation.
-    const llvm::Type *type = v->getType();
+    LLVM_TYPE_CONST llvm::Type *type = v->getType();
    int arraySize = lArrayVectorWidth(type);
    if (arraySize == 0) {
        llvm::Instruction *binst = 
@@ -1023,20 +1029,20 @@ FunctionEmitContext::NotOperator(llvm::Value *v, const char *name) {
 // Given the llvm Type that represents an ispc VectorType, return an
 // equally-shaped type with boolean elements.  (This is the type that will
 // be returned from CmpInst with ispc VectorTypes).
-static const llvm::Type *
-lGetMatchingBoolVectorType(const llvm::Type *type) {
-    const llvm::ArrayType *arrayType = 
-        llvm::dyn_cast<const llvm::ArrayType>(type);
+static LLVM_TYPE_CONST llvm::Type *
+lGetMatchingBoolVectorType(LLVM_TYPE_CONST llvm::Type *type) {
+    LLVM_TYPE_CONST llvm::ArrayType *arrayType = 
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(type); // checked against NULL just below
     // should only be called for vector typed stuff...
     assert(arrayType != NULL);

-    const llvm::VectorType *vectorElementType =
-        llvm::dyn_cast<const llvm::VectorType>(arrayType->getElementType());
+    LLVM_TYPE_CONST llvm::VectorType *vectorElementType =
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(arrayType->getElementType()); // only inspected by the assert below
     assert(vectorElementType != NULL &&
            (int)vectorElementType->getNumElements() == g->target.vectorWidth);

-    const llvm::Type *base = llvm::VectorType::get(LLVMTypes::BoolType, 
-                                                   g->target.vectorWidth);
+    LLVM_TYPE_CONST llvm::Type *base = 
+        llvm::VectorType::get(LLVMTypes::BoolType, g->target.vectorWidth); // element type of the result: a vectorWidth-wide vector of BoolType
     return llvm::ArrayType::get(base, arrayType->getNumElements());
 }

@@ -1052,7 +1058,7 @@ FunctionEmitContext::CmpInst(llvm::Instruction::OtherOps inst,
    }

    assert(v0->getType() == v1->getType());
-    const llvm::Type *type = v0->getType();
+    LLVM_TYPE_CONST llvm::Type *type = v0->getType();
    int arraySize = lArrayVectorWidth(type);
    if (arraySize == 0) {
        llvm::Instruction *ci = 
@@ -1062,7 +1068,7 @@ FunctionEmitContext::CmpInst(llvm::Instruction::OtherOps inst,
        return ci;
    }
    else {
-        const llvm::Type *boolType = lGetMatchingBoolVectorType(type);
+        LLVM_TYPE_CONST llvm::Type *boolType = lGetMatchingBoolVectorType(type);
        llvm::Value *ret = llvm::UndefValue::get(boolType);
        for (int i = 0; i < arraySize; ++i) {
            llvm::Value *a = ExtractInst(v0, i);
@@ -1076,16 +1082,17 @@ FunctionEmitContext::CmpInst(llvm::Instruction::OtherOps inst,


 llvm::Value *
-FunctionEmitContext::BitCastInst(llvm::Value *value, const llvm::Type *type, 
+FunctionEmitContext::BitCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type, 
                                 const char *name) {
    if (value == NULL) {
        assert(m->errorCount > 0);
        return NULL;
    }

-    const llvm::Type *valType = value->getType();
-    const llvm::ArrayType *at = llvm::dyn_cast<const llvm::ArrayType>(valType);
-    if (at && llvm::isa<const llvm::PointerType>(at->getElementType())) {
+    LLVM_TYPE_CONST llvm::Type *valType = value->getType();
+    LLVM_TYPE_CONST llvm::ArrayType *at = 
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(valType);
+    if (at && llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(at->getElementType())) {
        // If we're bitcasting an array of pointers, we have a varying
        // lvalue; apply the corresponding bitcast to each of the
        // individual pointers and return the result array.
@@ -1109,42 +1116,74 @@ FunctionEmitContext::BitCastInst(llvm::Value *value, const llvm::Type *type,
 }


-llvm::Instruction *
-FunctionEmitContext::PtrToIntInst(llvm::Value *value, const llvm::Type *type,
+llvm::Value * // result is an array value in the varying-lvalue case below, otherwise the new instruction
+FunctionEmitContext::PtrToIntInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
                                   const char *name) {
     if (value == NULL) {
         assert(m->errorCount > 0);
         return NULL;
     }

-    // TODO: we should probably handle the array case as in
-    // e.g. BitCastInst(), but we don't currently need that functionality
-    llvm::Instruction *inst = 
-        new llvm::PtrToIntInst(value, type, name ? name : "ptr2int", bblock);
-    AddDebugPos(inst);
-    return inst;
+    LLVM_TYPE_CONST llvm::Type *valType = value->getType();
+    LLVM_TYPE_CONST llvm::ArrayType *at = 
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(valType);
+    if (at && llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(at->getElementType())) {
+        // varying lvalue (an array of pointers) -> convert each pointer to 'type' individually
+        assert((int)at->getNumElements() == g->target.vectorWidth);
+        // start from an undef array of 'type' and insert one converted element per iteration
+        llvm::Value *ret = 
+            llvm::UndefValue::get(llvm::ArrayType::get(type, g->target.vectorWidth));
+        for (int i = 0; i < g->target.vectorWidth; ++i) {
+            llvm::Value *elt = ExtractInst(value, i);
+            llvm::Value *p2i = PtrToIntInst(elt, type, name);
+            ret = InsertInst(ret, p2i, i);
+        }
+        return ret;
+    }
+    else { // any other value: one ptrtoint instruction, named "ptr2int" unless 'name' is given
+        llvm::Instruction *inst = 
+            new llvm::PtrToIntInst(value, type, name ? name : "ptr2int", bblock);
+        AddDebugPos(inst);
+        return inst;
+    }
 }


-llvm::Instruction *
-FunctionEmitContext::IntToPtrInst(llvm::Value *value, const llvm::Type *type,
+llvm::Value * // result is an array value in the array-of-integers case below, otherwise the new instruction
+FunctionEmitContext::IntToPtrInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
                                   const char *name) {
     if (value == NULL) {
         assert(m->errorCount > 0);
         return NULL;
     }

-    // TODO: we should probably handle the array case as in
-    // e.g. BitCastInst(), but we don't currently need that functionality
-    llvm::Instruction *inst = 
-        new llvm::IntToPtrInst(value, type, name ? name : "int2ptr", bblock);
-    AddDebugPos(inst);
-    return inst;
+    LLVM_TYPE_CONST llvm::Type *valType = value->getType();
+    LLVM_TYPE_CONST llvm::ArrayType *at = 
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(valType);
+    if (at && at->getElementType()->isIntegerTy()) { // the operand holds integers, never pointers, so test for int elements
+        // array of ints (e.g. what PtrToIntInst returns for a varying lvalue) -> convert each element to 'type'
+        assert((int)at->getNumElements() == g->target.vectorWidth);
+        // start from an undef array of 'type' and insert one converted element per iteration
+        llvm::Value *ret = 
+            llvm::UndefValue::get(llvm::ArrayType::get(type, g->target.vectorWidth));
+        for (int i = 0; i < g->target.vectorWidth; ++i) {
+            llvm::Value *elt = ExtractInst(value, i);
+            llvm::Value *i2p = IntToPtrInst(elt, type, name);
+            ret = InsertInst(ret, i2p, i);
+        }
+        return ret;
+    }
+    else { // any other value: one inttoptr instruction, named "int2ptr" unless 'name' is given
+        llvm::Instruction *inst = 
+            new llvm::IntToPtrInst(value, type, name ? name : "int2ptr", bblock);
+        AddDebugPos(inst);
+        return inst;
+    }
 }


 llvm::Instruction *
-FunctionEmitContext::TruncInst(llvm::Value *value, const llvm::Type *type,
+FunctionEmitContext::TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
                               const char *name) {
    if (value == NULL) {
        assert(m->errorCount > 0);
@@ -1162,7 +1201,7 @@ FunctionEmitContext::TruncInst(llvm::Value *value, const llvm::Type *type,

 llvm::Instruction *
 FunctionEmitContext::CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
-                              const llvm::Type *type, const char *name) {
+                              LLVM_TYPE_CONST llvm::Type *type, const char *name) {
    if (value == NULL) {
        assert(m->errorCount > 0);
        return NULL;
@@ -1178,7 +1217,7 @@ FunctionEmitContext::CastInst(llvm::Instruction::CastOps op, llvm::Value *value,


 llvm::Instruction *
-FunctionEmitContext::FPCastInst(llvm::Value *value, const llvm::Type *type, 
+FunctionEmitContext::FPCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type, 
                                const char *name) {
    if (value == NULL) {
        assert(m->errorCount > 0);
@@ -1195,7 +1234,7 @@ FunctionEmitContext::FPCastInst(llvm::Value *value, const llvm::Type *type,


 llvm::Instruction *
-FunctionEmitContext::SExtInst(llvm::Value *value, const llvm::Type *type, 
+FunctionEmitContext::SExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type, 
                              const char *name) {
    if (value == NULL) {
        assert(m->errorCount > 0);
@@ -1212,7 +1251,7 @@ FunctionEmitContext::SExtInst(llvm::Value *value, const llvm::Type *type,


 llvm::Instruction *
-FunctionEmitContext::ZExtInst(llvm::Value *value, const llvm::Type *type, 
+FunctionEmitContext::ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type, 
                              const char *name) {
    if (value == NULL) {
        assert(m->errorCount > 0);
@@ -1238,22 +1277,30 @@ FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0

    // FIXME: do we need need to handle the case of the first index being
    // varying?  It's currently needed...
-    assert(!llvm::isa<const llvm::VectorType>(index0->getType()));
+    assert(!llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(index0->getType()));

-    const llvm::Type *basePtrType = basePtr->getType();
-    const llvm::ArrayType *baseArrayType = 
-        llvm::dyn_cast<const llvm::ArrayType>(basePtrType);
+    LLVM_TYPE_CONST llvm::Type *basePtrType = basePtr->getType();
+    LLVM_TYPE_CONST llvm::ArrayType *baseArrayType = 
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(basePtrType);
    bool baseIsVaryingTypePointer = (baseArrayType != NULL) && 
-        llvm::isa<const llvm::PointerType>(baseArrayType->getElementType());
-    bool indexIsVaryingType = llvm::isa<const llvm::VectorType>(index1->getType());
+        llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(baseArrayType->getElementType());
+    bool indexIsVaryingType = 
+        llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(index1->getType());

    if (!indexIsVaryingType && !baseIsVaryingTypePointer) {
        // The easy case: both the base pointer and the indices are
        // uniform, so just emit the regular LLVM GEP instruction
        llvm::Value *indices[2] = { index0, index1 };
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+        llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[2]);
+        llvm::Instruction *inst = 
+            llvm::GetElementPtrInst::Create(basePtr, arrayRef,
+                                            name ? name : "gep", bblock);
+#else
        llvm::Instruction *inst = 
            llvm::GetElementPtrInst::Create(basePtr, &indices[0], &indices[2], 
                                            name ? name : "gep", bblock);
+#endif
        AddDebugPos(inst);
        return inst;
    }
@@ -1284,9 +1331,10 @@ FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0
                // This is kind of a hack: use the type from the GEP to
                // figure out the return type and the first time through,
                // create an undef value of that type here
-                const llvm::PointerType *elementPtrType = 
-                    llvm::dyn_cast<const llvm::PointerType>(eltPtr->getType());
-                const llvm::Type *elementType = elementPtrType->getElementType();
+                LLVM_TYPE_CONST llvm::PointerType *elementPtrType = 
+                    llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(eltPtr->getType());
+                LLVM_TYPE_CONST llvm::Type *elementType = 
+                    elementPtrType->getElementType();
                lret = llvm::UndefValue::get(LLVMPointerVectorType(elementType));
            }

@@ -1313,7 +1361,7 @@ FunctionEmitContext::LoadInst(llvm::Value *lvalue, const Type *type,
        return NULL;
    }

-    if (llvm::isa<const llvm::PointerType>(lvalue->getType())) {
+    if (llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(lvalue->getType())) {
        // If the lvalue is a straight up regular pointer, then just issue
        // a regular load.  First figure out the alignment; in general we
        // can just assume the natural alignment (0 here), but for varying
@@ -1340,7 +1388,7 @@ FunctionEmitContext::LoadInst(llvm::Value *lvalue, const Type *type,
        // information we need from the LLVM::Type, so have to carry the
        // ispc type in through this path..
        assert(type != NULL);
-        assert(llvm::isa<const llvm::ArrayType>(lvalue->getType()));
+        assert(llvm::isa<LLVM_TYPE_CONST llvm::ArrayType>(lvalue->getType()));
        return gather(lvalue, type, name);
    }
 }
@@ -1350,19 +1398,19 @@ llvm::Value *
 FunctionEmitContext::gather(llvm::Value *lvalue, const Type *type, 
                            const char *name) {
    // We should have a varying lvalue if we get here...
-    assert(llvm::dyn_cast<const llvm::ArrayType>(lvalue->getType()));
+    assert(llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(lvalue->getType()));

-    const llvm::Type *retType = type->LLVMType(g->ctx);
+    LLVM_TYPE_CONST llvm::Type *retType = type->LLVMType(g->ctx);

    const StructType *st = dynamic_cast<const StructType *>(type);
    if (st) {
        // If we're gathering structures, do an element-wise gather
        // recursively.
        llvm::Value *retValue = llvm::UndefValue::get(retType);
-        for (int i = 0; i < st->NumElements(); ++i) {
+        for (int i = 0; i < st->GetElementCount(); ++i) {
            llvm::Value *eltPtrs = GetElementPtrInst(lvalue, 0, i);
            // This in turn will be another gather
-            llvm::Value *eltValues = LoadInst(eltPtrs, st->GetMemberType(i), 
+            llvm::Value *eltValues = LoadInst(eltPtrs, st->GetElementType(i), 
                                              name);
            retValue = InsertInst(retValue, eltValues, i, "set_value");
        }
@@ -1378,7 +1426,7 @@ FunctionEmitContext::gather(llvm::Value *lvalue, const Type *type,
        // the GEP stuff in the loop below ends up computing pointers based
        // on elements in the vectors rather than incorrectly advancing to
        // the next vector...
-        const llvm::Type *eltType = 
+        LLVM_TYPE_CONST llvm::Type *eltType = 
            vt->GetBaseType()->GetAsUniformType()->LLVMType(g->ctx);
        lvalue = BitCastInst(lvalue, llvm::PointerType::get(llvm::ArrayType::get(eltType, 0), 0));

@@ -1409,17 +1457,20 @@ FunctionEmitContext::gather(llvm::Value *lvalue, const Type *type,
    llvm::Value *mask = GetMask();
    llvm::Function *gather = NULL;
    // Figure out which gather function to call based on the size of
-    // the elements; will need to generalize this for 8 and 16-bit
-    // types.
+    // the elements.
    if (retType == LLVMTypes::DoubleVectorType || 
        retType == LLVMTypes::Int64VectorType)
        gather = m->module->getFunction("__pseudo_gather_64");
-    else {
-        assert(retType == LLVMTypes::FloatVectorType || 
-               retType == LLVMTypes::Int32VectorType);
+    else if (retType == LLVMTypes::FloatVectorType || 
+             retType == LLVMTypes::Int32VectorType)
        gather = m->module->getFunction("__pseudo_gather_32");
+    else if (retType == LLVMTypes::Int16VectorType)
+        gather = m->module->getFunction("__pseudo_gather_16");
+    else {
+        assert(retType == LLVMTypes::Int8VectorType);
+        gather = m->module->getFunction("__pseudo_gather_8");
    }
-    assert(gather);
+    assert(gather != NULL);

    llvm::Value *voidlvalue = BitCastInst(lvalue, LLVMTypes::VoidPointerType);
    llvm::Instruction *call = CallInst(gather, voidlvalue, mask, name);
@@ -1441,33 +1492,21 @@ FunctionEmitContext::gather(llvm::Value *lvalue, const Type *type,
 void
 FunctionEmitContext::addGSMetadata(llvm::Instruction *inst, SourcePos pos) {
    llvm::Value *str = llvm::MDString::get(*g->ctx, pos.name);
-#ifdef LLVM_2_8
-    llvm::MDNode *md = llvm::MDNode::get(*g->ctx, &str, 1);
-#else
    llvm::MDNode *md = llvm::MDNode::get(*g->ctx, str);
-#endif
    inst->setMetadata("filename", md);

    llvm::Value *line = LLVMInt32(pos.first_line);
-#ifdef LLVM_2_8
-    md = llvm::MDNode::get(*g->ctx, &line, 1);
-#else
    md = llvm::MDNode::get(*g->ctx, line);
-#endif
    inst->setMetadata("line", md);

    llvm::Value *column = LLVMInt32(pos.first_column);
-#ifdef LLVM_2_8
-    md = llvm::MDNode::get(*g->ctx, &column, 1);
-#else
    md = llvm::MDNode::get(*g->ctx, column);
-#endif
    inst->setMetadata("column", md);
 }


 llvm::Value *
-FunctionEmitContext::AllocaInst(const llvm::Type *llvmType, const char *name,
+FunctionEmitContext::AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType, const char *name,
                                int align, bool atEntryBlock) {
    llvm::AllocaInst *inst = NULL;
    if (atEntryBlock) {
@@ -1482,6 +1521,17 @@ FunctionEmitContext::AllocaInst(const llvm::Type *llvmType, const char *name,
        // current basic block
        inst = new llvm::AllocaInst(llvmType, name ? name : "", bblock);

+    // If no alignment was specified but we have an array of a uniform
+    // type, then align it to 4 * the native vector width; it's not
+    // unlikely that this array will be loaded into varying variables with
+    // what will be aligned accesses if the uniform -> varying load is done
+    // in regular chunks.
+    LLVM_TYPE_CONST llvm::ArrayType *arrayType = 
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(llvmType);
+    if (align == 0 && arrayType != NULL && 
+        !llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(arrayType->getElementType()))
+        align = 4 * g->target.nativeVectorWidth;
+
    if (align != 0)
        inst->setAlignment(align);
    // Don't add debugging info to alloca instructions
@@ -1504,43 +1554,31 @@ FunctionEmitContext::maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
        return;
    }

-    assert(llvm::isa<const llvm::PointerType>(lvalue->getType()));
+    assert(llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(lvalue->getType()));
    
-    const StructType *structType = dynamic_cast<const StructType *>(rvalueType);
-    if (structType != NULL) {
-        // Assigning a structure
-        for (int i = 0; i < structType->NumElements(); ++i) {
+    const CollectionType *collectionType = 
+        dynamic_cast<const CollectionType *>(rvalueType);
+    if (collectionType != NULL) {
+        // Assigning a structure / array / vector. Handle each element
+        // individually with what turns into a recursive call to
+        // maskedStore()
+        for (int i = 0; i < collectionType->GetElementCount(); ++i) {
            llvm::Value *eltValue = ExtractInst(rvalue, i, "rvalue_member");
            llvm::Value *eltLValue = GetElementPtrInst(lvalue, 0, i, 
                                                       "struct_lvalue_ptr");
            StoreInst(eltValue, eltLValue, storeMask, 
-                      structType->GetMemberType(i));
+                      collectionType->GetElementType(i));
        }
        return;
    }

-    const SequentialType *sequentialType = 
-        dynamic_cast<const SequentialType *>(rvalueType);
-    if (sequentialType != NULL) {
-        // Assigning arrays and vectors. Handle each element individually
-        // with what turns into a recursive call to makedStore()
-        for (int i = 0; i < sequentialType->GetElementCount(); ++i) {
-            llvm::Value *eltLValue = GetElementPtrInst(lvalue, 0, i, "lval_i_ptr");
-            llvm::Value *eltValue = ExtractInst(rvalue, i, "array_i_val");
-            StoreInst(eltValue, eltLValue, storeMask, 
-                      sequentialType->GetElementType());
-        }
-        return;
-    }
-
-    // We must have a regular atomic type at this point
-    assert(dynamic_cast<const AtomicType *>(rvalueType) != NULL);
+    // We must have a regular atomic or enumerator type at this point
+    assert(dynamic_cast<const AtomicType *>(rvalueType) != NULL ||
+           dynamic_cast<const EnumType *>(rvalueType) != NULL);
    rvalueType = rvalueType->GetAsNonConstType();

    llvm::Function *maskedStoreFunc = NULL;
-    // Figure out if we need a 32-bit or 64-bit masked store.  This
-    // will need to be generalized when/if 8 and 16-bit data types are
-    // added.
+    // Figure out if we need an 8, 16, 32 or 64-bit masked store.
    if (rvalueType == AtomicType::VaryingDouble || 
        rvalueType == AtomicType::VaryingInt64 ||
        rvalueType == AtomicType::VaryingUInt64) {
@@ -1550,12 +1588,11 @@ FunctionEmitContext::maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
        rvalue = BitCastInst(rvalue, LLVMTypes::Int64VectorType, 
                             "rvalue_to_int64");
    }
-    else {
-        assert(rvalueType == AtomicType::VaryingFloat ||
-               rvalueType == AtomicType::VaryingBool ||
-               rvalueType == AtomicType::VaryingInt32 ||
-               rvalueType == AtomicType::VaryingUInt32);
-
+    else if (rvalueType == AtomicType::VaryingFloat ||
+             rvalueType == AtomicType::VaryingBool ||
+             rvalueType == AtomicType::VaryingInt32 ||
+             rvalueType == AtomicType::VaryingUInt32 ||
+             dynamic_cast<const EnumType *>(rvalueType) != NULL) {
        maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_32");
        lvalue = BitCastInst(lvalue, LLVMTypes::Int32VectorPointerType, 
                             "lvalue_to_int32vecptr");
@@ -1563,6 +1600,18 @@ FunctionEmitContext::maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
            rvalue = BitCastInst(rvalue, LLVMTypes::Int32VectorType, 
                                 "rvalue_to_int32");
    }
+    else if (rvalueType == AtomicType::VaryingInt16 ||
+             rvalueType == AtomicType::VaryingUInt16) {
+        maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_16");
+        lvalue = BitCastInst(lvalue, LLVMTypes::Int16VectorPointerType, 
+                             "lvalue_to_int16vecptr");
+    }
+    else if (rvalueType == AtomicType::VaryingInt8 ||
+             rvalueType == AtomicType::VaryingUInt8) {
+        maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_8");
+        lvalue = BitCastInst(lvalue, LLVMTypes::Int8VectorPointerType, 
+                             "lvalue_to_int8vecptr");
+    }

    std::vector<llvm::Value *> args;
    args.push_back(lvalue);
@@ -1583,15 +1632,15 @@ void
 FunctionEmitContext::scatter(llvm::Value *rvalue, llvm::Value *lvalue, 
                             llvm::Value *storeMask, const Type *rvalueType) {
    assert(rvalueType->IsVaryingType());
-    assert(llvm::isa<const llvm::ArrayType>(lvalue->getType()));
+    assert(llvm::isa<LLVM_TYPE_CONST llvm::ArrayType>(lvalue->getType()));

    const StructType *structType = dynamic_cast<const StructType *>(rvalueType);
    if (structType) {
        // Scatter the struct elements individually
-        for (int i = 0; i < structType->NumElements(); ++i) {
+        for (int i = 0; i < structType->GetElementCount(); ++i) {
            llvm::Value *lv = GetElementPtrInst(lvalue, 0, i);
            llvm::Value *rv = ExtractInst(rvalue, i);
-            scatter(rv, lv, storeMask, structType->GetMemberType(i));
+            scatter(rv, lv, storeMask, structType->GetElementType(i));
        }
        return;
    }
@@ -1602,7 +1651,8 @@ FunctionEmitContext::scatter(llvm::Value *rvalue, llvm::Value *lvalue,
        // the GEP stuff in the loop below ends up computing pointers based
        // on elements in the vectors rather than incorrectly advancing to
        // the next vector...
-        const llvm::Type *eltType = vt->GetBaseType()->GetAsUniformType()->LLVMType(g->ctx);
+        LLVM_TYPE_CONST llvm::Type *eltType = 
+            vt->GetBaseType()->GetAsUniformType()->LLVMType(g->ctx);
        lvalue = BitCastInst(lvalue, llvm::PointerType::get(llvm::ArrayType::get(eltType, 0), 0));

        for (int i = 0; i < vt->GetElementCount(); ++i) {
@@ -1620,20 +1670,21 @@ FunctionEmitContext::scatter(llvm::Value *rvalue, llvm::Value *lvalue,
    assert(dynamic_cast<const AtomicType *>(rvalueType) != NULL);

    llvm::Function *func = NULL;
-    const llvm::Type *type = rvalue->getType();
+    LLVM_TYPE_CONST llvm::Type *type = rvalue->getType();
    if (type == LLVMTypes::DoubleVectorType || 
        type == LLVMTypes::Int64VectorType) {
        func = m->module->getFunction("__pseudo_scatter_64");
        rvalue = BitCastInst(rvalue, LLVMTypes::Int64VectorType, "rvalue2int");
    }
-    else {
-        // FIXME: if this hits, presumably it's due to needing int8 and/or
-        // int16 versions of scatter...
-        assert(type == LLVMTypes::FloatVectorType || 
-               type == LLVMTypes::Int32VectorType);
+    else if (type == LLVMTypes::FloatVectorType || 
+             type == LLVMTypes::Int32VectorType) {
        func = m->module->getFunction("__pseudo_scatter_32");
        rvalue = BitCastInst(rvalue, LLVMTypes::Int32VectorType, "rvalue2int");
    }
+    else if (type == LLVMTypes::Int16VectorType)
+        func = m->module->getFunction("__pseudo_scatter_16");
+    else if (type == LLVMTypes::Int8VectorType)
+        func = m->module->getFunction("__pseudo_scatter_8");
    assert(func != NULL);
    
    AddInstrumentationPoint("scatter");
@@ -1687,7 +1738,7 @@ FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
        llvm::Instruction *si = new llvm::StoreInst(rvalue, lvalue, bblock);
        AddDebugPos(si);
    }
-    else if (llvm::isa<const llvm::ArrayType>(lvalue->getType()))
+    else if (llvm::isa<LLVM_TYPE_CONST llvm::ArrayType>(lvalue->getType()))
        // We have a varying lvalue (an array of pointers), so it's time to
        // scatter
        scatter(rvalue, lvalue, storeMask, rvalueType);
@@ -1731,7 +1782,7 @@ FunctionEmitContext::ExtractInst(llvm::Value *v, int elt, const char *name) {
    }

    llvm::Instruction *ei = NULL;
-    if (llvm::isa<const llvm::VectorType>(v->getType()))
+    if (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(v->getType()))
        ei = llvm::ExtractElementInst::Create(v, LLVMInt32(elt), 
                                              name ? name : "extract", bblock);
    else
@@ -1751,7 +1802,7 @@ FunctionEmitContext::InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt,
    }

    llvm::Instruction *ii = NULL;
-    if (llvm::isa<const llvm::VectorType>(v->getType()))
+    if (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(v->getType()))
        ii = llvm::InsertElementInst::Create(v, eltVal, LLVMInt32(elt), 
                                             name ? name : "insert", bblock);
    else
@@ -1763,12 +1814,12 @@ FunctionEmitContext::InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt,


 llvm::PHINode *
-FunctionEmitContext::PhiNode(const llvm::Type *type, int count, 
+FunctionEmitContext::PhiNode(LLVM_TYPE_CONST llvm::Type *type, int count, 
                             const char *name) {
    llvm::PHINode *pn = llvm::PHINode::Create(type, 
-#if !defined(LLVM_2_8) && !defined(LLVM_2_9)
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
                                              count, 
-#endif // !LLVM_2_8 && !LLVM_2_9
+#endif // LLVM_3_0
                                              name ? name : "phi", bblock);
    AddDebugPos(pn);
    return pn;
@@ -1800,9 +1851,14 @@ FunctionEmitContext::CallInst(llvm::Function *func,
        return NULL;
    }

+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+    llvm::Instruction *ci = 
+        llvm::CallInst::Create(func, args, name ? name : "", bblock);
+#else
    llvm::Instruction *ci = 
        llvm::CallInst::Create(func, args.begin(), args.end(), 
                               name ? name : "", bblock);
+#endif
    AddDebugPos(ci);
    return ci;
 }
@@ -1816,10 +1872,15 @@ FunctionEmitContext::CallInst(llvm::Function *func, llvm::Value *arg,
        return NULL;
    }

+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+    llvm::Instruction *ci = 
+        llvm::CallInst::Create(func, arg, name ? name : "", bblock);
+#else
    llvm::Value *args[] = { arg };
    llvm::Instruction *ci = 
        llvm::CallInst::Create(func, &args[0], &args[1], name ? name : "",
                               bblock);
+#endif
    AddDebugPos(ci);
    return ci;
 }
@@ -1834,9 +1895,16 @@ FunctionEmitContext::CallInst(llvm::Function *func, llvm::Value *arg0,
    }

    llvm::Value *args[] = { arg0, arg1 };
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+    llvm::ArrayRef<llvm::Value *> argArrayRef(&args[0], &args[2]);
+    llvm::Instruction *ci = 
+        llvm::CallInst::Create(func, argArrayRef, name ? name : "", 
+                               bblock);
+#else
    llvm::Instruction *ci = 
        llvm::CallInst::Create(func, &args[0], &args[2], name ? name : "", 
                               bblock);
+#endif
    AddDebugPos(ci);
    return ci;
 }
@@ -1883,20 +1951,37 @@ FunctionEmitContext::LaunchInst(llvm::Function *callee,

    launchedTasks = true;

-    const llvm::Type *argType = callee->arg_begin()->getType();
+    LLVM_TYPE_CONST llvm::Type *argType = callee->arg_begin()->getType();
    assert(llvm::PointerType::classof(argType));
-    const llvm::PointerType *pt = static_cast<const llvm::PointerType *>(argType);
+    LLVM_TYPE_CONST llvm::PointerType *pt = 
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(argType);
    assert(llvm::StructType::classof(pt->getElementType()));
-    const llvm::StructType *argStructType = 
-        static_cast<const llvm::StructType *>(pt->getElementType());
+    LLVM_TYPE_CONST llvm::StructType *argStructType = 
+        static_cast<LLVM_TYPE_CONST llvm::StructType *>(pt->getElementType());
    assert(argStructType->getNumElements() == argVals.size() + 1);

-    // Use alloca for space for the task args.  KEY DETAIL: pass false
-    // to the call of FunctionEmitContext::AllocaInst so that the alloca
-    // doesn't happen just once at the top of the function, but happens
-    // each time the enclosing basic block executes.
    int align = 4 * RoundUpPow2(g->target.nativeVectorWidth);
-    llvm::Value *argmem = AllocaInst(argStructType, "argmem", align, false);
+    llvm::Value *argmem;
+#ifdef ISPC_IS_WINDOWS
+    // Use malloc() to allocate storage on Windows, since the stack is
+    // generally not big enough there to do enough allocations for lots of
+    // tasks and then things crash horribly...
+    argmem = EmitMalloc(argStructType, align);
+#else
+    // Otherwise, use alloca for space for the task args, ** unless we're 
+    // compiling to AVX, in which case we use malloc after all **. (See
+    // http://llvm.org/bugs/show_bug.cgi?id=10841 for details.  There are
+    // limitations in LLVM with respect to dynamic allocas of this sort
+    // when the stack also has to be 32-byte aligned...).
+    if (g->target.isa == Target::AVX)
+        argmem = EmitMalloc(argStructType, align);
+    else
+        // KEY DETAIL: pass false to the call of
+        // FunctionEmitContext::AllocaInst so that the alloca doesn't
+        // happen just once at the top of the function, but happens each
+        // time the enclosing basic block executes.
+        argmem = AllocaInst(argStructType, "argmem", align, false);
+#endif // ISPC_IS_WINDOWS
    llvm::Value *voidmem = BitCastInst(argmem, LLVMTypes::VoidPointerType);

    // Copy the values of the parameters into the appropriate place in
--- a/ctx.h
+++ b/ctx.h
@@ -213,7 +213,7 @@ public:
    /** Emit code to call the user-supplied ISPCMalloc function to
        allocate space for an object of thee given type.  Returns the
        pointer value returned by the ISPCMalloc call. */
-    llvm::Value *EmitMalloc(const llvm::Type *ty);
+    llvm::Value *EmitMalloc(LLVM_TYPE_CONST llvm::Type *ty, int align = 0);

    /** Emit code to call the user-supplied ISPCFree function, passing it
        the given pointer to storage previously allocated by an
@@ -303,21 +303,21 @@ public:
                         llvm::CmpInst::Predicate pred,
                         llvm::Value *v0, llvm::Value *v1, const char *name = NULL);

-    llvm::Value *BitCastInst(llvm::Value *value, const llvm::Type *type,
+    llvm::Value *BitCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
                             const char *name = NULL);
-    llvm::Instruction *PtrToIntInst(llvm::Value *value, const llvm::Type *type,
-                                    const char *name = NULL);
-    llvm::Instruction *IntToPtrInst(llvm::Value *value, const llvm::Type *type,
-                                    const char *name = NULL);
-    llvm::Instruction *TruncInst(llvm::Value *value, const llvm::Type *type,
+    llvm::Value *PtrToIntInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
+                              const char *name = NULL);
+    llvm::Value *IntToPtrInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
+                              const char *name = NULL);
+    llvm::Instruction *TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
                                 const char *name = NULL);
    llvm::Instruction *CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
-                                const llvm::Type *type, const char *name = NULL);
-    llvm::Instruction *FPCastInst(llvm::Value *value, const llvm::Type *type, 
+                                LLVM_TYPE_CONST llvm::Type *type, const char *name = NULL);
+    llvm::Instruction *FPCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type, 
                                  const char *name = NULL);
-    llvm::Instruction *SExtInst(llvm::Value *value, const llvm::Type *type, 
+    llvm::Instruction *SExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type, 
                                const char *name = NULL);
-    llvm::Instruction *ZExtInst(llvm::Value *value, const llvm::Type *type, 
+    llvm::Instruction *ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type, 
                                const char *name = NULL);

    /** This GEP method is a generalization of the standard one in LLVM; it
@@ -347,7 +347,7 @@ public:
        instruction is added at the start of the function in the entry
        basic block; if it should be added to the current basic block, then
        the atEntryBlock parameter should be false. */ 
-    llvm::Value *AllocaInst(const llvm::Type *llvmType, const char *name = NULL,
+    llvm::Value *AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType, const char *name = NULL,
                            int align = 0, bool atEntryBlock = true);

    /** Standard store instruction; for this variant, the lvalue must be a
@@ -378,7 +378,8 @@ public:
    llvm::Value *InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt, 
                            const char *name = NULL);

-    llvm::PHINode *PhiNode(const llvm::Type *type, int count, const char *name = NULL);
+    llvm::PHINode *PhiNode(LLVM_TYPE_CONST llvm::Type *type, int count, 
+                           const char *name = NULL);
    llvm::Instruction *SelectInst(llvm::Value *test, llvm::Value *val0,
                                  llvm::Value *val1, const char *name = NULL);

--- a/decl.cpp
+++ b/decl.cpp
@@ -318,9 +318,10 @@ Declaration::Print() const {
 ///////////////////////////////////////////////////////////////////////////

 void
-GetStructTypesAndNames(const std::vector<StructDeclaration *> &sd,
-                       std::vector<const Type *> *elementTypes,
-                       std::vector<std::string> *elementNames) {
+GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
+                             std::vector<const Type *> *elementTypes,
+                             std::vector<std::string> *elementNames,
+                             std::vector<SourcePos> *elementPositions) {
    for (unsigned int i = 0; i < sd.size(); ++i) {
        const Type *type = sd[i]->type;
        // FIXME: making this fake little DeclSpecs here is really
@@ -343,6 +344,7 @@ GetStructTypesAndNames(const std::vector<StructDeclaration *> &sd,

            elementTypes->push_back(d->sym->type);
            elementNames->push_back(d->sym->name);
+            elementPositions->push_back(d->sym->pos);
        }
    }
 }
--- a/decl.h
+++ b/decl.h
@@ -196,8 +196,9 @@ struct StructDeclaration {

 /** Given a set of StructDeclaration instances, this returns the types of
    the elements of the corresponding struct and their names. */
-extern void GetStructTypesAndNames(const std::vector<StructDeclaration *> &sd,
-                                   std::vector<const Type *> *elementTypes,
-                                   std::vector<std::string> *elementNames);
+extern void GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
+                                         std::vector<const Type *> *elementTypes,
+                                         std::vector<std::string> *elementNames,
+                                         std::vector<SourcePos> *elementPositions);

 #endif // ISPC_DECL_H
--- a/docs/ReleaseNotes.txt
+++ b/docs/ReleaseNotes.txt
@@ -0,0 +1,184 @@
+=== v1.0.8 === (19 September 2011)
+
+A number of improvements have been made to handling of 'if' statements in
+the language:
+  - A bug was fixed where invalid memory could be incorrectly accessed even
+    if none of the running program instances wanted to execute the
+    corresponding instructions (https://github.com/ispc/ispc/issues/74).
+  - The code generated for 'if' statements is a bit simpler and thus more
+    efficient.
+
+There is now a '--pic' command-line argument that causes position-independent
+code to be generated (Linux and OSX only).
+
+A number of additional performance improvements:
+  - Loops are now unrolled by default; the --opt=disable-loop-unroll
+    command-line argument can be used to disable this behavior.
+    (https://github.com/ispc/ispc/issues/78)
+  - A few more cases where gathers/scatters could be determined at compile
+    time to actually access contiguous locations have been added.
+    (https://github.com/ispc/ispc/issues/79)
+
+Finally, warnings are now issued (if possible) when it can be determined
+at compile-time that an out-of-bounds array index is being used.
+(https://github.com/ispc/ispc/issues/98).
+
+
+=== v1.0.7 === (3 September 2011)
+
+The various atomic_*_global() standard library functions are generally
+substantially more efficient.  They all previously issued one hardware
+atomic instruction for each running program instance but now locally
+compute a reduction over the operands and issue a single hardware atomic,
+giving the same effect and results in the end (issue #57).
+
+CPU/ISA target handling has been substantially improved.  If no CPU is
+specified, the host CPU type is used, not just a default of "nehalem".  A
+number of bugs were fixed that ensure that LLVM doesn't generate SSE>2
+instructions when using the SSE2 target (fixes issue #82).
+
+Shift rights of unsigned integer types use a logical shift right
+instruction now, not an arithmetic shift right (fixed issue #88).
+
+When emitting header files, 'extern' declarations of globals used in ispc
+code are now outside of the ispc namespace.  Fixes issue #64.
+
+The stencil example has been modified to do runs with and without
+parallelism.
+
+Many other small bugfixes and improvements.
+
+=== v1.0.6 === (17 August 2011)
+
+Some additional cross-program instance operations have been added to the
+standard library.  reduce_equal() checks to see if the given value is the
+same across all running program instances, and exclusive_scan_{add,and,or}()
+computes a scan over the given value in the running program instances.
+See the documentation of these new routines for more information:
+http://ispc.github.com/ispc.html#cross-program-instance-operations.
+
+The simple task system implementations used in the examples have been
+improved.  The Windows version no longer has a hard limit on the number of
+tasks that can be launched, and all versions have less dynamic memory
+allocation and less locking.  More of the examples now have paths that also
+measure performance using tasks along with SPMD vectorization.
+
+Two new examples have been added: one that shows the implementation of a
+ray-marching volume rendering algorithm, and one that shows a 3D stencil
+computation, as might be done for PDE solutions.
+
+Standard library routines to issue prefetches have been added.  See the
+documentation for more details: http://ispc.github.com/ispc.html#prefetches.
+
+Fast versions of the float to half-precision float conversion routines have
+been added.  For more details, see:
+http://ispc.github.com/ispc.html#conversions-to-and-from-half-precision-floats.
+
+There is the usual set of small bug fixes.  Notably, a number of details
+related to handling 32 versus 64 bit targets have been fixed, which in turn
+has fixed a bug related to tasks having incorrect values for pointers
+passed to them.
+
+=== v1.0.5 === (1 August 2011)
+
+Multi-element vector swizzles are supported; for example, given a 3-wide
+vector "foo", then expressions like "foo.zyx" and "foo.yz" can be used to
+construct other short vectors.  See
+http://ispc.github.com/ispc.html#short-vector-types
+for more details.  (Thanks to Pete Couperus for implementing this code!).
+
+int8 and int16 datatypes are now supported.  It is still generally more
+efficient to use int32 for intermediate computations, even if the in-memory
+format is int8 or int16.
+
+There are now standard library routines to convert to and from 'half'-format
+floating-point values (half_to_float() and float_to_half()).
+
+There is a new example with an implementation of Perlin's Noise function
+(examples/noise).  It shows a speedup of approximately 4.2x versus a C
+implementation on OSX and a 2.9x speedup versus C on Windows.
+
+=== v1.0.4 === (18 July 2011)
+
+enums are now supported in ispc; see the section on enumeration types in
+the documentation (http://ispc.github.com/ispc.html#enumeration-types) for
+more information.
+
+bools are converted to integers with zero extension, not sign extension as
+before (i.e. a 'true' bool converts to the value one, not 'all bits on'.)
+For cases where sign extension is still desired, there is a
+sign_extend(bool) function in the standard library.
+
+Support for 64-bit types in the standard library is much more complete than
+before.
+
+64-bit integer constants are now supported by the parser.
+
+Storage for parameters to tasks is now allocated dynamically on Windows,
+rather than on the stack; with this fix, all tests now run correctly on
+Windows.
+
+There is now support for atomic swap and compare/exchange with float and
+double types.
+
+A number of additional small bugs have been fixed and a number of cases
+where the compiler would crash given a malformed program have been fixed.
+
+=== v1.0.3 === (4 July 2011)
+
+ispc now has a built-in pre-processor (from LLVM's clang compiler).
+(Thanks to Pete Couperus for this patch!)  It is therefore no longer
+necessary to use cl.exe for preprocessing on Windows; the MSVC project
+files for the examples have been updated accordingly.
+
+There is another variant of the shuffle() function in the standard
+library: "<type> shuffle(<type> v0, <type> v1, int permute)", where the
+permutation vector indexes over the concatenation of the two vectors
+(e.g. the value 0 corresponds to the first element of v0, the value
+2*programCount-1 corresponds to the last element of v1, etc.)
+
+ispc now supports the usual range of atomic operations (add, subtract, min,
+max, and, or, and xor) as well as atomic swap and atomic compare and
+exchange.  There is also a facility for inserting memory fences.  See the
+"Atomic Operations and Memory Fences" section of the user's guide
+(http://ispc.github.com/ispc.html#atomic-operations-and-memory-fences) for
+more information.
+ 
+There are now both 'signed' and 'unsigned' variants of the standard library
+functions like packed_load_active() that take references to arrays of
+signed int32s and unsigned int32s respectively.  (The
+{load_from,store_to}_{int8,int16}() functions have similarly been augmented
+to have both 'signed' and 'unsigned' variants.)
+
+In initializer expressions with variable declarations, it is no longer
+legal to initialize arrays and structs with single scalar values that then
+initialize their members; they now must be initialized with initializer
+lists in braces (or initialized after the declaration with a loop over
+array elements, etc.)
+
+=== v1.0.2 === (1 July 2011)
+
+Floating-point hexadecimal constants are now parsed correctly on Windows
+(fixes issue #16).
+
+SSE2 is now the default target if --cpu=atom is given in the command line
+arguments and another target isn't explicitly specified.
+
+The standard library now provides broadcast(), rotate(), and shuffle()
+routines for efficient communication between program instances.
+
+The MSVC solution files to build the examples on Windows now use
+/fpmath:fast when building.
+
+=== v1.0.1 === (24 June 2011)
+
+ispc no longer requires that pointers to memory that are passed in to ispc
+have alignment equal to the target's vector width; now alignment just has to
+be the regular element alignment (e.g. 4 bytes for floats, etc.)  This
+change also fixed a number of cases where it previously incorrectly
+generated aligned load/store instructions in cases where the address wasn't
+actually aligned (even if the base address passed into ispc code was).
+
+=== v1.0 === (21 June 2011)
+
+Initial Release
--- a/docs/build.sh
+++ b/docs/build.sh
@@ -1,6 +1,6 @@
 #!/bin/bash

-rst2html ispc.txt > ispc.html
+rst2html.py ispc.txt > ispc.html

 #rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex
 #pdflatex ispc.tex
--- a/docs/ispc.txt
+++ b/docs/ispc.txt
@@ -33,6 +33,17 @@ The main goals behind ``ispc`` are to:
 number of non-trivial workloads that aren't handled well by other
 compilation approaches (e.g. loop auto-vectorization.)

+**We are very interested in your feedback and comments about ispc and
+in hearing your experiences using the system.  We are especially interested
+in hearing if you try using ispc but see results that are not as you
+were expecting or hoping for.** We encourage you to send a note with your
+experiences or comments to the `ispc-users`_ mailing list or to file bug or
+feature requests with the ``ispc`` `bug tracker`_. (Thanks!)
+
+.. _ispc-users: http://groups.google.com/group/ispc-users
+.. _bug tracker: https://github.com/ispc/ispc/issues?state=open
+
+
 Contents:

 * `Recent Changes to ISPC`_
@@ -50,6 +61,7 @@ Contents:

  + `Lexical Structure`_
  + `Basic Types and Type Qualifiers`_
+  + `Enumeration Types`_
  + `Short Vector Types`_
  + `Struct and Array Types`_
  + `Declarations and Initializers`_
@@ -74,7 +86,11 @@ Contents:

  + `Math Functions`_
  + `Output Functions`_
-  + `Cross-Lane Operations`_
+  + `Cross-Program Instance Operations`_
+  + `Packed Load and Store Operations`_
+  + `Conversions To and From Half-Precision Floats`_
+  + `Atomic Operations and Memory Fences`_
+  + `Prefetches`_
  + `Low-Level Bits`_

 * `Interoperability with the Application`_
@@ -89,12 +105,16 @@ Contents:
  + `Understanding How to Interoperate With the Application's Data`_
  + `Communicating Between SPMD Program Instances`_
  + `Gather and Scatter`_
+  + `8 and 16-bit Integer Types`_
  + `Low-level Vector Tricks`_
  + `Debugging`_
  + `The "Fast math" Option`_
  + `"Inline" Aggressively`_
  + `Small Performance Tricks`_
  + `Instrumenting Your ISPC Programs`_
+  + `Using Scan Operations For Variable Output`_
+  + `Application-Supplied Execution Masks`_
+  + `Explicit Vector Programming With Uniform Short Vector Types`_

 * `Disclaimer and Legal Information`_

@@ -103,27 +123,8 @@ Contents:
 Recent Changes to ISPC
 ======================

-This section summarizes recent changes and bugfixes.
-
-* 17 May: Fixed a number of bugs related to error handling in Windows*.  In
-  particular, if you use the ``/E`` command line flag to ``cl.exe`` (rather
-  than ``/EP``) when using it as a preprocessor, then ``ispc`` will
-  correctly report the source file position with warnings and errors.
-
-* 15 May: Improved error messages and warnings in many cases.  For example,
-  the column number is reported along with the line number and
-  the source line with the error is printed as part of the message.
-
-* 8 May: ``ispc``'s typechecker has been substantially improved in how it
-  handles ``const``-qualified types.  Some programs that previously
-  compiled may now fail with errors related to ``const``.  For example,
-  ``ispc`` issues an error message if you try to assign a member of a const
-  structure.
-
-* 2 May: "uniform" short-vector types are now stored across the lanes of
-  the SIMD registers.  This enables you to also write classic 'explicit
-  vector' computation in ``ispc`` as well.  This change does change how
-  these types are laid out in memory; see `Data Layout`_ for more details.)
+See the file ``ReleaseNotes.txt`` in the ``ispc`` distribution for a list
+of recent changes to the compiler.

 Getting Started with ISPC
 =========================
@@ -136,7 +137,7 @@ Linux\* and Mac OS\* available for download.  Alternatively, you can
 download the source code from that page and build it yourself; see see the
 `ispc wiki`_ for instructions about building ``ispc`` from source.

-.. _ispc downloads web page:downloads.html
+.. _ispc downloads web page: downloads.html
 .. _ispc wiki: http://github.com/ispc/ispc/wiki

 Once you have an executable for your system, copy it into a directory
@@ -281,19 +282,9 @@ with application code, enter the following command

   ispc foo.ispc -o foo.o

-On Linux\* and Mac OS\*, ``ispc`` automatically runs the C preprocessor on
-your input program; under Windows\*, this must be done manually.  With
-Microsoft Visual C++ 2010\*, the following custom build step for
-``ispc`` source files takes care of this job:
-
-::
-
-  cl /E /TP %(Filename).ispc | ispc - -o %(Filename).obj -h %(Filename).h
-
-The ``cl`` call runs the C preprocessor on the ``ispc`` file; the result is
-piped to ``ispc`` to generate an object file and a header.  As an example,
-see the file ``simple.vcxproj`` in the ``examples/simple`` directory of the
-``ispc`` distribution.
+``ispc`` automatically runs the C preprocessor on your input program before
+compiling it.  (This functionality can be disabled with the ``--nocpp``
+command-line argument.)

 Command-line Options
 --------------------
@@ -340,7 +331,7 @@ before it's compiled.  On Windows®, pre-processor definitions should be
 provided to the ``cl`` call.

 By default, the compiler generates x86-64 Intel® SSE4 code.  To generate
-32-bit code, you can use the the ``--arch=x86`` command-line flag.  To
+32-bit code, you can use the ``--arch=x86`` command-line flag.  To
 select Intel® SSE2, use ``--target=sse2``.

 ``ispc`` supports an alternative method for generating Intel® SSE4 code,
@@ -453,7 +444,8 @@ The following identifiers are reserved as language keywords: ``bool``,
 ``char``, ``cif``, ``cwhile``, ``const``, ``continue``, ``creturn``,
 ``default``, ``do``, ``double``, ``else``, ``enum``, ``export``,
 ``extern``, ``false``, ``float``, ``for``, ``goto``, ``if``, ``inline``, ``int``,
-``int32``, ``int64``, ``launch``, ``print``, ``reference``, ``return``,
+``int8``, ``int16``, ``int32``, ``int64``, ``launch``, ``print``,
+``reference``, ``return``,
 ``signed``, ``sizeof``, ``soa``, ``static``, ``struct``, ``switch``,
 ``sync``, ``task``, ``true``, ``typedef``, ``uniform``, ``union``,
 ``unsigned``, ``varying``, ``void``, ``volatile``, ``while``.
@@ -507,6 +499,10 @@ types.
 * ``void``: "empty" type representing no value.
 * ``bool``: boolean value; may be assigned ``true``, ``false``, or the
  value of a boolean expression.
+* ``int8``: 8-bit signed integer.
+* ``unsigned int8``: 8-bit unsigned integer.
+* ``int16``: 16-bit signed integer.
+* ``unsigned int16``: 16-bit unsigned integer.
 * ``int``: 32-bit signed integer; may also be specified as ``int32``.
 * ``unsigned int``: 32-bit unsigned integer; may also be specified as
  ``unsigned int32``.
@@ -523,7 +519,8 @@ general" of the two types, with the following precedence:

 ::

-  double > uint64 > int64 > float > uint32 > int32 > bool
+  double > uint64 > int64 > float > uint32 > int32 > 
+      uint16 > int16 > uint8 > int8 > bool

 In other words, adding an ``int64`` to a ``double`` causes the ``int64`` to
 be converted to a ``double``, the addition to be performed, and a
@@ -536,11 +533,9 @@ is provided in parenthesis around the expression:
    double foo = 1. / 3.;
    int bar = (float)bar + (float)bar;  // 32-bit float addition

-Note: if a ``bool`` is converted to an integer numeric type (``int``,
-``int64``, etc.), then the conversion is done with sign extension, not zero
-extension.  Thus, the resulting value has all bits set if the ``bool`` is
-``true``; for example, ``0xffffffff`` for ``int32``.  This differs from C
-and C++, where a ``true`` bool is converted to the integer value one.
+If a ``bool`` is converted to an integer numeric type (``int``, ``int64``,
+etc.), then the result is the value one if the ``bool`` has the value
+``true`` and has the value zero otherwise.

 Variables can be declared with the ``const`` qualifier, which prohibits
 their modification.
@@ -579,6 +574,51 @@ results or modify existing variables.
 ``ispc`` doesn't currently support pointer types.


+Enumeration Types
+-----------------
+
+It is possible to define user-defined enumeration types in ``ispc`` with
+the ``enum`` keyword, which is followed by an optional enumeration type name
+and then a brace-delimited list of enumerators with optional values:
+
+::
+
+    enum Color { RED, GREEN, BLUE };
+    enum Flags { 
+        UNINITIALIZED = 0,
+        INITIALIZED = 2,
+        CACHED = 4
+    };
+
+Each ``enum`` declaration defines a new type; an attempt to implicitly
+convert between enumerations of different types gives a compile-time error,
+but enumerations of different types can be explicitly cast to one another.
+
+::
+
+    Color c = (Color)CACHED;
+
+Enumerators are implicitly converted to integer types, however, so they can
+be directly passed to routines that take integer parameters and can be used
+in expressions including integers, for example.  However, the integer
+result of such an expression must be explicitly cast back to the enumerant
+type if it is to be assigned to a variable with the enumerant type.
+
+::
+
+    Color c = RED;
+    int nextColor = c+1;
+    c = (Color)nextColor;
+
+In this particular case, the explicit cast could be avoided using an
+increment operator.
+
+::
+
+    Color c = RED;
+    ++c;  // c == GREEN now
+
+
 Short Vector Types
 ------------------

@@ -648,6 +688,15 @@ expect, though the two vector types must have the same length:
    int<4> bat = foo;    // ERROR: different vector lengths
    float<4> bing = foo; // ERROR: different vector lengths

+For convenience, short vectors can be initialized with a list of individual
+element values:
+
+::
+
+    float x = ..., y = ..., z = ...;
+    float<3> pos = { x, y, z };
+
+
 There are two mechanisms to access the individual elements of these short
 vector data types.  The first is with the array indexing operator:

@@ -676,25 +725,24 @@ using the array indexing operator with an index that is greater than the
 vector size, accessing an element that is beyond the vector's size is
 undefined behavior and may cause your program to crash.

-Note: ``ispc`` doesn't support the "swizzling" operations that languages
-like HLSL do.  Only a single element of the vector can be accessed at a
-time with these member operators.
+It is also possible to construct new short vectors from other short vector
+values using this syntax, extended for "swizzling".  For example, 

 ::

-    float<3> foo = ...;
-    float<2> bar = foo.xy;  // ERROR
-    foo.xz = ...;           // ERROR
-    func(foo.xyx);          // ERROR
+    float<3> position = ...;
+    float<3> new_pos = position.zyx;  // reverse order of components
+    float<2> pos_2d = position.xy;

-For convenience, short vectors can be initialized with a list of individual
-element values:
+Though a single element can be assigned to, as in the examples above, it is
+not currently possible to use swizzles on the left-hand side of assignment
+expressions:

 ::

-    float x = ..., y = ..., z = ...;
-    float<3> pos = { x, y, z };
-
+    int8<2> foo = ...;
+    int8<2> bar = ...;
+    foo.yz = bar;   // Error: can't assign to left-hand side of expression

 Struct and Array Types
 ----------------------
@@ -765,22 +813,18 @@ Variables can also be declared in ``for`` statement initializers:

    for (int i = 0; ...)

-Arrays can be initialized with either a scalar value or with individual
-element values in braces:
+Arrays can be initialized with individual element values in braces:

 ::

-    int foo[10] = x;  // all ten elements take the value of x
    int bar[2][4] = { { 1, 2, 3, 4 }, { 5, 6, 7, 8 } };

-Structures can also be initialized both with scalar values or with element
-values in braces:
+Structures, too, can only be initialized with element values in braces:

 ::

    struct Color { float r, g, b; };
    ....
-    Color c = 1; // all are one
    Color d = { 0.5, .75, 1.0 }; // r = 0.5, ...


@@ -877,7 +921,6 @@ C Constructs not in ISPC

 The following C features are not available in ``ispc``.

-* ``enum`` s
 * Pointers and function pointers
 * ``char`` and ``short`` types
 * ``switch`` statements
@@ -1144,7 +1187,7 @@ This code implicitly assumes that ``programCount`` evenly divides
 ::

    for (uniform int i = 0; i < count; i += programCount) {
-        if (i + programIndex < programCount) {
+        if (i + programIndex < count) {
            float d = data[i + programIndex];
            ...

@@ -1246,7 +1289,7 @@ section.)
 For ``if`` statements where the different running SPMD program instances
 don't have coherent values for the boolean ``if`` test, using ``cif``
 introduces some additional overhead from the ``all`` and ``any`` tests as
-well as the corresponding branches.  For cases where the the program
+well as the corresponding branches.  For cases where the program
 instances often do compute the same boolean value, this overhead is
 worthwhile.  If the control flow is in fact usually incoherent, this
 overhead only costs performance.
@@ -1406,13 +1449,25 @@ parallel execution.

 If you use the task launch feature in ``ispc``, you must provide C/C++
 implementations of two functions and link them into your final executable
-file:
+file.  Although these functions may be implemented in either language, they
+must have "C" linkage (i.e. their prototypes must be declared inside an
+``extern "C"`` block if they are defined in C++.)

 ::

    void ISPCLaunch(void *funcptr, void *data);
    void ISPCSync();

+On Windows, two additional functions must be provided to dynamically
+allocate and free memory to store the arguments passed to tasks.  (On OSX
+and Linux, the stack provides memory for task arguments; on Windows, the
+stack is generally not large enough to do this for large numbers of tasks.)
+
+::
+
+    void *ISPCMalloc(int64_t size, int32_t alignment);
+    void ISPCFree(void *ptr);
+
 These are called by the task launch code generated by the ``ispc``
 compiler; the first is called to launch to launch a task and the second is
 called to wait for, respectively.  (Factoring them out in this way
@@ -1659,14 +1714,14 @@ values for the inactive program instances aren't printed.  (In other cases,
 they may have garbage values or be otherwise undefined.)


-Cross-Lane Operations
---------------------
+Cross-Program Instance Operations
+---------------------------------

-Usually, ``ispc`` code expresses independent computation on separate data
-elements.  There are, however, a number of cases where it's useful for the
-program instances to be able to cooperate in computing results.  The
-cross-lane operations described in this section provide primitives for
-communication between the running program instances.
+Usually, ``ispc`` code expresses independent programs performing
+computation on separate data elements.  There are, however, a number of
+cases where it's useful for the program instances to be able to cooperate
+in computing results.  The cross-program instance operations in this section
+provide primitives for communication between the running program instances.
 
 A few routines that evaluate conditions across the running program
 instances.  For example, ``any()`` returns ``true`` if the given value
@@ -1678,6 +1733,70 @@ and ``all()`` returns ``true`` if it true for all of them.
    uniform bool any(bool v)
    uniform bool all(bool v)

+To broadcast a value from one program instance to all of the others, a
+``broadcast()`` function is available.  It broadcasts the value of the
+``value`` parameter for the program instance given by ``index`` to all of
+the running program instances.
+
+::
+
+    int8 broadcast(int8 value, uniform int index)
+    int16 broadcast(int16 value, uniform int index)
+    int32 broadcast(int32 value, uniform int index)
+    int64 broadcast(int64 value, uniform int index)
+    float broadcast(float value, uniform int index)
+    double broadcast(double value, uniform int index)
+
+The ``rotate()`` function allows each program instance to find the value of
+the given value that their neighbor ``offset`` steps away has.  For
+example, on an 8-wide target, if ``value`` has the values (1, 2, 3, 4, 5,
+6, 7, 8) across the running program instances, then ``rotate(value,
+-1)`` causes the first program instance to get the value 8, the second
+program instance to get the value 1, the third 2, and so forth.  The
+provided offset value can be positive or negative, and may be greater than
+``programCount`` (it is masked to ensure valid offsets).
+
+::
+
+    int8 rotate(int8 value, uniform int offset)
+    int16 rotate(int16 value, uniform int offset)
+    int32 rotate(int32 value, uniform int offset)
+    int64 rotate(int64 value, uniform int offset)
+    float rotate(float value, uniform int offset)
+    double rotate(double value, uniform int offset)
+
+
+Finally, the ``shuffle()`` functions allow two variants of fully general
+shuffling of values among the program instances.  For the first version,
+each program instance's value of ``permutation`` gives the program instance
+from which to get the value of ``value``.  The provided values for
+``permutation`` must all be between 0 and ``programCount-1``.
+
+::
+
+    int8 shuffle(int8 value, int permutation)
+    int16 shuffle(int16 value, int permutation)
+    int32 shuffle(int32 value, int permutation)
+    int64 shuffle(int64 value, int permutation)
+    float shuffle(float value, int permutation)
+    double shuffle(double value, int permutation)
+
+
+The second variant of ``shuffle()`` permutes over the extended vector that
+is the concatenation of the two provided values.  In other words, a value
+of 0 in an element of ``permutation`` corresponds to the first element of
+``value0``, the value ``2*programCount-1`` corresponds to the last element
+of ``value1``, etc.
+
+::
+
+    int8 shuffle(int8 value0, int8 value1, int permutation)
+    int16 shuffle(int16 value0, int16 value1, int permutation)
+    int32 shuffle(int32 value0, int32 value1, int permutation)
+    int64 shuffle(int64 value0, int64 value1, int permutation)
+    float shuffle(float value0, float value1, int permutation)
+    double shuffle(double value0, double value1, int permutation)
+
 The various variants of ``popcnt()`` return the population count--the
 number of bits set in the given value.

@@ -1718,26 +1837,102 @@ given value across all of the currently-executing vector lanes.
    uniform int reduce_max(int a, int b)
    uniform unsigned int reduce_max(unsigned int a, unsigned int b)

-
-Finally, there are routines for writing out and reading in values from
-linear memory locations for the active program instances.
-``packed_load_active()`` loads consecutive values from the given array,
-starting at ``a[offset]``, loading one value for each currently-executing
-program instance and storing it into that program instance's ``val``
-variable.  It returns the total number of values loaded.  Similarly,
-``packed_store_active()`` stores the ``val`` values for each program
-instances that executed the ``packed_store_active()`` call, storing the
-results into the given array starting at the given offset.  It returns the
-total number of values stored.
+Finally, you can check to see if a particular value has the same value in
+all of the currently-running program instances:

 ::

-    uniform unsigned int packed_load_active(uniform int a[],
-                                            uniform int offset,
-                                            reference int val)
-    uniform unsigned int packed_store_active(uniform int a[],
-                                             uniform int offset,
-                                             int val)
+    uniform bool reduce_equal(int32 v)
+    uniform bool reduce_equal(unsigned int32 v)
+    uniform bool reduce_equal(float v)
+    uniform bool reduce_equal(int64 v)
+    uniform bool reduce_equal(unsigned int64 v)
+    uniform bool reduce_equal(double v)
+
+There are also variants of these functions that return the value as a
+``uniform`` in the case where the values are all the same.
+
+::
+
+    uniform bool reduce_equal(int32 v, reference uniform int32 sameval)
+    uniform bool reduce_equal(unsigned int32 v,
+                              reference uniform unsigned int32 sameval)
+    uniform bool reduce_equal(float v, reference uniform float sameval)
+    uniform bool reduce_equal(int64 v, reference uniform int64 sameval)
+    uniform bool reduce_equal(unsigned int64 v,
+                              reference uniform unsigned int64 sameval)
+    uniform bool reduce_equal(double v, reference uniform double sameval)
+
+If called when none of the program instances are running,
+``reduce_equal()`` will return ``false``.
+
+There are also a number of functions to compute "scan"s of values across
+the program instances.  For example, the ``exclusive_scan_add()`` function
+computes, for each program instance, the sum of the given value over all of
+the preceding program instances.  (The scans currently available in
+``ispc`` are all so-called "exclusive" scans, meaning that the value
+computed for a given element does not include the value provided for that
+element.)  In C code, an exclusive add scan over an array might be
+implemented as:
+
+::
+
+    void scan_add(int *in_array, int *result_array, int count) {
+        result_array[0] = 0;
+        for (int i = 1; i < count; ++i)
+            result_array[i] = result_array[i-1] + in_array[i-1];
+    }
+
+``ispc`` provides the following scan functions--addition, bitwise-and, and
+bitwise-or are available:
+
+::
+
+    int32 exclusive_scan_add(int32 v) 
+    unsigned int32 exclusive_scan_add(unsigned int32 v) 
+    float exclusive_scan_add(float v) 
+    int64 exclusive_scan_add(int64 v) 
+    unsigned int64 exclusive_scan_add(unsigned int64 v) 
+    double exclusive_scan_add(double v) 
+    int32 exclusive_scan_and(int32 v) 
+    unsigned int32 exclusive_scan_and(unsigned int32 v) 
+    int64 exclusive_scan_and(int64 v) 
+    unsigned int64 exclusive_scan_and(unsigned int64 v) 
+    int32 exclusive_scan_or(int32 v) 
+    unsigned int32 exclusive_scan_or(unsigned int32 v) 
+    int64 exclusive_scan_or(int64 v) 
+    unsigned int64 exclusive_scan_or(unsigned int64 v) 
+
+
+Packed Load and Store Operations
+--------------------------------
+
+The standard library also offers routines for writing out and reading in
+values from linear memory locations for the active program instances.  The
+``packed_load_active()`` functions load consecutive values from the given
+array, starting at ``a[offset]``, loading one value for each
+currently-executing program instance and storing it into that program
+instance's ``val`` variable.  They return the total number of values
+loaded.  Similarly, the ``packed_store_active()`` functions store the
+``val`` values for each program instance that executed the
+``packed_store_active()`` call, storing the results into the given array
+starting at the given offset.  They return the total number of values
+stored.
+
+::
+
+    uniform int packed_load_active(uniform int a[],
+                                   uniform int offset,
+                                   reference int val)
+    uniform int packed_load_active(uniform unsigned int a[],
+                                   uniform int offset,
+                                   reference unsigned int val)
+    uniform int packed_store_active(uniform int a[],
+                                    uniform int offset,
+                                    int val)
+    uniform int packed_store_active(uniform unsigned int a[],
+                                    uniform int offset,
+                                    unsigned int val)


 As an example of how these functions can be used, the following code shows
@@ -1770,41 +1965,168 @@ where the ``i`` th element of ``x`` has been replaced with the value ``v``

 ::

+    uniform int8 extract(int8 x, uniform int i)
+    uniform int16 extract(int16 x, uniform int i)
+    uniform int32 extract(int32 x, uniform int i)
+    uniform int64 extract(int64 x, uniform int i)
    uniform float extract(float x, uniform int i)
-    uniform int extract(int x, uniform int i)
+
+::
+
+    int8 insert(int8 x, uniform int i, uniform int8 v)
+    int16 insert(int16 x, uniform int i, uniform int16 v)
+    int32 insert(int32 x, uniform int i, uniform int32 v)
+    int64 insert(int64 x, uniform int i, uniform int64 v)
    float insert(float x, uniform int i, uniform float v)
-    int insert(int x, uniform int i, uniform int v)
+
+
+Conversions To and From Half-Precision Floats
+---------------------------------------------
+
+There are functions to convert to and from the IEEE 16-bit floating-point
+format.  Note that there is no ``half`` data-type, and it isn't possible
+to do floating-point math directly with ``half`` types in ``ispc``; these
+functions facilitate converting to and from half-format data in memory.
+
+To use them, half-format data should be loaded into an ``int16`` and the
+``half_to_float()`` function used to convert it to a 32-bit floating point
+value.  To store a value to memory in half format, the ``float_to_half()``
+function returns the 16 bits that are the closest match to the given
+``float``, in half format.
+
+::
+
+    float half_to_float(unsigned int16 h)
+    uniform float half_to_float(uniform unsigned int16 h)
+    int16 float_to_half(float f)
+    uniform int16 float_to_half(uniform float f)
+
+There are also faster versions of these functions that don't worry about
+handling floating point infinity, "not a number" and denormalized numbers
+correctly.  These are faster than the above functions, but are less
+precise.
+
+::
+
+    float half_to_float_fast(unsigned int16 h)
+    uniform float half_to_float_fast(uniform unsigned int16 h)
+    int16 float_to_half_fast(float f)
+    uniform int16 float_to_half_fast(uniform float f)
+
+
+Atomic Operations and Memory Fences
+-----------------------------------
+
+The usual range of atomic memory operations is provided in ``ispc``.  As an
+example, consider the 32-bit integer atomic add routine:
+
+::
+
+  int32 atomic_add_global(reference uniform int32 val, int32 delta)
+
+The semantics are the expected ones for an atomic add function: the value
+"val" has the value "delta" added to it atomically, and the old value of
+"val" is returned from the function.  (Thus, if multiple processors 
+simultaneously issue atomic adds to the same memory location, the adds will
+be serialized by the hardware so that the correct result is computed in the
+end.)
+
+One thing to note is that the value being added to here is a
+``uniform`` integer, while the increment amount and the return value are
+``varying``.  In other words, the semantics are that each running program
+instance individually issues the atomic operation with its own ``delta``
+value and gets the previous value of ``val`` back in return.  The atomics
+for the running program instances may be issued in arbitrary order; it's
+not guaranteed that they will be issued in ``programIndex`` order, for
+example.
+
+Here are the declarations of the ``int32`` variants of these functions.
+There are also ``int64`` equivalents as well as variants that take
+``unsigned`` ``int32`` and ``int64`` values.  (The ``atomic_swap_global()``
+function can be used with ``float`` and ``double`` types as well.)
+
+::
+
+  int32 atomic_add_global(reference uniform int32 val, int32 value)
+  int32 atomic_subtract_global(reference uniform int32 val, int32 value)
+  int32 atomic_min_global(reference uniform int32 val, int32 value)
+  int32 atomic_max_global(reference uniform int32 val, int32 value)
+  int32 atomic_and_global(reference uniform int32 val, int32 value)
+  int32 atomic_or_global(reference uniform int32 val, int32 value)
+  int32 atomic_xor_global(reference uniform int32 val, int32 value)
+  int32 atomic_swap_global(reference uniform int32 val, int32 newval)
+
+There is also an atomic "compare and exchange" function; it atomically
+compares the value in "val" to "compare"--if they match, it assigns
+"newval" to "val".  In either case, the old value of "val" is returned.
+(As with the other atomic operations, there are also ``unsigned`` and
+64-bit variants of this function.  Furthermore, there are ``float`` and
+``double`` variants as well.)
+
+::
+
+  int32 atomic_compare_exchange_global(reference uniform int32 val,
+                                       int32 compare, int32 newval)
+
+``ispc`` also has a standard library routine that inserts a memory barrier
+into the code; it ensures that all memory reads and writes prior to the
+barrier complete before any reads or writes after the barrier are issued.
+See the `Linux kernel documentation on memory barriers`_ for an excellent
+writeup on the need for and the use of memory barriers in multi-threaded
+code.
+
+.. _Linux kernel documentation on memory barriers: http://www.kernel.org/doc/Documentation/memory-barriers.txt
+
+::
+
+    void memory_barrier();
+
+
+Prefetches
+----------
+
+The standard library has a variety of functions to prefetch data into the
+processor's cache.  While modern CPUs have automatic prefetchers that do a
+reasonable job of prefetching data to the cache before it's needed, high
+performance applications may find it helpful to prefetch data before it's
+needed.
+
+For example, this code shows how to prefetch data to the processor's L1
+cache while iterating over the items in an array.  
+
+::
+
+   uniform int32 array[...];
+   for (uniform int i = 0; i < count; ++i) {
+       // do computation with array[i]
+       prefetch_l1(array[i+32]);
+   }
+
+The standard library has routines to prefetch to the L1, L2, and L3
+caches.  It also has a variant, ``prefetch_nt()``, that indicates that the
+value being prefetched isn't expected to be used more than once (so should
+be high priority to be evicted from the cache).
+
+::
+
+    void prefetch_{l1,l2,l3,nt}(reference TYPE)
+
+These functions are available for all of the basic types in the
+language--``int8``, ``int16``, ``int32``, ``float``, and so forth.


 Low-Level Bits
 --------------

-``ispc`` provides a number of bit/memory-level utility routines in its
-standard library as well.  It has routines that load from and store
-to 8-bit and 16-bit integer values stored in memory, converting to and from
-32-bit integers for use in computation in ``ispc`` code.  (These functions
-and this conversion step are necessary because ``ispc`` doesn't have native
-8-bit or 16-bit types in the language.)
+Sometimes it's useful to convert a ``bool`` value to an integer using sign
+extension so that the integer's bits are all on if the ``bool`` has the
+value ``true`` (rather than just having the value one).  The
+``sign_extend()`` functions provide this functionality:

 ::

-    unsigned int load_from_int8(uniform int a[],
-                                uniform int offset)
-    void store_to_int8(uniform int a[], uniform int offset, 
-                       unsigned int val)
-    unsigned int load_from_int16(uniform int a[],
-                                 uniform int offset)
-    void store_to_int16(uniform int a[], uniform int offset, 
-                        unsigned int val)
-
-There are two things to note in these functions.  First, note that these
-functions take ``unsigned int`` arrays as parameters; you need
-to cast `the ``int8_t`` and ``int16_t`` pointers from the C/C++ side to
-``unsigned int`` when passing them to ``ispc`` code.  Second, although the
-arrays are passed as ``unsigned int``, in the array indexing calculation,
-with the ``offset`` parameter, they are treated as if they were ``int8`` or
-``int16`` types.  (i.e. the offset treated as being in terms of number of 8
-or 16-bit elements.)
+    int sign_extend(bool value) 
+    uniform int sign_extend(uniform bool value) 

 The ``intbits()`` and ``floatbits()`` functions can be used to implement
 low-level floating-point bit twiddling.  For example, ``intbits()`` returns
@@ -1840,7 +2162,6 @@ It, it clears the high order bit, to ensure that the given floating-point
 value is positive.  This compiles down to a single ``andps`` instruction
 when used with an Intel® SSE target, for example.

-
 Interoperability with the Application
 =====================================

@@ -1901,14 +2222,14 @@ Both the ``foo`` and ``bar`` global variables can be accessed on each
 side.

 ``ispc`` code can also call back to C/C++.  On the ``ispc`` side, any
-application functions to be called must be declared with the ``export "C"``
+application functions to be called must be declared with the ``extern "C"``
 qualifier.

 ::

   extern "C" void foo(uniform float f, uniform float g);

-Unlike in C++, ``export "C"`` doesn't take braces to delineate
+Unlike in C++, ``extern "C"`` doesn't take braces to delineate
 multiple functions to be declared; thus, multiple C functions to be called
 from ``ispc`` must be declared as follows:

@@ -2279,21 +2600,11 @@ elements to work with and then proceeds with the computation.
 Communicating Between SPMD Program Instances
 --------------------------------------------

-The ``programIndex`` built-in variable (see `Mapping Data To Program
-Instances`_) can be used to communicate between the set of executing
-program instances.  Consider the following code, which shows all of the
-program instances writing into unique locations in an array.
-
-::
-
-    float x = ...;
-    uniform float allX[programCount];
-    allX[programIndex] = x;
-
-In this code, a program instance that reads ``allX[0]`` finds the value of
-``x`` that was computed by the first of the running program instances, and
-so forth.  Program instances can communicate with their neighbor instances
-with indexing like ``allX[(programIndex+1)%programCount]``.
+The ``broadcast()``, ``rotate()``, and ``shuffle()`` standard library
+routines provide a variety of mechanisms for the running program instances
+to communicate values to each other during execution.  See the section
+`Cross-Program Instance Operations`_ for more information about their
+operation.


 Gather and Scatter
@@ -2351,6 +2662,15 @@ do a vector load.  For example, given:

 A regular vector load is done from array, starting at offset ``2*x``.

+
+8 and 16-bit Integer Types
+--------------------------
+
+The code generated for 8 and 16-bit integer types is generally not as
+efficient as the code generated for 32-bit integer types.  It is generally
+worthwhile to use 32-bit integer types for intermediate computations, even
+if the final result will be stored in a smaller integer type.
+
 Low-level Vector Tricks
 -----------------------

@@ -2504,6 +2824,123 @@ active upon function entry.
    ao.ispc(0088) - function entry: 36928 calls (0 / 0.00% all off!), 97.40% active lanes
    ...

+
+Using Scan Operations For Variable Output
+-----------------------------------------
+
+One important application of the ``exclusive_scan_add()`` function in the
+standard library is when program instances want to generate a variable amount
+of output and when one would like that output to be densely packed in a
+single array.  For example, consider the code fragment below:
+
+::
+
+    uniform int func(uniform float outArray[], ...) {
+       int numOut = ...;  // figure out how many to be output
+       float outLocal[MAX_OUT]; // staging area
+       // put results in outLocal[0], ..., outLocal[numOut-1]
+       int startOffset = exclusive_scan_add(numOut);
+       for (int i = 0; i < numOut; ++i)
+           outArray[startOffset + i] = outLocal[i];
+       return reduce_add(numOut);
+    }
+
+Here, each program instance has computed a number, ``numOut``, of values to
+output, and has stored them in the ``outLocal`` array.  Assume that four
+program instances are running and that the first one wants to output one
+value, the second two values, and the third and fourth three values each.
+In this case, ``exclusive_scan_add()`` will return the values (0, 1, 3, 6)
+to the four program instances, respectively.  The first program instance
+will write its one result to ``outArray[0]``, the second will write its two
+values to ``outArray[1]`` and ``outArray[2]``, and so forth.  The
+``reduce_add`` call at the end returns the total number of values that the
+program instances have written to the array.
+
+Application-Supplied Execution Masks
+------------------------------------
+
+Recall that when execution transitions from the application code to an
+``ispc`` function, all of the program instances are initially executing.
+In some cases, it may be desired that only some of them are running, based on
+a data-dependent condition computed in the application program.  This
+situation can easily be handled via an additional parameter from the
+application.
+
+As a simple example, consider a case where the application code has an
+array of ``float`` values and we'd like the ``ispc`` code to update
+just specific values in that array, where the application has already
+determined which of those values are to be updated.  In C++ code, we
+might have:
+
+::
+
+    int count = ...;
+    float *array = new float[count];
+    bool *shouldUpdate = new bool[count];
+    // initialize array and shouldUpdate
+    ispc_func(array, shouldUpdate, count);
+
+Then, the ``ispc`` code could process this update as:
+
+::
+
+    export void ispc_func(uniform float array[], uniform bool update[],
+                          uniform int count) {
+        for (uniform int i = 0; i < count; i += programCount) {
+            cif (update[i+programIndex] == true)
+                // update array[i+programIndex]...
+        }
+    }
+
+(In this case a "coherent" if statement is likely to be worthwhile if the
+``update`` array will tend to have sections that are either all-true or
+all-false.)
+
+Explicit Vector Programming With Uniform Short Vector Types
+-----------------------------------------------------------
+
+The typical model for programming in ``ispc`` is an *implicit* parallel
+model, where one writes a program that is apparently doing scalar
+computation on values and the program is then vectorized to run in parallel
+across the SIMD lanes of a processor.  However, ``ispc`` also has some
+support for explicit vector unit programming, where the vectorization is
+explicit.  Some computations may be more effectively described in the
+explicit model rather than the implicit model.
+
+This support is provided via ``uniform`` instances of short vectors 
+(as were introduced in the `Short Vector Types`_ section).  Specifically, 
+if this short program
+
+::
+
+    export uniform float<8> madd(uniform float<8> a, 
+                                 uniform float<8> b, uniform float<8> c) {
+        return a + b * c;
+    }
+
+is compiled with the AVX target, ``ispc`` generates the following
+assembly::
+
+    _madd:
+	vmulps	%ymm2, %ymm1, %ymm1
+	vaddps	%ymm0, %ymm1, %ymm0
+	ret
+
+(And similarly, if compiled with a 4-wide SSE target, two ``mulps`` and two
+``addps`` instructions are generated, and so forth.)
+
+Note that ``ispc`` doesn't currently support control-flow based on
+``uniform`` short vector types; it is thus not possible to write code like:
+
+::
+
+    export uniform int<8> count(uniform float<8> a, uniform float<8> b) {
+        uniform int<8> sum = 0;
+        while (a++ < b)
+            ++sum;
+    }
+
+
 Disclaimer and Legal Information
 ================================

--- a/doxygen.cfg
+++ b/doxygen.cfg
@@ -31,7 +31,7 @@ PROJECT_NAME           = "Intel SPMD Program Compiler"
 # This could be handy for archiving the generated documentation or
 # if some version control system is used.

-PROJECT_NUMBER         = 1.0
+PROJECT_NUMBER         = 1.0.8

 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
 # base path where the generated documentation will be put.
@@ -610,7 +610,7 @@ INPUT                  = builtins.h \
                         util.cpp \
                         parse.yy \
                         lex.ll \
-                         stdlib-c.c
+                         builtins-c.c

 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
--- a/examples/README.txt
+++ b/examples/README.txt
@@ -57,6 +57,13 @@ Linux, a pthreads-based task system is used (tasks_pthreads.cpp).  When
 using tasks with ispc, no task system is mandated; the user is free to plug
 in any task system they want, for ease of interoperating with existing task
 systems.
+
+Noise
+=====
+
+This example has an implementation of Ken Perlin's procedural "noise"
+function, as described in his 2002 "Improving Noise" SIGGRAPH paper.
+
 
 Options
 =======
@@ -86,3 +93,17 @@ Simple
 This is a simple "hello world" type program that shows a ~10 line
 application program calling out to a ~5 line ispc program to do a simple
 computation.
+
+Volume
+======
+
+Ray-marching volume rendering, with single scattering lighting model.  To
+run it, specify a camera parameter file and a volume density file, e.g.:
+
+volume camera.dat density_highres.vol
+
+(See, e.g. Chapters 11 and 16 of "Physically Based Rendering" for
+information about the algorithm implemented here.)  The volume data set
+included here was generated by the example implementation of the "Wavelet
+Turbulence for Fluid Simulation" SIGGRAPH 2008 paper by Kim et
+al. (http://www.cs.cornell.edu/~tedkim/WTURB/)
--- a/examples/aobench/.gitignore
+++ b/examples/aobench/.gitignore
@@ -0,0 +1,2 @@
+ao
+*.ppm
--- a/examples/aobench/Makefile
+++ b/examples/aobench/Makefile
@@ -1,8 +1,20 @@

-CXX=g++ -m64
-CXXFLAGS=-Iobjs/ -O3 -Wall
+ARCH = $(shell uname)
+
+TASK_CXX=../tasks_pthreads.cpp
+TASK_LIB=-lpthread
+
+ifeq ($(ARCH), Darwin)
+  TASK_CXX=../tasks_gcd.cpp
+  TASK_LIB=
+endif
+
+TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
+
+CXX=g++
+CXXFLAGS=-Iobjs/ -O3 -Wall -m64
 ISPC=ispc
-ISPCFLAGS=-O2 --fast-math --arch=x86-64
+ISPCFLAGS=-O2 --target=sse4 --arch=x86-64

 default: ao

@@ -14,12 +26,15 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ ao

-ao: dirs objs/ao.o objs/ao_serial.o objs/ao_ispc.o
-	$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/ao_serial.o -lm -lpthread
+ao: dirs objs/ao.o objs/ao_serial.o objs/ao_ispc.o $(TASK_OBJ)
+	$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/ao_serial.o $(TASK_OBJ) -lm $(TASK_LIB)

 objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@

+objs/%.o: ../%.cpp
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
 objs/ao.o: objs/ao_ispc.h 

 objs/%_ispc.h objs/%_ispc.o: %.ispc
--- a/examples/aobench/ao.cpp
+++ b/examples/aobench/ao.cpp
@@ -55,6 +55,7 @@
 using namespace ispc;

 #include "../timing.h"
+#include "../cpuid.h"

 #define NSUBSAMPLES        2

@@ -100,6 +101,39 @@ savePPM(const char *fname, int w, int h)
    fprintf(fp, "255\n");
    fwrite(img, w * h * 3, 1, fp);
    fclose(fp);
+    printf("Wrote image file %s\n", fname);
+}
+
+
+// Verify at runtime that this CPU supports the vector ISA the ispc code was
+// compiled for (per the ISPC_TARGET_* macro from the ispc-generated header
+// included above); otherwise print an error to stderr and exit(1).
+static void
+ensureTargetISAIsSupported() {
+#if defined(ISPC_TARGET_SSE2)
+    bool isaSupported = CPUSupportsSSE2();
+    const char *target = "SSE2";
+#elif defined(ISPC_TARGET_SSE4)
+    bool isaSupported = CPUSupportsSSE4();
+    const char *target = "SSE4";
+#elif defined(ISPC_TARGET_AVX)
+    bool isaSupported = CPUSupportsAVX();
+    const char *target = "AVX";
+#else
+#error "Unknown ISPC_TARGET_* value"
+#endif
+    if (!isaSupported) {
+        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
+                "set, which isn't\n***        supported by this computer's CPU!\n", target);
+        fprintf(stderr, "***\n***        Please modify the "
+#ifdef _MSC_VER
+                "MSVC project file "
+#else
+                "Makefile "
+#endif
+                "to select another target (e.g. sse2)\n***\n");
+        exit(1);
+    }
 }


@@ -117,6 +151,8 @@ int main(int argc, char **argv)
        height = atoi (argv[3]);
    }

+    ensureTargetISAIsSupported();
+
    // Allocate space for output images
    img = new unsigned char[width * height * 3];
    fimg = new float[width * height * 3];
@@ -137,10 +173,30 @@ int main(int argc, char **argv)
    }

    // Report results and save image
-    printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n", minTimeISPC, 
-           width, height);
+    printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n", 
+           minTimeISPC, width, height);
    savePPM("ao-ispc.ppm", width, height); 

+    //
+    // Run the ispc + tasks path, test_iterations times, and report the
+    // minimum time for any of them.
+    //
+    double minTimeISPCTasks = 1e30;
+    for (unsigned int i = 0; i < test_iterations; i++) {
+        memset((void *)fimg, 0, sizeof(float) * width * height * 3);
+        assert(NSUBSAMPLES == 2);
+
+        reset_and_start_timer();
+        ao_ispc_tasks(width, height, NSUBSAMPLES, fimg);
+        double t = get_elapsed_mcycles();
+        minTimeISPCTasks = std::min(minTimeISPCTasks, t);
+    }
+
+    // Report results and save image
+    printf("[aobench ispc + tasks]:\t\t[%.3f] M cycles (%d x %d image)\n", 
+           minTimeISPCTasks, width, height);
+    savePPM("ao-ispc-tasks.ppm", width, height); 
+
    //
    // Run the serial path, again test_iteration times, and report the
    // minimum time.
@@ -157,7 +213,8 @@ int main(int argc, char **argv)
    // Report more results, save another image...
    printf("[aobench serial]:\t\t[%.3f] M cycles (%d x %d image)\n", minTimeSerial, 
           width, height);
-    printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minTimeSerial / minTimeISPC);
+    printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", 
+           minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
    savePPM("ao-serial.ppm", width, height); 
        
    return 0;
--- a/examples/aobench/ao.ispc
+++ b/examples/aobench/ao.ispc
@@ -203,8 +203,9 @@ ambient_occlusion(reference Isect isect, reference Plane plane,
 /* Compute the image for the scanlines from [y0,y1), for an overall image
   of width w and height h.
 */
-void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h, 
-                  uniform int nsubsamples, reference uniform float image[]) {
+static void ao_scanlines(uniform int y0, uniform int y1, uniform int w, 
+                         uniform int h,  uniform int nsubsamples, 
+                         reference uniform float image[]) {
    static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
    static Sphere spheres[3] = {
        { { -2.0f, 0.0f, -3.5f }, 0.5f },
@@ -231,6 +232,9 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
    // direction we do per iteration and ny the number in y.
    uniform int nx = 1, ny = 1;

+    // FIXME: We actually need ny to be 1 regardless of the decomposition,
+    // since the task decomposition is one scanline high.
+
    if (programCount == 8) {
        // Do two pixels at once in the x direction
        nx = 2;
@@ -239,19 +243,21 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
            ++du;
    }
    else if (programCount == 16) {
-        // Two at once in both x and y
-        nx = ny = 2;
-        if ((programIndex >= 4 && programIndex < 8) || programIndex >= 12)
+        nx = 4;
+        ny = 1;
+        if (programIndex >= 4 && programIndex < 8)
            ++du;
-        if (programIndex >= 8)  
-            ++dv;
+        if (programIndex >= 8 && programIndex < 12)
+            du += 2;
+        if (programIndex >= 12)
+            du += 3;
    }

    // Now loop over all of the pixels, stepping in x and y as calculated
    // above.  (Assumes that ny divides y and nx divides x...)
    for (uniform int y = y0; y < y1; y += ny) {
        for (uniform int x = 0; x < w; x += nx)  {
-            // Figur out x,y pixel in NDC
+            // Figure out x,y pixel in NDC
            float px =  (x + du - (w / 2.0f)) / (w / 2.0f);
            float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
            float ret = 0.f;
@@ -293,7 +299,7 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,

            // offset to the first pixel in the image
            uniform int offset = 3 * (y * w + x);
-            for (uniform int p = 0; p < programCount; p += 4, ++offset) {
+            for (uniform int p = 0; p < programCount; p += 4, offset += 3) {
                // Get the four sample values for this pixel
                uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
                    retArray[p+3];
@@ -315,3 +321,18 @@ export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
                    uniform float image[]) {
    ao_scanlines(0, h, w, h, nsubsamples, image);
 }
+
+// Task entry point: renders scanlines [y0,y1) by forwarding to ao_scanlines().
+static void task ao_task(uniform int y0, uniform int y1, uniform int width,
+                         uniform int height, uniform int nsubsamples, 
+                         uniform float image[]) {
+    ao_scanlines(y0, y1, width, height, nsubsamples, image);
+}
+
+// Launch one ao_task per scanline (dy is 1), covering rows 0 through h-1.
+export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, 
+                          uniform float image[]) {
+    uniform int dy = 1;
+    for (uniform int y = 0; y < h; y += dy)
+        launch < ao_task(y, y+dy, w, h, nsubsamples, image) >;
+}
--- a/examples/aobench/ao_serial.cpp
+++ b/examples/aobench/ao_serial.cpp
@@ -140,7 +140,7 @@ ray_plane_intersect(Isect &isect, Ray &ray,
    float d = -dot(plane.p, plane.n);
    float v = dot(ray.dir, plane.n);

-    if (fabsf(v) < 1.0e-17) 
+    if (fabsf(v) < 1.0e-17f) 
        return;
    else {
        float t = -(dot(ray.org, plane.n) + d) / v;
@@ -183,11 +183,11 @@ orthoBasis(vec basis[3], const vec &n) {
    basis[2] = n;
    basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;

-    if ((n.x < 0.6) && (n.x > -0.6)) {
+    if ((n.x < 0.6f) && (n.x > -0.6f)) {
        basis[1].x = 1.0;
-    } else if ((n.y < 0.6) && (n.y > -0.6)) {
+    } else if ((n.y < 0.6f) && (n.y > -0.6f)) {
        basis[1].y = 1.0;
-    } else if ((n.z < 0.6) && (n.z > -0.6)) {
+    } else if ((n.z < 0.6f) && (n.z > -0.6f)) {
        basis[1].z = 1.0;
    } else {
        basis[1].x = 1.0;
@@ -224,7 +224,7 @@ ambient_occlusion(Isect &isect, Plane &plane,
            float phi   = 2.0f * M_PI * drand48();
            float x = cosf(phi) * theta;
            float y = sinf(phi) * theta;
-            float z = sqrtf(1.0 - theta * theta);
+            float z = sqrtf(1.0f - theta * theta);

            // local . global
            float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x;
@@ -236,14 +236,14 @@ ambient_occlusion(Isect &isect, Plane &plane,
            ray.dir.y = ry;
            ray.dir.z = rz;

-            occIsect.t   = 1.0e+17;
+            occIsect.t   = 1.0e+17f;
            occIsect.hit = 0;

            for (int snum = 0; snum < 3; ++snum)
                ray_sphere_intersect(occIsect, ray, spheres[snum]); 
            ray_plane_intersect (occIsect, ray, plane); 

-            if (occIsect.hit) occlusion += 1.0;
+            if (occIsect.hit) occlusion += 1.f;
        }
    }

@@ -280,10 +280,10 @@ static void ao_scanlines(int y0, int y1, int w, int h, int nsubsamples,

                    ray.dir.x = px;
                    ray.dir.y = py;
-                    ray.dir.z = -1.0;
+                    ray.dir.z = -1.0f;
                    vnormalize(ray.dir);

-                    isect.t   = 1.0e+17;
+                    isect.t   = 1.0e+17f;
                    isect.hit = 0;

                    for (int snum = 0; snum < 3; ++snum)
--- a/examples/aobench/aobench.vcxproj
+++ b/examples/aobench/aobench.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -21,19 +21,20 @@
  <ItemGroup>
    <ClCompile Include="ao.cpp" />
    <ClCompile Include="ao_serial.cpp" />
+    <ClCompile Include="../tasks_concrt.cpp" />
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="ao.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
 </Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
 </Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
@@ -102,6 +103,8 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -115,6 +118,8 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -130,6 +135,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -147,6 +153,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -158,4 +165,4 @@
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
-</Project>
+</Project>
--- a/examples/aobench_instrumented/.gitignore
+++ b/examples/aobench_instrumented/.gitignore
@@ -0,0 +1,2 @@
+ao
+*.ppm
--- a/examples/aobench_instrumented/Makefile
+++ b/examples/aobench_instrumented/Makefile
@@ -2,7 +2,7 @@
 CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -g3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --fast-math --instrument --arch=x86-64
+ISPCFLAGS=-O2 --instrument --arch=x86-64

 default: ao

--- a/examples/aobench_instrumented/ao.cpp
+++ b/examples/aobench_instrumented/ao.cpp
@@ -56,6 +56,7 @@ using namespace ispc;

 #include "instrument.h"
 #include "../timing.h"
+#include "../cpuid.h"

 #define NSUBSAMPLES        2

@@ -99,6 +100,39 @@ savePPM(const char *fname, int w, int h)
    fprintf(fp, "255\n");
    fwrite(img, w * h * 3, 1, fp);
    fclose(fp);
+    printf("Wrote image file %s\n", fname);
+}
+
+
+// Exit with an explanatory message unless the processor we are running on
+// implements the vector ISA that ispc compiled for.  The ISPC_TARGET_*
+// macro is set in the ispc-generated header file that we include above.
+static void
+ensureTargetISAIsSupported() {
+#if defined(ISPC_TARGET_SSE2)
+    const char *isaName = "SSE2";
+    bool cpuHasISA = CPUSupportsSSE2();
+#elif defined(ISPC_TARGET_SSE4)
+    const char *isaName = "SSE4";
+    bool cpuHasISA = CPUSupportsSSE4();
+#elif defined(ISPC_TARGET_AVX)
+    const char *isaName = "AVX";
+    bool cpuHasISA = CPUSupportsAVX();
+#else
+#error "Unknown ISPC_TARGET_* value"
+#endif
+    if (cpuHasISA)
+        return;
+    fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
+            "set, which isn't\n***        supported by this computer's CPU!\n", isaName);
+    fprintf(stderr, "***\n***        Please modify the "
+#ifdef _MSC_VER
+            "MSVC project file "
+#else
+            "Makefile "
+#endif
+            "to select another target (e.g. sse2)\n***\n");
+    exit(1);
 }


@@ -116,6 +150,8 @@ int main(int argc, char **argv)
        height = atoi (argv[3]);
    }

+    ensureTargetISAIsSupported();
+
    // Allocate space for output images
    img = new unsigned char[width * height * 3];
    fimg = new float[width * height * 3];
--- a/examples/aobench_instrumented/aobench_instrumented.vcxproj
+++ b/examples/aobench_instrumented/aobench_instrumented.vcxproj
@@ -25,15 +25,15 @@
  <ItemGroup>
    <CustomBuild Include="ao.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --instrument
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --instrument
 </Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --instrument
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --instrument
 </Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
--- a/examples/cpuid.h
+++ b/examples/cpuid.h
@@ -0,0 +1,99 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifndef ISPC_CPUID_H
+#define ISPC_CPUID_H 1
+
+#ifdef _MSC_VER
+// Provides a __cpuid() function with same signature as below
+#include <intrin.h>
+#if _MSC_FULL_VER >= 160040219
+// _xgetbv() first shipped with Visual Studio 2010 SP1
+#include <immintrin.h>
+#endif
+#else
+// Runs the CPUID instruction for leaf infoType and stores the resulting
+// EAX, EBX, ECX and EDX values in info[0..3].
+static void __cpuid(int info[4], int infoType) {
+    __asm__ __volatile__ ("cpuid"
+                          : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
+                          : "0" (infoType));
+}
+#endif
+
+// True if the CPU reports SSE2 support (CPUID leaf 1, EDX bit 26).
+inline bool CPUSupportsSSE2() {
+    int info[4];
+    __cpuid(info, 1);
+    return (info[3] & (1 << 26)) != 0;
+}
+
+// True if the CPU reports SSE4.1 support (CPUID leaf 1, ECX bit 19).
+inline bool CPUSupportsSSE4() {
+    int info[4];
+    __cpuid(info, 1);
+    return (info[2] & (1 << 19)) != 0;
+}
+
+// True if the OS saves and restores both the XMM and YMM register state
+// (XCR0 bits 1 and 2).  Only call this once CPUID has reported OSXSAVE;
+// XGETBV is an illegal instruction otherwise.
+inline bool OSSavesYMMState() {
+#if !defined(_MSC_VER)
+    // Encode XGETBV as raw bytes: older assemblers don't know the mnemonic.
+    unsigned int xcr0Low, xcr0High;
+    __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0"
+                          : "=a" (xcr0Low), "=d" (xcr0High)
+                          : "c" (0));
+    return (xcr0Low & 6) == 6;
+#elif _MSC_FULL_VER >= 160040219
+    return (_xgetbv(0) & 6) == 6;
+#else
+    // This compiler can't read XCR0; fall back to trusting the CPUID bits.
+    return true;
+#endif
+}
+
+// True if AVX code can actually run: the CPU implements AVX (CPUID leaf 1,
+// ECX bit 28), the OS has enabled XSAVE (OSXSAVE, ECX bit 27), and the OS
+// preserves the YMM registers across context switches.
+inline bool CPUSupportsAVX() {
+    int info[4];
+    __cpuid(info, 1);
+    const int required = (1 << 28) | (1 << 27);
+    if ((info[2] & required) != required)
+        return false;
+    return OSSavesYMMState();
+}
+
+#endif // ISPC_CPUID_H
--- a/examples/examples.sln
+++ b/examples/examples.sln
@@ -15,6 +15,12 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mandelbrot_tasks", "mandelb
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "aobench_instrumented", "aobench_instrumented\aobench_instrumented.vcxproj", "{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}"
 EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "noise", "noise\noise.vcxproj", "{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "volume", "volume_rendering\volume.vcxproj", "{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "stencil", "stencil\stencil.vcxproj", "{2EF070A1-F62F-4E6A-944B-88D140945C3C}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Win32 = Debug|Win32
@@ -79,6 +85,30 @@ Global
 		{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|Win32.Build.0 = Release|Win32
 		{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|x64.ActiveCfg = Release|x64
 		{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|x64.Build.0 = Release|x64
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|Win32.ActiveCfg = Debug|Win32
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|Win32.Build.0 = Debug|Win32
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|x64.ActiveCfg = Debug|x64
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Debug|x64.Build.0 = Debug|x64
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|Win32.ActiveCfg = Release|Win32
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|Win32.Build.0 = Release|Win32
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|x64.ActiveCfg = Release|x64
+		{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}.Release|x64.Build.0 = Release|x64
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Debug|Win32.ActiveCfg = Debug|Win32
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Debug|Win32.Build.0 = Debug|Win32
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Debug|x64.ActiveCfg = Debug|x64
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Debug|x64.Build.0 = Debug|x64
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Release|Win32.ActiveCfg = Release|Win32
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Release|Win32.Build.0 = Release|Win32
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Release|x64.ActiveCfg = Release|x64
+		{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}.Release|x64.Build.0 = Release|x64
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Debug|Win32.ActiveCfg = Debug|Win32
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Debug|Win32.Build.0 = Debug|Win32
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Debug|x64.ActiveCfg = Debug|x64
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Debug|x64.Build.0 = Debug|x64
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|Win32.ActiveCfg = Release|Win32
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|Win32.Build.0 = Release|Win32
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|x64.ActiveCfg = Release|x64
+		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|x64.Build.0 = Release|x64
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
--- a/examples/mandelbrot/mandelbrot.cpp
+++ b/examples/mandelbrot/mandelbrot.cpp
@@ -41,6 +41,7 @@
 #include <stdio.h>
 #include <algorithm>
 #include "../timing.h"
+#include "../cpuid.h"
 #include "mandelbrot_ispc.h"
 using namespace ispc;

@@ -63,6 +64,39 @@ writePPM(int *buf, int width, int height, const char *fn) {
            fputc(c, fp);
    }
    fclose(fp);
+    printf("Wrote image file %s\n", fn);
+}
+
+
+// Exit with an explanatory message unless the processor we are running on
+// implements the vector ISA that ispc compiled for.  The ISPC_TARGET_*
+// macro is set in the ispc-generated header file that we include above.
+static void
+ensureTargetISAIsSupported() {
+#if defined(ISPC_TARGET_SSE2)
+    const char *isaName = "SSE2";
+    bool cpuHasISA = CPUSupportsSSE2();
+#elif defined(ISPC_TARGET_SSE4)
+    const char *isaName = "SSE4";
+    bool cpuHasISA = CPUSupportsSSE4();
+#elif defined(ISPC_TARGET_AVX)
+    const char *isaName = "AVX";
+    bool cpuHasISA = CPUSupportsAVX();
+#else
+#error "Unknown ISPC_TARGET_* value"
+#endif
+    if (cpuHasISA)
+        return;
+    fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
+            "set, which isn't\n***        supported by this computer's CPU!\n", isaName);
+    fprintf(stderr, "***\n***        Please modify the "
+#ifdef _MSC_VER
+            "MSVC project file "
+#else
+            "Makefile "
+#endif
+            "to select another target (e.g. sse2)\n***\n");
+    exit(1);
 }


@@ -77,6 +111,8 @@ int main() {
    int maxIterations = 256;
    int *buf = new int[width*height];

+    ensureTargetISAIsSupported();
+
    //
    // Compute the image using the ispc implementation; report the minimum
    // time of three runs.
--- a/examples/mandelbrot/mandelbrot.vcxproj
+++ b/examples/mandelbrot/mandelbrot.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -81,6 +81,8 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -94,6 +96,8 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -109,6 +113,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -126,6 +131,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -141,15 +147,15 @@
  <ItemGroup>
    <CustomBuild Include="mandelbrot.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
 </Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
 </Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
--- a/examples/mandelbrot/mandelbrot_serial.cpp
+++ b/examples/mandelbrot/mandelbrot_serial.cpp
@@ -36,7 +36,7 @@ static int mandel(float c_re, float c_im, int count) {
    float z_re = c_re, z_im = c_im;
    int i;
    for (i = 0; i < count; ++i) {
-        if (z_re * z_re + z_im * z_im > 4.)
+        if (z_re * z_re + z_im * z_im > 4.f)
            break;

        float new_re = z_re*z_re - z_im*z_im;
--- a/examples/mandelbrot_tasks/.gitignore
+++ b/examples/mandelbrot_tasks/.gitignore
@@ -0,0 +1,2 @@
+mandelbrot
+*.ppm
--- a/examples/mandelbrot_tasks/Makefile
+++ b/examples/mandelbrot_tasks/Makefile
@@ -1,18 +1,18 @@

 ARCH = $(shell uname)

-TASK_CXX=tasks_pthreads.cpp
+TASK_CXX=../tasks_pthreads.cpp
 TASK_LIB=-lpthread

 ifeq ($(ARCH), Darwin)
-  TASK_CXX=tasks_gcd.cpp
+  TASK_CXX=../tasks_gcd.cpp
  TASK_LIB=
 endif

-TASK_OBJ=$(addprefix objs/, $(TASK_CXX:.cpp=.o))
+TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))

-CXX=g++ -m64
-CXXFLAGS=-Iobjs/ -O3 -Wall
+CXX=g++
+CXXFLAGS=-Iobjs/ -O3 -Wall -m64
 ISPC=ispc
 ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64

@@ -32,6 +32,9 @@ mandelbrot: dirs objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc
 objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@

+objs/%.o: ../%.cpp
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
 objs/mandelbrot.o: objs/mandelbrot_ispc.h 

 objs/%_ispc.h objs/%_ispc.o: %.ispc
--- a/examples/mandelbrot_tasks/mandelbrot.cpp
+++ b/examples/mandelbrot_tasks/mandelbrot.cpp
@@ -40,7 +40,9 @@

 #include <stdio.h>
 #include <algorithm>
+#include <string.h>
 #include "../timing.h"
+#include "../cpuid.h"
 #include "mandelbrot_ispc.h"
 using namespace ispc;

@@ -63,10 +65,47 @@ writePPM(int *buf, int width, int height, const char *fn) {
            fputc(c, fp);
    }
    fclose(fp);
+    printf("Wrote image file %s\n", fn);
 }


-int main() {
+// Exit with an explanatory message unless the processor we are running on
+// implements the vector ISA that ispc compiled for.  The ISPC_TARGET_*
+// macro is set in the ispc-generated header file that we include above.
+static void
+ensureTargetISAIsSupported() {
+#if defined(ISPC_TARGET_SSE2)
+    const char *isaName = "SSE2";
+    bool cpuHasISA = CPUSupportsSSE2();
+#elif defined(ISPC_TARGET_SSE4)
+    const char *isaName = "SSE4";
+    bool cpuHasISA = CPUSupportsSSE4();
+#elif defined(ISPC_TARGET_AVX)
+    const char *isaName = "AVX";
+    bool cpuHasISA = CPUSupportsAVX();
+#else
+#error "Unknown ISPC_TARGET_* value"
+#endif
+    if (cpuHasISA)
+        return;
+    fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
+            "set, which isn't\n***        supported by this computer's CPU!\n", isaName);
+    fprintf(stderr, "***\n***        Please modify the "
+#ifdef _MSC_VER
+            "MSVC project file "
+#else
+            "Makefile "
+#endif
+            "to select another target (e.g. sse2)\n***\n");
+    exit(1);
+}
+
+static void usage() {  // print the command-line synopsis to stderr, then exit with status 1
+    fprintf(stderr, "usage: mandelbrot [--scale=<factor>]\n");
+    exit(1);
+}
+
+int main(int argc, char *argv[]) {
    unsigned int width = 1536;
    unsigned int height = 1024;
    float x0 = -2;
@@ -74,8 +113,26 @@ int main() {
    float y0 = -1;
    float y1 = 1;

-    extern void TasksInit();
-    TasksInit();
+    if (argc == 1)
+        ;
+    else if (argc == 2) {
+        if (strncmp(argv[1], "--scale=", 8) == 0) {
+            float scale = atof(argv[1] + 8);
+            if (scale == 0.f)
+                usage();
+            width *= scale;
+            height *= scale;
+            // round up to multiples of 16
+            width = (width + 0xf) & ~0xf;
+            height = (height + 0xf) & ~0xf;
+        }
+        else 
+            usage();
+    }
+    else
+        usage();
+
+    ensureTargetISAIsSupported();

    int maxIterations = 512;
    int *buf = new int[width*height];
--- a/examples/mandelbrot_tasks/mandelbrot_serial.cpp
+++ b/examples/mandelbrot_tasks/mandelbrot_serial.cpp
@@ -36,7 +36,7 @@ static int mandel(float c_re, float c_im, int count) {
    float z_re = c_re, z_im = c_im;
    int i;
    for (i = 0; i < count; ++i) {
-        if (z_re * z_re + z_im * z_im > 4.)
+        if (z_re * z_re + z_im * z_im > 4.f)
            break;

        float new_re = z_re*z_re - z_im*z_im;
--- a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
+++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -81,6 +81,8 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -94,6 +96,8 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -109,6 +113,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -126,6 +131,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -137,20 +143,20 @@
  <ItemGroup>
    <ClCompile Include="mandelbrot.cpp" />
    <ClCompile Include="mandelbrot_serial.cpp" />
-    <ClCompile Include="tasks_concrt.cpp" />
+    <ClCompile Include="../tasks_concrt.cpp" />
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="mandelbrot.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
 </Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
 </Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
@@ -159,4 +165,4 @@
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
-</Project>
+</Project>
--- a/examples/noise/.gitignore
+++ b/examples/noise/.gitignore
@@ -0,0 +1,3 @@
+noise
+*.ppm
+objs
--- a/examples/noise/Makefile
+++ b/examples/noise/Makefile
@@ -0,0 +1,33 @@
+
+CXX=g++ -m64
+CXXFLAGS=-Iobjs/ -O3 -Wall
+ISPC=ispc
+ISPCFLAGS=-O2 --target=sse4 --arch=x86-64
+
+default: noise
+
+.PHONY: default dirs clean
+
+# "dirs" is kept as a phony alias for creating the object directory.
+dirs: objs
+
+# Real directory target, used below as an order-only prerequisite so that
+# it is created before anything is written into it (also under make -j)
+# without forcing rebuilds when its timestamp changes.
+objs:
+	/bin/mkdir -p $@
+
+clean:
+	/bin/rm -rf objs *~ noise
+
+noise: objs/noise.o objs/noise_serial.o objs/noise_ispc.o
+	$(CXX) $(CXXFLAGS) -o $@ $^ -lm
+
+objs/%.o: %.cpp | objs
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+# noise.cpp includes the header that ispc generates.
+objs/noise.o: objs/noise_ispc.h
+
+objs/%_ispc.h objs/%_ispc.o: %.ispc | objs
+	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/noise/noise.cpp
+++ b/examples/noise/noise.cpp
@@ -0,0 +1,150 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define NOMINMAX
+#pragma warning (disable: 4244)
+#pragma warning (disable: 4305)
+#endif
+
+#include <stdio.h>
+#include <algorithm>
+#include "../timing.h"
+#include "../cpuid.h"
+#include "noise_ispc.h"
+using namespace ispc;
+
+extern void noise_serial(float x0, float y0, float x1, float y1,
+                         int width, int height, float output[]);
+
+/* Write buf as a grayscale PPM image; each value is scaled by 255 and clamped to [0,255] */
+static void
+writePPM(float *buf, int width, int height, const char *fn) {
+    FILE *fp = fopen(fn, "wb");
+    if (fp == NULL) {
+        perror(fn);  // can't create the file: report it rather than crash below
+        return;
+    }
+    fprintf(fp, "P6\n%d %d\n255\n", width, height);
+    for (int i = 0; i < width*height; ++i) {
+        float v = std::max(0.f, std::min(255.f, buf[i] * 255.f));
+        for (int j = 0; j < 3; ++j)
+            fputc((unsigned char)v, fp);  // not (char): 128..255 don't fit a signed char
+    }
+    fclose(fp);
+}
+
+
+// Exit with an explanatory message unless the processor we are running on
+// implements the vector ISA that ispc compiled for.  The ISPC_TARGET_*
+// macro is set in the ispc-generated header file that we include above.
+static void
+ensureTargetISAIsSupported() {
+#if defined(ISPC_TARGET_SSE2)
+    const char *isaName = "SSE2";
+    bool cpuHasISA = CPUSupportsSSE2();
+#elif defined(ISPC_TARGET_SSE4)
+    const char *isaName = "SSE4";
+    bool cpuHasISA = CPUSupportsSSE4();
+#elif defined(ISPC_TARGET_AVX)
+    const char *isaName = "AVX";
+    bool cpuHasISA = CPUSupportsAVX();
+#else
+#error "Unknown ISPC_TARGET_* value"
+#endif
+    if (cpuHasISA)
+        return;
+    fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
+            "set, which isn't\n***        supported by this computer's CPU!\n", isaName);
+    fprintf(stderr, "***\n***        Please modify the "
+#ifdef _MSC_VER
+            "MSVC project file "
+#else
+            "Makefile "
+#endif
+            "to select another target (e.g. sse2)\n***\n");
+    exit(1);
+}
+
+
+// Time the ispc and serial noise implementations (minimum of three runs
+// each), write each result out as a PPM image, and print the speedup.
+int main() {
+    unsigned int width = 768;
+    unsigned int height = 768;
+    float x0 = -10;
+    float x1 = 10;
+    float y0 = -10;
+    float y1 = 10;
+
+    float *buf = new float[width*height];
+
+    // Stop now if this CPU can't execute the ispc-compiled code
+    ensureTargetISAIsSupported();
+
+    // Compute the image using the ispc implementation; report the minimum
+    // time of three runs.
+    double minISPC = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        noise_ispc(x0, y0, x1, y1, width, height, buf);
+        double dt = get_elapsed_mcycles();
+        minISPC = std::min(minISPC, dt);
+    }
+
+    printf("[noise ispc]:\t\t\t[%.3f] million cycles\n", minISPC);
+    writePPM(buf, width, height, "noise-ispc.ppm");
+
+    // Clear out the buffer
+    for (unsigned int i = 0; i < width * height; ++i)
+        buf[i] = 0;
+
+    // And run the serial implementation 3 times, again reporting the
+    // minimum time.
+    double minSerial = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        noise_serial(x0, y0, x1, y1, width, height, buf);
+        double dt = get_elapsed_mcycles();
+        minSerial = std::min(minSerial, dt);
+    }
+
+    printf("[noise serial]:\t\t\t[%.3f] million cycles\n", minSerial);
+    writePPM(buf, width, height, "noise-serial.ppm");
+
+    printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC);
+
+    delete[] buf;
+    return 0;
+}
--- a/examples/noise/noise.ispc
+++ b/examples/noise/noise.ispc
@@ -0,0 +1,164 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#define NOISE_PERM_SIZE 256
+
+static uniform int NoisePerm[2 * NOISE_PERM_SIZE] = {
+    151, 160, 137, 91, 90, 15, 131, 13, 201, 95, 96, 53, 194, 233, 7, 225, 140,
+    36, 103, 30, 69, 142, 8, 99, 37, 240, 21, 10, 23, 190, 6, 148, 247, 120,
+    234, 75, 0, 26, 197, 62, 94, 252, 219, 203, 117, 35, 11, 32, 57, 177, 33,
+    88, 237, 149, 56, 87, 174, 20, 125, 136, 171, 168,  68, 175, 74, 165, 71, 
+    134, 139, 48, 27, 166, 77, 146, 158, 231, 83, 111, 229, 122, 60, 211, 133, 
+    230, 220, 105, 92, 41, 55, 46, 245, 40, 244, 102, 143, 54, 65, 25, 63, 161,
+    1, 216, 80, 73, 209, 76, 132, 187, 208,  89, 18, 169, 200, 196, 135, 130, 
+    116, 188, 159, 86, 164, 100, 109, 198, 173, 186,  3, 64, 52, 217, 226, 250,
+    124, 123, 5, 202, 38, 147, 118, 126, 255, 82, 85, 212, 207, 206, 59, 227, 
+    47, 16, 58, 17, 182, 189, 28, 42, 223, 183, 170, 213, 119, 248, 152,  2, 44,
+    154, 163, 70, 221, 153, 101, 155, 167,  43, 172, 9, 129, 22, 39, 253,  19, 
+    98, 108, 110, 79, 113, 224, 232, 178, 185,  112, 104, 218, 246, 97, 228, 251,
+    34, 242, 193, 238, 210, 144, 12, 191, 179, 162, 241, 81, 51, 145, 235, 249,
+    14, 239, 107, 49, 192, 214,  31, 181, 199, 106, 157, 184, 84, 204, 176, 115,
+    121, 50, 45, 127,  4, 150, 254, 138, 236, 205, 93, 222, 114, 67, 29, 24, 72, 
+    243, 141, 128, 195, 78, 66, 215, 61, 156, 180, 151, 160, 137, 91, 90, 15,
+    131, 13, 201, 95, 96, 53, 194, 233, 7, 225, 140, 36, 103, 30, 69, 142, 8, 99,
+    37, 240, 21, 10, 23, 190, 6, 148, 247, 120, 234, 75, 0, 26, 197, 62, 94, 252,
+    219, 203, 117, 35, 11, 32, 57, 177, 33, 88, 237, 149, 56, 87, 174, 20, 125, 
+    136, 171, 168,  68, 175, 74, 165, 71, 134, 139, 48, 27, 166, 77, 146, 158,
+    231, 83, 111, 229, 122, 60, 211, 133, 230, 220, 105, 92, 41, 55, 46, 245,
+    40, 244, 102, 143, 54,  65, 25, 63, 161,  1, 216, 80, 73, 209, 76, 132, 187,
+    208,  89, 18, 169, 200, 196, 135, 130, 116, 188, 159, 86, 164, 100, 109, 
+    198, 173, 186,  3, 64, 52, 217, 226, 250, 124, 123, 5, 202, 38, 147, 118,
+    126, 255, 82, 85, 212, 207, 206, 59, 227, 47, 16, 58, 17, 182, 189, 28, 42,
+    223, 183, 170, 213, 119, 248, 152,  2, 44, 154, 163, 70, 221, 153, 101, 155, 
+    167,  43, 172, 9, 129, 22, 39, 253,  19, 98, 108, 110, 79, 113, 224, 232,
+    178, 185,  112, 104, 218, 246, 97, 228, 251, 34, 242, 193, 238, 210, 144,
+    12, 191, 179, 162, 241,  81, 51, 145, 235, 249, 14, 239, 107, 49, 192, 214,
+    31, 181, 199, 106, 157, 184,  84, 204, 176, 115, 121, 50, 45, 127,  4, 150,
+    254, 138, 236, 205, 93, 222, 114, 67, 29, 24, 72, 243, 141, 128, 195, 78, 
+    66, 215, 61, 156, 180
+};
+
+// Cubic smoothstep: 0 at or below low, 1 at or above high, 3t^2 - 2t^3 between.
+inline float SmoothStep(float low, float high, float value) {
+    float t = clamp((value - low) / (high - low), 0.f, 1.f);
+    return t * t * (3.f - 2.f * t);
+}
+
+// Returns floor(val) converted to int, i.e. rounds toward negative infinity.
+inline int Floor2Int(float val) {
+    return (int)floor(val);
+}
+
+// Hashes lattice point (x,y,z) through NoisePerm and returns the signed sum
+// of two of the offsets dx/dy/dz, selected by the low four bits of the hash.
+inline float Grad(int x, int y, int z, float dx, float dy, float dz) {
+    int hash = NoisePerm[NoisePerm[NoisePerm[x]+y]+z] & 15;
+    float u = (hash<8 || hash==12 || hash==13) ? dx : dy;
+    float v = (hash<4 || hash==12 || hash==13) ? dy : dz;
+    return ((hash&1) ? -u : u) + ((hash&2) ? -v : v);
+}
+
+// Quintic weight 6t^5 - 15t^4 + 10t^3; maps 0 to 0 and 1 to 1.
+inline float NoiseWeight(float t) {
+    float cube = t*t*t;
+    float quart = cube*t;
+    return 6.f*quart*t - 15.f*quart + 10.f*cube;
+}
+
+// Linear interpolation: returns low at t == 0 and high at t == 1.
+inline float Lerp(float t, float low, float high) {
+    return low * (1. - t) + high * t;
+}
+
+// Noise value at (x, y, z): blends Grad() at the 8 corners of the enclosing unit cell.
+static float Noise(float x, float y, float z) {
+    // Compute noise cell coordinates (integer part) and offsets within the cell
+    int ix = Floor2Int(x), iy = Floor2Int(y), iz = Floor2Int(z);
+    float dx = x - ix, dy = y - iy, dz = z - iz;
+
+    // Compute gradient weights at the eight corners of the cell
+    ix &= (NOISE_PERM_SIZE-1);  // wrap each cell index into [0, NOISE_PERM_SIZE)
+    iy &= (NOISE_PERM_SIZE-1);  // (the +1 corners below may index up to NOISE_PERM_SIZE,
+    iz &= (NOISE_PERM_SIZE-1);  //  which the 2*NOISE_PERM_SIZE-entry NoisePerm table allows)
+    float w000 = Grad(ix,   iy,   iz,   dx,   dy,   dz);
+    float w100 = Grad(ix+1, iy,   iz,   dx-1, dy,   dz);
+    float w010 = Grad(ix,   iy+1, iz,   dx,   dy-1, dz);
+    float w110 = Grad(ix+1, iy+1, iz,   dx-1, dy-1, dz);
+    float w001 = Grad(ix,   iy,   iz+1, dx,   dy,   dz-1);
+    float w101 = Grad(ix+1, iy,   iz+1, dx-1, dy,   dz-1);
+    float w011 = Grad(ix,   iy+1, iz+1, dx,   dy-1, dz-1);
+    float w111 = Grad(ix+1, iy+1, iz+1, dx-1, dy-1, dz-1);
+
+    // Compute trilinear interpolation of weights: along x, then y, then z
+    float wx = NoiseWeight(dx), wy = NoiseWeight(dy), wz = NoiseWeight(dz);
+    float x00 = Lerp(wx, w000, w100);
+    float x10 = Lerp(wx, w010, w110);
+    float x01 = Lerp(wx, w001, w101);
+    float x11 = Lerp(wx, w011, w111);
+    float y0 = Lerp(wy, x00, x10);
+    float y1 = Lerp(wy, x01, x11);
+    return Lerp(wz, y0, y1);
+}
+
+// Accumulates |amplitude * Noise| over `octaves` octaves, multiplying the
+// frequency by 1.99 and the amplitude by 0.6 each octave; returns half the sum.
+static float Turbulence(float x, float y, float z, uniform int octaves) {
+    float falloff = 0.6;
+    float total = 0., freq = 1., amp = 1.;
+    for (uniform int octave = 0; octave < octaves; ++octave) {
+        total += abs(amp * Noise(freq * x, freq * y, freq * z));
+        freq *= 1.99f;
+        amp *= falloff;
+    }
+    return total * 0.5;
+}
+
+// Fills output[] (width x height, row-major) with Turbulence() sampled on a
+// regular grid over [x0,x1) x [y0,y1); each program instance handles one pixel.
+export void noise_ispc(uniform float x0, uniform float y0, uniform float x1,
+                       uniform float y1, uniform int width, uniform int height,
+                       uniform float output[])
+{
+    uniform float dx = (x1 - x0) / width;
+    uniform float dy = (y1 - y0) / height;
+    for (uniform int j = 0; j < height; j++) {
+        for (uniform int i = 0; i < width; i += programCount) {
+            float x = x0 + (i + programIndex) * dx;
+            float y = y0 + j * dy;
+            float t = Turbulence(x, y, 0.6, 8);
+            if (i + programIndex < width)  // skip lanes past the row end when width % programCount != 0
+                output[j * width + i + programIndex] = t;
+        }
+    }
+}
+
--- a/examples/noise/noise.vcxproj
+++ b/examples/noise/noise.vcxproj
@@ -0,0 +1,167 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>noise</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="noise.cpp" />
+    <ClCompile Include="noise_serial.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="noise.ispc">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/examples/noise/noise_serial.cpp
+++ b/examples/noise/noise_serial.cpp
@@ -0,0 +1,170 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#include <math.h>
+
+#define NOISE_PERM_SIZE 256
+
+static int NoisePerm[2 * NOISE_PERM_SIZE] = {
+    151, 160, 137, 91, 90, 15, 131, 13, 201, 95, 96, 53, 194, 233, 7, 225, 140,
+    36, 103, 30, 69, 142, 8, 99, 37, 240, 21, 10, 23, 190, 6, 148, 247, 120,
+    234, 75, 0, 26, 197, 62, 94, 252, 219, 203, 117, 35, 11, 32, 57, 177, 33,
+    88, 237, 149, 56, 87, 174, 20, 125, 136, 171, 168,  68, 175, 74, 165, 71, 
+    134, 139, 48, 27, 166, 77, 146, 158, 231, 83, 111, 229, 122, 60, 211, 133, 
+    230, 220, 105, 92, 41, 55, 46, 245, 40, 244, 102, 143, 54, 65, 25, 63, 161,
+    1, 216, 80, 73, 209, 76, 132, 187, 208,  89, 18, 169, 200, 196, 135, 130, 
+    116, 188, 159, 86, 164, 100, 109, 198, 173, 186,  3, 64, 52, 217, 226, 250,
+    124, 123, 5, 202, 38, 147, 118, 126, 255, 82, 85, 212, 207, 206, 59, 227, 
+    47, 16, 58, 17, 182, 189, 28, 42, 223, 183, 170, 213, 119, 248, 152,  2, 44,
+    154, 163, 70, 221, 153, 101, 155, 167,  43, 172, 9, 129, 22, 39, 253,  19, 
+    98, 108, 110, 79, 113, 224, 232, 178, 185,  112, 104, 218, 246, 97, 228, 251,
+    34, 242, 193, 238, 210, 144, 12, 191, 179, 162, 241, 81, 51, 145, 235, 249,
+    14, 239, 107, 49, 192, 214,  31, 181, 199, 106, 157, 184, 84, 204, 176, 115,
+    121, 50, 45, 127,  4, 150, 254, 138, 236, 205, 93, 222, 114, 67, 29, 24, 72, 
+    243, 141, 128, 195, 78, 66, 215, 61, 156, 180, 151, 160, 137, 91, 90, 15,
+    131, 13, 201, 95, 96, 53, 194, 233, 7, 225, 140, 36, 103, 30, 69, 142, 8, 99,
+    37, 240, 21, 10, 23, 190, 6, 148, 247, 120, 234, 75, 0, 26, 197, 62, 94, 252,
+    219, 203, 117, 35, 11, 32, 57, 177, 33, 88, 237, 149, 56, 87, 174, 20, 125, 
+    136, 171, 168,  68, 175, 74, 165, 71, 134, 139, 48, 27, 166, 77, 146, 158,
+    231, 83, 111, 229, 122, 60, 211, 133, 230, 220, 105, 92, 41, 55, 46, 245,
+    40, 244, 102, 143, 54,  65, 25, 63, 161,  1, 216, 80, 73, 209, 76, 132, 187,
+    208,  89, 18, 169, 200, 196, 135, 130, 116, 188, 159, 86, 164, 100, 109, 
+    198, 173, 186,  3, 64, 52, 217, 226, 250, 124, 123, 5, 202, 38, 147, 118,
+    126, 255, 82, 85, 212, 207, 206, 59, 227, 47, 16, 58, 17, 182, 189, 28, 42,
+    223, 183, 170, 213, 119, 248, 152,  2, 44, 154, 163, 70, 221, 153, 101, 155, 
+    167,  43, 172, 9, 129, 22, 39, 253,  19, 98, 108, 110, 79, 113, 224, 232,
+    178, 185,  112, 104, 218, 246, 97, 228, 251, 34, 242, 193, 238, 210, 144,
+    12, 191, 179, 162, 241,  81, 51, 145, 235, 249, 14, 239, 107, 49, 192, 214,
+    31, 181, 199, 106, 157, 184,  84, 204, 176, 115, 121, 50, 45, 127,  4, 150,
+    254, 138, 236, 205, 93, 222, 114, 67, 29, 24, 72, 243, 141, 128, 195, 78, 
+    66, 215, 61, 156, 180
+};
+
+// Clamps v to the closed range [low, high].
+inline float Clamp(float v, float low, float high) {
+    return (v < low) ? low : (v > high ? high : v);
+}
+
+// Cubic smoothstep: 0 at or below low, 1 at or above high, 3t^2 - 2t^3 between.
+inline float SmoothStep(float low, float high, float value) {
+    float t = Clamp((value - low) / (high - low), 0.f, 1.f);
+    return t * t * (3.f - 2.f * t);
+}
+
+// Returns floorf(val) converted to int, i.e. rounds toward negative infinity.
+inline int Floor2Int(float val) {
+    return static_cast<int>(floorf(val));
+}
+
+// Hashes lattice point (x,y,z) through NoisePerm and returns the signed sum
+// of two of the offsets dx/dy/dz, selected by the low four bits of the hash.
+inline float Grad(int x, int y, int z, float dx, float dy, float dz) {
+    int hash = NoisePerm[NoisePerm[NoisePerm[x]+y]+z] & 15;
+    float u = (hash<8 || hash==12 || hash==13) ? dx : dy;
+    float v = (hash<4 || hash==12 || hash==13) ? dy : dz;
+    return ((hash&1) ? -u : u) + ((hash&2) ? -v : v);
+}
+
+// Quintic weight 6t^5 - 15t^4 + 10t^3; maps 0 to 0 and 1 to 1.
+inline float NoiseWeight(float t) {
+    float cube = t*t*t;
+    float quart = cube*t;
+    return 6.f*quart*t - 15.f*quart + 10.f*cube;
+}
+
+// Linear interpolation: returns low at t == 0 and high at t == 1.
+inline float Lerp(float t, float low, float high) {
+    return low * (1.f - t) + high * t;
+}
+
+// Noise value at (x, y, z): blends Grad() at the 8 corners of the enclosing unit cell.
+static float Noise(float x, float y, float z) {
+    // Compute noise cell coordinates (integer part) and offsets within the cell
+    int ix = Floor2Int(x), iy = Floor2Int(y), iz = Floor2Int(z);
+    float dx = x - ix, dy = y - iy, dz = z - iz;
+
+    // Compute gradient weights at the eight corners of the cell
+    ix &= (NOISE_PERM_SIZE-1);  // wrap each cell index into [0, NOISE_PERM_SIZE)
+    iy &= (NOISE_PERM_SIZE-1);  // (the +1 corners below may index up to NOISE_PERM_SIZE,
+    iz &= (NOISE_PERM_SIZE-1);  //  which the 2*NOISE_PERM_SIZE-entry NoisePerm table allows)
+    float w000 = Grad(ix,   iy,   iz,   dx,   dy,   dz);
+    float w100 = Grad(ix+1, iy,   iz,   dx-1, dy,   dz);
+    float w010 = Grad(ix,   iy+1, iz,   dx,   dy-1, dz);
+    float w110 = Grad(ix+1, iy+1, iz,   dx-1, dy-1, dz);
+    float w001 = Grad(ix,   iy,   iz+1, dx,   dy,   dz-1);
+    float w101 = Grad(ix+1, iy,   iz+1, dx-1, dy,   dz-1);
+    float w011 = Grad(ix,   iy+1, iz+1, dx,   dy-1, dz-1);
+    float w111 = Grad(ix+1, iy+1, iz+1, dx-1, dy-1, dz-1);
+
+    // Compute trilinear interpolation of weights: along x, then y, then z
+    float wx = NoiseWeight(dx), wy = NoiseWeight(dy), wz = NoiseWeight(dz);
+    float x00 = Lerp(wx, w000, w100);
+    float x10 = Lerp(wx, w010, w110);
+    float x01 = Lerp(wx, w001, w101);
+    float x11 = Lerp(wx, w011, w111);
+    float y0 = Lerp(wy, x00, x10);
+    float y1 = Lerp(wy, x01, x11);
+    return Lerp(wz, y0, y1);
+}
+
+// Sums |o * Noise| over `octaves` octaves and returns half of the total.
+static float Turbulence(float x, float y, float z, int octaves) {
+    float omega = 0.6f;
+    // lambda scales the sample position and o scales the contribution per octave
+    float sum = 0.f, lambda = 1.f, o = 1.f;
+    for (int i = 0; i < octaves; ++i) {
+        sum += fabsf(o * Noise(lambda * x, lambda * y, lambda * z));
+        lambda *= 1.99f;
+        o *= omega;
+    }
+    return sum * 0.5f;
+}
+
+// Serial reference: fills output[] (width x height, row-major) with
+// Turbulence() sampled on a regular grid starting at (x0, y0).
+void noise_serial(float x0, float y0, float x1, float y1,
+                  int width, int height, float output[])
+{
+    const float stepX = (x1 - x0) / width;
+    const float stepY = (y1 - y0) / height;
+
+    for (int row = 0; row < height; row++) {
+        float *scanline = output + row * width;
+        const float y = y0 + row * stepY;
+        for (int col = 0; col < width; ++col) {
+            const float x = x0 + col * stepX;
+            scanline[col] = Turbulence(x, y, 0.6f, 8);
+        }
+    }
+}
+
--- a/examples/options/.gitignore
+++ b/examples/options/.gitignore
@@ -0,0 +1 @@
+options
--- a/examples/options/options.cpp
+++ b/examples/options/options.cpp
@@ -41,6 +41,7 @@ using std::max;

 #include "options_defs.h"
 #include "../timing.h"
+#include "../cpuid.h"

 #include "options_ispc.h"
 using namespace ispc;
@@ -53,9 +54,41 @@ extern void binomial_put_serial(float Sa[], float Xa[], float Ta[],
                                float ra[], float va[], 
                                float result[], int count);

+// Verify that this CPU supports the vector ISA the ispc code was compiled
+// for (ISPC_TARGET_* comes from the ispc-generated header included above);
+// otherwise print an explanation to stderr and exit with status 1.
+static void
+ensureTargetISAIsSupported() {
+#if defined(ISPC_TARGET_SSE2)
+    bool isaSupported = CPUSupportsSSE2();
+    const char *target = "SSE2";
+#elif defined(ISPC_TARGET_SSE4)
+    bool isaSupported = CPUSupportsSSE4();
+    const char *target = "SSE4";
+#elif defined(ISPC_TARGET_AVX)
+    bool isaSupported = CPUSupportsAVX();
+    const char *target = "AVX";
+#else
+#error "Unknown ISPC_TARGET_* value"
+#endif
+    if (!isaSupported) {  // name the required ISA and tell the user which build file to edit
+        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
+                "set, which isn't\n***        supported by this computer's CPU!\n", target);
+        fprintf(stderr, "***\n***        Please modify the "
+#ifdef _MSC_VER
+                "MSVC project file "
+#else
+                "Makefile "
+#endif
+                "to select another target (e.g. sse2)\n***\n");
+        exit(1);
+    }
+}
+
+
 int main() {
-    // Pointers passed to ispc code must have alignment of the target's
-    // vector width at minimum.
+    ensureTargetISAIsSupported();
+    
    float *S = new float[N_OPTIONS];
    float *X = new float[N_OPTIONS];
    float *T = new float[N_OPTIONS];
--- a/examples/options/options.vcxproj
+++ b/examples/options/options.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -82,6 +82,8 @@
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <DisableSpecificWarnings>4305</DisableSpecificWarnings>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -96,6 +98,8 @@
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <DisableSpecificWarnings>4305</DisableSpecificWarnings>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -112,6 +116,7 @@
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <DisableSpecificWarnings>4305</DisableSpecificWarnings>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -130,6 +135,7 @@
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <DisableSpecificWarnings>4305</DisableSpecificWarnings>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -145,15 +151,15 @@
  <ItemGroup>
    <CustomBuild Include="options.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
 </Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
 </Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
--- a/examples/options/options_serial.cpp
+++ b/examples/options/options_serial.cpp
@@ -47,7 +47,7 @@ static inline float
 CND(float X) {
    float L = fabsf(X);

-    float k = 1.0 / (1.0 + 0.2316419 * L);
+    float k = 1.f / (1.f + 0.2316419f * L);
    float k2 = k*k;
    float k3 = k2*k;
    float k4 = k2*k2;
@@ -59,7 +59,7 @@ CND(float X) {
    w *= invSqrt2Pi * expf(-L * L * .5f);

    if (X > 0.f)
-        w = 1.0 - w;
+        w = 1.f - w;
    return w;
 }

@@ -94,7 +94,7 @@ binomial_put_serial(float Sa[], float Xa[], float Ta[],

        float dt = T / BINOMIAL_NUM;
        float u = expf(v * sqrtf(dt));
-        float d = 1. / u;
+        float d = 1.f / u;
        float disc = expf(r * dt);
        float Pu = (disc - d) / (u - d);

--- a/examples/rt/.gitignore
+++ b/examples/rt/.gitignore
@@ -0,0 +1,2 @@
+rt
+*.ppm
--- a/examples/rt/Makefile
+++ b/examples/rt/Makefile
@@ -1,6 +1,18 @@

-CXX=g++ -m64
-CXXFLAGS=-Iobjs/ -O3 -Wall
+# Host OS name, expanded once; Darwin builds tasks_gcd.cpp, others use pthreads.
+ARCH := $(shell uname)
+
+TASK_CXX=../tasks_pthreads.cpp
+TASK_LIB=-lpthread
+ifeq ($(ARCH), Darwin)
+  TASK_CXX=../tasks_gcd.cpp
+  TASK_LIB=
+endif
+# Map ../foo.cpp to objs/foo.o so it is built by the objs/%.o: ../%.cpp rule.
+TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
+
+CXX=g++
+CXXFLAGS=-Iobjs/ -O3 -Wall -m64
 ISPC=ispc
 ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64

@@ -14,11 +26,16 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ rt

-rt: dirs objs/rt.o objs/rt_serial.o objs/rt_ispc.o
-	$(CXX) $(CXXFLAGS) -o $@ objs/rt.o objs/rt_ispc.o objs/rt_serial.o -lm
+rt: dirs objs/rt.o objs/rt_serial.o objs/rt_ispc.o $(TASK_OBJ)
+	$(CXX) $(CXXFLAGS) -o $@ objs/rt.o objs/rt_ispc.o objs/rt_serial.o $(TASK_OBJ) -lm $(TASK_LIB)

-objs/%.o: %.cpp objs/rt_ispc.h
+objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@

+objs/%.o: ../%.cpp
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+objs/rt.o: objs/rt_ispc.h 
+
 objs/%_ispc.h objs/%_ispc.o: %.ispc
 	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/rt/rt.cpp
+++ b/examples/rt/rt.cpp
@@ -42,15 +42,18 @@
 #include <math.h>
 #include <algorithm>
 #include <assert.h>
+#include <string.h>
 #include <sys/types.h>
 #include "../timing.h"
+#include "../cpuid.h"
 #include "rt_ispc.h"

 using namespace ispc;

 typedef unsigned int uint;

-extern void raytrace_serial(int width, int height, const float raster2camera[4][4], 
+extern void raytrace_serial(int width, int height, int baseWidth, int baseHeight,
+                            const float raster2camera[4][4], 
                            const float camera2world[4][4], float image[],
                            int id[], const LinearBVHNode nodes[],
                            const Triangle triangles[]);
@@ -89,14 +92,66 @@ static void writeImage(int *idImage, float *depthImage, int width, int height,
        }
    }            
    fclose(f);
+    printf("Wrote image file %s\n", filename);
+}
+
+
+// Make sure that the vector ISA used during compilation is supported by
+// the processor.  The ISPC_TARGET_* macro is set in the ispc-generated
+// header file that we include above.
+static void
+ensureTargetISAIsSupported() {
+#if defined(ISPC_TARGET_SSE2)
+    bool isaSupported = CPUSupportsSSE2();
+    const char *target = "SSE2";
+#elif defined(ISPC_TARGET_SSE4)
+    bool isaSupported = CPUSupportsSSE4();
+    const char *target = "SSE4";
+#elif defined(ISPC_TARGET_AVX)
+    bool isaSupported = CPUSupportsAVX();
+    const char *target = "AVX";
+#else
+#error "Unknown ISPC_TARGET_* value"
+#endif
+    if (!isaSupported) {
+        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
+                "set, which isn't\n***        supported by this computer's CPU!\n", target);
+        fprintf(stderr, "***\n***        Please modify the "
+#ifdef _MSC_VER
+                "MSVC project file "
+#else
+                "Makefile "
+#endif
+                "to select another target (e.g. sse2)\n***\n");
+        exit(1);
+    }
+}
+
+
+static void usage() {
+    fprintf(stderr, "rt [--scale=<factor>] <scene name base>\n");
+    exit(1);
 }


 int main(int argc, char *argv[]) {
-    if (argc != 2) {
-        fprintf(stderr, "usage: rt <filename base>\n");
-        exit(1);
+    float scale = 1.f;
+    const char *filename = NULL;
+    for (int i = 1; i < argc; ++i) {   // one optional --scale=<factor>, exactly one scene name
+        if (strncmp(argv[i], "--scale=", 8) == 0) {
+            scale = atof(argv[i] + 8);
+            if (!(scale > 0.f))   // rejects 0 (atof's result for unparsable text), negatives and NaN
+                usage();
+        }
+        else if (filename != NULL)   // a second non-option argument is an error
+            usage();
+        else
+            filename = argv[i];
    }
+    if (filename == NULL)
+        usage();
+
+    ensureTargetISAIsSupported();

 #define READ(var, n)                                            \
    if (fread(&(var), sizeof(var), n, f) != (unsigned int)n) {  \
@@ -108,10 +163,10 @@ int main(int argc, char *argv[]) {
    // Read the camera specification information from the camera file
    //
    char fnbuf[1024];
-    sprintf(fnbuf, "%s.camera", argv[1]);
+    sprintf(fnbuf, "%s.camera", filename);
    FILE *f = fopen(fnbuf, "rb");
    if (!f) {
-        perror(argv[1]);
+        perror(fnbuf);
        return 1;
    }

@@ -119,20 +174,20 @@ int main(int argc, char *argv[]) {
    // Nothing fancy, and trouble if we run on a big-endian system, just
    // fread in the bits
    //
-    int width, height;
+    int baseWidth, baseHeight;
    float camera2world[4][4], raster2camera[4][4];
-    READ(width, 1);
-    READ(height, 1);
+    READ(baseWidth, 1);
+    READ(baseHeight, 1);
    READ(camera2world[0][0], 16);
    READ(raster2camera[0][0], 16);

    //
    // Read in the serialized BVH 
    //
-    sprintf(fnbuf, "%s.bvh", argv[1]);
+    sprintf(fnbuf, "%s.bvh", filename);
    f = fopen(fnbuf, "rb");
    if (!f) {
-        perror(argv[2]);
+        perror(fnbuf);
        return 1;
    }

@@ -155,7 +210,9 @@ int main(int argc, char *argv[]) {
        nodes[i].bounds[1].v[1] = b[4];
        nodes[i].bounds[1].v[2] = b[5];
        READ(nodes[i].offset, 1);
-        READ(nodes[i].primsAxis, 1);
+        READ(nodes[i].nPrimitives, 1);
+        READ(nodes[i].splitAxis, 1);
+        READ(nodes[i].pad, 1);
    }

    // And then read the triangles 
@@ -177,10 +234,10 @@ int main(int argc, char *argv[]) {
    }
    fclose(f);

-    // round image resolution up to multiple of 4 to makethings easy for
+    // round image resolution up to multiple of 16 to make things easy for
    // the code that assigns pixels to ispc program instances
-    height = (height + 3) & ~3;
-    width = (width + 3) & ~3;
+    int height = (int(baseHeight * scale) + 0xf) & ~0xf;
+    int width = (int(baseWidth * scale) + 0xf) & ~0xf;

    // allocate images; one to hold hit object ids, one to hold depth to
    // the first interseciton
@@ -188,19 +245,42 @@ int main(int argc, char *argv[]) {
    float *image = new float[width*height];

    //
-    // Run 3 iterations with ispc, record the minimum time
+    // Run 3 iterations with ispc + 1 core, record the minimum time
    //
    double minTimeISPC = 1e30;
    for (int i = 0; i < 3; ++i) {
        reset_and_start_timer();
-        raytrace(width, height, raster2camera, camera2world, 
-                 image, id, nodes, triangles);
+        raytrace_ispc(width, height, baseWidth, baseHeight, raster2camera, 
+                      camera2world, image, id, nodes, triangles);
        double dt = get_elapsed_mcycles();
        minTimeISPC = std::min(dt, minTimeISPC);
    }
-    printf("[rt ispc]:\t\t\t[%.3f] million cycles for %d x %d image\n", minTimeISPC, width, height);
+    printf("[rt ispc, 1 core]:\t\t[%.3f] million cycles for %d x %d image\n", 
+           minTimeISPC, width, height);

-    writeImage(id, image, width, height, "rt-ispc.ppm");
+    writeImage(id, image, width, height, "rt-ispc-1core.ppm");
+
+    memset(id, 0, width*height*sizeof(int));
+    memset(image, 0, width*height*sizeof(float));
+
+    //
+    // Run 3 iterations with ispc + tasks, record the minimum time
+    //
+    double minTimeISPCtasks = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        raytrace_ispc_tasks(width, height, baseWidth, baseHeight, raster2camera,
+                            camera2world, image, id, nodes, triangles);
+        double dt = get_elapsed_mcycles();
+        minTimeISPCtasks = std::min(dt, minTimeISPCtasks);
+    }
+    printf("[rt ispc + tasks]:\t\t[%.3f] million cycles for %d x %d image\n", 
+           minTimeISPCtasks, width, height);
+
+    writeImage(id, image, width, height, "rt-ispc-tasks.ppm");
+
+    memset(id, 0, width*height*sizeof(int));
+    memset(image, 0, width*height*sizeof(float));

    //
    // And 3 iterations with the serial implementation, reporting the
@@ -209,14 +289,15 @@ int main(int argc, char *argv[]) {
    double minTimeSerial = 1e30;
    for (int i = 0; i < 3; ++i) {
        reset_and_start_timer();
-        raytrace_serial(width, height, raster2camera, camera2world, 
-                        image, id, nodes, triangles);
+        raytrace_serial(width, height, baseWidth, baseHeight, raster2camera, 
+                        camera2world, image, id, nodes, triangles);
        double dt = get_elapsed_mcycles();
        minTimeSerial = std::min(dt, minTimeSerial);
    }
    printf("[rt serial]:\t\t\t[%.3f] million cycles for %d x %d image\n", 
           minTimeSerial, width, height);
-    printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minTimeSerial / minTimeISPC);
+    printf("\t\t\t\t(%.2fx speedup from ISPC, %.2f from ISPC + tasks)\n", 
+           minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCtasks);

    writeImage(id, image, width, height, "rt-serial.ppm");

--- a/examples/rt/rt.ispc
+++ b/examples/rt/rt.ispc
@@ -50,21 +50,11 @@ struct Triangle {
 struct LinearBVHNode {
    uniform float3 bounds[2];
    uniform unsigned int offset;     // num primitives for leaf, second child for interior
-    uniform unsigned int primsAxis;  // 0:7 nPrimitives, 8:15 split axis, 16:31 padding
+    uniform unsigned int8 nPrimitives;
+    uniform unsigned int8 splitAxis;
+    uniform unsigned int16 pad;
 };

-static inline uniform int nPrims(const reference LinearBVHNode node) {
-    return (node.primsAxis & 0xff);
-}
-
-static inline uniform int axis(const reference LinearBVHNode node) {
-    return ((node.primsAxis >> 8) & 0xff);
-}
-
-static inline uniform bool isInterior(const reference LinearBVHNode node) {
-    return nPrims(node) == 0;
-}
-
 static inline float3 Cross(const float3 v1, const float3 v2) {
    float v1x = v1.x, v1y = v1.y, v1z = v1.z;
    float v2x = v2.x, v2y = v2.y, v2z = v2.z;
@@ -199,7 +189,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
        // Check ray against BVH node
        LinearBVHNode node = nodes[nodeNum];
        if (any(BBoxIntersect(node.bounds, ray))) {
-            uniform unsigned int nPrimitives = nPrims(node);
+            uniform unsigned int nPrimitives = node.nPrimitives;
            if (nPrimitives > 0) {
                // Intersect ray with primitives in leaf BVH node
                uniform unsigned int primitivesOffset = node.offset;
@@ -213,7 +203,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
            }
            else {
                // Put far BVH node on _todo_ stack, advance to near node
-                if (r.dirIsNeg[axis(node)]) {
+                if (r.dirIsNeg[node.splitAxis]) {
                   todo[todoOffset++] = nodeNum + 1;
                   nodeNum = node.offset;
                }
@@ -236,20 +226,26 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
 }


-export void raytrace(uniform int width, uniform int height,
-                     const uniform float raster2camera[4][4], 
-                     const uniform float camera2world[4][4],
-                     uniform float image[], uniform int id[],
-                     const LinearBVHNode nodes[],
-                     const Triangle triangles[]) {
+static void raytrace_tile(uniform int x0, uniform int x1,
+                          uniform int y0, uniform int y1, 
+                          uniform int width, uniform int height,
+                          uniform int baseWidth, uniform int baseHeight,
+                          const uniform float raster2camera[4][4], 
+                          const uniform float camera2world[4][4],
+                          uniform float image[], uniform int id[],
+                          const LinearBVHNode nodes[],
+                          const Triangle triangles[]) {
+    uniform float widthScale = (float)(baseWidth) / (float)(width);
+    uniform float heightScale = (float)(baseHeight) / (float)(height);
+
    static const uniform float udx[16] = { 0, 1, 0, 1, 2, 3, 2, 3, 
                                           0, 1, 0, 1, 2, 3, 2, 3 };
    static const uniform float udy[16] = { 0, 0, 1, 1, 0, 0, 1, 1, 
                                           2, 2, 3, 3, 2, 2, 3, 3 };

    // The outer loops are always over blocks of 4x4 pixels
-    for (uniform int y = 0; y < height; y += 4) {
-        for (uniform int x = 0; x < width; x += 4) {
+    for (uniform int y = y0; y < y1; y += 4) {
+        for (uniform int x = x0; x < x1; x += 4) {
            // Now we have a block of 4x4=16 pixels to process; it will
            // take 16/programCount iterations of this loop to process
            // them.
@@ -261,7 +257,8 @@ export void raytrace(uniform int width, uniform int height,
                const float dy = udy[o * programCount + programIndex];

                Ray ray;
-                generateRay(raster2camera, camera2world, x+dx, y+dy, ray);
+                generateRay(raster2camera, camera2world, (x+dx)*widthScale,
+                            (y+dy)*heightScale, ray);
                BVHIntersect(nodes, triangles, ray);

                int offset = (y + (int)dy) * width + (x + (int)dx);
@@ -271,3 +268,51 @@ export void raytrace(uniform int width, uniform int height,
        }
    }
 }
+
+
+export void raytrace_ispc(uniform int width, uniform int height,
+                          uniform int baseWidth, uniform int baseHeight,
+                          const uniform float raster2camera[4][4], 
+                          const uniform float camera2world[4][4],
+                          uniform float image[], uniform int id[],
+                          const LinearBVHNode nodes[],
+                          const Triangle triangles[]) {
+    raytrace_tile(0, width, 0, height, width, height, baseWidth, baseHeight,
+                  raster2camera, camera2world, image,
+                  id, nodes, triangles);
+}
+
+
+task void raytrace_tile_task(uniform int x0, uniform int x1,
+                             uniform int y0, uniform int y1, 
+                             uniform int width, uniform int height,
+                             uniform int baseWidth, uniform int baseHeight,
+                             const uniform float raster2camera[4][4], 
+                             const uniform float camera2world[4][4],
+                             uniform float image[], uniform int id[],
+                             const LinearBVHNode nodes[],
+                             const Triangle triangles[]) {
+    raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight, 
+                  raster2camera, camera2world, image,
+                  id, nodes, triangles);
+}
+
+
+export void raytrace_ispc_tasks(uniform int width, uniform int height,
+                                uniform int baseWidth, uniform int baseHeight,
+                                const uniform float raster2camera[4][4], 
+                                const uniform float camera2world[4][4],
+                                uniform float image[], uniform int id[],
+                                const LinearBVHNode nodes[],
+                                const Triangle triangles[]) {
+    uniform int dx = 16, dy = 16;
+    for (uniform int y = 0; y < height; y += dy) {
+        uniform int y1 = min(y + dy, height);
+        for (uniform int x = 0; x < width; x += dx) {
+            uniform int x1 = min(x + dx, width);
+            launch < raytrace_tile_task(x, x1, y, y1, width, height, baseWidth,
+                                        baseHeight, raster2camera, camera2world, 
+                                        image, id, nodes, triangles) >;
+         }
+    }
+}
--- a/examples/rt/rt.vcxproj
+++ b/examples/rt/rt.vcxproj
@@ -81,6 +81,8 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -94,6 +96,8 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -109,6 +113,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -126,6 +131,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -138,18 +144,18 @@
    <CustomBuild Include="rt.ispc">
      <FileType>Document</FileType>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
 </Command>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
 </Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
 </Command>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
 </Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
@@ -158,6 +164,7 @@ cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h
  <ItemGroup>
    <ClCompile Include="rt.cpp" />
    <ClCompile Include="rt_serial.cpp" />
+    <ClCompile Include="../tasks_concrt.cpp" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
--- a/examples/rt/rt_serial.cpp
+++ b/examples/rt/rt_serial.cpp
@@ -39,6 +39,7 @@
 #endif

 #include <algorithm>
+#include <stdint.h>

 // Just enough of a float3 class to do what we need in this file.
 #ifdef _MSC_VER
@@ -75,30 +76,20 @@ struct Ray {
 namespace ispc {
    struct Triangle {
        float3 p[3];
-        int id;
+        int32_t id;
    };

    struct LinearBVHNode {
        float3 bounds[2];
-        unsigned int offset;     // primitives for leaf, second child for interior
-        unsigned int primsAxis;  // 0:7 nPrimitives, 8:15 split axis, 16:31 padding
+        int32_t offset;     // primitives for leaf, second child for interior
+        uint8_t nPrimitives;
+        uint8_t splitAxis;
+        uint16_t pad;
    };
 }

 using namespace ispc;

-inline int nPrims(const LinearBVHNode &node) {
-    return (node.primsAxis & 0xff);
-}
-
-inline int axis(const LinearBVHNode &node) {
-    return ((node.primsAxis >> 8) & 0xff);
-}
-
-inline bool isInterior(const LinearBVHNode &node) {
-    return nPrims(node) == 0;
-}
-
 inline float3 Cross(const float3 &v1, const float3 &v2) {
    float v1x = v1.x, v1y = v1.y, v1z = v1.z;
    float v2x = v2.x, v2y = v2.y, v2z = v2.z;
@@ -230,7 +221,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
        // Check ray against BVH node
        const LinearBVHNode &node = nodes[nodeNum];
        if (BBoxIntersect(node.bounds, ray)) {
-            unsigned int nPrimitives = nPrims(node);
+            unsigned int nPrimitives = node.nPrimitives;
            if (nPrimitives > 0) {
                // Intersect ray with primitives in leaf BVH node
                unsigned int primitivesOffset = node.offset;
@@ -244,7 +235,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
            }
            else {
                // Put far BVH node on _todo_ stack, advance to near node
-                if (r.dirIsNeg[axis(node)]) {
+                if (r.dirIsNeg[node.splitAxis]) {
                   todo[todoOffset++] = nodeNum + 1;
                   nodeNum = node.offset;
                }
@@ -267,17 +258,21 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
 }


-void raytrace_serial(int width, int height,
+void raytrace_serial(int width, int height, int baseWidth, int baseHeight,
                     const float raster2camera[4][4], 
                     const float camera2world[4][4],
                     float image[],
                     int id[],
                     const LinearBVHNode nodes[],
                     const Triangle triangles[]) {
+    float widthScale = float(baseWidth) / float(width);
+    float heightScale = float(baseHeight) / float(height);
+
    for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x) {
                Ray ray;
-                generateRay(raster2camera, camera2world, x, y, ray);
+                generateRay(raster2camera, camera2world, x * widthScale,
+                            y * heightScale, ray);
                BVHIntersect(nodes, triangles, ray);

                int offset = y * width + x;
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -32,12 +32,48 @@
 */

 #include <stdio.h>
+#include <stdlib.h>
+#include "../cpuid.h"

 // Include the header file that the ispc compiler generates
 #include "simple_ispc.h"
 using namespace ispc;

+// Make sure that the vector ISA used during compilation is supported by
+// the processor.  The ISPC_TARGET_* macro is set in the ispc-generated
+// header file that we include above.
+static void
+ensureTargetISAIsSupported() {
+#if defined(ISPC_TARGET_SSE2)
+    bool isaSupported = CPUSupportsSSE2();
+    const char *target = "SSE2";
+#elif defined(ISPC_TARGET_SSE4)
+    bool isaSupported = CPUSupportsSSE4();
+    const char *target = "SSE4";
+#elif defined(ISPC_TARGET_AVX)
+    bool isaSupported = CPUSupportsAVX();
+    const char *target = "AVX";
+#else
+#error "Unknown ISPC_TARGET_* value"
+#endif
+    if (!isaSupported) {
+        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
+                "set, which isn't\n***        supported by this computer's CPU!\n", target);
+        fprintf(stderr, "***\n***        Please modify the "
+#ifdef _MSC_VER
+                "MSVC project file "
+#else
+                "Makefile "
+#endif
+                "to select another target (e.g. sse2)\n***\n");
+        exit(1);
+    }
+}
+
+
 int main() {
+    ensureTargetISAIsSupported();
+
    float vin[16], vout[16];

    // Initialize input buffer
--- a/examples/simple/simple.vcxproj
+++ b/examples/simple/simple.vcxproj
@@ -25,18 +25,18 @@
    <CustomBuild Include="simple.ispc">
      <FileType>Document</FileType>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
 </Command>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-cl /E /TP %(Filename).ispc | ispc -O2 -o %(Filename).obj -h %(Filename)_ispc.h
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
 </Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
 </Command>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
 </Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
--- a/examples/stencil/.gitignore
+++ b/examples/stencil/.gitignore
@@ -0,0 +1,2 @@
+stencil
+objs
--- a/examples/stencil/Makefile
+++ b/examples/stencil/Makefile
@@ -0,0 +1,41 @@
+# Makefile for the stencil example: ispc kernel, serial reference, and task system.
+ARCH := $(shell uname)
+# Task-system source and link library; the pthreads defaults are overridden on Darwin.
+TASK_CXX=../tasks_pthreads.cpp
+TASK_LIB=-lpthread
+
+ifeq ($(ARCH), Darwin)
+  TASK_CXX=../tasks_gcd.cpp
+  TASK_LIB=
+endif
+# The task-system object is built into objs/ although its source lives in ../
+TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
+
+CXX=g++
+CXXFLAGS=-Iobjs/ -O3 -Wall -m64
+ISPC=ispc
+ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
+
+default: stencil
+
+.PHONY: default dirs clean
+
+dirs:
+	/bin/mkdir -p objs/
+
+clean:
+	/bin/rm -rf objs *~ stencil
+
+stencil: objs/stencil.o objs/stencil_serial.o objs/stencil_ispc.o $(TASK_OBJ)
+	$(CXX) $(CXXFLAGS) -o $@ $^ -lm $(TASK_LIB)
+# dirs is order-only: objs/ must exist before compiling, but must never force a rebuild.
+objs/%.o: %.cpp | dirs
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+objs/%.o: ../%.cpp | dirs
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+# stencil.cpp includes the ispc-generated header, so that header must be built first.
+objs/stencil.o: objs/stencil_ispc.h
+# A single ispc invocation produces both the object file and the header.
+objs/%_ispc.h objs/%_ispc.o: %.ispc | dirs
+	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/stencil/stencil.cpp
+++ b/examples/stencil/stencil.cpp
@@ -0,0 +1,186 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define NOMINMAX
+#pragma warning (disable: 4244)
+#pragma warning (disable: 4305)
+#endif
+
+#include <stdio.h>
+#include <algorithm>
+#include <math.h>
+#include "../timing.h"
+#include "../cpuid.h"
+#include "stencil_ispc.h"
+using namespace ispc;
+
+
+// Make sure that the vector ISA used during compilation is supported by
+// the processor.  The ISPC_TARGET_* macro is set in the ispc-generated
+// header file that we include above.
+static void
+ensureTargetISAIsSupported() {
+#if defined(ISPC_TARGET_SSE2)
+    bool isaSupported = CPUSupportsSSE2();
+    const char *target = "SSE2";
+#elif defined(ISPC_TARGET_SSE4)
+    bool isaSupported = CPUSupportsSSE4();
+    const char *target = "SSE4";
+#elif defined(ISPC_TARGET_AVX)
+    bool isaSupported = CPUSupportsAVX();
+    const char *target = "AVX";
+#else
+#error "Unknown ISPC_TARGET_* value"
+#endif
+    if (!isaSupported) {
+        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
+                "set, which isn't\n***        supported by this computer's CPU!\n", target);
+        fprintf(stderr, "***\n***        Please modify the "
+#ifdef _MSC_VER
+                "MSVC project file "
+#else
+                "Makefile "
+#endif
+                "to select another target (e.g. sse2)\n***\n");
+        exit(1);
+    }
+}
+
+
+extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
+                                int y0, int y1, int z0, int z1,
+                                int Nx, int Ny, int Nz,
+                                const float coef[4],  // 4 coefficients, matching the array main() passes
+                                const float vsq[],
+                                float Aeven[], float Aodd[]);
+
+
+void InitData(int Nx, int Ny, int Nz, float *A[2], float *vsq) {
+    int offset = 0;
+    for (int z = 0; z < Nz; ++z)
+        for (int y = 0; y < Ny; ++y)
+            for (int x = 0; x < Nx; ++x, ++offset) {
+                A[0][offset] = (x < Nx / 2) ? x / float(Nx) : y / float(Ny);
+                A[1][offset] = 0;
+                vsq[offset] = x*y*z / float(Nx * Ny * Nz);
+            }
+}
+
+
+int main() {  // times ispc (1 core), ispc + tasks, and serial stencil runs, then compares results
+    ensureTargetISAIsSupported();
+
+    int Nx = 256, Ny = 256, Nz = 256;
+    int width = 4;  // margin left off each face: the update range is [width, N - width) per axis
+    float *Aserial[2], *Aispc[2];
+    Aserial[0] = new float [Nx * Ny * Nz];
+    Aserial[1] = new float [Nx * Ny * Nz];
+    Aispc[0] = new float [Nx * Ny * Nz];
+    Aispc[1] = new float [Nx * Ny * Nz];
+    float *vsq = new float [Nx * Ny * Nz];
+
+    float coeff[4] = { 0.5, -.25, .125, -.0625 };
+
+    InitData(Nx, Ny, Nz, Aispc, vsq);
+
+    //
+    // Run the stencil computation using the ispc implementation on one
+    // core; report the minimum time of three runs.
+    //
+    double minTimeISPC = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
+                          width, Nz - width, Nx, Ny, Nz, coeff, vsq,
+                          Aispc[0], Aispc[1]);
+        double dt = get_elapsed_mcycles();
+        minTimeISPC = std::min(minTimeISPC, dt);
+    }
+
+    printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC);
+
+    InitData(Nx, Ny, Nz, Aispc, vsq);  // reset the ispc buffers before the tasks runs
+
+    //
+    // Run the stencil computation using the ispc implementation with
+    // tasks; report the minimum time of three runs.
+    //
+    double minTimeISPCTasks = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
+                                width, Nz - width, Nx, Ny, Nz, coeff, vsq,
+                                Aispc[0], Aispc[1]);
+        double dt = get_elapsed_mcycles();
+        minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
+    }
+
+    printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks);
+
+    InitData(Nx, Ny, Nz, Aserial, vsq);
+
+    //
+    // And run the serial implementation 3 times, again reporting the
+    // minimum time.
+    //
+    double minTimeSerial = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width,
+                            width, Nz - width, Nx, Ny, Nz, coeff, vsq,
+                            Aserial[0], Aserial[1]);
+        double dt = get_elapsed_mcycles();
+        minTimeSerial = std::min(minTimeSerial, dt);
+    }
+
+    printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial);
+
+    printf("\t\t\t\t(%.2fx speedup from ISPC, %.2f from ISPC + tasks)\n", 
+           minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
+
+    // Check for agreement: relative error between the serial and ispc results
+    int offset = 0;
+    for (int z = 0; z < Nz; ++z)
+        for (int y = 0; y < Ny; ++y)
+            for (int x = 0; x < Nx; ++x, ++offset) {
+                float error = fabsf((Aserial[1][offset] - Aispc[1][offset]) /
+                                    Aserial[1][offset]);
+                if (error > 1e-4)
+                    printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n",
+                           x, y, z, Aispc[1][offset], Aserial[1][offset]);
+            }
+
+    return 0;
+}
--- a/examples/stencil/stencil.ispc
+++ b/examples/stencil/stencil.ispc
@@ -0,0 +1,129 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+
+// Advance one time step of the stencil over the sub-block
+// [x0,x1) x [y0,y1) x [z0,z1) of a volume whose rows are Nx elements long
+// and whose z-planes hold Nx*Ny elements.
+//
+// For each point, "div" is the coef[]-weighted sum of the point itself
+// (coef[0]) and of its six neighbors at distance 1, 2 and 3 along each axis
+// (coef[1..3]), all read from Ain.  Aout is then updated in place as
+//     Aout = 2 * Ain - Aout + vsq * div
+// so Aout is read as well as written.
+//
+// Neighbors up to 3 elements away are read without bounds checks, so the
+// sub-block must stay at least 3 elements inside the arrays on every side.
+// Nz is accepted but not used here.
+static void
+stencil_step(uniform int x0, uniform int x1,
+             uniform int y0, uniform int y1,
+             uniform int z0, uniform int z1,
+             uniform int Nx, uniform int Ny, uniform int Nz,
+             uniform const float coef[4], uniform const float vsq[],
+             uniform const float Ain[], uniform float Aout[]) {
+    const uniform int Nxy = Nx * Ny;
+
+    for (uniform int z = z0; z < z1; ++z) {
+        for (uniform int y = y0; y < y1; ++y) {
+            // Assumes that (x1-x0) % programCount == 0
+            for (uniform int x = x0; x < x1; x += programCount) {
+                // Each program instance handles one of programCount
+                // consecutive x positions.
+                int index = (z * Nxy) + (y * Nx) + x + programIndex;
+// Accessors relative to the current point; both rely on "index" above.
+#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
+#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
+                float div = coef[0] * A_cur(0, 0, 0) +
+                            coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
+                                       A_cur(0, +1, 0) + A_cur(0, -1, 0) +
+                                       A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
+                            coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
+                                       A_cur(0, +2, 0) + A_cur(0, -2, 0) +
+                                       A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
+                            coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
+                                       A_cur(0, +3, 0) + A_cur(0, -3, 0) +
+                                       A_cur(0, 0, +3) + A_cur(0, 0, -3));
+
+                A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) + 
+                    vsq[index] * div;
+// The accessors only make sense where "index" is in scope; don't let
+// them leak into the rest of the file.
+#undef A_cur
+#undef A_next
+            }
+        }
+    }
+}
+
+
+static task void   // task-qualified wrapper so stencil_step() can be started with "launch"
+stencil_step_task(uniform int x0, uniform int x1,
+                  uniform int y0, uniform int y1,
+                  uniform int z0, uniform int z1,
+                  uniform int Nx, uniform int Ny, uniform int Nz,
+                  uniform const float coef[4], uniform const float vsq[],
+                  uniform const float Ain[], uniform float Aout[]) {
+    stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq, Ain, Aout);  // every argument is forwarded unchanged
+}
+
+
+// Run time steps t0 .. t1-1 of the stencil, splitting every step into tasks
+// so that it is spread across cores.  Even-numbered steps read Aeven and
+// update Aodd; odd-numbered steps read Aodd and update Aeven.
+export void
+loop_stencil_ispc_tasks(uniform int t0, uniform int t1, 
+                        uniform int x0, uniform int x1,
+                        uniform int y0, uniform int y1,
+                        uniform int z0, uniform int z1,
+                        uniform int Nx, uniform int Ny, uniform int Nz,
+                        uniform const float coef[4], 
+                        uniform const float vsq[],
+                        uniform float Aeven[], uniform float Aodd[])
+{
+    for (uniform int step = t0; step < t1; ++step) {
+        // Each task works on a slab of "slabDepth" planes in the z extent
+        // of the volume.  (A depth of 1 seems to work better than any
+        // larger value.)
+        uniform int slabDepth = 1;
+        for (uniform int zBegin = z0; zBegin < z1; zBegin += slabDepth) {
+            uniform int zEnd = zBegin + slabDepth;
+            if ((step & 1) != 0)
+                // Odd step: Aodd is the input, Aeven is updated.
+                launch < stencil_step_task(x0, x1, y0, y1, zBegin, zEnd,
+                                           Nx, Ny, Nz, coef, vsq,
+                                           Aodd, Aeven) >;
+            else
+                // Even step: Aeven is the input, Aodd is updated.
+                launch < stencil_step_task(x0, x1, y0, y1, zBegin, zEnd,
+                                           Nx, Ny, Nz, coef, vsq,
+                                           Aeven, Aodd) >;
+        }
+        // All tasks of this step must have finished before the next step
+        // starts, since it consumes what they wrote.
+        sync;
+    }
+}
+
+
+// Run time steps t0 .. t1-1 of the stencil without tasks.  Even-numbered
+// steps read Aeven and update Aodd; odd-numbered steps read Aodd and update
+// Aeven.
+export void
+loop_stencil_ispc(uniform int t0, uniform int t1, 
+                  uniform int x0, uniform int x1,
+                  uniform int y0, uniform int y1,
+                  uniform int z0, uniform int z1,
+                  uniform int Nx, uniform int Ny, uniform int Nz,
+                  uniform const float coef[4], 
+                  uniform const float vsq[],
+                  uniform float Aeven[], uniform float Aodd[])
+{
+    for (uniform int step = t0; step < t1; ++step) {
+        if ((step & 1) != 0) {
+            // Odd step: Aodd is the input, Aeven is updated.
+            stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
+                         Aodd, Aeven);
+        }
+        else {
+            // Even step: Aeven is the input, Aodd is updated.
+            stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
+                         Aeven, Aodd);
+        }
+    }
+}
--- a/examples/stencil/stencil.vcxproj
+++ b/examples/stencil/stencil.vcxproj
@@ -0,0 +1,172 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{2ef070a1-f62f-4e6a-944b-88d140945c3c}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>rt</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CustomBuild Include="stencil.ispc">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="stencil.cpp" />
+    <ClCompile Include="stencil_serial.cpp" />
+    <ClCompile Include="../tasks_concrt.cpp" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/examples/stencil/stencil_serial.cpp
+++ b/examples/stencil/stencil_serial.cpp
@@ -0,0 +1,86 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+
+// Serial reference: advance one time step of the stencil over the sub-block
+// [x0,x1) x [y0,y1) x [z0,z1) of a volume whose rows are Nx elements long
+// and whose z-planes hold Nx*Ny elements.
+//
+// For each point, "div" is the coef[]-weighted sum of the point itself
+// (coef[0]) and of its six neighbors at distance 1, 2 and 3 along each axis
+// (coef[1..3]), all read from Ain.  Aout is then updated in place as
+//     Aout = 2 * Ain - Aout + vsq * div
+// so Aout is read as well as written.
+//
+// Neighbors up to 3 elements away are read without bounds checks, so the
+// sub-block must stay at least 3 elements inside the arrays on every side.
+// Nz is accepted but not used here.
+static void
+stencil_step(int x0, int x1,
+             int y0, int y1,
+             int z0, int z1,
+             int Nx, int Ny, int Nz,
+             const float coef[4], const float vsq[],
+             const float Ain[], float Aout[]) {
+    int Nxy = Nx * Ny;
+
+    for (int z = z0; z < z1; ++z) {
+        for (int y = y0; y < y1; ++y) {
+            for (int x = x0; x < x1; ++x) {
+                int index = (z * Nxy) + (y * Nx) + x;
+// Accessors relative to the current point; both rely on "index" above.
+#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
+#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
+                float div = coef[0] * A_cur(0, 0, 0) +
+                            coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
+                                       A_cur(0, +1, 0) + A_cur(0, -1, 0) +
+                                       A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
+                            coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
+                                       A_cur(0, +2, 0) + A_cur(0, -2, 0) +
+                                       A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
+                            coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
+                                       A_cur(0, +3, 0) + A_cur(0, -3, 0) +
+                                       A_cur(0, 0, +3) + A_cur(0, 0, -3));
+
+                A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) + 
+                    vsq[index] * div;
+// The accessors only make sense where "index" is in scope; don't let
+// them leak into the rest of the file.
+#undef A_cur
+#undef A_next
+            }
+        }
+    }
+}
+
+
+// Serial reference driver: run time steps t0 .. t1-1 of the stencil.
+// Even-numbered steps read Aeven and update Aodd; odd-numbered steps read
+// Aodd and update Aeven.
+void loop_stencil_serial(int t0, int t1, 
+                         int x0, int x1,
+                         int y0, int y1,
+                         int z0, int z1,
+                         int Nx, int Ny, int Nz,
+                         const float coef[4], 
+                         const float vsq[],
+                         float Aeven[], float Aodd[])
+{
+    for (int step = t0; step < t1; ++step) {
+        // Pick the input and output buffers from the parity of the step.
+        const bool oddStep = ((step & 1) != 0);
+        const float *input = oddStep ? Aodd : Aeven;
+        float *output = oddStep ? Aeven : Aodd;
+
+        stencil_step(x0, x1, y0, y1, z0, z1, Nx, Ny, Nz, coef, vsq,
+                     input, output);
+    }
+}
--- a/examples/taskinfo.h
+++ b/examples/taskinfo.h
@@ -0,0 +1,180 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifndef TASKINFO_H
+#define TASKINFO_H 1
+
+#ifdef _MSC_VER
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#ifdef ISPC_IS_WINDOWS
+#define NOMINMAX
+#include <windows.h>
+#include <concrt.h>
+using namespace Concurrency;
+#endif // ISPC_IS_WINDOWS
+
+// Size of a pointer in bytes; selects the compare-and-swap instruction width
+// used by lAtomicCompareAndSwapPointer().
+//
+// The compiler-provided __SIZEOF_POINTER__ is trusted first.  Among the
+// fallback macros the 64-bit ones must be tested before _WIN32, because
+// _WIN32 is also defined when targeting 64-bit Windows and would otherwise
+// select 4-byte pointers there.
+#if (__SIZEOF_POINTER__ == 4)
+#define ISPC_POINTER_BYTES 4
+#elif (__SIZEOF_POINTER__ == 8) || defined(__x86_64__) || defined(__amd64__) || defined(_WIN64)
+#define ISPC_POINTER_BYTES 8
+#elif defined(__i386__) || defined(_WIN32)
+#define ISPC_POINTER_BYTES 4
+#else
+#error "Pointer size unknown!"
+#endif // __SIZEOF_POINTER__
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+
+typedef struct TaskInfo {   // one launched task: what to call and with which argument
+    void *func;   // task entry point, stored untyped; cast to TaskFuncType before it is called
+    void *data;   // passed as the first argument to func
+#if defined(ISPC_IS_WINDOWS)
+    event taskEvent;   // Concurrency Runtime event, only present in Windows builds
+#endif
+} TaskInfo;
+
+
+#ifndef ISPC_IS_WINDOWS
+static int32_t   // returns the value *v held before the operation
+lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) {   // stores newValue into *v only if *v == oldValue
+    int32_t result;
+    __asm__ __volatile__("lock\ncmpxchgl %2,%1"   // x86 locked 32-bit compare-and-exchange
+                          : "=a"(result), "=m"(*v)   // eax comes back holding the previous *v
+                          : "q"(newValue), "0"(oldValue)   // "0" ties oldValue to eax, the value cmpxchg compares against
+                          : "memory");
+    __asm__ __volatile__("mfence":::"memory");   // memory fence after the exchange
+    return result;
+}
+#endif // !ISPC_IS_WINDOWS
+
+
+// Atomic compare-and-swap on a pointer-sized location: if *v equals
+// oldValue, newValue is stored into *v.  Either way the value *v held
+// before the operation is returned, so the swap succeeded exactly when the
+// return value equals oldValue.
+//
+// Windows builds use InterlockedCompareExchangePointer(); other builds use
+// a locked x86 cmpxchg whose operand width follows ISPC_POINTER_BYTES.
+static void *
+lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
+#ifdef ISPC_IS_WINDOWS
+    return InterlockedCompareExchangePointer(v, newValue, oldValue);
+#else
+    void *result;
+#if (ISPC_POINTER_BYTES == 4)
+    // 32-bit pointers: AT&T syntax spells the 32-bit form "cmpxchgl"
+    // (there is no "cmpxchgd" mnemonic).
+    __asm__ __volatile__("lock\ncmpxchgl %2,%1"
+                          : "=a"(result), "=m"(*v)
+                          : "q"(newValue), "0"(oldValue)
+                          : "memory");
+#else
+    // 64-bit pointers.
+    __asm__ __volatile__("lock\ncmpxchgq %2,%1"
+                          : "=a"(result), "=m"(*v)
+                          : "q"(newValue), "0"(oldValue)
+                          : "memory");
+#endif // ISPC_POINTER_BYTES
+    // Memory fence after the exchange.
+    __asm__ __volatile__("mfence":::"memory");
+    return result;
+#endif // ISPC_IS_WINDOWS
+}
+
+
+#ifndef ISPC_IS_WINDOWS
+static int32_t   // returns the value *v held before the addition
+lAtomicAdd32(volatile int32_t *v, int32_t delta) {
+    // Atomically add delta to *v using gcc x86 inline assembly (locked xadd)
+    int32_t origValue;
+    __asm__ __volatile__("lock\n"
+                         "xaddl %0,%1"   // exchange-and-add: *v += register, register receives the old *v
+                         : "=r"(origValue), "=m"(*v) : "0"(delta)   // "0" places delta in the same register as origValue
+                         : "memory");
+    return origValue;
+}
+#endif
+
+#define LOG_TASK_QUEUE_CHUNK_SIZE 13   // log2 of the number of TaskInfo entries in one chunk
+#define MAX_TASK_QUEUE_CHUNKS 1024   // number of chunk slots in taskInfo[]
+#define TASK_QUEUE_CHUNK_SIZE (1<<LOG_TASK_QUEUE_CHUNK_SIZE)   // 8192 entries per chunk
+
+#define MAX_LAUNCHED_TASKS (MAX_TASK_QUEUE_CHUNKS * TASK_QUEUE_CHUNK_SIZE)   // 1024 * 8192 entries in total
+
+typedef void (*TaskFuncType)(void *, int, int);   // signature a TaskInfo's func is cast to before being called
+
+#ifdef ISPC_IS_WINDOWS
+static volatile LONG nextTaskInfoCoordinate;   // LONG so it can be passed to InterlockedAdd()
+#else
+static volatile int nextTaskInfoCoordinate;   // linear index of the next TaskInfo entry to hand out
+#endif
+
+static TaskInfo *taskInfo[MAX_TASK_QUEUE_CHUNKS];   // chunk table; each non-NULL slot points to TASK_QUEUE_CHUNK_SIZE entries
+
+static inline void   // allocates the first chunk of TaskInfo entries up front
+lInitTaskInfo() {
+    taskInfo[0] = new TaskInfo[TASK_QUEUE_CHUNK_SIZE];   // further chunks are created on demand by lGetTaskInfo()
+}
+
+
+// Hand out the next unused TaskInfo entry.
+//
+// An atomic increment of nextTaskInfoCoordinate yields a linear entry
+// number, which is split into a chunk index (high bits) and an offset
+// within that chunk (low LOG_TASK_QUEUE_CHUNK_SIZE bits).  Chunks are
+// allocated lazily; when several threads race to create the same chunk, a
+// compare-and-swap picks one winner and the losers discard their
+// allocation.
+//
+// Returns a pointer to the entry.  Prints a message and exits the process
+// once every one of the MAX_TASK_QUEUE_CHUNKS chunks has been used up.
+static inline TaskInfo *
+lGetTaskInfo() {
+#ifdef ISPC_IS_WINDOWS
+    // InterlockedAdd() returns the incremented value; subtract 1 to get the
+    // entry number this caller owns.
+    int myCoord = InterlockedAdd(&nextTaskInfoCoordinate, 1)-1;
+#else
+    // lAtomicAdd32() returns the value before the increment.
+    int myCoord = lAtomicAdd32(&nextTaskInfoCoordinate, 1);
+#endif
+    int index = (myCoord >> LOG_TASK_QUEUE_CHUNK_SIZE);
+    int offset = myCoord & (TASK_QUEUE_CHUNK_SIZE-1);
+    // ">=" rather than "==": other threads may already have pushed the
+    // counter past the limit, and any such index is outside taskInfo[].
+    if (index >= MAX_TASK_QUEUE_CHUNKS) {
+        fprintf(stderr, "A total of %d tasks have been launched--the simple "
+                "built-in task system can handle no more. Exiting.", myCoord);
+        exit(1);
+    }
+
+    if (taskInfo[index] == NULL) {
+        TaskInfo *newChunk = new TaskInfo[TASK_QUEUE_CHUNK_SIZE];
+        if (lAtomicCompareAndSwapPointer((void **)&taskInfo[index], newChunk, 
+                                         NULL) != NULL) {
+            // failure--someone else got it, but that's cool
+            assert(taskInfo[index] != NULL);
+            // newChunk came from new[], so it must be released with
+            // delete[] (not free()).
+            delete[] newChunk;
+        }
+    }
+
+    return &taskInfo[index][offset];
+}
+
+
+static inline void   // makes lGetTaskInfo() start handing out entries from the beginning again
+lResetTaskInfo() {
+    nextTaskInfoCoordinate = 0;   // plain store; chunks already allocated stay in taskInfo[] and are reused
+}
+
+#endif // TASKINFO_H
--- a/examples/mandelbrot_tasks/tasks_concrt.cpp
+++ b/examples/mandelbrot_tasks/tasks_concrt.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2011, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -31,42 +31,26 @@
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */

+#include "taskinfo.h"
+
 /* Simple task system implementation for ispc based on Microsoft's
   Concurrency Runtime. */

 #include <windows.h>
 #include <concrt.h>
 using namespace Concurrency;
+#include <stdint.h>
 #include <assert.h>
 #include <stdio.h>
+#include <stdlib.h>
+#include <algorithm>

 // ispc expects these functions to have C linkage / not be mangled
 extern "C" { 
    void ISPCLaunch(void *f, void *data);
    void ISPCSync();
-}
-
-typedef void (*TaskFuncType)(void *, int, int);
-
-struct TaskInfo {
-    TaskFuncType ispcFunc;
-    void *ispcData;
-};
-
-// This is a simple implementation that just aborts if more than MAX_TASKS
-// are launched.  It could easily be extended to be more general...
-
-#define MAX_TASKS 4096
-static int taskOffset;
-static TaskInfo taskInfo[MAX_TASKS];
-static event *events[MAX_TASKS];
-static CRITICAL_SECTION criticalSection;
-
-void
-TasksInit() {
-    InitializeCriticalSection(&criticalSection);
-    for (int i = 0; i < MAX_TASKS; ++i)
-        events[i] = new event;
+    void *ISPCMalloc(int64_t size, int32_t alignment);
+    void ISPCFree(void *ptr);
 }


@@ -75,41 +59,46 @@ lRunTask(LPVOID param) {
    TaskInfo *ti = (TaskInfo *)param;
    
    // Actually run the task. 
-    // FIXME: like the tasks_gcd.cpp implementation, this is passing bogus
+    // FIXME: like the GCD implementation for OS X, this is passing bogus
    // values for the threadIndex and threadCount builtins, which in turn
-    // will cause bugs in code that uses those.  FWIW this example doesn't
-    // use them...
+    // will cause bugs in code that uses those.
    int threadIndex = 0;
    int threadCount = 1;
-    ti->ispcFunc(ti->ispcData, threadIndex, threadCount);
+    TaskFuncType func = (TaskFuncType)ti->func;
+    func(ti->data, threadIndex, threadCount);

    // Signal the event that this task is done
-    int taskNum = ti - &taskInfo[0];
-    events[taskNum]->set();
+    ti->taskEvent.set();
 }


 void
 ISPCLaunch(void *func, void *data) {
-    // Get a TaskInfo struct for this task
-    EnterCriticalSection(&criticalSection);
-    TaskInfo *ti = &taskInfo[taskOffset++];
-    assert(taskOffset < MAX_TASKS);
-    LeaveCriticalSection(&criticalSection);
-
-    // And pass it on to the Concurrency Runtime...
-    ti->ispcFunc = (TaskFuncType)func;
-    ti->ispcData = data;
+    TaskInfo *ti = lGetTaskInfo();   // claim the next free entry of the shared task-info table
+    ti->func = (TaskFuncType)func;
+    ti->data = data;
+	ti->taskEvent.reset();   // clear the completion event before the task is scheduled
     CurrentScheduler::ScheduleTask(lRunTask, ti);
 }


 void ISPCSync() {
-    event::wait_for_multiple(&events[0], taskOffset, true, 
-                             COOPERATIVE_TIMEOUT_INFINITE);
+    for (int i = 0; i < nextTaskInfoCoordinate; ++i) {   // visit every entry handed out since the last reset
+		int index = (i >> LOG_TASK_QUEUE_CHUNK_SIZE);   // chunk holding entry i
+		int offset = i & (TASK_QUEUE_CHUNK_SIZE-1);   // position of entry i inside that chunk
+		taskInfo[index][offset].taskEvent.wait();   // block until this task's event has been set
+		taskInfo[index][offset].taskEvent.reset();
+    }

-    for (int i = 0; i < taskOffset; ++i)
-        events[i]->reset();
-
-    taskOffset = 0;
+    lResetTaskInfo();   // start handing out entries from the beginning again
+}
+
+
+void *ISPCMalloc(int64_t size, int32_t alignment) {   // aligned allocation; release with ISPCFree()
+    return _aligned_malloc(size, alignment);
+}
+
+
+void ISPCFree(void *ptr) {   // releases memory obtained from ISPCMalloc()
+    _aligned_free(ptr);   // counterpart of the _aligned_malloc() call in ISPCMalloc()
 }
--- a/examples/mandelbrot_tasks/tasks_gcd.cpp
+++ b/examples/mandelbrot_tasks/tasks_gcd.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2011, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -31,53 +31,69 @@
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */

+#include "taskinfo.h"
+
+#if defined(_WIN32) || defined(_WIN64)
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
 /* A simple task system for ispc programs based on Apple's Grand Central
   Dispatch. */
-
 #include <dispatch/dispatch.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>

+static int initialized = 0;
+static volatile int32_t lock = 0;
 static dispatch_queue_t gcdQueue;
 static dispatch_group_t gcdGroup;

 // ispc expects these functions to have C linkage / not be mangled
-extern "C" {
+extern "C" { 
    void ISPCLaunch(void *f, void *data);
    void ISPCSync();
-}
-
-struct TaskInfo {
-    void *func;
-    void *data;
-};
-
-
-void
-TasksInit() {
-    gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
-    gcdGroup = dispatch_group_create();
+    void *ISPCMalloc(int64_t size, int32_t alignment);
+    void ISPCFree(void *ptr);
 }


 static void
 lRunTask(void *ti) {
-    typedef void (*TaskFuncType)(void *, int, int);
     TaskInfo *taskInfo = (TaskInfo *)ti;
-
-    TaskFuncType func = (TaskFuncType)(taskInfo->func);
-
     // FIXME: these are bogus values; may cause bugs in code that depends
     // on them having unique values in different threads.
     int threadIndex = 0;
     int threadCount = 1;
+    TaskFuncType func = (TaskFuncType)(taskInfo->func);   // func is stored as void*; recover the callable type
+
     // Actually run the task
     func(taskInfo->data, threadIndex, threadCount);
-
-    // FIXME: taskInfo leaks...
 }


 void ISPCLaunch(void *func, void *data) {
-    TaskInfo *ti = new TaskInfo;
+    if (!initialized) {
+        while (1) {
+            if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {
+                if (!initialized) {
+                    gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
+                    gcdGroup = dispatch_group_create();
+                    lInitTaskInfo();
+                    __asm__ __volatile__("mfence":::"memory");
+                    initialized = 1;
+                }
+                lock = 0;
+                break;
+            }
+        }
+    }
+
+    TaskInfo *ti = lGetTaskInfo();
    ti->func = func;
    ti->data = data;
    dispatch_group_async_f(gcdGroup, gcdQueue, ti, lRunTask);
@@ -85,6 +101,26 @@ void ISPCLaunch(void *func, void *data) {


 void ISPCSync() {
+    if (!initialized)   // set only by ISPCLaunch(); before that gcdGroup has not been created
+        return;
+
     // Wait for all of the tasks in the group to complete before returning
     dispatch_group_wait(gcdGroup, DISPATCH_TIME_FOREVER);
+
+    lResetTaskInfo();   // start handing out TaskInfo entries from the beginning again
 }
+
+// Allocate "size" bytes whose address is a multiple of "alignment"
+// (assumed to be a power of two -- the mask arithmetic below relies on it).
+// The pointer returned by malloc() is stashed in the pointer-sized slot
+// just in front of the returned block so that ISPCFree() can recover it.
+// Returns NULL if the underlying malloc() fails.
+void *ISPCMalloc(int64_t size, int32_t alignment) {
+    // Worst case: sizeof(void*) for the stashed pointer plus up to
+    // alignment-1 bytes of padding.
+    void *mem = malloc(size + (alignment-1) + sizeof(void*));
+    if (mem == NULL)
+        return NULL;
+
+    char *amem = ((char*)mem) + sizeof(void*);
+    // Only pad when amem is actually misaligned.  Padding by a full
+    // "alignment" when it is already aligned would push the end of the
+    // block one byte past what was allocated.
+    uintptr_t misalignment = reinterpret_cast<uintptr_t>(amem) &
+                             (uintptr_t)(alignment - 1);
+    if (misalignment != 0)
+        amem += (uintptr_t)alignment - misalignment;
+
+    ((void**)amem)[-1] = mem;
+    return amem;
+}
+
+
+// Release a block obtained from ISPCMalloc().  The pointer originally
+// returned by malloc() is read back from the slot just in front of the
+// block.  Like free(), passing NULL is a no-op.
+void ISPCFree(void *ptr) {
+    if (ptr == NULL)
+        return;
+    free(((void**)ptr)[-1]);
+}
+
--- a/examples/mandelbrot_tasks/tasks_pthreads.cpp
+++ b/examples/mandelbrot_tasks/tasks_pthreads.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2011, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -31,6 +31,15 @@
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */

+#if defined(_WIN32) || defined(_WIN64)
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#include "taskinfo.h"
 #include <pthread.h>
 #include <semaphore.h>
 #include <string.h>
@@ -45,59 +54,45 @@
 #include <stdint.h>
 #include <stdlib.h>
 #include <errno.h>
-#include <vector>
+#ifdef ISPC_IS_LINUX
+#include <malloc.h>
+#endif
+
+static int initialized = 0;
+static volatile int32_t lock = 0;
+
+static int nThreads;
+static pthread_t *threads;
+static pthread_mutex_t taskQueueMutex;
+static int nextTaskToRun;
+static sem_t *workerSemaphore;
+static uint32_t numUnfinishedTasks;
+static pthread_mutex_t tasksRunningConditionMutex;
+static pthread_cond_t tasksRunningCondition;

 // ispc expects these functions to have C linkage / not be mangled
 extern "C" { 
    void ISPCLaunch(void *f, void *data);
    void ISPCSync();
+    void *ISPCMalloc(int64_t size, int32_t alignment);
+    void ISPCFree(void *ptr);
 }

-
-static int nThreads;
-static pthread_t *threads;
-static pthread_mutex_t taskQueueMutex;
-static std::vector<std::pair<void *, void *> > taskQueue;
-static sem_t *workerSemaphore;
-static uint32_t numUnfinishedTasks;
-static pthread_mutex_t tasksRunningConditionMutex;
-static pthread_cond_t tasksRunningCondition;
-
 static void *lTaskEntry(void *arg);

 /** Figure out how many CPU cores there are in the system
 */
 static int
 lNumCPUCores() {
-#if defined(__linux__)
    return sysconf(_SC_NPROCESSORS_ONLN);
-#else
-    // Mac
-    int mib[2];
-    mib[0] = CTL_HW;
-    size_t length = 2;
-    if (sysctlnametomib("hw.logicalcpu", mib, &length) == -1) {
-        fprintf(stderr, "sysctlnametomib() filed.  Guessing 2 cores.");
-        return 2;
-    }
-    assert(length == 2);
-
-    int nCores = 0;
-    size_t size = sizeof(nCores);
-
-    if (sysctl(mib, 2, &nCores, &size, NULL, 0) == -1) {
-        fprintf(stderr, "sysctl() to find number of cores present failed.  Guessing 2.");
-        return 2;
-    }
-    return nCores;
-#endif
 }

-void
-TasksInit() {
+
+static void
+lTasksInit() {
    nThreads = lNumCPUCores();

-    threads = new pthread_t[nThreads];
+    threads = (pthread_t *)malloc(nThreads * sizeof(pthread_t));

    int err;
    if ((err = pthread_mutex_init(&taskQueueMutex, NULL)) != 0) {
@@ -106,7 +101,7 @@ TasksInit() {
    }

    char name[32];
-    sprintf(name, "mandelbrot.%d", (int)getpid());
+    sprintf(name, "ispc_task.%d", (int)getpid());
    workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0);
    if (!workerSemaphore) {
        fprintf(stderr, "Error creating semaphore: %s\n", strerror(err));
@@ -124,7 +119,7 @@ TasksInit() {
    }

    for (int i = 0; i < nThreads; ++i) {
-        err = pthread_create(&threads[i], NULL, &lTaskEntry, reinterpret_cast<void *>(i));
+        err = pthread_create(&threads[i], NULL, &lTaskEntry, (void *)(i));
        if (err != 0) {
            fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err));
            exit(1);
@@ -135,16 +130,35 @@ TasksInit() {

 void
 ISPCLaunch(void *f, void *d) {
+    int err;
+
+    if (!initialized) {
+        while (1) {
+            if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {
+                if (!initialized) {
+                    lTasksInit();
+                    __asm__ __volatile__("mfence":::"memory");
+                    initialized = 1;
+                }
+                lock = 0;
+                break;
+            }
+        }
+    }
+
    //
    // Acquire mutex, add task
    //
-    int err;
    if ((err = pthread_mutex_lock(&taskQueueMutex)) != 0) {
        fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
        exit(1);
    }

-    taskQueue.push_back(std::make_pair(f, d));
+    // Need a mutex here to ensure we get this filled in before a worker
+    // grabs it and starts running...
+    TaskInfo *ti = lGetTaskInfo();
+    ti->func = f;
+    ti->data = d;

    if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
        fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
@@ -159,6 +173,7 @@ ISPCLaunch(void *f, void *d) {
        exit(1);
    }

+    // FIXME: is this redundant with nextTaskInfoCoordinate?
    ++numUnfinishedTasks;

    if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) {
@@ -179,17 +194,17 @@ ISPCLaunch(void *f, void *d) {

 static void *
 lTaskEntry(void *arg) {
-    int threadIndex = int(reinterpret_cast<int64_t>(arg));
+    int threadIndex = (int)((int64_t)arg);
    int threadCount = nThreads;
+    TaskFuncType func;

-    while (true) {
+    while (1) {
        int err;
        if ((err = sem_wait(workerSemaphore)) != 0) {
            fprintf(stderr, "Error from sem_wait: %s\n", strerror(err));
            exit(1);
        }

-        std::pair<void *, void *> myTask;
        //
        // Acquire mutex, get task
        //
@@ -197,7 +212,8 @@ lTaskEntry(void *arg) {
            fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
            exit(1);
        }
-        if (taskQueue.size() == 0) {
+
+        if (nextTaskToRun == nextTaskInfoCoordinate) {
            //
            // Task queue is empty, go back and wait on the semaphore
            //
@@ -208,8 +224,10 @@ lTaskEntry(void *arg) {
            continue;
        }

-        myTask = taskQueue.back();
-        taskQueue.pop_back();
+        int runCoord = nextTaskToRun++;
+        int index = (runCoord >> LOG_TASK_QUEUE_CHUNK_SIZE);
+        int offset = runCoord & (TASK_QUEUE_CHUNK_SIZE-1);
+        TaskInfo *myTask = &taskInfo[index][offset];

        if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
            fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
@@ -219,9 +237,8 @@ lTaskEntry(void *arg) {
        //
        // Do work for _myTask_
        //
-        typedef void (*TaskFunType)(void *, int, int);
-        TaskFunType func = (TaskFunType)myTask.first;
-        func(myTask.second, threadIndex, threadCount);
+        func = (TaskFuncType)myTask->func;
+        func(myTask->data, threadIndex, threadCount);

        //
        // Decrement the number of unfinished tasks counter
@@ -231,6 +248,8 @@ lTaskEntry(void *arg) {
            exit(1);
        }

+        // FIXME: can this be a comparison of (nextTaskToRun == nextTaskInfoCoordinate)?
+        // (I don't think so--think there is a race...)
        int unfinished = --numUnfinishedTasks;
        if (unfinished == 0) {
            //
@@ -273,6 +292,9 @@ void ISPCSync() {
        }
    }
    
+    lResetTaskInfo();
+    nextTaskToRun = 0;
+
    // We acquire ownership of the condition variable mutex when the above
    // pthread_cond_wait returns.
    // FIXME: is there a lurking issue here if numUnfinishedTasks gets back
@@ -283,3 +305,35 @@ void ISPCSync() {
        exit(1);
    }
 }
+
+
+/** Allocate `size` bytes aligned to `alignment`, using the platform
+    selected by the ISPC_IS_* macros defined at the top of this file.
+    Memory must be released with ISPCFree(), which uses the matching
+    deallocator for each platform. */
+void *ISPCMalloc(int64_t size, int32_t alignment) {
+#ifdef ISPC_IS_WINDOWS
+    return _aligned_malloc(size, alignment);
+#endif
+#ifdef ISPC_IS_LINUX
+    return memalign(alignment, size);
+#endif
+#ifdef ISPC_IS_APPLE
+    // Hand-rolled aligned allocation: over-allocate, round up, and stash
+    // the malloc() base pointer in the slot just before the returned
+    // address.  The masks require `alignment` to be a power of two.
+    void *mem = malloc(size + (alignment-1) + sizeof(void*));
+    if (!mem)
+        return mem;
+    char *amem = ((char*)mem) + sizeof(void*);
+    uint64_t misalign = reinterpret_cast<uint64_t>(amem) & (alignment - 1);
+    // Mask the padding so an already-aligned address gets 0 bytes, not a
+    // full `alignment`; the latter would overrun the allocation by a byte.
+    amem = amem + uint32_t((alignment - misalign) & (alignment - 1));
+    ((void**)amem)[-1] = mem;
+    return amem;
+#endif
+}
+
+
+/** Release memory obtained from ISPCMalloc(), using the deallocator that
+    matches the allocator chosen there for the current platform. */
+void ISPCFree(void *ptr) {
+#ifdef ISPC_IS_WINDOWS
+    _aligned_free(ptr);
+#endif
+#ifdef ISPC_IS_LINUX
+    free(ptr);
+#endif
+#ifdef ISPC_IS_APPLE
+    // The malloc() base pointer lives in the slot just before `ptr`.
+    // Guard against null so that slot is never read through a null pointer.
+    if (ptr)
+        free(((void**)ptr)[-1]);
+#endif
+}
+
--- a/examples/timing.h
+++ b/examples/timing.h
@@ -38,7 +38,9 @@
 #include <windows.h>
 #define rdtsc __rdtsc
 #else
+#ifdef __cplusplus
 extern "C" {
+#endif /* __cplusplus */
    __inline__ uint64_t rdtsc() {
        uint32_t low, high;
        __asm__ __volatile__ (
@@ -48,7 +50,9 @@ extern "C" {
                              "rdtsc" : "=a" (low), "=d" (high));
        return (uint64_t)high << 32 | low;
    }
+#ifdef __cplusplus
 }
+#endif /* __cplusplus */
 #endif            
            
 static uint64_t start, end;
--- a/examples/volume_rendering/.gitignore
+++ b/examples/volume_rendering/.gitignore
@@ -0,0 +1,2 @@
+volume
+*.ppm
--- a/examples/volume_rendering/Makefile
+++ b/examples/volume_rendering/Makefile
@@ -0,0 +1,41 @@
+
+# Build the volume rendering example: C++ driver + serial reference +
+# ispc-compiled kernel + a platform-specific task system.
+
+# Expanded once; avoids re-running uname on every reference.
+ARCH := $(shell uname)
+
+# Task system implementation: pthreads by default, Grand Central Dispatch
+# on Mac OS X.
+TASK_CXX=../tasks_pthreads.cpp
+TASK_LIB=-lpthread
+
+ifeq ($(ARCH), Darwin)
+  TASK_CXX=../tasks_gcd.cpp
+  TASK_LIB=
+endif
+
+TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
+
+CXX=g++
+CXXFLAGS=-Iobjs/ -O3 -Wall -m64
+ISPC=ispc
+ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
+
+default: volume
+
+.PHONY: default dirs clean
+
+# Kept for anyone invoking "make dirs" directly.
+dirs: objs
+
+# The output directory is an order-only prerequisite of everything built
+# into it: it must exist first, but its timestamp never forces a rebuild.
+objs:
+	/bin/mkdir -p $@
+
+clean:
+	/bin/rm -rf objs *~ volume
+
+volume: objs/volume.o objs/volume_serial.o objs/volume_ispc.o $(TASK_OBJ)
+	$(CXX) $(CXXFLAGS) -o $@ objs/volume.o objs/volume_ispc.o objs/volume_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
+
+objs/%.o: %.cpp | objs
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+objs/%.o: ../%.cpp | objs
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+# volume.cpp includes the ispc-generated header.
+objs/volume.o: objs/volume_ispc.h
+
+# One ispc invocation produces both the object file and the header.
+objs/%_ispc.h objs/%_ispc.o: %.ispc | objs
+	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/volume_rendering/camera.dat
+++ b/examples/volume_rendering/camera.dat
@@ -0,0 +1,11 @@
+896 1184
+
+0.000155 0.000000 0.000000 -0.069927
+0.000000 -0.000155 0.000000 0.093236
+0.000000 0.000000 0.000000 1.000000
+0.000000 0.000000 -99.999001 100.000000
+
+1.000000 0.000000 0.000000 1.000000
+0.000000 0.980129 -0.198360 2.900000
+0.000000 0.198360 0.980129 -10.500000
+0.000000 0.000000 0.000000 1.000000
--- a/examples/volume_rendering/density_highres.vol
+++ b/examples/volume_rendering/density_highres.vol
--- a/examples/volume_rendering/density_lowres.vol
+++ b/examples/volume_rendering/density_lowres.vol
--- a/examples/volume_rendering/volume.cpp
+++ b/examples/volume_rendering/volume.cpp
@@ -0,0 +1,248 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define NOMINMAX
+#pragma warning (disable: 4244)
+#pragma warning (disable: 4305)
+#endif
+
+#include <stdio.h>
+#include <algorithm>
+#include "../timing.h"
+#include "../cpuid.h"
+#include "volume_ispc.h"
+using namespace ispc;
+
+extern void volume_serial(float density[], int nVoxels[3], 
+                          const float raster2camera[4][4],
+                          const float camera2world[4][4], 
+                          int width, int height, float image[]);
+
+/* Write a binary PPM (P6) image file with the image.
+
+   buf holds width*height grayscale values; each is scaled by 255, clamped
+   to [0,255], truncated to a byte, and written three times (R, G, B).
+   If the output file cannot be opened, the error is reported and the
+   program exits, matching the input loaders in this file. */
+static void
+writePPM(float *buf, int width, int height, const char *fn) {
+    FILE *fp = fopen(fn, "wb");
+    if (!fp) {
+        perror(fn);
+        exit(1);
+    }
+    fprintf(fp, "P6\n");
+    fprintf(fp, "%d %d\n", width, height);
+    fprintf(fp, "255\n");
+    for (int i = 0; i < width*height; ++i) {
+        float v = buf[i] * 255.f;
+        if (v < 0.f) v = 0.f;
+        else if (v > 255.f) v = 255.f;
+        unsigned char c = (unsigned char)v;
+        for (int j = 0; j < 3; ++j)
+            fputc(c, fp);
+    }
+    fclose(fp);
+    printf("Wrote image file %s\n", fn);
+}
+
+
+// Abort with an explanatory message unless the CPU we are running on
+// implements the vector instruction set the ispc code was compiled for.
+// Which ISA that is comes from the ISPC_TARGET_* macro defined by the
+// ispc-generated header included above.
+static void
+ensureTargetISAIsSupported() {
+#if defined(ISPC_TARGET_SSE2)
+    const char *isaName = "SSE2";
+    bool cpuHasISA = CPUSupportsSSE2();
+#elif defined(ISPC_TARGET_SSE4)
+    const char *isaName = "SSE4";
+    bool cpuHasISA = CPUSupportsSSE4();
+#elif defined(ISPC_TARGET_AVX)
+    const char *isaName = "AVX";
+    bool cpuHasISA = CPUSupportsAVX();
+#else
+#error "Unknown ISPC_TARGET_* value"
+#endif
+    // Nothing to do on a capable CPU.
+    if (cpuHasISA)
+        return;
+
+    fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
+            "set, which isn't\n***        supported by this computer's CPU!\n", isaName);
+    // Point the user at whichever build file applies to this toolchain.
+    fprintf(stderr, "***\n***        Please modify the "
+#ifdef _MSC_VER
+            "MSVC project file "
+#else
+            "Makefile "
+#endif
+            "to select another target (e.g. sse2)\n***\n");
+    exit(1);
+}
+
+/* Read the image resolution and the two 4x4 viewing matrices from the
+   camera data file named fn.  The file holds the width and height followed
+   by the 16 entries of raster2camera and then the 16 entries of
+   camera2world, each in row-major order.  Any failure to open or parse
+   the file prints a message and exits.
+   FIXME: we should add support to be able to specify viewing parameters
+   in the program here directly. */
+static void
+loadCamera(const char *fn, int *width, int *height, float raster2camera[4][4],
+           float camera2world[4][4]) {
+    FILE *f = fopen(fn, "r");
+    if (!f) {
+        perror(fn);
+        exit(1);
+    }
+
+    int nRead = fscanf(f, "%d %d", width, height);
+    if (nRead != 2) {
+        fprintf(stderr, "Unexpected end of file in camera file\n");
+        exit(1);
+    }
+
+    // The two matrices are stored back to back, so read them with one loop
+    // in file order.
+    float (*matrices[2])[4] = { raster2camera, camera2world };
+    for (int m = 0; m < 2; ++m) {
+        for (int row = 0; row < 4; ++row) {
+            for (int col = 0; col < 4; ++col) {
+                if (fscanf(f, "%f", &matrices[m][row][col]) == 1)
+                    continue;
+                fprintf(stderr, "Unexpected end of file in camera file\n");
+                exit(1);
+            }
+        }
+    }
+    fclose(f);
+}
+
+
+/* Load a volume density file.  Expects the number of x, y, and z samples
+   as the first three values (as integer strings), then x*y*z
+   floating-point values (also as strings) to give the densities.
+
+   On return n[] holds the three sample counts and the result is a
+   new[]-allocated array of n[0]*n[1]*n[2] floats in file order; the
+   caller owns it.  Any open, parse, or validation failure prints a
+   message and exits. */
+static float *
+loadVolume(const char *fn, int n[3]) {
+    FILE *f = fopen(fn, "r");
+    if (!f) {
+        perror(fn);
+        exit(1);
+    }
+
+    if (fscanf(f, "%d %d %d", &n[0], &n[1], &n[2]) != 3) {
+        fprintf(stderr, "Couldn't find resolution at start of density file\n");
+        exit(1);
+    }
+    // Reject zero or negative counts before they reach the array
+    // allocation below.
+    if (n[0] <= 0 || n[1] <= 0 || n[2] <= 0) {
+        fprintf(stderr, "Invalid resolution %d x %d x %d in density file\n",
+                n[0], n[1], n[2]);
+        exit(1);
+    }
+
+    int count = n[0] * n[1] * n[2];
+    float *v = new float[count];
+    for (int i = 0; i < count; ++i) {
+        if (fscanf(f, "%f", &v[i]) != 1) {
+            fprintf(stderr, "Unexpected end of file at %d'th density value\n", i);
+            exit(1);
+        }
+    }
+
+    // The file is fully consumed; don't leak the handle.
+    fclose(f);
+    return v;
+}
+
+
+/* Render the volume three ways (ispc on one core, ispc with tasks, and
+   the serial C++ reference), timing each as the minimum of three runs,
+   writing one PPM image per variant, and printing the speedups.
+
+   Usage: volume <camera.dat> <volume_density.vol>
+   Returns 1 on a usage error, 0 otherwise. */
+int main(int argc, char *argv[]) {
+    if (argc != 3) {
+        fprintf(stderr, "usage: volume <camera.dat> <volume_density.vol>\n");
+        return 1;
+    }
+
+    ensureTargetISAIsSupported();
+
+    //
+    // Load viewing data and the volume density data
+    //
+    int width, height;
+    float raster2camera[4][4], camera2world[4][4];
+    loadCamera(argv[1], &width, &height, raster2camera, camera2world);
+    float *image = new float[width*height];
+
+    int n[3];
+    float *density = loadVolume(argv[2], n);
+
+    //
+    // Compute the image using the ispc implementation; report the minimum
+    // time of three runs.
+    //
+    double minISPC = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        volume_ispc(density, n, raster2camera, camera2world,
+                    width, height, image);
+        double dt = get_elapsed_mcycles();
+        minISPC = std::min(minISPC, dt);
+    }
+
+    printf("[volume ispc 1 core]:\t\t[%.3f] million cycles\n", minISPC);
+    writePPM(image, width, height, "volume-ispc-1core.ppm");
+
+    // Clear out the buffer so the next variant can't reuse these pixels
+    for (int i = 0; i < width * height; ++i)
+        image[i] = 0.;
+
+    //
+    // Compute the image using the ispc implementation that also uses
+    // tasks; report the minimum time of three runs.
+    //
+    double minISPCtasks = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        volume_ispc_tasks(density, n, raster2camera, camera2world,
+                          width, height, image);
+        double dt = get_elapsed_mcycles();
+        minISPCtasks = std::min(minISPCtasks, dt);
+    }
+
+    printf("[volume ispc + tasks]:\t\t[%.3f] million cycles\n", minISPCtasks);
+    writePPM(image, width, height, "volume-ispc-tasks.ppm");
+
+    // Clear out the buffer
+    for (int i = 0; i < width * height; ++i)
+        image[i] = 0.;
+
+    // 
+    // And run the serial implementation 3 times, again reporting the
+    // minimum time.
+    //
+    double minSerial = 1e30;
+    for (int i = 0; i < 3; ++i) {
+        reset_and_start_timer();
+        volume_serial(density, n, raster2camera, camera2world,
+                      width, height, image);
+        double dt = get_elapsed_mcycles();
+        minSerial = std::min(minSerial, dt);
+    }
+
+    printf("[volume serial]:\t\t[%.3f] million cycles\n", minSerial);
+    writePPM(image, width, height, "volume-serial.ppm");
+
+    printf("\t\t\t\t(%.2fx speedup from ISPC serial, %.2fx from ISPC+tasks)\n", 
+           minSerial/minISPC, minSerial / minISPCtasks);
+
+    // Both buffers were allocated with new[] (image above, density in
+    // loadVolume()).
+    delete[] image;
+    delete[] density;
+
+    return 0;
+}
--- a/examples/volume_rendering/volume.ispc
+++ b/examples/volume_rendering/volume.ispc
@@ -0,0 +1,378 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+// Three-component float vector; its elements are accessed as .x/.y/.z
+// throughout this file.
+typedef float<3> float3;
+
+// A ray described by a starting point and a direction vector.  Nothing in
+// this file normalizes dir; code that needs its length computes it.
+struct Ray {
+    float3 origin, dir;
+};
+
+
+/* Build the camera ray through raster position (x, y) and store it in ray.
+   raster2camera maps raster coordinates to camera space and camera2world
+   maps camera space to world space.  The resulting direction is not
+   normalized. */
+static void
+generateRay(const uniform float raster2camera[4][4], 
+            const uniform float camera2world[4][4],
+            float x, float y, reference Ray ray) {
+    // transform raster coordinate (x, y, 0) to camera space
+    float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3];
+    float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3];
+    float camz = raster2camera[2][3];
+    float camw = raster2camera[3][3];
+    // homogeneous divide
+    camx /= camw;
+    camy /= camw;
+    camz /= camw;
+
+    // Direction: apply only the upper-left 3x3 of camera2world (no
+    // translation) to the camera-space point.
+    ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz;
+    ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz;
+    ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz;
+
+    // Origin: the last column of camera2world, divided by its w entry.
+    ray.origin.x = camera2world[0][3] / camera2world[3][3];
+    ray.origin.y = camera2world[1][3] / camera2world[3][3];
+    ray.origin.z = camera2world[2][3] / camera2world[3][3];
+}
+
+
+/* Returns true if point p lies within the axis-aligned box [pMin, pMax];
+   points exactly on a face count as inside. */
+static inline bool
+Inside(float3 p, float3 pMin, float3 pMax) {
+    return (p.x >= pMin.x && p.x <= pMax.x &&
+            p.y >= pMin.y && p.y <= pMax.y &&
+            p.z >= pMin.z && p.z <= pMax.z);
+}
+
+
+/* Intersect ray with the axis-aligned box [pMin, pMax].  For each axis the
+   parametric distances to the two bounding planes are computed and the
+   running interval [t0, t1] is narrowed to their overlap.
+
+   Returns true and writes the interval to hit0/hit1 when it is non-empty;
+   otherwise returns false and leaves hit0/hit1 untouched.  hit0 may be
+   negative; the callers in this file clamp it to zero themselves. */
+static bool
+IntersectP(Ray ray, float3 pMin, float3 pMax, reference float hit0, reference float hit1) {
+    float t0 = -1e30, t1 = 1e30;
+
+    float3 tNear = (pMin - ray.origin) / ray.dir;
+    float3 tFar  = (pMax - ray.origin) / ray.dir;
+    // x slab: order the two plane distances, then intersect intervals
+    if (tNear.x > tFar.x) {
+        float tmp = tNear.x;
+        tNear.x = tFar.x;
+        tFar.x = tmp;
+    }
+    t0 = max(tNear.x, t0);
+    t1 = min(tFar.x, t1);
+
+    // y slab
+    if (tNear.y > tFar.y) {
+        float tmp = tNear.y;
+        tNear.y = tFar.y;
+        tFar.y = tmp;
+    }
+    t0 = max(tNear.y, t0);
+    t1 = min(tFar.y, t1);
+
+    // z slab
+    if (tNear.z > tFar.z) {
+        float tmp = tNear.z;
+        tNear.z = tFar.z;
+        tFar.z = tmp;
+    }
+    t0 = max(tNear.z, t0);
+    t1 = min(tFar.z, t1);
+    
+    if (t0 <= t1) {
+        hit0 = t0;
+        hit1 = t1;
+        return true;
+    }
+    else
+        return false;
+}
+
+
+/* Linear interpolation: returns a when t == 0 and b when t == 1. */
+static inline float Lerp(float t, float a, float b) {
+    return (1.f - t) * a + t * b;
+}
+
+
+/* Look up the density at integer voxel coordinates (x, y, z), which may
+   differ per program instance.  Coordinates are clamped into the grid, so
+   out-of-range lookups return the nearest edge voxel.  The array is
+   indexed with x varying fastest, then y, then z. */
+static inline float D(int x, int y, int z, uniform int nVoxels[3], 
+                      uniform float density[]) {
+    x = clamp(x, 0, nVoxels[0]-1);
+    y = clamp(y, 0, nVoxels[1]-1);
+    z = clamp(z, 0, nVoxels[2]-1);
+
+    return density[z*nVoxels[0]*nVoxels[1] + y*nVoxels[0] + x];
+}
+
+
+/* Variant of D() for voxel coordinates that are uniform, i.e. the same for
+   all program instances.  Same clamping and array layout as D(). */
+static inline float Du(uniform int x, uniform int y, uniform int z, 
+                       uniform int nVoxels[3], uniform float density[]) {
+    x = clamp(x, 0, nVoxels[0]-1);
+    y = clamp(y, 0, nVoxels[1]-1);
+    z = clamp(z, 0, nVoxels[2]-1);
+
+    return density[z*nVoxels[0]*nVoxels[1] + y*nVoxels[0] + x];
+}
+
+
+/* Express p relative to the box [pMin, pMax]: each component is 0 at pMin
+   and 1 at pMax. */
+static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
+    return (p - pMin) / (pMax - pMin);
+}
+
+
+/* Returns the trilinearly interpolated density at point Pobj, or 0 if Pobj
+   is outside the volume's bounding box [pMin, pMax].
+
+   checkForSameVoxel is an in/out flag shared across calls along a ray:
+   while it is true, the function tests whether all program instances fall
+   in the same voxel and, if so, uses the uniform lookup Du().  The first
+   time that test fails the flag is cleared so later calls skip it. */
+static inline float Density(float3 Pobj, float3 pMin, float3 pMax, 
+                            uniform float density[], uniform int nVoxels[3],
+                            reference uniform bool checkForSameVoxel) {
+    if (!Inside(Pobj, pMin, pMax)) 
+        return 0;
+    // Compute voxel coordinates and offsets for _Pobj_
+    float3 vox = Offset(Pobj, pMin, pMax);
+    vox.x = vox.x * nVoxels[0] - .5f;
+    vox.y = vox.y * nVoxels[1] - .5f;
+    vox.z = vox.z * nVoxels[2] - .5f;
+    int vx = (int)(vox.x), vy = (int)(vox.y), vz = (int)(vox.z);
+    // Fractional position inside the voxel; these are the lerp weights.
+    float dx = vox.x - vx, dy = vox.y - vy, dz = vox.z - vz;
+
+    // Trilinearly interpolate density values to compute local density
+    float d00, d10, d01, d11;
+    // Receive the shared voxel coordinates from reduce_equal() below.
+    uniform int uvx, uvy, uvz;
+    if (checkForSameVoxel && reduce_equal(vx, uvx) && reduce_equal(vy, uvy) &&
+        reduce_equal(vz, uvz)) {
+        // If all of the program instances are inside the same voxel, then
+        // we'll call the 'uniform' variant of the voxel density lookup
+        // function, thus doing a single load for each value rather than a
+        // gather.
+        d00 = Lerp(dx, Du(uvx, uvy, uvz, nVoxels, density),     
+                       Du(uvx+1, uvy, uvz, nVoxels, density));
+        d10 = Lerp(dx, Du(uvx, uvy+1, uvz, nVoxels, density),   
+                       Du(uvx+1, uvy+1, uvz, nVoxels, density));
+        d01 = Lerp(dx, Du(uvx, uvy, uvz+1, nVoxels, density),   
+                       Du(uvx+1, uvy, uvz+1, nVoxels, density));
+        d11 = Lerp(dx, Du(uvx, uvy+1, uvz+1, nVoxels, density), 
+                       Du(uvx+1, uvy+1, uvz+1, nVoxels, density));
+    }
+    else {
+        // Otherwise, we have to do an actual gather in the more general
+        // D() function.  Once the reduce_equal tests above fail, we stop
+        // checking in subsequent steps, since it's unlikely that this will
+        // be true in the future once they've diverged into different
+        // voxels.
+        checkForSameVoxel = false;
+        d00 = Lerp(dx, D(vx, vy, vz, nVoxels, density),     
+                       D(vx+1, vy, vz, nVoxels, density));
+        d10 = Lerp(dx, D(vx, vy+1, vz, nVoxels, density),   
+                       D(vx+1, vy+1, vz, nVoxels, density));
+        d01 = Lerp(dx, D(vx, vy, vz+1, nVoxels, density),   
+                       D(vx+1, vy, vz+1, nVoxels, density));
+        d11 = Lerp(dx, D(vx, vy+1, vz+1, nVoxels, density), 
+                       D(vx+1, vy+1, vz+1, nVoxels, density));
+    }
+    // Finish the interpolation along y, then z.
+    float d0 = Lerp(dy, d00, d10);
+    float d1 = Lerp(dy, d01, d11);
+    return Lerp(dz, d0, d1);
+}
+
+
+/* Returns the transmittance between two points p0 and p1, in a volume
+   with extent (pMin,pMax) with transmittance coefficient sigma_t,
+   defined by nVoxels[3] voxels in each dimension in the given density
+   array.
+
+   The ray is marched from p1 toward p0 in fixed world-space steps of
+   stepDist, summing optical thickness; the result is exp(-tau).  Returns
+   1 if the ray misses the volume's bounding box entirely. */
+static float
+transmittance(uniform float3 p0, float3 p1, uniform float3 pMin,
+              uniform float3 pMax, uniform float sigma_t, 
+              uniform float density[], uniform int nVoxels[3]) {
+    float rayT0, rayT1;
+    Ray ray;
+    ray.origin = p1;
+    ray.dir = p0 - p1;
+
+    // Find the parametric t range along the ray that is inside the volume.
+    if (!IntersectP(ray, pMin, pMax, rayT0, rayT1))
+        return 1.;
+
+    // Don't march behind the ray origin.
+    rayT0 = max(rayT0, 0.f);
+
+    // Accumulate beam transmittance in tau
+    float tau = 0;
+    float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
+                           ray.dir.z * ray.dir.z);
+    uniform float stepDist = 0.2;
+    // ray.dir is not unit length, so convert the world-space step to a
+    // step in the ray's parameter t.
+    float stepT = stepDist / rayLength;
+
+    float t = rayT0;
+    float3 pos = ray.origin + ray.dir * rayT0;
+    float3 dirStep = ray.dir * stepT;
+    uniform bool checkForSameVoxel = true;
+    while (t < rayT1) {
+        tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels,
+                                            checkForSameVoxel);
+        pos = pos + dirStep;
+        t += stepT;
+    }
+
+    return exp(-tau);
+}
+
+
+/* Returns the squared Euclidean distance between points a and b. */
+static inline float
+distanceSquared(float3 a, float3 b) {
+    float3 d = a-b;
+    return d.x*d.x + d.y*d.y + d.z*d.z;
+}
+
+
+/* Compute the radiance arriving along ray through the density volume and
+   return it after gamma correction.  The volume's bounding box, the light
+   position and all scattering parameters are hard-coded below.  Returns 0
+   for rays that miss the bounding box. */
+static float 
+raymarch(uniform float density[], uniform int nVoxels[3], Ray ray) {
+    float rayT0, rayT1;
+    uniform float3 pMin = {.3, -.2, .3}, pMax = {1.8, 2.3, 1.8};
+    uniform float3 lightPos = { -1, 4, 1.5 };
+
+    cif (!IntersectP(ray, pMin, pMax, rayT0, rayT1))
+        return 0.;
+
+    // Start marching at the ray origin if it is already inside the box.
+    rayT0 = max(rayT0, 0.f);
+
+    // Parameters that define the volume scattering characteristics and
+    // sampling rate for raymarching
+    uniform float Le = .25;            // Emission coefficient
+    uniform float sigma_a = 10;        // Absorption coefficient
+    uniform float sigma_s = 10;        // Scattering coefficient
+    uniform float stepDist = 0.025;    // Ray step amount
+    uniform float lightIntensity = 40; // Light source intensity
+
+    float tau = 0.f;  // accumulated beam transmittance
+    float L = 0;      // radiance along the ray
+    float rayLength = sqrt(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
+                           ray.dir.z * ray.dir.z);
+    // World-space step length converted to a step in the ray parameter t
+    float stepT = stepDist / rayLength;
+
+    float t = rayT0;
+    float3 pos = ray.origin + ray.dir * rayT0;
+    float3 dirStep = ray.dir * stepT;
+    uniform bool checkForSameVoxel = true;
+    cwhile (t < rayT1) {
+        float d = Density(pos, pMin, pMax, density, nVoxels, checkForSameVoxel);
+
+        // terminate once attenuation is high
+        float atten = exp(-tau);
+        if (atten < .005)
+            cbreak;
+
+        // direct lighting
+        float Li = lightIntensity / distanceSquared(lightPos, pos) * 
+            transmittance(lightPos, pos, pMin, pMax, sigma_a + sigma_s,
+                          density, nVoxels);
+        L += stepDist * atten * d * sigma_s * (Li + Le);
+
+        // update beam transmittance
+        tau += stepDist * (sigma_a + sigma_s) * d;
+
+        pos = pos + dirStep;
+        t += stepT;
+    }
+
+    // Gamma correction
+    return pow(L, 1.f / 2.2f);
+}
+
+
+/* Utility routine used by both the task-based and the single-core entrypoints.
+   Renders a tile of the image, covering [x0,x1) * [y0, y1), storing the
+   result into the image[] array, which is indexed as y * width + x.
+ */
+static void
+volume_tile(uniform int x0, uniform int y0, uniform int x1,
+            uniform int y1, uniform float density[], uniform int nVoxels[3], 
+            const uniform float raster2camera[4][4],
+            const uniform float camera2world[4][4], 
+            uniform int width, uniform int height, uniform float image[]) {
+    // Work on 4x4=16 pixel big tiles of the image.  This function thus
+    // implicitly assumes that both (x1-x0) and (y1-y0) are evenly divisible
+    // by 4.
+    for (uniform int y = y0; y < y1; y += 4) {
+        for (uniform int x = x0; x < x1; x += 4) {
+            // For each such tile, process programCount pixels at a time,
+            // until we've done all 16 of them.  Thus, we're also assuming
+            // that programCount <= 16 and that 16 is evenly divisible by
+            // programCount.
+            for (uniform int o = 0; o < 16; o += programCount) {
+                // These two arrays encode the mapping from [0,15] to
+                // offsets within the 4x4 pixel block so that we render
+                // each pixel inside the block
+                const uniform int xoffsets[16] = { 0, 1, 0, 1, 2, 3, 2, 3,
+                                                   0, 1, 0, 1, 2, 3, 2, 3 };
+                const uniform int yoffsets[16] = { 0, 0, 1, 1, 0, 0, 1, 1,
+                                                   2, 2, 3, 3, 2, 2, 3, 3 };
+
+                // Figure out the pixel to render for this program instance
+                int xo = x + xoffsets[o + programIndex];
+                int yo = y + yoffsets[o + programIndex];
+
+                // Use viewing parameters to compute the corresponding ray
+                // for the pixel
+                Ray ray;
+                generateRay(raster2camera, camera2world, xo, yo, ray);
+
+                // And raymarch through the volume to compute the pixel's
+                // value
+                int offset = yo * width + xo;
+                image[offset] = raymarch(density, nVoxels, ray);
+            }
+        }
+    }
+}
+
+
+/* Task entry point: renders the image tile [x0,x1) * [y0,y1) by forwarding
+   all of its arguments to volume_tile().  Launched once per tile by
+   volume_ispc_tasks(). */
+task void
+volume_task(uniform int x0, uniform int y0, uniform int x1,
+            uniform int y1, uniform float density[], uniform int nVoxels[3], 
+            const uniform float raster2camera[4][4],
+            const uniform float camera2world[4][4], 
+            uniform int width, uniform int height, uniform float image[]) {
+    volume_tile(x0, y0, x1, y1, density, nVoxels, raster2camera,
+                 camera2world, width, height, image);
+}
+
+
+/* Exported entry point without tasks: renders the whole width x height
+   image as a single tile via volume_tile(). */
+export void
+volume_ispc(uniform float density[], uniform int nVoxels[3], 
+            const uniform float raster2camera[4][4],
+            const uniform float camera2world[4][4], 
+            uniform int width, uniform int height, uniform float image[]) {
+    volume_tile(0, 0, width, height, density, nVoxels, raster2camera, 
+                camera2world, width, height,  image);
+}
+
+
+/* Exported task-parallel entry point: splits the image into (dx,dy)-sized
+   tiles and launches one volume_task per tile.
+
+   Tile extents are clamped to the image size so that the last row and
+   column of tiles never extend past the image when width or height is not
+   a multiple of the tile size.  volume_tile() still works in 4x4 pixel
+   blocks, so width and height must be multiples of 4. */
+export void
+volume_ispc_tasks(uniform float density[], uniform int nVoxels[3], 
+                  const uniform float raster2camera[4][4],
+                  const uniform float camera2world[4][4], 
+                  uniform int width, uniform int height, uniform float image[]) {
+    // Launch tasks to work on (dx,dy)-sized tiles of the image
+    uniform int dx = 8, dy = 8;
+    for (uniform int y = 0; y < height; y += dy) {
+        uniform int y1 = min(y + dy, height);
+        for (uniform int x = 0; x < width; x += dx) {
+            uniform int x1 = min(x + dx, width);
+            launch < volume_task(x, y, x1, y1, density, nVoxels, 
+                                 raster2camera, camera2world, width, height, 
+                                 image) >;
+        }
+    }
+}
--- a/examples/volume_rendering/volume.vcxproj
+++ b/examples/volume_rendering/volume.vcxproj
@@ -0,0 +1,168 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{dee5733a-e93e-449d-9114-9bffcaeb4df9}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>volume</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <!-- C++ sources built by the regular compiler: the example's two local
+         files plus tasks_concrt.cpp from the parent examples directory. -->
+    <ClCompile Include="volume.cpp" />
+    <ClCompile Include="volume_serial.cpp" />
+    <ClCompile Include="../tasks_concrt.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <!-- Custom build step that runs the ispc compiler on volume.ispc.
+         Each configuration has its own Command/Outputs pair; every command
+         writes %(Filename).obj and the generated header %(Filename)_ispc.h,
+         which are exactly the files listed as outputs.
+         All four configurations (Debug included) pass -O2 and the sse4x2
+         target; only the two Win32 commands add the x86 arch option. -->
+    <CustomBuild Include="volume.ispc">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/examples/volume_rendering/volume_serial.cpp
+++ b/examples/volume_rendering/volume_serial.cpp
@@ -0,0 +1,305 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#include <assert.h>
+#include <math.h>
+#include <algorithm>
+
+// Bare-bones three-component float vector; it offers only the arithmetic
+// and indexing operations that the code in this file needs.
+#ifdef _MSC_VER
+__declspec(align(16))
+#endif
+struct float3 {
+    // The default constructor deliberately leaves every member unset.
+    float3() { }
+    float3(float xx, float yy, float zz) : x(xx), y(yy), z(zz) { }
+
+    // Scale all three components by the scalar f.
+    float3 operator*(float f) const {
+        return float3(x*f, y*f, z*f);
+    }
+
+    // Component-wise arithmetic against another vector.
+    float3 operator+(const float3 &rhs) const {
+        return float3(x+rhs.x, y+rhs.y, z+rhs.z);
+    }
+    float3 operator-(const float3 &rhs) const {
+        return float3(x-rhs.x, y-rhs.y, z-rhs.z);
+    }
+    float3 operator*(const float3 &rhs) const {
+        return float3(x*rhs.x, y*rhs.y, z*rhs.z);
+    }
+    float3 operator/(const float3 &rhs) const {
+        return float3(x/rhs.x, y/rhs.y, z/rhs.z);
+    }
+
+    // Indexed access; the index is an offset from the address of x and is
+    // not range-checked.
+    float operator[](int i) const { return *(&x + i); }
+    float &operator[](int i) { return *(&x + i); }
+
+    float x;
+    float y;
+    float z;
+    float pad;  // match padding/alignment of ispc version
+}
+#ifndef _MSC_VER
+__attribute__ ((aligned(16)))
+#endif
+;
+
+// A ray given by its start point and its direction; generateRay() fills in
+// both members.
+struct Ray {
+    float3 origin;
+    float3 dir;
+};
+
+
+// Fills 'ray' with the camera ray for raster position (x, y): the raster
+// point (x, y, 0) is taken to camera space via raster2camera (including the
+// divide by the w term), the direction is that point transformed by the
+// upper 3x3 of camera2world, and the origin is camera2world's translation
+// column divided by its [3][3] entry.
+static void
+generateRay(const float raster2camera[4][4], const float camera2world[4][4],
+            float x, float y, Ray &ray) {
+    // Raster -> camera space, followed by the homogeneous divide.
+    const float w = raster2camera[3][3];
+    const float cx =
+        (raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3]) / w;
+    const float cy =
+        (raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3]) / w;
+    const float cz = raster2camera[2][3] / w;
+
+    // Camera -> world space for the direction (no translation applied).
+    ray.dir.x = camera2world[0][0] * cx + camera2world[0][1] * cy + camera2world[0][2] * cz;
+    ray.dir.y = camera2world[1][0] * cx + camera2world[1][1] * cy + camera2world[1][2] * cz;
+    ray.dir.z = camera2world[2][0] * cx + camera2world[2][1] * cy + camera2world[2][2] * cz;
+
+    // The ray starts at the camera position in world space.
+    const float worldW = camera2world[3][3];
+    ray.origin.x = camera2world[0][3] / worldW;
+    ray.origin.y = camera2world[1][3] / worldW;
+    ray.origin.z = camera2world[2][3] / worldW;
+}
+
+
+// True when p lies within the box [pMin, pMax] on all three axes, with both
+// bounds inclusive.
+static bool
+Inside(float3 p, float3 pMin, float3 pMax) {
+    const bool inX = (p.x >= pMin.x && p.x <= pMax.x);
+    const bool inY = (p.y >= pMin.y && p.y <= pMax.y);
+    const bool inZ = (p.z >= pMin.z && p.z <= pMax.z);
+    return inX && inY && inZ;
+}
+
+
+// Intersects 'ray' with the axis-aligned box [pMin, pMax].  For each axis
+// the parametric entry/exit distances are computed and the overall range
+// [t0, t1] is narrowed accordingly.  When the range is non-empty the
+// function stores it in *hit0 / *hit1 and returns true; otherwise it
+// returns false and leaves the outputs untouched.
+static bool
+IntersectP(const Ray &ray, float3 pMin, float3 pMax, float *hit0, float *hit1) {
+    float t0 = -1e30f, t1 = 1e30f;
+
+    const float3 tNear = (pMin - ray.origin) / ray.dir;
+    const float3 tFar  = (pMax - ray.origin) / ray.dir;
+
+    for (int axis = 0; axis < 3; ++axis) {
+        float tEnter = tNear[axis];
+        float tExit = tFar[axis];
+        // Order the pair so that tEnter is the smaller distance.
+        if (tEnter > tExit) {
+            const float tmp = tEnter;
+            tEnter = tExit;
+            tExit = tmp;
+        }
+        t0 = std::max(tEnter, t0);
+        t1 = std::min(tExit, t1);
+    }
+
+    if (!(t0 <= t1))
+        return false;
+
+    *hit0 = t0;
+    *hit1 = t1;
+    return true;
+}
+
+
+// Linear interpolation between a and b: yields a for t == 0 and b for
+// t == 1.
+static inline float Lerp(float t, float a, float b) {
+    const float weightA = 1.f - t;
+    return weightA * a + t * b;
+}
+
+
+// Restricts v to the range [low, high]: values below 'low' become 'low',
+// and anything above 'high' (after that step) becomes 'high'.
+static inline int Clamp(int v, int low, int high) {
+    if (v < low)
+        v = low;
+    if (v > high)
+        v = high;
+    return v;
+}
+
+
+// Fetches the density sample at voxel (x, y, z).  Each coordinate is first
+// clamped to [0, nVoxels[axis]-1]; the array is indexed with x varying
+// fastest, then y, then z.
+static inline float D(int x, int y, int z, int nVoxels[3], float density[]) {
+    const int cx = Clamp(x, 0, nVoxels[0]-1);
+    const int cy = Clamp(y, 0, nVoxels[1]-1);
+    const int cz = Clamp(z, 0, nVoxels[2]-1);
+    const int index = cz*nVoxels[0]*nVoxels[1] + cy*nVoxels[0] + cx;
+    return density[index];
+}
+
+
+// Expresses p relative to the box [pMin, pMax]: per axis, pMin maps to 0
+// and pMax maps to 1.
+static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
+    const float3 fromMin = p - pMin;
+    const float3 extent = pMax - pMin;
+    return fromMin / extent;
+}
+
+
+// Returns the density at point Pobj.  Points outside the box [pMin, pMax]
+// yield 0; otherwise the eight voxel samples surrounding the point are
+// blended with trilinear interpolation (out-of-range voxel indices are
+// clamped by D()).
+static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
+                            float density[], int nVoxels[3]) {
+    if (Inside(Pobj, pMin, pMax) == false)
+        return 0;
+
+    // Continuous voxel-space position of the point; the -.5 shift makes the
+    // integer coordinates refer to voxel centers.
+    const float3 rel = Offset(Pobj, pMin, pMax);
+    const float fx = rel.x * nVoxels[0] - .5f;
+    const float fy = rel.y * nVoxels[1] - .5f;
+    const float fz = rel.z * nVoxels[2] - .5f;
+
+    // Integer voxel coordinate (truncated) and the remaining fraction.
+    const int vx = (int)(fx);
+    const int vy = (int)(fy);
+    const int vz = (int)(fz);
+    const float dx = fx - vx;
+    const float dy = fy - vy;
+    const float dz = fz - vz;
+
+    // Interpolate along x on the four edges of the cell...
+    const float d00 = Lerp(dx, D(vx,   vy,   vz,   nVoxels, density),
+                               D(vx+1, vy,   vz,   nVoxels, density));
+    const float d10 = Lerp(dx, D(vx,   vy+1, vz,   nVoxels, density),
+                               D(vx+1, vy+1, vz,   nVoxels, density));
+    const float d01 = Lerp(dx, D(vx,   vy,   vz+1, nVoxels, density),
+                               D(vx+1, vy,   vz+1, nVoxels, density));
+    const float d11 = Lerp(dx, D(vx,   vy+1, vz+1, nVoxels, density),
+                               D(vx+1, vy+1, vz+1, nVoxels, density));
+
+    // ...then along y, and finally along z.
+    const float d0 = Lerp(dy, d00, d10);
+    const float d1 = Lerp(dy, d01, d11);
+    return Lerp(dz, d0, d1);
+}
+
+
+
+// Computes the fraction of light that survives travelling between p0 and
+// p1 through the volume.  A ray from p1 towards p0 is clipped against the
+// box [pMin, pMax]; density is then sampled every 0.2 units along the
+// clipped range, accumulated into tau (scaled by sigma_t), and exp(-tau) is
+// returned.  If the ray misses the box the result is 1.
+static float
+transmittance(float3 p0, float3 p1, float3 pMin,
+              float3 pMax, float sigma_t, float density[], int nVoxels[3]) {
+    Ray ray;
+    ray.origin = p1;
+    ray.dir = p0 - p1;
+
+    // Parametric range of the ray that lies inside the volume.
+    float rayT0, rayT1;
+    if (!IntersectP(ray, pMin, pMax, &rayT0, &rayT1))
+        return 1.;
+
+    // Never start behind the ray origin.
+    rayT0 = std::max(rayT0, 0.f);
+
+    const float rayLength = sqrtf(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
+                                  ray.dir.z * ray.dir.z);
+    const float stepDist = 0.2f;
+    // The step expressed in the ray's parametric units.
+    const float stepT = stepDist / rayLength;
+    const float3 dirStep = ray.dir * stepT;
+
+    // Walk the ray, summing the attenuation contributed by each sample.
+    float tau = 0;
+    float3 pos = ray.origin + ray.dir * rayT0;
+    for (float t = rayT0; t < rayT1; t += stepT) {
+        tau += stepDist * sigma_t * Density(pos, pMin, pMax, density, nVoxels);
+        pos = pos + dirStep;
+    }
+
+    return expf(-tau);
+}
+
+
+// Squared Euclidean distance between points a and b.
+static float
+distanceSquared(float3 a, float3 b) {
+    const float3 delta = a - b;
+    const float xx = delta.x * delta.x;
+    const float yy = delta.y * delta.y;
+    const float zz = delta.z * delta.z;
+    return xx + yy + zz;
+}
+
+
+// Marches 'ray' through the density volume and returns the gamma-corrected
+// radiance seen along it.  The volume occupies a fixed box and is lit by a
+// single point light; both are hard-coded below.  Rays that miss the box
+// return 0.
+static float
+raymarch(float density[], int nVoxels[3], const Ray &ray) {
+    // Fixed scene description: volume bounds and light position.
+    const float3 pMin(.3f, -.2f, .3f);
+    const float3 pMax(1.8f, 2.3f, 1.8f);
+    const float3 lightPos(-1.f, 4.f, 1.5f);
+
+    float rayT0, rayT1;
+    if (!IntersectP(ray, pMin, pMax, &rayT0, &rayT1))
+        return 0.;
+
+    // Never start behind the ray origin.
+    rayT0 = std::max(rayT0, 0.f);
+
+    // Scattering properties of the medium and the sampling rate.
+    const float Le = .25f;             // Emission coefficient
+    const float sigma_a = 10;          // Absorption coefficient
+    const float sigma_s = 10;          // Scattering coefficient
+    const float stepDist = 0.025f;     // Ray step amount
+    const float lightIntensity = 40;   // Light source intensity
+
+    const float rayLength = sqrtf(ray.dir.x * ray.dir.x + ray.dir.y * ray.dir.y +
+                                  ray.dir.z * ray.dir.z);
+    // The step expressed in the ray's parametric units.
+    const float stepT = stepDist / rayLength;
+    const float3 dirStep = ray.dir * stepT;
+
+    float tau = 0.f;  // accumulated beam transmittance
+    float L = 0;      // radiance along the ray
+    float3 pos = ray.origin + ray.dir * rayT0;
+    for (float t = rayT0; t < rayT1; t += stepT) {
+        const float d = Density(pos, pMin, pMax, density, nVoxels);
+
+        // Stop marching as soon as almost nothing gets through any more.
+        const float atten = expf(-tau);
+        if (atten < .005f)
+            break;
+
+        // Light arriving directly from the point light at this sample.
+        const float Li = lightIntensity / distanceSquared(lightPos, pos) *
+            transmittance(lightPos, pos, pMin, pMax, sigma_a + sigma_s,
+                          density, nVoxels);
+        L += stepDist * atten * d * sigma_s * (Li + Le);
+
+        // Account for the attenuation added by this segment.
+        tau += stepDist * (sigma_a + sigma_s) * d;
+
+        pos = pos + dirStep;
+    }
+
+    // Gamma correction
+    return powf(L, 1.f / 2.2f);
+}
+
+
+// Serial renderer: for every pixel of the width x height image, builds the
+// camera ray through that pixel and stores the raymarched value in 'image'
+// (row-major, one float per pixel).
+void
+volume_serial(float density[], int nVoxels[3], const float raster2camera[4][4],
+              const float camera2world[4][4],
+              int width, int height, float image[]) {
+    for (int row = 0; row < height; ++row) {
+        for (int col = 0; col < width; ++col) {
+            Ray ray;
+            generateRay(raster2camera, camera2world, (float)col, (float)row, ray);
+            image[row * width + col] = raymarch(density, nVoxels, ray);
+        }
+    }
+}
--- a/expr.cpp
+++ b/expr.cpp
--- a/expr.h
+++ b/expr.h
@@ -39,6 +39,7 @@
 #define ISPC_EXPR_H 1

 #include "ispc.h"
+#include "type.h"

 class FunctionSymbolExpr;

@@ -96,7 +97,7 @@ public:
        that incorporates the given error message string.  In either
        failure case, NULL is returned.  */
    Expr *TypeConv(const Type *type, const char *errorMsgBase = NULL, 
-                   bool failureOk = false);
+                   bool failureOk = false, bool issuePrecisionWarnings = true);
 };


@@ -120,8 +121,8 @@ public:
    void Print() const;
    Expr *Optimize();
    Expr *TypeCheck();
+    int EstimateCost() const;

-private:
    const Op op;
    Expr *expr;
 };
@@ -163,8 +164,8 @@ public:

    Expr *Optimize();
    Expr *TypeCheck();
+    int EstimateCost() const;

-private:
    const Op op;
    Expr *arg0, *arg1;
 };
@@ -195,8 +196,8 @@ public:

    Expr *Optimize();
    Expr *TypeCheck();
+    int EstimateCost() const;

-private:
    const Op op;
    Expr *lvalue, *rvalue;
 };
@@ -216,8 +217,8 @@ public:

    Expr *Optimize();
    Expr *TypeCheck();
+    int EstimateCost() const;

-private:
    Expr *test, *expr1, *expr2;
 };

@@ -239,6 +240,7 @@ public:
    llvm::Constant *GetConstant(const Type *type) const;
    ExprList *Optimize();
    ExprList *TypeCheck();
+    int EstimateCost() const;

    std::vector<Expr *> exprs;
 };
@@ -256,12 +258,13 @@ public:

    Expr *Optimize();
    Expr *TypeCheck();
+    int EstimateCost() const;

-private:
    Expr *func;
    ExprList *args;
    bool isLaunch;

+private:
    void resolveFunctionOverloads();
    bool tryResolve(bool (*matchFunc)(Expr *, const Type *));
 };
@@ -284,16 +287,21 @@ public:

    Expr *Optimize();
    Expr *TypeCheck();
+    int EstimateCost() const;

-private:
    Expr *arrayOrVector, *index;
 };


 /** @brief Expression representing member selection ("foo.bar").
+ *
+ *  This will also be overloaded to deal with swizzles.
 */
 class MemberExpr : public Expr {
 public:
+    static MemberExpr* create(Expr *expr, const char *identifier,
+                              SourcePos pos, SourcePos identifierPos);
+
    MemberExpr(Expr *expr, const char *identifier, SourcePos pos, 
               SourcePos identifierPos);

@@ -304,10 +312,11 @@ public:
    void Print() const;
    Expr *Optimize();
    Expr *TypeCheck();
+    int EstimateCost() const;
+
+    virtual int getElementNumber() const;

-private:
    std::string getCandidateNearMatches() const;
-    int getElementNumber() const;

    Expr *expr;
    std::string identifier;
@@ -318,12 +327,30 @@ private:
 /** @brief Expression representing a compile-time constant value.  

    This class can currently represent compile-time constants of anything
-    that is an AtomicType; for anything more complex, we don't currently
-    have a representation of a compile-time constant that can be further
-    reasoned about.
+    that is an AtomicType or an EnumType; for anything more complex, we
+    don't currently have a representation of a compile-time constant that
+    can be further reasoned about.
 */
 class ConstExpr : public Expr {
 public:
+    /** Create a ConstExpr from a uniform int8 value */
+    ConstExpr(const Type *t, int8_t i, SourcePos p);
+    /** Create a ConstExpr from a varying int8 value */
+    ConstExpr(const Type *t, int8_t *i, SourcePos p);
+    /** Create a ConstExpr from a uniform uint8 value */
+    ConstExpr(const Type *t, uint8_t u, SourcePos p);
+    /** Create a ConstExpr from a varying uint8 value */
+    ConstExpr(const Type *t, uint8_t *u, SourcePos p);
+
+    /** Create a ConstExpr from a uniform int16 value */
+    ConstExpr(const Type *t, int16_t i, SourcePos p);
+    /** Create a ConstExpr from a varying int16 value */
+    ConstExpr(const Type *t, int16_t *i, SourcePos p);
+    /** Create a ConstExpr from a uniform uint16 value */
+    ConstExpr(const Type *t, uint16_t u, SourcePos p);
+    /** Create a ConstExpr from a varying uint16 value */
+    ConstExpr(const Type *t, uint16_t *u, SourcePos p);
+
    /** Create a ConstExpr from a uniform int32 value */
    ConstExpr(const Type *t, int32_t i, SourcePos p);
    /** Create a ConstExpr from a varying int32 value */
@@ -332,14 +359,17 @@ public:
    ConstExpr(const Type *t, uint32_t u, SourcePos p);
    /** Create a ConstExpr from a varying uint32 value */
    ConstExpr(const Type *t, uint32_t *u, SourcePos p);
+
    /** Create a ConstExpr from a uniform float value */
    ConstExpr(const Type *t, float f, SourcePos p);
    /** Create a ConstExpr from a varying float value */
    ConstExpr(const Type *t, float *f, SourcePos p);
+
    /** Create a ConstExpr from a uniform double value */
    ConstExpr(const Type *t, double d, SourcePos p);
    /** Create a ConstExpr from a varying double value */
    ConstExpr(const Type *t, double *d, SourcePos p);
+
    /** Create a ConstExpr from a uniform int64 value */
    ConstExpr(const Type *t, int64_t i, SourcePos p);
    /** Create a ConstExpr from a varying int64 value */
@@ -348,10 +378,12 @@ public:
    ConstExpr(const Type *t, uint64_t i, SourcePos p);
    /** Create a ConstExpr from a varying uint64 value */
    ConstExpr(const Type *t, uint64_t *i, SourcePos p);
+
    /** Create a ConstExpr from a uniform bool value */
    ConstExpr(const Type *t, bool b, SourcePos p);
    /** Create a ConstExpr from a varying bool value */
    ConstExpr(const Type *t, bool *b, SourcePos p);
+
    /** Create a ConstExpr of the same type as the given old ConstExpr,
        with values given by the "vales" parameter. */
    ConstExpr(ConstExpr *old, double *values);
@@ -363,6 +395,7 @@ public:

    Expr *TypeCheck();
    Expr *Optimize();
+    int EstimateCost() const;

    /** Return the ConstExpr's values as booleans, doing type conversion
        from the actual type if needed.  If forceVarying is true, then type
@@ -370,6 +403,30 @@ public:
        equal to the target vector width into the given pointer. */
    int AsBool(bool *, bool forceVarying = false) const;

+    /** Return the ConstExpr's values as int8s, doing type conversion
+        from the actual type if needed.  If forceVarying is true, then type
+        convert to 'varying' so as to always return a number of values
+        equal to the target vector width into the given pointer. */
+    int AsInt8(int8_t *, bool forceVarying = false) const;
+
+    /** Return the ConstExpr's values as uint8s, doing type conversion
+        from the actual type if needed.  If forceVarying is true, then type
+        convert to 'varying' so as to always return a number of values
+        equal to the target vector width into the given pointer. */
+    int AsUInt8(uint8_t *, bool forceVarying = false) const;
+
+    /** Return the ConstExpr's values as int16s, doing type conversion
+        from the actual type if needed.  If forceVarying is true, then type
+        convert to 'varying' so as to always return a number of values
+        equal to the target vector width into the given pointer. */
+    int AsInt16(int16_t *, bool forceVarying = false) const;
+
+    /** Return the ConstExpr's values as uint16s, doing type conversion
+        from the actual type if needed.  If forceVarying is true, then type
+        convert to 'varying' so as to always return a number of values
+        equal to the target vector width into the given pointer. */
+    int AsUInt16(uint16_t *, bool forceVarying = false) const;
+
    /** Return the ConstExpr's values as int32s, doing type conversion
        from the actual type if needed.  If forceVarying is true, then type
        convert to 'varying' so as to always return a number of values
@@ -412,8 +469,14 @@ public:
    int Count() const;

 private:
-    const AtomicType *type;
+    AtomicType::BasicType getBasicType() const;
+
+    const Type *type;
    union {
+        int8_t int8Val[ISPC_MAX_NVEC];
+        uint8_t uint8Val[ISPC_MAX_NVEC];
+        int16_t int16Val[ISPC_MAX_NVEC];
+        uint16_t uint16Val[ISPC_MAX_NVEC];
        int32_t int32Val[ISPC_MAX_NVEC];
        uint32_t uint32Val[ISPC_MAX_NVEC];
        bool boolVal[ISPC_MAX_NVEC];
@@ -436,8 +499,8 @@ public:
    void Print() const;
    Expr *TypeCheck();
    Expr *Optimize();
+    int EstimateCost() const;

-private:
    const Type *type;
    Expr *expr;
 };
@@ -455,8 +518,8 @@ public:
    void Print() const;
    Expr *TypeCheck();
    Expr *Optimize();
+    int EstimateCost() const;

-private:
    Expr *expr;
 };

@@ -474,8 +537,8 @@ public:
    void Print() const;
    Expr *TypeCheck();
    Expr *Optimize();
+    int EstimateCost() const;

-private:
    Expr *expr;
 };

@@ -492,6 +555,7 @@ public:
    Expr *TypeCheck();
    Expr *Optimize();
    void Print() const;
+    int EstimateCost() const;

 private:
    Symbol *symbol;
@@ -512,6 +576,7 @@ public:
    Expr *TypeCheck();
    Expr *Optimize();
    void Print() const;
+    int EstimateCost() const;

 private:
    friend class FunctionCallExpr;
@@ -538,6 +603,7 @@ public:
    Expr *TypeCheck();
    Expr *Optimize();
    void Print() const;
+    int EstimateCost() const;
 };

 #endif // ISPC_EXPR_H
--- a/failing_tests/masked-scatter-vector.ispc
+++ b/failing_tests/masked-scatter-vector.ispc
@@ -14,7 +14,7 @@ export void f_fu(uniform float ret[], uniform float aa[], uniform float b) {
    varying int3 vv = array[a];
    ++vv.y;
    array[a] = vv;
-    print("fin %\n", array[programIndex].y);
+//CO    print("fin %\n", array[programIndex].y);
    ret[programIndex] = array[programIndex].y;
 }

--- a/failing_tests/max-uint-1.ispc
+++ b/failing_tests/max-uint-1.ispc
@@ -1,19 +1,14 @@
-static float float4(uniform float a, uniform float b, uniform float c, 
-                    uniform float d) {
-    float ret = 0;
-    for (uniform int i = 0; i < programCount; i += 4) {
-        ret = insert(ret, i + 0, a);
-        ret = insert(ret, i + 1, b);
-        ret = insert(ret, i + 2, c);
-        ret = insert(ret, i + 3, d);
-    }
-    return ret;
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float r[], uniform float a[]) {
+    unsigned int i = (unsigned int)a[programIndex];
+    r[programIndex] = max((unsigned int)2, i);
 }

-export float f_f(float a) {
-    unsigned int i = (unsigned int)a;
-    return max((unsigned int)2, i);
+export void result(uniform float r[]) { 
+    r[programIndex] = 1+programIndex;
+    r[0] = 2;
 }

-export float result() { return float4(2,2,3,4); }

--- a/failing_tests/max-uint.ispc
+++ b/failing_tests/max-uint.ispc
@@ -1,8 +1,10 @@

-export float f_f(float a) {
-    unsigned int i = (unsigned int)a;
-    return max((unsigned int)10, i);
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float result[], uniform float aa[]) {
+    unsigned int i = (unsigned int)aa[programIndex];
+    result[programIndex] = max((unsigned int)100, i);
 }

-export float result() { return 10; }
+export void result(uniform float r[]) { r[programIndex] = 100; }

--- a/failing_tests/min-uint-1.ispc
+++ b/failing_tests/min-uint-1.ispc
@@ -1,19 +1,14 @@
-static float float4(uniform float a, uniform float b, uniform float c, 
-                    uniform float d) {
-    float ret = 0;
-    for (uniform int i = 0; i < programCount; i += 4) {
-        ret = insert(ret, i + 0, a);
-        ret = insert(ret, i + 1, b);
-        ret = insert(ret, i + 2, c);
-        ret = insert(ret, i + 3, d);
-    }
-    return ret;
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float result[], uniform float aa[]) {
+    unsigned int i = (unsigned int)aa[programIndex];
+    result[programIndex] = min((unsigned int)2, i);
 }

-export float f_f(float a) {
-    unsigned int i = (unsigned int)a;
-    return min((unsigned int)2, i);
+export void result(uniform float r[]) { 
+    r[programIndex] = 2;
+    r[0] = 1;
 }

-export float result() { return float4(1,2,2,2); }

--- a/failing_tests/min-uint-2.ispc
+++ b/failing_tests/min-uint-2.ispc
@@ -1,19 +1,13 @@
-static float float4(uniform float a, uniform float b, uniform float c, 
-                    uniform float d) {
-    float ret = 0;
-    for (uniform int i = 0; i < programCount; i += 4) {
-        ret = insert(ret, i + 0, a);
-        ret = insert(ret, i + 1, b);
-        ret = insert(ret, i + 2, c);
-        ret = insert(ret, i + 3, d);
-    }
-    return ret;
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float r[], uniform float a[]) {
+    unsigned int i = (unsigned int)a[programIndex];
+    r[programIndex] =  min((unsigned int)20, i);
 }

-export float f_f(float a) {
-    unsigned int i = (unsigned int)a;
-    return min((unsigned int)20, i);
+export void result(uniform float r[]) { 
+    r[programIndex] = 1+programIndex;
 }

-export float result() { return float4(1,2,3,4); }

--- a/failing_tests/shuffle2-10.ispc
+++ b/failing_tests/shuffle2-10.ispc
@@ -0,0 +1,16 @@
+
+/* failing due to llvm bug http://llvm.org/bugs/show_bug.cgi?id=10421 */
+
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int8 aa = aFOO[programIndex]; 
+    int8 bb = aa + programCount;
+    int8 shuf = shuffle(aa, bb, 2*programIndex+(int)b-5);
+//CO    print("%\n%\n%\n%\n", aa, bb, 2*programIndex+(int)b-5, shuf);
+    RET[programIndex] = shuf;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + 2*programIndex;
+}
--- a/failing_tests/struct-array-assign.ispc
+++ b/failing_tests/struct-array-assign.ispc
@@ -1,11 +0,0 @@
-
-struct Foo {
-    float f;
-};
-
-
-export float foo(Foo f[], int i, uniform int j) {
-    Foo x = f[i];
-    return x.f;
-}
-
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -42,14 +42,25 @@
 #ifdef ISPC_IS_WINDOWS
 #include <windows.h>
 #include <direct.h>
+#define strcasecmp stricmp
 #endif
 #include <llvm/LLVMContext.h>
 #include <llvm/Module.h>
-#ifndef LLVM_2_8
 #include <llvm/Analysis/DIBuilder.h>
-#endif
 #include <llvm/Analysis/DebugInfo.h>
 #include <llvm/Support/Dwarf.h>
+#include <llvm/Target/TargetMachine.h>
+#include <llvm/Target/TargetOptions.h>
+#include <llvm/Target/TargetData.h>
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+  #include <llvm/Support/TargetRegistry.h>
+  #include <llvm/Support/TargetSelect.h>
+#else
+  #include <llvm/Target/TargetRegistry.h>
+  #include <llvm/Target/TargetSelect.h>
+  #include <llvm/Target/SubtargetFeature.h>
+#endif
+#include <llvm/Support/Host.h>

 Globals *g;
 Module *m;
@@ -57,20 +68,196 @@ Module *m;
 ///////////////////////////////////////////////////////////////////////////
 // Target

-Target::Target() {
-    arch = "x86-64";
-    cpu = "nehalem";
-    isa = SSE4;
-    nativeVectorWidth = 4;
-    vectorWidth = 4;
+// Fills in *t from the given arch/cpu/isa names, choosing a default for any
+// that is NULL.  Returns false if the architecture or ISA is not recognized.
+bool
+Target::GetTarget(const char *arch, const char *cpu, const char *isa,
+                  bool pic, Target *t) {
+    std::string hostCPU;  // function scope: cpu may point into this string
+    if (cpu == NULL) {
+        hostCPU = llvm::sys::getHostCPUName();
+        if (hostCPU.size() > 0)
+            cpu = hostCPU.c_str();
+        else {
+            fprintf(stderr, "Warning: unable to determine host CPU!\n");
+            cpu = "generic";
+        }
+    }
+    t->cpu = cpu;
+
+    if (isa == NULL) {
+        if (!strcasecmp(cpu, "atom"))
+            isa = "sse2";
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+        else if (!strcasecmp(cpu, "sandybridge") ||
+                 !strcasecmp(cpu, "corei7-avx"))
+            isa = "avx";
+#endif // LLVM_3_0
+        else
+            isa = "sse4";
+    }
+    if (arch == NULL)
+        arch = "x86-64";
+
+    bool error = false;
+    t->generatePIC = pic;
+
+    // Make sure the target architecture is a known one; print an error
+    // with the valid ones otherwise.
+    t->target = NULL;
+    for (llvm::TargetRegistry::iterator iter = llvm::TargetRegistry::begin();
+         iter != llvm::TargetRegistry::end(); ++iter) {
+        if (std::string(arch) == iter->getName()) {
+            t->target = &*iter;
+            break;
+        }
+    }
+    if (t->target == NULL) {
+        fprintf(stderr, "Invalid architecture \"%s\"\nOptions: ", arch);
+        llvm::TargetRegistry::iterator iter;
+        for (iter = llvm::TargetRegistry::begin();
+             iter != llvm::TargetRegistry::end(); ++iter)
+            fprintf(stderr, "%s ", iter->getName());
+        fprintf(stderr, "\n");
+        error = true;
+    }
+    else
+        t->arch = arch;
+
+    if (!strcasecmp(isa, "sse2")) {
+        t->isa = Target::SSE2;
+        t->nativeVectorWidth = 4;
+        t->vectorWidth = 4;
+        t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
+    }
+    else if (!strcasecmp(isa, "sse4")) {
+        t->isa = Target::SSE4;
+        t->nativeVectorWidth = 4;
+        t->vectorWidth = 4;
+        t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
+    }
+    else if (!strcasecmp(isa, "sse4x2")) {
+        t->isa = Target::SSE4;
+        t->nativeVectorWidth = 4;
+        t->vectorWidth = 8;
+        t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
+    }
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+    else if (!strcasecmp(isa, "avx")) {
+        t->isa = Target::AVX;
+        t->nativeVectorWidth = 8;
+        t->vectorWidth = 8;
+        t->attributes = "+avx,+popcnt,+cmov";
+    }
+    else if (!strcasecmp(isa, "avx-x2")) {
+        t->isa = Target::AVX;
+        t->nativeVectorWidth = 8;
+        t->vectorWidth = 16;
+        t->attributes = "+avx,+popcnt,+cmov";
+    }
+#endif // LLVM 3.0
+    else {
+        fprintf(stderr, "Target ISA \"%s\" is unknown.  Choices are: %s\n",
+                isa, SupportedTargetISAs());
+        error = true;
+    }
+
+    if (!error) {
+        llvm::TargetMachine *targetMachine = t->GetTargetMachine();
+        const llvm::TargetData *targetData = targetMachine->getTargetData();
+        t->is32bit = (targetData->getPointerSize() == 4);
+    }
+    return !error;
 }

+
+const char *
+Target::SupportedTargetCPUs() {
+    return "atom, barcelona, core2, corei7, "
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+        "corei7-avx, "
+#endif // LLVM_3_0 || LLVM_3_0svn
+        "istanbul, nocona, penryn, "
+#ifdef LLVM_2_9
+        "sandybridge, "
+#endif // LLVM_2_9
+        "westmere";
+}
+
+
+const char *
+Target::SupportedTargetArchs() {
+    return "x86, x86-64";
+}
+
+
+const char *
+Target::SupportedTargetISAs() {
+    return "sse2, sse4, sse4x2"
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+        ", avx, avx-x2"
+#endif // LLVM_3_0 || LLVM_3_0svn
+        ;
+}
+
+
+std::string
+Target::GetTripleString() const {
+    llvm::Triple triple;
+    // Start with the host triple as the default
+    triple.setTriple(llvm::sys::getHostTriple());
+
+    // And override the arch in the host triple based on what the user
+    // specified.  Here we need to deal with the fact that LLVM uses one
+    // naming convention for targets in the TargetRegistry, but wants
+    // slightly different ones for the triple.  TODO: is there a way to
+    // have it do this remapping, which would presumably be a bit less
+    // error prone?
+    if (arch == "x86")
+        triple.setArchName("i386");
+    else if (arch == "x86-64")
+        triple.setArchName("x86_64");
+    else
+        triple.setArchName(arch);
+
+    return triple.str();
+}
+
+
+llvm::TargetMachine *
+Target::GetTargetMachine() const {
+    std::string triple = GetTripleString();
+    llvm::Reloc::Model relocModel = generatePIC ? llvm::Reloc::PIC_ :
+                                                  llvm::Reloc::Default;
+#if defined(LLVM_3_0svn) || defined(LLVM_3_0)
+    std::string featuresString = attributes;
+    llvm::TargetMachine *targetMachine =
+        target->createTargetMachine(triple, cpu, featuresString, relocModel);
+#else
+#ifdef ISPC_IS_APPLE
+    relocModel = llvm::Reloc::PIC_;
+#endif // ISPC_IS_APPLE
+    std::string featuresString = cpu + std::string(",") + attributes;
+    llvm::TargetMachine *targetMachine =
+        target->createTargetMachine(triple, featuresString);
+    assert(targetMachine != NULL);  // must precede the dereference below
+    targetMachine->setRelocationModel(relocModel);
+#endif
+    assert(targetMachine != NULL);
+
+    targetMachine->setAsmVerbosityDefault(true);
+    return targetMachine;
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 // Opt

 Opt::Opt() {
    level = 1;
    fastMath = false;
+    fastMaskedVload = false;
+    unrollLoops = true;
    disableBlendedMaskedStores = false;
    disableCoherentControlFlow = false;
    disableUniformControlFlow = false;
@@ -120,13 +307,9 @@ SourcePos::SourcePos(const char *n, int l, int c) {
 }

 llvm::DIFile SourcePos::GetDIFile() const {
-#ifdef LLVM_2_8
-    return llvm::DIFile();
-#else
    std::string directory, filename;
    GetDirectoryAndFileName(g->currentDirectory, name, &directory, &filename);
    return m->diBuilder->createFile(filename, directory);
-#endif // LLVM_2_8
 }


@@ -135,3 +318,14 @@ SourcePos::Print() const {
    printf(" @ [%s:%d.%d - %d.%d] ", name, first_line, first_column,
           last_line, last_column); 
 }
+
+
+bool
+SourcePos::operator==(const SourcePos &p2) const {
+    // Equal only if both name the same file and cover the same span.
+    if (strcmp(name, p2.name) != 0)
+        return false;
+    return first_line == p2.first_line && last_line == p2.last_line &&
+           first_column == p2.first_column && last_column == p2.last_column;
+}
+
--- a/ispc.h
+++ b/ispc.h
@@ -69,10 +69,19 @@ namespace llvm {
    class FunctionType;
    class LLVMContext;
    class Module;
+    class Target;
+    class TargetMachine;
    class Type;
    class Value;
 }

+// llvm::Type *s are no longer const in llvm 3.0
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+#define LLVM_TYPE_CONST
+#else
+#define LLVM_TYPE_CONST const
+#endif
+
 class ArrayType;
 class AtomicType;
 class DeclSpecs;
@@ -110,6 +119,8 @@ struct SourcePos {

    /** Returns a LLVM DIFile object that represents the SourcePos's file */
    llvm::DIFile GetDIFile() const;
+
+    bool operator==(const SourcePos &p2) const;
 };


@@ -137,6 +148,8 @@ public:
        pointer in place of the original ASTNode *. */
    virtual ASTNode *TypeCheck() = 0;

+    virtual int EstimateCost() const = 0;
+
    /** All AST nodes must track the file position where they are
        defined. */
    const SourcePos pos;
@@ -147,9 +160,36 @@ public:
    This structure defines a compilation target for the ispc compiler.
 */
 struct Target {
-    Target();
+    /** Initializes the given Target from the given architecture, CPU, and
+        ISA names; a NULL name selects a default.  Returns true on success
+        and false if the architecture or ISA name is unknown. */
+    static bool GetTarget(const char *arch, const char *cpu, const char *isa,
+                          bool pic, Target *);

-    /** Enumerant giving the instruction sets that the compiler can
+    /** Returns a comma-delimited string giving the names of the currently
+        supported target ISAs. */
+    static const char *SupportedTargetISAs();
+
+    /** Returns a comma-delimited string giving the names of the currently
+        supported target CPUs. */
+    static const char *SupportedTargetCPUs();
+
+    /** Returns a comma-delimited string giving the names of the currently
+        supported target architectures. */
+    static const char *SupportedTargetArchs();
+
+    /** Returns a triple string specifying the target architecture, vendor,
+        and environment. */
+    std::string GetTripleString() const;
+
+    /** Returns the LLVM TargetMachine object corresponding to this
+        target. */
+    llvm::TargetMachine *GetTargetMachine() const;
+
+    /** llvm Target object representing this target. */
+    const llvm::Target *target;
+
+    /** Enumerator giving the instruction sets that the compiler can
        target. */
    enum ISA { SSE2, SSE4, AVX };

@@ -159,9 +199,15 @@ struct Target {
    /** Target system architecture.  (e.g. "x86-64", "x86"). */
    std::string arch;

+    /** Is the target architecture 32 or 64 bit */
+    bool is32bit;
+
    /** Target CPU. (e.g. "corei7", "corei7-avx", ..) */
    std::string cpu;

+    /** Target-specific attributes to pass along to the LLVM backend */
+    std::string attributes;
+
    /** Native vector width of the vector instruction set.  Note that this
        value is directly derived from the ISA Being used (e.g. it's 4 for
        SSE, 8 for AVX, etc.) */
@@ -171,8 +217,12 @@ struct Target {
        integer multiple of the native vector width, for example if we're
        "doubling up" and compiling 8-wide on a 4-wide SSE system. */
    int vectorWidth;
+
+    /** Indicates whether position independent code should be generated. */
+    bool generatePIC;
 };

+
 /** @brief Structure that collects optimization options

    This structure collects all of the options related to optimization of
@@ -190,6 +240,16 @@ struct Opt {
        should be performed.  This is false by default. */
    bool fastMath;

+    /** Indicates whether a vector load should be issued for masked loads
+        on platforms that don't have a native masked vector load.  (This may
+        lead to accessing memory up to programCount-1 elements past the end of
+        arrays, so is unsafe in general.) */
+    bool fastMaskedVload;
+
+    /** Indicates whether loops should be unrolled (when doing so seems like
+        it will make sense). */
+    bool unrollLoops;
+
    /** On targets that don't have a masked store instruction but do have a
        blending instruction, by default, we simulate masked stores by
        loading the old value, blending, and storing the result.  This can
@@ -307,6 +367,29 @@ struct Globals {
    std::vector<std::string> cppArgs;
 };

+enum {
+    COST_ASSIGN = 1,
+    COST_COHERENT_BREAK_CONTINE = 4,
+    COST_COMPLEX_ARITH_OP = 4,
+    COST_DEREF = 4,
+    COST_FUNCALL = 4,
+    COST_GATHER = 8,
+    COST_LOAD = 2,
+    COST_REGULAR_BREAK_CONTINUE = 2,
+    COST_RETURN = 4,
+    COST_SELECT = 4,
+    COST_SIMPLE_ARITH_LOGIC_OP = 1,
+    COST_SYNC = 32,
+    COST_TASK_LAUNCH = 16,
+    COST_TYPECAST_COMPLEX = 4,
+    COST_TYPECAST_SIMPLE = 1,
+    COST_UNIFORM_LOOP = 4,
+    COST_VARYING_LOOP = 6,
+
+    CHECK_MASK_AT_FUNCTION_START_COST = 16,
+    PREDICATE_SAFE_IF_STATEMENT_COST = 6,
+};
+
 extern Globals *g;
 extern Module *m;

--- a/ispc.vcxproj
+++ b/ispc.vcxproj
@@ -16,7 +16,9 @@
    <ClCompile Include="decl.cpp" />
    <ClCompile Include="expr.cpp" />
    <ClCompile Include="gen-bitcode-avx.cpp" />
-    <ClCompile Include="gen-bitcode-c.cpp" />
+    <ClCompile Include="gen-bitcode-avx-x2.cpp" />
+    <ClCompile Include="gen-bitcode-c-32.cpp" />
+    <ClCompile Include="gen-bitcode-c-64.cpp" />
    <ClCompile Include="gen-bitcode-sse2.cpp" />
    <ClCompile Include="gen-bitcode-sse4.cpp" />
    <ClCompile Include="gen-bitcode-sse4x2.cpp" />
@@ -28,11 +30,11 @@
    <ClCompile Include="main.cpp" />
    <ClCompile Include="opt.cpp" />
    <ClCompile Include="parse.cc" />
-    <CustomBuild Include="stdlib-c.c">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -emit-llvm stdlib-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py stdlib-c.c &gt; gen-bitcode-c.cpp</Command>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clang stdlib-c.c</Message>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -emit-llvm stdlib-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py stdlib-c.c &gt; gen-bitcode-c.cpp</Command>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clang stdlib-c.c</Message>
+    <CustomBuild Include="builtins-c.c">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c &gt; gen-bitcode-c-32.cpp</Command>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clang builtins-c.c</Message>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c &gt; gen-bitcode-c-32.cpp</Command>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clang builtins-c.c</Message>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-c.cpp</Outputs>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-c.cpp</Outputs>
    </CustomBuild>
@@ -59,66 +61,79 @@
  <ItemGroup>
    <CustomBuild Include="stdlib.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">cl /EP /TP %(Filename).ispc /DISPC=1 /DPI=3.1415926535 | python stdlib2cpp.py &gt; gen-stdlib.cpp</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py &gt; gen-stdlib.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-stdlib.cpp</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">cl /EP /TP %(Filename).ispc /DISPC=1 /DPI=3.1415926535 | python stdlib2cpp.py &gt; gen-stdlib.cpp</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py &gt; gen-stdlib.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-stdlib.cpp</Outputs>
      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-stdlib.cpp</Message>
      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-stdlib.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
-    <CustomBuild Include="stdlib-sse4.ll">
+    <CustomBuild Include="builtins-sse4.ll">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 stdlib.m4 stdlib-sse4.ll | python bitcode2cpp.py stdlib-sse4.ll &gt; gen-bitcode-sse4.cpp</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll &gt; gen-bitcode-sse4.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 stdlib.m4 stdlib-sse4.ll | python bitcode2cpp.py stdlib-sse4.ll &gt; gen-bitcode-sse4.cpp</Command>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll &gt; gen-bitcode-sse4.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4.cpp</Message>
      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
-    <CustomBuild Include="stdlib-sse4x2.ll">
+    <CustomBuild Include="builtins-sse4x2.ll">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 stdlib.m4 stdlib-sse4x2.ll | python bitcode2cpp.py stdlib-sse4x2.ll &gt; gen-bitcode-sse4x2.cpp</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4x2.ll | python bitcode2cpp.py builtins-sse4x2.ll &gt; gen-bitcode-sse4x2.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4x2.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 stdlib.m4 stdlib-sse4x2.ll | python bitcode2cpp.py stdlib-sse4x2.ll &gt; gen-bitcode-sse4x2.cpp</Command>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4x2.ll | python bitcode2cpp.py builtins-sse4x2.ll &gt; gen-bitcode-sse4x2.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4x2.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4x2.cpp</Message>
      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4x2.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
-    <CustomBuild Include="stdlib-sse2.ll">
+    <CustomBuild Include="builtins-sse2.ll">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 stdlib.m4 stdlib-sse2.ll | python bitcode2cpp.py stdlib-sse2.ll &gt; gen-bitcode-sse2.cpp</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll &gt; gen-bitcode-sse2.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse2.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 stdlib.m4 stdlib-sse2.ll | python bitcode2cpp.py stdlib-sse2.ll &gt; gen-bitcode-sse2.cpp</Command>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll &gt; gen-bitcode-sse2.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse2.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse2.cpp</Message>
      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse2.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
-    <CustomBuild Include="stdlib-avx.ll">
+    <CustomBuild Include="builtins-avx.ll">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 stdlib.m4 stdlib-avx.ll | python bitcode2cpp.py stdlib-avx.ll &gt; gen-bitcode-avx.cpp</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll &gt; gen-bitcode-avx.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 stdlib.m4 stdlib-avx.ll | python bitcode2cpp.py stdlib-avx.ll &gt; gen-bitcode-avx.cpp</Command>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll &gt; gen-bitcode-avx.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">stdlib.m4;stdlib-sse.ll</AdditionalInputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx.cpp</Message>
      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx.cpp</Message>
    </CustomBuild>
  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="builtins-avx-x2.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-avx-x2.ll | python bitcode2cpp.py builtins-avx-x2.ll &gt; gen-bitcode-avx-x2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx-x2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-avx-x2.ll | python bitcode2cpp.py builtins-avx-x2.ll &gt; gen-bitcode-avx-x2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx-x2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx-x2.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx-x2.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="lex.ll">
      <FileType>Document</FileType>
@@ -187,7 +202,7 @@
      <SubSystem>Console</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
-      <AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
@@ -207,7 +222,7 @@
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
      <AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
-      <AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
--- a/ispc_test.cpp
+++ b/ispc_test.cpp
@@ -33,8 +33,25 @@

 #define _CRT_SECURE_NO_WARNINGS

+#if defined(_WIN32) || defined(_WIN64)
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#ifdef ISPC_IS_WINDOWS
+#define NOMINMAX
+#include <windows.h>
+#endif
 #include <stdio.h>
 #include <stdint.h>
+#include <stdlib.h>
+#include <memory.h>
+#ifdef ISPC_IS_LINUX
+#include <malloc.h>
+#endif

 #ifdef ISPC_HAVE_SVML
 #include <xmmintrin.h>
@@ -57,8 +74,15 @@ extern "C" {
 #include <llvm/DerivedTypes.h>
 #include <llvm/Instructions.h>
 #include <llvm/ExecutionEngine/ExecutionEngine.h>
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+  #include <llvm/ExecutionEngine/MCJIT.h>
+  #include <llvm/Support/TargetRegistry.h>
+  #include <llvm/Support/TargetSelect.h>
+#else
+  #include <llvm/Target/TargetRegistry.h>
+  #include <llvm/Target/TargetSelect.h>
+#endif
 #include <llvm/ExecutionEngine/JIT.h>
-#include <llvm/Target/TargetSelect.h>
 #include <llvm/Target/TargetOptions.h>
 #include <llvm/Target/TargetData.h>
 #include <llvm/Transforms/Scalar.h>
@@ -70,13 +94,15 @@ extern "C" {
 #include <llvm/Support/raw_ostream.h>
 #include <llvm/Bitcode/ReaderWriter.h>
 #include <llvm/Support/MemoryBuffer.h>
-#ifndef LLVM_2_8
 #include <llvm/Support/system_error.h>
-#endif
+
+bool shouldFail = false;

 extern "C" { 
    void ISPCLaunch(void *, void *);
    void ISPCSync();
+    void *ISPCMalloc(int64_t size, int32_t alignment);
+    void ISPCFree(void *ptr);
 }

 void ISPCLaunch(void *func, void *data) {
@@ -89,9 +115,41 @@ void ISPCLaunch(void *func, void *data) {
 void ISPCSync() {
 }

+
+void *ISPCMalloc(int64_t size, int32_t alignment) {
+#if defined(ISPC_IS_WINDOWS)
+    return _aligned_malloc(size, alignment);
+#elif defined(ISPC_IS_LINUX)
+    return memalign(alignment, size);
+#elif defined(ISPC_IS_APPLE)
+    // The adjustment below is in [1, alignment], so reserve that much slack.
+    void *mem = malloc(size + alignment + sizeof(void*));
+    if (mem == NULL) return NULL;
+    char *amem = ((char*)mem) + sizeof(void*);
+    amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
+                                        (alignment - 1)));
+    ((void**)amem)[-1] = mem;  // ISPCFree() reads this back to free the block
+    return amem;
+#endif
+}
+
+
+void ISPCFree(void *ptr) {
+#if defined(ISPC_IS_WINDOWS)
+    _aligned_free(ptr);
+#elif defined(ISPC_IS_LINUX)
+    free(ptr);
+#elif defined(ISPC_IS_APPLE)
+    // ISPCMalloc() stored the pointer malloc() returned just before the
+    // aligned block; a NULL ptr has no such slot, so it is skipped.
+    if (ptr != NULL) free(((void**)ptr)[-1]);
+#endif
+}
+
 static void usage(int ret) {
    fprintf(stderr, "usage: ispc_test\n");
    fprintf(stderr, "\t[-h/--help]\tprint help\n");
+    fprintf(stderr, "\t[-f]\t\tindicates that test is expected to fail\n");
    fprintf(stderr, "\t<files>\n");
    exit(ret);
 }
@@ -101,20 +159,22 @@ static void svml_missing() {
    exit(1);
 }

+// On Windows, sin() is an overloaded function, so we need an unambiguous
+// function we can take the address of when wiring up the external references
+// below.  Each wrapper just forwards to the math routine of the same lowercase
+// name, and lRunTest() binds it to that name ("sin", "cos", ...) via DO_FUNC.
+double Sin(double x) { return sin(x); }
+double Cos(double x) { return cos(x); }
+double Tan(double x) { return tan(x); }
+double Atan(double x) { return atan(x); }
+double Atan2(double y, double x) { return atan2(y, x); }
+double Pow(double a, double b) { return pow(a, b); }
+double Exp(double x) { return exp(x); }
+double Log(double x) { return log(x); }
+
 static bool lRunTest(const char *fn) {
    llvm::LLVMContext *ctx = new llvm::LLVMContext;

-#ifdef LLVM_2_8
-    std::string err;
-    llvm::MemoryBuffer *buf = llvm::MemoryBuffer::getFileOrSTDIN(fn, &err);
-    if (!buf) {
-        fprintf(stderr, "Unable to open file \"%s\": %s\n", fn, err.c_str());
-        delete ctx;
-        return false;
-    }
-    std::string bcErr;
-    llvm::Module *module = llvm::ParseBitcodeFile(buf, *ctx, &bcErr);
-#else
    llvm::OwningPtr<llvm::MemoryBuffer> buf;
    llvm::error_code err = llvm::MemoryBuffer::getFileOrSTDIN(fn, buf);
    if (err) {
@@ -124,7 +184,6 @@ static bool lRunTest(const char *fn) {
    }
    std::string bcErr;
    llvm::Module *module = llvm::ParseBitcodeFile(buf.get(), *ctx, &bcErr);
-#endif

    if (!module) {
        fprintf(stderr, "Bitcode reader failed for \"%s\": %s\n", fn, bcErr.c_str());
@@ -133,39 +192,59 @@ static bool lRunTest(const char *fn) {
    }

    std::string eeError;
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+    llvm::EngineBuilder engineBuilder(module);
+    engineBuilder.setErrorStr(&eeError);
+    engineBuilder.setEngineKind(llvm::EngineKind::JIT);
+#if 0
+    std::vector<std::string> attributes;
+    if (target != NULL && !strcmp(target, "avx"))
+        attributes.push_back("+avx");
+    engineBuilder.setMAttrs(attributes);
+    engineBuilder.setUseMCJIT(true);
+#endif
+    llvm::ExecutionEngine *ee = engineBuilder.create();
+#else
    llvm::ExecutionEngine *ee = llvm::ExecutionEngine::createJIT(module, &eeError);
+#endif
    if (!ee) {
        fprintf(stderr, "Unable to create ExecutionEngine: %s\n", eeError.c_str());
        return false;
    }

    llvm::Function *func;
-    if ((func = module->getFunction("ISPCLaunch")) != NULL)
-        ee->addGlobalMapping(func, (void *)ISPCLaunch);
-    if ((func = module->getFunction("ISPCSync")) != NULL)
-        ee->addGlobalMapping(func, (void *)ISPCSync);
-    if ((func = module->getFunction("putchar")) != NULL)
-        ee->addGlobalMapping(func, (void *)putchar);
-    if ((func = module->getFunction("printf")) != NULL)
-        ee->addGlobalMapping(func, (void *)printf);
-    if ((func = module->getFunction("fflush")) != NULL)
-        ee->addGlobalMapping(func, (void *)fflush);
-    if ((func = module->getFunction("sinf")) != NULL)
-        ee->addGlobalMapping(func, (void *)sinf);
-    if ((func = module->getFunction("cosf")) != NULL)
-        ee->addGlobalMapping(func, (void *)cosf);
-    if ((func = module->getFunction("tanf")) != NULL)
-        ee->addGlobalMapping(func, (void *)tanf);
-    if ((func = module->getFunction("atanf")) != NULL)
-        ee->addGlobalMapping(func, (void *)atanf);
-    if ((func = module->getFunction("atan2f")) != NULL)
-        ee->addGlobalMapping(func, (void *)atan2f);
-    if ((func = module->getFunction("powf")) != NULL)
-        ee->addGlobalMapping(func, (void *)powf);
-    if ((func = module->getFunction("expf")) != NULL)
-        ee->addGlobalMapping(func, (void *)expf);
-    if ((func = module->getFunction("logf")) != NULL)
-        ee->addGlobalMapping(func, (void *)logf);
+#define DO_FUNC(FUNC ,FUNCNAME)                           \
+    if ((func = module->getFunction(FUNCNAME)) != NULL)   \
+        ee->addGlobalMapping(func, (void *)FUNC)
+    DO_FUNC(ISPCLaunch, "ISPCLaunch");
+    DO_FUNC(ISPCSync, "ISPCSync");
+    DO_FUNC(ISPCMalloc, "ISPCMalloc");
+    DO_FUNC(ISPCFree, "ISPCFree");
+    DO_FUNC(putchar, "putchar");
+    DO_FUNC(printf, "printf");
+    DO_FUNC(fflush, "fflush");
+    DO_FUNC(sinf, "sinf");
+    DO_FUNC(cosf, "cosf");
+    DO_FUNC(tanf, "tanf");
+    DO_FUNC(atanf, "atanf");
+    DO_FUNC(atan2f, "atan2f");
+    DO_FUNC(powf, "powf");
+    DO_FUNC(expf, "expf");
+    DO_FUNC(logf, "logf");
+    DO_FUNC(Sin, "sin");
+    DO_FUNC(Cos, "cos");
+    DO_FUNC(Tan, "tan");
+    DO_FUNC(Atan, "atan");
+    DO_FUNC(Atan2, "atan2");
+    DO_FUNC(Pow, "pow");
+    DO_FUNC(Exp, "exp");
+    DO_FUNC(Log, "log");
+    DO_FUNC(memset, "memset");
+#ifdef ISPC_IS_APPLE
+    DO_FUNC(memset_pattern4, "memset_pattern4");
+    DO_FUNC(memset_pattern8, "memset_pattern8");
+    DO_FUNC(memset_pattern16, "memset_pattern16");
+#endif

 #ifdef ISPC_HAVE_SVML
 #define DO_SVML(FUNC ,FUNCNAME)                           \
@@ -207,7 +286,6 @@ static bool lRunTest(const char *fn) {
    float result[16];
    for (int i = 0; i < 16; ++i)
        result[i] = 0;
-    bool ok = true;
    if (foundResult) {
        typedef void (*PFN)(float *);
        PFN pfn = reinterpret_cast<PFN>(ee->getPointerToFunction(func));
@@ -264,50 +342,49 @@ static bool lRunTest(const char *fn) {
    }
    else {
        fprintf(stderr, "Unable to find runnable function in file \"%s\"\n", fn);
-        ok = false;
+        return false;
    }

    // see if we got the right result
-    if (ok) {
-        if (foundResult) {
-            for (int i = 0; i < width; ++i)
-                if (returned[i] != result[i]) {
-                    ok = false;
-                    fprintf(stderr, "Test \"%s\" RETURNED %d: %g / %a EXPECTED %g / %a\n",
-                            fn, i, returned[i], returned[i], result[i], result[i]);
-                }
-        }
-        else {
-            for (int i = 0; i < width; ++i)
-                fprintf(stderr, "Test \"%s\" returned %d: %g / %a\n",
-                        fn, i, returned[i], returned[i]);
-        }
+    bool resultsMatch = true;
+    if (foundResult) {
+        for (int i = 0; i < width; ++i)
+            if (returned[i] != result[i]) {
+                resultsMatch = false;
+                fprintf(stderr, "Test \"%s\" RETURNED %d: %g / %a EXPECTED %g / %a\n",
+                        fn, i, returned[i], returned[i], result[i], result[i]);
+            }
    }
+    else {
+        for (int i = 0; i < width; ++i)
+            fprintf(stderr, "Test \"%s\" returned %d: %g / %a\n",
+                    fn, i, returned[i], returned[i]);
+    }
+    if (foundResult && shouldFail && resultsMatch)
+        fprintf(stderr, "Test %s unexpectedly passed\n", fn);

    delete ee;
    delete ctx;

-    return ok && foundResult;
+    return foundResult && resultsMatch;
 }

+
 int main(int argc, char *argv[]) {
    llvm::InitializeNativeTarget();
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+    LLVMLinkInJIT();
+#endif

-    std::vector<const char *> files;
+    const char *filename = NULL;
    for (int i = 1; i < argc; ++i) {
        if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
            usage(0);
+        if (!strcmp(argv[i], "-f"))
+            shouldFail = true;
        else
-            files.push_back(argv[i]);
+            filename = argv[i];
    }

-    int passes = 0, fails = 0;
-    for (unsigned int i = 0; i < files.size(); ++i) {
-        if (lRunTest(files[i])) ++passes;
-        else ++fails;
-    }
-
-    if (fails > 0)
-        fprintf(stderr, "%d/%d tests passed\n", passes, passes+fails);
-    return fails > 0;
+    return (lRunTest(filename) == true) ? 0 : 1;
 }
--- a/ispc_test.vcxproj
+++ b/ispc_test.vcxproj
@@ -52,7 +52,7 @@
      </PrecompiledHeader>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>ISPC_IS_WINDOWS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
@@ -70,7 +70,7 @@
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>ISPC_IS_WINDOWS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
--- a/lex.ll
+++ b/lex.ll
@@ -35,16 +35,19 @@

 #include "ispc.h"
 #include "decl.h"
-#include "parse.hh"
 #include "sym.h"
 #include "util.h"
 #include "module.h"
+#include "type.h"
+#include "parse.hh"
+#include <stdlib.h>

-static uint32_t lParseBinary(const char *ptr, SourcePos pos);
+static uint64_t lParseBinary(const char *ptr, SourcePos pos);
 static void lCComment(SourcePos *);
 static void lCppComment(SourcePos *);
 static void lHandleCppHash(SourcePos *);
 static void lStringConst(YYSTYPE *, SourcePos *);
+static double lParseHexFloat(const char *ptr);

 #define YY_USER_ACTION \
    yylloc->first_line = yylloc->last_line; \
@@ -65,9 +68,11 @@ inline int isatty(int) { return 0; }

 WHITESPACE [ \t\r]+
 INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))
-FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?)|([-]?0x[01]\.?[0-9a-fA-F]+p[-+]?[0-9]+[fF]?)
+FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?)
+HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?)

 IDENT [a-zA-Z_][a-zA-Z_0-9]*
+ZO_SWIZZLE ([01]+[w-z]+)+|([01]+[rgba]+)+|([01]+[uv]+)+

 %%
 "/*"            { lCComment(yylloc); }
@@ -100,6 +105,8 @@ goto { return TOKEN_GOTO; }
 if { return TOKEN_IF; }
 inline { return TOKEN_INLINE; }
 int { return TOKEN_INT; }
+int8 { return TOKEN_INT8; }
+int16 { return TOKEN_INT16; }
 int32 { return TOKEN_INT; }
 int64 { return TOKEN_INT64; }
 launch { return TOKEN_LAUNCH; }
@@ -134,61 +141,66 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;

 {INT_NUMBER} { 
    char *endPtr = NULL;
-#ifdef ISPC_IS_WINDOWS
-    unsigned long val;
-#else
-    unsigned long long val;
-#endif
+    int64_t val;

    if (yytext[0] == '0' && yytext[1] == 'b')
        val = lParseBinary(yytext+2, *yylloc);
    else {
 #ifdef ISPC_IS_WINDOWS
-        val = strtoul(yytext, &endPtr, 0);
+        val = _strtoi64(yytext, &endPtr, 0);
 #else
+        // FIXME: should use strtouq and then issue an error if we can't
+        // fit into 64 bits...
        val = strtoull(yytext, &endPtr, 0);
 #endif
    }
-    yylval->int32Val = (int32_t)val;
-    if (val != (unsigned int)yylval->int32Val)
-        Warning(*yylloc, "32-bit integer has insufficient bits to represent value %s (%x %llx)",
-                yytext, yylval->int32Val, (unsigned long long)val);
-    return TOKEN_INT_CONSTANT; 
+
+    // See if we can fit this into a 32-bit integer...
+    if ((val & 0xffffffff) == val) {
+        yylval->int32Val = (int32_t)val;
+        return TOKEN_INT32_CONSTANT; 
+    }
+    else {
+        yylval->int64Val = val;
+        return TOKEN_INT64_CONSTANT; 
+    }
 }

 {INT_NUMBER}[uU] {
    char *endPtr = NULL;
-#ifdef ISPC_IS_WINDOWS
-    unsigned long val;
-#else
-    unsigned long long val;
-#endif
+    uint64_t val;

    if (yytext[0] == '0' && yytext[1] == 'b')
        val = lParseBinary(yytext+2, *yylloc);
    else {
 #ifdef ISPC_IS_WINDOWS
-        val = strtoul(yytext, &endPtr, 0);
+        val = _strtoui64(yytext, &endPtr, 0);
 #else
        val = strtoull(yytext, &endPtr, 0);
 #endif
    }

-    yylval->int32Val = (int32_t)val;
-    if (val != (unsigned int)yylval->int32Val)
-        Warning(*yylloc, "32-bit integer has insufficient bits to represent value %s (%x %llx)",
-                yytext, yylval->int32Val, (unsigned long long)val);
-    return TOKEN_UINT_CONSTANT; 
+    if ((val & 0xffffffff) == val) {
+        // we can represent it in a 32-bit value
+        yylval->int32Val = (int32_t)val;
+        return TOKEN_UINT32_CONSTANT; 
+    }
+    else {
+        yylval->int64Val = val;
+        return TOKEN_UINT64_CONSTANT; 
+    }
 }

 {FLOAT_NUMBER} { 
-    /* FIXME: need to implement a hex float constant parser so that we can 
-       support them on Windows (which doesn't handle them in its atof()
-       implementation... */
    yylval->floatVal = atof(yytext); 
    return TOKEN_FLOAT_CONSTANT; 
 }

+{HEX_FLOAT_NUMBER} {
+    yylval->floatVal = lParseHexFloat(yytext); 
+    return TOKEN_FLOAT_CONSTANT; 
+}
+
 "++" { return TOKEN_INC_OP; }
 "--" { return TOKEN_DEC_OP; }
 "<<" { return TOKEN_LEFT_OP; }
@@ -264,19 +276,18 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;

 /** Return the integer version of a binary constant from a string.
 */
-static uint32_t
+static uint64_t
 lParseBinary(const char *ptr, SourcePos pos) {
-    uint32_t val = 0;
+    uint64_t val = 0;
    bool warned = false;

    while (*ptr != '\0') {
        /* if this hits, the regexp for 0b... constants is broken */
        assert(*ptr == '0' || *ptr == '1');

-        if ((val & (1<<31)) && warned == false) {
+        if ((val & (((int64_t)1)<<63)) && warned == false) {
            // We're about to shift out a set bit
-            // FIXME: 64-bit int constants...
-            Warning(pos, "Can't represent binary constant with 32-bit integer type");
+            Warning(pos, "Can't represent binary constant with a 64-bit integer type");
            warned = true;
        }

@@ -389,12 +400,12 @@ lEscapeChar(char *str, char *pChar, SourcePos *pos)
        // octal constants \012
        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7':
-            *pChar = strtol(str, &tail, 8);
+            *pChar = (char)strtol(str, &tail, 8);
            str = tail - 1;
            break;
        // hexidecimal constant \xff
        case 'x':
-            *pChar = strtol(str, &tail, 16);
+            *pChar = (char)strtol(str, &tail, 16);
            str = tail - 1;
            break;
        default:
@@ -424,3 +435,82 @@ lStringConst(YYSTYPE *yylval, SourcePos *pos)
    } 
    yylval->stringVal = new std::string(str);
 }
+
+
+/** Return 2^exponent as a double for an integer exponent.  The value is
+    built up by repeated multiplication with powers of two rather than by
+    writing the exponent bits of the double directly; a negative exponent
+    is computed as the reciprocal of the corresponding positive power.
+*/
+static double
+ipow2(int exponent) {
+    if (exponent < 0)
+        return 1. / ipow2(-exponent);
+
+    double result = 1.;
+    // Take steps of 2^16 while more than 16 doublings remain...
+    for (; exponent > 16; exponent -= 16)
+        result *= 65536.;
+    // ...and finish the remaining doublings one at a time.
+    for (; exponent > 0; --exponent)
+        result *= 2.;
+    return result;
+}
+
+
+/** Parse a C99-style hexadecimal floating-point constant (e.g. "0x1.8p3",
+    which is 1.5 * 2^3) from the string ptr and return its value as a double.
+*/
+static double
+lParseHexFloat(const char *ptr) {
+    assert(ptr != NULL);
+
+    assert(ptr[0] == '0' && ptr[1] == 'x');
+    ptr += 2;
+
+    // Start initializing the mantissa from the single leading digit (0 or 1)
+    assert(*ptr == '0' || *ptr == '1');
+    double mantissa = (*ptr == '1') ? 1. : 0.;
+    ++ptr;
+
+    if (*ptr == '.') {
+        // Is there a fraction part?  If so, the i'th digit we encounter
+        // gives the 1/(16^i) component of the mantissa.
+        ++ptr;
+
+        double scale = 1. / 16.;
+        // Keep going until we come to the 'p', which indicates that we've
+        // come to the exponent
+        while (*ptr != 'p') {
+            // Figure out the raw value from 0-15
+            int digit;
+            if (*ptr >= '0' && *ptr <= '9')
+                digit = *ptr - '0';
+            else if (*ptr >= 'a' && *ptr <= 'f')
+                digit = 10 + *ptr - 'a';
+            else {
+                assert(*ptr >= 'A' && *ptr <= 'F');
+                digit = 10 + *ptr - 'A';
+            }
+
+            // And add its contribution to the mantissa
+            mantissa += scale * digit;
+            scale /= 16.;
+            ++ptr;
+        }
+    }
+    else
+        // If there's not a '.', then we better be going straight to the
+        // exponent
+        assert(*ptr == 'p');
+
+    ++ptr; // skip the 'p'
+
+    // interestingly enough, the exponent is provided base 10..
+    int exponent = (int)strtol(ptr, (char **)NULL, 10);
+
+    // Does stdlib exp2() guarantee exact results for integer n where 2^n can
+    // be represented exactly as a double?  I would hope so but am not sure,
+    // so use our own ipow2() to be sure.
+    return mantissa * ipow2(exponent);
+}
--- a/llvmutil.cpp
+++ b/llvmutil.cpp
@@ -38,30 +38,43 @@
 #include "llvmutil.h"
 #include "type.h"

-const llvm::Type *LLVMTypes::VoidType = NULL;
-const llvm::PointerType *LLVMTypes::VoidPointerType = NULL;
-const llvm::Type *LLVMTypes::BoolType = NULL;
-const llvm::Type *LLVMTypes::Int8Type = NULL;
-const llvm::Type *LLVMTypes::Int16Type = NULL;
-const llvm::Type *LLVMTypes::Int32Type = NULL;
-const llvm::Type *LLVMTypes::Int32PointerType = NULL;
-const llvm::Type *LLVMTypes::Int64Type = NULL;
-const llvm::Type *LLVMTypes::Int64PointerType = NULL;
-const llvm::Type *LLVMTypes::FloatType = NULL;
-const llvm::Type *LLVMTypes::FloatPointerType = NULL;
-const llvm::Type *LLVMTypes::DoubleType = NULL;
+LLVM_TYPE_CONST llvm::Type *LLVMTypes::VoidType = NULL;
+LLVM_TYPE_CONST llvm::PointerType *LLVMTypes::VoidPointerType = NULL;
+LLVM_TYPE_CONST llvm::Type *LLVMTypes::BoolType = NULL;

-const llvm::VectorType *LLVMTypes::MaskType = NULL;
-const llvm::VectorType *LLVMTypes::BoolVectorType = NULL;
-const llvm::VectorType *LLVMTypes::Int1VectorType = NULL;
-const llvm::VectorType *LLVMTypes::Int32VectorType = NULL;
-const llvm::Type *LLVMTypes::Int32VectorPointerType = NULL;
-const llvm::VectorType *LLVMTypes::Int64VectorType = NULL;
-const llvm::Type *LLVMTypes::Int64VectorPointerType = NULL;
-const llvm::VectorType *LLVMTypes::FloatVectorType = NULL;
-const llvm::Type *LLVMTypes::FloatVectorPointerType = NULL;
-const llvm::VectorType *LLVMTypes::DoubleVectorType = NULL;
-const llvm::ArrayType *LLVMTypes::VoidPointerVectorType = NULL;
+LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8Type = NULL;
+LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int16Type = NULL;
+LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32Type = NULL;
+LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64Type = NULL;
+LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatType = NULL;
+LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoubleType = NULL;
+
+LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8PointerType = NULL;
+LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int16PointerType = NULL;
+LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32PointerType = NULL;
+LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64PointerType = NULL;
+LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatPointerType = NULL;
+LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoublePointerType = NULL;
+
+LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::MaskType = NULL;
+LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::BoolVectorType = NULL;
+
+LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int1VectorType = NULL;
+LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int8VectorType = NULL;
+LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int16VectorType = NULL;
+LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int32VectorType = NULL;
+LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int64VectorType = NULL;
+LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::FloatVectorType = NULL;
+LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::DoubleVectorType = NULL;
+
+LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8VectorPointerType = NULL;
+LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int16VectorPointerType = NULL;
+LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32VectorPointerType = NULL;
+LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64VectorPointerType = NULL;
+LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatVectorPointerType = NULL;
+LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoubleVectorPointerType = NULL;
+
+LLVM_TYPE_CONST llvm::ArrayType *LLVMTypes::VoidPointerVectorType = NULL;

 llvm::Constant *LLVMTrue = NULL;
 llvm::Constant *LLVMFalse = NULL;
@@ -73,17 +86,22 @@ void
 InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
    LLVMTypes::VoidType = llvm::Type::getVoidTy(*ctx);
    LLVMTypes::VoidPointerType = llvm::PointerType::get(llvm::Type::getInt8Ty(*ctx), 0);
+
    LLVMTypes::BoolType = llvm::Type::getInt1Ty(*ctx);
    LLVMTypes::Int8Type = llvm::Type::getInt8Ty(*ctx);
    LLVMTypes::Int16Type = llvm::Type::getInt16Ty(*ctx);
    LLVMTypes::Int32Type = llvm::Type::getInt32Ty(*ctx);
-    LLVMTypes::Int32PointerType = llvm::PointerType::get(LLVMTypes::Int32Type, 0);
    LLVMTypes::Int64Type = llvm::Type::getInt64Ty(*ctx);
-    LLVMTypes::Int64PointerType = llvm::PointerType::get(LLVMTypes::Int64Type, 0);
    LLVMTypes::FloatType = llvm::Type::getFloatTy(*ctx);
-    LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0);
    LLVMTypes::DoubleType = llvm::Type::getDoubleTy(*ctx);

+    LLVMTypes::Int8PointerType = llvm::PointerType::get(LLVMTypes::Int8Type, 0);
+    LLVMTypes::Int16PointerType = llvm::PointerType::get(LLVMTypes::Int16Type, 0);
+    LLVMTypes::Int32PointerType = llvm::PointerType::get(LLVMTypes::Int32Type, 0);
+    LLVMTypes::Int64PointerType = llvm::PointerType::get(LLVMTypes::Int64Type, 0);
+    LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0);
+    LLVMTypes::DoublePointerType = llvm::PointerType::get(LLVMTypes::DoubleType, 0);
+
    // Note that both the mask and bool vectors are vector of int32s
    // (not i1s).  LLVM ends up generating much better SSE code with
    // this representation.
@@ -92,17 +110,26 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {

    LLVMTypes::Int1VectorType = 
        llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.vectorWidth);
+    LLVMTypes::Int8VectorType = 
+        llvm::VectorType::get(LLVMTypes::Int8Type, target.vectorWidth);
+    LLVMTypes::Int16VectorType = 
+        llvm::VectorType::get(LLVMTypes::Int16Type, target.vectorWidth);
    LLVMTypes::Int32VectorType = 
        llvm::VectorType::get(LLVMTypes::Int32Type, target.vectorWidth);
-    LLVMTypes::Int32VectorPointerType = llvm::PointerType::get(LLVMTypes::Int32VectorType, 0);
    LLVMTypes::Int64VectorType = 
        llvm::VectorType::get(LLVMTypes::Int64Type, target.vectorWidth);
-    LLVMTypes::Int64VectorPointerType = llvm::PointerType::get(LLVMTypes::Int64VectorType, 0);
    LLVMTypes::FloatVectorType = 
        llvm::VectorType::get(LLVMTypes::FloatType, target.vectorWidth);
-    LLVMTypes::FloatVectorPointerType = llvm::PointerType::get(LLVMTypes::FloatVectorType, 0);
    LLVMTypes::DoubleVectorType = 
        llvm::VectorType::get(LLVMTypes::DoubleType, target.vectorWidth);
+
+    LLVMTypes::Int8VectorPointerType = llvm::PointerType::get(LLVMTypes::Int8VectorType, 0);
+    LLVMTypes::Int16VectorPointerType = llvm::PointerType::get(LLVMTypes::Int16VectorType, 0);
+    LLVMTypes::Int32VectorPointerType = llvm::PointerType::get(LLVMTypes::Int32VectorType, 0);
+    LLVMTypes::Int64VectorPointerType = llvm::PointerType::get(LLVMTypes::Int64VectorType, 0);
+    LLVMTypes::FloatVectorPointerType = llvm::PointerType::get(LLVMTypes::FloatVectorType, 0);
+    LLVMTypes::DoubleVectorPointerType = llvm::PointerType::get(LLVMTypes::DoubleVectorType, 0);
+
    LLVMTypes::VoidPointerVectorType = 
        llvm::ArrayType::get(LLVMTypes::VoidPointerType, target.vectorWidth);

@@ -129,7 +156,36 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
 }


-llvm::ConstantInt *LLVMInt32(int32_t ival) {
+llvm::ConstantInt *
+LLVMInt8(int8_t ival) {
+    const bool isSigned = true;  // the i8 constant is built from a signed value
+    return llvm::ConstantInt::get(llvm::Type::getInt8Ty(*g->ctx), ival, isSigned);
+}
+
+
+llvm::ConstantInt *
+LLVMUInt8(uint8_t ival) {
+    const bool isSigned = false;  // the i8 constant is built from an unsigned value
+    return llvm::ConstantInt::get(llvm::Type::getInt8Ty(*g->ctx), ival, isSigned);
+}
+
+
+llvm::ConstantInt *
+LLVMInt16(int16_t ival) {
+    const bool isSigned = true;  // the i16 constant is built from a signed value
+    return llvm::ConstantInt::get(llvm::Type::getInt16Ty(*g->ctx), ival, isSigned);
+}
+
+
+llvm::ConstantInt *
+LLVMUInt16(uint16_t ival) {
+    const bool isSigned = false;  // the i16 constant is built from an unsigned value
+    return llvm::ConstantInt::get(llvm::Type::getInt16Ty(*g->ctx), ival, isSigned);
+}
+
+
+llvm::ConstantInt *
+LLVMInt32(int32_t ival) {
    return llvm::ConstantInt::get(llvm::Type::getInt32Ty(*g->ctx), ival,
                                  true /*signed*/);
 }
@@ -168,6 +224,82 @@ LLVMDouble(double dval) {
 }


+llvm::Constant *
+LLVMInt8Vector(int8_t ival) {
+    // Create the scalar i8 constant once and replicate it into every one of
+    // the g->target.vectorWidth lanes.
+    llvm::Constant *lane = LLVMInt8(ival);
+    std::vector<llvm::Constant *> lanes(g->target.vectorWidth, lane);
+    return llvm::ConstantVector::get(lanes);
+}
+
+
+llvm::Constant *
+LLVMInt8Vector(const int8_t *ivec) {
+    std::vector<llvm::Constant *> lanes;  // one i8 constant per vector lane
+    for (int lane = 0; lane < g->target.vectorWidth; ++lane)
+        lanes.push_back(LLVMInt8(ivec[lane]));
+    return llvm::ConstantVector::get(lanes);
+}
+
+
+llvm::Constant *
+LLVMUInt8Vector(uint8_t ival) {
+    // Create the scalar unsigned i8 constant once and replicate it into every
+    // one of the g->target.vectorWidth lanes.
+    llvm::Constant *lane = LLVMUInt8(ival);
+    std::vector<llvm::Constant *> lanes(g->target.vectorWidth, lane);
+    return llvm::ConstantVector::get(lanes);
+}
+
+
+llvm::Constant *
+LLVMUInt8Vector(const uint8_t *ivec) {
+    std::vector<llvm::Constant *> lanes;  // one unsigned i8 constant per vector lane
+    for (int lane = 0; lane < g->target.vectorWidth; ++lane)
+        lanes.push_back(LLVMUInt8(ivec[lane]));
+    return llvm::ConstantVector::get(lanes);
+}
+
+
+llvm::Constant *
+LLVMInt16Vector(int16_t ival) {
+    // Create the scalar i16 constant once and replicate it into every one of
+    // the g->target.vectorWidth lanes.
+    llvm::Constant *lane = LLVMInt16(ival);
+    std::vector<llvm::Constant *> lanes(g->target.vectorWidth, lane);
+    return llvm::ConstantVector::get(lanes);
+}
+
+
+llvm::Constant *
+LLVMInt16Vector(const int16_t *ivec) {
+    std::vector<llvm::Constant *> lanes;  // one i16 constant per vector lane
+    for (int lane = 0; lane < g->target.vectorWidth; ++lane)
+        lanes.push_back(LLVMInt16(ivec[lane]));
+    return llvm::ConstantVector::get(lanes);
+}
+
+
+llvm::Constant *
+LLVMUInt16Vector(uint16_t ival) {
+    // Create the scalar unsigned i16 constant once and replicate it into every
+    // one of the g->target.vectorWidth lanes.
+    llvm::Constant *lane = LLVMUInt16(ival);
+    std::vector<llvm::Constant *> lanes(g->target.vectorWidth, lane);
+    return llvm::ConstantVector::get(lanes);
+}
+
+
+llvm::Constant *
+LLVMUInt16Vector(const uint16_t *ivec) {
+    std::vector<llvm::Constant *> lanes;  // one unsigned i16 constant per vector lane
+    for (int lane = 0; lane < g->target.vectorWidth; ++lane)
+        lanes.push_back(LLVMUInt16(ivec[lane]));
+    return llvm::ConstantVector::get(lanes);
+}
+
+
 llvm::Constant *
 LLVMInt32Vector(int32_t ival) {
    llvm::Constant *v = LLVMInt32(ival);
@@ -321,8 +453,8 @@ LLVMBoolVector(const bool *bvec) {
 }


-const llvm::ArrayType *
-LLVMPointerVectorType(const llvm::Type *t) {
+LLVM_TYPE_CONST llvm::ArrayType *
+LLVMPointerVectorType(LLVM_TYPE_CONST llvm::Type *t) {
    // NOTE: ArrayType, not VectorType
    return llvm::ArrayType::get(llvm::PointerType::get(t, 0), 
                                g->target.vectorWidth);
--- a/llvmutil.h
+++ b/llvmutil.h
@@ -44,35 +44,49 @@
 #include <llvm/DerivedTypes.h>
 #include <llvm/Constants.h>

+
 /** This structure holds pointers to a variety of LLVM types; code
    elsewhere can use them from here, ratherthan needing to make more
    verbose LLVM API calls.
 */ 
 struct LLVMTypes {
-    static const llvm::Type *VoidType;
-    static const llvm::PointerType *VoidPointerType;
-    static const llvm::Type *BoolType;
-    static const llvm::Type *Int8Type;
-    static const llvm::Type *Int16Type;
-    static const llvm::Type *Int32Type;
-    static const llvm::Type *Int32PointerType;
-    static const llvm::Type *Int64Type;
-    static const llvm::Type *Int64PointerType;
-    static const llvm::Type *FloatType;
-    static const llvm::Type *FloatPointerType;
-    static const llvm::Type *DoubleType;
+    static LLVM_TYPE_CONST llvm::Type *VoidType;
+    static LLVM_TYPE_CONST llvm::PointerType *VoidPointerType;
+    static LLVM_TYPE_CONST llvm::Type *BoolType;

-    static const llvm::VectorType *MaskType;
-    static const llvm::VectorType *BoolVectorType;
-    static const llvm::VectorType *Int1VectorType;
-    static const llvm::VectorType *Int32VectorType;
-    static const llvm::Type *Int32VectorPointerType;
-    static const llvm::VectorType *Int64VectorType;
-    static const llvm::Type *Int64VectorPointerType;
-    static const llvm::VectorType *FloatVectorType;
-    static const llvm::Type *FloatVectorPointerType;
-    static const llvm::VectorType *DoubleVectorType;
-    static const llvm::ArrayType *VoidPointerVectorType;
+    static LLVM_TYPE_CONST llvm::Type *Int8Type;
+    static LLVM_TYPE_CONST llvm::Type *Int16Type;
+    static LLVM_TYPE_CONST llvm::Type *Int32Type;
+    static LLVM_TYPE_CONST llvm::Type *Int64Type;
+    static LLVM_TYPE_CONST llvm::Type *FloatType;
+    static LLVM_TYPE_CONST llvm::Type *DoubleType;
+
+    static LLVM_TYPE_CONST llvm::Type *Int8PointerType;
+    static LLVM_TYPE_CONST llvm::Type *Int16PointerType;
+    static LLVM_TYPE_CONST llvm::Type *Int32PointerType;
+    static LLVM_TYPE_CONST llvm::Type *Int64PointerType;
+    static LLVM_TYPE_CONST llvm::Type *FloatPointerType;
+    static LLVM_TYPE_CONST llvm::Type *DoublePointerType;
+
+    static LLVM_TYPE_CONST llvm::VectorType *MaskType;
+
+    static LLVM_TYPE_CONST llvm::VectorType *BoolVectorType;
+    static LLVM_TYPE_CONST llvm::VectorType *Int1VectorType;
+    static LLVM_TYPE_CONST llvm::VectorType *Int8VectorType;
+    static LLVM_TYPE_CONST llvm::VectorType *Int16VectorType;
+    static LLVM_TYPE_CONST llvm::VectorType *Int32VectorType;
+    static LLVM_TYPE_CONST llvm::VectorType *Int64VectorType;
+    static LLVM_TYPE_CONST llvm::VectorType *FloatVectorType;
+    static LLVM_TYPE_CONST llvm::VectorType *DoubleVectorType;
+
+    static LLVM_TYPE_CONST llvm::Type *Int8VectorPointerType;
+    static LLVM_TYPE_CONST llvm::Type *Int16VectorPointerType;
+    static LLVM_TYPE_CONST llvm::Type *Int32VectorPointerType;
+    static LLVM_TYPE_CONST llvm::Type *Int64VectorPointerType;
+    static LLVM_TYPE_CONST llvm::Type *FloatVectorPointerType;
+    static LLVM_TYPE_CONST llvm::Type *DoubleVectorPointerType;
+
+    static LLVM_TYPE_CONST llvm::ArrayType *VoidPointerVectorType;
 };

 /** These variables hold the corresponding LLVM constant values as a
@@ -86,6 +100,14 @@ extern llvm::Constant *LLVMTrue, *LLVMFalse;
 */
 extern void InitLLVMUtil(llvm::LLVMContext *ctx, Target target);

+/** Returns an LLVM i8 constant of the given value */
+extern llvm::ConstantInt *LLVMInt8(int8_t i);
+/** Returns an LLVM i8 constant of the given value */
+extern llvm::ConstantInt *LLVMUInt8(uint8_t i);
+/** Returns an LLVM i16 constant of the given value */
+extern llvm::ConstantInt *LLVMInt16(int16_t i);
+/** Returns an LLVM i16 constant of the given value */
+extern llvm::ConstantInt *LLVMUInt16(uint16_t i);
 /** Returns an LLVM i32 constant of the given value */
 extern llvm::ConstantInt *LLVMInt32(int32_t i);
 /** Returns an LLVM i32 constant of the given value */
@@ -102,18 +124,35 @@ extern llvm::Constant *LLVMDouble(double f);
 /** Returns an LLVM boolean vector constant of the given value smeared
    across all elements */
 extern llvm::Constant *LLVMBoolVector(bool v);
+
+/** Returns an LLVM i8 vector constant of the given value smeared
+    across all elements */
+extern llvm::Constant *LLVMInt8Vector(int8_t i);
+/** Returns an LLVM i8 vector constant of the given value smeared
+    across all elements */
+extern llvm::Constant *LLVMUInt8Vector(uint8_t i);
+
+/** Returns an LLVM i16 vector constant of the given value smeared
+    across all elements */
+extern llvm::Constant *LLVMInt16Vector(int16_t i);
+/** Returns an LLVM i16 vector constant of the given value smeared
+    across all elements */
+extern llvm::Constant *LLVMUInt16Vector(uint16_t i);
+
 /** Returns an LLVM i32 vector constant of the given value smeared
    across all elements */
 extern llvm::Constant *LLVMInt32Vector(int32_t i);
 /** Returns an LLVM i32 vector constant of the given value smeared
    across all elements */
 extern llvm::Constant *LLVMUInt32Vector(uint32_t i);
+
 /** Returns an LLVM i64 vector constant of the given value smeared
    across all elements */
 extern llvm::Constant *LLVMInt64Vector(int64_t i);
 /** Returns an LLVM i64 vector constant of the given value smeared
    across all elements */
 extern llvm::Constant *LLVMUInt64Vector(uint64_t i);
+
 /** Returns an LLVM float vector constant of the given value smeared
    across all elements */
 extern llvm::Constant *LLVMFloatVector(float f);
@@ -124,18 +163,35 @@ extern llvm::Constant *LLVMDoubleVector(double f);
 /** Returns an LLVM boolean vector based on the given array of values.
    The array should have g->target.vectorWidth elements. */
 extern llvm::Constant *LLVMBoolVector(const bool *v);
+
+/** Returns an LLVM i8 vector based on the given array of values.
+    The array should have g->target.vectorWidth elements. */
+extern llvm::Constant *LLVMInt8Vector(const int8_t *i);
+/** Returns an LLVM i8 vector based on the given array of values.
+    The array should have g->target.vectorWidth elements. */
+extern llvm::Constant *LLVMUInt8Vector(const uint8_t *i);
+
+/** Returns an LLVM i16 vector based on the given array of values.
+    The array should have g->target.vectorWidth elements. */
+extern llvm::Constant *LLVMInt16Vector(const int16_t *i);
+/** Returns an LLVM i16 vector based on the given array of values.
+    The array should have g->target.vectorWidth elements. */
+extern llvm::Constant *LLVMUInt16Vector(const uint16_t *i);
+
 /** Returns an LLVM i32 vector based on the given array of values.
    The array should have g->target.vectorWidth elements. */
 extern llvm::Constant *LLVMInt32Vector(const int32_t *i);
 /** Returns an LLVM i32 vector based on the given array of values.
    The array should have g->target.vectorWidth elements. */
 extern llvm::Constant *LLVMUInt32Vector(const uint32_t *i);
+
 /** Returns an LLVM i64 vector based on the given array of values.
    The array should have g->target.vectorWidth elements. */
 extern llvm::Constant *LLVMInt64Vector(const int64_t *i);
 /** Returns an LLVM i64 vector based on the given array of values.
    The array should have g->target.vectorWidth elements. */
 extern llvm::Constant *LLVMUInt64Vector(const uint64_t *i);
+
 /** Returns an LLVM float vector based on the given array of values.
    The array should have g->target.vectorWidth elements. */
 extern llvm::Constant *LLVMFloatVector(const float *f);
@@ -152,6 +208,6 @@ extern llvm::Constant *LLVMMaskAllOff;
    pointers to that type.  (In practice, an array of pointers, since LLVM
    prohibits vectors of pointers.
 */
-extern const llvm::ArrayType *LLVMPointerVectorType(const llvm::Type *t);
+extern LLVM_TYPE_CONST llvm::ArrayType *LLVMPointerVectorType(LLVM_TYPE_CONST llvm::Type *t);

 #endif // ISPC_LLVMUTIL_H
--- a/main.cpp
+++ b/main.cpp
@@ -40,10 +40,14 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <llvm/Support/PrettyStackTrace.h>
-#ifdef LLVM_2_8
-#include <llvm/System/Signals.h>
-#else
 #include <llvm/Support/Signals.h>
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+  #include <llvm/Support/TargetRegistry.h>
+  #include <llvm/Support/TargetSelect.h>
+#else
+  #include <llvm/Target/TargetRegistry.h>
+  #include <llvm/Target/TargetSelect.h>
+  #include <llvm/Target/SubtargetFeature.h>
 #endif

 #ifdef ISPC_IS_WINDOWS
@@ -53,36 +57,36 @@
 #endif // ISPC_IS_WINDOWS

 static void usage(int ret) {
-    printf("This is the Intel(r) SPMD Program Compiler (ispc), build %s (%s)\n\n", BUILD_DATE, BUILD_VERSION);
+    printf("This is the Intel(r) SPMD Program Compiler (ispc), build %s (%s)\n\n", 
+           BUILD_DATE, BUILD_VERSION);
    printf("usage: ispc\n");
-    printf("    [--arch={x86,x86-64}]\t\tSelect target architecture\n");
+    printf("    [--arch={%s}]\t\tSelect target architecture\n", 
+           Target::SupportedTargetArchs());
    printf("    [--cpu=<cpu>]\t\t\tSelect target CPU type\n");
-    printf("         (atom, barcelona, core2, corei7, corei7-avx, istanbul, nocona,\n");
-    printf("          penryn, westmere)\n");
-#ifndef ISPC_IS_WINDOWS
-    printf("    [-D<foo>]\t\t\t\t#define value when running preprocessor\n");
-#endif
+    printf("         <cpu>={%s}\n", Target::SupportedTargetCPUs());
+    printf("    [-D<foo>]\t\t\t\t#define given value when running preprocessor\n");
    printf("    [--debug]\t\t\t\tPrint information useful for debugging ispc\n");
    printf("    [--emit-asm]\t\t\tGenerate assembly language file as output\n");
    printf("    [--emit-llvm]\t\t\tEmit LLVM bitode file as output\n");
-    printf("    [--emit-obj]\t\t\tGenerate object file file as output\n");
-    printf("    [--fast-math]\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n");
+    printf("    [--emit-obj]\t\t\tGenerate object file as output (default)\n");
    printf("    [-g]\t\t\t\tGenerate debugging information\n");
    printf("    [--help]\t\t\t\tPrint help\n");
-    printf("    [-h] <name>\t\t\t\tOutput filename for header\n");
+    printf("    [-h <name>/--header-outfile=<name>]\tOutput filename for header\n");
    printf("    [--instrument]\t\t\tEmit instrumentation to gather performance data\n");
    printf("    [--math-lib=<option>]\t\tSelect math library\n");
    printf("        default\t\t\t\tUse ispc's built-in math functions\n");
    printf("        fast\t\t\t\tUse high-performance but lower-accuracy math functions\n");
-    printf("        svml\t\t\t\tUse the Intel SVML math libraries\n");
+    printf("        svml\t\t\t\tUse the Intel(r) SVML math libraries\n");
    printf("        system\t\t\t\tUse the system's math library (*may be quite slow*)\n");
    printf("    [--nostdlib]\t\t\tDon't make the ispc standard library available\n");
-#ifndef ISPC_IS_WINDOWS
    printf("    [--nocpp]\t\t\t\tDon't run the C preprocessor\n");
-#endif
-    printf("    [-o/--outfile] <name>\t\tOutput filename for bitcode (may be \"-\" for standard output)\n");
-    printf("    [-O0/-O1]\t\t\t\tSet optimization level\n");
+    printf("    [-o <name>/--outfile=<name>]\tOutput filename (may be \"-\" for standard output)\n");
+    printf("    [-O0/-O1]\t\t\t\tSet optimization level (-O1 is default)\n");
    printf("    [--opt=<option>]\t\t\tSet optimization option\n");
+    printf("        disable-loop-unroll\t\tDisable loop unrolling.\n");
+    printf("        fast-masked-vload\t\tFaster masked vector loads on SSE (may go past end of array)\n");
+    printf("        fast-math\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n");
+#if 0
    printf("        disable-blended-masked-stores\t\tScalarize masked stores on SSE (vs. using vblendps)\n");
    printf("        disable-coherent-control-flow\t\tDisable coherent control flow optimizations\n");
    printf("        disable-uniform-control-flow\t\tDisable uniform control flow optimizations\n");
@@ -91,7 +95,9 @@ static void usage(int ret) {
    printf("        disable-gather-scatter-flattening\tDisable flattening when all lanes are on\n");
    printf("        disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n");
    printf("        disable-masked-store-optimizations\tDisable lowering to regular stores when possible\n");
-    printf("    [--target={sse2,sse4,sse4x2,avx}] Select target ISA (SSE4 is default)\n");
+#endif
+    printf("    [--pic]\t\t\t\tGenerate position-independent code\n");
+    printf("    [--target=<isa>]\t\t\tSelect target ISA. <isa>={%s}\n", Target::SupportedTargetISAs());
    printf("    [--version]\t\t\t\tPrint ispc version\n");
    printf("    [--woff]\t\t\t\tDisable warnings\n");
    printf("    [--wno-perf]\t\t\tDon't issue warnings related to performance-related issues\n");
@@ -99,33 +105,6 @@ static void usage(int ret) {
    exit(ret);
 }

-/** Given a target name string, set initialize the global g->target
-    structure appropriately. 
-*/
-static void lDoTarget(const char *target) {
-    if (!strcasecmp(target, "sse2")) {
-        g->target.isa = Target::SSE2;
-        g->target.nativeVectorWidth = 4;
-        g->target.vectorWidth = 4;
-    }
-    else if (!strcasecmp(target, "sse4")) {
-        g->target.isa = Target::SSE4;
-        g->target.nativeVectorWidth = 4;
-        g->target.vectorWidth = 4;
-    }
-    else if (!strcasecmp(target, "sse4x2")) {
-        g->target.isa = Target::SSE4;
-        g->target.nativeVectorWidth = 4;
-        g->target.vectorWidth = 8;
-    }
-    else if (!strcasecmp(target, "avx")) {
-        g->target.isa = Target::AVX;
-        g->target.nativeVectorWidth = 8;
-        g->target.vectorWidth = 8;
-    }
-    else
-        usage(1);
-}


 /** We take arguments from both the command line as well as from the
@@ -184,6 +163,16 @@ int main(int Argc, char *Argv[]) {
    llvm::sys::PrintStackTraceOnErrorSignal();
    llvm::PrettyStackTraceProgram X(argc, argv);

+    // Initialize LLVM's X86 target components (only X86 is initialized here)
+    LLVMInitializeX86TargetInfo();
+    LLVMInitializeX86Target();
+    LLVMInitializeX86AsmPrinter();
+    LLVMInitializeX86AsmParser();
+    LLVMInitializeX86Disassembler();
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+    LLVMInitializeX86TargetMC();
+#endif
+
    char *file = NULL;
    const char *headerFileName = NULL;
    const char *outFileName = NULL;
@@ -194,21 +183,27 @@ int main(int Argc, char *Argv[]) {

    bool debugSet = false, optSet = false;
    Module::OutputType ot = Module::Object;
+    bool generatePIC = false;
+    const char *arch = NULL, *cpu = NULL, *target = NULL;

    for (int i = 1; i < argc; ++i) {
        if (!strcmp(argv[i], "--help"))
            usage(0);
-#ifndef ISPC_IS_WINDOWS
-        else if (!strncmp(argv[i], "-D", 2)) {
+        else if (!strncmp(argv[i], "-D", 2))
            g->cppArgs.push_back(argv[i]);
-        }
-#endif // !ISPC_IS_WINDOWS
        else if (!strncmp(argv[i], "--arch=", 7))
-            g->target.arch = argv[i] + 7;
+            arch = argv[i] + 7;
        else if (!strncmp(argv[i], "--cpu=", 6))
-            g->target.cpu = argv[i] + 6;
-        else if (!strcmp(argv[i], "--fast-math"))
-            g->opt.fastMath = true;
+            cpu = argv[i] + 6;
+        else if (!strcmp(argv[i], "--fast-math")) {
+            fprintf(stderr, "--fast-math option has been renamed to --opt=fast-math!\n");
+            usage(1);
+        }
+        else if (!strcmp(argv[i], "--fast-masked-vload")) {
+            fprintf(stderr, "--fast-masked-vload option has been renamed to "
+                    "--opt=fast-masked-vload!\n");
+            usage(1);
+        }
        else if (!strcmp(argv[i], "--debug"))
            g->debugPrint = true;
        else if (!strcmp(argv[i], "--instrument"))
@@ -224,13 +219,12 @@ int main(int Argc, char *Argv[]) {
        else if (!strcmp(argv[i], "--emit-obj"))
            ot = Module::Object;
        else if (!strcmp(argv[i], "--target")) {
+            // FIXME: should remove this way of specifying the target...
            if (++i == argc) usage(1);
-            lDoTarget(argv[i]);
-        }
-        else if (!strncmp(argv[i], "--target=", 9)) {
-            const char *target = argv[i] + 9;
-            lDoTarget(target);
+            target = argv[i];
        }
+        else if (!strncmp(argv[i], "--target=", 9))
+            target = argv[i] + 9;
        else if (!strncmp(argv[i], "--math-lib=", 11)) {
            const char *lib = argv[i] + 11;
            if (!strcmp(lib, "default"))
@@ -246,7 +240,16 @@ int main(int Argc, char *Argv[]) {
        }
        else if (!strncmp(argv[i], "--opt=", 6)) {
            const char *opt = argv[i] + 6;
-            if (!strcmp(opt, "disable-blended-masked-stores"))
+            if (!strcmp(opt, "fast-math"))
+                g->opt.fastMath = true;
+            else if (!strcmp(opt, "fast-masked-vload"))
+                g->opt.fastMaskedVload = true;
+            else if (!strcmp(opt, "disable-loop-unroll"))
+                g->opt.unrollLoops = false;
+
+            // These are only used for performance tests of specific
+            // optimizations
+            else if (!strcmp(opt, "disable-blended-masked-stores"))
                g->opt.disableBlendedMaskedStores = true;
            else if (!strcmp(opt, "disable-coherent-control-flow"))
                g->opt.disableCoherentControlFlow = true;
@@ -271,14 +274,19 @@ int main(int Argc, char *Argv[]) {
        }
        else if (!strcmp(argv[i], "--wno-perf") || !strcmp(argv[i], "-wno-perf"))
            g->emitPerfWarnings = false;
-        else if (!strcmp(argv[i], "-o") || !strcmp(argv[i], "--outfile")) {
+        else if (!strcmp(argv[i], "-o")) {
            if (++i == argc) usage(1);
            outFileName = argv[i];
        }
-        else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--header-outfile")) {
+        else if (!strncmp(argv[i], "--outfile=", strlen("--outfile=")))
+            outFileName = argv[i] + strlen("--outfile="); // name follows the '='
+        else if (!strcmp(argv[i], "-h")) {
            if (++i == argc) usage(1);
            headerFileName = argv[i];
        }
+        else if (!strncmp(argv[i], "--header-outfile=", strlen("--header-outfile="))) {
+            headerFileName = argv[i] + strlen("--header-outfile="); // name follows the '='
+        }
        else if (!strcmp(argv[i], "-O0")) {
            g->opt.level = 0;
            optSet = true;
@@ -294,6 +302,8 @@ int main(int Argc, char *Argv[]) {
            g->includeStdlib = false;
        else if (!strcmp(argv[i], "--nocpp"))
            g->runCPP = false;
+        else if (!strcmp(argv[i], "--pic"))
+            generatePIC = true;
        else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version")) {
            printf("Intel(r) SPMD Program Compiler (ispc) build %s (%s)\n", 
                   BUILD_DATE, BUILD_VERSION);
@@ -315,6 +325,9 @@ int main(int Argc, char *Argv[]) {
    if (debugSet && !optSet)
        g->opt.level = 0;

+    if (!Target::GetTarget(arch, cpu, target, generatePIC, &g->target))
+        usage(1);
+
    m = new Module(file);
    if (m->CompileFile() == 0) {
        if (outFileName != NULL)
--- a/Show More
+++ b/Show More