Release notes and doxygen bump for v1.0.10

Added deferred shading workload
2011-09-30 15:09:19 -07:00 · 2011-09-30 15:09:04 -07:00
407 changed files with 7912 additions and 20430 deletions
--- a/20
+++ b/20
@@ -25,8 +25,7 @@ BUILD_VERSION=$(shell git log --abbrev-commit --abbrev=16 | head -1)

 CXX=g++
 CPP=cpp
-OPT=-g3
-CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -Wall $(LLVM_VERSION_DEF) \
+CXXFLAGS=-g3 $(LLVM_CXXFLAGS) -I. -Iobjs/ -Wall $(LLVM_VERSION_DEF) \
 	-DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\""

 LDFLAGS=
@@ -45,13 +44,13 @@ YACC=bison -d -v -t

 ###########################################################################

-CXX_SRC=ast.cpp builtins.cpp ctx.cpp decl.cpp expr.cpp func.cpp ispc.cpp \
+CXX_SRC=builtins.cpp ctx.cpp decl.cpp expr.cpp ispc.cpp \
 	llvmutil.cpp main.cpp module.cpp opt.cpp stmt.cpp sym.cpp type.cpp \
 	util.cpp
-HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
+HEADERS=builtins.h ctx.h decl.h expr.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
-BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll builtins-sse2-x2.ll \
-	builtins-sse4.ll builtins-sse4-x2.ll builtins-dispatch.ll
+BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll \
+	builtins-sse4.ll builtins-sse4x2.ll
 BISON_SRC=parse.yy
 FLEX_SRC=lex.ll

@@ -112,7 +111,7 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
 	@echo Compiling $<
 	@$(CXX) $(CXXFLAGS) -o $@ -c $<

-objs/builtins-%.cpp: builtins-%.ll
+objs/builtins-%.cpp: builtins-%.ll builtins.m4 builtins-sse.ll builtins-avx-common.ll
 	@echo Creating C++ source from builtin definitions file $<
 	@m4 -DLLVM_VERSION=$(LLVM_VERSION) builtins.m4 $< | ./bitcode2cpp.py $< > $@

@@ -143,10 +142,3 @@ objs/stdlib_ispc.cpp: stdlib.ispc
 objs/stdlib_ispc.o: objs/stdlib_ispc.cpp
 	@echo Compiling $<
 	@$(CXX) $(CXXFLAGS) -o $@ -c $<
-
-objs/builtins-sse2.cpp: builtins.m4 builtins-sse2-common.ll builtins-sse2.ll
-objs/builtins-sse2-x2.cpp: builtins.m4 builtins-sse2-common.ll builtins-sse2-x2.ll
-objs/builtins-sse4.cpp: builtins.m4 builtins-sse4-common.ll builtins-sse4.ll
-objs/builtins-sse4-x2.cpp: builtins.m4 builtins-sse4-common.ll builtins-sse4-x2.ll
-objs/builtins-avx.cpp: builtins.m4 builtins-avx-common.ll builtins-avx.ll
-objs/builtins-avx-x2.cpp: builtins.m4 builtins-avx-common.ll builtins-avx-x2.ll
--- a/ast.h
+++ b/ast.h
@@ -1,94 +0,0 @@
-/*
-  Copyright (c) 2011, Intel Corporation
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions are
-  met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived from
-      this software without specific prior written permission.
-
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
-*/
-
-/** @file ast.h
-    @brief 
-*/
-
-#ifndef ISPC_AST_H
-#define ISPC_AST_H 1
-
-#include "ispc.h"
-#include <vector>
-
-/** @brief Abstract base class for nodes in the abstract syntax tree (AST).
-
-    This class defines a basic interface that all abstract syntax tree
-    (AST) nodes must implement.  The base classes for both expressions
-    (Expr) and statements (Stmt) inherit from this class.
-*/
-class ASTNode {
-public:
-    ASTNode(SourcePos p) : pos(p) { }
-    virtual ~ASTNode();
-
-    /** The Optimize() method should perform any appropriate early-stage
-        optimizations on the node (e.g. constant folding).  The caller
-        should use the returned ASTNode * in place of the original node.
-        This method may return NULL if an error is encountered during
-        optimization. */
-    virtual ASTNode *Optimize() = 0;
-
-    /** Type checking should be performed by the node when this method is
-        called.  In the event of an error, a NULL value may be returned.
-        As with ASTNode::Optimize(), the caller should store the returned
-        pointer in place of the original ASTNode *. */
-    virtual ASTNode *TypeCheck() = 0;
-
-    virtual int EstimateCost() const = 0;
-
-    /** All AST nodes must track the file position where they are
-        defined. */
-    SourcePos pos;
-};
-
-
-/** Simple representation of the abstract syntax trees for all of the
-    functions declared in a compilation unit.
- */
-class AST {
-public:
-    /** Add the AST for a function described by the given declaration
-        information and source code. */
-    void AddFunction(Symbol *sym, const std::vector<Symbol *> &args, 
-                     Stmt *code);
-
-    /** Generate LLVM IR for all of the functions into the current
-        module. */
-    void GenerateIR();
-
-private:
-    std::vector<Function *> functions;
-};
-
-#endif // ISPC_AST_H
--- a/buildall.bat
+++ b/buildall.bat
@@ -1,16 +0,0 @@
-@echo off
-
-REM If LLVM_INSTALL_DIR isn't set globally in your environment,
-REM it can be set here_
-set LLVM_INSTALL_DIR=c:\users\mmp\llvm-dev
-
-REM Both the LLVM binaries and python need to be in the path
-set path=%LLVM_INSTALL_DIR%\bin;%PATH%;c:\cygwin\bin
-
-msbuild ispc.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release
-msbuild ispc_test.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release
-
-msbuild examples\examples.sln /V:m /p:Platform=x64 /p:Configuration=Release /t:rebuild
-msbuild examples\examples.sln /V:m /p:Platform=x64 /p:Configuration=Debug /t:rebuild
-msbuild examples\examples.sln /V:m /p:Platform=Win32 /p:Configuration=Release /t:rebuild
-msbuild examples\examples.sln /V:m /p:Platform=Win32 /p:Configuration=Debug /t:rebuild
--- a/builtins-avx-common.ll
+++ b/builtins-avx-common.ll
@@ -30,14 +30,18 @@
 ;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; AVX target implementation.
+;; *** Untested *** AVX target implementation.
+;;
+;; The LLVM AVX code generator is incomplete, so the ispc AVX target
+;; hasn't yet been tested.  There is therefore a higher-than-normal
+;; chance that there are bugs in the code in this file.

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp

 declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone

-define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
+define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
 ;    uniform float iv = extract(__rcp_u(v), 0);
 ;    return iv * (2. - v * iv);
  %vecval = insertelement <4 x float> undef, float %0, i32 0
@@ -56,7 +60,7 @@ define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {

 declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone

-define float @__round_uniform_float(float) nounwind readonly alwaysinline {
+define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
  ; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
  ; the roundss intrinsic is a total mess--docs say:
  ;
@@ -79,7 +83,7 @@ define float @__round_uniform_float(float) nounwind readonly alwaysinline {
  ret float %rs
 }

-define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
+define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
  ; see above for round_ss instrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
@@ -88,7 +92,7 @@ define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
  ret float %rs
 }

-define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
+define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
  ; see above for round_ss instrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
@@ -102,14 +106,14 @@ define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {

 declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone

-define double @__round_uniform_double(double) nounwind readonly alwaysinline {
+define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
  %xi = insertelement <2 x double> undef, double %0, i32 0
  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
  %rs = extractelement <2 x double> %xr, i32 0
  ret double %rs
 }

-define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
+define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
  ; see above for round_ss instrinsic discussion...
  %xi = insertelement <2 x double> undef, double %0, i32 0
  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
@@ -118,7 +122,7 @@ define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
  ret double %rs
 }

-define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
+define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
  ; see above for round_ss instrinsic discussion...
  %xi = insertelement <2 x double> undef, double %0, i32 0
  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
@@ -133,7 +137,7 @@ define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {

 declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone

-define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
+define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
  ;  uniform float is = extract(__rsqrt_u(v), 0);
  %v = insertelement <4 x float> undef, float %0, i32 0
  %vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
@@ -154,7 +158,7 @@ define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {

 declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone

-define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
+define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
  sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
  ret float %ret
 }
@@ -166,7 +170,7 @@ define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
 declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
 declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind

-define void @__fastmath() nounwind alwaysinline {
+define internal void @__fastmath() nounwind alwaysinline {
  %ptr = alloca i32
  %ptr8 = bitcast i32 * %ptr to i8 *
  call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
@@ -185,12 +189,12 @@ define void @__fastmath() nounwind alwaysinline {
 declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
 declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone

-define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
+define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
  ret float %ret
 }

-define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
+define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
  ret float %ret
 }
@@ -202,12 +206,12 @@ define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
 declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
 declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone

-define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
  ret i32 %ret
 }

-define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
  ret i32 %ret
 }
@@ -219,12 +223,12 @@ define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
 declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
 declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone

-define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
  ret i32 %ret
 }

-define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
  ret i32 %ret
 }
@@ -234,14 +238,14 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {

 declare i32 @llvm.ctpop.i32(i32) nounwind readnone

-define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
+define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
  %call = call i32 @llvm.ctpop.i32(i32 %0)
  ret i32 %call
 }

 declare i64 @llvm.ctpop.i64(i64) nounwind readnone

-define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
+define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
  %call = call i64 @llvm.ctpop.i64(i64 %0)
  ret i64 %call
 }
@@ -251,7 +255,7 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {

 declare <2 x double> @llvm.x86.sse.sqrt.sd(<2 x double>) nounwind readnone

-define double @__sqrt_uniform_double(double) nounwind alwaysinline {
+define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
  sse_unary_scalar(ret, 2, double, @llvm.x86.sse.sqrt.sd, %0)
  ret double %ret
 }
@@ -263,12 +267,12 @@ define double @__sqrt_uniform_double(double) nounwind alwaysinline {
 declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
 declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone

-define double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
+define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
  ret double %ret
 }

-define double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
+define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
  ret double %ret
 }
--- a/builtins-avx-x2.ll
+++ b/builtins-avx-x2.ll
@@ -29,6 +29,13 @@
 ;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; *** Untested *** AVX target implementation.
+;;
+;; The LLVM AVX code generator is incomplete, so the ispc AVX target
+;; hasn't yet been tested.  There is therefore a higher-than-normal
+;; chance that there are bugs in the code in this file.
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Basic 16-wide definitions

@@ -44,7 +51,7 @@ include(`builtins-avx-common.ll')

 declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone

-define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
+define internal <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
  ;  float iv = __rcp_v(v);
  ;  return iv * (2. - v * iv);

@@ -64,17 +71,17 @@ define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysi

 declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone

-define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
+define internal <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
  round8to16(%0, 8)
 }

-define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
+define internal <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  round8to16(%0, 9)
 }

-define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
+define internal <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  round8to16(%0, 10)
 }
@@ -84,15 +91,15 @@ define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly always

 declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone

-define <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline {
+define internal <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline {
  round4to16double(%0, 8)
 }

-define <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline {
+define internal <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline {
  round4to16double(%0, 9)
 }

-define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline {
+define internal <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline {
  round4to16double(%0, 10)
 }

@@ -102,7 +109,7 @@ define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alw

 declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone

-define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
+define internal <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
  ;  float is = __rsqrt_v(v);
  unary8to16(is, float, @llvm.x86.avx.rsqrt.ps.256, %v)
  ;  return 0.5 * is * (3. - (v * is) * is);
@@ -125,7 +132,7 @@ define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly al

 declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone

-define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline {
+define internal <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline {
  unary8to16(call, float, @llvm.x86.avx.sqrt.ps.256, %0)
  ret <16 x float> %call
 }
@@ -153,13 +160,13 @@ declare <16 x float> @__svml_pow(<16 x float>, <16 x float>)
 declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
 declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone

-define <16 x float> @__max_varying_float(<16 x float>,
+define internal <16 x float> @__max_varying_float(<16 x float>,
                                                  <16 x float>) nounwind readonly alwaysinline {
  binary8to16(call, float, @llvm.x86.avx.max.ps.256, %0, %1)
  ret <16 x float> %call
 }

-define <16 x float> @__min_varying_float(<16 x float>,
+define internal <16 x float> @__min_varying_float(<16 x float>,
                                                  <16 x float>) nounwind readonly alwaysinline {
  binary8to16(call, float, @llvm.x86.avx.min.ps.256, %0, %1)
  ret <16 x float> %call
@@ -169,12 +176,12 @@ define <16 x float> @__min_varying_float(<16 x float>,
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int min/max

-define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+define internal <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
  ret <16 x i32> %ret
 }

-define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+define internal <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
  ret <16 x i32> %ret
 }
@@ -183,12 +190,12 @@ define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unsigned int min/max

-define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+define internal <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
  ret <16 x i32> %ret
 }

-define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
+define internal <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
  binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
  ret <16 x i32> %ret
 }
@@ -198,7 +205,7 @@ define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonl

 declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone

-define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
+define internal i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
  %floatmask = bitcast <16 x i32> %0 to <16 x float>
  %mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -217,7 +224,7 @@ define i32 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {

 declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone

-define float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
+define internal float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
  %va = shufflevector <16 x float> %0, <16 x float> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %vb = shufflevector <16 x float> %0, <16 x float> undef,
@@ -232,12 +239,12 @@ define float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline {
 }


-define float @__reduce_min_float(<16 x float>) nounwind readnone alwaysinline {
+define internal float @__reduce_min_float(<16 x float>) nounwind readnone alwaysinline {
  reduce16(float, @__min_varying_float, @__min_uniform_float)
 }


-define float @__reduce_max_float(<16 x float>) nounwind readnone alwaysinline {
+define internal float @__reduce_max_float(<16 x float>) nounwind readnone alwaysinline {
  reduce16(float, @__max_varying_float, @__max_uniform_float)
 }

@@ -246,28 +253,28 @@ reduce_equal(16)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal int32 ops

-define <16 x i32> @__add_varying_int32(<16 x i32>,
+define internal <16 x i32> @__add_varying_int32(<16 x i32>,
                                                <16 x i32>) nounwind readnone alwaysinline {
  %s = add <16 x i32> %0, %1
  ret <16 x i32> %s
 }

-define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
+define internal i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
  %s = add i32 %0, %1
  ret i32 %s
 }

-define i32 @__reduce_add_int32(<16 x i32>) nounwind readnone alwaysinline {
+define internal i32 @__reduce_add_int32(<16 x i32>) nounwind readnone alwaysinline {
  reduce16(i32, @__add_varying_int32, @__add_uniform_int32)
 }


-define i32 @__reduce_min_int32(<16 x i32>) nounwind readnone alwaysinline {
+define internal i32 @__reduce_min_int32(<16 x i32>) nounwind readnone alwaysinline {
  reduce16(i32, @__min_varying_int32, @__min_uniform_int32)
 }


-define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
+define internal i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
  reduce16(i32, @__max_varying_int32, @__max_uniform_int32)
 }

@@ -275,17 +282,17 @@ define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone alwaysinline {
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;; horizontal uint32 ops

-define i32 @__reduce_add_uint32(<16 x i32> %v) nounwind readnone alwaysinline {
+define internal i32 @__reduce_add_uint32(<16 x i32> %v) nounwind readnone alwaysinline {
  %r = call i32 @__reduce_add_int32(<16 x i32> %v)
  ret i32 %r
 }

-define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone alwaysinline {
+define internal i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone alwaysinline {
  reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32)
 }


-define i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinline {
+define internal i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinline {
  reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32)
 }

@@ -295,7 +302,7 @@ define i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone alwaysinline {

 declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone

-define double @__reduce_add_double(<16 x double>) nounwind readonly alwaysinline {
+define internal double @__reduce_add_double(<16 x double>) nounwind readonly alwaysinline {
  %va = shufflevector <16 x double> %0, <16 x double> undef,
         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %vb = shufflevector <16 x double> %0, <16 x double> undef,
@@ -315,12 +322,12 @@ define double @__reduce_add_double(<16 x double>) nounwind readonly alwaysinline
  ret double %sum
 }

-define double @__reduce_min_double(<16 x double>) nounwind readnone alwaysinline {
+define internal double @__reduce_min_double(<16 x double>) nounwind readnone alwaysinline {
  reduce16(double, @__min_varying_double, @__min_uniform_double)
 }


-define double @__reduce_max_double(<16 x double>) nounwind readnone alwaysinline {
+define internal double @__reduce_max_double(<16 x double>) nounwind readnone alwaysinline {
  reduce16(double, @__max_varying_double, @__max_uniform_double)
 }

@@ -328,28 +335,28 @@ define double @__reduce_max_double(<16 x double>) nounwind readnone alwaysinline
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal int64 ops

-define <16 x i64> @__add_varying_int64(<16 x i64>,
+define internal <16 x i64> @__add_varying_int64(<16 x i64>,
                                                <16 x i64>) nounwind readnone alwaysinline {
  %s = add <16 x i64> %0, %1
  ret <16 x i64> %s
 }

-define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
+define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
  %s = add i64 %0, %1
  ret i64 %s
 }

-define i64 @__reduce_add_int64(<16 x i64>) nounwind readnone alwaysinline {
+define internal i64 @__reduce_add_int64(<16 x i64>) nounwind readnone alwaysinline {
  reduce16(i64, @__add_varying_int64, @__add_uniform_int64)
 }


-define i64 @__reduce_min_int64(<16 x i64>) nounwind readnone alwaysinline {
+define internal i64 @__reduce_min_int64(<16 x i64>) nounwind readnone alwaysinline {
  reduce16(i64, @__min_varying_int64, @__min_uniform_int64)
 }


-define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
+define internal i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
  reduce16(i64, @__max_varying_int64, @__max_uniform_int64)
 }

@@ -357,17 +364,17 @@ define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone alwaysinline {
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;; horizontal uint64 ops

-define i64 @__reduce_add_uint64(<16 x i64> %v) nounwind readnone alwaysinline {
+define internal i64 @__reduce_add_uint64(<16 x i64> %v) nounwind readnone alwaysinline {
  %r = call i64 @__reduce_add_int64(<16 x i64> %v)
  ret i64 %r
 }

-define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone alwaysinline {
+define internal i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone alwaysinline {
  reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64)
 }


-define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
+define internal i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
  reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
 }

@@ -635,7 +642,7 @@ gen_scatter(16, i64)

 declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone

-define <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alwaysinline {
+define internal <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alwaysinline {
  unary4to16(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
  ret <16 x double> %ret
 }
@@ -647,12 +654,12 @@ define <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alwaysinline
 declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
 declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone

-define <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
+define internal <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
  binary4to16(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
  ret <16 x double> %ret
 }

-define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
+define internal <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
  binary4to16(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
  ret <16 x double> %ret
 }
--- a/builtins-avx.ll
+++ b/builtins-avx.ll
@@ -29,6 +29,13 @@
 ;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; *** Untested *** AVX target implementation.
+;;
+;; The LLVM AVX code generator is incomplete, so the ispc AVX target
+;; hasn't yet been tested.  There is therefore a higher-than-normal
+;; chance that there are bugs in the code in this file.
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Basic 8-wide definitions

@@ -44,7 +51,7 @@ include(`builtins-avx-common.ll')

 declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone

-define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
+define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ;  float iv = __rcp_v(v);
  ;  return iv * (2. - v * iv);

@@ -62,19 +69,19 @@ define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinl

 declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone

-define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
+define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
  %call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 8)
  ret <8 x float> %call
 }

-define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
+define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 9)
  ret <8 x float> %call
 }

-define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
+define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %call = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %0, i32 10)
  ret <8 x float> %call
@@ -85,17 +92,17 @@ define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysin

 declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone

-define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
+define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
  round4to8double(%0, 8)
 }

-define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
+define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
  round4to8double(%0, 9)
 }


-define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
+define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
  round4to8double(%0, 10)
 }
@@ -106,7 +113,7 @@ define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alway

 declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone

-define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
+define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
  ;  float is = __rsqrt_v(v);
  %is = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %v)
  ;  return 0.5 * is * (3. - (v * is) * is);
@@ -125,7 +132,7 @@ define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwa

 declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone

-define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
+define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
  %call = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %0)
  ret <8 x float> %call
 }
@@ -153,13 +160,13 @@ declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
 declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
 declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone

-define <8 x float> @__max_varying_float(<8 x float>,
+define internal <8 x float> @__max_varying_float(<8 x float>,
                                                 <8 x float>) nounwind readonly alwaysinline {
  %call = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %0, <8 x float> %1)
  ret <8 x float> %call
 }

-define <8 x float> @__min_varying_float(<8 x float>,
+define internal <8 x float> @__min_varying_float(<8 x float>,
                                                 <8 x float>) nounwind readonly alwaysinline {
  %call = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %0, <8 x float> %1)
  ret <8 x float> %call
@@ -169,12 +176,12 @@ define <8 x float> @__min_varying_float(<8 x float>,
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int min/max

-define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
  ret <8 x i32> %ret
 }

-define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
  ret <8 x i32> %ret
 }
@@ -183,12 +190,12 @@ define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly al
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unsigned int min/max

-define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+define internal <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
  ret <8 x i32> %ret
 }

-define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+define internal <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
  ret <8 x i32> %ret
 }
@@ -198,7 +205,7 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly a

 declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone

-define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
+define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
  %floatmask = bitcast <8 x i32> %0 to <8 x float>
  %v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
  ret i32 %v
@@ -209,7 +216,7 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {

 declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone

-define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
+define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
  %v1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %0, <8 x float> %0)
  %v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1)
  %scalar1 = extractelement <8 x float> %v2, i32 0
@@ -219,12 +226,12 @@ define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
 }


-define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
+define internal float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
  reduce8(float, @__min_varying_float, @__min_uniform_float)
 }


-define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
+define internal float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
  reduce8(float, @__max_varying_float, @__max_uniform_float)
 }

@@ -233,28 +240,28 @@ reduce_equal(8)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal int32 ops

-define <8 x i32> @__add_varying_int32(<8 x i32>,
+define internal <8 x i32> @__add_varying_int32(<8 x i32>,
                                               <8 x i32>) nounwind readnone alwaysinline {
  %s = add <8 x i32> %0, %1
  ret <8 x i32> %s
 }

-define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
+define internal i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
  %s = add i32 %0, %1
  ret i32 %s
 }

-define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
+define internal i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
  reduce8(i32, @__add_varying_int32, @__add_uniform_int32)
 }


-define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
+define internal i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
  reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
 }


-define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
+define internal i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
  reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
 }

@@ -262,17 +269,17 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;; horizontal uint32 ops

-define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
+define internal i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
  %r = call i32 @__reduce_add_int32(<8 x i32> %v)
  ret i32 %r
 }

-define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
+define internal i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
  reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
 }


-define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
+define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
  reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
 }

@@ -282,7 +289,7 @@ define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {

 declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone

-define double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline {
+define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline {
  %v0 = shufflevector <8 x double> %0, <8 x double> undef,
                      <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v1 = shufflevector <8 x double> %0, <8 x double> undef,
@@ -296,12 +303,12 @@ define double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline
  ret double %sum
 }

-define double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline {
+define internal double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline {
  reduce8(double, @__min_varying_double, @__min_uniform_double)
 }


-define double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline {
+define internal double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline {
  reduce8(double, @__max_varying_double, @__max_uniform_double)
 }

@@ -309,28 +316,28 @@ define double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal int64 ops

-define <8 x i64> @__add_varying_int64(<8 x i64>,
+define internal <8 x i64> @__add_varying_int64(<8 x i64>,
                                               <8 x i64>) nounwind readnone alwaysinline {
  %s = add <8 x i64> %0, %1
  ret <8 x i64> %s
 }

-define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
+define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
  %s = add i64 %0, %1
  ret i64 %s
 }

-define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone alwaysinline {
+define internal i64 @__reduce_add_int64(<8 x i64>) nounwind readnone alwaysinline {
  reduce8(i64, @__add_varying_int64, @__add_uniform_int64)
 }


-define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone alwaysinline {
+define internal i64 @__reduce_min_int64(<8 x i64>) nounwind readnone alwaysinline {
  reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
 }


-define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
+define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
  reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
 }

@@ -338,17 +345,17 @@ define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;; horizontal uint64 ops

-define i64 @__reduce_add_uint64(<8 x i64> %v) nounwind readnone alwaysinline {
+define internal i64 @__reduce_add_uint64(<8 x i64> %v) nounwind readnone alwaysinline {
  %r = call i64 @__reduce_add_int64(<8 x i64> %v)
  ret i64 %r
 }

-define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
+define internal i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
  reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
 }


-define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
+define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
  reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
 }

@@ -533,7 +540,7 @@ gen_scatter(8, i64)

 declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone

-define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
+define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
  unary4to8(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
  ret <8 x double> %ret
 }
@@ -545,12 +552,12 @@ define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
 declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
 declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone

-define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
+define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
  binary4to8(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
  ret <8 x double> %ret
 }

-define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
+define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
  binary4to8(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
  ret <8 x double> %ret
 }
--- a/builtins-c.c
+++ b/builtins-c.c
@@ -57,7 +57,6 @@

 #include <stdint.h>
 #include <stdio.h>
-#include <stdlib.h>
 #include <stdarg.h>

 typedef int Bool;
@@ -133,8 +132,6 @@ void __do_print(const char *format, const char *types, int width, int mask,
                case 'V': PRINT_VECTOR("%llu", unsigned long long);
                case 'd': PRINT_SCALAR("%f", double);
                case 'D': PRINT_VECTOR("%f", double);
-                case 'p': PRINT_SCALAR("%p", void *);
-                case 'P': PRINT_VECTOR("%p", void *);
                default:
                    printf("UNKNOWN TYPE ");
                    putchar(*types);
@@ -150,21 +147,21 @@ void __do_print(const char *format, const char *types, int width, int mask,

 int __num_cores() {
 #ifdef _MSC_VER
-    // This is quite a hack.  Including all of windows.h to get this definition
-    // pulls in a bunch of stuff that leads to undefined symbols at link time.
-    // So we don't #include <windows.h> but instead have the equivalent declarations
-    // here.  Presumably this struct declaration won't be changing in the future
-    // anyway...
-    struct SYSTEM_INFO {
-        int pad0[2];
-        void *pad1[2];
-        int *pad2;
-        int dwNumberOfProcessors;
+	// This is quite a hack.  Including all of windows.h to get this definition
+	// pulls in a bunch of stuff that leads to undefined symbols at link time.
+	// So we don't #include <windows.h> but instead have the equivalent declarations
+	// here.  Presumably this struct declaration won't be changing in the future
+	// anyway...
+  	struct SYSTEM_INFO {
+        int pad0[2];
+        void *pad1[2];
+        int *pad2;
+        int dwNumberOfProcessors;
        int pad3[3];
-    };
+	};

    struct SYSTEM_INFO sysInfo;
-    extern void __stdcall GetSystemInfo(struct SYSTEM_INFO *);
+	extern void __stdcall GetSystemInfo(struct SYSTEM_INFO *);
    GetSystemInfo(&sysInfo);
    return sysInfo.dwNumberOfProcessors;
 #else
--- a/builtins-dispatch.ll
+++ b/builtins-dispatch.ll
@@ -1,123 +0,0 @@
-;;  Copyright (c) 2011, Intel Corporation
-;;  All rights reserved.
-;;
-;;  Redistribution and use in source and binary forms, with or without
-;;  modification, are permitted provided that the following conditions are
-;;  met:
-;;
-;;    * Redistributions of source code must retain the above copyright
-;;      notice, this list of conditions and the following disclaimer.
-;;
-;;    * Redistributions in binary form must reproduce the above copyright
-;;      notice, this list of conditions and the following disclaimer in the
-;;      documentation and/or other materials provided with the distribution.
-;;
-;;    * Neither the name of Intel Corporation nor the names of its
-;;      contributors may be used to endorse or promote products derived from
-;;      this software without specific prior written permission.
-;;
-;;
-;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
-
-;; This file defines various functions that are used when generating the
-;; the "dispatch" object/assembly file that has entrypoints for each
-;; exported function in a module that dispatch to the best available
-;; variant of that function that will run on the system's CPU.
-
-;; Stores the best target ISA that the system on which we're actually
-;; running supports.  -1 represents "uninitialized", otherwise this value
-;; should correspond to one of the enumerant values of Target::ISA from
-;; ispc.h.
-
-@__system_best_isa = internal global i32 -1
-
-declare void @abort() noreturn
-
-;; The below is the result of running "clang -O2 -emit-llvm -c -o -" on the
-;; following code...  Specifically, __get_system_isa should return a value
-;; corresponding to one of the Target::ISA enumerant values that gives the
-;; most capable ISA that the curremt system can run.
-;;
-;; #ifdef _MSC_VER
-;; extern void __stdcall __cpuid(int info[4], int infoType);
-;; #else
-;; static void __cpuid(int info[4], int infoType) {
-;;     __asm__ __volatile__ ("cpuid"
-;;                           : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
-;;                           : "0" (infoType));
-;; }
-;; #endif
-;; 
-;; int32_t __get_system_isa() {
-;;     int info[4];
-;;     __cpuid(info, 1);
-;;     /* NOTE: the values returned below must be the same as the
-;;        corresponding enumerant values in Target::ISA. */
-;;     if ((info[2] & (1 << 28)) != 0)
-;;         return 2; // AVX
-;;     else if ((info[2] & (1 << 19)) != 0)
-;;         return 1; // SSE4
-;;     else if ((info[3] & (1 << 26)) != 0)
-;;         return 0; // SSE2
-;;     else
-;;         abort();
-;; }
-
-%0 = type { i32, i32, i32, i32 }
-
-define i32 @__get_system_isa() nounwind ssp {
-  %1 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
-  %2 = extractvalue %0 %1, 2
-  %3 = extractvalue %0 %1, 3
-  %4 = and i32 %2, 268435456
-  %5 = icmp eq i32 %4, 0
-  br i1 %5, label %6, label %13
-
-; <label>:6                                       ; preds = %0
-  %7 = and i32 %2, 524288
-  %8 = icmp eq i32 %7, 0
-  br i1 %8, label %9, label %13
-
-; <label>:9                                       ; preds = %6
-  %10 = and i32 %3, 67108864
-  %11 = icmp eq i32 %10, 0
-  br i1 %11, label %12, label %13
-
-; <label>:12                                      ; preds = %9
-  tail call void @abort() noreturn nounwind
-  unreachable
-
-; <label>:13                                      ; preds = %9, %6, %0
-  %.0 = phi i32 [ 2, %0 ], [ 1, %6 ], [ 0, %9 ]
-  ret i32 %.0
-}
-
-
-;; This function is called by each of the dispatch functions we generate;
-;; it sets @__system_best_isa if it is unset.
-
-define void @__set_system_isa() {
-entry:
-  %bi = load i32* @__system_best_isa
-  %unset = icmp eq i32 %bi, -1
-  br i1 %unset, label %set_system_isa, label %done
-
-set_system_isa:
-  %bival = call i32 @__get_system_isa()
-  store i32 %bival, i32* @__system_best_isa
-  ret void
-
-done:
-  ret void
-}
-
--- a/builtins-sse.ll
+++ b/builtins-sse.ll
@@ -0,0 +1,417 @@
+;;  Copyright (c) 2010-2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;; This file defines implementations of various stdlib builtins that
+;; only require SSE version 1 and 2 functionality; it is, in turn,
+;; included by builtins-sse2.ll and builtins-sse4.ll to provide
+;; those definitions for them.
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+int64minmax(4)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
+
+define internal <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  %call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
+  ; do one N-R iteration to improve precision
+  ;  float iv = __rcp_v(v);
+  ;  return iv * (2. - v * iv);
+  %v_iv = fmul <4 x float> %0, %call
+  %two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv  
+  %iv_mul = fmul <4 x float> %call, %two_minus
+  ret <4 x float> %iv_mul
+}
+
+define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
+  ; do the rcpss call
+  %vecval = insertelement <4 x float> undef, float %0, i32 0
+  %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
+  %scall = extractelement <4 x float> %call, i32 0
+
+  ; do one N-R iteration to improve precision, as above
+  %v_iv = fmul float %0, %scall
+  %two_minus = fsub float 2., %v_iv  
+  %iv_mul = fmul float %scall, %two_minus
+  ret float %iv_mul
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; rsqrt
+
+declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
+
+define internal <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
+  ;  float is = __rsqrt_v(v);
+  %is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
+  ; Newton-Raphson iteration to improve precision
+  ;  return 0.5 * is * (3. - (v * is) * is);
+  %v_is = fmul <4 x float> %v, %is
+  %v_is_is = fmul <4 x float> %v_is, %is
+  %three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
+  %is_mul = fmul <4 x float> %is, %three_sub
+  %half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
+  ret <4 x float> %half_scale
+}
+
+define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
+  ;  uniform float is = extract(__rsqrt_u(v), 0);
+  %v = insertelement <4 x float> undef, float %0, i32 0
+  %vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
+  %is = extractelement <4 x float> %vis, i32 0
+
+  ; Newton-Raphson iteration to improve precision
+  ;  return 0.5 * is * (3. - (v * is) * is);
+  %v_is = fmul float %0, %is
+  %v_is_is = fmul float %v_is, %is
+  %three_sub = fsub float 3., %v_is_is
+  %is_mul = fmul float %is, %three_sub
+  %half_scale = fmul float 0.5, %is_mul
+  ret float %half_scale
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; sqrt
+
+declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
+
+define internal <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  %call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
+  ret <4 x float> %call
+}
+
+define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
+  sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
+  ret float %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; fast math mode
+
+declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
+declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
+
+define internal void @__fastmath() nounwind alwaysinline {
+  %ptr = alloca i32                            ; stack slot used to read and rewrite the MXCSR value
+  %ptr8 = bitcast i32 * %ptr to i8 *           ; both intrinsics below take an i8 pointer
+  call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)  ; stmxcsr: write the current MXCSR into the slot
+  %oldval = load i32 *%ptr
+
+  ; turn on DAZ (64)/FTZ (32768) -> 32832
+  %update = or i32 %oldval, 32832              ; OR keeps every bit that was already set
+  store i32 %update, i32 *%ptr
+  call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)  ; ldmxcsr: install the updated value
+  ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; svml stuff
+
+declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind  ; not readnone: takes a pointer that the callee presumably writes through
+declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
+declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
+; 4-wide wrappers that call the external SVML routines declared above.
+
+define internal <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
+  %ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
+  ret <4 x float> %ret
+}
+
+define internal <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
+  %ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
+  ret <4 x float> %ret
+}
+
+define internal void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind alwaysinline {
+  %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)  ; %2 is passed on as the pointer argument of the callee
+  store <4 x float> %s, <4 x float> * %1  ; the returned vector is written to %1, so this function must not be readnone
+  ret void
+}
+
+define internal <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
+  %ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
+  ret <4 x float> %ret
+}
+
+define internal <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
+  %ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
+  ret <4 x float> %ret
+}
+
+define internal <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
+  %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
+  ret <4 x float> %ret
+}
+
+define internal <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
+  %ret = call <4 x float> @__svml_expf4(<4 x float> %0)
+  ret <4 x float> %ret
+}
+
+define internal <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
+  %ret = call <4 x float> @__svml_logf4(<4 x float> %0)
+  ret <4 x float> %ret
+}
+
+define internal <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
+  %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
+  ret <4 x float> %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define internal <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
+  %call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
+  ret <4 x float> %call
+}
+
+define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
+  ret float %ret
+}
+
+define internal <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
+  %call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
+  ret <4 x float> %call
+}
+
+define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
+  ret float %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
+define internal <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
+  unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
+  ret <4 x double> %ret
+}
+
+
+define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
+  sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
+  ret double %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define internal <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
+  binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
+  ret <4 x double> %ret
+}
+
+
+define internal double @__min_uniform_double(double, double) nounwind readnone {
+  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
+  ret double %ret
+}
+
+
+;; Maximum of two 4-wide double vectors. The packed max.pd intrinsic is
+;; declared on 2-wide vectors, so an m4 macro (defined outside this file)
+;; is handed the result name, the element type, that intrinsic and both
+;; 4-wide operands, and must define %ret.
+;; NOTE(review): presumably the macro applies the intrinsic to the low and
+;; high halves separately and rejoins them -- confirm against its definition.
+define internal <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
+  binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
+  ret <4 x double> %ret
+}
+
+
+;; Scalar (uniform) double maximum. The m4 macro on the first body line
+;; (defined outside this file) receives the result name, the vector width 2,
+;; the element type, the scalar-form max.sd intrinsic and the two operands;
+;; it must define the %ret value returned here.
+;; NOTE(review): presumably it routes both scalars through lane 0 of 2-wide
+;; vectors -- confirm against the macro definition.
+define internal double @__max_uniform_double(double, double) nounwind readnone {
+  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
+  ret double %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops / reductions
+
+;; SSE intrinsic that turns a 4-wide float vector into a scalar i32; used by
+;; the mask-to-integer helper defined right below.
+declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
+
+;; Converts a 4-wide i32 mask to a scalar i32. The lanes are reinterpreted as
+;; floats with a bitcast (no bits change) purely to match the operand type of
+;; the movmsk.ps intrinsic, whose i32 result is returned unmodified.
+;; NOTE(review): the intrinsic is understood to collect one bit per lane
+;; (the lane sign bit) -- confirm against the instruction reference.
+define internal i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
+  %maskf = bitcast <4 x i32> %0 to <4 x float>
+  %bits = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %maskf) nounwind readnone
+  ret i32 %bits
+}
+
+;; Horizontal minimum over the four float lanes of the unnamed operand.
+;; The body is a single m4 macro invocation (macro defined outside this
+;; file), so the macro must expand to the complete body including the return.
+;; It is given the element type plus the varying and uniform float-min helpers.
+;; NOTE(review): presumably it combines lanes with the vector helper and
+;; finishes with the scalar helper -- confirm against the macro definition.
+define internal float @__reduce_min_float(<4 x float>) nounwind readnone {
+  reduce4(float, @__min_varying_float, @__min_uniform_float)
+}
+
+;; Horizontal maximum over the four float lanes of the unnamed operand.
+;; The body is a single m4 macro invocation (macro defined outside this
+;; file), so the macro must expand to the complete body including the return.
+;; It is given the element type plus the varying and uniform float-max helpers.
+;; NOTE(review): presumably it combines lanes with the vector helper and
+;; finishes with the scalar helper -- confirm against the macro definition.
+define internal float @__reduce_max_float(<4 x float>) nounwind readnone {
+  reduce4(float, @__max_varying_float, @__max_uniform_float)
+}
+
+;; Sum of the four i32 lanes of %v, computed in two steps instead of three
+;; scalar adds.
+define internal i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
+  ; move lanes 2 and 3 down into positions 0 and 1 (upper lanes are undef
+  ; and never read afterwards)
+  %upper = shufflevector <4 x i32> %v, <4 x i32> undef,
+                         <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  ; lane 0 now holds v2+v0 and lane 1 holds v3+v1
+  %folded = add <4 x i32> %upper, %v
+  %lane0 = extractelement <4 x i32> %folded, i32 0
+  %lane1 = extractelement <4 x i32> %folded, i32 1
+  ; final scalar add of the two partial sums
+  %total = add i32 %lane0, %lane1
+  ret i32 %total
+}
+
+;; Horizontal signed minimum over the four i32 lanes of the unnamed operand.
+;; The body is one m4 macro invocation (macro defined outside this file)
+;; which must expand to the whole body including the return; it receives
+;; the element type and the varying and uniform int32-min helpers.
+;; NOTE(review): the exact lane-combining order lives in the macro -- confirm
+;; there.
+define internal i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {
+  reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
+}
+
+;; Horizontal signed maximum over the four i32 lanes of the unnamed operand.
+;; The body is one m4 macro invocation (macro defined outside this file)
+;; which must expand to the whole body including the return; it receives
+;; the element type and the varying and uniform int32-max helpers.
+;; NOTE(review): the exact lane-combining order lives in the macro -- confirm
+;; there.
+define internal i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
+  reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
+}
+
+;; Unsigned 32-bit add reduction. It forwards straight to the signed
+;; variant: the LLVM add instruction used there makes no signed/unsigned
+;; distinction, so one implementation serves both.
+define internal i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
+  %total = call i32 @__reduce_add_int32(<4 x i32> %v)
+  ret i32 %total
+}
+
+;; Horizontal unsigned minimum over the four i32 lanes of the unnamed
+;; operand. The body is one m4 macro invocation (macro defined outside this
+;; file) which must expand to the whole body including the return; it
+;; receives the element type and the varying and uniform uint32-min helpers.
+;; NOTE(review): the exact lane-combining order lives in the macro -- confirm
+;; there.
+define internal i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
+  reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
+}
+
+;; Horizontal unsigned maximum over the four i32 lanes of the unnamed
+;; operand. The body is one m4 macro invocation (macro defined outside this
+;; file) which must expand to the whole body including the return; it
+;; receives the element type and the varying and uniform uint32-max helpers.
+;; NOTE(review): the exact lane-combining order lives in the macro -- confirm
+;; there.
+define internal i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
+  reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
+ }
+
+
+;; Sum of the four double lanes of the unnamed operand. The association is
+;; fixed as (x0 + x2) + (x1 + x3); it matters for floating point and must
+;; not be reordered.
+define internal double @__reduce_add_double(<4 x double>) nounwind readnone {
+  ; split the 4-wide operand into its low and high 2-wide halves
+  %lo = shufflevector <4 x double> %0, <4 x double> undef,
+                      <2 x i32> <i32 0, i32 1>
+  %hi = shufflevector <4 x double> %0, <4 x double> undef,
+                      <2 x i32> <i32 2, i32 3>
+  ; one 2-wide add yields the partial sums x0+x2 and x1+x3
+  %pairs = fadd <2 x double> %lo, %hi
+  %p0 = extractelement <2 x double> %pairs, i32 0
+  %p1 = extractelement <2 x double> %pairs, i32 1
+  ; combine the two partial sums
+  %total = fadd double %p0, %p1
+  ret double %total
+}
+
+;; Horizontal minimum over the four double lanes of the unnamed operand.
+;; The body is one m4 macro invocation (macro defined outside this file)
+;; which must expand to the whole body including the return; it receives
+;; the element type and the varying and uniform double-min helpers.
+;; NOTE(review): the exact lane-combining order lives in the macro -- confirm
+;; there.
+define internal double @__reduce_min_double(<4 x double>) nounwind readnone {
+  reduce4(double, @__min_varying_double, @__min_uniform_double)
+}
+
+;; Horizontal maximum over the four double lanes of the unnamed operand.
+;; The body is one m4 macro invocation (macro defined outside this file)
+;; which must expand to the whole body including the return; it receives
+;; the element type and the varying and uniform double-max helpers.
+;; NOTE(review): the exact lane-combining order lives in the macro -- confirm
+;; there.
+define internal double @__reduce_max_double(<4 x double>) nounwind readnone {
+  reduce4(double, @__max_varying_double, @__max_uniform_double)
+}
+
+;; Sum of the four i64 lanes of the unnamed operand, done as one 2-wide
+;; vector add followed by one scalar add.
+define internal i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
+  ; split the 4-wide operand into its low and high 2-wide halves
+  %lo = shufflevector <4 x i64> %0, <4 x i64> undef,
+                      <2 x i32> <i32 0, i32 1>
+  %hi = shufflevector <4 x i64> %0, <4 x i64> undef,
+                      <2 x i32> <i32 2, i32 3>
+  ; lane 0 holds x0+x2, lane 1 holds x1+x3
+  %pairs = add <2 x i64> %lo, %hi
+  %p0 = extractelement <2 x i64> %pairs, i32 0
+  %p1 = extractelement <2 x i64> %pairs, i32 1
+  ; combine the two partial sums
+  %total = add i64 %p0, %p1
+  ret i64 %total
+}
+
+;; Horizontal signed minimum over the four i64 lanes of the unnamed operand.
+;; The body is one m4 macro invocation (macro defined outside this file)
+;; which must expand to the whole body including the return; it receives
+;; the element type and the varying and uniform int64-min helpers, which
+;; are not defined in the visible part of this file.
+define internal i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
+  reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
+}
+
+;; Horizontal signed maximum over the four i64 lanes of the unnamed operand.
+;; The body is one m4 macro invocation (macro defined outside this file)
+;; which must expand to the whole body including the return; it receives
+;; the element type and the varying and uniform int64-max helpers, which
+;; are not defined in the visible part of this file.
+define internal i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
+  reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
+}
+
+;; Horizontal unsigned minimum over the four i64 lanes of the unnamed
+;; operand. The body is one m4 macro invocation (macro defined outside this
+;; file) which must expand to the whole body including the return; it
+;; receives the element type and the varying and uniform uint64-min helpers,
+;; which are not defined in the visible part of this file.
+define internal i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
+  reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
+}
+
+;; Horizontal unsigned maximum over the four i64 lanes of the unnamed
+;; operand. The body is one m4 macro invocation (macro defined outside this
+;; file) which must expand to the whole body including the return; it
+;; receives the element type and the varying and uniform uint64-max helpers,
+;; which are not defined in the visible part of this file.
+define internal i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
+  reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
+}
+
+;; Top-level m4 macro invocation with the single argument 4; the macro is
+;; defined outside this file.
+;; NOTE(review): presumably this emits the equality-reduction helpers for
+;; 4-wide vectors -- confirm against the macro definition.
+reduce_equal(4)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+;; Masked-store helpers, all produced by m4 macros defined outside this file.
+;; The first invocation takes no arguments; the other four are called with
+;; 4, an integer element type, and that type's size in bits.
+;; NOTE(review): presumably the 4 is the vector width and each call emits the
+;; masked store for one element type -- confirm against the macro definitions.
+masked_store_blend_8_16_by_4()
+
+gen_masked_store(4, i8, 8)
+gen_masked_store(4, i16, 16)
+gen_masked_store(4, i32, 32)
+gen_masked_store(4, i64, 64)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+;; Load helpers, produced by m4 macros defined outside this file. The first
+;; group is called with 4, an integer element type and that type's size in
+;; bits; the second group passes one more number that equals the element
+;; size in bytes (1, 2, 4, 8).
+;; NOTE(review): presumably the 4 is the vector width and the trailing byte
+;; count is a size or alignment -- confirm against the macro definitions.
+load_and_broadcast(4, i8, 8)
+load_and_broadcast(4, i16, 16)
+load_and_broadcast(4, i32, 32)
+load_and_broadcast(4, i64, 64)
+
+load_masked(4, i8,  8,  1)
+load_masked(4, i16, 16, 2)
+load_masked(4, i32, 32, 4)
+load_masked(4, i64, 64, 8)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather/scatter
+
+; Define these with the macros from builtins.m4, the macro file the build
+; runs through m4 ahead of this one. Each macro is invoked once per integer
+; element type (8, 16, 32 and 64 bits) with 4 as its first argument.
+; NOTE(review): presumably the 4 is the vector width -- confirm in builtins.m4.
+
+gen_gather(4, i8)
+gen_gather(4, i16)
+gen_gather(4, i32)
+gen_gather(4, i64)
+
+gen_scatter(4, i8)
+gen_scatter(4, i16)
+gen_scatter(4, i32)
+gen_scatter(4, i64)
--- a/builtins-sse2-common.ll
+++ b/builtins-sse2-common.ll
@@ -1,266 +0,0 @@
-;;  Copyright (c) 2010-2011, Intel Corporation
-;;  All rights reserved.
-;;
-;;  Redistribution and use in source and binary forms, with or without
-;;  modification, are permitted provided that the following conditions are
-;;  met:
-;;
-;;    * Redistributions of source code must retain the above copyright
-;;      notice, this list of conditions and the following disclaimer.
-;;
-;;    * Redistributions in binary form must reproduce the above copyright
-;;      notice, this list of conditions and the following disclaimer in the
-;;      documentation and/or other materials provided with the distribution.
-;;
-;;    * Neither the name of Intel Corporation nor the names of its
-;;      contributors may be used to endorse or promote products derived from
-;;      this software without specific prior written permission.
-;;
-;;
-;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; rcp
-
-declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
-
-define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
-  ; do the rcpss call
-  %vecval = insertelement <4 x float> undef, float %0, i32 0
-  %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
-  %scall = extractelement <4 x float> %call, i32 0
-
-  ; do one N-R iteration to improve precision, as above
-  %v_iv = fmul float %0, %scall
-  %two_minus = fsub float 2., %v_iv  
-  %iv_mul = fmul float %scall, %two_minus
-  ret float %iv_mul
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; rsqrt
-
-declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
-
-define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
-  ;  uniform float is = extract(__rsqrt_u(v), 0);
-  %v = insertelement <4 x float> undef, float %0, i32 0
-  %vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
-  %is = extractelement <4 x float> %vis, i32 0
-
-  ; Newton-Raphson iteration to improve precision
-  ;  return 0.5 * is * (3. - (v * is) * is);
-  %v_is = fmul float %0, %is
-  %v_is_is = fmul float %v_is, %is
-  %three_sub = fsub float 3., %v_is_is
-  %is_mul = fmul float %is, %three_sub
-  %half_scale = fmul float 0.5, %is_mul
-  ret float %half_scale
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; sqrt
-
-declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
-
-
-define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
-  sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
-  ret float %ret
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; fast math mode
-
-declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
-declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
-
-define void @__fastmath() nounwind alwaysinline {
-  %ptr = alloca i32
-  %ptr8 = bitcast i32 * %ptr to i8 *
-  call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
-  %oldval = load i32 *%ptr
-
-  ; turn on DAZ (64)/FTZ (32768) -> 32832
-  %update = or i32 %oldval, 32832
-  store i32 %update, i32 *%ptr
-  call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
-  ret void
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; float min/max
-
-declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
-declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
-
-define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
-  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
-  ret float %ret
-}
-
-
-define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
-  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
-  ret float %ret
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; double precision sqrt
-
-declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
-
-define double @__sqrt_uniform_double(double) nounwind alwaysinline {
-  sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
-  ret double %ret
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; double precision min/max
-
-declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
-
-define double @__min_uniform_double(double, double) nounwind readnone {
-  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
-  ret double %ret
-}
-
-define double @__max_uniform_double(double, double) nounwind readnone {
-  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
-  ret double %ret
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; rounding
-;;
-;; There are not any rounding instructions in SSE2, so we have to emulate
-;; the functionality with multiple instructions...
-
-; The code for __round_* is the result of compiling the following source
-; code.
-;
-; export float Round(float x) {
-;    unsigned int sign = signbits(x);
-;    unsigned int ix = intbits(x);
-;    ix ^= sign;
-;    x = floatbits(ix);
-;    x += 0x1.0p23f;
-;    x -= 0x1.0p23f;
-;    ix = intbits(x);
-;    ix ^= sign;
-;    x = floatbits(ix);
-;    return x;
-;}
-
-define float @__round_uniform_float(float) nounwind readonly alwaysinline {
-  %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32
-  %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648
-  %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i
-  %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float
-  %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06
-  %binop21.i = fadd float %binop.i, -8.388608e+06
-  %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32
-  %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i
-  %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float
-  ret float %int_to_float_bitcast.i.i.i
-}
-
-;; Similarly, for implementations of the __floor* functions below, we have the
-;; bitcode from compiling the following source code...
-
-;export float Floor(float x) {
-;    float y = Round(x);
-;    unsigned int cmp = y > x ? 0xffffffff : 0;
-;    float delta = -1.f;
-;    unsigned int idelta = intbits(delta);
-;    idelta &= cmp;
-;    delta = floatbits(idelta);
-;    return y + delta;
-;}
-
-define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
-  %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
-  %bincmp.i = fcmp ogt float %calltmp.i, %0
-  %selectexpr.i = sext i1 %bincmp.i to i32
-  %bitop.i = and i32 %selectexpr.i, -1082130432
-  %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
-  %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
-  ret float %binop.i
-}
-
-;; And here is the code we compiled to get the __ceil* functions below
-;
-;export uniform float Ceil(uniform float x) {
-;    uniform float y = Round(x);
-;    uniform int yltx = y < x ? 0xffffffff : 0;
-;    uniform float delta = 1.f;
-;    uniform int idelta = intbits(delta);
-;    idelta &= yltx;
-;    delta = floatbits(idelta);
-;    return y + delta;
-;}
-
-define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
-  %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind
-  %bincmp.i = fcmp olt float %calltmp.i, %0
-  %selectexpr.i = sext i1 %bincmp.i to i32
-  %bitop.i = and i32 %selectexpr.i, 1065353216
-  %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float
-  %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i
-  ret float %binop.i
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; rounding doubles
-
-declare double @round(double)
-declare double @floor(double)
-declare double @ceil(double)
-
-define double @__round_uniform_double(double) nounwind readonly alwaysinline {
-  %r = call double @round(double %0)
-  ret double %r
-}
-
-define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
-  %r = call double @floor(double %0)
-  ret double %r
-}
-
-define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
-  %r = call double @ceil(double %0)
-  ret double %r
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; horizontal ops / reductions
-
-declare i32 @llvm.ctpop.i32(i32)
-declare i64 @llvm.ctpop.i64(i64)
-
-define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
-  %val = call i32 @llvm.ctpop.i32(i32 %0)
-  ret i32 %val
-}
-
-define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
-  %val = call i64 @llvm.ctpop.i64(i64 %0)
-  ret i64 %val
-}
-
-
--- a/builtins-sse2-x2.ll
+++ b/builtins-sse2-x2.ll
@@ -1,631 +0,0 @@
-;;  Copyright (c) 2010-2011, Intel Corporation
-;;  All rights reserved.
-;;
-;;  Redistribution and use in source and binary forms, with or without
-;;  modification, are permitted provided that the following conditions are
-;;  met:
-;;
-;;    * Redistributions of source code must retain the above copyright
-;;      notice, this list of conditions and the following disclaimer.
-;;
-;;    * Redistributions in binary form must reproduce the above copyright
-;;      notice, this list of conditions and the following disclaimer in the
-;;      documentation and/or other materials provided with the distribution.
-;;
-;;    * Neither the name of Intel Corporation nor the names of its
-;;      contributors may be used to endorse or promote products derived from
-;;      this software without specific prior written permission.
-;;
-;;
-;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
-
-
-;; This file defines the target for "double-pumped" SSE2, i.e. running
-;; with 8-wide vectors
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; standard 8-wide definitions from m4 macros
-
-stdlib_core(8)
-packed_load_and_store(8)
-scans(8)
-int64minmax(8)
-
-include(`builtins-sse2-common.ll')
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; rcp
-
-declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
-
-define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
-  ;  float iv = __rcp_v(v);
-  ;  return iv * (2. - v * iv);
-
-  unary4to8(call, float, @llvm.x86.sse.rcp.ps, %0)
-  ; do one N-R iteration
-  %v_iv = fmul <8 x float> %0, %call
-  %two_minus = fsub <8 x float> <float 2., float 2., float 2., float 2.,
-                                 float 2., float 2., float 2., float 2.>, %v_iv  
-  %iv_mul = fmul <8 x float> %call, %two_minus
-  ret <8 x float> %iv_mul
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; rsqrt
-
-declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
-
-define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
-  ;  float is = __rsqrt_v(v);
-  unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v)
-  ;  return 0.5 * is * (3. - (v * is) * is);
-  %v_is = fmul <8 x float> %v, %is
-  %v_is_is = fmul <8 x float> %v_is, %is
-  %three_sub = fsub <8 x float> <float 3., float 3., float 3., float 3.,
-                                 float 3., float 3., float 3., float 3.>, %v_is_is
-  %is_mul = fmul <8 x float> %is, %three_sub
-  %half_scale = fmul <8 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
-                                  float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
-  ret <8 x float> %half_scale
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; sqrt
-
-declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
-
-define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
-  unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0)
-  ret <8 x float> %call
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; svml stuff
-
-declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
-declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
-declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
-
-
-define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
-  unary4to8(ret, float, @__svml_sinf4, %0)
-  ret <8 x float> %ret
-}
-
-define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
-  unary4to8(ret, float, @__svml_cosf4, %0)
-  ret <8 x float> %ret
-}
-
-define void @__svml_sincos(<8 x float>, <8 x float> *,
-                                    <8 x float> *) nounwind readnone alwaysinline {
-  ; call svml_sincosf4 two times with the two 4-wide sub-vectors
-  %a = shufflevector <8 x float> %0, <8 x float> undef,
-         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %b = shufflevector <8 x float> %0, <8 x float> undef,
-         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-
-  %cospa = alloca <4 x float>
-  %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
-
-  %cospb = alloca <4 x float>
-  %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
-
-  %sin = shufflevector <4 x float> %sa, <4 x float> %sb,
-         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
-                    i32 4, i32 5, i32 6, i32 7>
-  store <8 x float> %sin, <8 x float> * %1
-
-  %cosa = load <4 x float> * %cospa
-  %cosb = load <4 x float> * %cospb
-  %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
-         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
-                    i32 4, i32 5, i32 6, i32 7>
-  store <8 x float> %cos, <8 x float> * %2
-
-  ret void
-}
-
-define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
-  unary4to8(ret, float, @__svml_tanf4, %0)
-  ret <8 x float> %ret
-}
-
-define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
-  unary4to8(ret, float, @__svml_atanf4, %0)
-  ret <8 x float> %ret
-}
-
-define <8 x float> @__svml_atan2(<8 x float>,
-                                          <8 x float>) nounwind readnone alwaysinline {
-  binary4to8(ret, float, @__svml_atan2f4, %0, %1)
-  ret <8 x float> %ret
-}
-
-define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
-  unary4to8(ret, float, @__svml_expf4, %0)
-  ret <8 x float> %ret
-}
-
-define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
-  unary4to8(ret, float, @__svml_logf4, %0)
-  ret <8 x float> %ret
-}
-
-define <8 x float> @__svml_pow(<8 x float>,
-                                        <8 x float>) nounwind readnone alwaysinline {
-  binary4to8(ret, float, @__svml_powf4, %0, %1)
-  ret <8 x float> %ret
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; float min/max
-
-declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
-declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
-
-define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
-  binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1)
-  ret <8 x float> %call
-}
-
-define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
-  binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1)
-  ret <8 x float> %call
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; min/max
-
-; There is no blend instruction with SSE2, so we simulate it with bit
-; operations on i32s.  For these two vselect functions, for each
-; vector element, if the mask is on, we return the corresponding value
-; from %1, and otherwise return the value from %0.
-
-define <8 x i32> @__vselect_i32(<8 x i32>, <8 x i32> ,
-                                         <8 x i32> %mask) nounwind readnone alwaysinline {
-  %notmask = xor <8 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-  %cleared_old = and <8 x i32> %0, %notmask
-  %masked_new = and <8 x i32> %1, %mask
-  %new = or <8 x i32> %cleared_old, %masked_new
-  ret <8 x i32> %new
-}
-
-define <8 x float> @__vselect_float(<8 x float>, <8 x float>,
-                                             <8 x i32> %mask) nounwind readnone alwaysinline {
-  %v0 = bitcast <8 x float> %0 to <8 x i32>
-  %v1 = bitcast <8 x float> %1 to <8 x i32>
-  %r = call <8 x i32> @__vselect_i32(<8 x i32> %v0, <8 x i32> %v1, <8 x i32> %mask)
-  %rf = bitcast <8 x i32> %r to <8 x float>
-  ret <8 x float> %rf
-}
-
-
-; To do vector integer min and max, we do the vector compare and then sign
-; extend the i1 vector result to an i32 mask.  The __vselect does the
-; rest...
-
-define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-  %c = icmp slt <8 x i32> %0, %1
-  %mask = sext <8 x i1> %c to <8 x i32>
-  %v = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %mask)
-  ret <8 x i32> %v
-}
-
-define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
-  %c = icmp slt i32 %0, %1
-  %r = select i1 %c, i32 %0, i32 %1
-  ret i32 %r
-}
-
-define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-  %c = icmp sgt <8 x i32> %0, %1
-  %mask = sext <8 x i1> %c to <8 x i32>
-  %v = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %mask)
-  ret <8 x i32> %v
-}
-
-define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
-  %c = icmp sgt i32 %0, %1
-  %r = select i1 %c, i32 %0, i32 %1
-  ret i32 %r
-}
-
-; The functions for unsigned ints are similar, just with unsigned
-; comparison functions...
-
-define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-  %c = icmp ult <8 x i32> %0, %1
-  %mask = sext <8 x i1> %c to <8 x i32>
-  %v = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %mask)
-  ret <8 x i32> %v
-}
-
-define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
-  %c = icmp ult i32 %0, %1
-  %r = select i1 %c, i32 %0, i32 %1
-  ret i32 %r
-}
-
-define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
-  %c = icmp ugt <8 x i32> %0, %1
-  %mask = sext <8 x i1> %c to <8 x i32>
-  %v = call <8 x i32> @__vselect_i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %mask)
-  ret <8 x i32> %v
-}
-
-define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
-  %c = icmp ugt i32 %0, %1
-  %r = select i1 %c, i32 %0, i32 %1
-  ret i32 %r
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; horizontal ops / reductions
-
-declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
-
-define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
-  ; first do two 4-wide movmsk calls
-  %floatmask = bitcast <8 x i32> %0 to <8 x float>
-  %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
-          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
-  %m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
-          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
-
-  ; and shift the first one over by 4 before ORing it with the value 
-  ; of the second one
-  %v1s = shl i32 %v1, 4
-  %v = or i32 %v0, %v1s
-  ret i32 %v
-}
-
-define <4 x float> @__vec4_add_float(<4 x float> %v0,
-                                            <4 x float> %v1) nounwind readnone alwaysinline {
-  %v = fadd <4 x float> %v0, %v1
-  ret <4 x float> %v
-}
-
-define float @__add_float(float, float) nounwind readnone alwaysinline {
-  %v = fadd float %0, %1
-  ret float %v
-}
-
-define float @__reduce_add_float(<8 x float>) nounwind readnone alwaysinline {
-  reduce8by4(float, @__vec4_add_float, @__add_float)
-}
-
-define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
-  reduce8(float, @__min_varying_float, @__min_uniform_float)
-}
-
-define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
-  reduce8(float, @__max_varying_float, @__max_uniform_float)
-}
-
-; helper function for reduce_add_int32
-define <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
-                                            <4 x i32> %v1) nounwind readnone alwaysinline {
-  %v = add <4 x i32> %v0, %v1
-  ret <4 x i32> %v
-}
-
-; helper function for reduce_add_int32
-define i32 @__add_int32(i32, i32) nounwind readnone alwaysinline {
-  %v = add i32 %0, %1
-  ret i32 %v
-}
-
-define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
-  reduce8by4(i32, @__vec4_add_int32, @__add_int32)
-}
-
-define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
-  reduce8(i32, @__min_varying_int32, @__min_uniform_int32)
-}
-
-define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
-  reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
-}
-
-define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
-  %r = call i32 @__reduce_add_int32(<8 x i32> %v)
-  ret i32 %r
-}
-
-define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
-  reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
-}
-
-define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
-  reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
-}
-
-define <4 x double> @__add_varying_double(<4 x double>,
-                                     <4 x double>) nounwind readnone alwaysinline {
-  %r = fadd <4 x double> %0, %1
-  ret <4 x double> %r
-}
-
-define double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
-  %r = fadd double %0, %1
-  ret double %r
-}
-
-define double @__reduce_add_double(<8 x double>) nounwind readnone {
-  reduce8by4(double, @__add_varying_double, @__add_uniform_double)
-}
-
-define double @__reduce_min_double(<8 x double>) nounwind readnone {
-  reduce8(double, @__min_varying_double, @__min_uniform_double)
-}
-
-define double @__reduce_max_double(<8 x double>) nounwind readnone {
-  reduce8(double, @__max_varying_double, @__max_uniform_double)
-}
-
-define <4 x i64> @__add_varying_int64(<4 x i64>,
-                                               <4 x i64>) nounwind readnone alwaysinline {
-  %r = add <4 x i64> %0, %1
-  ret <4 x i64> %r
-}
-
-define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
-  %r = add i64 %0, %1
-  ret i64 %r
-}
-
-define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
-  reduce8by4(i64, @__add_varying_int64, @__add_uniform_int64)
-}
-
-define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
-  reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
-}
-
-define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
-  reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
-}
-
-define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
-  reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
-}
-
-define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
-  reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
-}
-
-reduce_equal(8)
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; unaligned loads/loads+broadcasts
-
-load_and_broadcast(8, i8, 8)
-load_and_broadcast(8, i16, 16)
-load_and_broadcast(8, i32, 32)
-load_and_broadcast(8, i64, 64)
-
-load_masked(8, i8,  8,  1)
-load_masked(8, i16, 16, 2)
-load_masked(8, i32, 32, 4)
-load_masked(8, i64, 64, 8)
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; gather/scatter
-
-gen_gather(8, i8)
-gen_gather(8, i16)
-gen_gather(8, i32)
-gen_gather(8, i64)
-
-gen_scatter(8, i8)
-gen_scatter(8, i16)
-gen_scatter(8, i32)
-gen_scatter(8, i64)
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; float rounding
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; rounding
-;;
-;; There are not any rounding instructions in SSE2, so we have to emulate
-;; the functionality with multiple instructions...
-
-; The code for __round_* is the result of compiling the following source
-; code.
-;
-; export float Round(float x) {
-;    unsigned int sign = signbits(x);
-;    unsigned int ix = intbits(x);
-;    ix ^= sign;
-;    x = floatbits(ix);
-;    x += 0x1.0p23f;
-;    x -= 0x1.0p23f;
-;    ix = intbits(x);
-;    ix ^= sign;
-;    x = floatbits(ix);
-;    return x;
-;}
-
-define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
-  %float_to_int_bitcast.i.i.i.i = bitcast <8 x float> %0 to <8 x i32>
-  %bitop.i.i = and <8 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
-  %bitop.i = xor <8 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
-  %int_to_float_bitcast.i.i40.i = bitcast <8 x i32> %bitop.i to <8 x float>
-  %binop.i = fadd <8 x float> %int_to_float_bitcast.i.i40.i, <float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06, float 8.388608e+06>
-  %binop21.i = fadd <8 x float> %binop.i, <float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06, float -8.388608e+06>
-  %float_to_int_bitcast.i.i.i = bitcast <8 x float> %binop21.i to <8 x i32>
-  %bitop31.i = xor <8 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
-  %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop31.i to <8 x float>
-  ret <8 x float> %int_to_float_bitcast.i.i.i
-}
-
-;; Similarly, for implementations of the __floor* functions below, we have the
-;; bitcode from compiling the following source code...
-
-;export float Floor(float x) {
-;    float y = Round(x);
-;    unsigned int cmp = y > x ? 0xffffffff : 0;
-;    float delta = -1.f;
-;    unsigned int idelta = intbits(delta);
-;    idelta &= cmp;
-;    delta = floatbits(idelta);
-;    return y + delta;
-;}
-
-define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
-  %calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind
-  %bincmp.i = fcmp ogt <8 x float> %calltmp.i, %0
-  %val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32>
-  %bitop.i = and <8 x i32> %val_to_boolvec32.i, <i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432, i32 -1082130432>
-  %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float>
-  %binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
-  ret <8 x float> %binop.i
-}
-
-;; And here is the code we compiled to get the __ceil* functions below
-;
-;export uniform float Ceil(uniform float x) {
-;    uniform float y = Round(x);
-;    uniform int yltx = y < x ? 0xffffffff : 0;
-;    uniform float delta = 1.f;
-;    uniform int idelta = intbits(delta);
-;    idelta &= yltx;
-;    delta = floatbits(idelta);
-;    return y + delta;
-;}
-
-define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
-  %calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind
-  %bincmp.i = fcmp olt <8 x float> %calltmp.i, %0
-  %val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32>
-  %bitop.i = and <8 x i32> %val_to_boolvec32.i, <i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216, i32 1065353216>
-  %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float>
-  %binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
-  ret <8 x float> %binop.i
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; rounding doubles
-
-define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
-  unary1to8(double, @round)
-}
-
-define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
-  unary1to8(double, @floor)
-}
-
-define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
-  unary1to8(double, @ceil)
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; masked store
-
-gen_masked_store(8, i8, 8)
-gen_masked_store(8, i16, 16)
-gen_masked_store(8, i32, 32)
-gen_masked_store(8, i64, 64)
-
-masked_store_blend_8_16_by_8()
-
-define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>, 
-                                     <8 x i32> %mask) nounwind alwaysinline {
-  %val = load <8 x i32> * %0, align 4
-  %newval = call <8 x i32> @__vselect_i32(<8 x i32> %val, <8 x i32> %1, <8 x i32> %mask) 
-  store <8 x i32> %newval, <8 x i32> * %0, align 4
-  ret void
-}
-
-define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
-                                     <8 x i32> %mask) nounwind alwaysinline {
-  %oldValue = load <8 x i64>* %ptr, align 8
-
-  ; Do 8x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values
-  ; are actually bitcast <2 x i64> values
-  ;
-  ; set up the first two 64-bit values
-  %old0123  = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
-                            <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %old0123f = bitcast <4 x i64> %old0123 to <8 x float>
-  %new0123  = shufflevector <8 x i64> %new, <8 x i64> undef,
-                            <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %new0123f = bitcast <4 x i64> %new0123 to <8 x float>
-  ; compute mask--note that the indices are doubled-up
-  %mask0123 = shufflevector <8 x i32> %mask, <8 x i32> undef,
-              <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
-  ; and blend the first 4 values
-  %result0123f = call <8 x float> @__vselect_float(<8 x float> %old0123f, <8 x float> %new0123f,
-                                                   <8 x i32> %mask0123)
-  %result0123 = bitcast <8 x float> %result0123f to <4 x i64>
-
-  ; and again
-  %old4567  = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
-                            <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %old4567f = bitcast <4 x i64> %old4567 to <8 x float>
-  %new4567  = shufflevector <8 x i64> %new, <8 x i64> undef,
-                            <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %new4567f = bitcast <4 x i64> %new4567 to <8 x float>
-  ; compute mask--note that the values are doubled-up
-  %mask4567 = shufflevector <8 x i32> %mask, <8 x i32> undef,
-              <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
-  ; and blend the two of the values
-  %result4567f = call <8 x float> @__vselect_float(<8 x float> %old4567f, <8 x float> %new4567f,
-                                                   <8 x i32> %mask4567)
-  %result4567 = bitcast <8 x float> %result4567f to <4 x i64>
-
-  ; reconstruct the final <8 x i64> vector
-  %final = shufflevector <4 x i64> %result0123, <4 x i64> %result4567,
-           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  store <8 x i64> %final, <8 x i64> * %ptr, align 8
-  ret void
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; double precision sqrt
-
-declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
-
-define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
-  unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
-  ret <8 x double> %ret
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; double precision float min/max
-
-declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
-
-define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
-  binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
-  ret <8 x double> %ret
-}
-
-define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
-  binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
-  ret <8 x double> %ret
-}
--- a/builtins-sse2.ll
+++ b/builtins-sse2.ll
@@ -36,9 +36,9 @@
 stdlib_core(4)
 packed_load_and_store(4)
 scans(4)
-int64minmax(4)

-include(`builtins-sse2-common.ll')
+; Include the various definitions of things that only require SSE1 and SSE2
+include(`builtins-sse.ll')

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding
@@ -62,7 +62,7 @@ include(`builtins-sse2-common.ll')
 ;    return x;
 ;}

-define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
+define internal <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
  %float_to_int_bitcast.i.i.i.i = bitcast <4 x float> %0 to <4 x i32>
  %bitop.i.i = and <4 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  %bitop.i = xor <4 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
@@ -75,6 +75,19 @@ define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysi
  ret <4 x float> %int_to_float_bitcast.i.i.i
 }

+define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {  ; scalar counterpart of __round_varying_float
+  %x_bits = bitcast float %0 to i32
+  %sign_bit = and i32 %x_bits, -2147483648                 ; keep only the sign bit (0x80000000)
+  %abs_bits = xor i32 %sign_bit, %x_bits                   ; clear the sign bit
+  %abs_x = bitcast i32 %abs_bits to float
+  %biased = fadd float %abs_x, 8.388608e+06                ; add 2^23 ...
+  %rounded_abs = fadd float %biased, -8.388608e+06         ; ... and subtract it again
+  %rounded_bits = bitcast float %rounded_abs to i32
+  %signed_bits = xor i32 %rounded_bits, %sign_bit          ; put the original sign bit back
+  %result = bitcast i32 %signed_bits to float
+  ret float %result
+}
+
 ;; Similarly, for implementations of the __floor* functions below, we have the
 ;; bitcode from compiling the following source code...

@@ -88,7 +101,7 @@ define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysi
 ;    return y + delta;
 ;}

-define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
+define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
  %calltmp.i = tail call <4 x float> @__round_varying_float(<4 x float> %0) nounwind
  %bincmp.i = fcmp ogt <4 x float> %calltmp.i, %0
  %val_to_boolvec32.i = sext <4 x i1> %bincmp.i to <4 x i32>
@@ -98,6 +111,16 @@ define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysi
  ret <4 x float> %binop.i
 }

+define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {  ; scalar counterpart of __floor_varying_float
+  %rounded = tail call float @__round_uniform_float(float %0) nounwind
+  %too_big = fcmp ogt float %rounded, %0                   ; true when the rounded value is above the input
+  %select_bits = sext i1 %too_big to i32                   ; all ones when true, zero otherwise
+  %delta_bits = and i32 %select_bits, -1082130432          ; -1082130432 is the bit pattern of -1.0f
+  %delta = bitcast i32 %delta_bits to float                ; either -1.0 or 0.0
+  %floored = fadd float %rounded, %delta
+  ret float %floored
+}
+
 ;; And here is the code we compiled to get the __ceil* functions below
 ;
 ;export uniform float Ceil(uniform float x) {
@@ -110,7 +133,7 @@ define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysi
 ;    return y + delta;
 ;}

-define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
+define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
  %calltmp.i = tail call <4 x float> @__round_varying_float(<4 x float> %0) nounwind
  %bincmp.i = fcmp olt <4 x float> %calltmp.i, %0
  %val_to_boolvec32.i = sext <4 x i1> %bincmp.i to <4 x i32>
@@ -120,21 +143,50 @@ define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysin
  ret <4 x float> %binop.i
 }

+define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {  ; scalar counterpart of __ceil_varying_float
+  %rounded = tail call float @__round_uniform_float(float %0) nounwind
+  %too_small = fcmp olt float %rounded, %0                 ; true when the rounded value is below the input
+  %select_bits = sext i1 %too_small to i32                 ; all ones when true, zero otherwise
+  %delta_bits = and i32 %select_bits, 1065353216           ; 1065353216 is the bit pattern of 1.0f
+  %delta = bitcast i32 %delta_bits to float                ; either 1.0 or 0.0
+  %ceiled = fadd float %rounded, %delta
+  ret float %ceiled
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding doubles

-define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
+declare double @round(double)
+declare double @floor(double)
+declare double @ceil(double)
+
+define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
  unary1to4(double, @round)
 }

-define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
+define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {  ; forwards to the external round declared above
+  %rounded = call double @round(double %0)
+  ret double %rounded
+}
+
+define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
  unary1to4(double, @floor)
 }

-define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
+define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {  ; forwards to the external floor declared above
+  %floored = call double @floor(double %0)
+  ret double %floored
+}
+
+define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
  unary1to4(double, @ceil)
 }

+define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {  ; forwards to the external ceil declared above
+  %ceiled = call double @ceil(double %0)
+  ret double %ceiled
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; min/max

@@ -143,7 +195,7 @@ define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alway
 ; vector element, if the mask is on, we return the corresponding value
 ; from %1, and otherwise return the value from %0.

-define <4 x i32> @__vselect_i32(<4 x i32>, <4 x i32> ,
+define internal <4 x i32> @__vselect_i32(<4 x i32>, <4 x i32> ,
                                         <4 x i32> %mask) nounwind readnone alwaysinline {
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %cleared_old = and <4 x i32> %0, %notmask
@@ -152,7 +204,7 @@ define <4 x i32> @__vselect_i32(<4 x i32>, <4 x i32> ,
  ret <4 x i32> %new
 }

-define <4 x float> @__vselect_float(<4 x float>, <4 x float>,
+define internal <4 x float> @__vselect_float(<4 x float>, <4 x float>,
                                             <4 x i32> %mask) nounwind readnone alwaysinline {
  %v0 = bitcast <4 x float> %0 to <4 x i32>
  %v1 = bitcast <4 x float> %1 to <4 x i32>
@@ -166,27 +218,27 @@ define <4 x float> @__vselect_float(<4 x float>, <4 x float>,
 ; extend the i1 vector result to an i32 mask.  The __vselect does the
 ; rest...

-define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+define internal <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %c = icmp slt <4 x i32> %0, %1
  %mask = sext <4 x i1> %c to <4 x i32>
  %v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
  ret <4 x i32> %v
 }

-define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  %c = icmp slt i32 %0, %1
  %r = select i1 %c, i32 %0, i32 %1
  ret i32 %r
 }

-define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+define internal <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %c = icmp sgt <4 x i32> %0, %1
  %mask = sext <4 x i1> %c to <4 x i32>
  %v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
  ret <4 x i32> %v
 }

-define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  %c = icmp sgt i32 %0, %1
  %r = select i1 %c, i32 %0, i32 %1
  ret i32 %r
@@ -195,27 +247,27 @@ define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
 ; The functions for unsigned ints are similar, just with unsigned
 ; comparison functions...

-define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+define internal <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %c = icmp ult <4 x i32> %0, %1
  %mask = sext <4 x i1> %c to <4 x i32>
  %v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
  ret <4 x i32> %v
 }

-define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  %c = icmp ult i32 %0, %1
  %r = select i1 %c, i32 %0, i32 %1
  ret i32 %r
 }

-define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+define internal <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %c = icmp ugt <4 x i32> %0, %1
  %mask = sext <4 x i1> %c to <4 x i32>
  %v = call <4 x i32> @__vselect_i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %mask)
  ret <4 x i32> %v
 }

-define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  %c = icmp ugt i32 %0, %1
  %r = select i1 %c, i32 %0, i32 %1
  ret i32 %r
@@ -225,15 +277,21 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions

-declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
+declare i32 @llvm.ctpop.i32(i32)
+declare i64 @llvm.ctpop.i64(i64)

-define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
-  %floatmask = bitcast <4 x i32> %0 to <4 x float>
-  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
-  ret i32 %v
+define internal i32 @__popcnt_int32(i32) nounwind readnone alwaysinline {  ; population count of an i32; no memory access, so readnone like __popcnt_int64
+  %val = call i32 @llvm.ctpop.i32(i32 %0)  ; number of set bits in the argument
+  ret i32 %val
 }

-define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
+define internal i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {  ; population count of an i64
+  %set_bits = call i64 @llvm.ctpop.i64(i64 %0)  ; number of set bits in the argument
+  ret i64 %set_bits
+}
+
+
+define internal float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
  %v1 = shufflevector <4 x float> %v, <4 x float> undef,
                      <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %m1 = fadd <4 x float> %v1, %v
@@ -243,96 +301,6 @@ define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline
  ret float %sum
 }

-define float @__reduce_min_float(<4 x float>) nounwind readnone {
-  reduce4(float, @__min_varying_float, @__min_uniform_float)
-}
-
-define float @__reduce_max_float(<4 x float>) nounwind readnone {
-  reduce4(float, @__max_varying_float, @__max_uniform_float)
-}
-
-define i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
-  %v1 = shufflevector <4 x i32> %v, <4 x i32> undef,
-                      <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-  %m1 = add <4 x i32> %v1, %v
-  %m1a = extractelement <4 x i32> %m1, i32 0
-  %m1b = extractelement <4 x i32> %m1, i32 1
-  %sum = add i32 %m1a, %m1b
-  ret i32 %sum
-}
-
-define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {
-  reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
-}
-
-define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
-  reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
-}
-
-define i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
-  %r = call i32 @__reduce_add_int32(<4 x i32> %v)
-  ret i32 %r
-}
-
-define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
-  reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
-}
-
-define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
-  reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
- }
-
-
-define double @__reduce_add_double(<4 x double>) nounwind readnone {
-  %v0 = shufflevector <4 x double> %0, <4 x double> undef,
-                      <2 x i32> <i32 0, i32 1>
-  %v1 = shufflevector <4 x double> %0, <4 x double> undef,
-                      <2 x i32> <i32 2, i32 3>
-  %sum = fadd <2 x double> %v0, %v1
-  %e0 = extractelement <2 x double> %sum, i32 0
-  %e1 = extractelement <2 x double> %sum, i32 1
-  %m = fadd double %e0, %e1
-  ret double %m
-}
-
-define double @__reduce_min_double(<4 x double>) nounwind readnone {
-  reduce4(double, @__min_varying_double, @__min_uniform_double)
-}
-
-define double @__reduce_max_double(<4 x double>) nounwind readnone {
-  reduce4(double, @__max_varying_double, @__max_uniform_double)
-}
-
-define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
-  %v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
-                      <2 x i32> <i32 0, i32 1>
-  %v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
-                      <2 x i32> <i32 2, i32 3>
-  %sum = add <2 x i64> %v0, %v1
-  %e0 = extractelement <2 x i64> %sum, i32 0
-  %e1 = extractelement <2 x i64> %sum, i32 1
-  %m = add i64 %e0, %e1
-  ret i64 %m
-}
-
-define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
-  reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
-}
-
-define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
-  reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
-}
-
-define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
-  reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
-}
-
-define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
-  reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
-}
-
-reduce_equal(4)
-

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store
@@ -387,187 +355,3 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
  ret void
 }

-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; rcp
-
-declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
-
-define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
-  %call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
-  ; do one N-R iteration to improve precision
-  ;  float iv = __rcp_v(v);
-  ;  return iv * (2. - v * iv);
-  %v_iv = fmul <4 x float> %0, %call
-  %two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv  
-  %iv_mul = fmul <4 x float> %call, %two_minus
-  ret <4 x float> %iv_mul
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; rsqrt
-
-declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
-
-define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
-  ;  float is = __rsqrt_v(v);
-  %is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
-  ; Newton-Raphson iteration to improve precision
-  ;  return 0.5 * is * (3. - (v * is) * is);
-  %v_is = fmul <4 x float> %v, %is
-  %v_is_is = fmul <4 x float> %v_is, %is
-  %three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
-  %is_mul = fmul <4 x float> %is, %three_sub
-  %half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
-  ret <4 x float> %half_scale
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; sqrt
-
-declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
-
-define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
-  %call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
-  ret <4 x float> %call
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; svml stuff
-
-declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
-declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
-declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
-
-
-define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
-  ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
-  ret <4 x float> %ret
-}
-
-define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline {
-  %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
-  store <4 x float> %s, <4 x float> * %1
-  ret void
-}
-
-define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
-  ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
-  ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
-  ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_expf4(<4 x float> %0)
-  ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_logf4(<4 x float> %0)
-  ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
-  ret <4 x float> %ret
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; float min/max
-
-declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
-declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
-
-define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
-  %call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
-  ret <4 x float> %call
-}
-
-define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
-  %call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
-  ret <4 x float> %call
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; double precision sqrt
-
-declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
-
-define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
-  unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
-  ret <4 x double> %ret
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; double precision min/max
-
-declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
-
-define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
-  binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
-  ret <4 x double> %ret
-}
-
-define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
-  binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
-  ret <4 x double> %ret
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; masked store
-
-masked_store_blend_8_16_by_4()
-
-gen_masked_store(4, i8, 8)
-gen_masked_store(4, i16, 16)
-gen_masked_store(4, i32, 32)
-gen_masked_store(4, i64, 64)
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; unaligned loads/loads+broadcasts
-
-load_and_broadcast(4, i8, 8)
-load_and_broadcast(4, i16, 16)
-load_and_broadcast(4, i32, 32)
-load_and_broadcast(4, i64, 64)
-
-load_masked(4, i8,  8,  1)
-load_masked(4, i16, 16, 2)
-load_masked(4, i32, 32, 4)
-load_masked(4, i64, 64, 8)
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; gather/scatter
-
-; define these with the macros from stdlib.m4
-
-gen_gather(4, i8)
-gen_gather(4, i16)
-gen_gather(4, i32)
-gen_gather(4, i64)
-
-gen_scatter(4, i8)
-gen_scatter(4, i16)
-gen_scatter(4, i32)
-gen_scatter(4, i64)
--- a/builtins-sse4-common.ll
+++ b/builtins-sse4-common.ll
@@ -1,271 +0,0 @@
-;;  Copyright (c) 2010-2011, Intel Corporation
-;;  All rights reserved.
-;;
-;;  Redistribution and use in source and binary forms, with or without
-;;  modification, are permitted provided that the following conditions are
-;;  met:
-;;
-;;    * Redistributions of source code must retain the above copyright
-;;      notice, this list of conditions and the following disclaimer.
-;;
-;;    * Redistributions in binary form must reproduce the above copyright
-;;      notice, this list of conditions and the following disclaimer in the
-;;      documentation and/or other materials provided with the distribution.
-;;
-;;    * Neither the name of Intel Corporation nor the names of its
-;;      contributors may be used to endorse or promote products derived from
-;;      this software without specific prior written permission.
-;;
-;;
-;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; rounding floats
-
-declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
-
-define float @__round_uniform_float(float) nounwind readonly alwaysinline {
-  ; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
-  ; the roundss intrinsic is a total mess--docs say:
-  ;
-  ;  __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
-  ;       
-  ;  b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
-  ;  on b0. The higher order 96 bits are copied directly from input parameter a. The
-  ;  return value is described by the following equations:
-  ;
-  ;  r0 = RND(b0)
-  ;  r1 = a1
-  ;  r2 = a2
-  ;  r3 = a3
-  ;
-  ;  It doesn't matter what we pass as a, since we only need the r0 value
-  ;  here.  So we pass the same register for both.  Further, only the 0th
-  ;  element of the b parameter matters
-  %xi = insertelement <4 x float> undef, float %0, i32 0
-  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
-  %rs = extractelement <4 x float> %xr, i32 0
-  ret float %rs
-}
-
-define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
-  ; see above for round_ss instrinsic discussion...
-  %xi = insertelement <4 x float> undef, float %0, i32 0
-  ; roundps, round down 0b01 | don't signal precision exceptions 0b1010 = 9
-  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
-  %rs = extractelement <4 x float> %xr, i32 0
-  ret float %rs
-}
-
-define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
-  ; see above for round_ss instrinsic discussion...
-  %xi = insertelement <4 x float> undef, float %0, i32 0
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
-  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
-  %rs = extractelement <4 x float> %xr, i32 0
-  ret float %rs
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; rounding doubles
-
-declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
-
-define double @__round_uniform_double(double) nounwind readonly alwaysinline {
-  %xi = insertelement <2 x double> undef, double %0, i32 0
-  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
-  %rs = extractelement <2 x double> %xr, i32 0
-  ret double %rs
-}
-
-define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
-  ; see above for round_ss instrinsic discussion...
-  %xi = insertelement <2 x double> undef, double %0, i32 0
-  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
-  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
-  %rs = extractelement <2 x double> %xr, i32 0
-  ret double %rs
-}
-
-define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
-  ; see above for round_ss instrinsic discussion...
-  %xi = insertelement <2 x double> undef, double %0, i32 0
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
-  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
-  %rs = extractelement <2 x double> %xr, i32 0
-  ret double %rs
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; rcp
-
-declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
-
-define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
-  ; do the rcpss call
-  %vecval = insertelement <4 x float> undef, float %0, i32 0
-  %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
-  %scall = extractelement <4 x float> %call, i32 0
-
-  ; do one N-R iteration to improve precision, as above
-  %v_iv = fmul float %0, %scall
-  %two_minus = fsub float 2., %v_iv  
-  %iv_mul = fmul float %scall, %two_minus
-  ret float %iv_mul
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; rsqrt
-
-declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
-
-define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
-  ;  uniform float is = extract(__rsqrt_u(v), 0);
-  %v = insertelement <4 x float> undef, float %0, i32 0
-  %vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
-  %is = extractelement <4 x float> %vis, i32 0
-
-  ; Newton-Raphson iteration to improve precision
-  ;  return 0.5 * is * (3. - (v * is) * is);
-  %v_is = fmul float %0, %is
-  %v_is_is = fmul float %v_is, %is
-  %three_sub = fsub float 3., %v_is_is
-  %is_mul = fmul float %is, %three_sub
-  %half_scale = fmul float 0.5, %is_mul
-  ret float %half_scale
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; sqrt
-
-declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
-
-define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
-  sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
-  ret float %ret
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; fast math mode
-
-declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
-declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
-
-define void @__fastmath() nounwind alwaysinline {
-  %ptr = alloca i32
-  %ptr8 = bitcast i32 * %ptr to i8 *
-  call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
-  %oldval = load i32 *%ptr
-
-  ; turn on DAZ (64)/FTZ (32768) -> 32832
-  %update = or i32 %oldval, 32832
-  store i32 %update, i32 *%ptr
-  call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
-  ret void
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; float min/max
-
-declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
-declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
-
-define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
-  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
-  ret float %ret
-}
-
-define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
-  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
-  ret float %ret
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; double precision sqrt
-
-declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
-
-define double @__sqrt_uniform_double(double) nounwind alwaysinline {
-  sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
-  ret double %ret
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; double precision min/max
-
-declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
-
-define double @__min_uniform_double(double, double) nounwind readnone {
-  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
-  ret double %ret
-}
-
-
-define double @__max_uniform_double(double, double) nounwind readnone {
-  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
-  ret double %ret
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; int32 min/max
-
-declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
-declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
-
-define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
-  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
-  ret i32 %ret
-}
-
-define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
-  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
-  ret i32 %ret
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; unsigned int min/max
-
-declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
-declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
-
-define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
-  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
-  ret i32 %ret
-}
-
-define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
-  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
-  ret i32 %ret
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; horizontal ops / reductions
-
-declare i32 @llvm.ctpop.i32(i32) nounwind readnone
-
-define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
-  %call = call i32 @llvm.ctpop.i32(i32 %0)
-  ret i32 %call
-}
-
-declare i64 @llvm.ctpop.i64(i64) nounwind readnone
-
-define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
-  %call = call i64 @llvm.ctpop.i64(i64 %0)
-  ret i64 %call
-}
--- a/builtins-sse4.ll
+++ b/builtins-sse4.ll
@@ -36,334 +36,200 @@
 stdlib_core(4)
 packed_load_and_store(4)
 scans(4)
-int64minmax(4)

-include(`builtins-sse4-common.ll')
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; rcp
-
-declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
-
-define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
-  %call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
-  ; do one N-R iteration to improve precision
-  ;  float iv = __rcp_v(v);
-  ;  return iv * (2. - v * iv);
-  %v_iv = fmul <4 x float> %0, %call
-  %two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv  
-  %iv_mul = fmul <4 x float> %call, %two_minus
-  ret <4 x float> %iv_mul
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; rsqrt
-
-declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
-
-define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
-  ;  float is = __rsqrt_v(v);
-  %is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
-  ; Newton-Raphson iteration to improve precision
-  ;  return 0.5 * is * (3. - (v * is) * is);
-  %v_is = fmul <4 x float> %v, %is
-  %v_is_is = fmul <4 x float> %v_is, %is
-  %three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
-  %is_mul = fmul <4 x float> %is, %three_sub
-  %half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
-  ret <4 x float> %half_scale
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; sqrt
-
-declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
-
-define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
-  %call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
-  ret <4 x float> %call
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; double precision sqrt
-
-declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
-
-define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
-  unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
-  ret <4 x double> %ret
-}
+; Define the stuff that can be done with base SSE1/SSE2 instructions
+include(`builtins-sse.ll')

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding floats

 declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
+declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone

-define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
+define internal <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 8)
  ret <4 x float> %call
 }

-define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
+define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
+  ; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
+  ; the roundss intrinsic is a total mess--docs say:
+  ;
+  ;  __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
+  ;       
+  ;  b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
+  ;  on b0. The higher order 96 bits are copied directly from input parameter a. The
+  ;  return value is described by the following equations:
+  ;
+  ;  r0 = RND(b0)
+  ;  r1 = a1
+  ;  r2 = a2
+  ;  r3 = a3
+  ;
+  ;  It doesn't matter what we pass as a, since we only need the r0 value
+  ;  here.  So we pass the same register for both.  Further, only the 0th
+  ;  element of the b parameter matters
+  %xi = insertelement <4 x float> undef, float %0, i32 0
+  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
+  %rs = extractelement <4 x float> %xr, i32 0
+  ret float %rs
+}
+
+define internal <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
  ret <4 x float> %call
 }

-define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
+define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
+  ; scalar floor via roundss; see __round_uniform_float above for the round.ss intrinsic discussion
+  %xi = insertelement <4 x float> undef, float %0, i32 0
+  ; roundss, round down 0b01 | don't signal precision exceptions 0b1000 = 0b1001 = 9
+  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
+  %rs = extractelement <4 x float> %xr, i32 0
+  ret float %rs
+}
+
+define internal <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
  ret <4 x float> %call
 }

+define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
+  ; scalar ceil via roundss; see __round_uniform_float above for the round.ss intrinsic discussion
+  %xi = insertelement <4 x float> undef, float %0, i32 0
+  ; roundss, round up 0b10 | don't signal precision exceptions 0b1000 = 0b1010 = 10
+  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
+  %rs = extractelement <4 x float> %xr, i32 0
+  ret float %rs
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding doubles

 declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone

-define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
+define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
  round2to4double(%0, 8)
 }

-define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
+define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
+  %xi = insertelement <2 x double> undef, double %0, i32 0
+  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
+  %rs = extractelement <2 x double> %xr, i32 0
+  ret double %rs
+}
+
+define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  round2to4double(%0, 9)
 }

-define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
+define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
+  ; scalar floor via roundsd; round.sd takes (a, b, mode) like the round.ss intrinsic discussed above
+  %xi = insertelement <2 x double> undef, double %0, i32 0
+  ; roundsd, round down 0b01 | don't signal precision exceptions 0b1000 = 0b1001 = 9
+  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
+  %rs = extractelement <2 x double> %xr, i32 0
+  ret double %rs
+}
+
+define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  round2to4double(%0, 10)
 }

-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; float min/max
-
-declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
-declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
-
-define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
-  %call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
-  ret <4 x float> %call
-}
-
-define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
-  %call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
-  ret <4 x float> %call
+define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
+  ; scalar ceil via roundsd; round.sd takes (a, b, mode) like the round.ss intrinsic discussed above
+  %xi = insertelement <2 x double> undef, double %0, i32 0
+  ; roundsd, round up 0b10 | don't signal precision exceptions 0b1000 = 0b1010 = 10
+  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
+  %rs = extractelement <2 x double> %xr, i32 0
+  ret double %rs
 }

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int32 min/max

-define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define internal <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %call = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %call
 }

-define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
+  ret i32 %ret
+}
+
+define internal <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %call = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %call
 }

+define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
+  ret i32 %ret
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; unsigned int min/max

-define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define internal <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %call = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %call
 }

-define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
+  ret i32 %ret
+}
+
+define internal <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  %call = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %call
 }

-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; double precision min/max
-
-declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
-
-define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
-  binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
-  ret <4 x double> %ret
+define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
+  ret i32 %ret
 }

-define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
-  binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
-  ret <4 x double> %ret
-}
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; svml stuff
-
-declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
-declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
-declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
-
-
-define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
-  ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
-  ret <4 x float> %ret
-}
-
-define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline {
-  %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
-  store <4 x float> %s, <4 x float> * %1
-  ret void
-}
-
-define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
-  ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
-  ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
-  ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_expf4(<4 x float> %0)
-  ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_logf4(<4 x float> %0)
-  ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
-  ret <4 x float> %ret
-}

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions

-declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
+declare i32 @llvm.ctpop.i32(i32) nounwind readnone

-define i32 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
-  %floatmask = bitcast <4 x i32> %0 to <4 x float>
-  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
-  ret i32 %v
+define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
+  %call = call i32 @llvm.ctpop.i32(i32 %0)
+  ret i32 %call
+}
+
+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
+
+define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
+  %call = call i64 @llvm.ctpop.i64(i64 %0)
+  ret i64 %call
 }

 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone

-define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
+define internal float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
  %v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0)
  %v2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1)
  %scalar = extractelement <4 x float> %v2, i32 0
  ret float %scalar
 }

-define float @__reduce_min_float(<4 x float>) nounwind readnone {
-  reduce4(float, @__min_varying_float, @__min_uniform_float)
-}
-
-define float @__reduce_max_float(<4 x float>) nounwind readnone {
-  reduce4(float, @__max_varying_float, @__max_uniform_float)
-}
-
-define i32 @__reduce_add_int32(<4 x i32> %v) nounwind readnone {
-  %v1 = shufflevector <4 x i32> %v, <4 x i32> undef,
-                      <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-  %m1 = add <4 x i32> %v1, %v
-  %m1a = extractelement <4 x i32> %m1, i32 0
-  %m1b = extractelement <4 x i32> %m1, i32 1
-  %sum = add i32 %m1a, %m1b
-  ret i32 %sum
-}
-
-define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone {
-  reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
-}
-
-define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone {
-  reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
-}
-
-define i32 @__reduce_add_uint32(<4 x i32> %v) nounwind readnone {
-  %r = call i32 @__reduce_add_int32(<4 x i32> %v)
-  ret i32 %r
-}
-
-define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone {
-  reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
-}
-
-define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
-  reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
- }
-
-
-define double @__reduce_add_double(<4 x double>) nounwind readnone {
-  %v0 = shufflevector <4 x double> %0, <4 x double> undef,
-                      <2 x i32> <i32 0, i32 1>
-  %v1 = shufflevector <4 x double> %0, <4 x double> undef,
-                      <2 x i32> <i32 2, i32 3>
-  %sum = fadd <2 x double> %v0, %v1
-  %e0 = extractelement <2 x double> %sum, i32 0
-  %e1 = extractelement <2 x double> %sum, i32 1
-  %m = fadd double %e0, %e1
-  ret double %m
-}
-
-define double @__reduce_min_double(<4 x double>) nounwind readnone {
-  reduce4(double, @__min_varying_double, @__min_uniform_double)
-}
-
-define double @__reduce_max_double(<4 x double>) nounwind readnone {
-  reduce4(double, @__max_varying_double, @__max_uniform_double)
-}
-
-define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
-  %v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
-                      <2 x i32> <i32 0, i32 1>
-  %v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
-                      <2 x i32> <i32 2, i32 3>
-  %sum = add <2 x i64> %v0, %v1
-  %e0 = extractelement <2 x i64> %sum, i32 0
-  %e1 = extractelement <2 x i64> %sum, i32 1
-  %m = add i64 %e0, %e1
-  ret i64 %m
-}
-
-define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
-  reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
-}
-
-define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
-  reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
-}
-
-define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
-  reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
-}
-
-define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
-  reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
-}
-
-reduce_equal(4)
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store

@@ -432,41 +298,3 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
  store <4 x i64> %final, <4 x i64> * %ptr, align 8
  ret void
 }
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; masked store
-
-masked_store_blend_8_16_by_4()
-
-gen_masked_store(4, i8, 8)
-gen_masked_store(4, i16, 16)
-gen_masked_store(4, i32, 32)
-gen_masked_store(4, i64, 64)
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; unaligned loads/loads+broadcasts
-
-load_and_broadcast(4, i8, 8)
-load_and_broadcast(4, i16, 16)
-load_and_broadcast(4, i32, 32)
-load_and_broadcast(4, i64, 64)
-
-load_masked(4, i8,  8,  1)
-load_masked(4, i16, 16, 2)
-load_masked(4, i32, 32, 4)
-load_masked(4, i64, 64, 8)
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; gather/scatter
-
-; define these with the macros from stdlib.m4
-
-gen_gather(4, i8)
-gen_gather(4, i16)
-gen_gather(4, i32)
-gen_gather(4, i64)
-
-gen_scatter(4, i8)
-gen_scatter(4, i16)
-gen_scatter(4, i32)
-gen_scatter(4, i64)
--- a/builtins-sse4-x2.ll
+++ b/builtins-sse4-x2.ll
@@ -41,14 +41,13 @@ packed_load_and_store(8)
 scans(8)
 int64minmax(8)

-include(`builtins-sse4-common.ll')
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp

 declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone

-define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
+define internal <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ;  float iv = __rcp_v(v);
  ;  return iv * (2. - v * iv);

@@ -61,12 +60,27 @@ define <8 x float> @__rcp_varying_float(<8 x float>) nounwind readonly alwaysinl
  ret <8 x float> %iv_mul
 }

+define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
+;    uniform float iv = extract(__rcp_u(v), 0);
+;    return iv * (2. - v * iv);
+  %vecval = insertelement <4 x float> undef, float %0, i32 0
+  %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
+  %scall = extractelement <4 x float> %call, i32 0
+
+  ; do one N-R iteration
+  %v_iv = fmul float %0, %scall
+  %two_minus = fsub float 2., %v_iv  
+  %iv_mul = fmul float %scall, %two_minus
+  ret float %iv_mul
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rsqrt

 declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone

-define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
+define internal <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwaysinline {
  ;  float is = __rsqrt_v(v);
  unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v)
  ;  return 0.5 * is * (3. - (v * is) * is);
@@ -80,16 +94,56 @@ define <8 x float> @__rsqrt_varying_float(<8 x float> %v) nounwind readonly alwa
  ret <8 x float> %half_scale
 }

+define internal float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
+  ;  uniform float is = extract(__rsqrt_u(v), 0);
+  %v = insertelement <4 x float> undef, float %0, i32 0
+  %vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
+  %is = extractelement <4 x float> %vis, i32 0
+
+  ;  return 0.5 * is * (3. - (v * is) * is);
+  %v_is = fmul float %0, %is
+  %v_is_is = fmul float %v_is, %is
+  %three_sub = fsub float 3., %v_is_is
+  %is_mul = fmul float %is, %three_sub
+  %half_scale = fmul float 0.5, %is_mul
+  ret float %half_scale
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; sqrt

 declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone

-define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
+define internal <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline {
  unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0)
  ret <8 x float> %call
 }

+define internal float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
+  sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0)
+  ret float %ret
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; fast math
+
+declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
+declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
+
+define internal void @__fastmath() nounwind alwaysinline {
+  %ptr = alloca i32
+  %ptr8 = bitcast i32 * %ptr to i8 *
+  call void @llvm.x86.sse.stmxcsr(i8 * %ptr8)
+  %oldval = load i32 *%ptr
+
+  ; turn on DAZ (64)/FTZ (32768) -> 32832
+  %update = or i32 %oldval, 32832
+  store i32 %update, i32 *%ptr
+  call void @llvm.x86.sse.ldmxcsr(i8 * %ptr8)
+  ret void
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; svml stuff

@@ -104,17 +158,17 @@ declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
 declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone


-define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
+define internal <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_sinf4, %0)
  ret <8 x float> %ret
 }

-define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
+define internal <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_cosf4, %0)
  ret <8 x float> %ret
 }

-define void @__svml_sincos(<8 x float>, <8 x float> *,
+define internal void @__svml_sincos(<8 x float>, <8 x float> *,
                                    <8 x float> *) nounwind readnone alwaysinline {
  ; call svml_sincosf4 two times with the two 4-wide sub-vectors
  %a = shufflevector <8 x float> %0, <8 x float> undef,
@@ -143,33 +197,33 @@ define void @__svml_sincos(<8 x float>, <8 x float> *,
  ret void
 }

-define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
+define internal <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_tanf4, %0)
  ret <8 x float> %ret
 }

-define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
+define internal <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_atanf4, %0)
  ret <8 x float> %ret
 }

-define <8 x float> @__svml_atan2(<8 x float>,
+define internal <8 x float> @__svml_atan2(<8 x float>,
                                          <8 x float>) nounwind readnone alwaysinline {
  binary4to8(ret, float, @__svml_atan2f4, %0, %1)
  ret <8 x float> %ret
 }

-define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
+define internal <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_expf4, %0)
  ret <8 x float> %ret
 }

-define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
+define internal <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_logf4, %0)
  ret <8 x float> %ret
 }

-define <8 x float> @__svml_pow(<8 x float>,
+define internal <8 x float> @__svml_pow(<8 x float>,
                                        <8 x float>) nounwind readnone alwaysinline {
  binary4to8(ret, float, @__svml_powf4, %0, %1)
  ret <8 x float> %ret
@@ -180,52 +234,91 @@ define <8 x float> @__svml_pow(<8 x float>,
 ;; float min/max

 declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
 declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone

-define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
+define internal <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
  binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1)
  ret <8 x float> %call
 }

-define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
+define internal float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.max.ss, %0, %1)
+  ret float %ret
+}
+
+define internal <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline {
  binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1)
  ret <8 x float> %call
 }

+define internal float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, float, @llvm.x86.sse.min.ss, %0, %1)
+  ret float %ret
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int32 min/max

-define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define internal <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(call, i32, @llvm.x86.sse41.pminsd, %0, %1)
  ret <8 x i32> %call
 }

-define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
+define internal i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminsd, %0, %1)
+  ret i32 %ret
+}
+
+define internal <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
  ret <8 x i32> %call
 }

+define internal i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
+  ret i32 %ret
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; unsigned int min/max

-define <8 x i32> @__min_varying_uint32(<8 x i32>,
+declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define internal <8 x i32> @__min_varying_uint32(<8 x i32>,
                                                <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1)
  ret <8 x i32> %call
 }

-define <8 x i32> @__max_varying_uint32(<8 x i32>,
+define internal i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pminud, %0, %1)
+  ret i32 %ret
+}
+
+define internal <8 x i32> @__max_varying_uint32(<8 x i32>,
                                                <8 x i32>) nounwind readonly alwaysinline {
  binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1)
  ret <8 x i32> %call
 }

+define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  sse_binary_scalar(ret, 4, i32, @llvm.x86.sse41.pmaxud, %0, %1)
+  ret i32 %ret
+}
+
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions

 declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone

-define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
+define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
  ; first do two 4-wide movmsk calls
  %floatmask = bitcast <8 x i32> %0 to <8 x float>
  %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
@@ -242,103 +335,103 @@ define i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
  ret i32 %v
 }

-define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
+define internal float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
  reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float)
 }

-define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
+define internal float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
  reduce8by4(float, @llvm.x86.sse.max.ps, @__max_uniform_float)
 }

 ; helper function for reduce_add_int32
-define <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
+define internal <4 x i32> @__vec4_add_int32(<4 x i32> %v0,
                                            <4 x i32> %v1) nounwind readnone alwaysinline {
  %v = add <4 x i32> %v0, %v1
  ret <4 x i32> %v
 }

 ; helper function for reduce_add_int32
-define i32 @__add_int32(i32, i32) nounwind readnone alwaysinline {
+define internal i32 @__add_int32(i32, i32) nounwind readnone alwaysinline {
  %v = add i32 %0, %1
  ret i32 %v
 }

-define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
+define internal i32 @__reduce_add_int32(<8 x i32>) nounwind readnone alwaysinline {
  reduce8by4(i32, @__vec4_add_int32, @__add_int32)
 }

-define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
+define internal i32 @__reduce_min_int32(<8 x i32>) nounwind readnone alwaysinline {
  reduce8by4(i32, @llvm.x86.sse41.pminsd, @__min_uniform_int32)
 }

-define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
+define internal i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
  reduce8by4(i32, @llvm.x86.sse41.pmaxsd, @__max_uniform_int32)
 }

-define i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
+define internal i32 @__reduce_add_uint32(<8 x i32> %v) nounwind readnone alwaysinline {
  %r = call i32 @__reduce_add_int32(<8 x i32> %v)
  ret i32 %r
 }

-define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
+define internal i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
  reduce8by4(i32, @llvm.x86.sse41.pminud, @__min_uniform_uint32)
 }

-define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
+define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
  reduce8by4(i32, @llvm.x86.sse41.pmaxud, @__max_uniform_uint32)
 }

-define <4 x double> @__add_varying_double(<4 x double>,
+define internal <4 x double> @__add_varying_double(<4 x double>,
                                     <4 x double>) nounwind readnone alwaysinline {
  %r = fadd <4 x double> %0, %1
  ret <4 x double> %r
 }

-define double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
+define internal double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
  %r = fadd double %0, %1
  ret double %r
 }

-define double @__reduce_add_double(<8 x double>) nounwind readnone {
+define internal double @__reduce_add_double(<8 x double>) nounwind readnone {
  reduce8by4(double, @__add_varying_double, @__add_uniform_double)
 }

-define double @__reduce_min_double(<8 x double>) nounwind readnone {
+define internal double @__reduce_min_double(<8 x double>) nounwind readnone {
  reduce8(double, @__min_varying_double, @__min_uniform_double)
 }

-define double @__reduce_max_double(<8 x double>) nounwind readnone {
+define internal double @__reduce_max_double(<8 x double>) nounwind readnone {
  reduce8(double, @__max_varying_double, @__max_uniform_double)
 }

-define <4 x i64> @__add_varying_int64(<4 x i64>,
+define internal <4 x i64> @__add_varying_int64(<4 x i64>,
                                               <4 x i64>) nounwind readnone alwaysinline {
  %r = add <4 x i64> %0, %1
  ret <4 x i64> %r
 }

-define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
+define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
  %r = add i64 %0, %1
  ret i64 %r
 }

-define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
+define internal i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
  reduce8by4(i64, @__add_varying_int64, @__add_uniform_int64)
 }

-define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
+define internal i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
  reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
 }

-define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
+define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
  reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
 }

-define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
+define internal i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
  reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
 }

-define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
+define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
  reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
 }

@@ -374,47 +467,129 @@ gen_scatter(8, i64)
 ;; float rounding

 declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
+declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone

-define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
+define internal <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
  round4to8(%0, 8)
 }

-define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
+define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
+  ; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
+  ; the roundss intrinsic is a total mess--docs say:
+  ;
+  ;  __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
+  ;       
+  ;  b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
+  ;  on b0. The higher order 96 bits are copied directly from input parameter a. The
+  ;  return value is described by the following equations:
+  ;
+  ;  r0 = RND(b0)
+  ;  r1 = a1
+  ;  r2 = a2
+  ;  r3 = a3
+  ;
+  ;  It doesn't matter what we pass as a, since we only need the r0 value
+  ;  here.  So we pass the same register for both.  
+  %xi = insertelement <4 x float> undef, float %0, i32 0
+  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
+  %rs = extractelement <4 x float> %xr, i32 0
+  ret float %rs
+}
+
+define internal <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  round4to8(%0, 9)
 }

-define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
+define internal float @__floor_uniform_float(float) nounwind readonly alwaysinline {
+  ; see above for round_ss intrinsic discussion...
+  %xi = insertelement <4 x float> undef, float %0, i32 0
+  ; roundss, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
+  %rs = extractelement <4 x float> %xr, i32 0
+  ret float %rs
+}
+
+define internal <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline {
  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  round4to8(%0, 10)
 }

+define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
+  ; see above for round_ss intrinsic discussion...
+  %xi = insertelement <4 x float> undef, float %0, i32 0
+  ; roundss, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
+  %rs = extractelement <4 x float> %xr, i32 0
+  ret float %rs
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding doubles

 declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone

-define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
+define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
  round2to8double(%0, 8)
 }

-define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
+define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
+  %xi = insertelement <2 x double> undef, double %0, i32 0
+  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
+  %rs = extractelement <2 x double> %xr, i32 0
+  ret double %rs
+}
+
+define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
  round2to8double(%0, 9)
 }

-define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
+define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
+  ; see above for round_ss intrinsic discussion...
+  %xi = insertelement <2 x double> undef, double %0, i32 0
+  ; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
+  %rs = extractelement <2 x double> %xr, i32 0
+  ret double %rs
+}
+
+define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
  round2to8double(%0, 10)
 }

+define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
+  ; see above for round_ss intrinsic discussion...
+  %xi = insertelement <2 x double> undef, double %0, i32 0
+  ; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
+  %rs = extractelement <2 x double> %xr, i32 0
+  ret double %rs
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions

+declare i32 @llvm.ctpop.i32(i32) nounwind readnone
+
+define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
+  %call = call i32 @llvm.ctpop.i32(i32 %0)
+  ret i32 %call
+}
+
+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
+
+define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
+  %call = call i64 @llvm.ctpop.i64(i64 %0)
+  ret i64 %call
+}
+
 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone

-define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
+define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
  %a = shufflevector <8 x float> %0, <8 x float> undef,
         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b = shufflevector <8 x float> %0, <8 x float> undef,
@@ -543,24 +718,44 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
 ;; double precision sqrt

 declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone

-define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
+define internal <8 x double> @__sqrt_varying_double(<8 x double>) nounwind alwaysinline {
  unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
  ret <8 x double> %ret
 }

+
+define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
+  sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.pd, %0)
+  ret double %ret
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; double precision float min/max

 declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
 declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone

-define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
+define internal <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
  binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
  ret <8 x double> %ret
 }

-define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
+define internal double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
+  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.pd, %0, %1)
+  ret double %ret
+}
+
+define internal <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone alwaysinline {
  binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
  ret <8 x double> %ret
 }
+
+define internal double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
+  sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.pd, %0, %1)
+  ret double %ret
+
+}
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -114,39 +114,59 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {

    // pointers to uniform
    else if (t == LLVMTypes::Int8PointerType)
-        return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt8 :
-                                       AtomicType::UniformInt8);
+        return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt8 :
+                                                 AtomicType::UniformInt8, false);
    else if (t == LLVMTypes::Int16PointerType)
-        return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt16 :
-                                       AtomicType::UniformInt16);
+        return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt16 :
+                                                 AtomicType::UniformInt16, false);
    else if (t == LLVMTypes::Int32PointerType)
-        return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt32 :
-                                       AtomicType::UniformInt32);
+        return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt32 :
+                                                 AtomicType::UniformInt32, false);
    else if (t == LLVMTypes::Int64PointerType)
-        return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt64 :
-                                       AtomicType::UniformInt64);
+        return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt64 :
+                                                 AtomicType::UniformInt64, false);
    else if (t == LLVMTypes::FloatPointerType)
-        return PointerType::GetUniform(AtomicType::UniformFloat);
+        return new ReferenceType(AtomicType::UniformFloat, false);
    else if (t == LLVMTypes::DoublePointerType)
-        return PointerType::GetUniform(AtomicType::UniformDouble);
+        return new ReferenceType(AtomicType::UniformDouble, false);

    // pointers to varying
    else if (t == LLVMTypes::Int8VectorPointerType)
-        return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt8 :
-                                       AtomicType::VaryingInt8);
+        return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt8 :
+                                                 AtomicType::VaryingInt8, false);
    else if (t == LLVMTypes::Int16VectorPointerType)
-        return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt16 :
-                                       AtomicType::VaryingInt16);
+        return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt16 :
+                                                 AtomicType::VaryingInt16, false);
    else if (t == LLVMTypes::Int32VectorPointerType)
-        return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt32 :
-                                       AtomicType::VaryingInt32);
+        return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt32 :
+                                                 AtomicType::VaryingInt32, false);
    else if (t == LLVMTypes::Int64VectorPointerType)
-        return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt64 :
-                                       AtomicType::VaryingInt64);
+        return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt64 :
+                                                 AtomicType::VaryingInt64, false);
    else if (t == LLVMTypes::FloatVectorPointerType)
-        return PointerType::GetUniform(AtomicType::VaryingFloat);
+        return new ReferenceType(AtomicType::VaryingFloat, false);
    else if (t == LLVMTypes::DoubleVectorPointerType)
-        return PointerType::GetUniform(AtomicType::VaryingDouble);
+        return new ReferenceType(AtomicType::VaryingDouble, false);
+
+    // arrays
+    else if (llvm::isa<const llvm::PointerType>(t)) {
+        const llvm::PointerType *pt = llvm::dyn_cast<const llvm::PointerType>(t);
+
+        // Is it a pointer to an unsized array of objects?  If so, then
+        // create the equivalent ispc type.  Note that it has to be a
+        // reference to an array, since ispc passes arrays to functions by
+        // reference.
+        const llvm::ArrayType *at = 
+            llvm::dyn_cast<const llvm::ArrayType>(pt->getElementType());
+        if (at != NULL) {
+            const Type *eltType = lLLVMTypeToISPCType(at->getElementType(),
+                                                      intAsUnsigned);
+            if (eltType == NULL)
+                return NULL;
+            return new ReferenceType(new ArrayType(eltType, at->getNumElements()),
+                                     false);
+        }
+    }

    return NULL;
 }
@@ -161,9 +181,11 @@ lCreateSymbol(const std::string &name, const Type *returnType,
    noPos.name = "__stdlib";

    FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
-
-    Debug(noPos, "Created builtin symbol \"%s\" [%s]\n", name.c_str(),
-          funcType->GetString().c_str());
+    // set NULL default arguments
+    std::vector<ConstExpr *> defaults;
+    for (unsigned int j = 0; j < ftype->getNumParams(); ++j)
+        defaults.push_back(NULL);
+    funcType->SetArgumentDefaults(defaults);

    Symbol *sym = new Symbol(name, noPos, funcType);
    sym->function = func;
@@ -186,9 +208,6 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
    if (name.size() < 3 || name[0] != '_' || name[1] != '_')
        return false;

-    Debug(SourcePos(), "Attempting to create ispc symbol for function \"%s\".",
-          name.c_str());
-
    // An unfortunate hack: we want this builtin function to have the
    // signature "int __sext_varying_bool(bool)", but the ispc function
    // symbol creation code below assumes that any LLVM vector of i32s is a
@@ -198,8 +217,11 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
        const Type *returnType = AtomicType::VaryingInt32;
        std::vector<const Type *> argTypes;
        argTypes.push_back(AtomicType::VaryingBool);
+        std::vector<ConstExpr *> defaults;
+        defaults.push_back(NULL);

        FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
+        funcType->SetArgumentDefaults(defaults);

        Symbol *sym = new Symbol(name, noPos, funcType);
        sym->function = func;
@@ -216,27 +238,22 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {

        const Type *returnType = lLLVMTypeToISPCType(ftype->getReturnType(),
                                                     intAsUnsigned);
-        if (returnType == NULL) {
-            Debug(SourcePos(), "Failed: return type not representable for "
-                  "builtin %s.", name.c_str());
+        if (!returnType)
            // return type not representable in ispc -> not callable from ispc
            return false;
-        }

        // Iterate over the arguments and try to find their equivalent ispc
        // types.  Track if any of the arguments has an integer type.
-        bool anyIntArgs = false;
+        bool anyIntArgs = false, anyReferenceArgs = false;
        std::vector<const Type *> argTypes;
        for (unsigned int j = 0; j < ftype->getNumParams(); ++j) {
            const llvm::Type *llvmArgType = ftype->getParamType(j);
            const Type *type = lLLVMTypeToISPCType(llvmArgType, intAsUnsigned);
-            if (type == NULL) {
-                Debug(SourcePos(), "Failed: type of parameter %d not "
-                      "representable for builtin %s", j, name.c_str());
+            if (type == NULL)
                return false;
-            }
            anyIntArgs |= 
                (Type::Equal(type, lLLVMTypeToISPCType(llvmArgType, !intAsUnsigned)) == false);
+            anyReferenceArgs |= (dynamic_cast<const ReferenceType *>(type) != NULL);
            argTypes.push_back(type);
        }

@@ -244,6 +261,19 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
        // so that we get symbols for things with no integer types!
        if (i == 0 || anyIntArgs == true)
            lCreateSymbol(name, returnType, argTypes, ftype, func, symbolTable);
+
+        // If there are any reference types, also make a variant of the
+        // symbol that has them as const references.  This obviously
+        // doesn't make sense for many builtins, but we'll give the stdlib
+        // the option to call one if it needs one.
+        if (anyReferenceArgs == true) {
+            for (unsigned int j = 0; j < argTypes.size(); ++j) {
+                if (dynamic_cast<const ReferenceType *>(argTypes[j]) != NULL)
+                    argTypes[j] = argTypes[j]->GetAsConstType();
+                lCreateSymbol(name + "_refsconst", returnType, argTypes, 
+                              ftype, func, symbolTable);
+            }
+        }
    }

    return true;
@@ -297,263 +327,6 @@ lCheckModuleIntrinsics(llvm::Module *module) {
 }


-/** We'd like to have all of these functions declared as 'internal' in
-    their respective bitcode files so that if they aren't needed by the
-    user's program they are elimiated from the final output.  However, if
-    we do so, then they aren't brought in by the LinkModules() call below
-    since they aren't yet used by anything in the module they're being
-    linked with (in LLVM 3.1, at least).
-
-    Therefore, we don't declare them as internal when we first define them,
-    but instead mark them as internal after they've been linked in.  This
-    is admittedly a kludge.
- */
-static void
-lSetInternalFunctions(llvm::Module *module) {
-    const char *names[] = {
-        "__add_uniform_int32",
-        "__add_uniform_int64",
-        "__add_varying_int32",
-        "__add_varying_int64",
-        "__aos_to_soa3_float",
-        "__aos_to_soa3_float16",
-        "__aos_to_soa3_float4",
-        "__aos_to_soa3_float8",
-        "__aos_to_soa3_int32",
-        "__aos_to_soa4_float",
-        "__aos_to_soa4_float16",
-        "__aos_to_soa4_float4",
-        "__aos_to_soa4_float8",
-        "__aos_to_soa4_int32",
-        "__atomic_add_int32_global",
-        "__atomic_add_int64_global",
-        "__atomic_add_uniform_int32_global",
-        "__atomic_add_uniform_int64_global",
-        "__atomic_and_int32_global",
-        "__atomic_and_int64_global",
-        "__atomic_and_uniform_int32_global",
-        "__atomic_and_uniform_int64_global",
-        "__atomic_compare_exchange_double_global",
-        "__atomic_compare_exchange_float_global",
-        "__atomic_compare_exchange_int32_global",
-        "__atomic_compare_exchange_int64_global",
-        "__atomic_compare_exchange_uniform_double_global",
-        "__atomic_compare_exchange_uniform_float_global",
-        "__atomic_compare_exchange_uniform_int32_global",
-        "__atomic_compare_exchange_uniform_int64_global",
-        "__atomic_max_uniform_int32_global",
-        "__atomic_max_uniform_int64_global",
-        "__atomic_min_uniform_int32_global",
-        "__atomic_min_uniform_int64_global",
-        "__atomic_or_int32_global",
-        "__atomic_or_int64_global",
-        "__atomic_or_uniform_int32_global",
-        "__atomic_or_uniform_int64_global",
-        "__atomic_sub_int32_global",
-        "__atomic_sub_int64_global",
-        "__atomic_sub_uniform_int32_global",
-        "__atomic_sub_uniform_int64_global",
-        "__atomic_swap_double_global",
-        "__atomic_swap_float_global",
-        "__atomic_swap_int32_global",
-        "__atomic_swap_int64_global",
-        "__atomic_swap_uniform_double_global",
-        "__atomic_swap_uniform_float_global",
-        "__atomic_swap_uniform_int32_global",
-        "__atomic_swap_uniform_int64_global",
-        "__atomic_umax_uniform_uint32_global",
-        "__atomic_umax_uniform_uint64_global",
-        "__atomic_umin_uniform_uint32_global",
-        "__atomic_umin_uniform_uint64_global",
-        "__atomic_xor_int32_global",
-        "__atomic_xor_int64_global",
-        "__atomic_xor_uniform_int32_global",
-        "__atomic_xor_uniform_int64_global",
-        "__broadcast_double",
-        "__broadcast_float",
-        "__broadcast_int16",
-        "__broadcast_int32",
-        "__broadcast_int64",
-        "__broadcast_int8",
-        "__ceil_uniform_double",
-        "__ceil_uniform_float",
-        "__ceil_varying_double",
-        "__ceil_varying_float",
-        "__count_trailing_zeros_i32",
-        "__count_trailing_zeros_i64",
-        "__count_leading_zeros_i32",
-        "__count_leading_zeros_i64",
-        "__do_assert_uniform",
-        "__do_assert_varying",
-        "__do_print", 
-        "__doublebits_uniform_int64",
-        "__doublebits_varying_int64",
-        "__exclusive_scan_add_double",
-        "__exclusive_scan_add_float",
-        "__exclusive_scan_add_i32",
-        "__exclusive_scan_add_i64",
-        "__exclusive_scan_and_i32",
-        "__exclusive_scan_and_i64",
-        "__exclusive_scan_or_i32",
-        "__exclusive_scan_or_i64",
-        "__extract_int16",
-        "__extract_int32",
-        "__extract_int64",
-        "__extract_int8",
-        "__fastmath",
-        "__floatbits_uniform_int32",
-        "__floatbits_varying_int32",
-        "__floor_uniform_double",
-        "__floor_uniform_float",
-        "__floor_varying_double",
-        "__floor_varying_float",
-        "__insert_int16",
-        "__insert_int32",
-        "__insert_int64",
-        "__insert_int8",
-        "__intbits_uniform_double",
-        "__intbits_uniform_float",
-        "__intbits_varying_double",
-        "__intbits_varying_float",
-        "__max_uniform_double",
-        "__max_uniform_float",
-        "__max_uniform_int32",
-        "__max_uniform_int64",
-        "__max_uniform_uint32",
-        "__max_uniform_uint64",
-        "__max_varying_double",
-        "__max_varying_float",
-        "__max_varying_int32",
-        "__max_varying_int64",
-        "__max_varying_uint32",
-        "__max_varying_uint64",
-        "__memory_barrier",
-        "__min_uniform_double",
-        "__min_uniform_float",
-        "__min_uniform_int32",
-        "__min_uniform_int64",
-        "__min_uniform_uint32",
-        "__min_uniform_uint64",
-        "__min_varying_double",
-        "__min_varying_float",
-        "__min_varying_int32",
-        "__min_varying_int64",
-        "__min_varying_uint32",
-        "__min_varying_uint64",
-        "__movmsk",
-        "__num_cores",
-        "__packed_load_active",
-        "__packed_store_active",
-        "__popcnt_int32",
-        "__popcnt_int64",
-        "__prefetch_read_uniform_1",
-        "__prefetch_read_uniform_2",
-        "__prefetch_read_uniform_3",
-        "__prefetch_read_uniform_nt",
-        "__rcp_uniform_float",
-        "__rcp_varying_float",
-        "__reduce_add_double",
-        "__reduce_add_float",
-        "__reduce_add_int32",
-        "__reduce_add_int64",
-        "__reduce_add_uint32",
-        "__reduce_add_uint64",
-        "__reduce_equal_double",
-        "__reduce_equal_float",
-        "__reduce_equal_int32",
-        "__reduce_equal_int64",
-        "__reduce_max_double",
-        "__reduce_max_float",
-        "__reduce_max_int32",
-        "__reduce_max_int64",
-        "__reduce_max_uint32",
-        "__reduce_max_uint64",
-        "__reduce_min_double",
-        "__reduce_min_float",
-        "__reduce_min_int32",
-        "__reduce_min_int64",
-        "__reduce_min_uint32",
-        "__reduce_min_uint64",
-        "__rotate_double",
-        "__rotate_float",
-        "__rotate_int16",
-        "__rotate_int32",
-        "__rotate_int64",
-        "__rotate_int8",
-        "__round_uniform_double",
-        "__round_uniform_float",
-        "__round_varying_double",
-        "__round_varying_float",
-        "__rsqrt_uniform_float",
-        "__rsqrt_varying_float",
-        "__sext_uniform_bool",
-        "__sext_varying_bool",
-        "__shuffle2_double",
-        "__shuffle2_float",
-        "__shuffle2_int16",
-        "__shuffle2_int32",
-        "__shuffle2_int64",
-        "__shuffle2_int8",
-        "__shuffle_double",
-        "__shuffle_float",
-        "__shuffle_int16",
-        "__shuffle_int32",
-        "__shuffle_int64",
-        "__shuffle_int8",
-        "__soa_to_aos3_float",
-        "__soa_to_aos3_float16",
-        "__soa_to_aos3_float4",
-        "__soa_to_aos3_float8",
-        "__soa_to_aos3_int32",
-        "__soa_to_aos4_float",
-        "__soa_to_aos4_float16",
-        "__soa_to_aos4_float4",
-        "__soa_to_aos4_float8",
-        "__soa_to_aos4_int32",
-        "__sqrt_uniform_double",
-        "__sqrt_uniform_float",
-        "__sqrt_varying_double",
-        "__sqrt_varying_float",
-        "__stdlib_atan",
-        "__stdlib_atan2",
-        "__stdlib_atan2f",
-        "__stdlib_atanf",
-        "__stdlib_cos",
-        "__stdlib_cosf",
-        "__stdlib_exp",
-        "__stdlib_expf",
-        "__stdlib_log",
-        "__stdlib_logf",
-        "__stdlib_pow",
-        "__stdlib_powf",
-        "__stdlib_sin",
-        "__stdlib_sincos",
-        "__stdlib_sincosf",
-        "__stdlib_sinf",
-        "__stdlib_tan",
-        "__stdlib_tanf",
-        "__svml_sin",
-        "__svml_cos",
-        "__svml_sincos",
-        "__svml_tan",
-        "__svml_atan",
-        "__svml_atan2",
-        "__svml_exp",
-        "__svml_log",
-        "__svml_pow",
-        "__undef_uniform",
-        "__undef_varying",
-    };
-
-    int count = sizeof(names) / sizeof(names[0]);
-    for (int i = 0; i < count; ++i) {
-        llvm::Function *f = module->getFunction(names[i]);
-        if (f != NULL)
-            f->setLinkage(llvm::GlobalValue::InternalLinkage);
-    }
-}
-
-
 /** This utility function takes serialized binary LLVM bitcode and adds its
    definitions to the given module.  Functions in the bitcode that can be
    mapped to ispc functions are also added to the symbol table.
@@ -563,9 +336,9 @@ lSetInternalFunctions(llvm::Module *module) {
    @param module      Module to link the bitcode into
    @param symbolTable Symbol table to add definitions to
 */
-void
-AddBitcodeToModule(const unsigned char *bitcode, int length,
-                   llvm::Module *module, SymbolTable *symbolTable) {
+static void
+lAddBitcode(const unsigned char *bitcode, int length,
+            llvm::Module *module, SymbolTable *symbolTable) {
    std::string bcErr;
    llvm::StringRef sb = llvm::StringRef((char *)bitcode, length);
    llvm::MemoryBuffer *bcBuf = llvm::MemoryBuffer::getMemBuffer(sb);
@@ -590,15 +363,9 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
        bcModule->setTargetTriple(mTriple.str());

        std::string(linkError);
-        if (llvm::Linker::LinkModules(module, bcModule, 
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
-                                      llvm::Linker::DestroySource,
-#endif // LLVM_3_0
-                                      &linkError))
+        if (llvm::Linker::LinkModules(module, bcModule, &linkError))
            Error(SourcePos(), "Error linking stdlib bitcode: %s", linkError.c_str());
-        lSetInternalFunctions(module);
-        if (symbolTable != NULL)
-            lAddModuleSymbols(module, symbolTable);
+        lAddModuleSymbols(module, symbolTable);
        lCheckModuleIntrinsics(module);
    }
 }
@@ -610,8 +377,8 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
 static void
 lDefineConstantInt(const char *name, int val, llvm::Module *module,
                   SymbolTable *symbolTable) {
-    Symbol *pw = new Symbol(name, SourcePos(), AtomicType::UniformConstInt32,
-                            SC_STATIC);
+    Symbol *pw = new Symbol(name, SourcePos(), AtomicType::UniformConstInt32);
+    pw->isStatic = true;
    pw->constValue = new ConstExpr(pw->type, val, SourcePos());
    LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32Type;
    llvm::Constant *linit = LLVMInt32(val);
@@ -628,7 +395,8 @@ lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
                       SymbolTable *symbolTable) {
    std::vector<const Type *> args;
    FunctionType *ft = new FunctionType(AtomicType::UniformInt32, args, SourcePos());
-    Symbol *sym = new Symbol(name, SourcePos(), ft, SC_STATIC);
+    Symbol *sym = new Symbol(name, SourcePos(), ft);
+    sym->isStatic = true;

    llvm::Function *func = module->getFunction(name);
    assert(func != NULL); // it should be declared already...
@@ -645,7 +413,8 @@ lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
 static void
 lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
    Symbol *pidx = new Symbol("programIndex", SourcePos(), 
-                              AtomicType::VaryingConstInt32, SC_STATIC);
+                              AtomicType::VaryingConstInt32);
+    pidx->isStatic = true;

    int pi[ISPC_MAX_NVEC];
    for (int i = 0; i < g->target.vectorWidth; ++i)
@@ -665,17 +434,17 @@ void
 DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
             bool includeStdlibISPC) {
    // Add the definitions from the compiled builtins-c.c file
-    if (g->target.is32Bit) {
+    if (g->target.is32bit) {
        extern unsigned char builtins_bitcode_c_32[];
        extern int builtins_bitcode_c_32_length;
-        AddBitcodeToModule(builtins_bitcode_c_32, builtins_bitcode_c_32_length, 
-                           module, symbolTable);
+        lAddBitcode(builtins_bitcode_c_32, builtins_bitcode_c_32_length, 
+                    module, symbolTable);
    }
    else {
        extern unsigned char builtins_bitcode_c_64[];
        extern int builtins_bitcode_c_64_length;
-        AddBitcodeToModule(builtins_bitcode_c_64, builtins_bitcode_c_64_length, 
-                           module, symbolTable);
+        lAddBitcode(builtins_bitcode_c_64, builtins_bitcode_c_64_length, 
+                    module, symbolTable);
    }

    // Next, add the target's custom implementations of the various needed
@@ -684,34 +453,22 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
    case Target::SSE2:
        extern unsigned char builtins_bitcode_sse2[];
        extern int builtins_bitcode_sse2_length;
-        extern unsigned char builtins_bitcode_sse2_x2[];
-        extern int builtins_bitcode_sse2_x2_length;
-        switch (g->target.vectorWidth) {
-        case 4: 
-            AddBitcodeToModule(builtins_bitcode_sse2, builtins_bitcode_sse2_length, 
-                               module, symbolTable);
-            break;
-        case 8:
-            AddBitcodeToModule(builtins_bitcode_sse2_x2, builtins_bitcode_sse2_x2_length, 
-                               module, symbolTable);
-            break;
-        default:
-            FATAL("logic error in DefineStdlib");
-        }
+        lAddBitcode(builtins_bitcode_sse2, builtins_bitcode_sse2_length, module,
+                    symbolTable);
        break;
    case Target::SSE4:
        extern unsigned char builtins_bitcode_sse4[];
        extern int builtins_bitcode_sse4_length;
-        extern unsigned char builtins_bitcode_sse4_x2[];
-        extern int builtins_bitcode_sse4_x2_length;
+        extern unsigned char builtins_bitcode_sse4x2[];
+        extern int builtins_bitcode_sse4x2_length;
        switch (g->target.vectorWidth) {
        case 4: 
-            AddBitcodeToModule(builtins_bitcode_sse4, builtins_bitcode_sse4_length, 
-                               module, symbolTable);
+            lAddBitcode(builtins_bitcode_sse4, builtins_bitcode_sse4_length, 
+                        module, symbolTable);
            break;
        case 8:
-            AddBitcodeToModule(builtins_bitcode_sse4_x2, builtins_bitcode_sse4_x2_length, 
-                               module, symbolTable);
+            lAddBitcode(builtins_bitcode_sse4x2, builtins_bitcode_sse4x2_length, 
+                        module, symbolTable);
            break;
        default:
            FATAL("logic error in DefineStdlib");
@@ -722,14 +479,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
        case 8:
            extern unsigned char builtins_bitcode_avx[];
            extern int builtins_bitcode_avx_length;
-            AddBitcodeToModule(builtins_bitcode_avx, builtins_bitcode_avx_length, 
-                               module, symbolTable);
+            lAddBitcode(builtins_bitcode_avx, builtins_bitcode_avx_length, module, 
+                        symbolTable);
            break;
        case 16:
            extern unsigned char builtins_bitcode_avx_x2[];
            extern int builtins_bitcode_avx_x2_length;
-            AddBitcodeToModule(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
-                               module,  symbolTable);
+            lAddBitcode(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
+                        module,  symbolTable);
            break;
        default:
            FATAL("logic error in DefineStdlib");
@@ -765,8 +522,11 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
        // definitions added.  Disable emission of performance warnings for
        // now, since the user doesn't care about any of that in the stdlib
        // implementation...
+        bool epf = g->emitPerfWarnings;
+        g->emitPerfWarnings = false;
        extern char stdlib_code[];
        yy_scan_string(stdlib_code);
        yyparse();
+        g->emitPerfWarnings = epf;
    }
 }
--- a/builtins.h
+++ b/builtins.h
@@ -55,7 +55,4 @@
 void DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
                  bool includeStdlib);

-void AddBitcodeToModule(const unsigned char *bitcode, int length,
-                        llvm::Module *module, SymbolTable *symbolTable = NULL);
-
 #endif // ISPC_STDLIB_H
--- a/builtins.m4
+++ b/builtins.m4
--- a/ctx.cpp
+++ b/ctx.cpp
--- a/ctx.h
+++ b/ctx.h
@@ -41,7 +41,9 @@
 #include "ispc.h"
 #include <llvm/InstrTypes.h>
 #include <llvm/Instructions.h>
+#ifndef LLVM_2_8
 #include <llvm/Analysis/DIBuilder.h>
+#endif
 #include <llvm/Analysis/DebugInfo.h>

 struct CFInfo;
@@ -57,22 +59,17 @@ struct CFInfo;
 class FunctionEmitContext {
 public:
    /** Create a new FunctionEmitContext.
-        @param function     The Function object representing the function
-        @param funSym       Symbol that corresponds to the function
-        @param llvmFunction LLVM function in the current module that corresponds
+        @param returnType   The return type of the function
+        @param function     LLVM function in the current module that corresponds
                            to the function
+        @param funSym       Symbol that corresponds to the function
        @param firstStmtPos Source file position of the first statement in the
                            function
     */
-    FunctionEmitContext(Function *function, Symbol *funSym, 
-                        llvm::Function *llvmFunction,
+    FunctionEmitContext(const Type *returnType, llvm::Function *function, Symbol *funSym,
                        SourcePos firstStmtPos);
    ~FunctionEmitContext();

-    /** Returns the Function * corresponding to the function that we're
-        currently generating code for. */
-    const Function *GetFunction() const;
-
    /** @name Current basic block management
        @{
     */
@@ -86,33 +83,20 @@ public:
    /** @name Mask management
        @{
     */
-    /** Returns the mask value at entry to the current function. */ 
-    llvm::Value *GetFunctionMask();
-
-    /** Returns the mask value corresponding to "varying" control flow
-        within the current function.  (i.e. this doesn't include the effect
-        of the mask at function entry. */
-    llvm::Value *GetInternalMask();
-
-    /** Returns the complete current mask value--i.e. the logical AND of
-        the function entry mask and the internal mask. */ 
-    llvm::Value *GetFullMask();
-
-    /** Provides the alloca'd pointer to memory to store the full function
-        mask.  This is only used to wire up the __mask builtin variable. */
-    void SetMaskPointer(llvm::Value *p);
+    /** Returns the current mask value */ 
+    llvm::Value *GetMask();

    /** Provides the value of the mask at function entry */
-    void SetFunctionMask(llvm::Value *val);
+    void SetEntryMask(llvm::Value *val);

-    /** Sets the internal mask to a new value */
-    void SetInternalMask(llvm::Value *val);
+    /** Sets the mask to a new value */
+    void SetMask(llvm::Value *val);

-    /** Sets the internal mask to (oldMask & val) */
-    void SetInternalMaskAnd(llvm::Value *oldMask, llvm::Value *val);
+    /** Sets the mask to (oldMask & val) */
+    void MaskAnd(llvm::Value *oldMask, llvm::Value *val);

-    /** Sets the internal mask to (oldMask & ~val) */
-    void SetInternalMaskAndNot(llvm::Value *oldMask, llvm::Value *test);
+    /** Sets the mask to (oldMask & ~test) */
+    void MaskAndNot(llvm::Value *oldMask, llvm::Value *test);

    /** Emits a branch instruction to the basic block btrue if any of the
        lanes of current mask are on and bfalse if none are on. */
@@ -131,8 +115,9 @@ public:
        @{
    */
    /** Notifies the FunctionEmitContext that we're starting emission of an
-        'if' statement with a uniform test.  */
-    void StartUniformIf();
+        'if' statement with a uniform test.  The value of the mask going
+        into the 'if' statement is provided in the oldMask parameter. */
+    void StartUniformIf(llvm::Value *oldMask);

    /** Notifies the FunctionEmitContext that we're starting emission of an
        'if' statement with a varying test.  The value of the mask going
@@ -147,9 +132,10 @@ public:
        for a loop.  Basic blocks are provides for where 'break' and
        'continue' statements should jump to (if all running lanes want to
        break or continue), uniformControlFlow indicates whether the loop
-        condition is 'uniform'. */
+        condition is 'uniform', and oldMask provides the current mask going
+        into the loop. */
    void StartLoop(llvm::BasicBlock *breakTarget, llvm::BasicBlock *continueTarget, 
-                   bool uniformControlFlow);
+                   bool uniformControlFlow, llvm::Value *oldMask);

    /** Informs FunctionEmitContext of the value of the mask at the start
        of a loop body. */
@@ -159,13 +145,6 @@ public:
        finished. */
    void EndLoop();

-    /** Indicates that code generation for a 'foreach' or 'foreach_tiled'
-        loop is about to start.  The provided basic block pointer indicates
-        where control flow should go if a 'continue' statement is executed
-        in the loop. */
-    void StartForeach(llvm::BasicBlock *continueTarget);
-    void EndForeach();
-
    /** Emit code for a 'break' statement in a loop.  If doCoherenceCheck
        is true, then if we're in a 'varying' loop, code will be emitted to
        see if all of the lanes want to break, in which case a jump to the
@@ -190,8 +169,6 @@ public:
        flow */
    int VaryingCFDepth() const;

-    bool InForeachLoop() const;
-
    /** Called to generate code for 'return' statement; value is the
        expression in the return statement (if non-NULL), and
        doCoherenceCheck indicates whether instructions should be generated
@@ -233,6 +210,9 @@ public:
        i32. */
    llvm::Value *I1VecToBoolVec(llvm::Value *b);

+    /** Returns the size of the given type. */
+    llvm::Value *SizeOf(LLVM_TYPE_CONST llvm::Type *ty);
+
    /** If the user has asked to compile the program with instrumentation,
        this inserts a callback to the user-supplied instrumentation
        function at the current point in the code. */
@@ -316,18 +296,12 @@ public:
                         llvm::CmpInst::Predicate pred,
                         llvm::Value *v0, llvm::Value *v1, const char *name = NULL);

-    /** Given a scalar value, return a vector of the same type (or an
-        array, for pointer types). */
-    llvm::Value *SmearUniform(llvm::Value *value, const char *name = NULL);
-
    llvm::Value *BitCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
                             const char *name = NULL);
-    llvm::Value *PtrToIntInst(llvm::Value *value, const char *name = NULL);
    llvm::Value *PtrToIntInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
                              const char *name = NULL);
    llvm::Value *IntToPtrInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
                              const char *name = NULL);
-
    llvm::Instruction *TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
                                 const char *name = NULL);
    llvm::Instruction *CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
@@ -339,37 +313,26 @@ public:
    llvm::Instruction *ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type, 
                                const char *name = NULL);

-    /** These GEP methods are generalizations of the standard ones in LLVM;
-        they support both uniform and varying basePtr values as well as
-        uniform and varying index values (arrays of indices).  Varying base
-        pointers are expected to come in as vectors of i32/i64 (depending
-        on the target), since LLVM doesn't currently support vectors of
-        pointers.  The underlying type of the base pointer must be provided
-        via the ptrType parameter */
-    llvm::Value *GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index,
-                                   const Type *ptrType, const char *name = NULL);
+    /** This GEP method is a generalization of the standard one in LLVM; it
+        supports both uniform and varying basePtr values (an array of
+        pointers) as well as uniform and varying index values (arrays of
+        indices). */
    llvm::Value *GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0,
-                                   llvm::Value *index1, const Type *ptrType,
+                                   llvm::Value *index1, const char *name = NULL);
+
+    /** This is a convenience method to generate a GEP instruction whose
+        indices have constant values that are known when the ispc program
+        is being compiled. */
+    llvm::Value *GetElementPtrInst(llvm::Value *basePtr, int v0, int v1,
                                   const char *name = NULL);

-    /** This method returns a new pointer that represents offsetting the
-        given base pointer to point at the given element number of the
-        structure type that the base pointer points to.  (The provided
-        pointer must be a pointer to a structure type.  The ptrType gives
-        the type of the pointer, though it may be NULL if the base pointer
-        is uniform. */
-    llvm::Value *AddElementOffset(llvm::Value *basePtr, int elementNum,
-                                  const Type *ptrType, const char *name = NULL);
-
-    /** Load from the memory location(s) given by lvalue, using the given
-        mask.  The lvalue may be varying, in which case this corresponds to
-        a gather from the multiple memory locations given by the array of
-        pointer values given by the lvalue.  If the lvalue is not varying,
-        then both the mask pointer and the type pointer may be NULL. */
-    llvm::Value *LoadInst(llvm::Value *ptr, llvm::Value *mask,
-                          const Type *ptrType, const char *name = NULL);
-
-    llvm::Value *LoadInst(llvm::Value *ptr, const char *name = NULL);
+    /** Load from the memory location(s) given by lvalue.  The lvalue may
+        be varying, in which case this corresponds to a gather from the
+        multiple memory locations given by the array of pointer values
+        given by the lvalue.  If the lvalue is not varying, then the type
+        parameter may be NULL. */
+    llvm::Value *LoadInst(llvm::Value *lvalue, const Type *type,
+                          const char *name = NULL);

    /** Emits an alloca instruction to allocate stack storage for the given
        type.  If a non-zero alignment is specified, the object is also
@@ -377,20 +340,21 @@ public:
        instruction is added at the start of the function in the entry
        basic block; if it should be added to the current basic block, then
        the atEntryBlock parameter should be false. */ 
-    llvm::Value *AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType, 
-                            const char *name = NULL, int align = 0, 
-                            bool atEntryBlock = true);
+    llvm::Value *AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType, const char *name = NULL,
+                            int align = 0, bool atEntryBlock = true);

    /** Standard store instruction; for this variant, the lvalue must be a
        single pointer, not a varying lvalue. */
-    void StoreInst(llvm::Value *value, llvm::Value *ptr);
+    void StoreInst(llvm::Value *rvalue, llvm::Value *lvalue, 
+                   const char *name = NULL);

    /** In this variant of StoreInst(), the lvalue may be varying.  If so,
        this corresponds to a scatter.  Whether the lvalue is uniform of
        varying, the given storeMask is used to mask the stores so that
        they only execute for the active program instances. */
-    void StoreInst(llvm::Value *value, llvm::Value *ptr,
-                   llvm::Value *storeMask, const Type *ptrType);
+    void StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
+                   llvm::Value *storeMask, const Type *rvalueType,
+                   const char *name = NULL);

    void BranchInst(llvm::BasicBlock *block);
    void BranchInst(llvm::BasicBlock *trueBlock, llvm::BasicBlock *falseBlock,
@@ -412,30 +376,24 @@ public:
    llvm::Instruction *SelectInst(llvm::Value *test, llvm::Value *val0,
                                  llvm::Value *val1, const char *name = NULL);

-    /** Emits IR to do a function call with the given arguments.  If the
-        function type is a varying function pointer type, its full type
-        must be provided in funcType.  funcType can be NULL if func is a
-        uniform function pointer. */
-    llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
-                          const std::vector<llvm::Value *> &args,
-                          const char *name = NULL);
-
+    llvm::Instruction *CallInst(llvm::Function *func, 
+                                const std::vector<llvm::Value *> &args,
+                                const char *name = NULL);
    /** This is a convenience method that issues a call instruction to a
        function that takes just a single argument. */
-    llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
-                          llvm::Value *arg, const char *name = NULL);
+    llvm::Instruction *CallInst(llvm::Function *func, llvm::Value *arg,
+                                const char *name = NULL);

    /** This is a convenience method that issues a call instruction to a
        function that takes two arguments. */
-    llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
-                          llvm::Value *arg0, llvm::Value *arg1,
-                          const char *name = NULL);
+    llvm::Instruction *CallInst(llvm::Function *func, llvm::Value *arg0,
+                                llvm::Value *arg1, const char *name = NULL);

    /** Launch an asynchronous task to run the given function, passing it
        he given argument values. */
-    llvm::Value *LaunchInst(llvm::Value *callee, 
-                            std::vector<llvm::Value *> &argVals,
-                            llvm::Value *launchCount);
+    llvm::Instruction *LaunchInst(llvm::Function *callee, 
+                                  std::vector<llvm::Value *> &argVals,
+                                  llvm::Value *launchCount);

    void SyncInst();

@@ -443,9 +401,6 @@ public:
    /** @} */

 private:
-    /** Pointer to the Function for which we're currently generating code. */
-    Function *function;
-
    /** The basic block into which we add any alloca instructions that need
        to go at the very start of the function. */
    llvm::BasicBlock *allocaBlock;
@@ -455,16 +410,8 @@ private:
    llvm::BasicBlock *bblock;

    /** Pointer to stack-allocated memory that stores the current value of
-        the full program mask. */
-    llvm::Value *fullMaskPointer;
-
-    /** Pointer to stack-allocated memory that stores the current value of
-        the program mask representing varying control flow within the
-        function. */
-    llvm::Value *internalMaskPointer;
-
-    /** Value of the program mask when the function starts execution.  */
-    llvm::Value *functionMaskValue;
+        the program mask. */
+    llvm::Value *maskPtr;

    /** Current source file position; if debugging information is being
        generated, this position is used to set file/line information for
@@ -475,6 +422,12 @@ private:
        for error messages and debugging symbols. */
    SourcePos funcStartPos;

+    /** Type of result that the current function returns. */
+    const Type *returnType;
+
+    /** Value of the program mask when the function starts execution.  */
+    llvm::Value *entryMask;
+
    /** If currently in a loop body, the value of the mask at the start of
        the loop. */
    llvm::Value *loopMask;
@@ -538,23 +491,19 @@ private:
    llvm::Value *launchGroupHandlePtr;

    llvm::Value *pointerVectorToVoidPointers(llvm::Value *value);
-    static void addGSMetadata(llvm::Value *inst, SourcePos pos);
+    static void addGSMetadata(llvm::Instruction *inst, SourcePos pos);
    bool ifsInLoopAllUniform() const;
    void jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target);
    llvm::Value *emitGatherCallback(llvm::Value *lvalue, llvm::Value *retPtr);

-    llvm::Value *applyVaryingGEP(llvm::Value *basePtr, llvm::Value *index, 
-                                 const Type *ptrType);
-
    void restoreMaskGivenReturns(llvm::Value *oldMask);

-    void scatter(llvm::Value *value, llvm::Value *ptr, const Type *ptrType, 
-                 llvm::Value *mask);
-    void maskedStore(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
-                     llvm::Value *mask);
-    llvm::Value *gather(llvm::Value *ptr, const Type *ptrType, llvm::Value *mask,
+    void scatter(llvm::Value *rvalue, llvm::Value *lvalue, 
+                 llvm::Value *maskPtr, const Type *rvalueType);
+    llvm::Value *gather(llvm::Value *lvalue, const Type *type,
                        const char *name);
-    llvm::Value *addVaryingOffsetsIfNeeded(llvm::Value *ptr, const Type *ptrType);
+    void maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
+                     const Type *rvalueType, llvm::Value *maskPtr);
 };

 #endif // ISPC_CTX_H
--- a/decl.cpp
+++ b/decl.cpp
@@ -38,59 +38,10 @@

 #include "decl.h"
 #include "util.h"
-#include "module.h"
 #include "sym.h"
 #include "type.h"
-#include "stmt.h"
 #include "expr.h"
 #include <stdio.h>
-#include <set>
-
-/** Given a Type and a set of type qualifiers, apply the type qualifiers to
-    the type, returning the type that is the result. 
-*/
-static const Type *
-lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
-    if (type == NULL)
-        return NULL;
-
-    if ((typeQualifiers & TYPEQUAL_UNSIGNED) != 0) {
-        if ((typeQualifiers & TYPEQUAL_SIGNED) != 0)
-            Error(pos, "Illegal to apply both \"signed\" and \"unsigned\" "
-                  "qualifiers.");
-
-        const Type *unsignedType = type->GetAsUnsignedType();
-        if (unsignedType != NULL)
-            type = unsignedType;
-        else
-            Error(pos, "\"unsigned\" qualifier is illegal with \"%s\" type.",
-              type->GetString().c_str());
-
-    }
-
-    if ((typeQualifiers & TYPEQUAL_SIGNED) != 0 && type->IsIntType() == false)
-        Error(pos, "\"signed\" qualifier is illegal with non-integer type "
-              "\"%s\".", type->GetString().c_str());
-
-    if ((typeQualifiers & TYPEQUAL_CONST) != 0)
-        type = type->GetAsConstType();
-
-    if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0)
-        type = type->GetAsUniformType();
-    else if ((typeQualifiers & TYPEQUAL_VARYING) != 0)
-        type = type->GetAsVaryingType();
-    else {
-        // otherwise, structs are uniform by default and everything
-        // else is varying by default
-        if (dynamic_cast<const StructType *>(type->GetBaseType()) != NULL)
-            type = type->GetAsUniformType();
-        else
-            type = type->GetAsVaryingType();
-    }
-
-    return type;
-}
-

 ///////////////////////////////////////////////////////////////////////////
 // DeclSpecs
@@ -98,57 +49,29 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
 DeclSpecs::DeclSpecs(const Type *t, StorageClass sc, int tq) {
    baseType = t;
    storageClass = sc;
-    typeQualifiers = tq;
+    typeQualifier = tq;
    soaWidth = 0;
    vectorSize = 0;
 }


-const Type *
-DeclSpecs::GetBaseType(SourcePos pos) const {
-    const Type *bt = baseType;
-    if (vectorSize > 0) {
-        const AtomicType *atomicType = dynamic_cast<const AtomicType *>(bt);
-        if (atomicType == NULL) {
-            Error(pos, "Only atomic types (int, float, ...) are legal for vector "
-                  "types.");
-            return NULL;
-        }
-        bt = new VectorType(atomicType, vectorSize);
-    }
-
-    return lApplyTypeQualifiers(typeQualifiers, bt, pos);
-}
-
-
-static const char *
-lGetStorageClassName(StorageClass storageClass) {
-    switch (storageClass) {
-    case SC_NONE:     return "";
-    case SC_EXTERN:   return "extern";
-    case SC_EXTERN_C: return "extern \"C\"";
-    case SC_EXPORT:   return "export";
-    case SC_STATIC:   return "static";
-    case SC_TYPEDEF:  return "typedef";
-    default:          FATAL("Unhandled storage class in lGetStorageClassName");
-                      return "";
-    }
-}
-
-
 void
 DeclSpecs::Print() const {
-    printf("%s ", lGetStorageClassName(storageClass));
+    if (storageClass == SC_EXTERN)   printf("extern ");
+    if (storageClass == SC_EXTERN_C) printf("extern \"C\" ");
+    if (storageClass == SC_EXPORT)   printf("export ");
+    if (storageClass == SC_STATIC)   printf("static ");
+    if (storageClass == SC_TYPEDEF)  printf("typedef ");

    if (soaWidth > 0) printf("soa<%d> ", soaWidth);

-    if (typeQualifiers & TYPEQUAL_INLINE)    printf("inline ");
-    if (typeQualifiers & TYPEQUAL_CONST)     printf("const ");
-    if (typeQualifiers & TYPEQUAL_UNIFORM)   printf("uniform ");
-    if (typeQualifiers & TYPEQUAL_VARYING)   printf("varying ");
-    if (typeQualifiers & TYPEQUAL_TASK)      printf("task ");
-    if (typeQualifiers & TYPEQUAL_SIGNED)    printf("signed ");
-    if (typeQualifiers & TYPEQUAL_UNSIGNED)  printf("unsigned ");
+    if (typeQualifier & TYPEQUAL_INLINE)    printf("inline ");
+    if (typeQualifier & TYPEQUAL_CONST)     printf("const ");
+    if (typeQualifier & TYPEQUAL_UNIFORM)   printf("uniform ");
+    if (typeQualifier & TYPEQUAL_VARYING)   printf("varying ");
+    if (typeQualifier & TYPEQUAL_TASK)      printf("task ");
+    if (typeQualifier & TYPEQUAL_REFERENCE) printf("reference ");
+    if (typeQualifier & TYPEQUAL_UNSIGNED)  printf("unsigned ");

    printf("%s", baseType->GetString().c_str());

@@ -159,46 +82,34 @@ DeclSpecs::Print() const {
 ///////////////////////////////////////////////////////////////////////////
 // Declarator

-Declarator::Declarator(DeclaratorKind dk, SourcePos p) 
-    : pos(p), kind(dk) { 
-    child = NULL;
-    typeQualifiers = 0;
-    arraySize = -1;
-    sym = NULL;
+Declarator::Declarator(Symbol *s, SourcePos p) 
+  : pos(p) { 
+    sym = s;
+    functionArgs = NULL;
+    isFunction = false;
    initExpr = NULL;
 }


 void
-Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
-    const Type *t = GetType(ds);
-    Symbol *sym = GetSymbol();
-    if (sym != NULL) {
-        sym->type = t;
-        sym->storageClass = ds->storageClass;
-    }
+Declarator::AddArrayDimension(int size) {
+    assert(size > 0 || size == -1); // -1 -> unsized
+    arraySize.push_back(size);
 }


-Symbol *
-Declarator::GetSymbol() const {
-    // The symbol lives at the last child in the chain, so walk down there
-    // and return the one there.
-    const Declarator *d = this;
-    while (d->child != NULL)
-        d = d->child;
-    return d->sym;
+void
+Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
+    sym->type = GetType(ds);
+
+    if (ds->storageClass == SC_STATIC)
+        sym->isStatic = true;
 }


 void
 Declarator::Print() const {
-    Symbol *sym = GetSymbol();
-    if (sym != NULL)
-        printf("%s", sym->name.c_str());
-    else
-        printf("(null symbol)");
-
+    printf("%s", sym->name.c_str());
    if (initExpr != NULL) {
        printf(" = (");
        initExpr->Print();
@@ -208,305 +119,188 @@ Declarator::Print() const {
 }


-Symbol *
-Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
-    const FunctionType *type = 
-        dynamic_cast<const FunctionType *>(GetType(ds));
-    if (type == NULL)
-        return NULL;
-
-    Symbol *declSym = GetSymbol();
-    assert(declSym != NULL);
-
-    // Get the symbol for the function from the symbol table.  (It should
-    // already have been added to the symbol table by AddGlobal() by the
-    // time we get here.)
-    Symbol *funSym = m->symbolTable->LookupFunction(declSym->name.c_str(), type);
-    if (funSym != NULL)
-        // May be NULL due to error earlier in compilation
-        funSym->pos = pos;
-
-    // Walk down to the declarator for the function.  (We have to get past
-    // the stuff that specifies the function's return type before we get to
-    // the function's declarator.)
-    Declarator *d = this;
-    while (d != NULL && d->kind != DK_FUNCTION)
-        d = d->child;
-    assert(d != NULL);
-
-    for (unsigned int i = 0; i < d->functionParams.size(); ++i) {
-        Declaration *pdecl = d->functionParams[i];
-        assert(pdecl->declarators.size() == 1);
-        funArgs->push_back(pdecl->declarators[0]->GetSymbol());
-    }
-
-    return funSym;
-}
-
-
-const Type *
-Declarator::GetType(const Type *base, DeclSpecs *ds) const {
-    bool hasUniformQual = ((typeQualifiers & TYPEQUAL_UNIFORM) != 0);
-    bool hasVaryingQual = ((typeQualifiers & TYPEQUAL_VARYING) != 0);
-    bool isTask =         ((typeQualifiers & TYPEQUAL_TASK) != 0);
-    bool isConst =        ((typeQualifiers & TYPEQUAL_CONST) != 0);
-
-    if (hasUniformQual && hasVaryingQual) {
-        Error(pos, "Can't provide both \"uniform\" and \"varying\" qualifiers.");
-        return NULL;
-    }
-    if (kind != DK_FUNCTION && isTask)
-        Error(pos, "\"task\" qualifier illegal in variable declaration.");
-
-    const Type *type = base;
-    switch (kind) {
-    case DK_BASE:
-        // All of the type qualifiers should be in the DeclSpecs for the
-        // base declarator
-        assert(typeQualifiers == 0);
-        assert(child == NULL);
-        return type;
-
-    case DK_POINTER:
-        type = new PointerType(type, hasUniformQual, isConst);
-        if (child != NULL)
-            return child->GetType(type, ds);
-        else
-            return type;
-        break;
-
-    case DK_REFERENCE:
-        if (hasUniformQual)
-            Error(pos, "\"uniform\" qualifier is illegal to apply to references.");
-        if (hasVaryingQual)
-            Error(pos, "\"varying\" qualifier is illegal to apply to references.");
-        if (isConst)
-            Error(pos, "\"const\" qualifier is to illegal apply to references.");
-
-        // The parser should disallow this already, but double check.
-        if (dynamic_cast<const ReferenceType *>(type) != NULL) {
-            Error(pos, "References to references are illegal.");
+static const Type *
+lGetType(const Declarator *decl, DeclSpecs *ds, 
+         std::vector<int>::const_iterator arrayIter) {
+    if (arrayIter == decl->arraySize.end()) {
+        // If we don't have an array (or have processed all of the array
+        // dimensions in previous recursive calls), we can go ahead and
+        // figure out the final non-array type we have here.
+        const Type *type = ds->baseType;
+        if (type == NULL) {
+            Error(decl->pos, "Type not provided in variable declaration for variable \"%s\".",
+                  decl->sym->name.c_str());
            return NULL;
        }

-        type = new ReferenceType(type);
-        if (child != NULL)
-            return child->GetType(type, ds);
-        else
-            return type;
-        break;
+        // Account for 'unsigned' and 'const' qualifiers in the type
+        if ((ds->typeQualifier & TYPEQUAL_UNSIGNED) != 0) {
+            const Type *unsignedType = type->GetAsUnsignedType();
+            if (unsignedType != NULL)
+                type = unsignedType;
+            else
+                Error(decl->pos, "\"unsigned\" qualifier is illegal with \"%s\" type.",
+                      type->GetString().c_str());
+        }
+        if ((ds->typeQualifier & TYPEQUAL_CONST) != 0)
+            type = type->GetAsConstType();

-    case DK_ARRAY:
-        type = new ArrayType(type, arraySize);
-        if (child)
-            return child->GetType(type, ds);
-        else
-            return type;
-        break;
-
-    case DK_FUNCTION: {
-        std::vector<const Type *> args;
-        std::vector<std::string> argNames;
-        std::vector<ConstExpr *> argDefaults;
-        std::vector<SourcePos> argPos;
-
-        // Loop over the function arguments and store the names, types,
-        // default values (if any), and source file positions each one in
-        // the corresponding vector.
-        for (unsigned int i = 0; i < functionParams.size(); ++i) {
-            Declaration *d = functionParams[i];
-
-            char buf[32];
-            Symbol *sym;
-            if (d->declarators.size() == 0) {
-                // function declaration like foo(float), w/o a name for
-                // the parameter
-                sprintf(buf, "__anon_parameter_%d", i);
-                sym = new Symbol(buf, pos);
-                sym->type = d->declSpecs->GetBaseType(pos);
+        if (ds->vectorSize > 0) {
+            const AtomicType *atomicType = dynamic_cast<const AtomicType *>(type);
+            if (atomicType == NULL) {
+                Error(decl->pos, "Only atomic types (int, float, ...) are legal for vector "
+                      "types.");
+                return NULL;
            }
-            else {
-                sym = d->declarators[0]->GetSymbol();
-                if (sym == NULL) {
-                    // Handle more complex anonymous declarations like
-                    // float (float **).
-                    sprintf(buf, "__anon_parameter_%d", i);
-                    sym = new Symbol(buf, d->declarators[0]->pos);
-                    sym->type = d->declarators[0]->GetType(d->declSpecs);
-                }
-            }
-
-            if (d->declSpecs->storageClass != SC_NONE)
-                Error(sym->pos, "Storage class \"%s\" is illegal in "
-                      "function parameter declaration for parameter \"%s\".", 
-                      lGetStorageClassName(d->declSpecs->storageClass),
-                      sym->name.c_str());
-
-            const ArrayType *at = dynamic_cast<const ArrayType *>(sym->type);
-            if (at != NULL) {
-                // As in C, arrays are passed to functions as pointers to
-                // their element type.  We'll just immediately make this
-                // change now.  (One shortcoming of losing the fact that
-                // the it was originally an array is that any warnings or
-                // errors later issued that print the function type will
-                // report this differently than it was originally declared
-                // in the function, but it's not clear that this is a
-                // significant problem.)
-                sym->type = PointerType::GetUniform(at->GetElementType());
-
-                // Make sure there are no unsized arrays (other than the
-                // first dimension) in function parameter lists.
-                at = dynamic_cast<const ArrayType *>(at->GetElementType());
-                while (at != NULL) {
-                    if (at->GetElementCount() == 0)
-                        Error(sym->pos, "Arrays with unsized dimensions in "
-                              "dimensions after the first one are illegal in "
-                              "function parameter lists.");
-                    at = dynamic_cast<const ArrayType *>(at->GetElementType());
-                }
-            }
-
-            args.push_back(sym->type);
-            argNames.push_back(sym->name);
-            argPos.push_back(sym->pos);
-
-            ConstExpr *init = NULL;
-            if (d->declarators.size()) {
-                // Try to find an initializer expression; if there is one,
-                // it lives down to the base declarator.
-                Declarator *decl = d->declarators[0];
-                while (decl->child != NULL) {
-                    assert(decl->initExpr == NULL);
-                    decl = decl->child;
-                }
-
-                if (decl->initExpr != NULL &&
-                    (decl->initExpr = decl->initExpr->TypeCheck()) != NULL &&
-                    (decl->initExpr = decl->initExpr->Optimize()) != NULL &&
-                    (init = dynamic_cast<ConstExpr *>(decl->initExpr)) == NULL) {
-                    Error(decl->initExpr->pos, "Default value for parameter "
-                          "\"%s\" must be a compile-time constant.", 
-                          sym->name.c_str());
-                }
-            }
-            argDefaults.push_back(init);
+            type = new VectorType(atomicType, ds->vectorSize);
        }

-        const Type *returnType = type;
-        if (returnType == NULL) {
-            Error(pos, "No return type provided in function declaration.");
-            return NULL;
+        // if uniform/varying is specified explicitly, then go with that
+        if ((ds->typeQualifier & TYPEQUAL_UNIFORM) != 0)
+            return type->GetAsUniformType();
+        else if ((ds->typeQualifier & TYPEQUAL_VARYING) != 0)
+            return type->GetAsVaryingType();
+        else {
+            // otherwise, structs are uniform by default and everything
+            // else is varying by default
+            if (dynamic_cast<const StructType *>(type) != NULL)
+                return type->GetAsUniformType();
+            else
+                return type->GetAsVaryingType();
        }
-
-        bool isExported = ds && (ds->storageClass == SC_EXPORT);
-        bool isExternC =  ds && (ds->storageClass == SC_EXTERN_C);
-        bool isTask =     ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
-
-        if (isExported && isTask) {
-            Error(pos, "Function can't have both \"task\" and \"export\" "
-                  "qualifiers");
-            return NULL;
-        }
-        if (isExternC && isTask) {
-            Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" "
-                  "qualifiers");
-            return NULL;
-        }
-        if (isExternC && isExported) {
-            Error(pos, "Function can't have both \"extern \"C\"\" and \"export\" "
-                  "qualifiers");
-            return NULL;
-        }
-
-        Type *functionType = 
-            new FunctionType(returnType, args, pos, argNames, argDefaults,
-                             argPos, isTask, isExported, isExternC);
-        return child->GetType(functionType, ds);
-    }
-    default:
-        FATAL("Unexpected decl kind");
-        return NULL;
    }
+    else {
+        // Peel off one dimension of the array
+        int arraySize = *arrayIter;
+        ++arrayIter;

-#if 0
+        // Get the type, not including the arraySize dimension peeled off
+        // above.
+        const Type *childType = lGetType(decl, ds, arrayIter);
+
+        int soaWidth = ds->soaWidth;
+        if (soaWidth == 0)
+            // If there's no "soa<n>" stuff going on, just return a regular
+            // array with the appropriate size 
+            return new ArrayType(childType, arraySize == -1 ? 0 : arraySize);
+       else {
            // Make sure we actually have an array of structs ..
            const StructType *childStructType = 
                dynamic_cast<const StructType *>(childType);
            if (childStructType == NULL) {
-                Error(pos, "Illegal to provide soa<%d> qualifier with non-struct "
+                Error(decl->pos, "Illegal to provide soa<%d> qualifier with non-struct "
                      "type \"%s\".", soaWidth, childType->GetString().c_str());
                return new ArrayType(childType, arraySize == -1 ? 0 : arraySize);
            }
            else if ((soaWidth & (soaWidth - 1)) != 0) {
-                Error(pos, "soa<%d> width illegal.  Value must be power of two.",
+                Error(decl->pos, "soa<%d> width illegal.  Value must be power of two.",
                      soaWidth);
                return NULL;
            }
            else if (arraySize != -1 && (arraySize % soaWidth) != 0) {
-                Error(pos, "soa<%d> width must evenly divide array size %d.",
+                Error(decl->pos, "soa<%d> width must evenly divide array size %d.",
                      soaWidth, arraySize);
                return NULL;
            }
            return new SOAArrayType(childStructType, arraySize == -1 ? 0 : arraySize,
                                    soaWidth);
-#endif
+        }
+    }
 }


 const Type *
 Declarator::GetType(DeclSpecs *ds) const {
-    const Type *baseType = ds->GetBaseType(pos);
-    const Type *type = GetType(baseType, ds);
-    return type;
-}
+    bool hasUniformQual = ((ds->typeQualifier & TYPEQUAL_UNIFORM) != 0);
+    bool hasVaryingQual = ((ds->typeQualifier & TYPEQUAL_VARYING) != 0);
+    bool isTask =         ((ds->typeQualifier & TYPEQUAL_TASK) != 0);
+    bool isReference =    ((ds->typeQualifier & TYPEQUAL_REFERENCE) != 0);

+    if (hasUniformQual && hasVaryingQual) {
+        Error(pos, "Can't provide both \"uniform\" and \"varying\" qualifiers.");
+        return NULL;
+    }
+
+    if (isFunction) {
+        std::vector<const Type *> args;
+        std::vector<std::string> argNames;
+        if (functionArgs) {
+            // Loop over the function arguments and get names and types for
+            // each one in the args and argNames arrays
+            for (unsigned int i = 0; i < functionArgs->size(); ++i) {
+                Declaration *d = (*functionArgs)[i];
+                Symbol *sym;
+                if (d->declarators.size() == 0) {
+                    // function declaration like foo(float), w/o a name for
+                    // the parameter
+                    char buf[32];
+                    sprintf(buf, "__anon_parameter_%d", i);
+                    sym = new Symbol(buf, pos);
+                    Declarator *declarator = new Declarator(sym, sym->pos);
+                    sym->type = declarator->GetType(d->declSpecs);
+                    d->declarators.push_back(declarator);
+                }
+                else {
+                    assert(d->declarators.size() == 1);
+                    sym = d->declarators[0]->sym;
+                }
+
+                // Arrays are passed by reference, so convert array
+                // parameters to be references here.
+                if (dynamic_cast<const ArrayType *>(sym->type) != NULL)
+                    sym->type = new ReferenceType(sym->type, sym->type->IsConstType());
+
+                args.push_back(sym->type);
+                argNames.push_back(sym->name);
+            }
+        }
+
+        if (ds->baseType == NULL) {
+            Warning(pos, "No return type provided in declaration of function \"%s\". "
+                    "Treating as \"void\".", sym->name.c_str());
+            ds->baseType = AtomicType::Void;
+        }
+
+        if (isReference) {
+            Error(pos, "Function return types can't be reference types.");
+            return NULL;
+        }
+
+        const Type *returnType = lGetType(this, ds, arraySize.begin());
+        if (returnType == NULL)
+            return NULL;
+
+        bool isExported = (ds->storageClass == SC_EXPORT);
+        bool isExternC =  (ds->storageClass == SC_EXTERN_C);
+        return new FunctionType(returnType, args, pos, &argNames, isTask, 
+                                isExported, isExternC);
+    }
+    else {
+        if (isTask)
+            Error(pos, "\"task\" qualifier illegal in variable declaration \"%s\".",
+                  sym->name.c_str());
+
+        const Type *type = lGetType(this, ds, arraySize.begin());
+
+        if (type != NULL && isReference) {
+            bool hasConstQual = ((ds->typeQualifier & TYPEQUAL_CONST) != 0);
+            type = new ReferenceType(type, hasConstQual);
+        }
+
+        return type;
+    }
+}

 ///////////////////////////////////////////////////////////////////////////
 // Declaration

-Declaration::Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist) {
-    declSpecs = ds;
-    if (dlist != NULL)
-        declarators = *dlist;
-    for (unsigned int i = 0; i < declarators.size(); ++i)
-        if (declarators[i] != NULL)
-            declarators[i]->InitFromDeclSpecs(declSpecs);
-}
-
-
-Declaration::Declaration(DeclSpecs *ds, Declarator *d) {
-    declSpecs = ds;
-    if (d != NULL) {
-        d->InitFromDeclSpecs(ds);
-        declarators.push_back(d);
-    }
-}
-
-
-std::vector<VariableDeclaration>
-Declaration::GetVariableDeclarations() const {
+void
+Declaration::AddSymbols(SymbolTable *st) const {
    assert(declSpecs->storageClass != SC_TYPEDEF);
-    std::vector<VariableDeclaration> vars;

-    for (unsigned int i = 0; i < declarators.size(); ++i) {
-        if (declarators[i] == NULL)
-            continue;
-        Declarator *decl = declarators[i];
-        if (decl == NULL)
-            // Ignore earlier errors
-            continue;
-
-        Symbol *sym = decl->GetSymbol();
-        if (dynamic_cast<const FunctionType *>(sym->type) != NULL) {
-            // function declaration
-            m->symbolTable->AddFunction(sym);
-        }
-        else {
-            m->symbolTable->AddVariable(sym);
-            vars.push_back(VariableDeclaration(sym, decl->initExpr));
-        }
-    }
-    return vars;
+    for (unsigned int i = 0; i < declarators.size(); ++i)
+       if (declarators[i])
+           st->AddVariable(declarators[i]->sym);
 }


@@ -528,44 +322,29 @@ GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
                             std::vector<const Type *> *elementTypes,
                             std::vector<std::string> *elementNames,
                             std::vector<SourcePos> *elementPositions) {
-    std::set<std::string> seenNames;
    for (unsigned int i = 0; i < sd.size(); ++i) {
        const Type *type = sd[i]->type;
-        if (type == NULL)
-            continue;
-
        // FIXME: making this fake little DeclSpecs here is really
        // disgusting
        DeclSpecs ds(type);
        if (type->IsUniformType()) 
-            ds.typeQualifiers |= TYPEQUAL_UNIFORM;
+            ds.typeQualifier |= TYPEQUAL_UNIFORM;
        else
-            ds.typeQualifiers |= TYPEQUAL_VARYING;
+            ds.typeQualifier |= TYPEQUAL_VARYING;

        for (unsigned int j = 0; j < sd[i]->declarators->size(); ++j) {
            Declarator *d = (*sd[i]->declarators)[j];
            d->InitFromDeclSpecs(&ds);

-            Symbol *sym = d->GetSymbol();
+            // if it's an unsized array, make it a reference to an unsized
+            // array, so the caller can pass a pointer...
+            const ArrayType *at = dynamic_cast<const ArrayType *>(d->sym->type);
+            if (at && at->GetElementCount() == 0)
+                d->sym->type = new ReferenceType(d->sym->type, type->IsConstType());

-            const ArrayType *arrayType = 
-                dynamic_cast<const ArrayType *>(sym->type);
-            if (arrayType != NULL && arrayType->GetElementCount() == 0) {
-                Error(d->pos, "Unsized arrays aren't allowed in struct "
-                      "definitions.");
-                elementTypes->push_back(NULL);
-            }
-            else
-                elementTypes->push_back(sym->type);
-
-            if (seenNames.find(sym->name) != seenNames.end())
-                Error(d->pos, "Struct member \"%s\" has same name as a "
-                      "previously-declared member.", sym->name.c_str());
-            else
-                seenNames.insert(sym->name);
-
-            elementNames->push_back(sym->name);
-            elementPositions->push_back(sym->pos);
+            elementTypes->push_back(d->sym->type);
+            elementNames->push_back(d->sym->name);
+            elementPositions->push_back(d->sym->pos);
        }
    }
 }
--- a/decl.h
+++ b/decl.h
@@ -56,11 +56,6 @@

 #include "ispc.h"

-struct VariableDeclaration;
-
-class Declaration;
-class Declarator;
-
 enum StorageClass {
    SC_NONE,
    SC_EXTERN,
@@ -79,7 +74,7 @@ enum StorageClass {
 #define TYPEQUAL_UNIFORM    (1<<1)
 #define TYPEQUAL_VARYING    (1<<2)
 #define TYPEQUAL_TASK       (1<<3)
-#define TYPEQUAL_SIGNED     (1<<4)
+#define TYPEQUAL_REFERENCE  (1<<4)
 #define TYPEQUAL_UNSIGNED   (1<<5)
 #define TYPEQUAL_INLINE     (1<<6)

@@ -97,17 +92,15 @@ public:
    StorageClass storageClass;

    /** Zero or more of the TYPEQUAL_* values, ORed together. */
-    int typeQualifiers;
+    int typeQualifier;

    /** The basic type provided in the declaration; this should be an
-        AtomicType, EnumType, StructType, or VectorType; other types (like
+        AtomicType, a StructType, or a VectorType; other types (like
        ArrayTypes) will end up being created if a particular declaration
        has an array size, etc.
    */
    const Type *baseType;

-    const Type *GetBaseType(SourcePos pos) const;
-
    /** If this is a declaration with a vector type, this gives the vector
        width.  For non-vector types, this is zero.
     */
@@ -120,14 +113,6 @@ public:
 };


-enum DeclaratorKind {
-    DK_BASE,
-    DK_POINTER,
-    DK_REFERENCE,
-    DK_ARRAY,
-    DK_FUNCTION
-};
-
 /** @brief Representation of the declaration of a single variable.  

    In conjunction with an instance of the DeclSpecs, this gives us
@@ -135,7 +120,13 @@ enum DeclaratorKind {
 */
 class Declarator {
 public:
-    Declarator(DeclaratorKind dk, SourcePos p);
+    Declarator(Symbol *s, SourcePos p);
+
+    /** As the parser peels off array dimension declarations after the
+        symbol name, it calls this method to provide them to the
+        Declarator.
+     */
+    void AddArrayDimension(int size);

    /** Once a DeclSpecs instance is available, this method completes the
        initialization of the Symbol, setting its Type accordingly.
@@ -143,51 +134,21 @@ public:
    void InitFromDeclSpecs(DeclSpecs *ds);

    /** Get the actual type of the combination of Declarator and the given
-        DeclSpecs.  If an explicit base type is provided, the declarator is
-        applied to that type; otherwise the base type from the DeclSpecs is
-        used. */
+        DeclSpecs */
    const Type *GetType(DeclSpecs *ds) const;
-    const Type *GetType(const Type *base, DeclSpecs *ds) const;
-
-    /** Returns the symbol corresponding to the function declared by this
-        declarator and symbols for its arguments in *args. */
-    Symbol *GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *args);
-
-    /** Returns the symbol associated with the declarator. */
-    Symbol *GetSymbol() const;

    void Print() const;

-    /** Position of the declarator in the source program. */
    const SourcePos pos;
-
-    /** The kind of this declarator; complex declarations are assembled as
-        a hierarchy of Declarators.  (For example, a pointer to an int
-        would have a root declarator with kind DK_POINTER and with the
-        Declarator::child member pointing to a DK_BASE declarator for the
-        int). */
-    const DeclaratorKind kind;
-
-    /** Child pointer if needed; this can only be non-NULL if the
-        declarator's kind isn't DK_BASE. */
-    Declarator *child;
-
-    /** Type qualifiers provided with the declarator. */
-    int typeQualifiers;
-
-    /** For array declarators, this gives the declared size of the array.
-        Unsized arrays have arraySize == 0. */ 
-    int arraySize;
-
-    /** Symbol associated with the declarator. */
    Symbol *sym;
-
+    /** If this declarator includes an array specification, the sizes of
+        the array dimensions are represented here.
+     */
+    std::vector<int> arraySize;
    /** Initialization expression for the variable.  May be NULL. */
    Expr *initExpr;
-
-    /** For function declarations, this holds the Declaration *s for the
-        funciton's parameters. */
-    std::vector<Declaration *> functionParams;
+    bool isFunction;
+    std::vector<Declaration *> *functionArgs;
 };


@@ -196,18 +157,27 @@ public:
 */
 class Declaration {
 public:
-    Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist = NULL);
-    Declaration(DeclSpecs *ds, Declarator *d);
+    Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist = NULL) {
+        declSpecs = ds;
+        if (dlist != NULL)
+            declarators = *dlist;
+        for (unsigned int i = 0; i < declarators.size(); ++i)
+            if (declarators[i] != NULL)
+                declarators[i]->InitFromDeclSpecs(declSpecs);
+    }
+    Declaration(DeclSpecs *ds, Declarator *d) {
+        declSpecs = ds;
+        if (d) {
+            d->InitFromDeclSpecs(ds);
+            declarators.push_back(d);
+        }
+    }

+    /** Adds the symbols for the variables in the declaration to the symbol
+        table. */
+    void AddSymbols(SymbolTable *st) const;
    void Print() const;

-    /** This method walks through all of the Declarators in a declaration
-        and returns a fully-initialized Symbol and (possibly) an
-        initialization expression for each one.  (This allows the rest of
-        the system to not have to worry about the mess of the general
-        Declarator representation.) */
-    std::vector<VariableDeclaration> GetVariableDeclarations() const;
-
    DeclSpecs *declSpecs;
    std::vector<Declarator *> declarators;
 };
--- a/docs/ReleaseNotes.txt
+++ b/docs/ReleaseNotes.txt
@@ -1,85 +1,3 @@
-=== v1.1.0 === (5 December 2011)
-
-This is a major new release of the compiler, with significant additions to
-language functionality and capabilities.  It includes a number of small
-language syntax changes that will require modification of existing
-programs.  These changes should generally be straightforward and all are
-steps toward eliminating parts of ispc syntax that are incompatible with
-C/C++.  See
-http://ispc.github.com/ispc.html#updating-ispc-programs-for-changes-in-ispc-1-1
-for more information about these changes.
-
-ispc now fully supports pointers, including pointer arithmetic, implicit
-conversions of arrays to pointers, and all of the other capabilities of
-pointers in C.  See http://ispc.github.com/ispc.html#pointer-types for more
-information about pointers in ispc and
-http://ispc.github.com/ispc.html#function-pointer-types for information
-about function pointers in ispc.
-
-Reference types are now declared with C++ syntax (e.g. "const float &foo").
-
-ispc now supports 64-bit addressing.  For performance reasons, this
-capability is disabled by default (even on 64-bit targets), but can be
-enabled with a command-line flag:
-http://ispc.github.com/ispc.html#selecting-32-or-64-bit-addressing.
-
-This release features new parallel "foreach" statements, which make it
-easier in many instances to map program instances to data for data-parallel
-computation than the programIndex/programCount mechanism:
-http://ispc.github.com/ispc.html#parallel-iteration-statements-foreach-and-foreach-tiled.
-
-Finally, all of the system's documentation has been significantly revised.
-The documentation of ispc's parallel execution model has been rewritten:
-http://ispc.github.com/ispc.html#the-ispc-parallel-execution-model, and
-there is now a more specific discussion of similarities and differences
-between ispc and C/C++:
-http://ispc.github.com/ispc.html#relationship-to-the-c-programming-language.
-There is now a separate FAQ (http://ispc.github.com/faq.html), and a
-Performance Guide (http://ispc.github.com/perfguide.html).
- 
-=== v1.0.12 === (20 October 2011)
-
-This release includes a new "double-pumped" 8-wide target for SSE2,
-"sse2-x2".  Like the sse4-x2 and avx-x2 targets, this target may deliver
-higher performance for some workloads than the regular sse2 target.  (For
-other workloads, it may be slower.)
-
-The ispc language now includes an "assert()" statement.  See
-http://ispc.github.com/ispc.html#assertions for more information.
-
-The compiler now sets a preprocessor #define based on the target ISA; for
-example, ISPC_TARGET_SSE4 is defined for the sse4 targets, and so forth.
-
-The standard library now provides high-performance routines for converting
-between some "array of structures" and "structure of arrays" formats.
-See
-http://ispc.github.com/ispc.html#converting-between-array-of-structures-and-structure-of-arrays-layout
-for more information.
-
-Inline functions now have static linkage.
-
-A number of improvements have been made to the optimization passes that
-detect when gathers and scatters can be transformed into vector stores and
-loads, respectively.  In particular, these passes now handle variables that
-are used as loop induction variables much better.
-
-=== v1.0.11 === (6 October 2011)
-
-The main new feature in this release is support for generating code for
-multiple targets (e.g., SSE2, SSE4, and AVX) and having the compiled code
-select the best variant at execution time.  For more information, see
-http://ispc.github.com/ispc.html#compiling-with-support-for-multiple-instruction-sets.
-
-All of the examples now take advantage of the support for multiple
-compilation targets; thus, if one has an AVX system, it's not necessary to
-recompile the examples to use the AVX target.
-
-Performance of the built-in task system that is used in the examples has
-been improved.
-
-Finally, the print() statement now works on OSX; it had been broken for the
-last few releases.
-
 === v1.0.10 === (30 September 2011)

 This release features an extensive new example showing the application of
--- a/docs/build.sh
+++ b/docs/build.sh
@@ -1,12 +1,6 @@
 #!/bin/bash

-for i in ispc perfguide faq; do
-    rst2html.py --template=template.txt --link-stylesheet \
-        --stylesheet-path=css/style.css $i.txt > $i.html
-done
-
-rst2html.py --template=template-perf.txt --link-stylesheet \
-        --stylesheet-path=css/style.css perf.txt > perf.html
+rst2html.py ispc.txt > ispc.html

 #rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex
 #pdflatex ispc.tex
--- a/docs/faq.txt
+++ b/docs/faq.txt
@@ -1,482 +0,0 @@
-=============================================================
-Intel® SPMD Program Compiler Frequently Asked Questions (FAQ)
-=============================================================
-
-This document includes a number of frequently (and not frequently) asked
-questions about ispc, the Intel® SPMD Program Compiler.  The source to this
-document is in the file ``docs/faq.txt`` in the ``ispc`` source
-distribution.
-
-* Understanding ispc's Output
-
-  + `How can I see the assembly language generated by ispc?`_
-  + `How can I have the assembly output be printed using Intel assembly syntax?`_
-  + `Why are there multiple versions of exported ispc functions in the assembly output?`_
-  + `How can I more easily see gathers and scatters in generated assembly?`_
-
-* Interoperability
-
-  + `How can I supply an initial execution mask in the call from the application?`_
-  + `How can I generate a single binary executable with support for multiple instruction sets?`_
-  + `How can I determine at run-time which vector instruction set's instructions were selected to execute?`_
-
-* Programming Techniques
-
-  + `What primitives are there for communicating between SPMD program instances?`_
-  + `How can a gang of program instances generate variable amounts of output efficiently?`_
-  + `Is it possible to use ispc for explicit vector programming?`_
-  + `How can I debug my ispc programs using Valgrind?`_
-
-Understanding ispc's Output
-===========================
-
-How can I see the assembly language generated by ispc?
------------------------------------------------------
-
-The ``--emit-asm`` flag causes assembly output to be generated.  If the
-``-o`` command-line flag is also supplied, the assembly is stored in the
-given file, or printed to standard output if ``-`` is specified for the
-filename.  For example, given the simple ``ispc`` program:
-
-::
-
-    export uniform int foo(uniform int a, uniform int b) {
-        return a+b;
-    }
-
-If the SSE4 target is used, then the following assembly is printed:
-
-::
-
-    _foo:
-            addl    %esi, %edi
-            movl    %edi, %eax
-            ret
-
-
-How can I have the assembly output be printed using Intel assembly syntax?
--------------------------------------------------------------------------
-
-The ``ispc`` compiler is currently only able to emit assembly with AT&T
-syntax, where the destination operand is the last operand after an
-instruction.  If you'd prefer Intel assembly output, one option is to use
-Agner Fog's ``objconv`` tool: have ``ispc`` emit a native object file and
-then use ``objconv`` to disassemble it, specifying the assembler syntax
-that you prefer.  ``objconv`` `is available for download here`_.
-
-.. _is available for download here: http://www.agner.org/optimize/#objconv
-
-Why are there multiple versions of exported ispc functions in the assembly output?
----------------------------------------------------------------------------------
-
-Two versions of all functions qualified with ``export`` are generated:
-one of them is to be called by other ``ispc`` functions, and the
-other is to be called by the application.  The application callable
-function has the original function's name, while the ``ispc``-callable
-function has a mangled name that encodes the types of the function's
-parameters.
-
-The crucial difference between these two functions is that the
-application-callable function doesn't take a parameter encoding the current
-execution mask, while ``ispc``-callable functions have a hidden mask
-parameter.  An implication of this difference is that the ``export``
-function starts with the execution mask "all on".  This allows a number of
-improvements in the generated code, particularly on architectures that
-don't have support for masked load and store instructions.
-
-As an example, consider this short function, which loads a vector's worth
-of values from two arrays in memory, adds them, and writes the result to an
-output array.
-
-::
-
-    export void foo(uniform float a[], uniform float b[],
-                    uniform float result[]) {
-        float aa = a[programIndex], bb = b[programIndex];
-        result[programIndex] = aa+bb;
-    }
-
-Here is the assembly code for the application-callable instance of the
-function.
-
-::
-
-    _foo:
-            movups        (%rsi), %xmm1
-            movups        (%rdi), %xmm0
-            addps         %xmm1, %xmm0
-            movups        %xmm0, (%rdx)
-            ret
-
-
-And here is the assembly code for the ``ispc``-callable instance of the
-function.
-
-::
-
-    "_foo___uptr<Uf>uptr<Uf>uptr<Uf>":
-            movmskps      %xmm0, %eax
-            cmpl          $15, %eax
-            je            LBB0_3
-            testl         %eax, %eax
-            jne           LBB0_4
-            ret
-    LBB0_3:
-            movups        (%rsi), %xmm1
-            movups        (%rdi), %xmm0
-            addps         %xmm1, %xmm0
-            movups        %xmm0, (%rdx)
-            ret
-    LBB0_4:
-    ####
-    ####  Code elided; handle mixed mask case..
-    ####
-            ret
-
-There are a few things to notice in this code.  First, the current program
-mask is coming in via the ``%xmm0`` register and the initial few
-instructions in the function essentially check to see if the mask is all on
-or all off.  If the mask is all on, the code at the label LBB0_3 executes;
-it's the same as the code that was generated for ``_foo`` above.  If the
-mask is all off, then there's nothing to be done, and the function can
-return immediately.
-
-In the case of a mixed mask, a substantial amount of code is generated to
-load from and then store to only the array elements that correspond to
-program instances where the mask is on.  (This code is elided above.)  This
-general pattern of having two code paths for the "all on" and "mixed" mask
-cases is used in the code generated for almost all but the most simple
-functions (where the overhead of the test isn't worthwhile.)
-
-How can I more easily see gathers and scatters in generated assembly?
---------------------------------------------------------------------
-
-Because CPU vector ISAs don't have native gather and scatter instructions,
-these memory operations are turned into sequences of
-instructions in the code that ``ispc`` generates.  In some cases, it can be
-useful to see where gathers and scatters actually happen in code; there is
-an otherwise undocumented command-line flag that provides this information.
-
-Consider this simple program:
-
-::
-
-    void set(uniform int a[], int value, int index) {
-        a[index] = value;
-    }
-
-When compiled normally to the SSE4 target, this program generates this
-extensive code sequence, which makes it more difficult to see what the
-program is actually doing.
-
-::
-
-    "_set___uptr<Ui>ii":
-            pmulld        LCPI0_0(%rip), %xmm1
-            movmskps      %xmm2, %eax
-            testb         $1, %al
-            je            LBB0_2
-            movd          %xmm1, %ecx
-            movd          %xmm0, (%rcx,%rdi)
-    LBB0_2:
-            testb         $2, %al
-            je            LBB0_4
-            pextrd        $1, %xmm1, %ecx
-            pextrd        $1, %xmm0, (%rcx,%rdi)
-    LBB0_4:
-            testb         $4, %al
-            je            LBB0_6
-            pextrd        $2, %xmm1, %ecx
-            pextrd        $2, %xmm0, (%rcx,%rdi)
-    LBB0_6:
-            testb        $8, %al
-            je            LBB0_8
-            pextrd        $3, %xmm1, %eax
-            pextrd        $3, %xmm0, (%rax,%rdi)
-    LBB0_8:
-            ret
-
-If this program is compiled with the
-``--opt=disable-handle-pseudo-memory-ops`` command-line flag, then the
-scatter is left as an unresolved function call.  The resulting program
-won't link because of the unresolved symbol, but the assembly output is much
-easier to understand:
-
-::
-
-    "_set___uptr<Ui>ii":
-            movaps        %xmm0, %xmm3
-            pmulld        LCPI0_0(%rip), %xmm1
-            movdqa        %xmm1, %xmm0
-            movaps        %xmm3, %xmm1
-            jmp        ___pseudo_scatter_base_offsets32_32 ## TAILCALL
-
-
-Interoperability
-================
-
-How can I supply an initial execution mask in the call from the application?
----------------------------------------------------------------------------
-
-Recall that when execution transitions from the application code to an
-``ispc`` function, all of the program instances are initially executing.
-In some cases, it may be desired that only some of them are running, based on
-a data-dependent condition computed in the application program.  This
-situation can easily be handled via an additional parameter from the
-application.
-
-As a simple example, consider a case where the application code has an
-array of ``float`` values and we'd like the ``ispc`` code to update
-just specific values in that array, where the application has already
-determined which of those values are to be updated.  In C++ code, we might
-have:
-
-::
-
-    int count = ...;
-    float *array = new float[count];
-    bool *shouldUpdate = new bool[count];
-    // initialize array and shouldUpdate
-    ispc_func(array, shouldUpdate, count);
-
-Then, the ``ispc`` code could process this update as:
-
-::
-
-    export void ispc_func(uniform float array[], uniform bool update[],
-                          uniform int count) {
-        foreach (i = 0 ... count) {
-            cif (update[i] == true)
-                // update array[i+programIndex]...
-        }
-    }
-
-(In this case a "coherent" if statement is likely to be worthwhile if the
-``update`` array will tend to have sections that are either all-true or
-all-false.)
-
-How can I generate a single binary executable with support for multiple instruction sets?
-----------------------------------------------------------------------------------------
-
-``ispc`` can also generate output that supports multiple target instruction
-sets, also generating code that chooses the most appropriate one at runtime
-if multiple targets are specified with the ``--target`` command-line
-argument.
-
-For example, if you run the command:
-
-::
-
-   ispc foo.ispc -o foo.o --target=sse2,sse4-x2,avx-x2
-
-Then four object files will be generated: ``foo_sse2.o``, ``foo_sse4.o``,
-``foo_avx.o``, and ``foo.o``. [#]_  Link all of these into your executable, and
-when you call a function in ``foo.ispc`` from your application code,
-``ispc`` will determine which instruction sets are supported by the CPU the
-code is running on and will call the most appropriate version of the
-function available.  
-
-.. [#] Similarly, if you choose to generate assembly language output or
-   LLVM bitcode output, multiple versions of those files will be created.
-
-In general, the version of the function that runs will be the one in the
-most general instruction set that is supported by the system.  If you only
-compile SSE2 and SSE4 variants and run on a system that supports AVX, for
-example, then the SSE4 variant will be executed.  If the system
-is not able to run any of the available variants of the function (for
-example, trying to run a function that only has SSE4 and AVX variants on a
-system that only supports SSE2), then the standard library ``abort()``
-function will be called.
-
-One subtlety is that all non-static global variables (if any) must have the
-same size and layout with all of the targets used.  For example, if you
-have the global variables:
-
-::
-
-   uniform int foo[2*programCount];
-   int bar;
-
-and compile to both SSE2 and AVX targets, both of these variables will have
-different sizes (the first due to program count having the value 4 for SSE2
-and 8 for AVX, and the second due to ``varying`` types having different
-numbers of elements with the two targets--essentially the same issue as the
-first.)  ``ispc`` issues an error in this case.
-
-
-How can I determine at run-time which vector instruction set's instructions were selected to execute?
-----------------------------------------------------------------------------------------------------
-
-``ispc`` doesn't provide any API that allows querying which vector ISA's
-instructions are running when multi-target compilation was used.  However,
-this can be solved in "user space" by writing a small helper function.
-Specifically, if you implement a function like this
-
-::
-
-    export uniform int isa() {
-    #if defined(ISPC_TARGET_SSE2)
-        return 0;
-    #elif defined(ISPC_TARGET_SSE4)
-        return 1;
-    #elif defined(ISPC_TARGET_AVX)
-        return 2;
-    #else
-        return -1;
-    #endif
-    }
-
-And then call it from your application code at runtime, it will return 0,
-1, or 2, depending on which target's instructions are running.
-
-The way this works is a little surprising, but it's a useful trick.  Of
-course the preprocessor ``#if`` checks are all compile-time only
-operations.  What's actually happening is that the function is compiled
-multiple times, once for each target, with the appropriate ``ISPC_TARGET``
-preprocessor symbol set.  Then, a small dispatch function is generated for
-the application to actually call.  This dispatch function in turn calls the
-appropriate version of the function based on the CPU of the system it's
-executing on, which in turn returns the appropriate value.
-
-In a similar fashion, it's possible to find out at run-time the value of
-``programCount`` for the target that's actually being used.
-
-::
-
-    export uniform int width() { return programCount; }
-
-
-Programming Techniques
-======================
-
-What primitives are there for communicating between SPMD program instances?
---------------------------------------------------------------------------
-
-The ``broadcast()``, ``rotate()``, and ``shuffle()`` standard library
-routines provide a variety of mechanisms for the running program instances
-to communicate values to each other during execution.  Note that there's no
-need to synchronize the program instances before communicating between
-them, due to the synchronized execution model of gangs of program instances
-in ``ispc``.
-
-How can a gang of program instances generate variable amounts of output efficiently?
------------------------------------------------------------------------------------
-
-It's not unusual to have a gang of program instances where each program
-instance generates a variable amount of output (perhaps some generate no
-output, some generate one output value, some generate many output values
-and so forth), and where one would like to have the output densely packed
-in an output array.  The ``exclusive_scan_add()`` function from the
-standard library is quite useful in this situation.
-
-Consider the following function:
-
-::
-
-    uniform int func(uniform float outArray[], ...) {
-       int numOut = ...;  // figure out how many to be output
-       float outLocal[MAX_OUT]; // staging area
-
-       // each program instance in the gang puts its results in
-       //  outLocal[0], ..., outLocal[numOut-1]
-
-       int startOffset = exclusive_scan_add(numOut);
-       for (int i = 0; i < numOut; ++i)
-           outArray[startOffset + i] = outLocal[i];
-       return reduce_add(numOut);
-    }
-
-Here, each program instance has computed a number, ``numOut``, of values to
-output, and has stored them in the ``outLocal`` array.  Assume that four
-program instances are running and that the first one wants to output one
-value, the second two values, and the third and fourth three values each.
-In this case, ``exclusive_scan_add()`` will return the values (0, 1, 3, 6)
-to the four program instances, respectively.  
-
-The first program instance will then write its one result to
-``outArray[0]``, the second will write its two values to ``outArray[1]``
-and ``outArray[2]``, and so forth.  The ``reduce_add()`` call at the end
-returns the total number of values that all of the program instances have
-written to the array.
-
-FIXME: add discussion of foreach_active as an option here once that's in
-
-Is it possible to use ispc for explicit vector programming?
-----------------------------------------------------------
-
-The typical model for programming in ``ispc`` is an *implicit* parallel
-model, where one writes a program that is apparently doing scalar
-computation on values and the program is then vectorized to run in parallel
-across the SIMD lanes of a processor.  However, ``ispc`` also has some
-support for explicit vector unit programming, where the vectorization is
-explicit.  Some computations may be more effectively described in the
-explicit model rather than the implicit model.
-
-This support is provided via ``uniform`` instances of short vectors.
-Specifically, if this short program
-
-::
-
-    export uniform float<8> madd(uniform float<8> a, uniform float<8> b,
-                                 uniform float<8> c) {
-        return a + b * c;
-    }
-
-is compiled with the AVX target, ``ispc`` generates the following assembly:
-
-::
-
-    _madd:
-	vmulps	%ymm2, %ymm1, %ymm1
-	vaddps	%ymm0, %ymm1, %ymm0
-	ret
-
-(And similarly, if compiled with a 4-wide SSE target, two ``mulps`` and two
-``addps`` instructions are generated, and so forth.)
-
-Note that ``ispc`` doesn't currently support control-flow based on
-``uniform`` short vector types; it is thus not possible to write code like:
-
-::
-
-    export uniform int<8> count(uniform float<8> a, uniform float<8> b) {
-        uniform int<8> sum = 0;
-        while (a++ < b)
-            ++sum;
-    }
-
-
-How can I debug my ispc programs using Valgrind?
------------------------------------------------
-
-The `valgrind`_ tool is an extremely useful memory checker for
-Linux and OSX; it detects a range of memory errors, including accessing
-memory after it has been freed, accessing memory beyond the end of an
-array, accessing uninitialized stack variables, and so forth.
-In general, applications that use ``ispc`` code run with ``valgrind``
-without modification and ``valgrind`` will detect the same range of memory
-errors in ``ispc`` code that it does in C/C++ code.  
-
-.. _valgrind: http://valgrind.org
-
-One issue to be aware of is that until recently, ``valgrind`` only
-supported the SSE2 vector instructions; if you are using a version of
-``valgrind`` older than the 3.7.0 release (5 November 2011), you should
-compile your ``ispc`` programs with ``--target=sse2`` before running them
-through ``valgrind``.  (Note that if no target is specified, then ``ispc``
-chooses a target based on the capabilities of the system you're running
-``ispc`` on.)  If you run an ``ispc`` program that uses instructions that
-``valgrind`` doesn't support, you'll see an error message like:
-
-::
-
-    vex amd64->IR: unhandled instruction bytes: 0xC5 0xFA 0x10 0x0 0xC5 0xFA 0x11 0x84
-    ==46059== valgrind: Unrecognised instruction at address 0x100002707.
-
-The just-released valgrind 3.7.0 adds support for the SSE4.2 instruction
-set; if you're using that version (and your system supports SSE4.2), then
-you can use ``--target=sse4`` when compiling to run with ``valgrind``.
-
-Note that ``valgrind`` does not yet support programs that use the AVX
-instruction set.
-
--- a/docs/ispc.txt
+++ b/docs/ispc.txt
--- a/docs/perf.txt
+++ b/docs/perf.txt
@@ -1,85 +0,0 @@
-===========
-Performance
-===========
-
-The SPMD programming model that ``ispc`` provides makes it easy to harness the
-computational power available in SIMD vector units on modern CPUs, while
-its basis in C makes it easy for programmers to adopt and use
-productively.  This page summarizes the performance of ``ispc`` with the
-workloads in the ``examples/`` directory of the ``ispc`` distribution.
-
-These results were measured on an Apple iMac with a 4-core 3.4GHz
-Intel® Core-i7 processor using the Intel® AVX instruction set.  The basis
-for comparison is a reference C++ implementation compiled with gcc 4.2.1,
-the version distributed with OS X 10.7.2.  (The reference implementation is
-also included in the ``examples/`` directory.)
-
-.. list-table:: Performance of ``ispc`` with a variety of the workloads
-   from the ``examples/`` directory of the ``ispc`` distribution, compared
-   to a reference C++ implementation compiled with gcc 4.2.1.
-
-  * - Workload
-    - ``ispc``, 1 core
-    - ``ispc``, 4 cores
-  * - `AOBench`_ (512 x 512 resolution)
-    - 3.99x
-    - 19.32x
-  * - `Binomial Options`_ (128k options)
-    - 7.94x
-    - 33.43x
-  * - `Black-Scholes Options`_ (128k options)
-    - 8.45x
-    - 32.48x
-  * - `Deferred Shading`_ (1280p)
-    - n/a
-    - 23.06x
-  * - `Mandelbrot Set`_
-    - 6.21x
-    - 19.90x
-  * - `Perlin Noise Function`_
-    - 5.37x
-    - n/a
-  * - `Ray Tracer`_ (Sponza dataset)
-    - 3.99x
-    - 19.32x
-  * - `3D Stencil`_
-    - 3.76x
-    - 13.79x
-  * - `Volume Rendering`_
-    - 3.11x
-    - 15.80x
-
-
-.. _AOBench: https://github.com/ispc/ispc/tree/master/examples/aobench
-.. _Binomial Options: https://github.com/ispc/ispc/tree/master/examples/options
-.. _Black-Scholes Options: https://github.com/ispc/ispc/tree/master/examples/options
-.. _Deferred Shading: https://github.com/ispc/ispc/tree/master/examples/deferred
-.. _Mandelbrot Set: https://github.com/ispc/ispc/tree/master/examples/mandelbrot_tasks
-.. _Ray Tracer: https://github.com/ispc/ispc/tree/master/examples/rt
-.. _Perlin Noise Function: https://github.com/ispc/ispc/tree/master/examples/noise
-.. _3D Stencil: https://github.com/ispc/ispc/tree/master/examples/stencil
-.. _Volume Rendering: https://github.com/ispc/ispc/tree/master/examples/volume_rendering
-
-
-The following table shows speedups for a number of the examples on a
-2.40GHz, 40-core Intel® Xeon E7-8870 system with the Intel® SSE4
-instruction set, running Microsoft Windows Server 2008 Enterprise.  Here,
-the serial C/C++ baseline code was compiled with MSVC 2010.
- 
-.. list-table:: Performance of ``ispc`` with a variety of the workloads
-   from the ``examples/`` directory of the ``ispc`` distribution, on 
-   a system with 40 CPU cores.
-
-  * - Workload
-    - ``ispc``, 40 cores
-  * - AOBench (2048 x 2048 resolution)
-    - 182.36x
-  * - Binomial Options (2m options)
-    - 63.85x
-  * - Black-Scholes Options (2m options)
-    - 83.97x
-  * - Ray Tracer (Sponza dataset)
-    - 195.67x
-  * - Volume Rendering
-    - 243.18x
-
--- a/docs/perfguide.txt
+++ b/docs/perfguide.txt
@@ -1,714 +0,0 @@
-==============================================
-Intel® SPMD Program Compiler Performance Guide
-==============================================
-
-The SPMD programming model provided by ``ispc`` naturally delivers
-excellent performance for many workloads thanks to efficient use of CPU
-SIMD vector hardware.  This guide provides more details about how to get
-the most out of ``ispc`` in practice.
-
-* `Key Concepts`_
-
-  + `Efficient Iteration With "foreach"`_
-  + `Improving Control Flow Coherence With "foreach_tiled"`_
-  + `Using Coherent Control Flow Constructs`_
-  + `Use "uniform" Whenever Appropriate`_
-
-* `Tips and Techniques`_
-
-  + `Understanding Gather and Scatter`_
-  + `Avoid 64-bit Addressing Calculations When Possible`_
-  + `Avoid Computation With 8 and 16-bit Integer Types`_
-  + `Implementing Reductions Efficiently`_
-  + `Using Low-level Vector Tricks`_
-  + `The "Fast math" Option`_
-  + `"inline" Aggressively`_
-  + `Avoid The System Math Library`_
-  + `Declare Variables In The Scope Where They're Used`_
-  + `Instrumenting ISPC Programs To Understand Runtime Behavior`_
-  + `Choosing A Target Vector Width`_
-
-* `Disclaimer and Legal Information`_
-
-* `Optimization Notice`_
-
-Key Concepts
-============
-
-This section describes the four most important concepts to understand and
-keep in mind when writing high-performance ``ispc`` programs.  It assumes
-good familiarity with the topics covered in the ``ispc`` `Users Guide`_.
-
-.. _Users Guide: ispc.html
-
-Efficient Iteration With "foreach"
----------------------------------
-
-The ``foreach`` parallel iteration construct is semantically equivalent to
-a regular ``for()`` loop, though it offers meaningful performance benefits.
-(See the `documentation on "foreach" in the Users Guide`_ for a review of
-its syntax and semantics.)  As an example, consider this simple function
-that iterates over some number of elements in an array, doing computation
-on each one:
-
-.. _documentation on "foreach" in the Users Guide: ispc.html#parallel-iteration-statements-foreach-and-foreach-tiled
-
-::
-
-    export void foo(uniform int a[], uniform int count) {
-        for (int i = programIndex; i < count; i += programCount) {
-            // do some computation on a[i]
-        }
-    }
-
-Depending on the specifics of the computation being performed, the code
-generated for this function could likely be improved by modifying the code 
-so that the loop only goes as far through the data as is possible to pack
-an entire gang of program instances with computation each time through the
-loop.  Doing so enables the ``ispc`` compiler to generate more efficient
-code for cases where it knows that the execution mask is "all on".  Then,
-an ``if`` statement at the end handles processing the ragged extra bits of
-data that didn't fully fill a gang.
-
-::
-
-    export void foo(uniform int a[], uniform int count) {
-        // First, just loop up to the point where all program instances
-        // in the gang will be active at the loop iteration start
-        uniform int countBase = count & ~(programCount-1);
-        for (uniform int i = 0; i < countBase; i += programCount) {
-            int index = i + programIndex;
-            // do some computation on a[index]
-        }
-        // Now handle the ragged extra bits at the end
-        if (countBase < count) {
-            int index = countBase + programIndex;
-            // do some computation on a[index]
-        }
-    }
-
-While the performance of the above code will likely be better than the
-first version of the function, the loop body code has been duplicated (or
-has been forced to move into a separate utility function).
-
-Using the ``foreach`` looping construct as below provides all of the
-performance benefits of the second version of this function, with the
-compactness of the first.
-
-::
-
-    export void foo(uniform int a[], uniform int count) {
-        foreach (i = 0 ... count) {
-            // do some computation on a[i]
-        }
-    }
-
-Improving Control Flow Coherence With "foreach_tiled"
-----------------------------------------------------
-
-Depending on the computation being performed, ``foreach_tiled`` may give
-better performance than ``foreach``.  (See the `documentation in the Users
-Guide`_ for the syntax and semantics of ``foreach_tiled``.)  Given a
-multi-dimensional iteration like:
-
-.. _documentation in the Users Guide: ispc.html#parallel-iteration-statements-foreach-and-foreach-tiled
-
-::
-
-    foreach (i = 0 ... width, j = 0 ... height) {
-        // do computation on element (i,j)
-    }
-
-if the ``foreach`` statement is used, elements in the gang of program
-instances will be mapped to values of ``i`` and ``j`` by taking spans of
-``programCount`` elements across ``i`` with a single value of ``j``.  For
-example, the ``foreach`` statement above roughly corresponds to:
-
-::
-
-    for (uniform int j = 0; j < height; ++j)
-        for (int i = 0; i < width; i += programCount) {
-            // do computation 
-    }
-
-When a multi-dimensional domain is being iterated over, the ``foreach_tiled``
-statement maps program instances to data in a way that tries to select
-square n-dimensional segments of the domain.  For example, on a compilation
-target with 8-wide gangs of program instances, it generates code that
-iterates over the domain the same way as the following code (though more
-efficiently):
-
-::
-
-    for (int j = programIndex/4; j < height; j += 2)
-        for (int i = programIndex%4; i < width; i += 4) {
-            // do computation 
-    }
-
-Thus, each gang of program instances operates on a 2x4 tile of the domain.
-With higher-dimensional iteration and different gang sizes, a similar
-mapping is performed--e.g. for 2D iteration with a 16-wide gang size, 4x4
-tiles are iterated over; for 4D iteration with an 8-wide gang, 1x2x2x2 tiles are
-processed, and so forth.  
-
-Performance benefit can come from using ``foreach_tiled`` in that it
-essentially optimizes for the benefit of iterating over *compact* regions
-of the domain (while ``foreach`` iterates over the domain in a way that
-generally allows linear memory access.)  There are two benefits from
-processing compact regions of the domain.  
-
-First, it's often the case that the control flow coherence of the program
-instances in the gang is improved; if data-dependent control flow decisions
-are related to the values of the data in the domain being processed, and if
-the data values have some coherence, iterating with compact regions will
-improve control flow coherence.
-
-Second, processing compact regions may mean that the data accessed by
-program instances in the gang is more coherent, leading to performance
-benefits from better cache hit rates.
-
-As a concrete example, for the ray tracer example in the ``ispc``
-distribution (in the ``examples/rt`` directory), performance is 20% better
-when the pixels are iterated over using ``foreach_tiled`` than ``foreach``,
-because more coherent regions of the scene are accessed by the set of rays
-in the gang of program instances.
-
-
-Using Coherent Control Flow Constructs
--------------------------------------
-
-Recall from the ``ispc`` Users Guide, in the `SPMD-on-SIMD Execution Model
-section`_ that ``if`` statements with a ``uniform`` test compile to more
-efficient code than ``if`` tests with varying tests.  The coherent ``cif``
-statement can provide many benefits of ``if`` with a uniform test in the
-case where the test is actually varying.
-
-.. _SPMD-on-SIMD Execution Model section: ispc.html#the-spmd-on-simd-execution-model
-
-In this case, the code the compiler generates for the ``cif``
-test is along the lines of the following pseudo-code:
-
-::
-
-   bool expr = /* evaluate cif condition */
-   if (all(expr)) {
-       // run "true" case of if test only
-   } else if (!any(expr)) {
-       // run "false" case of if test only
-   } else {
-       // run both true and false cases, updating mask appropriately
-   }
-
-For ``if`` statements where the different running SPMD program instances
-don't have coherent values for the boolean ``if`` test, using ``cif``
-introduces some additional overhead from the ``all`` and ``any`` tests as
-well as the corresponding branches.  For cases where the program
-instances often do compute the same boolean value, this overhead is
-worthwhile.  If the control flow is in fact usually incoherent, this
-overhead only costs performance.
-
-In a similar fashion, ``ispc`` provides ``cfor``, ``cwhile``, and ``cdo``
-statements.  These statements are semantically the same as the
-corresponding non-"c"-prefixed statements.
-
-Use "uniform" Whenever Appropriate
----------------------------------
-
-For any variable that will always have the same value across all of the
-program instances in a gang, declare the variable with the ``uniform``
-qualifier.  Doing so enables the ``ispc`` compiler to emit better code in
-many different ways.
-
-As a simple example, consider a ``for`` loop that always does the same
-number of iterations:
-
-::
-
-    for (int i = 0; i < 10; ++i)
-        // do something ten times
-
-If this is written with ``i`` as a ``varying`` variable, as above, there's
-additional overhead in the code generated for the loop as the compiler
-emits instructions to handle the possibility of not all program instances
-following the same control flow path (as might be the case if the loop
-limit, 10, was itself a ``varying`` value.)
-
-If the above loop is instead written with ``i`` ``uniform``, as:
-
-::
-
-    for (uniform int i = 0; i < 10; ++i)
-        // do something ten times
-
-Then better code can be generated (and the loop possibly unrolled).
-
-In some cases, the compiler may be able to detect simple cases like these,
-but it's always best to provide the compiler with as much help as possible
-to understand the actual form of your computation.
-
-
-Tips and Techniques
-===================
-
-This section introduces a number of additional techniques that are worth
-keeping in mind when writing ``ispc`` programs.
-
-Understanding Gather and Scatter
--------------------------------
-
-Memory reads and writes from the program instances in a gang that access
-irregular memory locations (rather than a consecutive set of locations, or
-a single location) can be relatively inefficient.  As an example, consider
-the "simple" array indexing calculation below:
-
-::
-
-    int i = ....;
-    uniform float x[10] = { ... };
-    float f = x[i];
-
-Since the index ``i`` is a varying value, the program instances in the gang
-will in general be reading different locations in the array ``x``.  Because
-current CPUs don't have a "gather" instruction, the ``ispc`` compiler has to
-serialize these memory reads, performing a separate memory load for each
-running program instance, packing the result into ``f``.  (The analogous
-case happens for a write into ``x[i]``.)
-
-In many cases, gathers like these are unavoidable; the program instances
-just need to access incoherent memory locations.  However, if the array
-index ``i`` actually has the same value for all of the program instances or
-if it represents an access to a consecutive set of array locations, much
-more efficient load and store instructions can be generated instead of
-gathers and scatters, respectively.
-
-In many cases, the ``ispc`` compiler is able to deduce that the memory
-locations accessed by a varying index are either all the same or are
-consecutive.  For example, given:
-
-::
-
-  uniform int x = ...;
-  int y = x;
-  return array[y];
-
-The compiler is able to determine that all of the program instances are
-loading from the same location, even though ``y`` is not a ``uniform``
-variable.  In this case, the compiler will transform this load to a regular
-vector load, rather than a general gather.
-
-Sometimes the running program instances will access a linear sequence of
-memory locations; this happens most frequently when array indexing is done
-based on the built-in ``programIndex`` variable.  In many of these cases,
-the compiler is also able to detect this case and then do a vector load.
-For example, given:
-
-::
-
-    for (int i = programIndex; i < count; i += programCount)
-      // process array[i];
-
-Regular vector loads and stores are issued for accesses to ``array[i]``.
-
-Both of these cases have been ones where the compiler is able to determine
-statically that the index has the same value at compile-time.  It's 
-often the case that this determination can't be made at compile time, but
-this is often the case at run time.  The ``reduce_equal()`` function from
-the standard library can be used in this case; it checks to see if the
-given value is the same across all of the running program instances,
-returning true and its ``uniform`` value if so.
-
-The following function shows the use of ``reduce_equal()`` to check for an
-equal index at execution time and then either do a scalar load and
-broadcast or a general gather.
-
-::
-
-    uniform float array[..] = { ... };
-    float value;
-    int i = ...;
-    uniform int ui;
-    if (reduce_equal(i, &ui) == true)
-        value = array[ui]; // scalar load + broadcast
-    else
-        value = array[i];  // gather
-
-For a simple case like the one above, the overhead of doing the
-``reduce_equal()`` check is likely not worthwhile compared to just always
-doing a gather.  In more complex cases, where a number of accesses are done
-based on the index, it can be worth doing.  See the example
-``examples/volume_rendering`` in the ``ispc`` distribution for the use of
-this technique in an instance where it is beneficial to performance.
-
-Avoid 64-bit Addressing Calculations When Possible
--------------------------------------------------
-
-Even when compiling to a 64-bit architecture target, ``ispc`` does many of
-the addressing calculations in 32-bit precision by default--this behavior
-can be overridden with the ``--addressing=64`` command-line argument.  This
-option should only be used if it's necessary to be able to address over 4GB
-of memory in the ``ispc`` code, as it essentially doubles the cost of
-memory addressing calculations in the generated code.
-
-Avoid Computation With 8 and 16-bit Integer Types
-------------------------------------------------
-
-The code generated for 8 and 16-bit integer types is generally not as
-efficient as the code generated for 32-bit integer types.  It is generally
-worthwhile to use 32-bit integer types for intermediate computations, even
-if the final result will be stored in a smaller integer type.
-
-Implementing Reductions Efficiently
-----------------------------------
-
-It's often necessary to compute a reduction over a data set--for example,
-one might want to add all of the values in an array, compute their minimum,
-etc.  ``ispc`` provides a few capabilities that make it easy to efficiently
-compute reductions like these.  However, it's important to use these
-capabilities appropriately for best results.
-
-As an example, consider the task of computing the sum of all of the values
-in an array.  In C code, we might have:
-
-::
-
-    /* C implementation of a sum reduction */
-    float sum(const float array[], int count) {
-        float sum = 0;
-        for (int i = 0; i < count; ++i)
-            sum += array[i];
-        return sum;
-    } 
-
-Exactly this computation could also be expressed as a purely uniform
-computation in ``ispc``, though without any benefit from vectorization:
-
-::
-
-    /* inefficient ispc implementation of a sum reduction */
-    uniform float sum(const uniform float array[], uniform int count) {
-        uniform float sum = 0;
-        for (uniform int i = 0; i < count; ++i)
-            sum += array[i];
-        return sum;
-    } 
-
-As a first try, one might try using the ``reduce_add()`` function from the
-``ispc`` standard library; it takes a ``varying`` value and returns the sum
-of that value across all of the active program instances.
-
-::
-
-    /* inefficient ispc implementation of a sum reduction */
-    uniform float sum(const uniform float array[], uniform int count) {
-        uniform float sum = 0;
-        foreach (i = 0 ... count)
-            sum += reduce_add(array[i]);
-        return sum;
-    } 
-
-This implementation loads a gang's worth of values from the array, one for
-each of the program instances, and then uses ``reduce_add()`` to reduce
-across the program instances and then update the sum.  Unfortunately this
-approach loses most benefit from vectorization, as it does more work on the
-cross-program instance ``reduce_add()`` call than it saves from the vector
-load of values.
-
-The most efficient approach is to do the reduction in two phases: rather
-than using a ``uniform`` variable to store the sum, we maintain a varying
-value, such that each program instance is effectively computing a local
-partial sum on the subset of array values that it has loaded from the
-array.  When the loop over array elements concludes, a single call to
-``reduce_add()`` computes the final reduction across each of the program
-instances' elements of ``sum``.  This approach effectively compiles to a
-single vector load and a single vector add for each loop iteration's worth of
-values--very efficient code in the end.
-
-::
-
-    /* good ispc implementation of a sum reduction */
-    uniform float sum(const uniform float array[], uniform int count) {
-        float sum = 0;
-        foreach (i = 0 ... count)
-            sum += array[i];
-        return reduce_add(sum);
-    } 
-
-Using Low-level Vector Tricks
-----------------------------
-
-Many low-level Intel® SSE and AVX coding constructs can be implemented in
-``ispc`` code.  The ``ispc`` standard library functions ``intbits()`` and
-``floatbits()`` are often useful in this context.  Recall that
-``intbits()`` takes a ``float`` value and returns it as an integer where
-the bits of the integer are the same as the bit representation in memory of
-the ``float``.  (In other words, it does *not* perform a floating-point to
-integer conversion.)  ``floatbits()``, then, performs the inverse
-computation.
-
-As an example of the use of these functions, the following code efficiently
-reverses the sign of the given values.
-
-::
-
-  float flipsign(float a) {
-      unsigned int i = intbits(a);
-      i ^= 0x80000000;
-      return floatbits(i);
-  }
-
-This code compiles down to a single XOR instruction.
-
-The "Fast math" Option
----------------------
-
-``ispc`` has a ``--opt=fast-math`` command-line flag that enables a number of
-optimizations that may be undesirable in code where numerical precision is
-critically important.  For many graphics applications, for example, the
-approximations introduced may be acceptable, however.  The following two
-optimizations are performed when ``--opt=fast-math`` is used.  By default, the
-``--opt=fast-math`` flag is off.
-
-* Expressions like ``x / y``, where ``y`` is a compile-time constant, are
-  transformed to ``x * (1./y)``, where the inverse value of ``y`` is
-  precomputed at compile time.
-
-* Expressions like ``x / y``, where ``y`` is not a compile-time constant,
-  are transformed to ``x * rcp(y)``, where ``rcp()`` maps to the
-  approximate reciprocal instruction from the ``ispc`` standard library.
-
-
-"inline" Aggressively
---------------------
-
-Inlining functions aggressively is generally beneficial for performance
-with ``ispc``.  Definitely use the ``inline`` qualifier for any short
-functions (a few lines long), and experiment with it for longer functions.
-
-Avoid The System Math Library
-----------------------------
-
-The default math library for transcendentals and the like that ``ispc`` uses has
-higher error than the system's math library, though it is much more efficient
-due to being vectorized across the program instances and due to the fact
-that the functions can be inlined in the final code.  (It generally has
-errors in the range of 10ulps, while the system math library generally has
-no more than 1ulp of error for transcendentals.)
-
-If the ``--math-lib=system`` command-line option is used when compiling an
-``ispc`` program, then calls to the system math library will be generated
-instead.  This option should only be used if the higher precision is
-absolutely required as the performance impact of using it can be
-significant.
-
-Declare Variables In The Scope Where They're Used
-------------------------------------------------
-
-Performance is slightly improved by declaring variables at the same block
-scope where they are first used.  For example, in code like the
-following, if the lifetime of ``foo`` is only within the scope of the
-``if`` clause, write the code like this:  
-
-::
-
-    float func() {
-        ....
-        if (x < y) {
-            float foo;
-            ... use foo ...
-        }
-    }
-
-Try not to write code as:
-
-::
-
-    float func() {
-        float foo;
-        ....
-        if (x < y) {
-            ... use foo ...
-        }
-    }
-
-Doing so can reduce the amount of masked store instructions that the
-compiler needs to generate.
-
-Instrumenting ISPC Programs To Understand Runtime Behavior
----------------------------------------------------------
-
-``ispc`` has an optional instrumentation feature that can help you
-understand performance issues.  If a program is compiled using the
-``--instrument`` flag, the compiler emits calls to a function with the
-following signature at various points in the program (for
-example, at interesting points in the control flow, when scatters or
-gathers happen.)
-
-::
-
-    extern "C" {
-        void ISPCInstrument(const char *fn, const char *note, 
-                            int line, int mask);
-    }
-
-This function is passed the file name of the ``ispc`` file running, a short
-note indicating what is happening, the line number in the source file, and
-the current mask of active program instances in the gang.  You must provide an
-implementation of this function and link it in with your application.
-
-For example, when the ``ispc`` program runs, this function might be called
-as follows:
-
-::
-
-   ISPCInstrument("foo.ispc", "function entry", 55, 0xf);
-
-This call indicates that the currently executing program has just
-entered the function defined at line 55 of the file ``foo.ispc``, with a
-mask of all lanes currently executing (assuming a four-wide gang size
-target machine).
-
-For a fuller example of the utility of this functionality, see
-``examples/aobench_instrumented`` in the ``ispc`` distribution.  This
-example includes an implementation of the ``ISPCInstrument()`` function
-that collects aggregate data about the program's execution behavior.
-
-When running this example, you will want to direct the ``ao`` executable
-to generate a low resolution image, because the instrumentation adds
-substantial execution overhead.  For example:
-
-::
-
-    % ./ao 1 32 32
-
-After the ``ao`` program exits, a summary report along the following lines
-will be printed.  In the first few lines, you can see how many times a few
-functions were called, and the average percentage of SIMD lanes that were
-active upon function entry.
-
-:: 
-
-    ao.ispc(0067) - function entry: 342424 calls (0 / 0.00% all off!), 95.86% active lanes
-    ao.ispc(0067) - return: uniform control flow: 342424 calls (0 / 0.00% all off!), 95.86% active lanes
-    ao.ispc(0071) - function entry: 1122 calls (0 / 0.00% all off!), 97.33% active lanes
-    ao.ispc(0075) - return: uniform control flow: 1122 calls (0 / 0.00% all off!), 97.33% active lanes
-    ao.ispc(0079) - function entry: 10072 calls (0 / 0.00% all off!), 45.09% active lanes
-    ao.ispc(0088) - function entry: 36928 calls (0 / 0.00% all off!), 97.40% active lanes
-    ...
-
-
-Choosing A Target Vector Width
------------------------------
-
-By default, ``ispc`` compiles to the natural vector width of the target
-instruction set.  For example, for SSE2 and SSE4, it compiles four-wide,
-and for AVX, it compiles 8-wide.  For some programs, higher performance may
-be seen if the program is compiled to a doubled vector width--8-wide for
-SSE and 16-wide for AVX.  
-
-For workloads that don't require many registers, this method can lead to
-significantly more efficient execution thanks to greater instruction level
-parallelism and amortization of various overhead over more program
-instances.  For other workloads, it may lead to a slowdown due to higher
-register pressure; trying both approaches for key kernels may be
-worthwhile.
-
-This option is only available for each of the SSE2, SSE4 and AVX targets.
-It is selected with the ``--target=sse2-x2``, ``--target=sse4-x2`` and
-``--target=avx-x2`` options, respectively.
-
-
-Disclaimer and Legal Information
-================================
-
-INFORMATION IN THIS DOCUMENT IS PROVIDED IN CONNECTION WITH INTEL(R) PRODUCTS.
-NO LICENSE, EXPRESS OR IMPLIED, BY ESTOPPEL OR OTHERWISE, TO ANY INTELLECTUAL
-PROPERTY RIGHTS IS GRANTED BY THIS DOCUMENT. EXCEPT AS PROVIDED IN INTEL'S TERMS
-AND CONDITIONS OF SALE FOR SUCH PRODUCTS, INTEL ASSUMES NO LIABILITY WHATSOEVER,
-AND INTEL DISCLAIMS ANY EXPRESS OR IMPLIED WARRANTY, RELATING TO SALE AND/OR USE
-OF INTEL PRODUCTS INCLUDING LIABILITY OR WARRANTIES RELATING TO FITNESS FOR A
-PARTICULAR PURPOSE, MERCHANTABILITY, OR INFRINGEMENT OF ANY PATENT, COPYRIGHT
-OR OTHER INTELLECTUAL PROPERTY RIGHT.
-
-UNLESS OTHERWISE AGREED IN WRITING BY INTEL, THE INTEL PRODUCTS ARE NOT DESIGNED
-NOR INTENDED FOR ANY APPLICATION IN WHICH THE FAILURE OF THE INTEL PRODUCT COULD
-CREATE A SITUATION WHERE PERSONAL INJURY OR DEATH MAY OCCUR.
-
-Intel may make changes to specifications and product descriptions at any time,
-without notice. Designers must not rely on the absence or characteristics of any
-features or instructions marked "reserved" or "undefined." Intel reserves these
-for future definition and shall have no responsibility whatsoever for conflicts
-or incompatibilities arising from future changes to them. The information here
-is subject to change without notice. Do not finalize a design with this
-information.
-
-The products described in this document may contain design defects or errors
-known as errata which may cause the product to deviate from published
-specifications. Current characterized errata are available on request.
-
-Contact your local Intel sales office or your distributor to obtain the latest
-specifications and before placing your product order.
-
-Copies of documents which have an order number and are referenced in this
-document, or other Intel literature, may be obtained by calling 1-800-548-4725,
-or by visiting Intel's Web Site.
-
-Intel processor numbers are not a measure of performance. Processor numbers
-differentiate features within each processor family, not across different
-processor families. See http://www.intel.com/products/processor_number for
-details.
-
-BunnyPeople, Celeron, Celeron Inside, Centrino, Centrino Atom,
-Centrino Atom Inside, Centrino Inside, Centrino logo, Core Inside, FlashFile,
-i960, InstantIP, Intel, Intel logo, Intel386, Intel486, IntelDX2, IntelDX4,
-IntelSX2, Intel Atom, Intel Atom Inside, Intel Core, Intel Inside,
-Intel Inside logo, Intel. Leap ahead., Intel. Leap ahead. logo, Intel NetBurst,
-Intel NetMerge, Intel NetStructure, Intel SingleDriver, Intel SpeedStep,
-Intel StrataFlash, Intel Viiv, Intel vPro, Intel XScale, Itanium,
-Itanium Inside, MCS, MMX, Oplus, OverDrive, PDCharm, Pentium, Pentium Inside,
-skoool, Sound Mark, The Journey Inside, Viiv Inside, vPro Inside, VTune, Xeon,
-and Xeon Inside are trademarks of Intel Corporation in the U.S. and other
-countries.
-
-* Other names and brands may be claimed as the property of others.
-
-Copyright(C) 2011, Intel Corporation. All rights reserved.
-
-
-Optimization Notice
-===================
-
-Intel compilers, associated libraries and associated development tools may
-include or utilize options that optimize for instruction sets that are
-available in both Intel and non-Intel microprocessors (for example SIMD
-instruction sets), but do not optimize equally for non-Intel
-microprocessors.  In addition, certain compiler options for Intel
-compilers, including some that are not specific to Intel
-micro-architecture, are reserved for Intel microprocessors.  For a detailed
-description of Intel compiler options, including the instruction sets and
-specific microprocessors they implicate, please refer to the "Intel
-Compiler User and Reference Guides" under "Compiler Options."  Many library
-routines that are part of Intel compiler products are more highly optimized
-for Intel microprocessors than for other microprocessors.  While the
-compilers and libraries in Intel compiler products offer optimizations for
-both Intel and Intel-compatible microprocessors, depending on the options
-you select, your code and other factors, you likely will get extra
-performance on Intel microprocessors.
-
-Intel compilers, associated libraries and associated development tools may
-or may not optimize to the same degree for non-Intel microprocessors for
-optimizations that are not unique to Intel microprocessors.  These
-optimizations include Intel® Streaming SIMD Extensions 2 (Intel® SSE2),
-Intel® Streaming SIMD Extensions 3 (Intel® SSE3), and Supplemental
-Streaming SIMD Extensions 3 (Intel SSSE3) instruction sets and other
-optimizations.  Intel does not guarantee the availability, functionality,
-or effectiveness of any optimization on microprocessors not manufactured by
-Intel.  Microprocessor-dependent optimizations in this product are intended
-for use with Intel microprocessors.
-
-While Intel believes our compilers and libraries are excellent choices to
-assist in obtaining the best performance on Intel and non-Intel
-microprocessors, Intel recommends that you evaluate other compilers and
-libraries to determine which best meet your requirements.  We hope to win
-your business by striving to offer the best performance of any compiler or
-library; please let us know if you find we do not.
-
--- a/docs/template-perf.txt
+++ b/docs/template-perf.txt
@@ -1,65 +0,0 @@
-%(head_prefix)s
-%(head)s
-<script type="text/javascript">
-
-  var _gaq = _gaq || [];
-  _gaq.push(['_setAccount', 'UA-1486404-4']);
-  _gaq.push(['_trackPageview']);
-
-  (function() {
-    var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
-    ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
-    var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
-  })();
-
-</script>
-%(stylesheet)s
-%(body_prefix)s
-<div id="wrap">
-  <div id="wrap2">
-    <div id="header">
-      <h1 id="logo">Intel SPMD Program Compiler</h1>
-      <div id="slogan">An open-source compiler for high-performance SIMD programming on
-      the CPU</div>
-    </div>
-    <div id="nav">
-      <div id="nbar">
-        <ul>
-          <li><a href="index.html">Overview</a></li>
-          <li><a href="features.html">Features</a></li>
-          <li><a href="downloads.html">Downloads</a></li>
-          <li><a href="documentation.html">Documentation</a></li>
-          <li id="selected"><a href="perf.html">Performance</a></li>
-        </ul>
-      </div>
-    </div>
-    <div id="content-wrap">
-      <div id="sidebar">
-          <div class="widgetspace">
-            <h1>Resources</h1>
-            <ul class="menu">
-              <li><a href="http://github.com/ispc/ispc/">ispc page on github</a></li>
-              <li><a href="http://groups.google.com/group/ispc-users/">ispc
-              users mailing list</a></li>
-              <li><a href="http://groups.google.com/group/ispc-dev/">ispc
-              developers mailing list</a></li>
-              <li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
-              <li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
-              <li><a href="doxygen/index.html">Doxygen documentation of
-              <tt>ispc</tt> source code</a></li>
-            </ul>
-        </div>
-      </div>
-%(body_pre_docinfo)s
-%(docinfo)s
-<div id="content">
-%(body)s
-</div>
-    <div class="clearfix"></div>
-    <div id="footer"> &copy; 2011 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue  by: <a href="http://www.themebin.com/">ThemeBin</a>
-      <!-- Please Do Not remove this link, thank u -->
-      </div>
-      </div>
-      </div>
-      </div>
-%(body_suffix)s
--- a/docs/template.txt
+++ b/docs/template.txt
@@ -1,65 +0,0 @@
-%(head_prefix)s
-%(head)s
-<script type="text/javascript">
-
-  var _gaq = _gaq || [];
-  _gaq.push(['_setAccount', 'UA-1486404-4']);
-  _gaq.push(['_trackPageview']);
-
-  (function() {
-    var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
-    ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
-    var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
-  })();
-
-</script>
-%(stylesheet)s
-%(body_prefix)s
-<div id="wrap">
-  <div id="wrap2">
-    <div id="header">
-      <h1 id="logo">Intel SPMD Program Compiler</h1>
-      <div id="slogan">An open-source compiler for high-performance SIMD programming on
-      the CPU</div>
-    </div>
-    <div id="nav">
-      <div id="nbar">
-        <ul>
-          <li><a href="index.html">Overview</a></li>
-          <li><a href="features.html">Features</a></li>
-          <li><a href="downloads.html">Downloads</a></li>
-          <li id="selected"><a href="documentation.html">Documentation</a></li>
-          <li><a href="perf.html">Performance</a></li>
-        </ul>
-      </div>
-    </div>
-    <div id="content-wrap">
-      <div id="sidebar">
-          <div class="widgetspace">
-            <h1>Resources</h1>
-            <ul class="menu">
-              <li><a href="http://github.com/ispc/ispc/">ispc page on github</a></li>
-              <li><a href="http://groups.google.com/group/ispc-users/">ispc
-              users mailing list</a></li>
-              <li><a href="http://groups.google.com/group/ispc-dev/">ispc
-              developers mailing list</a></li>
-              <li><a href="http://github.com/ispc/ispc/wiki/">Wiki</a></li>
-              <li><a href="http://github.com/ispc/ispc/issues/">Bug tracking</a></li>
-              <li><a href="doxygen/index.html">Doxygen documentation of
-              <tt>ispc</tt> source code</a></li>
-            </ul>
-        </div>
-      </div>
-%(body_pre_docinfo)s
-%(docinfo)s
-<div id="content">
-%(body)s
-</div>
-    <div class="clearfix"></div>
-    <div id="footer"> &copy; 2011 <strong>Intel Corporation</strong> | Valid <a href="http://validator.w3.org/check?uri=referer">XHTML</a> | <a href="http://jigsaw.w3.org/css-validator/check/referer">CSS</a> | ClearBlue  by: <a href="http://www.themebin.com/">ThemeBin</a>
-      <!-- Please Do Not remove this link, thank u -->
-      </div>
-      </div>
-      </div>
-      </div>
-%(body_suffix)s
--- a/doxygen.cfg
+++ b/doxygen.cfg
@@ -31,7 +31,7 @@ PROJECT_NAME           = "Intel SPMD Program Compiler"
 # This could be handy for archiving the generated documentation or
 # if some version control system is used.

-PROJECT_NUMBER         = 1.1.0
+PROJECT_NUMBER         = 1.0.10

 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
 # base path where the generated documentation will be put.
@@ -585,6 +585,7 @@ INPUT                  = builtins.h \
                         ctx.h \
                         decl.h \
                         expr.h \
+                         gatherbuf.h \
                         ispc.h \
                         llvmutil.h \
                         module.h \
@@ -597,6 +598,7 @@ INPUT                  = builtins.h \
                         ctx.cpp \
                         decl.cpp \
                         expr.cpp \
+                         gatherbuf.cpp \
                         ispc.cpp \
                         llvmutil.cpp \
                         main.cpp \
--- a/examples/aobench/Makefile
+++ b/examples/aobench/Makefile
@@ -8,11 +8,7 @@ TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
 CXX=g++
 CXXFLAGS=-Iobjs/ -O3 -Wall -m64
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse2,sse4,avx --arch=x86-64
-
-ISPC_OBJS=objs/ao_ispc.o objs/ao_ispc_sse2.o objs/ao_ispc_sse4.o \
-	objs/ao_ispc_avx.o
-OBJS=objs/ao.o objs/ao_serial.o $(ISPC_OBJS) $(TASK_OBJ)
+ISPCFLAGS=-O2 --target=sse4 --arch=x86-64

 default: ao

@@ -24,8 +20,8 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ ao

-ao: dirs $(OBJS) $(TASK_OBJ)
-	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
+ao: dirs objs/ao.o objs/ao_serial.o objs/ao_ispc.o $(TASK_OBJ)
+	$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/ao_serial.o $(TASK_OBJ) -lm $(TASK_LIB)

 objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@
@@ -35,5 +31,5 @@ objs/%.o: ../%.cpp

 objs/ao.o: objs/ao_ispc.h 

-objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
+objs/%_ispc.h objs/%_ispc.o: %.ispc
 	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/aobench/ao.cpp
+++ b/examples/aobench/ao.cpp
@@ -55,6 +55,7 @@
 using namespace ispc;

 #include "../timing.h"
+#include "../cpuid.h"

 #define NSUBSAMPLES        2

@@ -104,6 +105,38 @@ savePPM(const char *fname, int w, int h)
 }


+// Check at run time that the CPU supports the vector ISA the ispc code was
+// built for (ISPC_TARGET_*, set in the ispc-generated header included above);
+// if it does not, print an explanatory message to stderr and exit(1).
+static void
+ensureTargetISAIsSupported() {
+#if defined(ISPC_TARGET_SSE2)
+    bool isaSupported = CPUSupportsSSE2();  // CPU query helper from ../cpuid.h
+    const char *target = "SSE2";
+#elif defined(ISPC_TARGET_SSE4)
+    bool isaSupported = CPUSupportsSSE4();
+    const char *target = "SSE4";
+#elif defined(ISPC_TARGET_AVX)
+    bool isaSupported = CPUSupportsAVX();
+    const char *target = "AVX";
+#else  // none of the known target macros is defined: fail the build
+#error "Unknown ISPC_TARGET_* value"
+#endif
+    if (!isaSupported) {  // the compiled-in target is not available on this CPU
+        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
+                "set, which isn't\n***        supported by this computer's CPU!\n", target);
+        fprintf(stderr, "***\n***        Please modify the "
+#ifdef _MSC_VER
+                "MSVC project file "
+#else
+                "Makefile "
+#endif
+                "to select another target (e.g. sse2)\n***\n");
+        exit(1);  // non-zero exit status signals the failure
+    }
+}
+
+
 int main(int argc, char **argv)
 {
    if (argc != 4) {
@@ -118,6 +151,8 @@ int main(int argc, char **argv)
        height = atoi (argv[3]);
    }

+    ensureTargetISAIsSupported();
+
    // Allocate space for output images
    img = new unsigned char[width * height * 3];
    fimg = new float[width * height * 3];
--- a/examples/aobench/ao.ispc
+++ b/examples/aobench/ao.ispc
@@ -75,7 +75,7 @@ static inline vec vcross(vec v0, vec v1) {
    return ret;
 }

-static inline void vnormalize(vec &v) {
+static inline void vnormalize(reference vec v) {
    float len2 = dot(v, v);
    float invlen = rsqrt(len2);
    v *= invlen;
@@ -83,7 +83,8 @@ static inline void vnormalize(vec &v) {


 static inline void
-ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
+ray_plane_intersect(reference Isect isect, reference Ray ray, 
+                    reference Plane plane) {
    float d = -dot(plane.p, plane.n);
    float v = dot(ray.dir, plane.n);

@@ -103,7 +104,8 @@ ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {


 static inline void
-ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
+ray_sphere_intersect(reference Isect isect, reference Ray ray, 
+                     reference Sphere sphere) {
    vec rs = ray.org - sphere.center;

    float B = dot(rs, ray.dir);
@@ -125,7 +127,7 @@ ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {


 static inline void
-orthoBasis(vec basis[3], vec n) {
+orthoBasis(reference vec basis[3], vec n) {
    basis[2] = n;
    basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;

@@ -148,8 +150,8 @@ orthoBasis(vec basis[3], vec n) {


 static inline float
-ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3], 
-                  RNGState &rngstate) {
+ambient_occlusion(reference Isect isect, reference Plane plane, 
+                  reference Sphere spheres[3], reference RNGState rngstate) {
    float eps = 0.0001f;
    vec p, n;
    vec basis[3];
@@ -166,8 +168,8 @@ ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
            Ray ray;
            Isect occIsect;

-            float theta = sqrt(frandom(&rngstate));
-            float phi   = 2.0f * M_PI * frandom(&rngstate);
+            float theta = sqrt(frandom(rngstate));
+            float phi   = 2.0f * M_PI * frandom(rngstate);
            float x = cos(phi) * theta;
            float y = sin(phi) * theta;
            float z = sqrt(1.0 - theta * theta);
@@ -203,7 +205,7 @@ ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
 */
 static void ao_scanlines(uniform int y0, uniform int y1, uniform int w, 
                         uniform int h,  uniform int nsubsamples, 
-                         uniform float image[]) {
+                         reference uniform float image[]) {
    static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
    static Sphere spheres[3] = {
        { { -2.0f, 0.0f, -3.5f }, 0.5f },
@@ -211,7 +213,7 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
        { { 1.0f, 0.0f, -2.2f }, 0.5f } };
    RNGState rngstate;

-    seed_rng(&rngstate, y0);
+    seed_rng(rngstate, y0);

    // Compute the mapping between the 'programCount'-wide program
    // instances running in parallel and samples in the image.  
--- a/examples/aobench/aobench.vcxproj
+++ b/examples/aobench/aobench.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -26,18 +26,18 @@
  <ItemGroup>
    <CustomBuild Include="ao.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
    </CustomBuild>
  </ItemGroup>
  <PropertyGroup Label="Globals">
@@ -86,19 +86,15 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
-    <ExecutablePath>$(ExecutablePath);$(ProjectDir)..\..</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -107,7 +103,6 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -123,7 +118,6 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -141,7 +135,6 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -160,7 +153,6 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -173,4 +165,4 @@
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
-</Project>
+</Project>
--- a/examples/aobench_instrumented/Makefile
+++ b/examples/aobench_instrumented/Makefile
@@ -2,7 +2,7 @@
 CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -g3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --instrument --arch=x86-64 --target=sse2
+ISPCFLAGS=-O2 --instrument --arch=x86-64

 default: ao

--- a/examples/aobench_instrumented/ao.cpp
+++ b/examples/aobench_instrumented/ao.cpp
@@ -32,6 +32,7 @@
 */

 #ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
 #define NOMINMAX
 #pragma warning (disable: 4244)
 #pragma warning (disable: 4305)
@@ -50,11 +51,12 @@
 #include <algorithm>
 #include <sys/types.h>

-#include "ao_instrumented_ispc.h"
+#include "ao_ispc.h"
 using namespace ispc;

 #include "instrument.h"
 #include "../timing.h"
+#include "../cpuid.h"

 #define NSUBSAMPLES        2

@@ -102,6 +104,37 @@ savePPM(const char *fname, int w, int h)
 }


+// Check at run time that the CPU supports the vector ISA the ispc code was
+// built for (ISPC_TARGET_*, set in the ispc-generated header included above);
+// if it does not, print an explanatory message to stderr and exit(1).
+static void
+ensureTargetISAIsSupported() {
+#if defined(ISPC_TARGET_SSE2)
+    bool isaSupported = CPUSupportsSSE2();  // CPU query helper from ../cpuid.h
+    const char *target = "SSE2";
+#elif defined(ISPC_TARGET_SSE4)
+    bool isaSupported = CPUSupportsSSE4();
+    const char *target = "SSE4";
+#elif defined(ISPC_TARGET_AVX)
+    bool isaSupported = CPUSupportsAVX();
+    const char *target = "AVX";
+#else  // none of the known target macros is defined: fail the build
+#error "Unknown ISPC_TARGET_* value"
+#endif
+    if (!isaSupported) {  // the compiled-in target is not available on this CPU
+        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
+                "set, which isn't\n***        supported by this computer's CPU!\n", target);
+        fprintf(stderr, "***\n***        Please modify the "
+#ifdef _MSC_VER
+                "MSVC project file "
+#else
+                "Makefile "
+#endif
+                "to select another target (e.g. sse2)\n***\n");
+        exit(1);  // non-zero exit status signals the failure
+    }
+}
+

 int main(int argc, char **argv)
 {
@@ -117,6 +150,8 @@ int main(int argc, char **argv)
        height = atoi (argv[3]);
    }

+    ensureTargetISAIsSupported();
+
    // Allocate space for output images
    img = new unsigned char[width * height * 3];
    fimg = new float[width * height * 3];
--- a/examples/aobench_instrumented/ao.ispc
+++ b/examples/aobench_instrumented/ao.ispc
@@ -75,7 +75,7 @@ static inline vec vcross(vec v0, vec v1) {
    return ret;
 }

-static inline void vnormalize(vec &v) {
+static inline void vnormalize(reference vec v) {
    float len2 = dot(v, v);
    float invlen = rsqrt(len2);
    v *= invlen;
@@ -83,7 +83,8 @@ static inline void vnormalize(vec &v) {


 static inline void
-ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
+ray_plane_intersect(reference Isect isect, reference Ray ray, 
+                    reference Plane plane) {
    float d = -dot(plane.p, plane.n);
    float v = dot(ray.dir, plane.n);

@@ -103,7 +104,8 @@ ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {


 static inline void
-ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
+ray_sphere_intersect(reference Isect isect, reference Ray ray, 
+                     reference Sphere sphere) {
    vec rs = ray.org - sphere.center;

    float B = dot(rs, ray.dir);
@@ -125,7 +127,7 @@ ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {


 static inline void
-orthoBasis(vec basis[3], vec n) {
+orthoBasis(reference vec basis[3], vec n) {
    basis[2] = n;
    basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;

@@ -148,8 +150,8 @@ orthoBasis(vec basis[3], vec n) {


 static inline float
-ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3], 
-                  RNGState &rngstate) {
+ambient_occlusion(reference Isect isect, reference Plane plane, 
+                  reference Sphere spheres[3], reference RNGState rngstate) {
    float eps = 0.0001f;
    vec p, n;
    vec basis[3];
@@ -166,8 +168,8 @@ ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
            Ray ray;
            Isect occIsect;

-            float theta = sqrt(frandom(&rngstate));
-            float phi   = 2.0f * M_PI * frandom(&rngstate);
+            float theta = sqrt(frandom(rngstate));
+            float phi   = 2.0f * M_PI * frandom(rngstate);
            float x = cos(phi) * theta;
            float y = sin(phi) * theta;
            float z = sqrt(1.0 - theta * theta);
@@ -201,9 +203,8 @@ ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
 /* Compute the image for the scanlines from [y0,y1), for an overall image
   of width w and height h.
 */
-static void ao_scanlines(uniform int y0, uniform int y1, uniform int w, 
-                         uniform int h,  uniform int nsubsamples, 
-                         uniform float image[]) {
+void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h, 
+                  uniform int nsubsamples, reference uniform float image[]) {
    static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
    static Sphere spheres[3] = {
        { { -2.0f, 0.0f, -3.5f }, 0.5f },
@@ -211,7 +212,7 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
        { { 1.0f, 0.0f, -2.2f }, 0.5f } };
    RNGState rngstate;

-    seed_rng(&rngstate, y0);
+    seed_rng(rngstate, y0);

    // Compute the mapping between the 'programCount'-wide program
    // instances running in parallel and samples in the image.  
@@ -230,9 +231,6 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
    // direction we do per iteration and ny the number in y.
    uniform int nx = 1, ny = 1;

-    // FIXME: We actually need ny to be 1 regardless of the decomposition,
-    // since the task decomposition is one scanline high.
-
    if (programCount == 8) {
        // Do two pixels at once in the x direction
        nx = 2;
@@ -241,21 +239,19 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
            ++du;
    }
    else if (programCount == 16) {
-        nx = 4;
-        ny = 1;
-        if (programIndex >= 4 && programIndex < 8)
+        // Two at once in both x and y
+        nx = ny = 2;
+        if ((programIndex >= 4 && programIndex < 8) || programIndex >= 12)
            ++du;
-        if (programIndex >= 8 && programIndex < 12)
-            du += 2;
-        if (programIndex >= 12)
-            du += 3;
+        if (programIndex >= 8)  
+            ++dv;
    }

    // Now loop over all of the pixels, stepping in x and y as calculated
    // above.  (Assumes that ny divides y and nx divides x...)
    for (uniform int y = y0; y < y1; y += ny) {
        for (uniform int x = 0; x < w; x += nx)  {
-            // Figure out x,y pixel in NDC
+            // Figure out x,y pixel in NDC
            float px =  (x + du - (w / 2.0f)) / (w / 2.0f);
            float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
            float ret = 0.f;
@@ -297,7 +293,7 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,

            // offset to the first pixel in the image
            uniform int offset = 3 * (y * w + x);
-            for (uniform int p = 0; p < programCount; p += 4, offset += 3) {
+            for (uniform int p = 0; p < programCount; p += 4, ++offset) {
                // Get the four sample values for this pixel
                uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
                    retArray[p+3];
@@ -319,15 +315,3 @@ export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
                    uniform float image[]) {
    ao_scanlines(0, h, w, h, nsubsamples, image);
 }
-
-
-static void task ao_task(uniform int width, uniform int height, 
-                         uniform int nsubsamples, uniform float image[]) {
-    ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image);
-}
-
-
-export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, 
-                          uniform float image[]) {
-    launch[h] < ao_task(w, h, nsubsamples, image) >;
-}
--- a/examples/aobench_instrumented/aobench_instrumented.vcxproj
+++ b/examples/aobench_instrumented/aobench_instrumented.vcxproj
@@ -21,23 +21,22 @@
  <ItemGroup>
    <ClCompile Include="ao.cpp" />
    <ClCompile Include="instrument.cpp" />
-    <ClCompile Include="../tasksys.cpp" />
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="ao.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --arch=x86 --instrument --target=sse2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --instrument --target=sse2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --instrument
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --arch=x86 --instrument --target=sse2
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename)_instrumented.obj -h $(TargetDir)%(Filename)_instrumented_ispc.h --instrument --target=sse2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --instrument
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename)_instrumented.obj;$(TargetDir)%(Filename)_instrumented_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
    </CustomBuild>
  </ItemGroup>
  <PropertyGroup Label="Globals">
@@ -86,23 +85,15 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
-    <PreBuildEventUseInBuild>true</PreBuildEventUseInBuild>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
-    <PreBuildEventUseInBuild>true</PreBuildEventUseInBuild>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
-    <PreBuildEventUseInBuild>true</PreBuildEventUseInBuild>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
-    <PreBuildEventUseInBuild>true</PreBuildEventUseInBuild>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -110,8 +101,7 @@
      </PrecompiledHeader>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -124,8 +114,7 @@
      </PrecompiledHeader>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -140,8 +129,7 @@
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -158,8 +146,7 @@
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -171,4 +158,4 @@
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
-</Project>
+</Project>
--- a/examples/cpuid.h
+++ b/examples/cpuid.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2011, Intel Corporation
+  Copyright (c) 2010-2011, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -31,35 +31,36 @@
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */

-/** @file ast.cpp
-    @brief 
-*/
+#ifndef ISPC_CPUID_H
+#define ISPC_CPUID_H 1

-#include "ast.h"
-#include "func.h"
-#include "sym.h"
+#ifdef _MSC_VER
+// MSVC: <intrin.h> provides a __cpuid() function with the same signature as below
+#include <intrin.h>
+#else  // other compilers: define __cpuid() with GCC-style inline assembly
+static void __cpuid(int info[4], int infoType) {
+    __asm__ __volatile__ ("cpuid"  // execute the CPUID instruction
+                          : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])  // outputs: EAX, EBX, ECX, EDX -> info[0..3]
+                          : "0" (infoType));  // input: infoType in EAX ("0" ties it to output operand 0)
+}
+#endif

-///////////////////////////////////////////////////////////////////////////
-// ASTNode
-
-ASTNode::~ASTNode() {
+inline bool CPUSupportsSSE2() {  // true if CPUID reports SSE2
+    int info[4];
+    __cpuid(info, 1);  // query CPUID with EAX=1; info[3] receives EDX
+    return (info[3] & (1 << 26)) != 0;  // EDX bit 26: SSE2 feature flag
 }

-
-///////////////////////////////////////////////////////////////////////////
-// AST
-
-void
-AST::AddFunction(Symbol *sym, const std::vector<Symbol *> &args, Stmt *code) {
-    if (sym == NULL)
-        return;
-    functions.push_back(new Function(sym, args, code));
+inline bool CPUSupportsSSE4() {  // true if CPUID reports SSE4.1
+    int info[4];
+    __cpuid(info, 1);  // query CPUID with EAX=1; info[2] receives ECX
+    return (info[2] & (1 << 19)) != 0;  // ECX bit 19: SSE4.1 feature flag (SSE4.2 would be bit 20)
 }

-
-void
-AST::GenerateIR() {
-    for (unsigned int i = 0; i < functions.size(); ++i)
-        functions[i]->GenerateIR();
+inline bool CPUSupportsAVX() {  // true if CPUID reports the AVX feature bit
+    int info[4];  // NOTE(review): OS support for AVX state (OSXSAVE / XGETBV) is not checked here
+    __cpuid(info, 1);  // query CPUID with EAX=1; info[2] receives ECX
+    return (info[2] & (1 << 28)) != 0;  // ECX bit 28: AVX feature flag
 }

+#endif // ISPC_CPUID_H
--- a/examples/deferred/Makefile
+++ b/examples/deferred/Makefile
@@ -1,18 +1,22 @@

 ARCH = $(shell uname)

-TASK_CXX=../tasksys.cpp
+TASK_CXX=../tasks_pthreads.cpp
 TASK_LIB=-lpthread
+
+ifeq ($(ARCH), Darwin)
+  TASK_CXX=../tasks_gcd.cpp
+  TASK_LIB=
+endif
+
 TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))

 CXX=g++
 CXXFLAGS=-Iobjs/ -O3 -Wall -m64
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64 --math-lib=fast
+ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64 --math-lib=fast

-OBJS=objs/main.o objs/common.o objs/kernels_ispc.o objs/kernels_ispc_sse2.o \
-	objs/kernels_ispc_sse4.o objs/kernels_ispc_avx.o \
-	objs/dynamic_c.o objs/dynamic_cilk.o
+OBJS=objs/main.o objs/common.o objs/kernels_ispc.o objs/dynamic_c.o objs/dynamic_cilk.o

 default: deferred_shading

@@ -34,5 +38,5 @@ objs/%.o: %.cpp objs/kernels_ispc.h deferred.h
 objs/%.o: ../%.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@

-objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
+objs/%_ispc.h objs/%_ispc.o: %.ispc
 	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/deferred/common.cpp
+++ b/examples/deferred/common.cpp
@@ -64,7 +64,7 @@
 ///////////////////////////////////////////////////////////////////////////

 static void *
-lAlignedMalloc(size_t size, int32_t alignment) {
+lAlignedMalloc(int64_t size, int32_t alignment) {
 #ifdef ISPC_IS_WINDOWS
    return _aligned_malloc(size, alignment);
 #endif
@@ -118,7 +118,6 @@ Framebuffer::clear() {
    memset(b, 0, nPixels);
 }

-
 InputData *
 CreateInputDataFromFile(const char *path) {
    FILE *in = fopen(path, "rb");
@@ -178,7 +177,8 @@ CreateInputDataFromFile(const char *path) {
 }


-void DeleteInputData(InputData *input) {
+void DeleteInputData(InputData *input)
+{
    lAlignedFree(input->chunk);
 }

--- a/examples/deferred/deferred_shading.vcxproj
+++ b/examples/deferred/deferred_shading.vcxproj
@@ -64,19 +64,15 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -85,7 +81,6 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -101,7 +96,6 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -119,7 +113,6 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -138,7 +131,6 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -153,23 +145,23 @@
    <ClCompile Include="dynamic_c.cpp" />
    <ClCompile Include="dynamic_cilk.cpp" />
    <ClCompile Include="main.cpp" />
-    <ClCompile Include="../tasksys.cpp" />
+    <ClCompile Include="../tasks_concrt.cpp" />
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="kernels.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
    </CustomBuild>
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
--- a/examples/deferred/dynamic_c.cpp
+++ b/examples/deferred/dynamic_c.cpp
@@ -60,7 +60,7 @@
 #define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1

 static void *
-lAlignedMalloc(size_t size, int32_t alignment) {
+lAlignedMalloc(int64_t size, int32_t alignment) {
 #ifdef ISPC_IS_WINDOWS
    return _aligned_malloc(size, alignment);
 #endif
@@ -141,10 +141,12 @@ ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight,
 {
    for (int tileX = 0; tileX < numTilesX; ++tileX) {
        float minZ, maxZ;
-        ComputeZBounds(tileX * tileWidth, tileX * tileWidth + tileWidth,
-                       tileY * tileHeight, tileY * tileHeight + tileHeight,
-                       zBuffer, gBufferWidth, cameraProj_33, cameraProj_43, 
-                       cameraNear, cameraFar, &minZ, &maxZ);
+        ComputeZBounds(
+            tileX * tileWidth, tileX * tileWidth + tileWidth,
+            tileY * tileHeight, tileY * tileHeight + tileHeight,
+            zBuffer, gBufferWidth,
+            cameraProj_33, cameraProj_43, cameraNear, cameraFar,
+            &minZ, &maxZ);
        minZArray[tileX] = minZ;
        maxZArray[tileX] = maxZ;
    }
@@ -280,8 +282,8 @@ void InitDynamicC(InputData *input) {
 }


-/* We're going to split a tile into 4 sub-tiles.  This function
-   reclassifies the tile's lights with respect to the sub-tiles. */
+// numLights need not be a multiple of programCount here, but the input and output arrays
+// should be able to handle programCount-sized load/stores.
 static void
 SplitTileMinMax(
    int tileMidX, int tileMidY,
@@ -337,7 +339,7 @@ SplitTileMinMax(
        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
        float light_attenuationEndNeg = -light_attenuationEnd;
        
-        // Test lights again against subtile z bounds
+        // Test lights again against subtile z bounds
        bool inFrustum[4];
        inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
            (subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
@@ -412,8 +414,7 @@ Float32ToUnorm8(float f) {
 }


-static inline float
-half_to_float_fast(uint16_t h) {
+static inline float half_to_float_fast(uint16_t h) {
    uint32_t hs = h & (int32_t)0x8000u;  // Pick off sign bit
    uint32_t he = h & (int32_t)0x7C00u;  // Pick off exponent bits
    uint32_t hm = h & (int32_t)0x03FFu;  // Pick off mantissa bits
--- a/examples/deferred/dynamic_cilk.cpp
+++ b/examples/deferred/dynamic_cilk.cpp
@@ -31,7 +31,7 @@
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */

-#ifdef __cilk
+#ifdef __cilkplusplus

 #include "deferred.h"
 #include "kernels_ispc.h"
@@ -60,7 +60,7 @@
 #define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1

 static void *
-lAlignedMalloc(size_t size, int32_t alignment) {
+lAlignedMalloc(int64_t size, int32_t alignment) {
 #ifdef ISPC_IS_WINDOWS
    return _aligned_malloc(size, alignment);
 #endif
@@ -395,4 +395,4 @@ DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer)
    }
 }

-#endif // __cilk
+#endif // __cilkplusplus
--- a/examples/deferred/kernels.ispc
+++ b/examples/deferred/kernels.ispc
@@ -35,22 +35,22 @@

 struct InputDataArrays
 {
-    uniform float * uniform zBuffer;
-    uniform unsigned int16 * uniform normalEncoded_x; // half float
-    uniform unsigned int16 * uniform normalEncoded_y; // half float
-    uniform unsigned int16 * uniform specularAmount; // half float
-    uniform unsigned int16 * uniform specularPower; // half float
-    uniform unsigned int8 * uniform albedo_x; // unorm8
-    uniform unsigned int8 * uniform albedo_y; // unorm8
-    uniform unsigned int8 * uniform albedo_z; // unorm8
-    uniform float * uniform lightPositionView_x;
-    uniform float * uniform lightPositionView_y;
-    uniform float * uniform lightPositionView_z;
-    uniform float * uniform lightAttenuationBegin;
-    uniform float * uniform lightColor_x;
-    uniform float * uniform lightColor_y;
-    uniform float * uniform lightColor_z;
-    uniform float * uniform lightAttenuationEnd;
+    uniform float zBuffer[];
+    uniform unsigned int16 normalEncoded_x[]; // half float
+    uniform unsigned int16 normalEncoded_y[]; // half float
+    uniform unsigned int16 specularAmount[]; // half float
+    uniform unsigned int16 specularPower[]; // half float
+    uniform unsigned int8 albedo_x[]; // unorm8
+    uniform unsigned int8 albedo_y[]; // unorm8
+    uniform unsigned int8 albedo_z[]; // unorm8
+    uniform float lightPositionView_x[];
+    uniform float lightPositionView_y[];
+    uniform float lightPositionView_z[];
+    uniform float lightAttenuationBegin[];
+    uniform float lightColor_x[];
+    uniform float lightColor_y[];
+    uniform float lightColor_z[];
+    uniform float lightAttenuationEnd[];
 };

 struct InputHeader
@@ -66,6 +66,8 @@ struct InputHeader
    uniform int32 inputDataArrayOffsets[idaNum];
 };

+export void foo(reference InputHeader h) { }
+

 ///////////////////////////////////////////////////////////////////////////
 // Common utility routines
@@ -77,7 +79,8 @@ dot3(float x, float y, float z, float a, float b, float c) {


 static inline void
-normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
+normalize3(float x, float y, float z, reference float ox, 
+           reference float oy, reference float oz) {
    float n = rsqrt(x*x + y*y + z*z);
    ox = x * n;
    oy = y * n;
@@ -97,6 +100,7 @@ Float32ToUnorm8(float f) {
 }


+// tile width must be a multiple of programCount (SIMD size)
 static void
 ComputeZBounds(
    uniform int32 tileStartX, uniform int32 tileEndX,
@@ -108,17 +112,17 @@ ComputeZBounds(
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Output
-    uniform float &minZ,
-    uniform float &maxZ
+    reference uniform float minZ,
+    reference uniform float maxZ
    )
 {
    // Find Z bounds
    float laneMinZ = cameraFar;
    float laneMaxZ = cameraNear;
    for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
-        foreach (x = tileStartX ... tileEndX) {
+        for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
            // Unproject depth buffer Z value into view space
-            float z = zBuffer[y * gBufferWidth + x];
+            float z = zBuffer[(y * gBufferWidth + x) + programIndex];
            float viewSpaceZ = cameraProj_43 / (z - cameraProj_33);

            // Work out Z bounds for our samples
@@ -134,6 +138,8 @@ ComputeZBounds(
 }


+// tile width must be a multiple of programCount (SIMD size)
+// numLights must currently be a multiple of programCount (SIMD size)
 export uniform int32
 IntersectLightsWithTileMinMax(
    uniform int32 tileStartX, uniform int32 tileEndX,
@@ -152,7 +158,7 @@ IntersectLightsWithTileMinMax(
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Output
-    uniform int32 tileLightIndices[]
+    reference uniform int32 tileLightIndices[]
    )
 {
    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
@@ -194,7 +200,9 @@ IntersectLightsWithTileMinMax(

    uniform int32 tileNumLights = 0;

-    foreach (lightIndex = 0 ... numLights) {
+    for (uniform int32 baseLightIndex = 0; baseLightIndex < numLights; 
+         baseLightIndex += programCount) {
+        int32 lightIndex = baseLightIndex + programIndex;
        float light_positionView_z = light_positionView_z_array[lightIndex];
        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
        float light_attenuationEndNeg = -light_attenuationEnd;
@@ -209,31 +217,32 @@ IntersectLightsWithTileMinMax(
        // don't actually need to mask the rest of this function - this is
        // just a greedy early-out.  Could also structure all of this as
        // nested if() statements, but this a bit easier to read
-        if (any(inFrustum)) {
-            float light_positionView_x = light_positionView_x_array[lightIndex];
-            float light_positionView_y = light_positionView_y_array[lightIndex];
+        if (!any(inFrustum)) 
+            continue;

-            d = light_positionView_z * frustumPlanes_z[0] + 
-                light_positionView_x * frustumPlanes_xy[0];
-            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+        float light_positionView_x = light_positionView_x_array[lightIndex];
+        float light_positionView_y = light_positionView_y_array[lightIndex];

-            d = light_positionView_z * frustumPlanes_z[1] + 
-                light_positionView_x * frustumPlanes_xy[1];
-            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+        d = light_positionView_z * frustumPlanes_z[0] + 
+            light_positionView_x * frustumPlanes_xy[0];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);

-            d = light_positionView_z * frustumPlanes_z[2] + 
-                light_positionView_y * frustumPlanes_xy[2];
-            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+        d = light_positionView_z * frustumPlanes_z[1] + 
+            light_positionView_x * frustumPlanes_xy[1];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);

-            d = light_positionView_z * frustumPlanes_z[3] + 
-                light_positionView_y * frustumPlanes_xy[3];
-            inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+        d = light_positionView_z * frustumPlanes_z[2] + 
+            light_positionView_y * frustumPlanes_xy[2];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+
+        d = light_positionView_z * frustumPlanes_z[3] + 
+            light_positionView_y * frustumPlanes_xy[3];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
        
-            // Pack and store intersecting lights
-            cif (inFrustum) {
-                tileNumLights += packed_store_active(&tileLightIndices[tileNumLights], 
-                                                     lightIndex);
-            }
+        // Pack and store intersecting lights
+        cif (inFrustum) {
+            tileNumLights += packed_store_active(tileLightIndices, tileNumLights, 
+                                                 lightIndex);
        }
    }

@@ -241,6 +250,8 @@ IntersectLightsWithTileMinMax(
 }


+// tile width must be a multiple of programCount (SIMD size)
+// numLights must currently be a multiple of programCount (SIMD size)
 static uniform int32
 IntersectLightsWithTile(
    uniform int32 tileStartX, uniform int32 tileEndX,
@@ -259,7 +270,7 @@ IntersectLightsWithTile(
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Output
-    uniform int32 tileLightIndices[]
+    reference uniform int32 tileLightIndices[]
    )
 {
    uniform float minZ, maxZ;
@@ -278,31 +289,32 @@ IntersectLightsWithTile(
 }


+// tile width must be a multiple of programCount (SIMD size)
 export void
 ShadeTile(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
-    uniform InputDataArrays &inputData,
+    reference uniform InputDataArrays inputData,
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    uniform float cameraProj_33, uniform float cameraProj_43,
    // Light list
-    uniform int32 tileLightIndices[],
+    reference uniform int32 tileLightIndices[],
    uniform int32 tileNumLights,
    // UI
    uniform bool visualizeLightCount,
    // Output
-    uniform unsigned int8 framebuffer_r[],
-    uniform unsigned int8 framebuffer_g[],
-    uniform unsigned int8 framebuffer_b[]
+    reference uniform unsigned int8 framebuffer_r[],
+    reference uniform unsigned int8 framebuffer_g[],
+    reference uniform unsigned int8 framebuffer_b[]
    )
 {
    if (tileNumLights == 0 || visualizeLightCount) {
        uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
        for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
-            foreach (x = tileStartX ... tileEndX) {
-                int32 framebufferIndex = (y * gBufferWidth + x);
+            for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
+                int32 framebufferIndex = (y * gBufferWidth + x) + programIndex;
                framebuffer_r[framebufferIndex] = c;
                framebuffer_g[framebufferIndex] = c;
                framebuffer_b[framebufferIndex] = c;
@@ -315,8 +327,9 @@ ShadeTile(
        for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
            uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);

-            foreach (x = tileStartX ... tileEndX) {
-                int32 gBufferOffset = y * gBufferWidth + x;
+            for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
+                uniform int32 gBufferOffsetBase = y * gBufferWidth + x;
+                int32 gBufferOffset = gBufferOffsetBase + programIndex;
                
                // Reconstruct position and (negative) view vector from G-buffer
                float surface_positionView_x, surface_positionView_y, surface_positionView_z;
@@ -326,7 +339,7 @@ ShadeTile(

                // Compute screen/clip-space position
                // NOTE: Mind DX11 viewport transform and pixel center!
-                float positionScreen_x = (0.5f + (float)(x)) * 
+                float positionScreen_x = (0.5f + (float)(x + programIndex)) * 
                    twoOverGBufferWidth - 1.0f;

                // Unproject depth buffer Z value into view space
@@ -466,21 +479,24 @@ ShadeTile(
 // Static decomposition

 task void
-RenderTile(uniform int num_groups_x, uniform int num_groups_y,
-           uniform InputHeader &inputHeader,
-           uniform InputDataArrays &inputData,
+RenderTile(uniform int g, uniform int num_groups_x, uniform int num_groups_y,
+           reference uniform InputHeader inputHeader,
+           reference uniform InputDataArrays inputData,
           uniform int visualizeLightCount,
           // Output
-           uniform unsigned int8 framebuffer_r[],
-           uniform unsigned int8 framebuffer_g[],
-           uniform unsigned int8 framebuffer_b[]) {
-    uniform int32 group_y = taskIndex / num_groups_x;
-    uniform int32 group_x = taskIndex % num_groups_x;
+           reference uniform unsigned int8 framebuffer_r[],
+           reference uniform unsigned int8 framebuffer_g[],
+           reference uniform unsigned int8 framebuffer_b[]) {
+    uniform int32 group_y = g / num_groups_x;
+    uniform int32 group_x = g % num_groups_x;
    uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH;
    uniform int32 tile_start_y = group_y * MIN_TILE_HEIGHT;
    uniform int32 tile_end_x = tile_start_x + MIN_TILE_WIDTH;
    uniform int32 tile_end_y = tile_start_y + MIN_TILE_HEIGHT;

+    uniform int sTileNumLights = 0;
+    uniform int sTileLightIndices[MAX_LIGHTS];  // Light list for the tile
+
    uniform int framebufferWidth = inputHeader.framebufferWidth;
    uniform int framebufferHeight = inputHeader.framebufferHeight;
    uniform float cameraProj_00 = inputHeader.cameraProj[0][0];
@@ -488,9 +504,8 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
    uniform float cameraProj_22 = inputHeader.cameraProj[2][2];
    uniform float cameraProj_32 = inputHeader.cameraProj[3][2];

-    // Light intersection: figure out which lights illuminate this tile.
-    uniform int tileLightIndices[MAX_LIGHTS];  // Light list for the tile
-    uniform int numTileLights = 
+    // Light intersection
+    sTileNumLights = 
        IntersectLightsWithTile(tile_start_x, tile_end_x, 
                                tile_start_y, tile_end_y,
                                framebufferWidth, framebufferHeight,
@@ -503,43 +518,41 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
                                inputData.lightPositionView_y, 
                                inputData.lightPositionView_z, 
                                inputData.lightAttenuationEnd,
-                                tileLightIndices);
+                                sTileLightIndices);

-    // And now shade the tile, using the lights in tileLightIndices
    ShadeTile(tile_start_x, tile_end_x, tile_start_y, tile_end_y,
              framebufferWidth, framebufferHeight, inputData,
              cameraProj_00, cameraProj_11, cameraProj_22, cameraProj_32,
-              tileLightIndices, numTileLights, visualizeLightCount, 
+              sTileLightIndices, sTileNumLights, visualizeLightCount, 
              framebuffer_r, framebuffer_g, framebuffer_b);
 }


 export void
-RenderStatic(uniform InputHeader &inputHeader,
-             uniform InputDataArrays &inputData,
+RenderStatic(reference uniform InputHeader inputHeader,
+             reference uniform InputDataArrays inputData,
             uniform int visualizeLightCount,
             // Output
-             uniform unsigned int8 framebuffer_r[],
-             uniform unsigned int8 framebuffer_g[],
-             uniform unsigned int8 framebuffer_b[]) {
+             reference uniform unsigned int8 framebuffer_r[],
+             reference uniform unsigned int8 framebuffer_g[],
+             reference uniform unsigned int8 framebuffer_b[]) {
    uniform int num_groups_x = (inputHeader.framebufferWidth + 
                                MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
    uniform int num_groups_y = (inputHeader.framebufferHeight + 
                                MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
    uniform int num_groups = num_groups_x * num_groups_y;

-    // Launch a task to render each tile, each of which is MIN_TILE_WIDTH
-    // by MIN_TILE_HEIGHT pixels.
-    launch[num_groups] < RenderTile(num_groups_x, num_groups_y,
-                                    inputHeader, inputData, visualizeLightCount,
-                                    framebuffer_r, framebuffer_g, framebuffer_b) >;
+    for (uniform int g = 0; g < num_groups; ++g)
+        launch < RenderTile(g, num_groups_x, num_groups_y,
+                            inputHeader, inputData, visualizeLightCount,
+                            framebuffer_r, framebuffer_g, framebuffer_b) >;
 }


 ///////////////////////////////////////////////////////////////////////////
 // Routines for dynamic decomposition path

-// This computes the z min/max range for a whole row worth of tiles.
+// tile width must be a multiple of programCount (SIMD size)
 export void
 ComputeZBoundsRow(
    uniform int32 tileY,
@@ -552,8 +565,8 @@ ComputeZBoundsRow(
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Output
-    uniform float minZArray[],
-    uniform float maxZArray[]
+    reference uniform float minZArray[],
+    reference uniform float maxZArray[]
    )
 {
    for (uniform int32 tileX = 0; tileX < numTilesX; ++tileX) {
@@ -570,7 +583,6 @@ ComputeZBoundsRow(
 }


-// Reclassifies the lights with respect to four sub-tiles when we refine a tile.
 // numLights need not be a multiple of programCount here, but the input and output arrays
 // should be able to handle programCount-sized load/stores.
 export void
@@ -584,7 +596,7 @@ SplitTileMinMax(
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    // Light Data
-    uniform int32 lightIndices[],
+    reference uniform int32 lightIndices[],
    uniform int32 numLights,
    uniform float light_positionView_x_array[],
    uniform float light_positionView_y_array[],
@@ -593,9 +605,9 @@ SplitTileMinMax(
    // Outputs
    // TODO: ISPC doesn't currently like multidimensionsal arrays so we'll do the
    // indexing math ourselves
-    uniform int32 subtileIndices[],
+    reference uniform int32 subtileIndices[],
    uniform int32 subtileIndicesPitch,
-    uniform int32 subtileNumLights[]
+    reference uniform int32 subtileNumLights[]
    )
 {
    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
@@ -633,7 +645,12 @@ SplitTileMinMax(
    subtileLightOffset[2] = 2 * subtileIndicesPitch;
    subtileLightOffset[3] = 3 * subtileIndicesPitch;

-    foreach (i = 0 ... numLights) {
+    for (int32 i = programIndex; i < numLights; i += programCount) {
+        // TODO: ISPC says gather required here when it actually
+        // isn't... this could be fixed by nesting an if() within a
+        // uniform loop, but I'm not totally sure if that's a win
+        // overall. For now we'll just eat the perf cost for cleanliness
+        // since the below are real gathers anyways.
        int32 lightIndex = lightIndices[i];

        float light_positionView_x = light_positionView_x_array[lightIndex];
@@ -676,21 +693,21 @@ SplitTileMinMax(
        // Pack and store intersecting lights
        // TODO: Experiment with a loop here instead
        cif (inFrustum[0])
-            subtileLightOffset[0] += 
-            packed_store_active(&subtileIndices[subtileLightOffset[0]],
-                                lightIndex);
+            subtileLightOffset[0] += packed_store_active(subtileIndices, 
+                                                         subtileLightOffset[0], 
+                                                         lightIndex);
        cif (inFrustum[1])
-            subtileLightOffset[1] += 
-            packed_store_active(&subtileIndices[subtileLightOffset[1]],
-                                lightIndex);
+            subtileLightOffset[1] += packed_store_active(subtileIndices, 
+                                                         subtileLightOffset[1], 
+                                                         lightIndex);
        cif (inFrustum[2])
-            subtileLightOffset[2] += 
-            packed_store_active(&subtileIndices[subtileLightOffset[2]], 
-                                lightIndex);
+            subtileLightOffset[2] += packed_store_active(subtileIndices, 
+                                                         subtileLightOffset[2], 
+                                                         lightIndex);
        cif (inFrustum[3])
-            subtileLightOffset[3] += 
-            packed_store_active(&subtileIndices[subtileLightOffset[3]], 
-                                lightIndex);
+            subtileLightOffset[3] += packed_store_active(subtileIndices, 
+                                                         subtileLightOffset[3], 
+                                                         lightIndex);
    }

    subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch;
--- a/examples/deferred/main.cpp
+++ b/examples/deferred/main.cpp
@@ -63,7 +63,7 @@

 int main(int argc, char** argv) {
    if (argc != 2) {
-        printf("usage: deferred_shading <input_file (e.g. data/pp1280x720.bin)>\n");
+        printf("usage: deferred_shading <input_file>\n");
        return 1;
    }

@@ -77,9 +77,9 @@ int main(int argc, char** argv) {
                            input->header.framebufferHeight);

    InitDynamicC(input);
-#ifdef __cilk
+#ifdef __cilkplusplus
    InitDynamicCilk(input);
-#endif // __cilk
+#endif // __cilkplusplus

    int nframes = 5;
    double ispcCycles = 1e30;
@@ -98,21 +98,6 @@ int main(int argc, char** argv) {
           input->header.framebufferWidth, input->header.framebufferHeight);
    WriteFrame("deferred-ispc-static.ppm", input, framebuffer);

-#ifdef __cilk
-    double dynamicCilkCycles = 1e30;
-    for (int i = 0; i < 5; ++i) {
-        framebuffer.clear();
-        reset_and_start_timer();
-        for (int j = 0; j < nframes; ++j)
-            DispatchDynamicCilk(input, &framebuffer);
-        double mcycles = get_elapsed_mcycles() / nframes;
-        dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles);
-    }
-    printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles to render image\n", 
-           dynamicCilkCycles);
-    WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer);
-#endif // __cilk
-
    double serialCycles = 1e30;
    for (int i = 0; i < 5; ++i) {
        framebuffer.clear();
@@ -122,16 +107,29 @@ int main(int argc, char** argv) {
        double mcycles = get_elapsed_mcycles() / nframes;
        serialCycles = std::min(serialCycles, mcycles);
    }
-    printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles to render image\n", 
+    printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles\n", 
           serialCycles);
    WriteFrame("deferred-serial-dynamic.ppm", input, framebuffer);

-#ifdef __cilk
+#ifdef __cilkplusplus
+    double dynamicCilkCycles = 1e30;
+    for (int i = 0; i < 5; ++i) {
+        framebuffer.clear();
+        reset_and_start_timer();
+        for (int j = 0; j < nframes; ++j)
+            DispatchDynamicCilk(input, &framebuffer);
+        double mcycles = get_elapsed_mcycles() / nframes;
+        dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles);
+    }
+    printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles\n", 
+           dynamicCilkCycles);
+    WriteFrame("deferred-ispc-dynamic.ppm", input, framebuffer);
+
    printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n", 
           serialCycles/ispcCycles, serialCycles/dynamicCilkCycles);
 #else
    printf("\t\t\t\t(%.2fx speedup from ISPC)\n", serialCycles/ispcCycles);
-#endif // __cilk
+#endif // __cilkplusplus

    DeleteInputData(input);

--- a/examples/mandelbrot/Makefile
+++ b/examples/mandelbrot/Makefile
@@ -2,7 +2,7 @@
 CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -O3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
+ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64

 default: mandelbrot

@@ -14,17 +14,13 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ mandelbrot

-OBJS=objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc_sse2.o \
-	objs/mandelbrot_ispc_sse4.o objs/mandelbrot_ispc_avx.o \
-	objs/mandelbrot_ispc.o
-
-mandelbrot: dirs $(OBJS)
-	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm
+mandelbrot: dirs objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc.o
+	$(CXX) $(CXXFLAGS) -o $@ objs/mandelbrot.o objs/mandelbrot_ispc.o objs/mandelbrot_serial.o -lm

 objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@

 objs/mandelbrot.o: objs/mandelbrot_ispc.h 

-objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
+objs/%_ispc.h objs/%_ispc.o: %.ispc
 	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/mandelbrot/mandelbrot.cpp
+++ b/examples/mandelbrot/mandelbrot.cpp
@@ -41,6 +41,7 @@
 #include <stdio.h>
 #include <algorithm>
 #include "../timing.h"
+#include "../cpuid.h"
 #include "mandelbrot_ispc.h"
 using namespace ispc;

@@ -67,6 +68,38 @@ writePPM(int *buf, int width, int height, const char *fn) {
 }


+// Make sure that the vector ISA used during compilation is supported by
+// the processor.  The ISPC_TARGET_* macro is set in the ispc-generated
+// header file that we include above.
+static void
+ensureTargetISAIsSupported() {
+#if defined(ISPC_TARGET_SSE2)
+    bool isaSupported = CPUSupportsSSE2();
+    const char *target = "SSE2";
+#elif defined(ISPC_TARGET_SSE4)
+    bool isaSupported = CPUSupportsSSE4();
+    const char *target = "SSE4";
+#elif defined(ISPC_TARGET_AVX)
+    bool isaSupported = CPUSupportsAVX();
+    const char *target = "AVX";
+#else
+#error "Unknown ISPC_TARGET_* value"
+#endif
+    if (!isaSupported) {
+        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
+                "set, which isn't\n***        supported by this computer's CPU!\n", target);
+        fprintf(stderr, "***\n***        Please modify the "
+#ifdef _MSC_VER
+                "MSVC project file "
+#else
+                "Makefile "
+#endif
+                "to select another target (e.g. sse2)\n***\n");
+        exit(1);
+    }
+}
+
+
 int main() {
    unsigned int width = 768;
    unsigned int height = 512;
@@ -78,6 +111,8 @@ int main() {
    int maxIterations = 256;
    int *buf = new int[width*height];

+    ensureTargetISAIsSupported();
+
    //
    // Compute the image using the ispc implementation; report the minimum
    // time of three runs.
--- a/examples/mandelbrot/mandelbrot.ispc
+++ b/examples/mandelbrot/mandelbrot.ispc
@@ -51,7 +51,7 @@ export void mandelbrot_ispc(uniform float x0, uniform float y0,
                            uniform float x1, uniform float y1,
                            uniform int width, uniform int height, 
                            uniform int maxIterations,
-                            uniform int output[])
+                            reference uniform int output[])
 {
    float dx = (x1 - x0) / width;
    float dy = (y1 - y0) / height;
@@ -60,16 +60,16 @@ export void mandelbrot_ispc(uniform float x0, uniform float y0,
        // Note that we'll be doing programCount computations in parallel,
        // so increment i by that much.  This assumes that width evenly
        // divides programCount.
-        foreach (i = 0 ... width) {
+        for (uniform int i = 0; i < width; i += programCount) {
            // Figure out the position on the complex plane to compute the
            // number of iterations at.  Note that the x values are
            // different across different program instances, since its
            // initializer incorporates the value of the programIndex
            // variable.
-            float x = x0 + i * dx;
+            float x = x0 + (programIndex + i) * dx;
            float y = y0 + j * dy;

-            int index = j * width + i;
+            int index = j * width + i + programIndex;
            output[index] = mandel(x, y, maxIterations);
        }
    }
--- a/examples/mandelbrot/mandelbrot.vcxproj
+++ b/examples/mandelbrot/mandelbrot.vcxproj
@@ -64,19 +64,15 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -85,7 +81,6 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -101,7 +96,6 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -119,7 +113,6 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -138,7 +131,6 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -155,18 +147,18 @@
  <ItemGroup>
    <CustomBuild Include="mandelbrot.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
    </CustomBuild>
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
--- a/examples/mandelbrot_tasks/Makefile
+++ b/examples/mandelbrot_tasks/Makefile
@@ -8,11 +8,7 @@ TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
 CXX=g++
 CXXFLAGS=-Iobjs/ -O3 -Wall -m64
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
-
-OBJS=objs/mandelbrot.o objs/mandelbrot_serial.o $(TASK_OBJ) \
-	objs/mandelbrot_ispc.o objs/mandelbrot_ispc_sse2.o \
-	objs/mandelbrot_ispc_sse4.o objs/mandelbrot_ispc_avx.o 
+ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64

 default: mandelbrot

@@ -24,8 +20,8 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ mandelbrot

-mandelbrot: dirs $(OBJS)
-	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
+mandelbrot: dirs objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc.o $(TASK_OBJ)
+	$(CXX) $(CXXFLAGS) -o $@ objs/mandelbrot.o objs/mandelbrot_ispc.o objs/mandelbrot_serial.o $(TASK_OBJ) -lm $(TASK_LIB)

 objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@
@@ -35,5 +31,5 @@ objs/%.o: ../%.cpp

 objs/mandelbrot.o: objs/mandelbrot_ispc.h 

-objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
+objs/%_ispc.h objs/%_ispc.o: %.ispc
 	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/mandelbrot_tasks/mandelbrot.cpp
+++ b/examples/mandelbrot_tasks/mandelbrot.cpp
@@ -42,6 +42,7 @@
 #include <algorithm>
 #include <string.h>
 #include "../timing.h"
+#include "../cpuid.h"
 #include "mandelbrot_ispc.h"
 using namespace ispc;

@@ -68,6 +69,37 @@ writePPM(int *buf, int width, int height, const char *fn) {
 }


+// Make sure that the vector ISA used during compilation is supported by
+// the processor.  The ISPC_TARGET_* macro is set in the ispc-generated
+// header file that we include above.
+static void
+ensureTargetISAIsSupported() {
+#if defined(ISPC_TARGET_SSE2)
+    bool isaSupported = CPUSupportsSSE2();
+    const char *target = "SSE2";
+#elif defined(ISPC_TARGET_SSE4)
+    bool isaSupported = CPUSupportsSSE4();
+    const char *target = "SSE4";
+#elif defined(ISPC_TARGET_AVX)
+    bool isaSupported = CPUSupportsAVX();
+    const char *target = "AVX";
+#else
+#error "Unknown ISPC_TARGET_* value"
+#endif
+    if (!isaSupported) {
+        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
+                "set, which isn't\n***        supported by this computer's CPU!\n", target);
+        fprintf(stderr, "***\n***        Please modify the "
+#ifdef _MSC_VER
+                "MSVC project file "
+#else
+                "Makefile "
+#endif
+                "to select another target (e.g. sse2)\n***\n");
+        exit(1);
+    }
+}
+
 static void usage() {
    fprintf(stderr, "usage: mandelbrot [--scale=<factor>]\n");
    exit(1);
@@ -100,6 +132,8 @@ int main(int argc, char *argv[]) {
    else
        usage();

+    ensureTargetISAIsSupported();
+
    int maxIterations = 512;
    int *buf = new int[width*height];

--- a/examples/mandelbrot_tasks/mandelbrot.ispc
+++ b/examples/mandelbrot_tasks/mandelbrot.ispc
@@ -57,16 +57,18 @@ mandelbrot_scanlines(uniform int ybase, uniform int span,
                     uniform float x0, uniform float dx, 
                     uniform float y0, uniform float dy,
                     uniform int width, uniform int maxIterations,
-                     uniform int output[]) {
+                     reference uniform int output[]) {
    uniform int ystart = ybase + taskIndex * span;
    uniform int yend = ystart + span;

-    foreach (yi = ystart ... yend, xi = 0 ... width) {
-        float x = x0 + xi * dx;
-        float y = y0 + yi * dy;
+    for (uniform int j = ystart; j < yend; ++j) {
+        for (uniform int i = 0; i < width; i += programCount) {
+            float x = x0 + (programIndex + i) * dx;
+            float y = y0 + j * dy;

-        int index = yi * width + xi;
-        output[index] = mandel(x, y, maxIterations);
+            int index = j * width + i + programIndex;
+            output[index] = mandel(x, y, maxIterations);
+        }
    }
 }
                               
@@ -75,7 +77,7 @@ task void
 mandelbrot_chunk(uniform float x0, uniform float dx,
                 uniform float y0, uniform float dy,
                 uniform int width, uniform int height,
-                 uniform int maxIterations, uniform int output[]) {
+                 uniform int maxIterations, reference uniform int output[]) {
    uniform int ystart = taskIndex * (height/taskCount);
    uniform int yend = (taskIndex+1) * (height/taskCount);
    uniform int span = 1;
@@ -89,7 +91,7 @@ export void
 mandelbrot_ispc(uniform float x0, uniform float y0, 
                uniform float x1, uniform float y1,
                uniform int width, uniform int height, 
-                uniform int maxIterations, uniform int output[]) {
+                uniform int maxIterations, reference uniform int output[]) {
    uniform float dx = (x1 - x0) / width;
    uniform float dy = (y1 - y0) / height;

--- a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
+++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
@@ -64,19 +64,15 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -85,7 +81,6 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -101,7 +96,6 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -119,7 +113,6 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -138,7 +131,6 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -156,18 +148,18 @@
  <ItemGroup>
    <CustomBuild Include="mandelbrot.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
    </CustomBuild>
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
--- a/examples/noise/Makefile
+++ b/examples/noise/Makefile
@@ -2,10 +2,7 @@
 CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -O3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse2,sse4,avx-x2 --arch=x86-64
-
-OBJS=objs/noise.o objs/noise_serial.o objs/noise_ispc.o objs/noise_ispc_sse2.o \
-	objs/noise_ispc_sse4.o objs/noise_ispc_avx.o 
+ISPCFLAGS=-O2 --target=sse4 --arch=x86-64

 default: noise

@@ -17,13 +14,13 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ noise

-noise: dirs $(OBJS)
-	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm
+noise: dirs objs/noise.o objs/noise_serial.o objs/noise_ispc.o
+	$(CXX) $(CXXFLAGS) -o $@ objs/noise.o objs/noise_ispc.o objs/noise_serial.o -lm

 objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@

 objs/noise.o: objs/noise_ispc.h 

-objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
+objs/%_ispc.h objs/%_ispc.o: %.ispc
 	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/noise/noise.cpp
+++ b/examples/noise/noise.cpp
@@ -41,6 +41,7 @@
 #include <stdio.h>
 #include <algorithm>
 #include "../timing.h"
+#include "../cpuid.h"
 #include "noise_ispc.h"
 using namespace ispc;

@@ -65,6 +66,38 @@ writePPM(float *buf, int width, int height, const char *fn) {
 }


+// Make sure that the vector ISA used during compilation is supported by
+// the processor.  The ISPC_TARGET_* macro is set in the ispc-generated
+// header file that we include above.
+static void
+ensureTargetISAIsSupported() {
+#if defined(ISPC_TARGET_SSE2)
+    bool isaSupported = CPUSupportsSSE2();
+    const char *target = "SSE2";
+#elif defined(ISPC_TARGET_SSE4)
+    bool isaSupported = CPUSupportsSSE4();
+    const char *target = "SSE4";
+#elif defined(ISPC_TARGET_AVX)
+    bool isaSupported = CPUSupportsAVX();
+    const char *target = "AVX";
+#else
+#error "Unknown ISPC_TARGET_* value"
+#endif
+    if (!isaSupported) {
+        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
+                "set, which isn't\n***        supported by this computer's CPU!\n", target);
+        fprintf(stderr, "***\n***        Please modify the "
+#ifdef _MSC_VER
+                "MSVC project file "
+#else
+                "Makefile "
+#endif
+                "to select another target (e.g. sse2)\n***\n");
+        exit(1);
+    }
+}
+
+
 int main() {
    unsigned int width = 768;
    unsigned int height = 768;
@@ -75,6 +108,8 @@ int main() {

    float *buf = new float[width*height];

+    ensureTargetISAIsSupported();
+
    //
    // Compute the image using the ispc implementation; report the minimum
    // time of three runs.
--- a/examples/noise/noise.vcxproj
+++ b/examples/noise/noise.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -64,19 +64,15 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -85,7 +81,6 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -101,7 +96,6 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -119,7 +113,6 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -138,7 +131,6 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -155,18 +147,18 @@
  <ItemGroup>
    <CustomBuild Include="noise.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
    </CustomBuild>
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
--- a/examples/options/Makefile
+++ b/examples/options/Makefile
@@ -1,17 +1,8 @@

-TASK_CXX=../tasksys.cpp
-TASK_LIB=-lpthread
-TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
-
-
 CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -g -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
-
-OBJS=objs/options.o objs/options_serial.o objs/options_ispc.o \
-	objs/options_ispc_sse2.o objs/options_ispc_sse4.o \
-	objs/options_ispc_avx.o $(TASK_OBJ)
+ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64

 default: options

@@ -23,16 +14,13 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ options

-options: dirs $(OBJS)
-	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
+options: dirs objs/options.o objs/options_serial.o objs/options_ispc.o
+	$(CXX) $(CXXFLAGS) -o $@ objs/options.o objs/options_ispc.o objs/options_serial.o -lm

 objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@

-objs/%.o: ../%.cpp
-	$(CXX) $< $(CXXFLAGS) -c -o $@
-
 objs/options.o: objs/options_ispc.h options_defs.h

-objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc options_defs.h
+objs/%_ispc.h objs/%_ispc.o: %.ispc options_defs.h
 	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/options/options.cpp
+++ b/examples/options/options.cpp
@@ -31,8 +31,6 @@
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */

-#define NOMINMAX
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -43,6 +41,7 @@ using std::max;

 #include "options_defs.h"
 #include "../timing.h"
+#include "../cpuid.h"

 #include "options_ispc.h"
 using namespace ispc;
@@ -55,32 +54,49 @@ extern void binomial_put_serial(float Sa[], float Xa[], float Ta[],
                                float ra[], float va[], 
                                float result[], int count);

-static void usage() {
-    printf("usage: options [--count=<num options>]\n");
+// Make sure that the vector ISA used during compilation is supported by
+// the processor.  The ISPC_TARGET_* macro is set in the ispc-generated
+// header file that we include above.
+static void
+ensureTargetISAIsSupported() {
+#if defined(ISPC_TARGET_SSE2)
+    bool isaSupported = CPUSupportsSSE2();
+    const char *target = "SSE2";
+#elif defined(ISPC_TARGET_SSE4)
+    bool isaSupported = CPUSupportsSSE4();
+    const char *target = "SSE4";
+#elif defined(ISPC_TARGET_AVX)
+    bool isaSupported = CPUSupportsAVX();
+    const char *target = "AVX";
+#else
+#error "Unknown ISPC_TARGET_* value"
+#endif
+    if (!isaSupported) {
+        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
+                "set, which isn't\n***        supported by this computer's CPU!\n", target);
+        fprintf(stderr, "***\n***        Please modify the "
+#ifdef _MSC_VER
+                "MSVC project file "
+#else
+                "Makefile "
+#endif
+                "to select another target (e.g. sse2)\n***\n");
+        exit(1);
+    }
 }


-int main(int argc, char *argv[]) {
-    int nOptions = 128*1024;
+int main() {
+    ensureTargetISAIsSupported();
+    
+    float *S = new float[N_OPTIONS];
+    float *X = new float[N_OPTIONS];
+    float *T = new float[N_OPTIONS];
+    float *r = new float[N_OPTIONS];
+    float *v = new float[N_OPTIONS];
+    float *result = new float[N_OPTIONS];

-    for (int i = 1; i < argc; ++i) {
-        if (strncmp(argv[i], "--count=", 8) == 0) {
-            nOptions = atoi(argv[i] + 8);
-            if (nOptions <= 0) {
-                usage();
-                exit(1);
-            }
-        }
-    }
-
-    float *S = new float[nOptions];
-    float *X = new float[nOptions];
-    float *T = new float[nOptions];
-    float *r = new float[nOptions];
-    float *v = new float[nOptions];
-    float *result = new float[nOptions];
-
-    for (int i = 0; i < nOptions; ++i) {
+    for (int i = 0; i < N_OPTIONS; ++i) {
        S[i] = 100;  // stock price
        X[i] = 98;   // option strike price
        T[i] = 2;    // time (years)
@@ -88,109 +104,61 @@ int main(int argc, char *argv[]) {
        v[i] = 5;    // volatility
    }

-    double sum;
-
    //
    // Binomial options pricing model, ispc implementation
    //
-    double binomial_ispc = 1e30;
-    for (int i = 0; i < 3; ++i) {
-        reset_and_start_timer();
-        binomial_put_ispc(S, X, T, r, v, result, nOptions);
-        double dt = get_elapsed_mcycles();
-        sum = 0.;
-        for (int i = 0; i < nOptions; ++i)
-            sum += result[i];
-        binomial_ispc = std::min(binomial_ispc, dt);
-    }
-    printf("[binomial ispc, 1 thread]:\t[%.3f] million cycles (avg %f)\n", 
-           binomial_ispc, sum / nOptions);
-
-    //
-    // Binomial options pricing model, ispc implementation, tasks
-    //
-    double binomial_tasks = 1e30;
-    for (int i = 0; i < 3; ++i) {
-        reset_and_start_timer();
-        binomial_put_ispc_tasks(S, X, T, r, v, result, nOptions);
-        double dt = get_elapsed_mcycles();
-        sum = 0.;
-        for (int i = 0; i < nOptions; ++i)
-            sum += result[i];
-        binomial_tasks = std::min(binomial_tasks, dt);
-    }
-    printf("[binomial ispc, tasks]:\t\t[%.3f] million cycles (avg %f)\n", 
-           binomial_tasks, sum / nOptions);
+    reset_and_start_timer();
+    binomial_put_ispc(S, X, T, r, v, result, N_OPTIONS);
+    double binomial_ispc = get_elapsed_mcycles();
+    float sum = 0.f;
+    for (int i = 0; i < N_OPTIONS; ++i)
+        sum += result[i];
+    printf("[binomial ispc]:\t\t[%.3f] million cycles (avg %f)\n", 
+           binomial_ispc, sum / N_OPTIONS);

    //
    // Binomial options, serial implementation
    //
-    double binomial_serial = 1e30;
-    for (int i = 0; i < 3; ++i) {
-        reset_and_start_timer();
-        binomial_put_serial(S, X, T, r, v, result, nOptions);
-        double dt = get_elapsed_mcycles();
-        sum = 0.;
-        for (int i = 0; i < nOptions; ++i)
-            sum += result[i];
-        binomial_serial = std::min(binomial_serial, dt);
-    }
+    reset_and_start_timer();
+    binomial_put_serial(S, X, T, r, v, result, N_OPTIONS);
+    double binomial_serial = get_elapsed_mcycles();
+    sum = 0.f;
+    for (int i = 0; i < N_OPTIONS; ++i)
+        sum += result[i];
    printf("[binomial serial]:\t\t[%.3f] million cycles (avg %f)\n", 
-           binomial_serial, sum / nOptions);
+           binomial_serial, sum / N_OPTIONS);

-    printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
-           binomial_serial / binomial_ispc, binomial_serial / binomial_tasks);
+    printf("\t\t\t\t(%.2fx speedup from ISPC)\n", binomial_serial / binomial_ispc);

    //
-    // Black-Scholes options pricing model, ispc implementation, 1 thread
+    // Black-Scholes options pricing model, ispc implementation
    //
-    double bs_ispc = 1e30;
-    for (int i = 0; i < 3; ++i) {
-        reset_and_start_timer();
-        black_scholes_ispc(S, X, T, r, v, result, nOptions);
-        double dt = get_elapsed_mcycles();
-        sum = 0.;
-        for (int i = 0; i < nOptions; ++i)
+    sum = 0.f;
+    reset_and_start_timer();
+    for (int a = 0; a < N_BLACK_SCHOLES_ROUNDS; ++a) {
+        black_scholes_ispc(S, X, T, r, v, result, N_OPTIONS);
+        for (int i = 0; i < N_OPTIONS; ++i)
            sum += result[i];
-        bs_ispc = std::min(bs_ispc, dt);
    }
-    printf("[black-scholes ispc, 1 thread]:\t[%.3f] million cycles (avg %f)\n", 
-           bs_ispc, sum / nOptions);
-
-    //
-    // Black-Scholes options pricing model, ispc implementation, tasks
-    //
-    double bs_ispc_tasks = 1e30;
-    for (int i = 0; i < 3; ++i) {
-        reset_and_start_timer();
-        black_scholes_ispc_tasks(S, X, T, r, v, result, nOptions);
-        double dt = get_elapsed_mcycles();
-        sum = 0.;
-        for (int i = 0; i < nOptions; ++i)
-            sum += result[i];
-        bs_ispc_tasks = std::min(bs_ispc_tasks, dt);
-    }
-    printf("[black-scholes ispc, tasks]:\t[%.3f] million cycles (avg %f)\n", 
-           bs_ispc_tasks, sum / nOptions);
+    double bs_ispc = get_elapsed_mcycles();
+    printf("[black-scholes ispc]:\t\t[%.3f] million cycles (avg %f)\n", 
+           bs_ispc, sum / (N_BLACK_SCHOLES_ROUNDS * N_OPTIONS));

    //
    // Black-Scholes options pricing model, serial implementation
    //
-    double bs_serial = 1e30;
-    for (int i = 0; i < 3; ++i) {
-        reset_and_start_timer();
-        black_scholes_serial(S, X, T, r, v, result, nOptions);
-        double dt = get_elapsed_mcycles();
-        sum = 0.;
-        for (int i = 0; i < nOptions; ++i)
+    sum = 0.f;
+    reset_and_start_timer();
+    for (int a = 0; a < N_BLACK_SCHOLES_ROUNDS; ++a) {
+        black_scholes_serial(S, X, T, r, v, result, N_OPTIONS);
+        for (int i = 0; i < N_OPTIONS; ++i)
            sum += result[i];
-        bs_serial = std::min(bs_serial, dt);
    }
+    double bs_serial = get_elapsed_mcycles();
    printf("[black-scholes serial]:\t\t[%.3f] million cycles (avg %f)\n", bs_serial, 
-           sum / nOptions);
+           sum / (N_BLACK_SCHOLES_ROUNDS * N_OPTIONS));

-    printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", 
-           bs_serial / bs_ispc, bs_serial / bs_ispc_tasks);
+    printf("\t\t\t\t(%.2fx speedup from ISPC)\n", bs_serial / bs_ispc);

    return 0;
 }
--- a/examples/options/options.ispc
+++ b/examples/options/options.ispc
@@ -55,100 +55,49 @@ CND(float X) {
    return w;
 }

-task void
-bs_task(uniform float Sa[], uniform float Xa[], uniform float Ta[],
-        uniform float ra[], uniform float va[], 
-        uniform float result[], uniform int count) {
-    uniform int first = taskIndex * (count/taskCount);
-    uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount)));
-
-    foreach (i = first ... last) {
-        float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
-
-        float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T));
-        float d2 = d1 - v * sqrt(T);
-
-        result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2);
-    }
-}
-
-export void
-black_scholes_ispc_tasks(uniform float Sa[], uniform float Xa[], uniform float Ta[],
-                         uniform float ra[], uniform float va[], 
-                         uniform float result[], uniform int count) {
-    uniform int nTasks = max((int)64, (int)count/16384);
-    launch[nTasks] < bs_task(Sa, Xa, Ta, ra, va, result, count) >;
-}
-
-
 export void
 black_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[],
                   uniform float ra[], uniform float va[], 
                   uniform float result[], uniform int count) {
-    foreach (i = 0 ... count) {
-        float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
+    for (uniform int i = 0; i < count; i += programCount) {
+        float S = Sa[i + programIndex], X = Xa[i + programIndex];
+        float T = Ta[i + programIndex], r = ra[i + programIndex];
+        float v = va[i + programIndex];

        float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T));
        float d2 = d1 - v * sqrt(T);

-        result[i] = S * CND(d1) - X * exp(-r * T) * CND(d2);
+        result[i + programIndex] = S * CND(d1) - X * exp(-r * T) * CND(d2);
    }
 }


-static inline float
-binomial_put(float S, float X, float T, float r, float v) {
-    float V[BINOMIAL_NUM];
-
-    float dt = T / BINOMIAL_NUM;
-    float u = exp(v * sqrt(dt));
-    float d = 1. / u;
-    float disc = exp(r * dt);
-    float Pu = (disc - d) / (u - d);
-
-    for (uniform int j = 0; j < BINOMIAL_NUM; ++j) {
-        float upow = pow(u, (float)(2*j-BINOMIAL_NUM));
-        V[j] = max(0., X - S * upow);
-    }
-
-    for (uniform int j = BINOMIAL_NUM-1; j >= 0; --j)
-        for (uniform int k = 0; k < j; ++k)
-            V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc;
-    return V[0];
-}
-
-
 export void
 binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[], 
                  uniform float ra[], uniform float va[], 
                  uniform float result[], uniform int count) {
-    foreach (i = 0 ... count) {
-        float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
-        result[i] = binomial_put(S, X, T, r, v);
+    float V[BINOMIAL_NUM];
+
+    for (uniform int i = 0; i < count; i += programCount) {
+        float S = Sa[i + programIndex], X = Xa[i + programIndex];
+        float T = Ta[i + programIndex], r = ra[i + programIndex];
+        float v = va[i + programIndex];
+
+        float dt = T / BINOMIAL_NUM;
+        float u = exp(v * sqrt(dt));
+        float d = 1. / u;
+        float disc = exp(r * dt);
+        float Pu = (disc - d) / (u - d);
+
+        for (uniform int j = 0; j < BINOMIAL_NUM; ++j) {
+            float upow = pow(u, (float)(2*j-BINOMIAL_NUM));
+            V[j] = max(0., X - S * upow);
+        }
+
+        for (uniform int j = BINOMIAL_NUM-1; j >= 0; --j)
+            for (uniform int k = 0; k < j; ++k)
+                V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc;
+
+        result[i + programIndex] = V[0];
    }
 }
-
-
-task void
-binomial_task(uniform float Sa[], uniform float Xa[], 
-              uniform float Ta[], uniform float ra[], 
-              uniform float va[], uniform float result[], 
-              uniform int count) {
-    uniform int first = taskIndex * (count/taskCount);
-    uniform int last = min(count, (int)((taskIndex+1) * (count/taskCount)));
-
-    foreach (i = first ... last) {
-        float S = Sa[i], X = Xa[i], T = Ta[i], r = ra[i], v = va[i];
-        result[i] = binomial_put(S, X, T, r, v);
-    }
-}
-
-
-export void
-binomial_put_ispc_tasks(uniform float Sa[], uniform float Xa[], 
-                        uniform float Ta[], uniform float ra[], 
-                        uniform float va[], uniform float result[], 
-                        uniform int count) {
-    uniform int nTasks = max((int)64, (int)count/16384);
-    launch[nTasks] < binomial_task(Sa, Xa, Ta, ra, va, result, count) >;
-}
--- a/examples/options/options.vcxproj
+++ b/examples/options/options.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -64,19 +64,15 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -85,7 +81,6 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <DisableSpecificWarnings>4305</DisableSpecificWarnings>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
@@ -102,7 +97,6 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <DisableSpecificWarnings>4305</DisableSpecificWarnings>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
@@ -121,7 +115,6 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <DisableSpecificWarnings>4305</DisableSpecificWarnings>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -141,7 +134,6 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <DisableSpecificWarnings>4305</DisableSpecificWarnings>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -155,23 +147,22 @@
  <ItemGroup>
    <ClCompile Include="options.cpp" />
    <ClCompile Include="options_serial.cpp" />
-    <ClCompile Include="../tasksys.cpp" />
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="options.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
--- a/examples/options/options_defs.h
+++ b/examples/options/options_defs.h
@@ -35,6 +35,8 @@
 #define OPTIONS_DEFS_H 1

 #define BINOMIAL_NUM 64
+#define N_OPTIONS 65536
+#define N_BLACK_SCHOLES_ROUNDS 20


 #endif // OPTIONS_DEFS_H
--- a/examples/rt/Makefile
+++ b/examples/rt/Makefile
@@ -8,10 +8,7 @@ TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
 CXX=g++
 CXXFLAGS=-Iobjs/ -O3 -Wall -m64
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx --arch=x86-64
-
-OBJS=objs/rt.o objs/rt_serial.o $(TASK_OBJ) objs/rt_ispc.o objs/rt_ispc_sse2.o \
-	objs/rt_ispc_sse4.o objs/rt_ispc_avx.o
+ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64

 default: rt

@@ -23,8 +20,8 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ rt

-rt: dirs $(OBJS)
-	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
+rt: dirs objs/rt.o objs/rt_serial.o objs/rt_ispc.o $(TASK_OBJ)
+	$(CXX) $(CXXFLAGS) -o $@ objs/rt.o objs/rt_ispc.o objs/rt_serial.o $(TASK_OBJ) -lm $(TASK_LIB)

 objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@
@@ -34,5 +31,5 @@ objs/%.o: ../%.cpp

 objs/rt.o: objs/rt_ispc.h 

-objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
+objs/%_ispc.h objs/%_ispc.o: %.ispc
 	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/rt/rt.cpp
+++ b/examples/rt/rt.cpp
@@ -45,6 +45,7 @@
 #include <string.h>
 #include <sys/types.h>
 #include "../timing.h"
+#include "../cpuid.h"
 #include "rt_ispc.h"

 using namespace ispc;
@@ -95,6 +96,38 @@ static void writeImage(int *idImage, float *depthImage, int width, int height,
 }


+// Make sure that the vector ISA used during compilation is supported by
+// the processor.  The ISPC_TARGET_* macro is set in the ispc-generated
+// header file that we include above.
+static void
+ensureTargetISAIsSupported() {
+#if defined(ISPC_TARGET_SSE2)
+    bool isaSupported = CPUSupportsSSE2();
+    const char *target = "SSE2";
+#elif defined(ISPC_TARGET_SSE4)
+    bool isaSupported = CPUSupportsSSE4();
+    const char *target = "SSE4";
+#elif defined(ISPC_TARGET_AVX)
+    bool isaSupported = CPUSupportsAVX();
+    const char *target = "AVX";
+#else
+#error "Unknown ISPC_TARGET_* value"
+#endif
+    if (!isaSupported) {
+        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
+                "set, which isn't\n***        supported by this computer's CPU!\n", target);
+        fprintf(stderr, "***\n***        Please modify the "
+#ifdef _MSC_VER
+                "MSVC project file "
+#else
+                "Makefile "
+#endif
+                "to select another target (e.g. sse2)\n***\n");
+        exit(1);
+    }
+}
+
+
 static void usage() {
    fprintf(stderr, "rt [--scale=<factor>] <scene name base>\n");
    exit(1);
@@ -118,6 +151,8 @@ int main(int argc, char *argv[]) {
    if (filename == NULL)
        usage();

+    ensureTargetISAIsSupported();
+
 #define READ(var, n)                                            \
    if (fread(&(var), sizeof(var), n, f) != (unsigned int)n) {  \
        fprintf(stderr, "Unexpected EOF reading scene file\n"); \
@@ -168,12 +203,12 @@ int main(int argc, char *argv[]) {
        // of node, the total number of int it if a leaf node, etc.
        float b[6];
        READ(b[0], 6);
-        nodes[i].bounds[0][0] = b[0];
-        nodes[i].bounds[0][1] = b[1];
-        nodes[i].bounds[0][2] = b[2];
-        nodes[i].bounds[1][0] = b[3];
-        nodes[i].bounds[1][1] = b[4];
-        nodes[i].bounds[1][2] = b[5];
+        nodes[i].bounds[0].v[0] = b[0];
+        nodes[i].bounds[0].v[1] = b[1];
+        nodes[i].bounds[0].v[2] = b[2];
+        nodes[i].bounds[1].v[0] = b[3];
+        nodes[i].bounds[1].v[1] = b[4];
+        nodes[i].bounds[1].v[2] = b[5];
        READ(nodes[i].offset, 1);
        READ(nodes[i].nPrimitives, 1);
        READ(nodes[i].splitAxis, 1);
@@ -190,17 +225,19 @@ int main(int argc, char *argv[]) {
        READ(v[0], 9);
        float *vp = v;
        for (int j = 0; j < 3; ++j) {
-            triangles[i].p[j][0] = *vp++;
-            triangles[i].p[j][1] = *vp++;
-            triangles[i].p[j][2] = *vp++;
+            triangles[i].p[j].v[0] = *vp++;
+            triangles[i].p[j].v[1] = *vp++;
+            triangles[i].p[j].v[2] = *vp++;
        }
        // And create an object id
        triangles[i].id = i+1;
    }
    fclose(f);

-    int height = int(baseHeight * scale);
-    int width = int(baseWidth * scale);
+    // round image resolution up to multiple of 16 to make things easy for
+    // the code that assigns pixels to ispc program instances
+    int height = (int(baseHeight * scale) + 0xf) & ~0xf;
+    int width = (int(baseWidth * scale) + 0xf) & ~0xf;

    // allocate images; one to hold hit object ids, one to hold depth to
    // the first interseciton
--- a/examples/rt/rt.ispc
+++ b/examples/rt/rt.ispc
@@ -43,13 +43,12 @@ struct Ray {
 };

 struct Triangle {
-    uniform float p[3][4];
+    uniform float3 p[3];
    uniform int id;
-    uniform int pad[3];
 };

 struct LinearBVHNode {
-    uniform float bounds[2][3];
+    uniform float3 bounds[2];
    uniform unsigned int offset;     // num primitives for leaf, second child for interior
    uniform unsigned int8 nPrimitives;
    uniform unsigned int8 splitAxis;
@@ -73,7 +72,7 @@ static inline float Dot(const float3 a, const float3 b) {

 static void generateRay(uniform const float raster2camera[4][4], 
                        uniform const float camera2world[4][4],
-                        float x, float y, Ray &ray) {
+                        float x, float y, reference Ray ray) {
    ray.mint = 0.f;
    ray.maxt = 1e30f;

@@ -104,16 +103,14 @@ static void generateRay(uniform const float raster2camera[4][4],
 }


-static inline bool BBoxIntersect(const uniform float bounds[2][3], 
-                                 const Ray &ray) {
-    uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
-    uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
+static inline bool BBoxIntersect(const reference uniform float3 bounds[2], 
+                                 const reference Ray ray) {
    float t0 = ray.mint, t1 = ray.maxt;

    // Check all three axis-aligned slabs.  Don't try to early out; it's
    // not worth the trouble
-    float3 tNear = (bounds0 - ray.origin) * ray.invDir;
-    float3 tFar  = (bounds1 - ray.origin) * ray.invDir;
+    float3 tNear = (bounds[0] - ray.origin) * ray.invDir;
+    float3 tFar  = (bounds[1] - ray.origin) * ray.invDir;
    if (tNear.x > tFar.x) {
        float tmp = tNear.x;
        tNear.x = tFar.x;
@@ -143,12 +140,9 @@ static inline bool BBoxIntersect(const uniform float bounds[2][3],



-static inline bool TriIntersect(const Triangle &tri, Ray &ray) {
-    uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
-    uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
-    uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
-    uniform float3 e1 = p1 - p0;
-    uniform float3 e2 = p2 - p0;
+static inline bool TriIntersect(const reference Triangle tri, reference Ray ray) {
+    uniform float3 e1 = tri.p[1] - tri.p[0];
+    uniform float3 e2 = tri.p[2] - tri.p[0];

    float3 s1 = Cross(ray.dir, e2);
    float divisor = Dot(s1, e1);
@@ -159,7 +153,7 @@ static inline bool TriIntersect(const Triangle &tri, Ray &ray) {
    float invDivisor = 1.f / divisor;

    // Compute first barycentric coordinate
-    float3 d = ray.origin - p0;
+    float3 d = ray.origin - tri.p[0];
    float b1 = Dot(d, s1) * invDivisor;
    if (b1 < 0. || b1 > 1.)
        hit = false;
@@ -184,7 +178,7 @@ static inline bool TriIntersect(const Triangle &tri, Ray &ray) {


 bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[], 
-                  Ray &r) {
+                  reference Ray r) {
    Ray ray = r;
    bool hit = false;
    // Follow ray through BVH nodes to find primitive intersections
@@ -244,15 +238,34 @@ static void raytrace_tile(uniform int x0, uniform int x1,
    uniform float widthScale = (float)(baseWidth) / (float)(width);
    uniform float heightScale = (float)(baseHeight) / (float)(height);

-    foreach_tiled (y = y0 ... y1, x = x0 ... x1) {
-        Ray ray;
-        generateRay(raster2camera, camera2world, x*widthScale,
-                    y*heightScale, ray);
-        BVHIntersect(nodes, triangles, ray);
+    static const uniform float udx[16] = { 0, 1, 0, 1, 2, 3, 2, 3, 
+                                           0, 1, 0, 1, 2, 3, 2, 3 };
+    static const uniform float udy[16] = { 0, 0, 1, 1, 0, 0, 1, 1, 
+                                           2, 2, 3, 3, 2, 2, 3, 3 };

-        int offset = y * width + x;
-        image[offset] = ray.maxt;
-        id[offset] = ray.hitId;
+    // The outer loops are always over blocks of 4x4 pixels
+    for (uniform int y = y0; y < y1; y += 4) {
+        for (uniform int x = x0; x < x1; x += 4) {
+            // Now we have a block of 4x4=16 pixels to process; it will
+            // take 16/programCount iterations of this loop to process
+            // them.
+            for (uniform int o = 0; o < 16 / programCount; ++o) {
+                // Map program instances to samples in the udx/udy arrays
+                // to figure out which pixel each program instance is
+                // responsible for
+                const float dx = udx[o * programCount + programIndex];
+                const float dy = udy[o * programCount + programIndex];
+
+                Ray ray;
+                generateRay(raster2camera, camera2world, (x+dx)*widthScale,
+                            (y+dy)*heightScale, ray);
+                BVHIntersect(nodes, triangles, ray);
+
+                int offset = (y + (int)dy) * width + (x + (int)dx);
+                image[offset] = ray.maxt;
+                id[offset] = ray.hitId;
+            }
+        }
    }
 }

@@ -270,19 +283,19 @@ export void raytrace_ispc(uniform int width, uniform int height,
 }


-task void raytrace_tile_task(uniform int width, uniform int height,
+task void raytrace_tile_task(uniform int y0, uniform int y1, 
+                             uniform int width, uniform int height,
                             uniform int baseWidth, uniform int baseHeight,
                             const uniform float raster2camera[4][4], 
                             const uniform float camera2world[4][4],
                             uniform float image[], uniform int id[],
                             const LinearBVHNode nodes[],
                             const Triangle triangles[]) {
-    uniform int dx = 16, dy = 16; // must match dx, dy below
-    uniform int xBuckets = (width + (dx-1)) / dx;
-    uniform int x0 = (taskIndex % xBuckets) * dx;
-    uniform int x1 = min(x0 + dx, width);
-    uniform int y0 = (taskIndex / xBuckets) * dy;
-    uniform int y1 = min(y0 + dy, height);
+    uniform int dx = 16; // must match dx below
+    uniform int xTasks = (width + (dx-1)) / dx;
+    uniform int x0 = (taskIndex % xTasks) * dx;
+    uniform int x1 = x0 + dx;
+    x1 = min(x1, width);
                             
    raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight, 
                  raster2camera, camera2world, image,
@@ -298,11 +311,11 @@ export void raytrace_ispc_tasks(uniform int width, uniform int height,
                                const LinearBVHNode nodes[],
                                const Triangle triangles[]) {
    uniform int dx = 16, dy = 16;
-    uniform int xBuckets = (width + (dx-1)) / dx;
-    uniform int yBuckets = (height + (dy-1)) / dy;
-    uniform int nTasks = xBuckets * yBuckets;
-    launch[nTasks] < raytrace_tile_task(width, height, baseWidth, baseHeight, 
-                                        raster2camera, camera2world, 
-                                        image, id, nodes, triangles) >;
+    uniform int nTasks = (width + (dx-1)) / dx;
+    for (uniform int y = 0; y < height; y += dy) {
+        uniform int y1 = min(y + dy, height);
+        launch[nTasks] < raytrace_tile_task(y, y1, width, height, baseWidth,
+                                            baseHeight, raster2camera, camera2world, 
+                                            image, id, nodes, triangles) >;
+    }
 }
-
--- a/examples/rt/rt.vcxproj
+++ b/examples/rt/rt.vcxproj
@@ -64,19 +64,15 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -85,7 +81,6 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -101,7 +96,6 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -119,7 +113,6 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -138,7 +131,6 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -152,21 +144,21 @@
    <CustomBuild Include="rt.ispc">
      <FileType>Document</FileType>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
 </Command>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
 </Command>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
--- a/examples/rt/rt_serial.cpp
+++ b/examples/rt/rt_serial.cpp
@@ -75,13 +75,12 @@ struct Ray {
 // Declare these in a namespace so the mangling matches
 namespace ispc {
    struct Triangle {
-        float p[3][4]; // extra float pad after each vertex
+        float3 p[3];
        int32_t id;
-        int32_t pad[3]; // make 16 x 32-bits
    };

    struct LinearBVHNode {
-        float bounds[2][3];
+        float3 bounds[2];
        int32_t offset;     // primitives for leaf, second child for interior
        uint8_t nPrimitives;
        uint8_t splitAxis;
@@ -141,14 +140,12 @@ static void generateRay(const float raster2camera[4][4],
 }


-static inline bool BBoxIntersect(const float bounds[2][3], 
+static inline bool BBoxIntersect(const float3 bounds[2], 
                                 const Ray &ray) {
-    float3 bounds0(bounds[0][0], bounds[0][1], bounds[0][2]);
-    float3 bounds1(bounds[1][0], bounds[1][1], bounds[1][2]);
    float t0 = ray.mint, t1 = ray.maxt;

-    float3 tNear = (bounds0 - ray.origin) * ray.invDir;
-    float3 tFar  = (bounds1 - ray.origin) * ray.invDir;
+    float3 tNear = (bounds[0] - ray.origin) * ray.invDir;
+    float3 tFar  = (bounds[1] - ray.origin) * ray.invDir;
    if (tNear.x > tFar.x) {
        float tmp = tNear.x;
        tNear.x = tFar.x;
@@ -179,11 +176,8 @@ static inline bool BBoxIntersect(const float bounds[2][3],


 inline bool TriIntersect(const Triangle &tri, Ray &ray) {
-    float3 p0(tri.p[0][0], tri.p[0][1], tri.p[0][2]);
-    float3 p1(tri.p[1][0], tri.p[1][1], tri.p[1][2]);
-    float3 p2(tri.p[2][0], tri.p[2][1], tri.p[2][2]);
-    float3 e1 = p1 - p0;
-    float3 e2 = p2 - p0;
+    float3 e1 = tri.p[1] - tri.p[0];
+    float3 e2 = tri.p[2] - tri.p[0];

    float3 s1 = Cross(ray.dir, e2);
    float divisor = Dot(s1, e1);
@@ -193,7 +187,7 @@ inline bool TriIntersect(const Triangle &tri, Ray &ray) {
    float invDivisor = 1.f / divisor;

    // Compute first barycentric coordinate
-    float3 d = ray.origin - p0;
+    float3 d = ray.origin - tri.p[0];
    float b1 = Dot(d, s1) * invDivisor;
    if (b1 < 0. || b1 > 1.)
        return false;
--- a/examples/simple/Makefile
+++ b/examples/simple/Makefile
@@ -2,7 +2,7 @@
 CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -O3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --arch=x86-64 --target=sse2
+ISPCFLAGS=-O2 --arch=x86-64

 default: simple

--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -33,12 +33,47 @@

 #include <stdio.h>
 #include <stdlib.h>
+#include "../cpuid.h"

 // Include the header file that the ispc compiler generates
 #include "simple_ispc.h"
 using namespace ispc;

+// Verify that the CPU supports the vector ISA the ispc code was compiled
+// for (selected by the ISPC_TARGET_* macro from the ispc-generated header
+// included above); otherwise print an error to stderr and exit(1).
+static void
+ensureTargetISAIsSupported() {
+#if defined(ISPC_TARGET_SSE2)
+    bool isaSupported = CPUSupportsSSE2();
+    const char *target = "SSE2";
+#elif defined(ISPC_TARGET_SSE4)
+    bool isaSupported = CPUSupportsSSE4();
+    const char *target = "SSE4";
+#elif defined(ISPC_TARGET_AVX)
+    bool isaSupported = CPUSupportsAVX();
+    const char *target = "AVX";
+#else  // none of the known ISPC_TARGET_* macros is defined
+#error "Unknown ISPC_TARGET_* value"
+#endif
+    if (!isaSupported) {
+        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
+                "set, which isn't\n***        supported by this computer's CPU!\n", target);
+        fprintf(stderr, "***\n***        Please modify the "  // build-file name is picked below via _MSC_VER
+#ifdef _MSC_VER
+                "MSVC project file "
+#else
+                "Makefile "
+#endif
+                "to select another target (e.g. sse2)\n***\n");
+        exit(1);  // terminate with a non-zero status
+    }
+}
+
+
 int main() {
+    ensureTargetISAIsSupported();
+
    float vin[16], vout[16];

    // Initialize input buffer
--- a/examples/simple/simple.ispc
+++ b/examples/simple/simple.ispc
@@ -34,7 +34,9 @@

 export void simple(uniform float vin[], uniform float vout[], 
                   uniform int count) {
-    foreach (index = 0 ... count) {
+    // Compute the result for 'programCount' values in parallel
+    for (uniform int i = 0; i < count; i += programCount) {
+        int index = i + programIndex;
        // Load the appropriate input value for this program instance.
        float v = vin[index];

--- a/examples/simple/simple.vcxproj
+++ b/examples/simple/simple.vcxproj
@@ -25,21 +25,21 @@
    <CustomBuild Include="simple.ispc">
      <FileType>Document</FileType>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
 </Command>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
 </Command>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
    </CustomBuild>
  </ItemGroup>
  <PropertyGroup Label="Globals">
@@ -88,19 +88,15 @@ ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filena
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -109,7 +105,6 @@ ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filena
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -123,7 +118,6 @@ ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filena
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -139,7 +133,6 @@ ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filena
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -157,7 +150,6 @@ ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filena
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -169,4 +161,4 @@ ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filena
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
-</Project>
+</Project>
--- a/examples/stencil/Makefile
+++ b/examples/stencil/Makefile
@@ -8,11 +8,7 @@ TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
 CXX=g++
 CXXFLAGS=-Iobjs/ -O3 -Wall -m64
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx --arch=x86-64
-
-OBJS=objs/stencil.o objs/stencil_serial.o $(TASK_OBJ) objs/stencil_ispc.o \
-	objs/stencil_ispc_sse2.o objs/stencil_ispc_sse4.o \
-	objs/stencil_ispc_avx.o
+ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64

 default: stencil

@@ -24,8 +20,8 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ stencil

-stencil: dirs $(OBJS)
-	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
+stencil: dirs objs/stencil.o objs/stencil_serial.o objs/stencil_ispc.o $(TASK_OBJ)
+	$(CXX) $(CXXFLAGS) -o $@ objs/stencil.o objs/stencil_ispc.o objs/stencil_serial.o $(TASK_OBJ) -lm $(TASK_LIB)

 objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@
@@ -35,5 +31,5 @@ objs/%.o: ../%.cpp

 objs/stencil.o: objs/stencil_ispc.h 

-objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
+objs/%_ispc.h objs/%_ispc.o: %.ispc
 	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/stencil/stencil.cpp
+++ b/examples/stencil/stencil.cpp
@@ -42,10 +42,43 @@
 #include <algorithm>
 #include <math.h>
 #include "../timing.h"
+#include "../cpuid.h"
 #include "stencil_ispc.h"
 using namespace ispc;


+// Verify that the CPU supports the vector ISA the ispc code was compiled
+// for (selected by the ISPC_TARGET_* macro from the ispc-generated header
+// included above); otherwise print an error to stderr and exit(1).
+static void
+ensureTargetISAIsSupported() {
+#if defined(ISPC_TARGET_SSE2)
+    bool isaSupported = CPUSupportsSSE2();
+    const char *target = "SSE2";
+#elif defined(ISPC_TARGET_SSE4)
+    bool isaSupported = CPUSupportsSSE4();
+    const char *target = "SSE4";
+#elif defined(ISPC_TARGET_AVX)
+    bool isaSupported = CPUSupportsAVX();
+    const char *target = "AVX";
+#else  // none of the known ISPC_TARGET_* macros is defined
+#error "Unknown ISPC_TARGET_* value"
+#endif
+    if (!isaSupported) {
+        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
+                "set, which isn't\n***        supported by this computer's CPU!\n", target);
+        fprintf(stderr, "***\n***        Please modify the "  // build-file name is picked below via _MSC_VER
+#ifdef _MSC_VER
+                "MSVC project file "
+#else
+                "Makefile "
+#endif
+                "to select another target (e.g. sse2)\n***\n");
+        exit(1);  // terminate with a non-zero status
+    }
+}
+
+
 extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
                                int y0, int y1, int z0, int z1,
                                int Nx, int Ny, int Nz,
@@ -67,6 +100,8 @@ void InitData(int Nx, int Ny, int Nz, float *A[2], float *vsq) {


 int main() {
+    ensureTargetISAIsSupported();
+
    int Nx = 256, Ny = 256, Nz = 256;
    int width = 4;
    float *Aserial[2], *Aispc[2];
--- a/examples/stencil/stencil.ispc
+++ b/examples/stencil/stencil.ispc
@@ -43,8 +43,9 @@ stencil_step(uniform int x0, uniform int x1,

    for (uniform int z = z0; z < z1; ++z) {
        for (uniform int y = y0; y < y1; ++y) {
-            foreach (x = x0 ... x1) {
-                int index = (z * Nxy) + (y * Nx) + x;
+            // Assumes that (x1-x0) % programCount == 0
+            for (uniform int x = x0; x < x1; x += programCount) {
+                int index = (z * Nxy) + (y * Nx) + x + programIndex;
 #define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
 #define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
                float div = coef[0] * A_cur(0, 0, 0) +
--- a/examples/stencil/stencil.vcxproj
+++ b/examples/stencil/stencil.vcxproj
@@ -64,19 +64,15 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -85,7 +81,6 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -101,7 +96,6 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -119,7 +113,6 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -138,7 +131,6 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -152,21 +144,21 @@
    <CustomBuild Include="stencil.ispc">
      <FileType>Document</FileType>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
 </Command>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
 </Command>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
+ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
--- a/examples/tasksys.cpp
+++ b/examples/tasksys.cpp
@@ -53,7 +53,9 @@
  #define ISPC_USE_PTHREADS
 #elif defined(__APPLE__)
  #define ISPC_IS_APPLE
-  #define ISPC_USE_GCD
+  // pthreads is noticeably more efficient than GCD on OSX
+  #define ISPC_USE_PTHREADS
+  //#define ISPC_USE_GCD
 #endif

 #define DBG(x) 
@@ -110,7 +112,7 @@ struct TaskInfo {
 ///////////////////////////////////////////////////////////////////////////
 // TaskGroupBase

-#define LOG_TASK_QUEUE_CHUNK_SIZE 14
+#define LOG_TASK_QUEUE_CHUNK_SIZE 12
 #define MAX_TASK_QUEUE_CHUNKS 8
 #define TASK_QUEUE_CHUNK_SIZE (1<<LOG_TASK_QUEUE_CHUNK_SIZE)

@@ -157,6 +159,7 @@ private:
    int memBufferSize[NUM_MEM_BUFFERS];
    char *memBuffers[NUM_MEM_BUFFERS];
    char mem[256];
+
 };


--- a/examples/volume_rendering/Makefile
+++ b/examples/volume_rendering/Makefile
@@ -8,10 +8,7 @@ TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
 CXX=g++
 CXXFLAGS=-Iobjs/ -O3 -Wall -m64
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse2,sse4-x2 --arch=x86-64
-
-OBJS=objs/volume.o objs/volume_serial.o $(TASK_OBJ) objs/volume_ispc.o \
-	objs/volume_ispc_sse2.o objs/volume_ispc_sse4.o
+ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64

 default: volume

@@ -23,8 +20,8 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ volume

-volume: dirs $(OBJS)
-	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)
+volume: dirs objs/volume.o objs/volume_serial.o objs/volume_ispc.o $(TASK_OBJ)
+	$(CXX) $(CXXFLAGS) -o $@ objs/volume.o objs/volume_ispc.o objs/volume_serial.o $(TASK_OBJ) -lm $(TASK_LIB)

 objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@
@@ -34,5 +31,5 @@ objs/%.o: ../%.cpp

 objs/volume.o: objs/volume_ispc.h 

-objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o: %.ispc
+objs/%_ispc.h objs/%_ispc.o: %.ispc
 	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/volume_rendering/volume.cpp
+++ b/examples/volume_rendering/volume.cpp
@@ -41,6 +41,7 @@
 #include <stdio.h>
 #include <algorithm>
 #include "../timing.h"
+#include "../cpuid.h"
 #include "volume_ispc.h"
 using namespace ispc;

@@ -69,6 +70,37 @@ writePPM(float *buf, int width, int height, const char *fn) {
 }


+// Verify that the CPU supports the vector ISA the ispc code was compiled
+// for (selected by the ISPC_TARGET_* macro from the ispc-generated header
+// included above); otherwise print an error to stderr and exit(1).
+static void
+ensureTargetISAIsSupported() {
+#if defined(ISPC_TARGET_SSE2)
+    bool isaSupported = CPUSupportsSSE2();
+    const char *target = "SSE2";
+#elif defined(ISPC_TARGET_SSE4)
+    bool isaSupported = CPUSupportsSSE4();
+    const char *target = "SSE4";
+#elif defined(ISPC_TARGET_AVX)
+    bool isaSupported = CPUSupportsAVX();
+    const char *target = "AVX";
+#else  // none of the known ISPC_TARGET_* macros is defined
+#error "Unknown ISPC_TARGET_* value"
+#endif
+    if (!isaSupported) {
+        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
+                "set, which isn't\n***        supported by this computer's CPU!\n", target);
+        fprintf(stderr, "***\n***        Please modify the "  // build-file name is picked below via _MSC_VER
+#ifdef _MSC_VER
+                "MSVC project file "
+#else
+                "Makefile "
+#endif
+                "to select another target (e.g. sse2)\n***\n");
+        exit(1);  // terminate with a non-zero status
+    }
+}
+
 /* Load image and viewing parameters from a camera data file.
   FIXME: we should add support to be able to specify viewing parameters
   in the program here directly. */
@@ -140,6 +172,8 @@ int main(int argc, char *argv[]) {
        return 1;
    }

+    ensureTargetISAIsSupported();
+
    //
    // Load viewing data and the volume density data
    //
--- a/examples/volume_rendering/volume.ispc
+++ b/examples/volume_rendering/volume.ispc
@@ -41,7 +41,7 @@ struct Ray {
 static void
 generateRay(const uniform float raster2camera[4][4], 
            const uniform float camera2world[4][4],
-            float x, float y, Ray &ray) {
+            float x, float y, reference Ray ray) {
    // transform raster coordinate (x, y, 0) to camera space
    float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3];
    float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3];
@@ -70,7 +70,7 @@ Inside(float3 p, float3 pMin, float3 pMax) {


 static bool
-IntersectP(Ray ray, float3 pMin, float3 pMax, float &hit0, float &hit1) {
+IntersectP(Ray ray, float3 pMin, float3 pMax, reference float hit0, reference float hit1) {
    float t0 = -1e30, t1 = 1e30;

    float3 tNear = (pMin - ray.origin) / ray.dir;
@@ -141,7 +141,7 @@ static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {

 static inline float Density(float3 Pobj, float3 pMin, float3 pMax, 
                            uniform float density[], uniform int nVoxels[3],
-                            uniform bool &checkForSameVoxel) {
+                            reference uniform bool checkForSameVoxel) {
    if (!Inside(Pobj, pMin, pMax)) 
        return 0;
    // Compute voxel coordinates and offsets for _Pobj_
@@ -155,8 +155,8 @@ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
    // Trilinearly interpolate density values to compute local density
    float d00, d10, d01, d11;
    uniform int uvx, uvy, uvz;
-    if (checkForSameVoxel && reduce_equal(vx, &uvx) && reduce_equal(vy, &uvy) &&
-        reduce_equal(vz, &uvz)) {
+    if (checkForSameVoxel && reduce_equal(vx, uvx) && reduce_equal(vy, uvy) &&
+        reduce_equal(vz, uvz)) {
        // If all of the program instances are inside the same voxel, then
        // we'll call the 'uniform' variant of the voxel density lookup
        // function, thus doing a single load for each value rather than a
@@ -310,7 +310,11 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,
    // by 4.
    for (uniform int y = y0; y < y1; y += 4) {
        for (uniform int x = x0; x < x1; x += 4) {
-            foreach (o = 0 ... 16) {
+            // For each such tile, process programCount pixels at a time,
+            // until we've done all 16 of them.  Thus, we're also assuming
+            // that programCount <= 16 and that 16 is evenly divisible by
+            // programCount.
+            for (uniform int o = 0; o < 16; o += programCount) {
                // These two arrays encode the mapping from [0,15] to
                // offsets within the 4x4 pixel block so that we render
                // each pixel inside the block
@@ -320,7 +324,8 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,
                                                   2, 2, 3, 3, 2, 2, 3, 3 };

                // Figure out the pixel to render for this program instance
-                int xo = x + xoffsets[o], yo = y + yoffsets[o];
+                int xo = x + xoffsets[o + programIndex];
+                int yo = y + yoffsets[o + programIndex];

                // Use viewing parameters to compute the corresponding ray
                // for the pixel
@@ -347,7 +352,7 @@ volume_task(uniform float density[], uniform int nVoxels[3],
    uniform int ybuckets = (height + (dy-1)) / dy;

    uniform int x0 = (taskIndex % xbuckets) * dx;
-    uniform int y0 = (taskIndex / xbuckets) * dy;
+    uniform int y0 = (taskIndex / xbuckets) * dy;  // row = taskIndex / tiles-per-row; must pair with the % xbuckets used for x0
    uniform int x1 = x0 + dx, y1 = y0 + dy;
    x1 = min(x1, width);
    y1 = min(y1, height);
--- a/examples/volume_rendering/volume.vcxproj
+++ b/examples/volume_rendering/volume.vcxproj
@@ -64,19 +64,15 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
-    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -85,7 +81,6 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -101,7 +96,6 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -119,7 +113,6 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -138,7 +131,6 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -156,18 +148,18 @@
  <ItemGroup>
    <CustomBuild Include="volume.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
    </CustomBuild>
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
--- a/examples/volume_rendering/volume_serial.cpp
+++ b/examples/volume_rendering/volume_serial.cpp
@@ -36,6 +36,9 @@
 #include <algorithm>

 // Just enough of a float3 class to do what we need in this file.
+#ifdef _MSC_VER
+__declspec(align(16)) 
+#endif
 struct float3 {
    float3() { }
    float3(float xx, float yy, float zz) { x = xx; y = yy; z = zz; }
@@ -295,7 +298,7 @@ volume_serial(float density[], int nVoxels[3], const float raster2camera[4][4],
    for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x, ++offset) {
            Ray ray;
-            generateRay(raster2camera, camera2world, (float)x, (float)y, ray);
+            generateRay(raster2camera, camera2world, x, y, ray);
            image[offset] = raymarch(density, nVoxels, ray);
        }
    }
--- a/expr.cpp
+++ b/expr.cpp
--- a/expr.h
+++ b/expr.h
@@ -39,9 +39,10 @@
 #define ISPC_EXPR_H 1

 #include "ispc.h"
-#include "ast.h"
 #include "type.h"

+class FunctionSymbolExpr;
+
 /** @brief Expr is the abstract base class that defines the interface that
    all expression types must implement.
 */
@@ -65,10 +66,6 @@ public:
    /** Returns the Type of the expression. */
    virtual const Type *GetType() const = 0;

-    /** Returns the type of the value returned by GetLValueType(); this
-        should be a pointer type of some sort (uniform or varying). */
-    virtual const Type *GetLValueType() const;
-
    /** For expressions that have values based on a symbol (e.g. regular
        symbol references, array indexing, etc.), this returns a pointer to
        that symbol. */
@@ -93,6 +90,14 @@ public:

    /** Prints the expression to standard output (used for debugging). */
    virtual void Print() const = 0;
+
+    /** This method tries to convert the expression to the given type.  In
+        the event of failure, if the failureOk parameter is true, then no
+        error is issued.  If failureOk is false, then an error is printed
+        that incorporates the given error message string.  In either
+        failure case, NULL is returned.  */
+    Expr *TypeConv(const Type *type, const char *errorMsgBase = NULL, 
+                   bool failureOk = false, bool issuePrecisionWarnings = true);
 };


@@ -260,6 +265,10 @@ public:
    ExprList *args;
    bool isLaunch;
    Expr *launchCountExpr;
+
+private:
+    void resolveFunctionOverloads(bool exactMatchOnly);
+    bool tryResolve(bool (*matchFunc)(Expr *, const Type *));
 };


@@ -270,12 +279,11 @@ public:
 */
 class IndexExpr : public Expr {
 public:
-    IndexExpr(Expr *baseExpr, Expr *index, SourcePos p);
+    IndexExpr(Expr *arrayOrVector, Expr *index, SourcePos p);

    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
    llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
    const Type *GetType() const;
-    const Type *GetLValueType() const;
    Symbol *GetBaseSymbol() const;
    void Print() const;

@@ -283,7 +291,7 @@ public:
    Expr *TypeCheck();
    int EstimateCost() const;

-    Expr *baseExpr, *index;
+    Expr *arrayOrVector, *index;
 };


@@ -293,35 +301,28 @@ public:
 */
 class MemberExpr : public Expr {
 public:
-    static MemberExpr *create(Expr *expr, const char *identifier,
-                              SourcePos pos, SourcePos identifierPos,
-                              bool derefLvalue);
+    static MemberExpr* create(Expr *expr, const char *identifier,
+                              SourcePos pos, SourcePos identifierPos);
+
+    MemberExpr(Expr *expr, const char *identifier, SourcePos pos, 
+               SourcePos identifierPos);
+
    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
    llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
    const Type *GetType() const;
-    const Type *GetLValueType() const;
    Symbol *GetBaseSymbol() const;
    void Print() const;
    Expr *Optimize();
    Expr *TypeCheck();
    int EstimateCost() const;

-    virtual int getElementNumber() const = 0;
-    virtual const Type *getElementType() const = 0;
+    virtual int getElementNumber() const;
+
    std::string getCandidateNearMatches() const;

    Expr *expr;
    std::string identifier;
    const SourcePos identifierPos;
-
-protected:
-    MemberExpr(Expr *expr, const char *identifier, SourcePos pos, 
-               SourcePos identifierPos, bool derefLValue);
-
-    /** Indicates whether the expression should be dereferenced before the
-        member is found.  (i.e. this is true if the MemberExpr was a '->'
-        operator, and is false if it was a '.' operator. */
-    bool dereferenceExpr;
 };


@@ -493,8 +494,7 @@ private:
    probably-different type. */
 class TypeCastExpr : public Expr {
 public:
-    TypeCastExpr(const Type *t, Expr *e, bool preserveUniformity,
-                 SourcePos p);
+    TypeCastExpr(const Type *t, Expr *e, SourcePos p);

    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
    const Type *GetType() const;
@@ -502,12 +502,9 @@ public:
    Expr *TypeCheck();
    Expr *Optimize();
    int EstimateCost() const;
-    Symbol *GetBaseSymbol() const;
-    llvm::Constant *GetConstant(const Type *type) const;

    const Type *type;
    Expr *expr;
-    bool preserveUniformity;
 };


@@ -519,7 +516,6 @@ public:

    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
    const Type *GetType() const;
-    const Type *GetLValueType() const;
    Symbol *GetBaseSymbol() const;
    void Print() const;
    Expr *TypeCheck();
@@ -539,7 +535,6 @@ public:
    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
    llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
    const Type *GetType() const;
-    const Type *GetLValueType() const;
    Symbol *GetBaseSymbol() const;
    void Print() const;
    Expr *TypeCheck();
@@ -550,44 +545,6 @@ public:
 };


-/** Expression that represents taking the address of an expression. */
-class AddressOfExpr : public Expr {
-public:
-    AddressOfExpr(Expr *e, SourcePos p);
-
-    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
-    const Type *GetType() const;
-    Symbol *GetBaseSymbol() const;
-    void Print() const;
-    Expr *TypeCheck();
-    Expr *Optimize();
-    int EstimateCost() const;
-
-    Expr *expr;
-};
-
-
-/** Expression that returns the size of the given expression or type in
-    bytes. */
-class SizeOfExpr : public Expr {
-public:
-    SizeOfExpr(Expr *e, SourcePos p);
-    SizeOfExpr(const Type *t, SourcePos p);
-
-    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
-    const Type *GetType() const;
-    void Print() const;
-    Expr *TypeCheck();
-    Expr *Optimize();
-    int EstimateCost() const;
-
-    /* One of expr or type should be non-NULL (but not both of them).  The
-       SizeOfExpr returns the size of whichever one of them isn't NULL. */
-    Expr *expr;
-    const Type *type;
-};
-
-
 /** @brief Expression representing a symbol reference in the program */
 class SymbolExpr : public Expr {
 public:
@@ -596,7 +553,6 @@ public:
    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
    llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
    const Type *GetType() const;
-    const Type *GetLValueType() const;
    Symbol *GetBaseSymbol() const;
    Expr *TypeCheck();
    Expr *Optimize();
@@ -613,7 +569,7 @@ private:
 */    
 class FunctionSymbolExpr : public Expr {
 public:
-    FunctionSymbolExpr(const char *name, const std::vector<Symbol *> &candFuncs,
+    FunctionSymbolExpr(const char *name, std::vector<Symbol *> *candidateFunctions,
                       SourcePos pos);

    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
@@ -623,25 +579,9 @@ public:
    Expr *Optimize();
    void Print() const;
    int EstimateCost() const;
-    llvm::Constant *GetConstant(const Type *type) const;
-
-    /** Given the types of the function arguments, in the presence of
-        function overloading, this method resolves which actual function
-        the arguments match best.  If the argCouldBeNULL parameter is
-        non-NULL, each element indicates whether the corresponding argument
-        is the number zero, indicating that it could be a NULL pointer.
-        This parameter may be NULL (for cases where overload resolution is
-        being done just given type information without the parameter
-        argument expressions being available.  It returns true on success.
-     */
-    bool ResolveOverloads(const std::vector<const Type *> &argTypes,
-                          const std::vector<bool> *argCouldBeNULL = NULL);
-    Symbol *GetMatchingFunction();

 private:
-    bool tryResolve(int (*matchFunc)(const Type *, const Type *),
-                    const std::vector<const Type *> &argTypes,
-                    const std::vector<bool> *argCouldBeNULL);
+    friend class FunctionCallExpr;

    /** Name of the function that is being called. */
    std::string name;
@@ -649,12 +589,11 @@ private:
    /** All of the functions with the name given in the function call;
        there may be more then one, in which case we need to resolve which
        overload is the best match. */
-    std::vector<Symbol *> candidateFunctions;
+    std::vector<Symbol *> *candidateFunctions;

-    /** The actual matching function found after overload resolution. */
+    /** The actual matching function found after overload resolution; this
+        value is set by FunctionCallExpr::resolveFunctionOverloads() */
    Symbol *matchingFunc;
-
-    bool triedToResolve;
 };


@@ -672,37 +611,4 @@ public:
    int EstimateCost() const;
 };

-
-/** @brief An expression that represents a NULL pointer. */
-class NullPointerExpr : public Expr {
-public:
-    NullPointerExpr(SourcePos p) : Expr(p) { }
-    
-    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
-    const Type *GetType() const;
-    Expr *TypeCheck();
-    Expr *Optimize();
-    void Print() const;
-    int EstimateCost() const;
-};
-
-
-/** This function indicates whether it's legal to convert from fromType to
-    toType.  If the optional errorMsgBase and source position parameters
-    are provided, then an error message is issued if the type conversion
-    isn't possible.
- */
-bool CanConvertTypes(const Type *fromType, const Type *toType,
-                     const char *errorMsgBase = NULL,
-                     SourcePos pos = SourcePos());
-
-/** This function attempts to convert the given expression to the given
-    type, returning a pointer to a new expression that is the result.  If
-    the required type conversion is illegal, it returns NULL and prints an
-    error message using the provided string to indicate the context for
-    which type conversion was being applied (e.g. "function call
-    parameter").
- */
-Expr *TypeConvertExpr(Expr *expr, const Type *toType, const char *errorMsgBase);
-
 #endif // ISPC_EXPR_H
--- a/failing_tests/masked-scatter-vector.ispc
+++ b/failing_tests/masked-scatter-vector.ispc
@@ -14,10 +14,10 @@ export void f_fu(uniform float ret[], uniform float aa[], uniform float b) {
    varying int3 vv = array[a];
    ++vv.y;
    array[a] = vv;
+//CO    print("fin %\n", array[programIndex].y);
    ret[programIndex] = array[programIndex].y;
 }

 export void result(uniform float ret[]) {
-    ret[programIndex] = 101+programIndex;
-    ret[0] = 100;
+    ret[programIndex] = 100+programIndex;
 }
--- a/failing_tests/max-uint-1.ispc
+++ b/failing_tests/max-uint-1.ispc
--- a/failing_tests/max-uint.ispc
+++ b/failing_tests/max-uint.ispc
--- a/failing_tests/min-uint-1.ispc
+++ b/failing_tests/min-uint-1.ispc
--- a/failing_tests/min-uint-2.ispc
+++ b/failing_tests/min-uint-2.ispc
--- a/failing_tests/scatter-vector.ispc
+++ b/failing_tests/scatter-vector.ispc
@@ -8,6 +8,9 @@ struct Foo {
    float y;
 };

+extern void aa(reference Foo f);
+extern void bb(reference Foo f[]);
+
 typedef float<3> float3;

 void set(uniform float3 f[], int offset, float3 val) {
--- a/failing_tests/shuffle2-10.ispc
+++ b/failing_tests/shuffle2-10.ispc
--- a/func.cpp
+++ b/func.cpp
@@ -1,414 +0,0 @@
-/*
-  Copyright (c) 2011, Intel Corporation
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions are
-  met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived from
-      this software without specific prior written permission.
-
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
-*/
-
-/** @file func.cpp
-    @brief 
-*/
-
-#include "func.h"
-#include "ctx.h"
-#include "expr.h"
-#include "llvmutil.h"
-#include "module.h"
-#include "type.h"
-#include "stmt.h"
-#include "sym.h"
-#include "util.h"
-#include <stdio.h>
-
-#include <llvm/LLVMContext.h>
-#include <llvm/Module.h>
-#include <llvm/Type.h>
-#include <llvm/DerivedTypes.h>
-#include <llvm/Instructions.h>
-#include <llvm/Intrinsics.h>
-#include <llvm/PassManager.h>
-#include <llvm/PassRegistry.h>
-#include <llvm/Transforms/IPO.h>
-#include <llvm/Support/FormattedStream.h>
-#include <llvm/Support/FileUtilities.h>
-#include <llvm/Target/TargetMachine.h>
-#include <llvm/Target/TargetOptions.h>
-#include <llvm/Target/TargetData.h>
-#include <llvm/PassManager.h>
-#include <llvm/Analysis/Verifier.h>
-#include <llvm/Support/CFG.h>
-#include <llvm/Support/ToolOutputFile.h>
-#include <llvm/Assembly/PrintModulePass.h>
-
-Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
-    sym = s;
-    args = a;
-    code = c;
-
-    maskSymbol = m->symbolTable->LookupVariable("__mask");
-    assert(maskSymbol != NULL);
-
-    if (code != NULL) {
-        if (g->debugPrint) {
-            fprintf(stderr, "Creating function \"%s\".  Initial code:\n", 
-                    sym->name.c_str());
-            code->Print(0);
-            fprintf(stderr, "---------------------\n");
-        }
-
-        code = code->TypeCheck();
-
-        if (code != NULL && g->debugPrint) {
-            fprintf(stderr, "After typechecking function \"%s\":\n", 
-                    sym->name.c_str());
-            code->Print(0);
-            fprintf(stderr, "---------------------\n");
-        }
-
-        if (code != NULL) {
-            code = code->Optimize();
-            if (g->debugPrint) {
-                fprintf(stderr, "After optimizing function \"%s\":\n", 
-                        sym->name.c_str());
-                code->Print(0);
-                fprintf(stderr, "---------------------\n");
-            }
-        }
-    }
-
-    if (g->debugPrint) {
-        printf("Add Function %s\n", sym->name.c_str());
-        code->Print(0);
-        printf("\n\n\n");
-    }
-
-    const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
-    assert(type != NULL);
-
-    for (unsigned int i = 0; i < args.size(); ++i)
-        if (dynamic_cast<const ReferenceType *>(args[i]->type) == NULL)
-            args[i]->parentFunction = this;
-
-    if (type->isTask) {
-        threadIndexSym = m->symbolTable->LookupVariable("threadIndex");
-        assert(threadIndexSym);
-        threadCountSym = m->symbolTable->LookupVariable("threadCount");
-        assert(threadCountSym);
-        taskIndexSym = m->symbolTable->LookupVariable("taskIndex");
-        assert(taskIndexSym);
-        taskCountSym = m->symbolTable->LookupVariable("taskCount");
-        assert(taskCountSym);
-    }
-    else
-        threadIndexSym = threadCountSym = taskIndexSym = taskCountSym = NULL;
-}
-
-
-const Type *
-Function::GetReturnType() const {
-    const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
-    assert(type != NULL);
-    return type->GetReturnType();
-}
-
-
-const FunctionType *
-Function::GetType() const {
-    const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
-    assert(type != NULL);
-    return type;
-}
-
-
-/** Parameters for tasks are stored in a big structure; this utility
-    function emits code to copy those values out of the task structure into
-    local stack-allocated variables.  (Which we expect that LLVM's
-    'mem2reg' pass will in turn promote to SSA registers..
- */
-static void
-lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const std::vector<Symbol *> &args,
-                     FunctionEmitContext *ctx) {
-    // We expect the argument structure to come in as a poitner to a
-    // structure.  Confirm and figure out its type here.
-    const llvm::Type *structArgType = structArgPtr->getType();
-    assert(llvm::isa<llvm::PointerType>(structArgType));
-    const llvm::PointerType *pt = llvm::dyn_cast<const llvm::PointerType>(structArgType);
-    assert(llvm::isa<llvm::StructType>(pt->getElementType()));
-    const llvm::StructType *argStructType = 
-        llvm::dyn_cast<const llvm::StructType>(pt->getElementType());
-
-    // Get the type of the argument we're copying in and its Symbol pointer
-    LLVM_TYPE_CONST llvm::Type *argType = argStructType->getElementType(i);
-    Symbol *sym = args[i];
-
-    // allocate space to copy the parameter in to
-    sym->storagePtr = ctx->AllocaInst(argType, sym->name.c_str());
-
-    // get a pointer to the value in the struct
-    llvm::Value *ptr = ctx->AddElementOffset(structArgPtr, i, NULL, sym->name.c_str());
-
-    // and copy the value from the struct and into the local alloca'ed
-    // memory
-    llvm::Value *ptrval = ctx->LoadInst(ptr, sym->name.c_str());
-    ctx->StoreInst(ptrval, sym->storagePtr);
-    ctx->EmitFunctionParameterDebugInfo(sym);
-}
-
-
-/** Given the statements implementing a function, emit the code that
-    implements the function.  Most of the work do be done here just
-    involves wiring up the function parameter values to be available in the
-    function body code.
- */
-void 
-Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, 
-                   SourcePos firstStmtPos) {
-    llvm::Value *maskPtr = ctx->AllocaInst(LLVMTypes::MaskType, "mask_memory");
-    ctx->StoreInst(LLVMMaskAllOn, maskPtr);
-    maskSymbol->storagePtr = maskPtr;
-    ctx->SetMaskPointer(maskPtr);
-
-    // add debugging info for __mask, programIndex, ...
-    maskSymbol->pos = firstStmtPos;
-    ctx->EmitVariableDebugInfo(maskSymbol);
-
-#if 0
-    llvm::BasicBlock *entryBBlock = ctx->GetCurrentBasicBlock();
-#endif
-    const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
-    assert(type != NULL);
-    if (type->isTask == true) {
-        // For tasks, we there should always be three parmeters: the
-        // pointer to the structure that holds all of the arguments, the
-        // thread index, and the thread count variables.
-        llvm::Function::arg_iterator argIter = function->arg_begin();
-        llvm::Value *structParamPtr = argIter++;
-        llvm::Value *threadIndex = argIter++;
-        llvm::Value *threadCount = argIter++;
-        llvm::Value *taskIndex = argIter++;
-        llvm::Value *taskCount = argIter++;
-
-        // Copy the function parameter values from the structure into local
-        // storage
-        for (unsigned int i = 0; i < args.size(); ++i)
-            lCopyInTaskParameter(i, structParamPtr, args, ctx);
-
-        // Copy in the mask as well.
-        int nArgs = (int)args.size();
-        // The mask is the last parameter in the argument structure
-        llvm::Value *ptr = ctx->AddElementOffset(structParamPtr, nArgs, NULL,
-                                                  "task_struct_mask");
-        llvm::Value *ptrval = ctx->LoadInst(ptr, "mask");
-        ctx->SetFunctionMask(ptrval);
-
-        // Copy threadIndex and threadCount into stack-allocated storage so
-        // that their symbols point to something reasonable.
-        threadIndexSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "threadIndex");
-        ctx->StoreInst(threadIndex, threadIndexSym->storagePtr);
-
-        threadCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "threadCount");
-        ctx->StoreInst(threadCount, threadCountSym->storagePtr);
-
-        // Copy taskIndex and taskCount into stack-allocated storage so
-        // that their symbols point to something reasonable.
-        taskIndexSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex");
-        ctx->StoreInst(taskIndex, taskIndexSym->storagePtr);
-
-        taskCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount");
-        ctx->StoreInst(taskCount, taskCountSym->storagePtr);
-    }
-    else {
-        // Regular, non-task function
-        llvm::Function::arg_iterator argIter = function->arg_begin(); 
-        for (unsigned int i = 0; i < args.size(); ++i, ++argIter) {
-            Symbol *sym = args[i];
-            argIter->setName(sym->name.c_str());
-
-            // Allocate stack storage for the parameter and emit code
-            // to store the its value there.
-            sym->storagePtr = ctx->AllocaInst(argIter->getType(), sym->name.c_str());
-            ctx->StoreInst(argIter, sym->storagePtr);
-            ctx->EmitFunctionParameterDebugInfo(sym);
-        }
-
-        // If the number of actual function arguments is equal to the
-        // number of declared arguments in decl->functionParams, then we
-        // don't have a mask parameter, so set it to be all on.  This
-        // happens for exmaple with 'export'ed functions that the app
-        // calls.
-        if (argIter == function->arg_end())
-            ctx->SetFunctionMask(LLVMMaskAllOn);
-        else {
-            // Otherwise use the mask to set the entry mask value
-            argIter->setName("__mask");
-            assert(argIter->getType() == LLVMTypes::MaskType);
-            ctx->SetFunctionMask(argIter);
-            assert(++argIter == function->arg_end());
-        }
-    }
-
-    // Finally, we can generate code for the function
-    if (code != NULL) {
-        int costEstimate = code->EstimateCost();
-        bool checkMask = (type->isTask == true) || 
-            ((function->hasFnAttr(llvm::Attribute::AlwaysInline) == false) &&
-             costEstimate > CHECK_MASK_AT_FUNCTION_START_COST);
-        Debug(code->pos, "Estimated cost for function \"%s\" = %d\n", 
-              sym->name.c_str(), costEstimate);
-        // If the body of the function is non-trivial, then we wrap the
-        // entire thing around a varying "cif (true)" test in order to reap
-        // the side-effect benefit of checking to see if the execution mask
-        // is all on and thence having a specialized code path for that
-        // case.  If this is a simple function, then this isn't worth the
-        // code bloat / overhead.
-        if (checkMask) {
-            bool allTrue[ISPC_MAX_NVEC];
-            for (int i = 0; i < g->target.vectorWidth; ++i)
-                allTrue[i] = true;
-            Expr *trueExpr = new ConstExpr(AtomicType::VaryingBool, allTrue, 
-                                           code->pos);
-            code = new IfStmt(trueExpr, code, NULL, true, code->pos);
-        }
-
-        ctx->SetDebugPos(code->pos);
-        ctx->AddInstrumentationPoint("function entry");
-        code->EmitCode(ctx);
-    }
-
-    if (ctx->GetCurrentBasicBlock()) {
-        // FIXME: We'd like to issue a warning if we've reached the end of
-        // the function without a return statement (for non-void
-        // functions).  But the test below isn't right, since we can have
-        // (with 'x' a varying test) "if (x) return a; else return b;", in
-        // which case we have a valid basic block but its unreachable so ok
-        // to not have return statement.
-#if 0
-        // If the bblock has no predecessors, then it doesn't matter if it
-        // doesn't have a return; it'll never be reached.  If it does,
-        // issue a warning.  Also need to warn if it's the entry block for
-        // the function (in which case it will not have predeccesors but is
-        // still reachable.)
-        if (type->GetReturnType() != AtomicType::Void &&
-            (pred_begin(ec.bblock) != pred_end(ec.bblock) || (ec.bblock == entryBBlock)))
-            Warning(sym->pos, "Missing return statement in function returning \"%s\".",
-                    type->rType->GetString().c_str());
-#endif
-
-        // FIXME: would like to set the context's current position to
-        // e.g. the end of the function code
-
-        // if bblock is non-NULL, it hasn't been terminated by e.g. a
-        // return instruction.  Need to add a return instruction.
-        ctx->ReturnInst();
-    }
-}
-
-
-void
-Function::GenerateIR() {
-    if (sym == NULL)
-        // May be NULL due to error earlier in compilation
-        return;
-
-    llvm::Function *function = sym->function;
-    assert(function != NULL);
-
-    // But if that function has a definition, we don't want to redefine it.
-    if (function->empty() == false) {
-        Error(sym->pos, "Ignoring redefinition of function \"%s\".", 
-              sym->name.c_str());
-        return;
-    }
-
-    // Figure out a reasonable source file position for the start of the
-    // function body.  If possible, get the position of the first actual
-    // non-StmtList statment...
-    SourcePos firstStmtPos = sym->pos;
-    if (code) {
-        StmtList *sl = dynamic_cast<StmtList *>(code);
-        if (sl && sl->GetStatements().size() > 0 && 
-            sl->GetStatements()[0] != NULL)
-            firstStmtPos = sl->GetStatements()[0]->pos;
-        else
-            firstStmtPos = code->pos;
-    }
-
-    // And we can now go ahead and emit the code 
-    {
-        FunctionEmitContext ec(this, sym, function, firstStmtPos);
-        emitCode(&ec, function, firstStmtPos);
-    }
-
-    if (m->errorCount == 0) {
-        if (llvm::verifyFunction(*function, llvm::ReturnStatusAction) == true) {
-            if (g->debugPrint)
-                function->dump();
-            FATAL("Function verificication failed");
-        }
-
-        // If the function is 'export'-qualified, emit a second version of
-        // it without a mask parameter and without name mangling so that
-        // the application can call it
-        const FunctionType *type = dynamic_cast<const FunctionType *>(sym->type);
-        assert(type != NULL);
-        if (type->isExported) {
-            if (!type->isTask) {
-                LLVM_TYPE_CONST llvm::FunctionType *ftype = 
-                    type->LLVMFunctionType(g->ctx);
-                llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::ExternalLinkage;
-                std::string functionName = sym->name;
-                if (g->mangleFunctionsWithTarget)
-                    functionName += std::string("_") + g->target.GetISAString();
-                llvm::Function *appFunction = 
-                    llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module);
-                appFunction->setDoesNotThrow(true);
-
-                if (appFunction->getName() != functionName) {
-                    // this was a redefinition for which we already emitted an
-                    // error, so don't worry about this one...
-                    appFunction->eraseFromParent();
-                }
-                else {
-                    // And emit the code again
-                    FunctionEmitContext ec(this, sym, appFunction, firstStmtPos);
-                    emitCode(&ec, appFunction, firstStmtPos);
-                    if (m->errorCount == 0) {
-                        sym->exportedFunction = appFunction;
-                        if (llvm::verifyFunction(*appFunction, 
-                                                 llvm::ReturnStatusAction) == true) {
-                            if (g->debugPrint)
-                                appFunction->dump();
-                            FATAL("Function verificication failed");
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
--- a/func.h
+++ b/func.h
@@ -1,66 +0,0 @@
-/*
-  Copyright (c) 2011, Intel Corporation
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions are
-  met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived from
-      this software without specific prior written permission.
-
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
-*/
-
-/** @file func.h
-    @brief Representation of a function in a source file.
-*/
-
-#ifndef ISPC_FUNC_H
-#define ISPC_FUNC_H 1
-
-#include "ispc.h"
-#include <vector>
-
-class Function {
-public:
-    Function(Symbol *sym, const std::vector<Symbol *> &args, Stmt *code);
-
-    const Type *GetReturnType() const;
-    const FunctionType *GetType() const;
-
-    /** Generate LLVM IR for the function into the current module. */
-    void GenerateIR();
-
-private:
-    void emitCode(FunctionEmitContext *ctx, llvm::Function *function, 
-                  SourcePos firstStmtPos);
-
-    Symbol *sym;
-    std::vector<Symbol *> args;
-    Stmt *code;
-    Symbol *maskSymbol;
-    Symbol *threadIndexSym, *threadCountSym;
-    Symbol *taskIndexSym, *taskCountSym;
-};
-
-#endif // ISPC_FUNC_H
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -38,7 +38,6 @@
 #include "ispc.h"
 #include "module.h"
 #include "util.h"
-#include "llvmutil.h"
 #include <stdio.h>
 #ifdef ISPC_IS_WINDOWS
 #include <windows.h>
@@ -53,7 +52,7 @@
 #include <llvm/Target/TargetMachine.h>
 #include <llvm/Target/TargetOptions.h>
 #include <llvm/Target/TargetData.h>
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
  #include <llvm/Support/TargetRegistry.h>
  #include <llvm/Support/TargetSelect.h>
 #else
@@ -75,7 +74,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
    if (cpu == NULL) {
        std::string hostCPU = llvm::sys::getHostCPUName();
        if (hostCPU.size() > 0)
-            cpu = strdup(hostCPU.c_str());
+            cpu = strdup(hostCPU.c_str()); // copy: hostCPU is destroyed at the end of this block, cpu is read later
        else {
            fprintf(stderr, "Warning: unable to determine host CPU!\n");
            cpu = "generic";
@@ -86,7 +85,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
    if (isa == NULL) {
        if (!strcasecmp(cpu, "atom"))
            isa = "sse2";
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn) // same macro spelling as the other LLVM 3.0 guards in this file
        else if (!strcasecmp(cpu, "sandybridge") ||
                 !strcasecmp(cpu, "corei7-avx"))
            isa = "avx";
@@ -130,25 +129,19 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
        t->vectorWidth = 4;
        t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
    }
-    else if (!strcasecmp(isa, "sse2-x2")) {
-        t->isa = Target::SSE2;
-        t->nativeVectorWidth = 4;
-        t->vectorWidth = 8;
-        t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt";
-    }
    else if (!strcasecmp(isa, "sse4")) {
        t->isa = Target::SSE4;
        t->nativeVectorWidth = 4;
        t->vectorWidth = 4;
        t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
    }
-    else if (!strcasecmp(isa, "sse4x2") || !strcasecmp(isa, "sse4-x2")) {
+    else if (!strcasecmp(isa, "sse4x2")) {
        t->isa = Target::SSE4;
        t->nativeVectorWidth = 4;
        t->vectorWidth = 8;
        t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
    }
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
    else if (!strcasecmp(isa, "avx")) {
        t->isa = Target::AVX;
        t->nativeVectorWidth = 8;
@@ -171,7 +164,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
    if (!error) {
        llvm::TargetMachine *targetMachine = t->GetTargetMachine();
        const llvm::TargetData *targetData = targetMachine->getTargetData();
-        t->is32Bit = (targetData->getPointerSize() == 4);
+        t->is32bit = (targetData->getPointerSize() == 4);
    }

    return !error;
@@ -181,7 +174,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
 const char *
 Target::SupportedTargetCPUs() {
    return "atom, barcelona, core2, corei7, "
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn) // same macro spelling as the other LLVM 3.0 guards in this file
        "corei7-avx, "
 #endif
        "istanbul, nocona, penryn, "
@@ -200,8 +193,8 @@ Target::SupportedTargetArchs() {

 const char *
 Target::SupportedTargetISAs() {
-    return "sse2, sse2-x2, sse4, sse4-x2"
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
+    return "sse2, sse4, sse4x2"
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn) // same macro spelling as the other LLVM 3.0 guards in this file
        ", avx, avx-x2"
 #endif
        ;
@@ -212,11 +205,7 @@ std::string
 Target::GetTripleString() const {
    llvm::Triple triple;
    // Start with the host triple as the default
-#if defined(LLVM_3_1) || defined(LLVM_3_1svn)
-    triple.setTriple(llvm::sys::getDefaultTargetTriple());
-#else
    triple.setTriple(llvm::sys::getHostTriple());
-#endif

    // And override the arch in the host triple based on what the user
    // specified.  Here we need to deal with the fact that LLVM uses one
@@ -241,7 +230,7 @@ Target::GetTargetMachine() const {

    llvm::Reloc::Model relocModel = generatePIC ? llvm::Reloc::PIC_ : 
                                                  llvm::Reloc::Default;
-#if defined(LLVM_3_0svn) || defined(LLVM_3_1svn) || defined(LLVM_3_0)
+#if defined(LLVM_3_0svn) || defined(LLVM_3_0)
    std::string featuresString = attributes;
    llvm::TargetMachine *targetMachine = 
        target->createTargetMachine(triple, cpu, featuresString, relocModel);
@@ -263,53 +252,6 @@ Target::GetTargetMachine() const {
 }


-const char *
-Target::GetISAString() const {
-    switch (isa) {
-    case Target::SSE2:
-        return "sse2";
-    case Target::SSE4:
-        return "sse4";
-    case Target::AVX:
-        return "avx";
-        break;
-    default:
-        FATAL("Unhandled target in GetISAString()");
-    }
-    return "";
-}
-
-
-llvm::Value *
-Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type) {
-    const llvm::TargetData *td = GetTargetMachine()->getTargetData();
-    assert(td != NULL);
-    uint64_t byteSize = td->getTypeSizeInBits(type) / 8;
-    if (is32Bit || g->opt.force32BitAddressing)
-        return LLVMInt32(byteSize);
-    else
-        return LLVMInt64(byteSize);
-}
-
-
-llvm::Value *
-Target::StructOffset(LLVM_TYPE_CONST llvm::Type *type, int element) {
-    const llvm::TargetData *td = GetTargetMachine()->getTargetData();
-    assert(td != NULL);
-    LLVM_TYPE_CONST llvm::StructType *structType = 
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::StructType>(type);
-    assert(structType != NULL);
-    const llvm::StructLayout *sl = td->getStructLayout(structType);
-    assert(sl != NULL);
-
-    uint64_t offset = sl->getElementOffset(element);
-    if (is32Bit || g->opt.force32BitAddressing)
-        return LLVMInt32(offset);
-    else
-        return LLVMInt64(offset);
-}
-
-
 ///////////////////////////////////////////////////////////////////////////
 // Opt

@@ -317,10 +259,7 @@ Opt::Opt() {
    level = 1;
    fastMath = false;
    fastMaskedVload = false;
-    force32BitAddressing = true;
    unrollLoops = true;
-    disableAsserts = false;
-    disableHandlePseudoMemoryOps = false;
    disableBlendedMaskedStores = false;
    disableCoherentControlFlow = false;
    disableUniformControlFlow = false;
@@ -341,37 +280,35 @@ Globals::Globals() {
    runCPP = true;
    debugPrint = false;
    disableWarnings = false;
-    warningsAsErrors = false;
-    disableLineWrap = false;
    emitPerfWarnings = true;
    emitInstrumentation = false;
    generateDebuggingSymbols = false;
-    mangleFunctionsWithTarget = false;

    ctx = new llvm::LLVMContext;

 #ifdef ISPC_IS_WINDOWS
    _getcwd(currentDirectory, sizeof(currentDirectory));
 #else
-    if (getcwd(currentDirectory, sizeof(currentDirectory)) == NULL)
-        FATAL("Current directory path too long!");
+    if (getcwd(currentDirectory, sizeof(currentDirectory)) == NULL) FATAL("Current directory path too long!"); // don't continue with an unset path
 #endif
 }

+///////////////////////////////////////////////////////////////////////////
+// ASTNode
+
+ASTNode::~ASTNode() {
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // SourcePos

-SourcePos::SourcePos(const char *n, int fl, int fc, int ll, int lc) {
+SourcePos::SourcePos(const char *n, int l, int c) {
    name = n ? n : m->module->getModuleIdentifier().c_str();
-    first_line = fl;
-    first_column = fc;
-    last_line = ll != 0 ? ll : fl;
-    last_column = lc != 0 ? lc : fc;
+    first_line = last_line = l;
+    first_column = last_column = c;
 }

-
-llvm::DIFile
-SourcePos::GetDIFile() const {
+llvm::DIFile SourcePos::GetDIFile() const {
    std::string directory, filename;
    GetDirectoryAndFileName(g->currentDirectory, name, &directory, &filename);
    return m->diBuilder->createFile(filename, directory);
@@ -394,17 +331,3 @@ SourcePos::operator==(const SourcePos &p2) const {
            last_column == p2.last_column);
 }

-
-SourcePos
-Union(const SourcePos &p1, const SourcePos &p2) {
-    if (strcmp(p1.name, p2.name) != 0)
-        return p1;
-
-    SourcePos ret;
-    ret.name = p1.name;
-    ret.first_line = std::min(p1.first_line, p2.first_line);
-    ret.first_column = std::min(p1.first_column, p2.first_column);
-    ret.last_line = std::max(p1.last_line, p2.last_line);
-    ret.last_column = std::max(p1.last_column, p2.last_column);
-    return ret;
-}
--- a/ispc.h
+++ b/ispc.h
@@ -38,10 +38,6 @@
 #ifndef ISPC_H
 #define ISPC_H

-#if !defined(LLVM_2_9) && !defined(LLVM_3_0) && !defined(LLVM_3_0svn) && !defined(LLVM_3_1svn)
-#error "Only LLVM 2.9, 3.0, and the 3.1 development branch are supported"
-#endif
-
 #if defined(_WIN32) || defined(_WIN64)
 #define ISPC_IS_WINDOWS
 #elif defined(__linux__)
@@ -80,7 +76,7 @@ namespace llvm {
 }

 // llvm::Type *s are no longer const in llvm 3.0
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
 #define LLVM_TYPE_CONST
 #else
 #define LLVM_TYPE_CONST const
@@ -88,17 +84,19 @@ namespace llvm {

 class ArrayType;
 class AtomicType;
+class DeclSpecs;
+class Declaration;
+class Declarator;
 class FunctionEmitContext;
 class Expr;
 class ExprList;
-class Function;
 class FunctionType;
+class GatherBuffer;
 class Module;
 class Stmt;
 class Symbol;
 class SymbolTable;
 class Type;
-struct VariableDeclaration;

 /** @brief Representation of a range of positions in a source file.

@@ -108,8 +106,7 @@ struct VariableDeclaration;
    lexing code).  Both lines and columns are counted starting from one.
 */
 struct SourcePos {
-    SourcePos(const char *n = NULL, int fl = 0, int fc = 0,
-              int ll = 0, int lc = 0);
+    SourcePos(const char *n = NULL, int l = 0, int c = 0);

    const char *name;
    int first_line;
@@ -126,10 +123,37 @@ struct SourcePos {
    bool operator==(const SourcePos &p2) const;
 };

-/** Returns a SourcePos that encompasses the extent of both of the given
-    extents. */
-SourcePos Union(const SourcePos &p1, const SourcePos &p2);

+/** @brief Abstract base class for nodes in the abstract syntax tree (AST).
+
+    This class defines a basic interface that all abstract syntax tree
+    (AST) nodes must implement.  The base classes for both expressions
+    (Expr) and statements (Stmt) inherit from this class.
+*/
+class ASTNode {
+public:
+    ASTNode(SourcePos p) : pos(p) { }
+    virtual ~ASTNode();
+
+    /** The Optimize() method should perform any appropriate early-stage
+        optimizations on the node (e.g. constant folding).  The caller
+        should use the returned ASTNode * in place of the original node.
+        This method may return NULL if an error is encountered during
+        optimization. */
+    virtual ASTNode *Optimize() = 0;
+
+    /** Type checking should be performed by the node when this method is
+        called.  In the event of an error, a NULL value may be returned.
+        As with ASTNode::Optimize(), the caller should store the returned
+        pointer in place of the original ASTNode *. */
+    virtual ASTNode *TypeCheck() = 0;
+    /** Returns an integer cost estimate for the node; presumably on the scale of the COST_* values in this header (confirm). */
+    virtual int EstimateCost() const = 0;
+
+    /** All AST nodes must track the file position where they are
+        defined. */
+    const SourcePos pos;
+};

 /** @brief Structure that defines a compilation target 

@@ -161,28 +185,13 @@ struct Target {
    /** Returns the LLVM TargetMachine object corresponding to this
        target. */
    llvm::TargetMachine *GetTargetMachine() const;
-    
-    /** Returns a string like "avx" encoding the target. */
-    const char *GetISAString() const;
-
-    /** Returns the size of the given type */
-    llvm::Value *SizeOf(LLVM_TYPE_CONST llvm::Type *type);
-    /** Given a structure type and an element number in the structure,
-        returns a value corresponding to the number of bytes from the start
-        of the structure where the element is located. */
-    llvm::Value *StructOffset(LLVM_TYPE_CONST llvm::Type *type,
-                              int element);

    /** llvm Target object representing this target. */
    const llvm::Target *target;

    /** Enumerator giving the instruction sets that the compiler can
-        target.  These should be ordered from "worse" to "better" in that
-        if a processor supports multiple target ISAs, then the most
-        flexible/performant of them will apear last in the enumerant.  Note
-        also that __best_available_isa() needs to be updated if ISAs are
-        added or the enumerant values are reordered.  */
-    enum ISA { SSE2, SSE4, AVX, NUM_ISAS };
+        target. */
+    enum ISA { SSE2, SSE4, AVX };

    /** Instruction set being compiled to. */
    ISA isa;
@@ -191,7 +200,7 @@ struct Target {
    std::string arch;

    /** Is the target architecture 32 or 64 bit */
-    bool is32Bit;
+    bool is32bit;

    /** Target CPU. (e.g. "corei7", "corei7-avx", ..) */
    std::string cpu;
@@ -241,22 +250,6 @@ struct Opt {
        it will make sense. */
    bool unrollLoops;

-    /** Indicates if addressing math will be done with 32-bit math, even on
-        64-bit systems.  (This is generally noticably more efficient,
-        though at the cost of addressing >2GB).
-     */ 
-    bool force32BitAddressing;
-
-    /** Indicates whether assert() statements should be ignored (for
-        performance in the generated code). */
-    bool disableAsserts;
-    
-    /** If enabled, the various __pseudo* memory ops (gather/scatter,
-        masked load/store) are left in their __pseudo* form, for better
-        understanding of the structure of generated code when reading
-        it. */
-    bool disableHandlePseudoMemoryOps;
-
    /** On targets that don't have a masked store instruction but do have a
        blending instruction, by default, we simulate masked stores by
        loading the old value, blending, and storing the result.  This can
@@ -348,13 +341,6 @@ struct Globals {
    /** Indicates whether all warning messages should be surpressed. */
    bool disableWarnings;

-    /** Indicates whether warnings should be issued as errors. */
-    bool warningsAsErrors;
-
-    /** Indicates whether line wrapping of error messages to the terminal
-        width should be disabled. */
-    bool disableLineWrap;
-
    /** Indicates whether additional warnings should be issued about
        possible performance pitfalls. */
    bool emitPerfWarnings;
@@ -368,10 +354,6 @@ struct Globals {
    /** Indicates whether ispc should generate debugging symbols for the
        program in its output. */
    bool generateDebuggingSymbols;
-   
-    /** If true, function names are mangled by appending the target ISA and
-        vector width to them. */
-    bool mangleFunctionsWithTarget;

    /** Global LLVMContext object */
    llvm::LLVMContext *ctx;
@@ -391,8 +373,6 @@ enum {
    COST_COMPLEX_ARITH_OP = 4,
    COST_DEREF = 4,
    COST_FUNCALL = 4,
-    COST_FUNPTR_UNIFORM = 12,
-    COST_FUNPTR_VARYING = 24,
    COST_GATHER = 8,
    COST_LOAD = 2,
    COST_REGULAR_BREAK_CONTINUE = 2,
@@ -400,14 +380,11 @@ enum {
    COST_SELECT = 4,
    COST_SIMPLE_ARITH_LOGIC_OP = 1,
    COST_SYNC = 32,
-    COST_TASK_LAUNCH = 32,
+    COST_TASK_LAUNCH = 16,
    COST_TYPECAST_COMPLEX = 4,
    COST_TYPECAST_SIMPLE = 1,
-    COST_UNIFORM_IF = 2,
-    COST_VARYING_IF = 3,
    COST_UNIFORM_LOOP = 4,
    COST_VARYING_LOOP = 6,
-    COST_ASSERT = 8,

    CHECK_MASK_AT_FUNCTION_START_COST = 16,
    PREDICATE_SAFE_IF_STATEMENT_COST = 6,
--- a/ispc.vcxproj
+++ b/ispc.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -11,35 +11,25 @@
    </ProjectConfiguration>
  </ItemGroup>
  <ItemGroup>
-    <ClCompile Include="ast.cpp" />
    <ClCompile Include="builtins.cpp" />
    <ClCompile Include="ctx.cpp" />
    <ClCompile Include="decl.cpp" />
    <ClCompile Include="expr.cpp" />
-    <ClCompile Include="func.cpp" />
    <ClCompile Include="gen-bitcode-avx.cpp" />
    <ClCompile Include="gen-bitcode-avx-x2.cpp" />
    <ClCompile Include="gen-bitcode-c-32.cpp" />
    <ClCompile Include="gen-bitcode-c-64.cpp" />
-    <ClCompile Include="gen-bitcode-dispatch.cpp" />
    <ClCompile Include="gen-bitcode-sse2.cpp" />
-    <ClCompile Include="gen-bitcode-sse2-x2.cpp" />
    <ClCompile Include="gen-bitcode-sse4.cpp" />
-    <ClCompile Include="gen-bitcode-sse4-x2.cpp" />
+    <ClCompile Include="gen-bitcode-sse4x2.cpp" />
    <ClCompile Include="gen-stdlib.cpp" />
    <ClCompile Include="ispc.cpp" />
-    <ClCompile Include="lex.cc">
-      <DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">4146;4800;4996;4355;4624;4005;4003;4018</DisableSpecificWarnings>
-      <DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">4146;4800;4996;4355;4624;4005;4003;4018</DisableSpecificWarnings>
-    </ClCompile>
+    <ClCompile Include="lex.cc" />
    <ClCompile Include="llvmutil.cpp" />
    <ClCompile Include="module.cpp" />
    <ClCompile Include="main.cpp" />
    <ClCompile Include="opt.cpp" />
-    <ClCompile Include="parse.cc">
-      <DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">4146;4800;4996;4355;4624;4005;4065</DisableSpecificWarnings>
-      <DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">4146;4800;4996;4355;4624;4005;4065</DisableSpecificWarnings>
-    </ClCompile>
+    <ClCompile Include="parse.cc" />
    <CustomBuild Include="builtins-c.c">
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c &gt; gen-bitcode-c-32.cpp;
 %LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c &gt; gen-bitcode-c-64.cpp</Command>
@@ -56,12 +46,10 @@
    <ClCompile Include="util.cpp" />
  </ItemGroup>
  <ItemGroup>
-    <ClInclude Include="ast.h" />
    <ClInclude Include="builtins.h" />
    <ClInclude Include="ctx.h" />
    <ClInclude Include="decl.h" />
    <ClInclude Include="expr.h" />
-    <ClInclude Include="func.h" />
    <ClInclude Include="ispc.h" />
    <ClInclude Include="llvmutil.h" />
    <ClInclude Include="module.h" />
@@ -88,38 +76,25 @@
      <FileType>Document</FileType>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll &gt; gen-bitcode-sse4.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse4-common.ll</AdditionalInputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll &gt; gen-bitcode-sse4.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse4-common.ll</AdditionalInputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4.cpp</Message>
      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
-    <CustomBuild Include="builtins-dispatch.ll">
+    <CustomBuild Include="builtins-sse4x2.ll">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-dispatch.ll | python bitcode2cpp.py builtins-dispatch.ll &gt; gen-bitcode-dispatch.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-dispatch.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4</AdditionalInputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-dispatch.ll | python bitcode2cpp.py builtins-dispatch.ll &gt; gen-bitcode-dispatch.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-dispatch.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4</AdditionalInputs>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-dispatch.cpp</Message>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-dispatch.cpp</Message>
-    </CustomBuild>
-  </ItemGroup>
-  <ItemGroup>
-    <CustomBuild Include="builtins-sse4-x2.ll">
-      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll &gt; gen-bitcode-sse4-x2.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4-x2.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse4-common.ll</AdditionalInputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll &gt; gen-bitcode-sse4-x2.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4-x2.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse4-common.ll</AdditionalInputs>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4-x2.cpp</Message>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4-x2.cpp</Message>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4x2.ll | python bitcode2cpp.py builtins-sse4x2.ll &gt; gen-bitcode-sse4x2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4x2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4x2.ll | python bitcode2cpp.py builtins-sse4x2.ll &gt; gen-bitcode-sse4x2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4x2.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4x2.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4x2.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
@@ -127,36 +102,23 @@
      <FileType>Document</FileType>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll &gt; gen-bitcode-sse2.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse2.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse2-common.ll</AdditionalInputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll &gt; gen-bitcode-sse2.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse2.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse2-common.ll</AdditionalInputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse2.cpp</Message>
      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse2.cpp</Message>
    </CustomBuild>
  </ItemGroup>
-  <ItemGroup>
-    <CustomBuild Include="builtins-sse2-x2.ll">
-      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse2-x2.ll | python bitcode2cpp.py builtins-sse2-x2.ll &gt; gen-bitcode-sse2-x2.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse2-x2.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse2-common.ll</AdditionalInputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse2-x2.ll | python bitcode2cpp.py builtins-sse2-x2.ll &gt; gen-bitcode-sse2-x2.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse2-x2.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse2-common.ll</AdditionalInputs>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse2-x2.cpp</Message>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse2-x2.cpp</Message>
-    </CustomBuild>
-  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="builtins-avx.ll">
      <FileType>Document</FileType>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll &gt; gen-bitcode-avx.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-avx.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-avx-common.ll</AdditionalInputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll &gt; gen-bitcode-avx.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-avx.cpp</Outputs>
-      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-avx-common.ll</AdditionalInputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-avx.cpp</Message>
      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx.cpp</Message>
    </CustomBuild>
@@ -268,4 +230,4 @@
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
-</Project>
+</Project>
--- a/ispc_test.cpp
+++ b/ispc_test.cpp
@@ -74,7 +74,7 @@ extern "C" {
 #include <llvm/DerivedTypes.h>
 #include <llvm/Instructions.h>
 #include <llvm/ExecutionEngine/ExecutionEngine.h>
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
  #include <llvm/Support/TargetRegistry.h>
  #include <llvm/Support/TargetSelect.h>
 #else
@@ -120,7 +120,7 @@ void *ISPCAlloc(void **handle, int64_t size, int32_t alignment) {
    *handle = (void *)0xdeadbeef;
    // leak time!
 #ifdef ISPC_IS_WINDOWS
-    return _aligned_malloc((size_t)size, alignment);
+    return _aligned_malloc((size_t)size, alignment); // size is int64_t; narrow explicitly to size_t
 #endif
 #ifdef ISPC_IS_LINUX
    return memalign(alignment, size);
@@ -182,7 +182,7 @@ static bool lRunTest(const char *fn) {
    }

    std::string eeError;
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
    llvm::EngineBuilder engineBuilder(module);
    engineBuilder.setErrorStr(&eeError);
    engineBuilder.setEngineKind(llvm::EngineKind::JIT);
@@ -361,7 +361,7 @@ static bool lRunTest(const char *fn) {

 int main(int argc, char *argv[]) {
    llvm::InitializeNativeTarget();
-#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
    LLVMLinkInJIT();
 #endif

--- a/ispc_test.vcxproj
+++ b/ispc_test.vcxproj
@@ -54,7 +54,6 @@
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>LLVM_3_0;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
-      <DisableSpecificWarnings>4146;4355;4800</DisableSpecificWarnings>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -73,7 +72,6 @@
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>LLVM_3_0;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
-      <DisableSpecificWarnings>4146;4355;4800</DisableSpecificWarnings>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
--- a/lex.ll
+++ b/lex.ll
@@ -34,13 +34,13 @@
 %{

 #include "ispc.h"
+#include "decl.h"
 #include "sym.h"
 #include "util.h"
 #include "module.h"
 #include "type.h"
 #include "parse.hh"
 #include <stdlib.h>
-#include <stdint.h>

 static uint64_t lParseBinary(const char *ptr, SourcePos pos);
 static void lCComment(SourcePos *);
@@ -78,7 +78,6 @@ ZO_SWIZZLE ([01]+[w-z]+)+|([01]+[rgba]+)+|([01]+[uv]+)+
 "/*"            { lCComment(yylloc); }
 "//"            { lCppComment(yylloc); }

-__assert { return TOKEN_ASSERT; }
 bool { return TOKEN_BOOL; }
 break { return TOKEN_BREAK; }
 case { return TOKEN_CASE; }
@@ -86,6 +85,7 @@ cbreak { return TOKEN_CBREAK; }
 ccontinue { return TOKEN_CCONTINUE; }
 cdo { return TOKEN_CDO; }
 cfor { return TOKEN_CFOR; }
+char { return TOKEN_CHAR; }
 cif { return TOKEN_CIF; }
 cwhile { return TOKEN_CWHILE; }
 const { return TOKEN_CONST; }
@@ -101,8 +101,6 @@ extern { return TOKEN_EXTERN; }
 false { return TOKEN_FALSE; }
 float { return TOKEN_FLOAT; }
 for { return TOKEN_FOR; }
-foreach { return TOKEN_FOREACH; }
-foreach_tiled { return TOKEN_FOREACH_TILED; }
 goto { return TOKEN_GOTO; }
 if { return TOKEN_IF; }
 inline { return TOKEN_INLINE; }
@@ -112,15 +110,10 @@ int16 { return TOKEN_INT16; }
 int32 { return TOKEN_INT; }
 int64 { return TOKEN_INT64; }
 launch { return TOKEN_LAUNCH; }
-NULL { return TOKEN_NULL; }
 print { return TOKEN_PRINT; }
-reference { Error(*yylloc, "\"reference\" qualifier is no longer supported; "
-                           "please use C++-style '&' syntax for references "
-                           "instead."); }
+reference { return TOKEN_REFERENCE; }
 return { return TOKEN_RETURN; }
 soa { return TOKEN_SOA; }
-signed { return TOKEN_SIGNED; }
-sizeof { return TOKEN_SIZEOF; }
 static { return TOKEN_STATIC; }
 struct { return TOKEN_STRUCT; }
 switch { return TOKEN_SWITCH; }
@@ -133,8 +126,6 @@ unsigned { return TOKEN_UNSIGNED; }
 varying { return TOKEN_VARYING; }
 void { return TOKEN_VOID; }
 while { return TOKEN_WHILE; }
-\"C\" { return TOKEN_STRING_C_LITERAL; }
-\.\.\. { return TOKEN_DOTDOTDOT; }

 L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL; }

@@ -230,7 +221,6 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
 "&=" { return TOKEN_AND_ASSIGN; }
 "^=" { return TOKEN_XOR_ASSIGN; }
 "|=" { return TOKEN_OR_ASSIGN; }
-"->" { return TOKEN_PTR_OP; }
 ";"             { return ';'; }
 ("{"|"<%")      { return '{'; }
 ("}"|"%>")      { return '}'; }
@@ -274,6 +264,8 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;

 %%

+/*sizeof { return TOKEN_SIZEOF; }*/
+/*"->" { return TOKEN_PTR_OP; }*/
 /*short { return TOKEN_SHORT; }*/
 /*long { return TOKEN_LONG; }*/
 /*signed { return TOKEN_SIGNED; }*/
--- a/llvmutil.cpp
+++ b/llvmutil.cpp
@@ -40,7 +40,6 @@

 LLVM_TYPE_CONST llvm::Type *LLVMTypes::VoidType = NULL;
 LLVM_TYPE_CONST llvm::PointerType *LLVMTypes::VoidPointerType = NULL;
-LLVM_TYPE_CONST llvm::Type *LLVMTypes::PointerIntType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::BoolType = NULL;

 LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8Type = NULL;
@@ -75,7 +74,7 @@ LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64VectorPointerType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatVectorPointerType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoubleVectorPointerType = NULL;

-LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::VoidPointerVectorType = NULL;
+LLVM_TYPE_CONST llvm::ArrayType *LLVMTypes::VoidPointerVectorType = NULL;

 llvm::Constant *LLVMTrue = NULL;
 llvm::Constant *LLVMFalse = NULL;
@@ -87,8 +86,6 @@ void
 InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
    LLVMTypes::VoidType = llvm::Type::getVoidTy(*ctx);
    LLVMTypes::VoidPointerType = llvm::PointerType::get(llvm::Type::getInt8Ty(*ctx), 0);
-    LLVMTypes::PointerIntType = target.is32Bit ? llvm::Type::getInt32Ty(*ctx) :
-        llvm::Type::getInt64Ty(*ctx);

    LLVMTypes::BoolType = llvm::Type::getInt1Ty(*ctx);
    LLVMTypes::Int8Type = llvm::Type::getInt8Ty(*ctx);
@@ -133,8 +130,8 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
    LLVMTypes::FloatVectorPointerType = llvm::PointerType::get(LLVMTypes::FloatVectorType, 0);
    LLVMTypes::DoubleVectorPointerType = llvm::PointerType::get(LLVMTypes::DoubleVectorType, 0);

-    LLVMTypes::VoidPointerVectorType = g->target.is32Bit ? LLVMTypes::Int32VectorType :
-        LLVMTypes::Int64VectorType;
+    LLVMTypes::VoidPointerVectorType = 
+        llvm::ArrayType::get(LLVMTypes::VoidPointerType, target.vectorWidth);

    LLVMTrue = llvm::ConstantInt::getTrue(*ctx);
    LLVMFalse = llvm::ConstantInt::getFalse(*ctx);
@@ -454,3 +451,11 @@ LLVMBoolVector(const bool *bvec) {
    }
    return llvm::ConstantVector::get(vals);
 }
+
+
+LLVM_TYPE_CONST llvm::ArrayType *
+LLVMPointerVectorType(LLVM_TYPE_CONST llvm::Type *t) {
+    // NOTE: ArrayType, not VectorType
+    return llvm::ArrayType::get(llvm::PointerType::get(t, 0), 
+                                g->target.vectorWidth);
+}
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Matt Pharr	2f35bc1a0f	Release notes and doxygen bump for v1.0.10	2011-09-30 15:09:19 -07:00
Matt Pharr	1620e0508d	Added deferred shading workload	2011-09-30 15:09:04 -07:00