Doxygen bump and release notes for v1.0.11

Windows: fix examples build to look for ispc.exe in ../.. as well
Windows: fix some compiler warnings during build
2011-10-07 09:57:55 -07:00 · 2011-10-09 07:40:18 -07:00 · 2011-10-09 07:40:17 -07:00 · 2011-10-07 09:20:48 -07:00 · 2011-10-08 17:17:05 -07:00 · 2011-10-06 17:10:30 -07:00
127 changed files with 8329 additions and 3024 deletions
--- a/6
+++ b/6
@@ -44,13 +44,13 @@ YACC=bison -d -v -t

 ###########################################################################

-CXX_SRC=builtins.cpp ctx.cpp decl.cpp expr.cpp ispc.cpp \
+CXX_SRC=ast.cpp builtins.cpp ctx.cpp decl.cpp expr.cpp func.cpp ispc.cpp \
 	llvmutil.cpp main.cpp module.cpp opt.cpp stmt.cpp sym.cpp type.cpp \
 	util.cpp
-HEADERS=builtins.h ctx.h decl.h expr.h ispc.h llvmutil.h module.h \
+HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
 BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll \
-	builtins-sse4.ll builtins-sse4x2.ll
+	builtins-sse4.ll builtins-sse4-x2.ll builtins-dispatch.ll
 BISON_SRC=parse.yy
 FLEX_SRC=lex.ll

--- a/README.txt
+++ b/README.txt
@@ -15,8 +15,8 @@ code.

 ispc is an open source compiler under the BSD license; see the file
 LICENSE.txt.  ispc supports Windows, Mac, and Linux, with both x86 and
-x86-64 targets. It currently supports the SSE2 and SSE4 instruction sets,
-though support for AVX should be available soon.
+x86-64 targets.  It currently supports the SSE2, SSE4, and AVX instruction
+sets.

 For more information and examples, as well as a wiki and the bug database,
 see the ispc distribution site, http://ispc.github.com.
--- a/examples/cpuid.h
+++ b/examples/cpuid.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2010-2011, Intel Corporation
+  Copyright (c) 2011, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
@@ -31,36 +31,35 @@
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */

-#ifndef ISPC_CPUID_H
-#define ISPC_CPUID_H 1
+/** @file ast.cpp
+    @brief Implementations of the ASTNode destructor and the AST class methods.
+*/

-#ifdef _MSC_VER
-// Provides a __cpuid() function with same signature as below
-#include <intrin.h>
-#else
-static void __cpuid(int info[4], int infoType) {
-    __asm__ __volatile__ ("cpuid"
-                          : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
-                          : "0" (infoType));
-}
-#endif
+#include "ast.h"
+#include "decl.h"
+#include "func.h"
+#include "type.h"
+#include "sym.h"

-inline bool CPUSupportsSSE2() {
-    int info[4];
-    __cpuid(info, 1);
-    return (info[3] & (1 << 26)) != 0;
+///////////////////////////////////////////////////////////////////////////
+// ASTNode
+
+ASTNode::~ASTNode() {  // out-of-line definition of the virtual destructor; the base class has nothing to release
 }

-inline bool CPUSupportsSSE4() {
-    int info[4];
-    __cpuid(info, 1);
-    return (info[2] & (1 << 19)) != 0;
+
+///////////////////////////////////////////////////////////////////////////
+// AST
+
+void
+AST::AddFunction(DeclSpecs *ds, Declarator *decl, Stmt *code) {  // wraps the declaration info and body in a new Function
+    functions.push_back(new Function(ds, decl, code));  // heap-allocated; kept as a raw pointer in the functions vector
 }

-inline bool CPUSupportsAVX() {
-    int info[4];
-    __cpuid(info, 1);
-    return (info[2] & (1 << 28)) != 0;
+
+void
+AST::GenerateIR() {
+    for (unsigned int i = 0; i < functions.size(); ++i)  // visits functions in the order they were added
+        functions[i]->GenerateIR();  // each Function emits its own IR
 }

-#endif // ISPC_CPUID_H
--- a/ast.h
+++ b/ast.h
@@ -0,0 +1,93 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file ast.h
+    @brief Declarations of the ASTNode base class and the AST container of functions.
+*/
+
+#ifndef ISPC_AST_H
+#define ISPC_AST_H 1
+
+#include "ispc.h"
+#include <vector>
+
+/** @brief Abstract base class for nodes in the abstract syntax tree (AST).
+
+    This class defines a basic interface that all abstract syntax tree
+    (AST) nodes must implement.  The base classes for both expressions
+    (Expr) and statements (Stmt) inherit from this class.
+*/
+class ASTNode {
+public:
+    ASTNode(SourcePos p) : pos(p) { }  ///< Records the source position of the node.
+    virtual ~ASTNode();  ///< Virtual so that derived nodes can be destroyed through an ASTNode *.
+
+    /** The Optimize() method should perform any appropriate early-stage
+        optimizations on the node (e.g. constant folding).  The caller
+        should use the returned ASTNode * in place of the original node.
+        This method may return NULL if an error is encountered during
+        optimization. */
+    virtual ASTNode *Optimize() = 0;
+
+    /** Type checking should be performed by the node when this method is
+        called.  In the event of an error, a NULL value may be returned.
+        As with ASTNode::Optimize(), the caller should store the returned
+        pointer in place of the original ASTNode *. */
+    virtual ASTNode *TypeCheck() = 0;
+
+    virtual int EstimateCost() const = 0;  ///< NOTE(review): undocumented; presumably a relative cost estimate for the node -- scale not defined here, confirm.
+
+    /** All AST nodes must track the file position where they are
+        defined. */
+    const SourcePos pos;
+};
+
+
+/** Simple representation of the abstract syntax trees for all of the
+    functions declared in a compilation unit.
+ */
+class AST {
+public:
+    /** Add the AST for a function described by the given declaration
+        information and source code. */
+    void AddFunction(DeclSpecs *ds, Declarator *decl, Stmt *code);
+
+    /** Generate LLVM IR for all of the functions into the current
+        module. */
+    void GenerateIR();
+
+private:
+    std::vector<Function *> functions;  ///< Functions added so far, in insertion order; raw pointers, and no destructor is declared here to free them.
+};
+
+#endif // ISPC_AST_H
--- a/bitcode2cpp.py
+++ b/bitcode2cpp.py
@@ -4,6 +4,8 @@ import sys
 import string
 import re
 import subprocess
+import platform
+import os

 length=0

@@ -14,8 +16,12 @@ target = re.sub("\.ll$", "", target)
 target = re.sub("\.c$", "", target)
 target = re.sub("-", "_", target)

+llvm_as="llvm-as"  # default: found via PATH; also used on Windows/Cygwin when LLVM_INSTALL_DIR is unset
+if (platform.system() == 'Windows' or string.find(platform.system(), "CYGWIN_NT") != -1) and os.getenv("LLVM_INSTALL_DIR"):
+    llvm_as = os.getenv("LLVM_INSTALL_DIR").replace("\\", "/") + "/bin/" + llvm_as  # install dir with forward slashes
+
 try:
-    as_out=subprocess.Popen([ "llvm-as", "-", "-o", "-"], stdout=subprocess.PIPE)
+    as_out=subprocess.Popen([llvm_as, "-", "-o", "-"], stdout=subprocess.PIPE)
 except IOError:
    print >> sys.stderr, "Couldn't open " + src
    sys.exit(1)
--- a/buildall.bat
+++ b/buildall.bat
@@ -0,0 +1,16 @@
+@echo off
+
+REM If LLVM_INSTALL_DIR isn't set globally in your environment,
+REM it can be set here; a value that is already set is left untouched.
+if not defined LLVM_INSTALL_DIR set LLVM_INSTALL_DIR=c:\users\mmp\llvm-dev
+
+REM Both the LLVM binaries and python need to be in the path
+set path=%LLVM_INSTALL_DIR%\bin;%PATH%;c:\cygwin\bin
+REM Build ispc and ispc_test (Win32 Release)
+msbuild ispc.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release
+msbuild ispc_test.vcxproj /V:m /p:Platform=Win32 /p:Configuration=Release
+REM Rebuild the examples for x64 and Win32, Release and Debug
+msbuild examples\examples.sln /V:m /p:Platform=x64 /p:Configuration=Release /t:rebuild
+msbuild examples\examples.sln /V:m /p:Platform=x64 /p:Configuration=Debug /t:rebuild
+msbuild examples\examples.sln /V:m /p:Platform=Win32 /p:Configuration=Release /t:rebuild
+msbuild examples\examples.sln /V:m /p:Platform=Win32 /p:Configuration=Debug /t:rebuild
--- a/builtins-avx-x2.ll
+++ b/builtins-avx-x2.ll
@@ -232,8 +232,8 @@ define internal float @__reduce_add_float(<16 x float>) nounwind readonly always
  %v1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %va, <8 x float> %vb)
  %v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1)
  %v3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v2, <8 x float> %v2)
-  %scalar1 = extractelement <8 x float> %v2, i32 0
-  %scalar2 = extractelement <8 x float> %v2, i32 4
+  %scalar1 = extractelement <8 x float> %v3, i32 0
+  %scalar2 = extractelement <8 x float> %v3, i32 4
  %sum = fadd float %scalar1, %scalar2
  ret float %sum
 }
@@ -316,7 +316,9 @@ define internal double @__reduce_add_double(<16 x double>) nounwind readonly alw

  %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %vab, <4 x double> %vcd)
  %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
-  %sum = extractelement <4 x double> %sum1, i32 0
+  %final0 = extractelement <4 x double> %sum1, i32 0
+  %final1 = extractelement <4 x double> %sum1, i32 2
+  %sum = fadd double %final0, %final1
  ret double %sum
 }

@@ -521,35 +523,104 @@ define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>,
 }


-;; FIXME: various code elsewhere in the builtins implementations makes
-;; calls to the 32/64 bit versions of these, basically assuming that doing
-;; so is faster than doing a full call to an actual masked store, which
-;; isn't likely to be the case on AVX.  So here we provide those functions
-;; but then don't actually do what the caller asked for...
+masked_store_blend_8_16_by_16()

-declare void @llvm.trap()
-
-define void @__masked_store_blend_8(<8 x i8>* nocapture, <8 x i8>, 
-                                    <8 x i32>) nounwind alwaysinline {
-  call void @llvm.trap()
-  ret void
-}
-
-define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>, 
-                                     <8 x i32>) nounwind alwaysinline {
-  call void @llvm.trap()
-  ret void
-}
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
+                                                <8 x float>) nounwind readnone

 define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>, 
                                     <16 x i32>) nounwind alwaysinline {
-  call void @__masked_store_32(<16 x i32> * %0, <16 x i32> %1, <16 x i32> %2)
+  %maskAsFloat = bitcast <16 x i32> %2 to <16 x float>  ; %2 is the mask, viewed as floats for blendv
+  %oldValue = load <16 x i32>* %0, align 4  ; current memory contents at the destination
+  %oldAsFloat = bitcast <16 x i32> %oldValue to <16 x float>
+  %newAsFloat = bitcast <16 x i32> %1 to <16 x float>  ; %1 holds the values to be stored
+ 
+  %old0 = shufflevector <16 x float> %oldAsFloat, <16 x float> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>  ; lanes 0-7
+  %old1 = shufflevector <16 x float> %oldAsFloat, <16 x float> undef,
+        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>  ; lanes 8-15
+  %new0 = shufflevector <16 x float> %newAsFloat, <16 x float> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %new1 = shufflevector <16 x float> %newAsFloat, <16 x float> undef,
+        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %mask0 = shufflevector <16 x float> %maskAsFloat, <16 x float> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %mask1 = shufflevector <16 x float> %maskAsFloat, <16 x float> undef,
+        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  %blend0 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old0,
+                                                         <8 x float> %new0,
+                                                         <8 x float> %mask0)  ; new value where the mask sign bit is set, else old
+  %blend1 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old1,
+                                                         <8 x float> %new1,
+                                                         <8 x float> %mask1)
+  %blend = shufflevector <8 x float> %blend0, <8 x float> %blend1,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>  ; rejoin the two halves
+  %blendAsInt = bitcast <16 x float> %blend to <16 x i32>
+  store <16 x i32> %blendAsInt, <16 x i32>* %0, align 4  ; write all 16 lanes back; unmasked lanes keep their old value
   ret void
 }

-define void @__masked_store_blend_64(<16 x i64>* nocapture, <16 x i64>, 
-                                     <16 x i32>) nounwind alwaysinline {
-  call void @__masked_store_64(<16 x i64> * %0, <16 x i64> %1, <16 x i32> %2)
+
+declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
+                                                 <4 x double>) nounwind readnone
+
+define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64, 
+                                     <16 x i32> %mask) nounwind alwaysinline {
+  %oldValue = load <16 x i64>* %ptr, align 8  ; current memory contents at the destination
+  %old = bitcast <16 x i64> %oldValue to <16 x double>
+  %old0d = shufflevector <16 x double> %old, <16 x double> undef,
+     <4 x i32> <i32 0, i32 1, i32 2, i32 3>  ; split into four 4-wide pieces for the 256-bit blend
+  %old1d = shufflevector <16 x double> %old, <16 x double> undef,
+     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %old2d = shufflevector <16 x double> %old, <16 x double> undef,
+     <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %old3d = shufflevector <16 x double> %old, <16 x double> undef,
+     <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+
+  %new = bitcast <16 x i64> %newi64 to <16 x double>
+  %new0d = shufflevector <16 x double> %new, <16 x double> undef,
+     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %new1d = shufflevector <16 x double> %new, <16 x double> undef,
+     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %new2d = shufflevector <16 x double> %new, <16 x double> undef,
+     <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %new3d = shufflevector <16 x double> %new, <16 x double> undef,
+     <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+
+  %mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>  ; each 32-bit mask lane is duplicated to cover one 64-bit element
+  %mask1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+  %mask2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
+  %mask3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
+     <8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
+  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
+  %mask1d = bitcast <8 x i32> %mask1 to <4 x double>
+  %mask2d = bitcast <8 x i32> %mask2 to <4 x double>
+  %mask3d = bitcast <8 x i32> %mask3 to <4 x double>
+
+  %result0d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old0d,
+                                 <4 x double> %new0d, <4 x double> %mask0d)  ; new value where the mask sign bit is set, else old
+  %result1d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old1d,
+                                 <4 x double> %new1d, <4 x double> %mask1d)
+  %result2d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old2d,
+                                 <4 x double> %new2d, <4 x double> %mask2d)
+  %result3d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old3d,
+                                 <4 x double> %new3d, <4 x double> %mask3d)
+
+  %result01 = shufflevector <4 x double> %result0d, <4 x double> %result1d,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>  ; rejoin the pieces pairwise
+  %result23 = shufflevector <4 x double> %result2d, <4 x double> %result3d,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+  %result = shufflevector <8 x double> %result01, <8 x double> %result23,
+           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %result64 = bitcast <16 x double> %result to <16 x i64>
+  store <16 x i64> %result64, <16 x i64> * %ptr, align 8  ; same alignment as the load; with no align the store would assume the vector ABI alignment
  ret void
 }

--- a/builtins-avx.ll
+++ b/builtins-avx.ll
@@ -294,10 +294,12 @@ define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwa
                      <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v1 = shufflevector <8 x double> %0, <8 x double> undef,
                      <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %sum01 = fadd <4 x double> %v0, %v1
-  %red0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum01, <4 x double> %sum01)
-  %red1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %red0, <4 x double> %red0)
-  %sum = extractelement <4 x double> %red1, i32 0
+  %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
+  %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
+  %final0 = extractelement <4 x double> %sum1, i32 0
+  %final1 = extractelement <4 x double> %sum1, i32 2
+  %sum = fadd double %final0, %final1
+
  ret double %sum
 }

@@ -448,38 +450,74 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
 }


-;; FIXME: various code elsewhere in the builtins implementations makes
-;; calls to the 32/64 bit versions of these, basically assuming that doing
-;; so is faster than doing a full call to an actual masked store, which
-;; isn't likely to be the case on AVX.  So here we provide those functions
-;; but then don't actually do what the caller asked for...

-declare void @llvm.trap()
-
-define void @__masked_store_blend_8(<8 x i8>* nocapture, <8 x i8>, 
-                                    <8 x i32>) nounwind alwaysinline {
-  call void @llvm.trap()
-  ret void
-}
-
-
-define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>, 
-                                     <8 x i32>) nounwind alwaysinline {
-  call void @llvm.trap()
-  ret void
-}
+masked_store_blend_8_16_by_8()

+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
+                                                <8 x float>) nounwind readnone

 define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>, 
                                     <8 x i32>) nounwind alwaysinline {
-  call void @__masked_store_32(<8 x i32> * %0, <8 x i32> %1, <8 x i32> %2)
+  %mask_as_float = bitcast <8 x i32> %2 to <8 x float>  ; %2 is the mask, viewed as floats for blendv
+  %oldValue = load <8 x i32>* %0, align 4  ; current memory contents at the destination
+  %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
+  %newAsFloat = bitcast <8 x i32> %1 to <8 x float>  ; %1 holds the values to be stored
+  %blend = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %oldAsFloat,
+                                                        <8 x float> %newAsFloat,
+                                                        <8 x float> %mask_as_float)  ; new value where the mask sign bit is set, else old
+  %blendAsInt = bitcast <8 x float> %blend to <8 x i32>
+  store <8 x i32> %blendAsInt, <8 x i32>* %0, align 4  ; write all 8 lanes back; unmasked lanes keep their old value
   ret void
 }


-define void @__masked_store_blend_64(<8 x i64>* nocapture, <8 x i64>, 
-                                     <8 x i32>) nounwind alwaysinline {
-  call void @__masked_store_64(<8 x i64> * %0, <8 x i64> %1, <8 x i32> %2)
+define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new, 
+                                     <8 x i32> %i32mask) nounwind alwaysinline {
+  %oldValue = load <8 x i64>* %ptr, align 8  ; current memory contents at the destination
+  %mask = bitcast <8 x i32> %i32mask to <8 x float>  ; float view of the mask for blendv.ps
+
+  ; Do 4x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values
+  ; are actually bitcast <4 x i64> values
+  ;
+  ; set up the first four 64-bit values
+  %old01  = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
+                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %old01f = bitcast <4 x i64> %old01 to <8 x float>
+  %new01  = shufflevector <8 x i64> %new, <8 x i64> undef,
+                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %new01f = bitcast <4 x i64> %new01 to <8 x float>
+  ; compute mask--note that the indices are all doubled-up
+  %mask01 = shufflevector <8 x float> %mask, <8 x float> undef,
+                          <8 x i32> <i32 0, i32 0, i32 1, i32 1,
+                                     i32 2, i32 2, i32 3, i32 3>  ; one mask lane per 32-bit half of each 64-bit element
+  ; and blend them
+  %result01f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old01f,
+                                                            <8 x float> %new01f,
+                                                            <8 x float> %mask01)
+  %result01 = bitcast <8 x float> %result01f to <4 x i64>
+
+  ; and again
+  %old23  = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
+                          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %old23f = bitcast <4 x i64> %old23 to <8 x float>
+  %new23  = shufflevector <8 x i64> %new, <8 x i64> undef,
+                          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %new23f = bitcast <4 x i64> %new23 to <8 x float>
+  ; compute mask--note that the values are doubled-up...
+  %mask23 = shufflevector <8 x float> %mask, <8 x float> undef,
+                          <8 x i32> <i32 4, i32 4, i32 5, i32 5,
+                                     i32 6, i32 6, i32 7, i32 7>
+  ; and blend them
+  %result23f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old23f,
+                                                            <8 x float> %new23f,
+                                                            <8 x float> %mask23)
+  %result23 = bitcast <8 x float> %result23f to <4 x i64>
+
+  ; reconstruct the final <8 x i64> vector
+  %final = shufflevector <4 x i64> %result01, <4 x i64> %result23,
+                         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                                    i32 4, i32 5, i32 6, i32 7>
+  store <8 x i64> %final, <8 x i64> * %ptr, align 8  ; write all 8 elements back; unmasked elements keep their old value
  ret void
 }

--- a/builtins-c.c
+++ b/builtins-c.c
@@ -51,8 +51,13 @@
  */


+#ifndef _MSC_VER
+#include <unistd.h>
+#endif // !_MSC_VER
+
 #include <stdint.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <stdarg.h>

 typedef int Bool;
@@ -139,3 +144,28 @@ void __do_print(const char *format, const char *types, int width, int mask,
    }
    fflush(stdout);
 }
+
+
+int __num_cores() {  // returns the processor count reported by the operating system
+#ifdef _MSC_VER
+    // This is quite a hack.  Including all of windows.h to get this definition
+    // pulls in a bunch of stuff that leads to undefined symbols at link time.
+    // So we don't #include <windows.h> but instead have the equivalent declarations
+    // here.  Presumably this struct declaration won't be changing in the future
+    // anyway...
+    struct SYSTEM_INFO {  // NOTE(review): hand-written stand-in; field sizes/order must match the real Win32 SYSTEM_INFO on 32- and 64-bit -- verify
+        int pad0[2];
+        void *pad1[2];
+        int *pad2;
+        int dwNumberOfProcessors;  // the only field this function reads
+        int pad3[3];
+    };
+
+    struct SYSTEM_INFO sysInfo;
+    extern void __stdcall GetSystemInfo(struct SYSTEM_INFO *);  // declared locally instead of via windows.h
+    GetSystemInfo(&sysInfo);
+    return sysInfo.dwNumberOfProcessors;
+#else
+    return sysconf(_SC_NPROCESSORS_ONLN);  // sysconf returns long (narrowed to int here); an error value of -1 is passed through unchanged
+#endif // !_MSC_VER
+}
--- a/builtins-dispatch.ll
+++ b/builtins-dispatch.ll
@@ -0,0 +1,123 @@
+;;  Copyright (c) 2011, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+
+;; This file defines various functions that are used when generating
+;; the "dispatch" object/assembly file that has entrypoints for each
+;; exported function in a module that dispatch to the best available
+;; variant of that function that will run on the system's CPU.
+
+;; Stores the best target ISA that the system on which we're actually
+;; running supports.  -1 represents "uninitialized", otherwise this value
+;; should correspond to one of the enumerant values of Target::ISA from
+;; ispc.h.
+
+@__system_best_isa = internal global i32 -1
+
+declare void @abort() noreturn
+
+;; The below is the result of running "clang -O2 -emit-llvm -c -o -" on the
+;; following code...  Specifically, __get_system_isa should return a value
+;; corresponding to one of the Target::ISA enumerant values that gives the
+;; most capable ISA that the current system can run.
+;;
+;; #ifdef _MSC_VER
+;; extern void __stdcall __cpuid(int info[4], int infoType);
+;; #else
+;; static void __cpuid(int info[4], int infoType) {
+;;     __asm__ __volatile__ ("cpuid"
+;;                           : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
+;;                           : "0" (infoType));
+;; }
+;; #endif
+;; 
+;; int32_t __get_system_isa() {
+;;     int info[4];
+;;     __cpuid(info, 1);
+;;     /* NOTE: the values returned below must be the same as the
+;;        corresponding enumerant values in Target::ISA. */
+;;     if ((info[2] & (1 << 28)) != 0)
+;;         return 2; // AVX
+;;     else if ((info[2] & (1 << 19)) != 0)
+;;         return 1; // SSE4
+;;     else if ((info[3] & (1 << 26)) != 0)
+;;         return 0; // SSE2
+;;     else
+;;         abort();
+;; }
+
+%0 = type { i32, i32, i32, i32 }
+
+define internal i32 @__get_system_isa() nounwind ssp {  ; returns 2 for AVX, 1 for SSE4, 0 for SSE2; calls abort if none is reported
+  %1 = tail call %0 asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
+  %2 = extractvalue %0 %1, 2  ; cx output of cpuid
+  %3 = extractvalue %0 %1, 3  ; dx output of cpuid
+  %4 = and i32 %2, 268435456  ; 1 << 28, the AVX test.  NOTE(review): only this CPUID bit is tested; no check that the OS has enabled AVX state is visible here
+  %5 = icmp eq i32 %4, 0
+  br i1 %5, label %6, label %13
+
+; <label>:6                                       ; preds = %0
+  %7 = and i32 %2, 524288  ; 1 << 19, the SSE4 test
+  %8 = icmp eq i32 %7, 0
+  br i1 %8, label %9, label %13
+
+; <label>:9                                       ; preds = %6
+  %10 = and i32 %3, 67108864  ; 1 << 26, the SSE2 test
+  %11 = icmp eq i32 %10, 0
+  br i1 %11, label %12, label %13
+
+; <label>:12                                      ; preds = %9
+  tail call void @abort() noreturn nounwind
+  unreachable
+
+; <label>:13                                      ; preds = %9, %6, %0
+  %.0 = phi i32 [ 2, %0 ], [ 1, %6 ], [ 0, %9 ]  ; value depends on which test succeeded first
+  ret i32 %.0
+}
+
+
+;; This function is called by each of the dispatch functions we generate;
+;; it sets @__system_best_isa if it is unset.
+
+define internal void @__set_system_isa() {  ; fills in @__system_best_isa on first use; later calls only read it
+entry:
+  %bi = load i32* @__system_best_isa
+  %unset = icmp eq i32 %bi, -1  ; -1 is the initial value of the global, meaning not yet determined
+  br i1 %unset, label %set_system_isa, label %done
+
+set_system_isa:
+  %bival = call i32 @__get_system_isa()
+  store i32 %bival, i32* @__system_best_isa  ; plain load and store; no atomic ordering is specified
+  ret void
+
+done:
+  ret void
+}
+
--- a/builtins-sse4-x2.ll
+++ b/builtins-sse4-x2.ll
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -163,7 +163,9 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
                                                      intAsUnsigned);
            if (eltType == NULL)
                return NULL;
-            return new ReferenceType(new ArrayType(eltType, at->getNumElements()),
+            // FIXME: this needs to be fixed when arrays can have 
+            // over 4G elements...
+            return new ReferenceType(new ArrayType(eltType, (int)at->getNumElements()),
                                     false);
        }
    }
@@ -336,8 +338,8 @@ lCheckModuleIntrinsics(llvm::Module *module) {
    @param module      Module to link the bitcode into
    @param symbolTable Symbol table to add definitions to
 */
-static void
-lAddBitcode(const unsigned char *bitcode, int length,
+void
+AddBitcodeToModule(const unsigned char *bitcode, int length,
                   llvm::Module *module, SymbolTable *symbolTable) {
    std::string bcErr;
    llvm::StringRef sb = llvm::StringRef((char *)bitcode, length);
@@ -365,6 +367,7 @@ lAddBitcode(const unsigned char *bitcode, int length,
        std::string(linkError);
        if (llvm::Linker::LinkModules(module, bcModule, &linkError))
            Error(SourcePos(), "Error linking stdlib bitcode: %s", linkError.c_str());
+        if (symbolTable != NULL)
            lAddModuleSymbols(module, symbolTable);
        lCheckModuleIntrinsics(module);
    }
@@ -377,8 +380,8 @@ lAddBitcode(const unsigned char *bitcode, int length,
 static void
 lDefineConstantInt(const char *name, int val, llvm::Module *module,
                   SymbolTable *symbolTable) {
-    Symbol *pw = new Symbol(name, SourcePos(), AtomicType::UniformConstInt32);
-    pw->isStatic = true;
+    Symbol *pw = new Symbol(name, SourcePos(), AtomicType::UniformConstInt32,
+                            SC_STATIC);
    pw->constValue = new ConstExpr(pw->type, val, SourcePos());
    LLVM_TYPE_CONST llvm::Type *ltype = LLVMTypes::Int32Type;
    llvm::Constant *linit = LLVMInt32(val);
@@ -389,11 +392,30 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
 }


+/** Gives the already-declared, argument-less function named by name a body that returns val, and registers a static symbol for it. */
+static void
+lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
+                       SymbolTable *symbolTable) {
+    std::vector<const Type *> args;  // left empty: the function takes no parameters
+    FunctionType *ft = new FunctionType(AtomicType::UniformInt32, args, SourcePos());
+    Symbol *sym = new Symbol(name, SourcePos(), ft, SC_STATIC);
+
+    llvm::Function *func = module->getFunction(name);
+    assert(func != NULL); // it should be declared already...
+    func->addFnAttr(llvm::Attribute::AlwaysInline);
+    llvm::BasicBlock *bblock = llvm::BasicBlock::Create(*g->ctx, "entry", func, 0);
+    llvm::ReturnInst::Create(*g->ctx, LLVMInt32(val), bblock);  // the whole body is a single return of the constant
+
+    sym->function = func;
+    symbolTable->AddVariable(sym);  // NOTE(review): a function-typed symbol is registered through AddVariable -- confirm this is intended
+}
+
+
+
 static void
 lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
    Symbol *pidx = new Symbol("programIndex", SourcePos(), 
-                              AtomicType::VaryingConstInt32);
-    pidx->isStatic = true;
+                              AtomicType::VaryingConstInt32, SC_STATIC);

    int pi[ISPC_MAX_NVEC];
    for (int i = 0; i < g->target.vectorWidth; ++i)
@@ -416,13 +438,13 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
    if (g->target.is32bit) {
        extern unsigned char builtins_bitcode_c_32[];
        extern int builtins_bitcode_c_32_length;
-        lAddBitcode(builtins_bitcode_c_32, builtins_bitcode_c_32_length, 
+        AddBitcodeToModule(builtins_bitcode_c_32, builtins_bitcode_c_32_length, 
                           module, symbolTable);
    }
    else {
        extern unsigned char builtins_bitcode_c_64[];
        extern int builtins_bitcode_c_64_length;
-        lAddBitcode(builtins_bitcode_c_64, builtins_bitcode_c_64_length, 
+        AddBitcodeToModule(builtins_bitcode_c_64, builtins_bitcode_c_64_length, 
                           module, symbolTable);
    }

@@ -432,21 +454,21 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
    case Target::SSE2:
        extern unsigned char builtins_bitcode_sse2[];
        extern int builtins_bitcode_sse2_length;
-        lAddBitcode(builtins_bitcode_sse2, builtins_bitcode_sse2_length, module,
-                    symbolTable);
+        AddBitcodeToModule(builtins_bitcode_sse2, builtins_bitcode_sse2_length, 
+                           module, symbolTable);
        break;
    case Target::SSE4:
        extern unsigned char builtins_bitcode_sse4[];
        extern int builtins_bitcode_sse4_length;
-        extern unsigned char builtins_bitcode_sse4x2[];
-        extern int builtins_bitcode_sse4x2_length;
+        extern unsigned char builtins_bitcode_sse4_x2[];
+        extern int builtins_bitcode_sse4_x2_length;
        switch (g->target.vectorWidth) {
        case 4: 
-            lAddBitcode(builtins_bitcode_sse4, builtins_bitcode_sse4_length, 
+            AddBitcodeToModule(builtins_bitcode_sse4, builtins_bitcode_sse4_length, 
                               module, symbolTable);
            break;
        case 8:
-            lAddBitcode(builtins_bitcode_sse4x2, builtins_bitcode_sse4x2_length, 
+            AddBitcodeToModule(builtins_bitcode_sse4_x2, builtins_bitcode_sse4_x2_length, 
                               module, symbolTable);
            break;
        default:
@@ -458,13 +480,13 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
        case 8:
            extern unsigned char builtins_bitcode_avx[];
            extern int builtins_bitcode_avx_length;
-            lAddBitcode(builtins_bitcode_avx, builtins_bitcode_avx_length, module, 
-                        symbolTable);
+            AddBitcodeToModule(builtins_bitcode_avx, builtins_bitcode_avx_length, 
+                               module, symbolTable);
            break;
        case 16:
            extern unsigned char builtins_bitcode_avx_x2[];
            extern int builtins_bitcode_avx_x2_length;
-            lAddBitcode(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
+            AddBitcodeToModule(builtins_bitcode_avx_x2, builtins_bitcode_avx_x2_length,
                               module,  symbolTable);
            break;
        default:
@@ -492,6 +514,8 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
                       symbolTable);
    lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module,
                       symbolTable);
+    lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
+                           symbolTable);

    if (includeStdlibISPC) {
        // If the user wants the standard library to be included, parse the
@@ -499,11 +523,8 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
        // definitions added.  Disable emission of performance warnings for
        // now, since the user doesn't care about any of that in the stdlib
        // implementation...
-        bool epf = g->emitPerfWarnings;
-        g->emitPerfWarnings = false;
        extern char stdlib_code[];
        yy_scan_string(stdlib_code);
        yyparse();
-        g->emitPerfWarnings = epf;
    }
 }
--- a/builtins.h
+++ b/builtins.h
@@ -55,4 +55,7 @@
 void DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
                  bool includeStdlib);

+void AddBitcodeToModule(const unsigned char *bitcode, int length,
+                        llvm::Module *module, SymbolTable *symbolTable = NULL);
+
 #endif // ISPC_STDLIB_H
--- a/builtins.m4
+++ b/builtins.m4
@@ -622,40 +622,6 @@ forloop(i, 1, eval($1-1), `
 }
 ')

-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; global_atomic
-;; Defines the implementation of a function that handles the mapping from
-;; an ispc atomic function to the underlying LLVM intrinsics.  Specifically,
-;; the function handles loooping over the active lanes, calling the underlying
-;; scalar atomic intrinsic for each one, and assembling the vector result.
-;;
-;; Takes four parameters:
-;; $1: vector width of the target
-;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names)
-;;     (add, sub...)
-;; $3: return type of the LLVM atomic (e.g. i32)
-;; $4: return type of the LLVM atomic type, in ispc naming paralance (e.g. int32)
-
-define(`global_atomic', `
-
-declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta)
-
-define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
-                                                 <$1 x i32> %mask) nounwind alwaysinline {
-  %rptr = alloca <$1 x $3>
-  %rptr32 = bitcast <$1 x $3> * %rptr to $3 *
-
-  per_lane($1, <$1 x i32> %mask, `
-   %v_LANE_ID = extractelement <$1 x $3> %val, i32 LANE
-   %r_LANE_ID = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %v_LANE_ID)
-   %rp_LANE_ID = getelementptr $3 * %rptr32, i32 LANE
-   store $3 %r_LANE_ID, $3 * %rp_LANE_ID')
-
-  %r = load <$1 x $3> * %rptr
-  ret <$1 x $3> %r
-}
-')
-

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; global_atomic_associative
@@ -681,18 +647,20 @@ define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,

 define(`global_atomic_associative', `

-declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta)
-
-;; note that the mask is expected to be of type $3, so the caller must ensure
-;; that for 64-bit types, the mask is cast to a signed int before being passed
-;; to this so that it is properly sign extended...  (The code in stdlib.ispc
-;; does do this..)
-
 define internal <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val,
-                                                 <$1 x $3> %mask) nounwind alwaysinline {
+                                                 <$1 x i32> %m) nounwind alwaysinline {
  ; first, for any lanes where the mask is off, compute a vector where those lanes
  ; hold the identity value..

+  ; for the bit tricks below, we need the mask to be sign extended to be
+  ; the size of the element type.
+  ifelse($3, `i64', `%mask = sext <$1 x i32> %m to <$1 x i64>')
+  ifelse($3, `i32', `
+     ; silly workaround to do %mask = %m, which is not possible directly..
+     %maskmem = alloca <$1 x i32>
+     store <$1 x i32> %m, <$1 x i32> * %maskmem
+     %mask = load <$1 x i32> * %maskmem'
+  )
  ; zero out any lanes that are off
  %valoff = and <$1 x $3> %val, %mask

@@ -751,7 +719,7 @@ define(`global_atomic_uniform', `

 declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta)

-define internal $3 @__atomic_$2_$4_global($3 * %ptr, $3 %val,
+define internal $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val,
                                          <$1 x i32> %mask) nounwind alwaysinline {
  %r = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %val)
  ret $3 %r
@@ -764,9 +732,10 @@ define internal $3 @__atomic_$2_$4_global($3 * %ptr, $3 %val,
 ;; $2: llvm type of the vector elements (e.g. i32)
 ;; $3: ispc type of the elements (e.g. int32)

-define(`global_swap', `
+declare i32 @llvm.atomic.swap.i32.p0i32(i32 * %ptr, i32 %val)
+declare i64 @llvm.atomic.swap.i64.p0i64(i64 * %ptr, i64 %val)

-declare $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val)
+define(`global_swap', `

 define internal <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val,
                                                   <$1 x i32> %mask) nounwind alwaysinline {
@@ -782,6 +751,12 @@ define internal <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val,
  %r = load <$1 x $2> * %rptr
  ret <$1 x $2> %r
 }
+;; Uniform variant: one call to the llvm.atomic.swap intrinsic on %ptr; %mask is not used in the body.
+define internal $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val,
+                                                    <$1 x i32> %mask) nounwind alwaysinline {
+ %r = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val)
+ ret $2 %r
+}
 ')


@@ -811,6 +786,12 @@ define internal <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $
  %r = load <$1 x $2> * %rptr
  ret <$1 x $2> %r
 }
+;; Uniform variant: one call to the llvm.atomic.cmp.swap intrinsic on %ptr; %mask is not used in the body.
+define internal $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
+                               $2 %val, <$1 x i32> %mask) nounwind alwaysinline {
+  %r = call $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp, $2 %val)
+  ret $2 %r
+}
 ')

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -851,10 +832,11 @@ define internal void @__prefetch_read_nt_$1($2 *) alwaysinline {

 define(`stdlib_core', `

-declare i8* @ISPCMalloc(i64, i32) nounwind
-declare i8* @ISPCFree(i8*) nounwind
-declare void @ISPCLaunch(i8*, i8*) nounwind
-declare void @ISPCSync() nounwind
+declare i32 @__fast_masked_vload()
+
+declare i8* @ISPCAlloc(i8**, i64, i32) nounwind
+declare void @ISPCLaunch(i8**, i8*, i8*, i32) nounwind
+declare void @ISPCSync(i8*) nounwind
 declare void @ISPCInstrument(i8*, i8*, i32, i32) nounwind

 declare i1 @__is_compile_time_constant_mask(<$1 x i32> %mask)
@@ -1226,6 +1208,11 @@ global_atomic_associative($1, sub, i32, int32, 0)
 global_atomic_associative($1, and, i32, int32, -1)
 global_atomic_associative($1, or, i32, int32, 0)
 global_atomic_associative($1, xor, i32, int32, 0)
+global_atomic_uniform($1, add, i32, int32)
+global_atomic_uniform($1, sub, i32, int32)
+global_atomic_uniform($1, and, i32, int32)
+global_atomic_uniform($1, or, i32, int32)
+global_atomic_uniform($1, xor, i32, int32)
 global_atomic_uniform($1, min, i32, int32)
 global_atomic_uniform($1, max, i32, int32)
 global_atomic_uniform($1, umin, i32, uint32)
@@ -1236,6 +1223,11 @@ global_atomic_associative($1, sub, i64, int64, 0)
 global_atomic_associative($1, and, i64, int64, -1)
 global_atomic_associative($1, or, i64, int64, 0)
 global_atomic_associative($1, xor, i64, int64, 0)
+global_atomic_uniform($1, add, i64, int64)
+global_atomic_uniform($1, sub, i64, int64)
+global_atomic_uniform($1, and, i64, int64)
+global_atomic_uniform($1, or, i64, int64)
+global_atomic_uniform($1, xor, i64, int64)
 global_atomic_uniform($1, min, i64, int64)
 global_atomic_uniform($1, max, i64, int64)
 global_atomic_uniform($1, umin, i64, uint64)
@@ -1262,6 +1254,24 @@ define internal <$1 x double> @__atomic_swap_double_global(double * %ptr, <$1 x
  ret <$1 x double> %ret
 }

+define internal float @__atomic_swap_uniform_float_global(float * %ptr, float %val,
+                                                   <$1 x i32> %mask) nounwind alwaysinline {
+  %iptr = bitcast float * %ptr to i32 *   ; the float swap is forwarded to the int32 version on the raw bits
+  %ival = bitcast float %val to i32
+  %iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival, <$1 x i32> %mask)
+  %ret = bitcast i32 %iret to float       ; reinterpret the returned bits as a float again
+  ret float %ret
+}
+;; Same bitcast wrapper for double, forwarding to the int64 version.
+define internal double @__atomic_swap_uniform_double_global(double * %ptr, double %val,
+                                                   <$1 x i32> %mask) nounwind alwaysinline {
+  %iptr = bitcast double * %ptr to i64 *
+  %ival = bitcast double %val to i64
+  %iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival, <$1 x i32> %mask)
+  %ret = bitcast i64 %iret to double
+  ret double %ret
+}
+
 global_atomic_exchange($1, i32, int32)
 global_atomic_exchange($1, i64, int64)

@@ -1286,6 +1296,29 @@ define internal <$1 x double> @__atomic_compare_exchange_double_global(double *
  %ret = bitcast <$1 x i64> %iret to <$1 x double>
  ret <$1 x double> %ret
 }
+;; Float wrapper: bitcasts the pointer and both operands to i32 and forwards to the int32 version.
+define internal float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp, float %val,
+                                                   <$1 x i32> %mask) nounwind alwaysinline {
+  %iptr = bitcast float * %ptr to i32 *
+  %icmp = bitcast float %cmp to i32
+  %ival = bitcast float %val to i32
+  %iret = call i32 @__atomic_compare_exchange_uniform_int32_global(i32 * %iptr, i32 %icmp,
+                                                                   i32 %ival, <$1 x i32> %mask)
+  %ret = bitcast i32 %iret to float   ; reinterpret the returned bits as a float again
+  ret float %ret
+}
+;; Double wrapper: same pattern with i64, forwarding to the int64 version.
+define internal double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp,
+                                            double %val, <$1 x i32> %mask) nounwind alwaysinline {
+  %iptr = bitcast double * %ptr to i64 *
+  %icmp = bitcast double %cmp to i64
+  %ival = bitcast double %val to i64
+  %iret = call i64 @__atomic_compare_exchange_uniform_int64_global(i64 * %iptr, i64 %icmp,
+                                                                   i64 %ival, <$1 x i32> %mask)
+  %ret = bitcast i64 %iret to double
+  ret double %ret
+}
+
 ')


@@ -1344,12 +1377,6 @@ i64minmax($1,max,uint64,ugt)

 define(`load_and_broadcast', `
 define <$1 x $2> @__load_and_broadcast_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
-  ; must not load if the mask is all off; the address may be invalid
-  %mm = call i32 @__movmsk(<$1 x i32> %mask)
-  %any_on = icmp ne i32 %mm, 0
-  br i1 %any_on, label %load, label %skip
-
-load:
  %ptr = bitcast i8 * %0 to $2 *
  %val = load $2 * %ptr

@@ -1357,9 +1384,6 @@ load:
  forloop(i, 1, eval($1-1), `
  %ret`'i = insertelement <$1 x $2> %ret`'eval(i-1), $2 %val, i32 i')
  ret <$1 x $2> %ret`'eval($1-1)
-
-skip:
-  ret <$1 x $2> undef
 }
 ')

@@ -1375,14 +1399,20 @@ define(`load_masked', `
 define <$1 x $2> @__load_masked_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
 entry:
  %mm = call i32 @__movmsk(<$1 x i32> %mask)
+  
  ; if the first lane and the last lane are on, then it is safe to do a vector load
  ; of the whole thing--what the lanes in the middle want turns out to not matter...
  %mm_and = and i32 %mm, eval(1 | (1<<($1-1)))
  %can_vload = icmp eq i32 %mm_and, eval(1 | (1<<($1-1)))
+
+  %fast32 = call i32 @__fast_masked_vload()
+  %fast_i1 = trunc i32 %fast32 to i1
+  %can_vload_maybe_fast = or i1 %fast_i1, %can_vload
+
  ; if we are not able to do a singe vload, we will accumulate lanes in this memory..
  %retptr = alloca <$1 x $2>
  %retptr32 = bitcast <$1 x $2> * %retptr to $2 *
-  br i1 %can_vload, label %load, label %loop
+  br i1 %can_vload_maybe_fast, label %load, label %loop

 load: 
  %ptr = bitcast i8 * %0 to <$1 x $2> *
@@ -1517,6 +1547,46 @@ define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>,
 ')


+define(`masked_store_blend_8_16_by_16', `
+define void @__masked_store_blend_8(<16 x i8>* nocapture, <16 x i8>,
+                                    <16 x i32>) nounwind alwaysinline {
+  %old = load <16 x i8> * %0                 ; current memory contents; the blend is load, merge, store
+  %old128 = bitcast <16 x i8> %old to i128   ; treat the 16 bytes as one 128-bit integer
+  %new128 = bitcast <16 x i8> %1 to i128
+
+  %mask8 = trunc <16 x i32> %2 to <16 x i8>  ; keep the low 8 bits of each mask lane
+  %mask128 = bitcast <16 x i8> %mask8 to i128
+  %notmask128 = xor i128 %mask128, -1        ; bitwise complement of the mask
+
+  %newmasked = and i128 %new128, %mask128
+  %oldmasked = and i128 %old128, %notmask128
+  %result = or i128 %newmasked, %oldmasked   ; new bits where the mask is set, old bits elsewhere
+
+  %resultvec = bitcast i128 %result to <16 x i8>
+  store <16 x i8> %resultvec, <16 x i8> * %0
+  ret void
+}
+;; 16-bit counterpart: the same blend, carried out on a 256-bit integer.
+define void @__masked_store_blend_16(<16 x i16>* nocapture, <16 x i16>,
+                                     <16 x i32>) nounwind alwaysinline {
+  %old = load <16 x i16> * %0
+  %old256 = bitcast <16 x i16> %old to i256
+  %new256 = bitcast <16 x i16> %1 to i256
+
+  %mask16 = trunc <16 x i32> %2 to <16 x i16>   ; keep the low 16 bits of each mask lane
+  %mask256 = bitcast <16 x i16> %mask16 to i256
+  %notmask256 = xor i256 %mask256, -1           ; bitwise complement of the mask
+
+  %newmasked = and i256 %new256, %mask256
+  %oldmasked = and i256 %old256, %notmask256
+  %result = or i256 %newmasked, %oldmasked      ; new bits where the mask is set, old bits elsewhere
+
+  %resultvec = bitcast i256 %result to <16 x i16>
+  store <16 x i16> %resultvec, <16 x i16> * %0
+  ret void
+}
+')
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; packed load and store functions
 ;;
@@ -1544,7 +1614,7 @@ entry:

 known_mask:
  %allon = icmp eq i32 %mask, eval((1 << $1) -1)
-  br i1 %allon, label %all_on, label %not_all_on
+  br i1 %allon, label %all_on, label %unknown_mask

 all_on:
  ;; everyone wants to load, so just load an entire vector width in a single
@@ -1554,14 +1624,6 @@ all_on:
  store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr, align 4
  ret i32 $1

-not_all_on:
-  %alloff = icmp eq i32 %mask, 0
-  br i1 %alloff, label %all_off, label %unknown_mask
-
-all_off:
-  ;; no one wants to load
-  ret i32 0
-
 unknown_mask:
  br label %loop

@@ -1608,20 +1670,13 @@ entry:

 known_mask:
  %allon = icmp eq i32 %mask, eval((1 << $1) -1)
-  br i1 %allon, label %all_on, label %not_all_on
+  br i1 %allon, label %all_on, label %unknown_mask

 all_on:
  %vecptr = bitcast i32 *%startptr to <$1 x i32> *
  store <$1 x i32> %vals, <$1 x i32> * %vecptr, align 4
  ret i32 $1

-not_all_on:
-  %alloff = icmp eq i32 %mask, 0
-  br i1 %alloff, label %all_off, label %unknown_mask
-
-all_off:
-  ret i32 0
-
 unknown_mask:
  br label %loop

@@ -1671,14 +1726,6 @@ entry:
   br i1 %allon, label %check_neighbors, label %domixed

 domixed:
-  ; the mask is mixed on/off.  First see if the lanes are all off
-  %alloff = icmp eq i32 %mm, 0
-  br i1 %alloff, label %doalloff, label %actuallymixed
-
-doalloff:
-  ret i1 false  ;; this seems safest
-
-actuallymixed: 
  ; First, figure out which lane is the first active one
  %first = call i32 @llvm.cttz.i32(i32 %mm)
  %baseval = extractelement <$1 x $2> %v, i32 %first
@@ -1701,7 +1748,7 @@ actuallymixed:
  br label %check_neighbors

 check_neighbors:
-  %vec = phi <$1 x $2> [ %blendvec, %actuallymixed ], [ %v, %entry ]
+  %vec = phi <$1 x $2> [ %blendvec, %domixed ], [ %v, %entry ]
  ifelse($6, `32', `
  ; For 32-bit elements, we rotate once and compare with the vector, which ends 
  ; up comparing each element to its neighbor on the right.  Then see if
@@ -1833,7 +1880,7 @@ pl_known_mask:
  ;; the mask is known at compile time; see if it is something we can
  ;; handle more efficiently
  %pl_is_allon = icmp eq i32 %pl_mask, eval((1<<$1)-1)
-  br i1 %pl_is_allon, label %pl_all_on, label %pl_not_all_on
+  br i1 %pl_is_allon, label %pl_all_on, label %pl_unknown_mask

 pl_all_on:
  ;; the mask is all on--just expand the code for each lane sequentially
@@ -1841,19 +1888,14 @@ pl_all_on:
          `patsubst(`$3', `ID\|LANE', i)')
  br label %pl_done

-pl_not_all_on:
-  ;; not all on--see if it is all off or mixed
-  ;; for the mixed case, we just run the general case, though we could
+pl_unknown_mask:
+  ;; we just run the general case, though we could
  ;; try to be smart and just emit the code based on what it actually is,
  ;; for example by emitting the code straight-line without a loop and doing 
  ;; the lane tests explicitly, leaving later optimization passes to eliminate
  ;; the stuff that is definitely not needed.  Not clear if we will frequently 
  ;; encounter a mask that is known at compile-time but is not either all on or
  ;; all off...
-  %pl_alloff = icmp eq i32 %pl_mask, 0
-  br i1 %pl_alloff, label %pl_done, label %pl_unknown_mask
-
-pl_unknown_mask:
  br label %pl_loop

 pl_loop:
@@ -1909,20 +1951,6 @@ define internal <$1 x $2> @__gather_elt_$2(i8 * %ptr, <$1 x i32> %offsets, <$1 x

 define <$1 x $2> @__gather_base_offsets_$2(i8 * %ptr, <$1 x i32> %offsets,
                                           <$1 x i32> %vecmask) nounwind readonly alwaysinline {
-entry:
-  %mask = call i32 @__movmsk(<$1 x i32> %vecmask)
-
-  %maskKnown = call i1 @__is_compile_time_constant_mask(<$1 x i32> %vecmask)
-  br i1 %maskKnown, label %known_mask, label %unknown_mask
-
-known_mask:
-  %alloff = icmp eq i32 %mask, 0
-  br i1 %alloff, label %gather_all_off, label %unknown_mask
-
-gather_all_off:
-  ret <$1 x $2> undef
-
-unknown_mask:
  ; We can be clever and avoid the per-lane stuff for gathers if we are willing
  ; to require that the 0th element of the array being gathered from is always
  ; legal to read from (and we do indeed require that, given the benefits!) 
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -37,6 +37,7 @@

 #include "ctx.h"
 #include "util.h"
+#include "func.h"
 #include "llvmutil.h"
 #include "type.h"
 #include "stmt.h"
@@ -123,19 +124,20 @@ CFInfo::GetLoop(bool isUniform, llvm::BasicBlock *breakTarget,

 ///////////////////////////////////////////////////////////////////////////

-FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *function,
-                                         Symbol *funSym, SourcePos firstStmtPos) {
+FunctionEmitContext::FunctionEmitContext(Function *function, Symbol *funSym,
+                                         llvm::Function *llvmFunction,
+                                         SourcePos firstStmtPos) {
+    const Type *rt = function->GetReturnType();
+
    /* Create a new basic block to store all of the allocas */
-    allocaBlock = llvm::BasicBlock::Create(*g->ctx, "allocas", function, 0);
-    bblock = llvm::BasicBlock::Create(*g->ctx, "entry", function, 0);
+    allocaBlock = llvm::BasicBlock::Create(*g->ctx, "allocas", llvmFunction, 0);
+    bblock = llvm::BasicBlock::Create(*g->ctx, "entry", llvmFunction, 0);
    /* But jump from it immediately into the real entry block */
    llvm::BranchInst::Create(bblock, allocaBlock);

-    maskPtr = AllocaInst(LLVMTypes::MaskType, "mask_memory");
-    StoreInst(LLVMMaskAllOn, maskPtr);
-
    funcStartPos = funSym->pos;
    returnType = rt;
+    maskPtr = NULL;
    entryMask = NULL;
    loopMask = NULL;
    breakLanesPtr = continueLanesPtr = NULL;
@@ -144,6 +146,11 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
    returnedLanesPtr = AllocaInst(LLVMTypes::MaskType, "returned_lanes_memory");
    StoreInst(LLVMMaskAllOff, returnedLanesPtr);

+    launchedTasks = false;
+    launchGroupHandlePtr = AllocaInst(LLVMTypes::VoidPointerType, "launch_group_handle");
+    StoreInst(llvm::Constant::getNullValue(LLVMTypes::VoidPointerType), 
+              launchGroupHandlePtr);
+
    if (!returnType || returnType == AtomicType::Void)
        returnValuePtr = NULL;
    else {
@@ -153,7 +160,6 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
        StoreInst(llvm::Constant::getNullValue(ftype), returnValuePtr);
    }

-#ifndef LLVM_2_8
    if (m->diBuilder) {
        /* If debugging is enabled, tell the debug information emission
           code about this new function */
@@ -161,33 +167,18 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
        llvm::DIType retType = rt->GetDIType(diFile);
        int flags = llvm::DIDescriptor::FlagPrototyped; // ??
        diFunction = m->diBuilder->createFunction(diFile, /* scope */
-                                                  function->getName(), // mangled
+                                                  llvmFunction->getName(), // mangled
                                                  funSym->name,
                                                  diFile,
                                                  funcStartPos.first_line,
                                                  retType,
-                                                  funSym->isStatic,
+                                                  funSym->storageClass == SC_STATIC,
                                                  true, /* is definition */
                                                  flags,
                                                  g->opt.level > 0,
-                                                  function);
+                                                  llvmFunction);
        /* And start a scope representing the initial function scope */
        StartScope();
-    }
-#endif // LLVM_2_8
-
-    launchedTasks = false;
-
-    // connect the funciton's mask memory to the __mask symbol
-    Symbol *maskSymbol = m->symbolTable->LookupVariable("__mask");
-    assert(maskSymbol != NULL);
-    maskSymbol->storagePtr = maskPtr;
-
-#ifndef LLVM_2_8
-    // add debugging info for __mask, programIndex, ...
-    if (m->diBuilder) {
-        maskSymbol->pos = funcStartPos;
-        EmitVariableDebugInfo(maskSymbol);

        llvm::DIFile file = funcStartPos.GetDIFile();
        Symbol *programIndexSymbol = m->symbolTable->LookupVariable("programIndex");
@@ -208,15 +199,12 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
                                           true /* static */,
                                           programCountSymbol->storagePtr);
    }
-#endif
 }


 FunctionEmitContext::~FunctionEmitContext() {
    assert(controlFlowInfo.size() == 0);
-#ifndef LLVM_2_8
    assert(debugScopes.size() == (m->diBuilder ? 1 : 0));
-#endif
 }


@@ -238,6 +226,12 @@ FunctionEmitContext::GetMask() {
 }


+void  // Installs p as this context's mask pointer; the constructor only initializes maskPtr to NULL.
+FunctionEmitContext::SetMaskPointer(llvm::Value *p) {
+    maskPtr = p;  // stored as-is; p is not checked here
+}
+
+
 void
 FunctionEmitContext::SetEntryMask(llvm::Value *value) {
    entryMask = value;
@@ -704,6 +698,7 @@ FunctionEmitContext::LaneMask(llvm::Value *v) {

 llvm::Value *
 FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
+#if 0
    // Compare the two masks to get a vector of i1s
    llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
                               v1, v2, "v1==v2");
@@ -711,6 +706,12 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
    cmp = I1VecToBoolVec(cmp);
    // And see if it's all on
    return All(cmp);
+#else
+    llvm::Value *mm1 = LaneMask(v1);
+    llvm::Value *mm2 = LaneMask(v2);
+    return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mm1, mm2,
+                   "v1==v2");
+#endif
 }


@@ -758,7 +759,7 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) {


 llvm::Value *
-FunctionEmitContext::EmitMalloc(LLVM_TYPE_CONST llvm::Type *ty, int align) {
+FunctionEmitContext::SizeOf(LLVM_TYPE_CONST llvm::Type *ty) {
    // Emit code to compute the size of the given type using a GEP with a
    // NULL base pointer, indexing one element of the given type, and
    // casting the resulting 'pointer' to an int giving its size.
@@ -775,24 +776,7 @@ FunctionEmitContext::EmitMalloc(LLVM_TYPE_CONST llvm::Type *ty, int align) {
 #endif
    AddDebugPos(poffset);
    llvm::Value *sizeOf = PtrToIntInst(poffset, LLVMTypes::Int64Type, "offset_int");
-
-    // And given the size, call the malloc function
-    llvm::Function *fmalloc = m->module->getFunction("ISPCMalloc");
-    assert(fmalloc != NULL);
-    llvm::Value *mem = CallInst(fmalloc, sizeOf, LLVMInt32(align), 
-                                "raw_argmem");
-    // Cast the void * back to the result pointer type
-    return BitCastInst(mem, ptrType, "mem_bitcast");
-}
-
-
-void
-FunctionEmitContext::EmitFree(llvm::Value *ptr) {
-    llvm::Value *freeArg = BitCastInst(ptr, LLVMTypes::VoidPointerType,
-                                       "argmemfree");
-    llvm::Function *ffree = m->module->getFunction("ISPCFree");
-    assert(ffree != NULL);
-    CallInst(ffree, freeArg);
+    return sizeOf;
 }


@@ -850,7 +834,6 @@ FunctionEmitContext::GetDebugPos() const {
 void
 FunctionEmitContext::AddDebugPos(llvm::Value *value, const SourcePos *pos, 
                                 llvm::DIScope *scope) {
-#ifndef LLVM_2_8
    llvm::Instruction *inst = llvm::dyn_cast<llvm::Instruction>(value);
    if (inst != NULL && m->diBuilder) {
        SourcePos p = pos ? *pos : currentPos;
@@ -861,13 +844,11 @@ FunctionEmitContext::AddDebugPos(llvm::Value *value, const SourcePos *pos,
            inst->setDebugLoc(llvm::DebugLoc::get(p.first_line, p.first_column, 
                                                  scope ? *scope : GetDIScope()));
    }
-#endif
 }


 void
 FunctionEmitContext::StartScope() {
-#ifndef LLVM_2_8
    if (m->diBuilder != NULL) {
        llvm::DIScope parentScope;
        if (debugScopes.size() > 0)
@@ -881,18 +862,15 @@ FunctionEmitContext::StartScope() {
                                             currentPos.first_column);
        debugScopes.push_back(lexicalBlock);
    }
-#endif
 }


 void
 FunctionEmitContext::EndScope() {
-#ifndef LLVM_2_8
    if (m->diBuilder != NULL) {
        assert(debugScopes.size() > 0);
        debugScopes.pop_back();
    }
-#endif
 }


@@ -905,7 +883,6 @@ FunctionEmitContext::GetDIScope() const {

 void
 FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) {
-#ifndef LLVM_2_8
    if (m->diBuilder == NULL)
        return;

@@ -921,13 +898,11 @@ FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) {
    llvm::Instruction *declareInst = 
        m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
    AddDebugPos(declareInst, &sym->pos, &scope);
-#endif
 }


 void
 FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) {
-#ifndef LLVM_2_8
    if (m->diBuilder == NULL)
        return;

@@ -943,7 +918,6 @@ FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) {
    llvm::Instruction *declareInst = 
        m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
    AddDebugPos(declareInst, &sym->pos, &scope);
-#endif
 }


@@ -1501,27 +1475,15 @@ FunctionEmitContext::gather(llvm::Value *lvalue, const Type *type,
 void
 FunctionEmitContext::addGSMetadata(llvm::Instruction *inst, SourcePos pos) {
    llvm::Value *str = llvm::MDString::get(*g->ctx, pos.name);
-#ifdef LLVM_2_8
-    llvm::MDNode *md = llvm::MDNode::get(*g->ctx, &str, 1);
-#else
    llvm::MDNode *md = llvm::MDNode::get(*g->ctx, str);
-#endif
    inst->setMetadata("filename", md);

    llvm::Value *line = LLVMInt32(pos.first_line);
-#ifdef LLVM_2_8
-    md = llvm::MDNode::get(*g->ctx, &line, 1);
-#else
    md = llvm::MDNode::get(*g->ctx, line);
-#endif
    inst->setMetadata("line", md);

    llvm::Value *column = LLVMInt32(pos.first_column);
-#ifdef LLVM_2_8
-    md = llvm::MDNode::get(*g->ctx, &column, 1);
-#else
    md = llvm::MDNode::get(*g->ctx, column);
-#endif
    inst->setMetadata("column", md);
 }

@@ -1838,9 +1800,9 @@ llvm::PHINode *
 FunctionEmitContext::PhiNode(LLVM_TYPE_CONST llvm::Type *type, int count, 
                             const char *name) {
    llvm::PHINode *pn = llvm::PHINode::Create(type, 
-#if !defined(LLVM_2_8) && !defined(LLVM_2_9)
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
                                              count, 
-#endif // !LLVM_2_8 && !LLVM_2_9
+#endif // LLVM_3_0
                                              name ? name : "phi", bblock);
    AddDebugPos(pn);
    return pn;
@@ -1933,15 +1895,9 @@ FunctionEmitContext::CallInst(llvm::Function *func, llvm::Value *arg0,

 llvm::Instruction *
 FunctionEmitContext::ReturnInst() {
-    if (launchedTasks) {
-        // Automatically add a sync call at the end of any function that
-        // launched tasks
-        SourcePos noPos;
-        noPos.name = "__auto_sync";
-        ExprStmt *es = new ExprStmt(new SyncExpr(noPos), noPos);
-        es->EmitCode(this); 
-        delete es;
-    }
+    if (launchedTasks)
+        // Add a sync call at the end of any function that launched tasks
+        SyncInst();

    llvm::Instruction *rinst = NULL;
    if (returnValuePtr != NULL) {
@@ -1964,7 +1920,8 @@ FunctionEmitContext::ReturnInst() {

 llvm::Instruction *
 FunctionEmitContext::LaunchInst(llvm::Function *callee, 
-                                std::vector<llvm::Value *> &argVals) {
+                                std::vector<llvm::Value *> &argVals,
+                                llvm::Value *launchCount) {
    if (callee == NULL) {
        assert(m->errorCount > 0);
        return NULL;
@@ -1981,20 +1938,15 @@ FunctionEmitContext::LaunchInst(llvm::Function *callee,
        static_cast<LLVM_TYPE_CONST llvm::StructType *>(pt->getElementType());
    assert(argStructType->getNumElements() == argVals.size() + 1);

+    llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
+    assert(falloc != NULL);
    int align = 4 * RoundUpPow2(g->target.nativeVectorWidth);
-#ifdef ISPC_IS_WINDOWS
-    // Use malloc() to allocate storage on Windows, since the stack is
-    // generally not big enough there to do enough allocations for lots of
-    // tasks and then things crash horribly...
-    llvm::Value *argmem = EmitMalloc(argStructType, align);
-#else
-    // Use alloca for space for the task args on OSX And Linux.  KEY
-    // DETAIL: pass false to the call of FunctionEmitContext::AllocaInst so
-    // that the alloca doesn't happen just once at the top of the function,
-    // but happens each time the enclosing basic block executes.
-    llvm::Value *argmem = AllocaInst(argStructType, "argmem", align, false);
-#endif // ISPC_IS_WINDOWS
-    llvm::Value *voidmem = BitCastInst(argmem, LLVMTypes::VoidPointerType);
+    std::vector<llvm::Value *> allocArgs;
+    allocArgs.push_back(launchGroupHandlePtr);
+    allocArgs.push_back(SizeOf(argStructType));
+    allocArgs.push_back(LLVMInt32(align));
+    llvm::Value *voidmem = CallInst(falloc, allocArgs, "args_ptr");
+    llvm::Value *argmem = BitCastInst(voidmem, pt);

    // Copy the values of the parameters into the appropriate place in
    // the argument block
@@ -2016,5 +1968,32 @@ FunctionEmitContext::LaunchInst(llvm::Function *callee,
    llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
    llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
    assert(flaunch != NULL);
-    return CallInst(flaunch, fptr, voidmem, "");
+    std::vector<llvm::Value *> args;
+    args.push_back(launchGroupHandlePtr);
+    args.push_back(fptr);
+    args.push_back(voidmem);
+    args.push_back(launchCount);
+    return CallInst(flaunch, args, "");
+}
+
+
+void
+FunctionEmitContext::SyncInst() {
+    llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr, NULL);
+    llvm::Value *nullPtrValue = llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);
+    llvm::Value *nonNull = CmpInst(llvm::Instruction::ICmp,
+                                   llvm::CmpInst::ICMP_NE,
+                                   launchGroupHandle, nullPtrValue);
+    llvm::BasicBlock *bSync = CreateBasicBlock("call_sync");
+    llvm::BasicBlock *bPostSync = CreateBasicBlock("post_sync");
+    BranchInst(bSync, bPostSync, nonNull);
+
+    SetCurrentBasicBlock(bSync);
+    llvm::Function *fsync = m->module->getFunction("ISPCSync");
+    if (fsync == NULL)
+        FATAL("Couldn't find ISPCSync declaration?!");
+    CallInst(fsync, launchGroupHandle, "");
+    BranchInst(bPostSync);
+
+    SetCurrentBasicBlock(bPostSync);
 }
--- a/ctx.h
+++ b/ctx.h
@@ -59,14 +59,15 @@ struct CFInfo;
 class FunctionEmitContext {
 public:
    /** Create a new FunctionEmitContext.
-        @param returnType   The return type of the function
-        @param function     LLVM function in the current module that corresponds
+        @param function     The Function object representing the function
+        @param funSym       Symbol that corresponds to the function
+        @param llvmFunction LLVM function in the current module that corresponds
                            to the function
-        @param funSym       Symbol that corresponds to the function
        @param firstStmtPos Source file position of the first statement in the
                            function
     */
-    FunctionEmitContext(const Type *returnType, llvm::Function *function, Symbol *funSym,
+    FunctionEmitContext(Function *function, Symbol *funSym, 
+                        llvm::Function *llvmFunction,
                        SourcePos firstStmtPos);
    ~FunctionEmitContext();

@@ -86,6 +87,8 @@ public:
    /** Returns the current mask value */ 
    llvm::Value *GetMask();

+    void SetMaskPointer(llvm::Value *p);
+
    /** Provides the value of the mask at function entry */
    void SetEntryMask(llvm::Value *val);

@@ -210,15 +213,8 @@ public:
        i32. */
    llvm::Value *I1VecToBoolVec(llvm::Value *b);

-    /** Emit code to call the user-supplied ISPCMalloc function to
-        allocate space for an object of thee given type.  Returns the
-        pointer value returned by the ISPCMalloc call. */
-    llvm::Value *EmitMalloc(LLVM_TYPE_CONST llvm::Type *ty, int align = 0);
-
-    /** Emit code to call the user-supplied ISPCFree function, passing it
-        the given pointer to storage previously allocated by an
-        EmitMalloc() call. */
-    void EmitFree(llvm::Value *ptr);
+    /** Returns the size of the given type. */
+    llvm::Value *SizeOf(LLVM_TYPE_CONST llvm::Type *ty);

    /** If the user has asked to compile the program with instrumentation,
        this inserts a callback to the user-supplied instrumentation
@@ -399,7 +395,10 @@ public:
    /** Launch an asynchronous task to run the given function, passing it
        he given argument values. */
    llvm::Instruction *LaunchInst(llvm::Function *callee, 
-                                  std::vector<llvm::Value *> &argVals);
+                                  std::vector<llvm::Value *> &argVals,
+                                  llvm::Value *launchCount);
+
+    void SyncInst();

    llvm::Instruction *ReturnInst();
    /** @} */
@@ -489,6 +488,11 @@ private:
    /** True if a 'launch' statement has been encountered in the function. */
    bool launchedTasks;

+    /** This is a pointer to a void * that is passed to the ISPCLaunch(),
+        ISPCAlloc(), and ISPCSync() routines as a handle to the group of
+        tasks launched from the current function. */
+    llvm::Value *launchGroupHandlePtr;
+
    llvm::Value *pointerVectorToVoidPointers(llvm::Value *value);
    static void addGSMetadata(llvm::Instruction *inst, SourcePos pos);
    bool ifsInLoopAllUniform() const;
--- a/decl.cpp
+++ b/decl.cpp
@@ -101,9 +101,7 @@ Declarator::AddArrayDimension(int size) {
 void
 Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
    sym->type = GetType(ds);
-
-    if (ds->storageClass == SC_STATIC)
-        sym->isStatic = true;
+    sym->storageClass = ds->storageClass;
 }


@@ -237,7 +235,7 @@ Declarator::GetType(DeclSpecs *ds) const {
                    sprintf(buf, "__anon_parameter_%d", i);
                    sym = new Symbol(buf, pos);
                    Declarator *declarator = new Declarator(sym, sym->pos);
-                    sym->type = declarator->GetType(ds);
+                    sym->type = declarator->GetType(d->declSpecs);
                    d->declarators.push_back(declarator);
                }
                else {
--- a/docs/ReleaseNotes.txt
+++ b/docs/ReleaseNotes.txt
@@ -1,3 +1,81 @@
+=== v1.0.11 === (6 October 2011)
+
+The main new feature in this release is support for generating code for
+multiple targets (e.g., SSE2, SSE4, and AVX) and having the compiled code
+select the best variant at execution time.  For more information, see
+http://ispc.github.com/ispc.html#compiling-with-support-for-multiple-instruction-sets.
+
+All of the examples now take advantage of the support for multiple
+compilation targets; thus, if one has an AVX system, it's not necessary to
+recompile the examples to use the AVX target.
+
+Performance of the built-in task system that is used in the examples has
+been improved.
+
+Finally, the print() statement now works on OSX; it had been broken for the
+last few releases.
+
+=== v1.0.10 === (30 September 2011)
+
+This release features an extensive new example showing the application of
+ispc to a deferred shading algorithm for scenes with thousands of lights
+(examples/deferred).  This is an implementation of the algorithm that Johan
+Andersson described at SIGGRAPH 2009 and was implemented by Andrew
+Lauritzen and Jefferson Montgomery.  The basic idea is that a pre-rendered
+G-buffer is partitioned into tiles, and in each tile, the set of lights
+that contribute to the tile is computed.  Then, the pixels in the tile are
+shaded using those light sources. (See slides 19-29 of
+http://s09.idav.ucdavis.edu/talks/04-JAndersson-ParallelFrostbite-Siggraph09.pdf
+for more details on the algorithm.)
+
+The mechanism for launching tasks from ispc code has been generalized to
+allow multiple tasks to be launched with a single launch call (see
+http://ispc.github.com/ispc.html#task-parallelism-language-syntax for more
+information.)
+
+A few new functions have been added to the standard library: num_cores()
+returns the number of cores in the system's CPU, and variants of all of the
+atomic operators that take 'uniform' values as parameters have been added.
+
+=== v1.0.9 === (26 September 2011)
+
+The binary release of v1.0.9 is the first that supports AVX code
+generation.  Two targets are provided: "avx", which runs with a
+programCount of 8, and "avx-x2" which runs 16 program instances
+simultaneously.  (This binary is also built using the in-progress LLVM 3.0
+development libraries, while previous ones have been built with the
+released 2.9 version of LLVM.)
+
+This release has no other significant changes beyond a number of small
+bugfixes (https://github.com/ispc/ispc/issues/100,
+https://github.com/ispc/ispc/issues/101, https://github.com/ispc/ispc/issues/103.)
+ 
+=== v1.0.8 === (19 September 2011)
+
+A number of improvements have been made to handling of 'if' statements in
+the language:
+  - A bug was fixed where invalid memory could be incorrectly accessed even
+    if none of the running program instances wanted to execute the
+    corresponding instructions (https://github.com/ispc/ispc/issues/74).
+  - The code generated for 'if' statements is a bit simpler and thus more
+    efficient.
+
+There is now a '--pic' command-line argument that causes position-independent
+code to be generated (Linux and OSX only).
+
+A number of additional performance improvements:
+  - Loops are now unrolled by default; the --opt=disable-loop-unroll
+    command-line argument can be used to disable this behavior.
+    (https://github.com/ispc/ispc/issues/78)
+  - A few more cases where gathers/scatters could be determined at compile
+    time to actually access contiguous locations have been added.
+    (https://github.com/ispc/ispc/issues/79)
+
+Finally, warnings are now issued (if possible) when it can be determined
+at compile-time that an out-of-bounds array index is being used.
+(https://github.com/ispc/ispc/issues/98).
+
+
 === v1.0.7 === (3 September 2011)

 The various atomic_*_global() standard library functions are generally
--- a/docs/ispc.txt
+++ b/docs/ispc.txt
@@ -33,6 +33,17 @@ The main goals behind ``ispc`` are to:
 number of non-trivial workloads that aren't handled well by other
 compilation approaches (e.g. loop auto-vectorization.)

+**We are very interested in your feedback and comments about ispc and
+in hearing your experiences using the system.  We are especially interested
+in hearing if you try using ispc but see results that are not as you
+were expecting or hoping for.** We encourage you to send a note with your
+experiences or comments to the `ispc-users`_ mailing list or to file bug or
+feature requests with the ``ispc`` `bug tracker`_. (Thanks!)
+
+.. _ispc-users: http://groups.google.com/group/ispc-users
+.. _bug tracker: https://github.com/ispc/ispc/issues?state=open
+
+
 Contents:

 * `Recent Changes to ISPC`_
@@ -44,7 +55,8 @@ Contents:

 * `Using The ISPC Compiler`_

-  + `Command-line Options`_
+  + `Basic Command-line Options`_
+  + `Selecting The Compilation Target`_

 * `The ISPC Language`_

@@ -69,7 +81,8 @@ Contents:
  + `Program Instance Convergence`_
  + `Data Races`_
  + `Uniform Variables and Varying Control Flow`_
-  + `Task Parallelism in ISPC`_
+  + `Task Parallelism: Language Syntax`_
+  + `Task Parallelism: Runtime Requirements`_

 * `The ISPC Standard Library`_

@@ -80,6 +93,7 @@ Contents:
  + `Conversions To and From Half-Precision Floats`_
  + `Atomic Operations and Memory Fences`_
  + `Prefetches`_
+  + `System Information`_
  + `Low-Level Bits`_

 * `Interoperability with the Application`_
@@ -102,6 +116,10 @@ Contents:
  + `Small Performance Tricks`_
  + `Instrumenting Your ISPC Programs`_
  + `Using Scan Operations For Variable Output`_
+  + `Application-Supplied Execution Masks`_
+  + `Explicit Vector Programming With Uniform Short Vector Types`_
+  + `Choosing A Target Vector Width`_
+  + `Compiling With Support For Multiple Instruction Sets`_

 * `Disclaimer and Legal Information`_

@@ -273,8 +291,8 @@ with application code, enter the following command
 compiling it.  (This functionality can be disabled with the ``--nocpp``
 command-line argument.)

-Command-line Options
--------------------
+Basic Command-line Options
+--------------------------

 The ``ispc`` executable can be run with ``--help`` to print a list of
 accepted command-line arguments.  By default, the compiler compiles the
@@ -282,56 +300,83 @@ provided program (and issues warnings and errors), but doesn't
 generate any output.  

 If the ``-o`` flag is given, it will generate an output file (a native
-object file by default).  To generate a text assembly file, pass
-``--emit-asm``:
+object file by default).  

 ::

-   ispc foo.ispc -o foo.s --emit-asm
+   ispc foo.ispc -o foo.obj
+
+To generate a text assembly file, pass ``--emit-asm``:
+
+::
+
+   ispc foo.ispc -o foo.asm --emit-asm

 To generate LLVM bitcode, use the ``--emit-llvm`` flag.

-By default, an optimized x86-64 object file tuned for Intel® Core
-processors CPUs is built.  You can use the ``--arch`` command line flag to
-specify a 32-bit x86 target:
-
-::
-
-   ispc foo.ispc -o foo.obj --arch=x86
-
-Optimizations can be turned off with ``-O0``:
+Optimizations are on by default; they can be turned off with ``-O0``:

 ::

   ispc foo.ispc -o foo.obj -O0

-On Mac\* and Linux\*, there is early support for generating debugging
-symbols; this is enabled with the ``-g`` command-line flag.
+On Mac\* and Linux\*, there is basic support for generating debugging
+symbols; this is enabled with the ``-g`` command-line flag.  Using ``-g``
+causes optimizations to be disabled; to compile with debugging symbols and
+optimization, ``-O1`` should be provided as well as the ``-g`` flag.

 The ``-h`` flag can also be used to direct ``ispc`` to generate a C/C++
 header file that includes C/C++ declarations of the C-callable ``ispc``
 functions and the types passed to it.

-On Linux\* and Mac OS\*, ``-D`` can be used to specify definitions to be
-passed along to the C pre-prcessor, which runs over the program input
-before it's compiled.  On Windows®, pre-processor definitions should be
-provided to the ``cl`` call.
-
-By default, the compiler generates x86-64 Intel® SSE4 code.  To generate
-32-bit code, you can use the ``--arch=x86`` command-line flag.  To
-select Intel® SSE2, use ``--target=sse2``.
-
-``ispc`` supports an alternative method for generating Intel® SSE4 code,
-where the program is "doubled up" and eight instances of it run in
-parallel, rather than just four.  For workloads that don't require large
-numbers of registers, this method can lead to significantly more efficient
-execution thanks to greater instruction level parallelism.  This option is
-selected with ``--target=sse4x2``.
+The ``-D`` option can be used to specify definitions to be passed along to
+the pre-processor, which runs over the program input before it's compiled.
+For example, including ``-DTEST=1`` defines the pre-processor symbol
+``TEST`` to have the value ``1`` when the program is compiled.

 The compiler issues a number of performance warnings for code constructs
 that compile to relatively inefficient code.  These warnings can be
 silenced with the ``--wno-perf`` flag (or by using ``--woff``, which turns
-off all warnings.)
+off all compiler warnings.)
+
+Selecting The Compilation Target
+--------------------------------
+
+There are three options that affect the compilation target: ``--arch``,
+which sets the target architecture, ``--cpu``, which sets the target CPU,
+and ``--target``, which sets the target instruction set.
+
+By default, the ``ispc`` compiler generates code for the 64-bit x86-64
+architecture (i.e. ``--arch=x86-64``.)  To compile to a 32-bit x86 target,
+supply ``--arch=x86`` on the command line:
+
+::
+
+   ispc foo.ispc -o foo.obj --arch=x86
+
+No other architectures are currently supported.
+
+The target CPU determines both the default instruction set used as well as
+which CPU architecture the code is tuned for.  ``ispc --help`` provides a
+list of a number of the supported CPUs.  By default, the CPU type of the
+system on which you're running ``ispc`` is used to determine the target
+CPU.
+
+::
+
+   ispc foo.ispc -o foo.obj --cpu=corei7-avx
+
+Finally, ``--target`` selects between the SSE2, SSE4, and AVX instruction
+sets.  (As general context, SSE2 was first introduced in processors that
+shipped in 2001, SSE4 was introduced in 2007, and processors with AVX 
+were introduced in 2010.  Consult your CPU's manual for specifics on which
+vector instruction set it supports.)
+
+By default, the target instruction set is chosen based on which ones are
+supported by the system on which you're running ``ispc``.  You can override
+this choice with the ``--target`` flag; for example, to select Intel® SSE2,
+use ``--target=sse2``.  (As with the other options in this section, see the
+output of ``ispc --help`` for a full list of supported targets.)


 The ISPC Language
@@ -824,8 +869,8 @@ by default.  If a function is declared with a ``static`` qualifier, then it
 is only visible in the file in which it was declared.

 Any function that can be launched with the ``launch`` construct in ``ispc``
-must have a ``task`` qualifier; see `Task Parallelism in ISPC`_ for more
-discussion of launching tasks in ``ispc``.
+must have a ``task`` qualifier; see `Task Parallelism: Language Syntax`_
+for more discussion of launching tasks in ``ispc``.

 Functions that are intended to be called from C/C++ application code must
 have the ``export`` qualifier.  This causes them to have regular C linkage
@@ -926,8 +971,9 @@ execution model is critical for writing efficient and correct programs in

 ``ispc`` supports both task parallelism to parallelize across multiple
 cores and SPMD parallelism to parallelize across the SIMD vector lanes on a
-single core.  This section focuses on SPMD parallelism.  See the section
-`Task Parallelism in ISPC`_ for discussion of task parallelism in ``ispc``.
+single core.  This section focuses on SPMD parallelism.  See the sections
+`Task Parallelism: Language Syntax`_ and `Task Parallelism: Runtime
+Requirements`_ for discussion of task parallelism in ``ispc``.

 The SPMD-on-SIMD Execution Model
 --------------------------------
@@ -1174,7 +1220,7 @@ This code implicitly assumes that ``programCount`` evenly divides
 ::

    for (uniform int i = 0; i < count; i += programCount) {
-        if (i + programIndex < programCount) {
+        if (i + programIndex < count) {
            float d = data[i + programIndex];
            ...

@@ -1370,112 +1416,190 @@ be modified in the above code even if *none* of the program instances
 evaluated a true value for the test, given the ``ispc`` execution model.


-Task Parallelism in ISPC
------------------------
+Task Parallelism: Language Syntax
+---------------------------------

 One option for combining task-parallelism with ``ispc`` is to just use
 regular task parallelism in the C/C++ application code (be it through
-Intel® Cilk(tm), Intel® Thread Building Blocks or another task system,
-etc.), and for tasks to use ``ispc`` for SPMD parallelism across the vector
-lanes as appropriate.  Alternatively, ``ispc`` also has some support for
-launching tasks from ``ispc`` code.  The approach is similar to Intel®
-Cilk's task launch feature.  (See the ``examples/mandelbrot_tasks`` example
-to see it used in a non-trivial example.)
+Intel® Cilk(tm), Intel® Thread Building Blocks or another task system), and
+for tasks to use ``ispc`` for SPMD parallelism across the vector lanes as
+appropriate.  Alternatively, ``ispc`` also has support for launching tasks
+from ``ispc`` code.  The approach is similar to Intel® Cilk's task launch
+feature.  (See the ``examples/mandelbrot_tasks`` example to see it used in
+a small example.)

-Any function that is launched as a task must be declared with the ``task``
-qualifier:
+First, any function that is launched as a task must be declared with the
+``task`` qualifier:

 ::

-    task void func(uniform float a[], uniform int start) {
-        ....
+    task void func(uniform float a[], uniform int index) {
+        ...
+        a[index] = ....
    }

 Tasks must return ``void``; a compile time error is issued if a
 non-``void`` task is defined.

-Given a task, one can then write code that launches tasks as follows:
+Given a task definition, there are two ways to write code that launches
+tasks, using the ``launch`` construct.  First, one task can be launched at
+a time, with parameters passed to the task to help it determine what part
+of the overall computation it's responsible for:

 ::

    for (uniform int i = 0; i < 100; ++i)
-        launch < func(a, i); >
+        launch < func(a, i) >;

 Note the ``launch`` keyword and the brackets around the function call.
 This code launches 100 tasks, each of which presumably does some
-computation keyed off of given the value ``i``.  In general, one should
-launch many more tasks than there are processors in the system to
+computation that is keyed off of the given value ``i``.  In general, one
+should launch many more tasks than there are processors in the system to
 ensure good load-balancing, but not so many that the overhead of scheduling
 and running tasks dominates the computation.

-Program execution continues asynchronously after task launch; thus, the
-function shouldn't access values being generated by the tasks without
-synchronization.  A function uses a ``sync`` statement to wait for all
-launched tasks to finish:
+Alternatively, a number of tasks may be launched from a single ``launch``
+statement.  We might instead write the above example with a single
+``launch`` like this:

 ::

-    for (uniform int i = 0; i < 100; ++i)
-        launch < func(a, i); >
+    launch[100] < func2(a) >;
+
+Where an integer value (not necessarily a compile-time constant) is
+provided to the ``launch`` keyword in square brackets; this number of tasks
+will be enqueued to be run asynchronously.  Within each of the tasks, two
+special built-in variables are available--``taskIndex``, and ``taskCount``.
+The first, ``taskIndex``, ranges from zero to one less than the number of
+tasks provided to ``launch``, and ``taskCount`` equals the number of launched
+tasks.  Thus, we might use ``taskIndex`` in the implementation of ``func2``
+to determine which array element to process.
+
+::
+
+    task void func2(uniform float a[]) {
+        ...
+        a[taskIndex] = ...
+    }
+
+Program execution continues asynchronously after a ``launch`` statement;
+thus, a function shouldn't access values being generated by the tasks it
+has launched within the function without synchronization.  If results are
+needed before function return, a function can use a ``sync`` statement to
+wait for all launched tasks to finish:
+
+::
+
+    launch[100] < func2(a) >;
    sync;
    // now safe to use computed values in a[]...

-Alternatively, any function that launches tasks has an implicit ``sync``
-before it returns, so that functions that call a function that launches
-tasks don't have to worry about outstanding asynchronous computation.
+Alternatively, any function that launches tasks has an automatically-added
+``sync`` statement before it returns, so that functions that call a
+function that launches tasks don't have to worry about outstanding
+asynchronous computation from that function.

 Inside functions with the ``task`` qualifier, two additional built-in
-variables are provided: ``threadIndex`` and ``threadCount``.
-``threadCount`` gives the total number of hardware threads that have been
-launched by the task system.  ``threadIndex`` provides an index between
-zero and ``threadCount-1`` that gives a unique index that corresponds to
-the hardware thread that is executing the current task.  The
-``threadIndex`` can be used for accessing data that is private to the
-current thread and thus doesn't require synchronization to access under
-parallel execution.
+variables are provided in addition to ``taskIndex`` and ``taskCount``:
+``threadIndex`` and ``threadCount``.  ``threadCount`` gives the total
+number of hardware threads that have been launched by the task system.
+``threadIndex`` provides an index between zero and ``threadCount-1`` that
+gives a unique index that corresponds to the hardware thread that is
+executing the current task.  The ``threadIndex`` can be used for accessing
+data that is private to the current thread and thus doesn't require
+synchronization to access under parallel execution.
+
+Task Parallelism: Runtime Requirements
+--------------------------------------

 If you use the task launch feature in ``ispc``, you must provide C/C++
-implementations of two functions and link them into your final executable
-file.  Although these functions may be implemented in either language, they
-must have "C" linkage (i.e. their prototypes must be declared inside an
-``extern "C"`` block if they are defined in C++.)
+implementations of three specific functions that manage launching and
+synchronizing parallel tasks; these functions must be linked into your
+executable.  Although these functions may be implemented in any
+language, they must have "C" linkage (i.e. their prototypes must be
+declared inside an ``extern "C"`` block if they are defined in C++.)
+
+By using user-supplied versions of these functions, ``ispc`` programs can
+easily interoperate with software systems that have existing task systems
+for managing parallelism.  If you're using ``ispc`` with a system that
+isn't otherwise multi-threaded and don't want to write custom
+implementations of them, you can use the implementations of these functions
+provided in the ``examples/tasksys.cpp`` file in the ``ispc``
+distribution.
+
+If you are implementing your own task system, the remainder of this section
+discusses the requirements for these calls.  You will also likely want to
+review the example task systems in ``examples/tasksys.cpp`` for reference.
+If you are not implementing your own task system, you can skip reading the
+remainder of this section.
+
+Here are the declarations of the three functions that must be provided to
+manage tasks in ``ispc``:

 ::

-    void ISPCLaunch(void *funcptr, void *data);
-    void ISPCSync();
+    void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
+    void ISPCLaunch(void **handlePtr, void *f, void *data, int count);
+    void ISPCSync(void *handle);

-On Windows, two additional functions must be provided to dynamically
-allocate and free memory to store the arguments passed to tasks.  (On OSX
-and Linux, the stack provides memory for task arguments; on Windows, the
-stack is generally not large enough to do this for large numbers of tasks.)
+All three of these functions take an opaque handle (or a pointer to an
+opaque handle) as their first parameter.  This handle allows the task
+system runtime to distinguish between calls to these functions from
+different functions in ``ispc`` code.  In this way, the task system
+implementation can efficiently wait for completion on just the tasks
+launched from a single function.
+
+The first time one of ``ISPCLaunch()`` or ``ISPCAlloc()`` is called in an
+``ispc`` function, the ``void *`` pointed to by the ``handlePtr`` parameter
+will be ``NULL``.  The implementations of these functions should then
+initialize ``*handlePtr`` to a unique handle value of some sort.  (For
+example, it might allocate a small structure to record which tasks were
+launched by the current function.)  In subsequent calls to these functions
+in the emitted ``ispc`` code, the same value for ``handlePtr`` will be
+passed in, such that loading from ``*handlePtr`` will retrieve the value
+stored in the first call.
+
+At function exit (or at an explicit ``sync`` statement), a call to
+``ISPCSync()`` will be generated if ``*handlePtr`` is non-``NULL``.
+Therefore, the handle value is passed directly to ``ISPCSync()``, rather
+than a pointer to it, as in the other functions.
+
+The ``ISPCAlloc()`` function is used to allocate small blocks of memory to
+store parameters passed to tasks.  It should return a pointer to memory
+with the given size and alignment.  Note that there is no explicit
+``ISPCFree()`` call; instead, all memory allocated within an ``ispc``
+function should be freed when ``ISPCSync()`` is called.
+
+``ISPCLaunch()`` is called to launch one or more asynchronous
+tasks.  Each ``launch`` statement in ``ispc`` code causes a call to
+``ISPCLaunch()`` to be emitted in the generated code.  The three parameters
+after the handle pointer to this function are relatively straightforward;
+the ``void *f`` parameter holds a pointer to a function to call to run the
+work for this task, ``data`` holds a pointer to data to pass to this
+function, and ``count`` is the number of instances of this function to
+enqueue for asynchronous execution.  (In other words, ``count`` corresponds
+to the value ``n`` in a multiple-task launch statement like ``launch[n]``.)
+
+The signature of the provided function pointer ``f`` is

 ::

-    void *ISPCMalloc(int64_t size, int32_t alignment);
-    void ISPCFree(void *ptr);
+    void (*TaskFuncPtr)(void *data, int threadIndex, int threadCount,
+                        int taskIndex, int taskCount)

-These are called by the task launch code generated by the ``ispc``
-compiler; the first is called to launch to launch a task and the second is
-called to wait for, respectively.  (Factoring them out in this way
-allows ``ispc`` to inter-operate with the application's task system, if
-any, rather than having a separate one of its own.)  To run a particular
-task, the task system should cast the function pointer to a ``void (*)(void
-*, int, int)`` function pointer and then call it with the provided ``void
-*`` data and then an index for the current hardware thread and the total
-number of hardware threads the task system has launched--in other words:
-
-::
-
-    typedef void (*TaskFuncType)(void *, int, int);
-    TaskFuncType tft = (TaskFuncType)(funcptr);
-    tft(data, threadIndex, threadCount);
-
-A number of sample task system implementations are provided with ``ispc``; 
-see the files ``tasks_concrt.cpp``, ``tasks_gcd.cpp`` and
-``tasks_pthreads.cpp`` in the ``examples/mandelbrot_tasks`` directory of
-the ``ispc`` distribution.
+When this function pointer is called by one of the hardware threads managed
+by the task system, the ``data`` pointer passed to ``ISPCLaunch()`` should
+be passed to it for its first parameter; ``threadCount`` gives the total
+number of hardware threads that have been spawned to run tasks and
+``threadIndex`` should be an integer index between zero and ``threadCount-1``
+uniquely identifying the hardware thread that is running the task.  (These
+values can be used to index into thread-local storage.)

+The value of ``taskCount`` should be the number of tasks launched in the
+``launch`` statement that caused the call to ``ISPCLaunch()`` and each of
+the calls to this function should be given a unique value of ``taskIndex``
+between zero and ``taskCount-1``, to distinguish which of the instances
+of the set of launched tasks is running.

 The ISPC Standard Library
 =========================
@@ -1817,12 +1941,19 @@ given value across all of the currently-executing vector lanes.

 ::

-    uniform float reduce_min(float a, float b)
-    uniform int reduce_min(int a, int b)
-    uniform unsigned int reduce_min(unsigned int a, unsigned int b)
-    uniform float reduce_max(float a, float b)
-    uniform int reduce_max(int a, int b)
-    uniform unsigned int reduce_max(unsigned int a, unsigned int b)
+    uniform float reduce_min(float a)
+    uniform int32 reduce_min(int32 a)
+    uniform unsigned int32 reduce_min(unsigned int32 a)
+    uniform double reduce_min(double a)
+    uniform int64 reduce_min(int64 a)
+    uniform unsigned int64 reduce_min(unsigned int64 a)
+
+    uniform float reduce_max(float a)
+    uniform int32 reduce_max(int32 a)
+    uniform unsigned int32 reduce_max(unsigned int32 a)
+    uniform double reduce_max(double a)
+    uniform int64 reduce_max(int64 a)
+    uniform unsigned int64 reduce_max(unsigned int64 a)

 Finally, you can check to see if a particular value has the same value in
 all of the currently-running program instances:
@@ -2020,12 +2151,12 @@ end.)

 One thing to note is that that the value being added to here is a
 ``uniform`` integer, while the increment amount and the return value are
-``varying``.  In other words, the semantics are that each running program
-instance individually issues the atomic operation with its own ``delta``
-value and gets the previous value of ``val`` back in return.  The atomics
-for the running program instances may be issued in arbitrary order; it's
-not guaranteed that they will be issued in ``programIndex`` order, for
-example.
+``varying``.  In other words, the semantics of this call are that each
+running program instance individually issues the atomic operation with its
+own ``delta`` value and gets the previous value of ``val`` back in return.
+The atomics for the running program instances may be issued in arbitrary
+order; it's not guaranteed that they will be issued in ``programIndex``
+order, for example.

 Here are the declarations of the ``int32`` variants of these functions.
 There are also ``int64`` equivalents as well as variants that take
@@ -2043,17 +2174,44 @@ function can be used with ``float`` and ``double`` types as well.)
  int32 atomic_xor_global(reference uniform int32 val, int32 value)
  int32 atomic_swap_global(reference uniform int32 val, int32 newval)

-There is also an atomic "compare and exchange" function; it atomically
-compares the value in "val" to "compare"--if they match, it assigns
-"newval" to "val".  In either case, the old value of "val" is returned.
-(As with the other atomic operations, there are also ``unsigned`` and
-64-bit variants of this function.  Furthermore, there are ``float`` and
-``double`` variants as well.)
+There are also variants of these functions that take ``uniform`` values for
+the operand and return a ``uniform`` result:

 ::

+  uniform int32 atomic_add_global(reference uniform int32 val,
+                                  uniform int32 value)
+  uniform int32 atomic_subtract_global(reference uniform int32 val,
+                                       uniform int32 value)
+  uniform int32 atomic_min_global(reference uniform int32 val,
+                                  uniform int32 value)
+  uniform int32 atomic_max_global(reference uniform int32 val,
+                                  uniform int32 value)
+  uniform int32 atomic_and_global(reference uniform int32 val,
+                                  uniform int32 value)
+  uniform int32 atomic_or_global(reference uniform int32 val,
+                                  uniform int32 value)
+  uniform int32 atomic_xor_global(reference uniform int32 val,
+                                  uniform int32 value)
+  uniform int32 atomic_swap_global(reference uniform int32 val,
+                                   uniform int32 newval)
+
+There are also atomic swap and "compare and exchange" functions.
+Compare and exchange atomically compares the value in "val" to
+"compare"--if they match, it assigns "newval" to "val".  In either case,
+the old value of "val" is returned.  (As with the other atomic operations,
+there are also ``unsigned`` and 64-bit variants of this function.
+Furthermore, there are ``float`` and ``double`` variants as well.)
+
+::
+
+  int32 atomic_swap_global(reference uniform int32 val, int32 new)
+  uniform int32 atomic_swap_global(reference uniform int32 val,
+                                   uniform int32 new)
  int32 atomic_compare_exchange_global(reference uniform int32 val,
                                       int32 compare, int32 newval)
+  uniform int32 atomic_compare_exchange_global(reference uniform int32 val,
+                                  uniform int32 compare, uniform int32 newval)

 ``ispc`` also has a standard library routine that inserts a memory barrier
 into the code; it ensures that all memory reads and writes prior to be
@@ -2102,6 +2260,20 @@ These functions are available for all of the basic types in the
 language--``int8``, ``int16``, ``int32``, ``float``, and so forth.


+System Information
+------------------
+
+A routine is available to find the number of CPU cores available in the
+system:
+
+::
+
+    int num_cores()
+
+This value can be useful for adapting the granularity of parallel task
+decomposition depending on the number of processors in the system.
+
+
 Low-Level Bits
 --------------

@@ -2209,14 +2381,14 @@ Both the ``foo`` and ``bar`` global variables can be accessed on each
 side.

 ``ispc`` code can also call back to C/C++.  On the ``ispc`` side, any
-application functions to be called must be declared with the ``export "C"``
+application functions to be called must be declared with the ``extern "C"``
 qualifier.

 ::

   extern "C" void foo(uniform float f, uniform float g);

-Unlike in C++, ``export "C"`` doesn't take braces to delineate
+Unlike in C++, ``extern "C"`` doesn't take braces to delineate
 multiple functions to be declared; thus, multiple C functions to be called
 from ``ispc`` must be declared as follows:

@@ -2843,6 +3015,157 @@ values to ``outArray[1]`` and ``outArray[2]``, and so forth.  The
 ``reduce_add`` call at the end returns the total number of values that the
 program instances have written to the array.

+Application-Supplied Execution Masks
+------------------------------------
+
+Recall that when execution transitions from the application code to an
+``ispc`` function, all of the program instances are initially executing.
+In some cases, it may be desired that only some of them are running, based on
+a data-dependent condition computed in the application program.  This
+situation can easily be handled via an additional parameter from the
+application.
+
+As a simple example, consider a case where the application code has an
+array of ``float`` values and we'd like the ``ispc`` code to update
+just specific values in that array, where the set of values to be
+updated has been determined by the application.  In C++ code, we might
+have:
+
+::
+
+    int count = ...;
+    float *array = new float[count];
+    bool *shouldUpdate = new bool[count];
+    // initialize array and shouldUpdate
+    ispc_func(array, shouldUpdate, count);
+
+Then, the ``ispc`` code could process this update as:
+
+::
+
+    export void ispc_func(uniform float array[], uniform bool update[],
+                          uniform int count) {
+        for (uniform int i = 0; i < count; i += programCount) {
+            cif (update[i+programIndex] == true)
+                // update array[i+programIndex]...
+        }
+    }
+
+(In this case a "coherent" if statement is likely to be worthwhile if the
+``update`` array will tend to have sections that are either all-true or
+all-false.)
+
+Explicit Vector Programming With Uniform Short Vector Types
+-----------------------------------------------------------
+
+The typical model for programming in ``ispc`` is an *implicit* parallel
+model, where one writes a program that is apparently doing scalar
+computation on values and the program is then vectorized to run in parallel
+across the SIMD lanes of a processor.  However, ``ispc`` also has some
+support for explicit vector unit programming, where the vectorization is
+explicit.  Some computations may be more effectively described in the
+explicit model rather than the implicit model.
+
+This support is provided via ``uniform`` instances of short vectors 
+(as were introduced in the `Short Vector Types`_ section).  Specifically, 
+if this short program
+
+::
+
+    export uniform float<8> madd(uniform float<8> a, 
+                                 uniform float<8> b, uniform float<8> c) {
+        return a + b * c;
+    }
+
+is compiled with the AVX target, ``ispc`` generates the following assembly:
+
+::
+    _madd:
+	vmulps	%ymm2, %ymm1, %ymm1
+	vaddps	%ymm0, %ymm1, %ymm0
+	ret
+
+(And similarly, if compiled with a 4-wide SSE target, two ``mulps`` and two
+``addps`` instructions are generated, and so forth.)
+
+Note that ``ispc`` doesn't currently support control-flow based on
+``uniform`` short vector types; it is thus not possible to write code like:
+
+::
+
+    export uniform int<8> count(uniform float<8> a, uniform float<8> b) {
+        uniform int<8> sum = 0;
+        while (a++ < b)
+            ++sum;
+    }
+
+
+Choosing A Target Vector Width
+------------------------------
+
+By default, ``ispc`` compiles to the natural vector width of the target
+instruction set.  For example, for SSE2 and SSE4, it compiles four-wide,
+and for AVX, it compiles 8-wide.  For some programs, higher performance may
+be seen if the program is compiled to a doubled vector width--8-wide for
+SSE and 16-wide for AVX.  
+
+For workloads that don't require many registers, this method can lead to
+significantly more efficient execution thanks to greater instruction level
+parallelism and amortization of various overhead over more program
+instances.  For other workloads, it may lead to a slowdown due to higher
+register pressure; trying both approaches for key kernels may be
+worthwhile.
+
+This option is currently only available for the SSE4 and AVX targets, and
+is selected with the ``--target=sse4-x2`` and ``--target=avx-x2`` options,
+respectively.
+
+Compiling With Support For Multiple Instruction Sets
+----------------------------------------------------
+
+``ispc`` can also generate output that supports multiple target instruction
+sets, choosing the most appropriate one at runtime.  For example, if you
+run the command:
+
+::
+
+   ispc foo.ispc -o foo.o --target=sse2,sse4-x2,avx-x2
+
+Then four object files will be generated: ``foo_sse2.o``, ``foo_sse4.o``,
+``foo_avx.o``, and ``foo.o``. [#]_  Link all of these into your executable, and
+when you call a function in ``foo.ispc`` from your application code,
+``ispc`` will determine which instruction sets are supported by the CPU the
+code is running on and will call the most appropriate version of the
+function available.  
+
+.. [#] Similarly, if you choose to generate assembly language output or
+   LLVM bitcode output, multiple versions of those files will be created.
+
+In general, the version of the function that runs will be the one in the
+most general instruction set that is supported by the system.  If you only
+compile SSE2 and SSE4 variants and run on a system that supports AVX, for
+example, then the SSE4 variant will be executed.  If the system
+is not able to run any of the available variants of the function (for
+example, trying to run a function that only has SSE4 and AVX variants on a
+system that only supports SSE2), then the standard library ``abort()``
+function will be called.
+
+One subtlety is that all non-static global variables (if any) must have the
+same size and layout with all of the targets used.  For example, if you
+have the global variables:
+
+::
+
+   uniform int foo[2*programCount];
+   int bar;
+
+and compile to both SSE2 and AVX targets, both of these variables will have
+different sizes (the first due to program count having the value 4 for SSE2
+and 8 for AVX, and the second due to ``varying`` types having different
+numbers of elements with the two targets--essentially the same issue as the
+first.)
+
+
 Disclaimer and Legal Information
 ================================

--- a/doxygen.cfg
+++ b/doxygen.cfg
@@ -31,7 +31,7 @@ PROJECT_NAME           = "Intel SPMD Program Compiler"
 # This could be handy for archiving the generated documentation or
 # if some version control system is used.

-PROJECT_NUMBER         = 1.0.7
+PROJECT_NUMBER         = 1.0.11

 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
 # base path where the generated documentation will be put.
@@ -585,7 +585,6 @@ INPUT                  = builtins.h \
                         ctx.h \
                         decl.h \
                         expr.h \
-                         gatherbuf.h \
                         ispc.h \
                         llvmutil.h \
                         module.h \
@@ -598,7 +597,6 @@ INPUT                  = builtins.h \
                         ctx.cpp \
                         decl.cpp \
                         expr.cpp \
-                         gatherbuf.cpp \
                         ispc.cpp \
                         llvmutil.cpp \
                         main.cpp \
--- a/examples/README.txt
+++ b/examples/README.txt
@@ -14,6 +14,7 @@ the runtimes and the speedup delivered by ispc.  It may be instructive to
 do a side-by-side diff of the C++ and ispc implementations of these
 algorithms to learn more about wirting ispc code.

+ 
 AOBench
 =======

@@ -27,6 +28,7 @@ It executes the program for the given number of iterations, rendering an
 (xres x yres) image each time and measuring the computation time with both
 serial and ispc implementations.

+
 AOBench_Instrumented
 ====================

@@ -40,12 +42,47 @@ is provided in the instrument.cpp file.
 *** Note: on Linux, this example currently hits an assertion in LLVM during
 *** compilation

+
+Deferred
+========
+
+This is an extensive example of using ispc for efficient
+deferred shading of scenes with thousands of lights; it's an implementation
+of the algorithm that Johan Andersson described at SIGGRAPH 2009,
+implemented by Andrew Lauritzen and Jefferson Montgomery.  The basic idea
+is that a pre-rendered G-buffer is partitioned into tiles, and in each
+tile, the set of lights that contribute to the tile is first computed.
+Then, the pixels in the tile are shaded using just those light
+sources. (See slides 19-29 of
+http://s09.idav.ucdavis.edu/talks/04-JAndersson-ParallelFrostbite-Siggraph09.pdf
+for more details on the algorithm.)
+
+This directory includes three implementations of the algorithm:
+
+- An ispc implementation that first does a static partitioning of the
+  screen into tiles to parallelize across the CPU cores.  Within each tile
+  ispc kernels provide highly efficient implementations of the light
+  culling and shading calculations.
+- A "best practices" serial C++ implementation.  This implementation does a
+  dynamic partitioning of the screen, refining tiles with significant Z
+  depth complexity (these tiles often have a large number of lights that
+  affect them).  Within each final tile, the pixels are shaded using
+  regular C++ code.
+- If the Cilk extensions are available in your compiler, an ispc
+  implementation that uses Cilk will also be built.
+  (See http://software.intel.com/en-us/articles/intel-cilk-plus/).  Like 
+  the "best practices" serial implementation, this version does dynamic
+  tile partitioning for better load balancing and then uses ispc for the
+  light culling and shading.
+
+
 Mandelbrot
 ==========

 Mandelbrot set generation.  This example is extensively documented at the
 http://ispc.github.com/example.html page.

+
 Mandelbrot_tasks
 ================

@@ -58,6 +95,7 @@ using tasks with ispc, no task system is mandated; the user is free to plug
 in any task system they want, for ease of interoperating with existing task
 systems.

+
 Noise
 =====

@@ -71,6 +109,7 @@ Options
 This program implements both the Black-Scholes and Binomial options pricing
 models in both ispc and regular serial C++ code.

+
 RT
 ==

@@ -87,6 +126,7 @@ and triangle intersection code from pbrt; see the pbrt source code and/or
 "Physically Based Rendering" book for more about the basic algorithmic
 details.

+
 Simple
 ======

@@ -94,6 +134,7 @@ This is a simple "hello world" type program that shows a ~10 line
 application program calling out to a ~5 line ispc program to do a simple
 computation.

+
 Volume
 ======

--- a/examples/aobench/Makefile
+++ b/examples/aobench/Makefile
@@ -1,8 +1,18 @@

-CXX=g++ -m64
-CXXFLAGS=-Iobjs/ -O3 -Wall
+ARCH = $(shell uname)
+
+TASK_CXX=../tasksys.cpp
+TASK_LIB=-lpthread
+TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
+
+CXX=g++
+CXXFLAGS=-Iobjs/ -O3 -Wall -m64
 ISPC=ispc
-ISPCFLAGS=-O2 --fast-math --arch=x86-64
+ISPCFLAGS=-O2 --target=sse2,sse4,avx --arch=x86-64
+
+ISPC_OBJS=objs/ao_ispc.o objs/ao_ispc_sse2.o objs/ao_ispc_sse4.o \
+	objs/ao_ispc_avx.o
+OBJS=objs/ao.o objs/ao_serial.o $(ISPC_OBJS) $(TASK_OBJ)

 default: ao

@@ -14,13 +24,16 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ ao

-ao: dirs objs/ao.o objs/ao_serial.o objs/ao_ispc.o
-	$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/ao_serial.o -lm -lpthread
+ao: dirs $(OBJS) $(TASK_OBJ)
+	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)

 objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@

+objs/%.o: ../%.cpp
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
 objs/ao.o: objs/ao_ispc.h 

-objs/%_ispc.h objs/%_ispc.o: %.ispc
+objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
 	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/aobench/ao.cpp
+++ b/examples/aobench/ao.cpp
@@ -55,7 +55,6 @@
 using namespace ispc;

 #include "../timing.h"
-#include "../cpuid.h"

 #define NSUBSAMPLES        2

@@ -105,38 +104,6 @@ savePPM(const char *fname, int w, int h)
 }


-// Make sure that the vector ISA used during compilation is supported by
-// the processor.  The ISPC_TARGET_* macro is set in the ispc-generated
-// header file that we include above.
-static void
-ensureTargetISAIsSupported() {
-#if defined(ISPC_TARGET_SSE2)
-    bool isaSupported = CPUSupportsSSE2();
-    const char *target = "SSE2";
-#elif defined(ISPC_TARGET_SSE4)
-    bool isaSupported = CPUSupportsSSE4();
-    const char *target = "SSE4";
-#elif defined(ISPC_TARGET_AVX)
-    bool isaSupported = CPUSupportsAVX();
-    const char *target = "AVX";
-#else
-#error "Unknown ISPC_TARGET_* value"
-#endif
-    if (!isaSupported) {
-        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
-                "set, which isn't\n***        supported by this computer's CPU!\n", target);
-        fprintf(stderr, "***\n***        Please modify the "
-#ifdef _MSC_VER
-                "MSVC project file "
-#else
-                "Makefile "
-#endif
-                "to select another target (e.g. sse2)\n***\n");
-        exit(1);
-    }
-}
-
-
 int main(int argc, char **argv)
 {
    if (argc != 4) {
@@ -151,8 +118,6 @@ int main(int argc, char **argv)
        height = atoi (argv[3]);
    }

-    ensureTargetISAIsSupported();
-
    // Allocate space for output images
    img = new unsigned char[width * height * 3];
    fimg = new float[width * height * 3];
@@ -173,10 +138,30 @@ int main(int argc, char **argv)
    }

    // Report results and save image
-    printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n", minTimeISPC, 
-           width, height);
+    printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n", 
+           minTimeISPC, width, height);
    savePPM("ao-ispc.ppm", width, height); 

+    //
+    // Run the ispc + tasks path, test_iterations times, and report the
+    // minimum time for any of them.
+    //
+    double minTimeISPCTasks = 1e30;
+    for (unsigned int i = 0; i < test_iterations; i++) {
+        memset((void *)fimg, 0, sizeof(float) * width * height * 3);
+        assert(NSUBSAMPLES == 2);
+
+        reset_and_start_timer();
+        ao_ispc_tasks(width, height, NSUBSAMPLES, fimg);
+        double t = get_elapsed_mcycles();
+        minTimeISPCTasks = std::min(minTimeISPCTasks, t);
+    }
+
+    // Report results and save image
+    printf("[aobench ispc + tasks]:\t\t[%.3f] M cycles (%d x %d image)\n", 
+           minTimeISPCTasks, width, height);
+    savePPM("ao-ispc-tasks.ppm", width, height); 
+
    //
    // Run the serial path, again test_iteration times, and report the
    // minimum time.
@@ -193,7 +178,8 @@ int main(int argc, char **argv)
    // Report more results, save another image...
    printf("[aobench serial]:\t\t[%.3f] M cycles (%d x %d image)\n", minTimeSerial, 
           width, height);
-    printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minTimeSerial / minTimeISPC);
+    printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", 
+           minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
    savePPM("ao-serial.ppm", width, height); 
        
    return 0;
--- a/examples/aobench/ao.ispc
+++ b/examples/aobench/ao.ispc
@@ -203,8 +203,9 @@ ambient_occlusion(reference Isect isect, reference Plane plane,
 /* Compute the image for the scanlines from [y0,y1), for an overall image
   of width w and height h.
 */
-void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h, 
-                  uniform int nsubsamples, reference uniform float image[]) {
+static void ao_scanlines(uniform int y0, uniform int y1, uniform int w, 
+                         uniform int h,  uniform int nsubsamples, 
+                         reference uniform float image[]) {
    static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
    static Sphere spheres[3] = {
        { { -2.0f, 0.0f, -3.5f }, 0.5f },
@@ -231,6 +232,9 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
    // direction we do per iteration and ny the number in y.
    uniform int nx = 1, ny = 1;

+    // FIXME: We actually need ny to be 1 regardless of the decomposition,
+    // since the task decomposition is one scanline high.
+
    if (programCount == 8) {
        // Do two pixels at once in the x direction
        nx = 2;
@@ -239,19 +243,21 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
            ++du;
    }
    else if (programCount == 16) {
-        // Two at once in both x and y
-        nx = ny = 2;
-        if ((programIndex >= 4 && programIndex < 8) || programIndex >= 12)
+        nx = 4;
+        ny = 1;
+        if (programIndex >= 4 && programIndex < 8)
            ++du;
-        if (programIndex >= 8)  
-            ++dv;
+        if (programIndex >= 8 && programIndex < 12)
+            du += 2;
+        if (programIndex >= 12)
+            du += 3;
    }

    // Now loop over all of the pixels, stepping in x and y as calculated
    // above.  (Assumes that ny divides y and nx divides x...)
    for (uniform int y = y0; y < y1; y += ny) {
        for (uniform int x = 0; x < w; x += nx)  {
-            // Figur out x,y pixel in NDC
+            // Figure out x,y pixel in NDC
            float px =  (x + du - (w / 2.0f)) / (w / 2.0f);
            float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
            float ret = 0.f;
@@ -293,7 +299,7 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,

            // offset to the first pixel in the image
            uniform int offset = 3 * (y * w + x);
-            for (uniform int p = 0; p < programCount; p += 4, ++offset) {
+            for (uniform int p = 0; p < programCount; p += 4, offset += 3) {
                // Get the four sample values for this pixel
                uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
                    retArray[p+3];
@@ -315,3 +321,15 @@ export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
                    uniform float image[]) {
    ao_scanlines(0, h, w, h, nsubsamples, image);
 }
+
+
+static void task ao_task(uniform int width, uniform int height, 
+                         uniform int nsubsamples, uniform float image[]) {
+    ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image);
+}
+
+
+export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, 
+                          uniform float image[]) {
+    launch[h] < ao_task(w, h, nsubsamples, image) >;
+}
--- a/examples/aobench/aobench.vcxproj
+++ b/examples/aobench/aobench.vcxproj
@@ -21,22 +21,23 @@
  <ItemGroup>
    <ClCompile Include="ao.cpp" />
    <ClCompile Include="ao_serial.cpp" />
+    <ClCompile Include="../tasksys.cpp" />
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="ao.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
    </CustomBuild>
  </ItemGroup>
  <PropertyGroup Label="Globals">
@@ -85,15 +86,19 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ExecutablePath);$(ProjectDir)..\..</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -102,6 +107,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -117,6 +123,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -134,6 +141,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -152,6 +160,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
--- a/examples/aobench_instrumented/Makefile
+++ b/examples/aobench_instrumented/Makefile
@@ -2,7 +2,7 @@
 CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -g3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --fast-math --instrument --arch=x86-64
+ISPCFLAGS=-O2 --instrument --arch=x86-64 --target=sse2

 default: ao

--- a/examples/aobench_instrumented/ao.cpp
+++ b/examples/aobench_instrumented/ao.cpp
@@ -32,7 +32,6 @@
 */

 #ifdef _MSC_VER
-#define _CRT_SECURE_NO_WARNINGS
 #define NOMINMAX
 #pragma warning (disable: 4244)
 #pragma warning (disable: 4305)
@@ -56,7 +55,6 @@ using namespace ispc;

 #include "instrument.h"
 #include "../timing.h"
-#include "../cpuid.h"

 #define NSUBSAMPLES        2

@@ -104,37 +102,6 @@ savePPM(const char *fname, int w, int h)
 }


-// Make sure that the vector ISA used during compilation is supported by
-// the processor.  The ISPC_TARGET_* macro is set in the ispc-generated
-// header file that we include above.
-static void
-ensureTargetISAIsSupported() {
-#if defined(ISPC_TARGET_SSE2)
-    bool isaSupported = CPUSupportsSSE2();
-    const char *target = "SSE2";
-#elif defined(ISPC_TARGET_SSE4)
-    bool isaSupported = CPUSupportsSSE4();
-    const char *target = "SSE4";
-#elif defined(ISPC_TARGET_AVX)
-    bool isaSupported = CPUSupportsAVX();
-    const char *target = "AVX";
-#else
-#error "Unknown ISPC_TARGET_* value"
-#endif
-    if (!isaSupported) {
-        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
-                "set, which isn't\n***        supported by this computer's CPU!\n", target);
-        fprintf(stderr, "***\n***        Please modify the "
-#ifdef _MSC_VER
-                "MSVC project file "
-#else
-                "Makefile "
-#endif
-                "to select another target (e.g. sse2)\n***\n");
-        exit(1);
-    }
-}
-

 int main(int argc, char **argv)
 {
@@ -150,8 +117,6 @@ int main(int argc, char **argv)
        height = atoi (argv[3]);
    }

-    ensureTargetISAIsSupported();
-
    // Allocate space for output images
    img = new unsigned char[width * height * 3];
    fimg = new float[width * height * 3];
--- a/examples/aobench_instrumented/aobench_instrumented.vcxproj
+++ b/examples/aobench_instrumented/aobench_instrumented.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -25,18 +25,18 @@
  <ItemGroup>
    <CustomBuild Include="ao.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --instrument --target=sse2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --instrument
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --instrument --target=sse2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --instrument --target=sse2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --instrument
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --instrument --target=sse2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
    </CustomBuild>
  </ItemGroup>
  <PropertyGroup Label="Globals">
@@ -85,15 +85,19 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -101,7 +105,8 @@
      </PrecompiledHeader>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -114,7 +119,8 @@
      </PrecompiledHeader>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -129,7 +135,8 @@
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -146,7 +153,8 @@
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
--- a/examples/deferred/Makefile
+++ b/examples/deferred/Makefile
@@ -0,0 +1,38 @@
+
+# Build script for the deferred shading example.
+
+# Host OS name as reported by uname.  Not referenced anywhere else in this
+# Makefile.
+ARCH = $(shell uname)
+
+# The task system implementation lives one directory up; TASK_OBJ maps
+# ../tasksys.cpp to objs/tasksys.o.
+TASK_CXX=../tasksys.cpp
+TASK_LIB=-lpthread
+TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
+
+# -Iobjs/ lets the C++ sources include the ispc-generated header that is
+# written to objs/.
+CXX=g++
+CXXFLAGS=-Iobjs/ -O3 -Wall -m64
+ISPC=ispc
+# Three targets are requested in a single ispc invocation; see the
+# multi-target pattern rule at the bottom of this file for the objects that
+# one invocation is expected to produce.
+ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64 --math-lib=fast
+
+OBJS=objs/main.o objs/common.o objs/kernels_ispc.o objs/kernels_ispc_sse2.o \
+	objs/kernels_ispc_sse4.o objs/kernels_ispc_avx.o \
+	objs/dynamic_c.o objs/dynamic_cilk.o
+
+default: deferred_shading
+
+.PHONY: dirs clean
+# Keep the generated header even if make would otherwise treat it as an
+# intermediate file and delete it.
+.PRECIOUS: objs/kernels_ispc.h
+
+dirs:
+	/bin/mkdir -p objs/
+
+clean:
+	/bin/rm -rf objs *~ deferred_shading
+
+# NOTE(review): `dirs` is a phony target listed as a normal prerequisite, so
+# make considers deferred_shading out of date on every run and relinks it.
+# An order-only prerequisite would avoid that -- confirm before changing.
+deferred_shading: dirs $(OBJS) $(TASK_OBJ)
+	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) $(TASK_OBJ) -lm $(TASK_LIB)
+
+# Every local C++ source depends on the generated ispc header and deferred.h.
+objs/%.o: %.cpp objs/kernels_ispc.h deferred.h
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+# Sources from the parent directory (the task system).
+objs/%.o: ../%.cpp
+	$(CXX) $< $(CXXFLAGS) -c -o $@
+
+# One ispc run is declared to produce the header, the base object and one
+# object per instruction set; only the base object and header names are passed
+# on the command line.
+objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
+	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/deferred/common.cpp
+++ b/examples/deferred/common.cpp
@@ -0,0 +1,209 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#include <fcntl.h>
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <algorithm>
+#include <assert.h>
+#include <vector>
+#ifdef ISPC_IS_WINDOWS
+  #define WIN32_LEAN_AND_MEAN
+  #include <windows.h>
+#endif
+#ifdef ISPC_IS_LINUX
+  #include <malloc.h>
+#endif
+#include "deferred.h"
+#include "../timing.h"
+
+///////////////////////////////////////////////////////////////////////////
+
+// Allocates `size` bytes at an address that is a multiple of `alignment`.
+//
+// `alignment` must be a power of two (the manual path masks with
+// alignment-1) and is presumably at least sizeof(void*), since that path
+// stores a pointer immediately below the returned address.
+//
+// Returns NULL if the underlying allocation fails.  Memory returned here
+// must be released with lAlignedFree(), not free().
+static void *
+lAlignedMalloc(size_t size, int32_t alignment) {
+#if defined(ISPC_IS_WINDOWS)
+    return _aligned_malloc(size, alignment);
+#elif defined(ISPC_IS_LINUX)
+    return memalign(alignment, size);
+#else
+    // No aligned allocator is used here (OS X and any other platform):
+    // over-allocate, round the address up, and remember the pointer malloc()
+    // really returned in the slot just below the aligned address.
+    //
+    // The rounding below always advances by between 1 and `alignment` bytes
+    // (a full `alignment` when the address is already aligned), so a full
+    // `alignment` bytes of slack are needed; reserving only alignment-1
+    // leaves the block one byte short in the already-aligned case.
+    void *mem = malloc(size + alignment + sizeof(void*));
+    if (mem == NULL)
+        return NULL;
+    char *amem = ((char*)mem) + sizeof(void*);
+    amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
+                                        (alignment - 1)));
+    ((void**)amem)[-1] = mem;
+    return amem;
+#endif
+}
+
+
+// Releases memory obtained from lAlignedMalloc().  Passing NULL is a no-op
+// on the manual-alignment path.
+static void
+lAlignedFree(void *ptr) {
+#if defined(ISPC_IS_WINDOWS)
+    _aligned_free(ptr);
+#elif defined(ISPC_IS_LINUX)
+    free(ptr);
+#else
+    // Manual-alignment path: the pointer originally returned by malloc() was
+    // stored just below the aligned address.  Guard against NULL so that the
+    // slot below address zero is never read.
+    if (ptr != NULL)
+        free(((void**)ptr)[-1]);
+#endif
+}
+
+
+// Allocates one ALIGNMENT_BYTES-aligned plane of width*height bytes for each
+// of the r, g and b channels.  The planes are not initialised here.
+Framebuffer::Framebuffer(int width, int height)
+    : nPixels(width * height) {
+    uint8_t **planes[3] = { &r, &g, &b };
+    for (int c = 0; c < 3; ++c)
+        *planes[c] = (uint8_t *)lAlignedMalloc(nPixels, ALIGNMENT_BYTES);
+}
+
+
+// Releases the three channel planes allocated by the constructor.
+Framebuffer::~Framebuffer() {
+    uint8_t *planes[3] = { r, g, b };
+    for (int c = 0; c < 3; ++c)
+        lAlignedFree(planes[c]);
+}
+
+
+// Sets every byte of the r, g and b planes to zero.
+void
+Framebuffer::clear() {
+    uint8_t *planes[3] = { r, g, b };
+    for (int c = 0; c < 3; ++c)
+        memset(planes[c], 0, nPixels);
+}
+
+
+// Loads an input data set from the binary file at `path`.
+//
+// The file is read as one ispc::InputHeader followed by a chunk of
+// header.inputDataChunkSize bytes.  Each pointer in `arrays` is then set to
+// the position inside that chunk given by the matching entry of
+// header.inputDataArrayOffsets[].
+//
+// Returns a newly allocated InputData on success.  Returns NULL if the file
+// can't be opened, the chunk can't be allocated, or either read comes up
+// short; in those cases the file is closed and everything allocated here is
+// released before returning.
+InputData *
+CreateInputDataFromFile(const char *path) {
+    FILE *in = fopen(path, "rb");
+    if (!in) return 0;
+
+    InputData *input = new InputData;
+    input->chunk = NULL;
+    bool loaded = false;
+
+    // Load header
+    if (fread(&input->header, sizeof(ispc::InputHeader), 1, in) != 1) {
+        fprintf(stderr, "Premature EOF reading file \"%s\"\n", path);
+    }
+    else {
+        // Load data chunk
+        input->chunk = (uint8_t *)lAlignedMalloc(input->header.inputDataChunkSize,
+                                                 ALIGNMENT_BYTES);
+        if (input->chunk == NULL)
+            fprintf(stderr, "Unable to allocate memory for file \"%s\"\n", path);
+        else if (fread(input->chunk, input->header.inputDataChunkSize, 1, in) != 1)
+            fprintf(stderr, "Premature EOF reading file \"%s\"\n", path);
+        else
+            loaded = true;
+    }
+
+    // The file is no longer needed whether or not loading succeeded.
+    fclose(in);
+
+    if (!loaded) {
+        // Single cleanup path for every failure above.
+        if (input->chunk != NULL)
+            lAlignedFree(input->chunk);
+        delete input;
+        return NULL;
+    }
+
+    // Point each array at its offset within the chunk
+    input->arrays.zBuffer =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaZBuffer]];
+    input->arrays.normalEncoded_x =
+        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_x]];
+    input->arrays.normalEncoded_y =
+        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaNormalEncoded_y]];
+    input->arrays.specularAmount =
+        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularAmount]];
+    input->arrays.specularPower =
+        (uint16_t *)&input->chunk[input->header.inputDataArrayOffsets[idaSpecularPower]];
+    input->arrays.albedo_x =
+        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_x]];
+    input->arrays.albedo_y =
+        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_y]];
+    input->arrays.albedo_z =
+        (uint8_t *)&input->chunk[input->header.inputDataArrayOffsets[idaAlbedo_z]];
+    input->arrays.lightPositionView_x =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_x]];
+    input->arrays.lightPositionView_y =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_y]];
+    input->arrays.lightPositionView_z =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightPositionView_z]];
+    input->arrays.lightAttenuationBegin =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationBegin]];
+    input->arrays.lightColor_x =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_x]];
+    input->arrays.lightColor_y =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_y]];
+    input->arrays.lightColor_z =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightColor_z]];
+    input->arrays.lightAttenuationEnd =
+        (float *)&input->chunk[input->header.inputDataArrayOffsets[idaLightAttenuationEnd]];
+
+    return input;
+}
+
+
+// Frees the data chunk owned by `input`.  A NULL `input`, or one whose chunk
+// has already been released, is ignored, so calling this twice is harmless.
+//
+// After the call the pointers in input->arrays no longer refer to valid
+// memory.
+//
+// NOTE(review): the InputData object itself (allocated with `new` in
+// CreateInputDataFromFile) is not deleted here; confirm with the callers
+// whether they are expected to delete it.
+void DeleteInputData(InputData *input) {
+    if (input == NULL || input->chunk == NULL)
+        return;
+    lAlignedFree(input->chunk);
+    input->chunk = NULL;
+}
+
+
+// Writes `framebuffer` to `filename` as a binary PPM ("P6") image whose
+// width and height are taken from input->header.
+//
+// The framebuffer keeps each channel in its own plane, so the planes are
+// first interleaved into a temporary buffer holding R, G, B for each pixel
+// in turn.  If the buffer can't be allocated, the file can't be opened, or
+// the pixel data can't be written, a message is printed to stderr.  The
+// temporary buffer and the file are released on every path.
+void WriteFrame(const char *filename, const InputData *input,
+                const Framebuffer &framebuffer) {
+    // Deswizzle and copy to RGB output
+    // Doesn't need to be fast... only happens once
+    size_t imageBytes = 3 * input->header.framebufferWidth *
+        input->header.framebufferHeight;
+    const size_t nPixels = imageBytes / 3;
+    uint8_t* framebufferAOS = (uint8_t *)lAlignedMalloc(imageBytes, ALIGNMENT_BYTES);
+    if (framebufferAOS == NULL) {
+        fprintf(stderr, "Unable to allocate image buffer for \"%s\"\n", filename);
+        return;
+    }
+
+    // Every byte of the buffer is assigned below, so no separate clear is
+    // needed.
+    for (size_t i = 0; i < nPixels; ++i) {
+        framebufferAOS[3 * i + 0] = framebuffer.r[i];
+        framebufferAOS[3 * i + 1] = framebuffer.g[i];
+        framebufferAOS[3 * i + 2] = framebuffer.b[i];
+    }
+
+    // Write out simple PPM file
+    FILE *out = fopen(filename, "wb");
+    if (out == NULL) {
+        fprintf(stderr, "Unable to open \"%s\" for writing\n", filename);
+        lAlignedFree(framebufferAOS);
+        return;
+    }
+    fprintf(out, "P6 %d %d 255\n", input->header.framebufferWidth,
+            input->header.framebufferHeight);
+    // fwrite() with a zero-sized element returns 0, which is not an error.
+    if (imageBytes > 0 && fwrite(framebufferAOS, imageBytes, 1, out) != 1)
+        fprintf(stderr, "Error writing image data to \"%s\"\n", filename);
+    fclose(out);
+
+    lAlignedFree(framebufferAOS);
+}
--- a/examples/deferred/data/pp1280x720.bin
+++ b/examples/deferred/data/pp1280x720.bin
--- a/examples/deferred/data/pp1920x1200.bin
+++ b/examples/deferred/data/pp1920x1200.bin
--- a/examples/deferred/deferred.h
+++ b/examples/deferred/deferred.h
@@ -0,0 +1,108 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifndef DEFERRED_H
+#define DEFERRED_H
+
+// Currently tile widths must be a multiple of the SIMD width (e.g. 8 for ispc sse4-x2)!
+#define MIN_TILE_WIDTH 16
+#define MIN_TILE_HEIGHT 16
+#define MAX_LIGHTS 1024
+
+// Identifies each input array stored in a data file.  common.cpp uses these
+// values to index header.inputDataArrayOffsets[] when locating an array
+// inside the loaded data chunk.  The element types noted below are the ones
+// common.cpp casts each array to.
+//
+// This enum sits outside the "#ifndef ISPC" section that follows, so it is
+// also seen when ISPC is defined.
+enum InputDataArraysEnum {
+    idaZBuffer = 0,             // float
+    idaNormalEncoded_x,         // uint16_t
+    idaNormalEncoded_y,         // uint16_t
+    idaSpecularAmount,          // uint16_t
+    idaSpecularPower,           // uint16_t
+    idaAlbedo_x,                // uint8_t
+    idaAlbedo_y,                // uint8_t
+    idaAlbedo_z,                // uint8_t
+    idaLightPositionView_x,     // float
+    idaLightPositionView_y,     // float
+    idaLightPositionView_z,     // float
+    idaLightAttenuationBegin,   // float
+    idaLightColor_x,            // float
+    idaLightColor_y,            // float
+    idaLightColor_z,            // float
+    idaLightAttenuationEnd,     // float
+
+    // Number of arrays; not itself a valid array index.
+    idaNum
+};
+
+#ifndef ISPC
+
+#include <stdint.h>
+#include "kernels_ispc.h"
+
+#define ALIGNMENT_BYTES 64
+
+#define MAX_LIGHTS 1024
+
+#define VISUALIZE_LIGHT_COUNT 0
+
+// One loaded input data set, as filled in by CreateInputDataFromFile() in
+// common.cpp.
+struct InputData
+{
+    // Read directly from the start of the data file.
+    ispc::InputHeader header;
+    // Per-array pointers; each points into `chunk`.
+    ispc::InputDataArrays arrays;
+    // Aligned buffer holding the data for all arrays; released by
+    // DeleteInputData().
+    uint8_t *chunk;
+};
+
+
+// Render target that stores the red, green and blue channels as three
+// separate planes of one byte per pixel.
+//
+// The constructor allocates width*height bytes for each plane and the
+// destructor frees them (see common.cpp).  Because the object owns raw
+// allocations, copying is disabled.
+struct Framebuffer {
+    Framebuffer(int width, int height);
+    ~Framebuffer();
+
+    // Sets all three planes to zero.
+    void clear();
+
+    uint8_t *r, *g, *b;
+
+private:
+    int nPixels;
+
+    // Declared private and left undefined to forbid copying.  The assignment
+    // operator must take a reference: a version taking a pointer does not
+    // stop the compiler from generating the ordinary copy assignment, which
+    // would let two objects free the same planes.
+    Framebuffer(const Framebuffer &);
+    Framebuffer &operator=(const Framebuffer &);
+};
+
+
+// Defined in common.cpp.
+
+// Reads the header and data chunk from the file at `path`; returns NULL on
+// failure.
+InputData *CreateInputDataFromFile(const char *path);
+// Frees input->chunk.  The InputData object itself is not deleted.
+void DeleteInputData(InputData *input);
+// Writes `framebuffer` to `filename` as a binary PPM image sized according to
+// input->header.
+void WriteFrame(const char *filename, const InputData *input,
+                const Framebuffer &framebuffer);
+
+// NOTE(review): the definitions of the following four functions are not
+// visible from this header; presumably they live in dynamic_c.cpp and
+// dynamic_cilk.cpp -- confirm.
+void InitDynamicC(InputData *input);
+void InitDynamicCilk(InputData *input);
+void DispatchDynamicC(InputData *input, Framebuffer *framebuffer);
+void DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer);
+
+#endif // !ISPC
+
+#endif // DEFERRED_H
--- a/examples/deferred/deferred_shading.vcxproj
+++ b/examples/deferred/deferred_shading.vcxproj
@@ -0,0 +1,178 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{87f53c53-957e-4e91-878a-bc27828fb9eb}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>mandelbrot</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="common.cpp" />
+    <ClCompile Include="dynamic_c.cpp" />
+    <ClCompile Include="dynamic_cilk.cpp" />
+    <ClCompile Include="main.cpp" />
+    <ClCompile Include="../tasksys.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="kernels.ispc">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/examples/deferred/dynamic_c.cpp
+++ b/examples/deferred/dynamic_c.cpp
@@ -0,0 +1,870 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#include "deferred.h"
+#include "kernels_ispc.h"
+#include <algorithm>
+#include <stdint.h>
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+#ifdef _MSC_VER
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#ifdef ISPC_IS_LINUX
+#include <malloc.h>
+#endif // ISPC_IS_LINUX
+
+// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4-x2)!
+#define MIN_TILE_WIDTH 16
+#define MIN_TILE_HEIGHT 16
+
+
+#define DYNAMIC_TREE_LEVELS 5
+// If this is set to 1 then the result will be identical to the static version
+#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
+
+// Allocate `size` bytes at an address that is a multiple of `alignment`
+// (which must be a power of two).  Returns a null pointer if the underlying
+// allocator fails.  The result must be released with lAlignedFree().
+static void *
+lAlignedMalloc(size_t size, int32_t alignment) {
+#if defined(ISPC_IS_WINDOWS)
+    return _aligned_malloc(size, alignment);
+#elif defined(ISPC_IS_LINUX)
+    return memalign(alignment, size);
+#else
+    // OS X and any other platform: over-allocate, round the address up to
+    // the requested alignment, and stash the pointer malloc() returned in
+    // the slot just below the aligned block so lAlignedFree() can find it.
+    void *mem = malloc(size + (alignment - 1) + sizeof(void *));
+    if (!mem)
+        return 0;
+    uintptr_t addr = reinterpret_cast<uintptr_t>(mem) + sizeof(void *);
+    // Round up by at most (alignment - 1) bytes.  An address that is already
+    // aligned is left where it is; stepping it forward a full `alignment`
+    // bytes would leave only size - 1 usable bytes in the allocation.
+    addr = (addr + (alignment - 1)) & ~static_cast<uintptr_t>(alignment - 1);
+    void *amem = reinterpret_cast<void *>(addr);
+    ((void **)amem)[-1] = mem;
+    return amem;
+#endif
+}
+
+
+// Release memory obtained from lAlignedMalloc().  A null pointer is ignored.
+static void
+lAlignedFree(void *ptr) {
+#if defined(ISPC_IS_WINDOWS)
+    _aligned_free(ptr);
+#elif defined(ISPC_IS_LINUX)
+    free(ptr);
+#else
+    // Mirrors the fallback in lAlignedMalloc(): the pointer that malloc()
+    // returned is stored immediately before the aligned block.
+    if (ptr)
+        free(((void **)ptr)[-1]);
+#endif
+}
+
+
+// Scans the pixels [tileStartX, tileEndX) x [tileStartY, tileEndY) of the
+// depth buffer and reports the smallest and largest view-space Z found.
+// Samples whose view-space Z falls outside [cameraNear, cameraFar) are
+// ignored; if no sample qualifies, *minZ is cameraFar and *maxZ is
+// cameraNear.
+static void
+ComputeZBounds(int tileStartX, int tileEndX,
+               int tileStartY, int tileEndY,
+               // G-buffer data
+               float zBuffer[],
+               int gBufferWidth,
+               // Camera data
+               float cameraProj_33, float cameraProj_43,
+               float cameraNear, float cameraFar,
+               // Output
+               float *minZ, float *maxZ)
+{
+    // Start from an "inverted" range so the first valid sample replaces both ends
+    float nearest = cameraFar;
+    float farthest = cameraNear;
+
+    for (int row = tileStartY; row < tileEndY; ++row) {
+        for (int col = tileStartX; col < tileEndX; ++col) {
+            // Convert the stored depth value back to view space
+            const float depth = zBuffer[(row * gBufferWidth + col)];
+            const float viewZ = cameraProj_43 / (depth - cameraProj_33);
+
+            // Skybox/background or otherwise invalid pixels don't contribute
+            const bool valid = (viewZ < cameraFar) && (viewZ >= cameraNear);
+            if (!valid)
+                continue;
+
+            nearest = std::min(nearest, viewZ);
+            farthest = std::max(farthest, viewZ);
+        }
+    }
+
+    *minZ = nearest;
+    *maxZ = farthest;
+}
+
+
+// Runs ComputeZBounds() on each of the numTilesX tiles in tile row `tileY`
+// and stores the per-tile results in minZArray[tileX] / maxZArray[tileX].
+// numTilesY is accepted but not used here.
+static void
+ComputeZBoundsRow(int tileY, int tileWidth, int tileHeight,
+                  int numTilesX, int numTilesY,
+                  // G-buffer data
+                  float zBuffer[],
+                  int gBufferWidth,
+                  // Camera data
+                  float cameraProj_33, float cameraProj_43,
+                  float cameraNear, float cameraFar,
+                  // Output
+                  float minZArray[],
+                  float maxZArray[])
+{
+    // Every tile in the row covers the same span of pixel rows
+    const int rowStartY = tileY * tileHeight;
+    const int rowEndY = rowStartY + tileHeight;
+
+    for (int tileX = 0; tileX < numTilesX; ++tileX) {
+        const int colStartX = tileX * tileWidth;
+        ComputeZBounds(colStartX, colStartX + tileWidth,
+                       rowStartY, rowEndY,
+                       zBuffer, gBufferWidth, cameraProj_33, cameraProj_43,
+                       cameraNear, cameraFar,
+                       &minZArray[tileX], &maxZArray[tileX]);
+    }
+}
+
+
+// Hierarchy of per-tile view-space depth bounds.  Level 0 stores one min/max
+// Z pair per base tile (tileWidth x tileHeight pixels); each higher level
+// merges up to 2x2 entries of the level below, so TileWidth()/TileHeight()
+// double with every level.
+//
+// The object owns raw arrays obtained from lAlignedMalloc() and frees them
+// in its destructor, so copying is disabled: a copy would free the same
+// arrays twice.
+class MinMaxZTree
+{
+public:
+    // Currently (min) tile dimensions must divide gBuffer dimensions evenly
+    // Levels must be small enough that neither dimension goes below one tile
+    MinMaxZTree(
+        int tileWidth, int tileHeight, int levels,
+        int gBufferWidth, int gBufferHeight)
+        : mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
+    {
+        mNumTilesX = gBufferWidth / mTileWidth;
+        mNumTilesY = gBufferHeight / mTileHeight;
+        
+        // Allocate arrays
+        mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
+        mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
+        for (int i = 0; i < mLevels; ++i) {
+            int x = NumTilesX(i);
+            int y = NumTilesY(i);
+            assert(x > 0);
+            assert(y > 0);
+            // NOTE: If the following two asserts fire it probably means that
+            // the base tile dimensions do not evenly divide the G-buffer dimensions
+            assert(x * (mTileWidth << i) >= gBufferWidth);
+            assert(y * (mTileHeight << i) >= gBufferHeight);
+            mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
+            mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
+        }
+    }
+
+    // Recompute every level from the given depth buffer.  Level 0 is filled
+    // one tile row at a time by ComputeZBoundsRow(); each further level is
+    // then reduced from the level directly below it.
+    void Update(float *zBuffer, int gBufferPitchInElements,
+        float cameraProj_33, float cameraProj_43,
+        float cameraNear, float cameraFar)
+    {
+        for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
+            ComputeZBoundsRow(tileY, mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
+                              zBuffer, gBufferPitchInElements,
+                              cameraProj_33, cameraProj_43, cameraNear, cameraFar,
+                              mMinZArrays[0] + (tileY * mNumTilesX),
+                              mMaxZArrays[0] + (tileY * mNumTilesX));
+        }
+
+        // Generate other levels
+        for (int level = 1; level < mLevels; ++level) {
+            int destTilesX = NumTilesX(level);
+            int destTilesY = NumTilesY(level);
+            int srcLevel = level - 1;
+            int srcTilesX = NumTilesX(srcLevel);
+            int srcTilesY = NumTilesY(srcLevel);
+            const float *srcMin = mMinZArrays[srcLevel];
+            const float *srcMax = mMaxZArrays[srcLevel];
+            for (int y = 0; y < destTilesY; ++y) {
+                for (int x = 0; x < destTilesX; ++x) {
+                    int srcX = x << 1;
+                    int srcY = y << 1;
+                    // Tile counts round up, so the last destination tile in
+                    // a row/column may only have one source tile along that
+                    // axis; the missing neighbours are simply left out.
+                    // TODO: SSE branchless min/max is probably better...
+                    bool hasRight = (srcX + 1 < srcTilesX);
+                    bool hasBelow = (srcY + 1 < srcTilesY);
+                    int topLeft = srcY * srcTilesX + srcX;
+
+                    float minZ = srcMin[topLeft];
+                    float maxZ = srcMax[topLeft];
+                    if (hasRight) {
+                        minZ = std::min(minZ, srcMin[topLeft + 1]);
+                        maxZ = std::max(maxZ, srcMax[topLeft + 1]);
+                    }
+                    if (hasRight && hasBelow) {
+                        minZ = std::min(minZ, srcMin[topLeft + srcTilesX + 1]);
+                        maxZ = std::max(maxZ, srcMax[topLeft + srcTilesX + 1]);
+                    }
+                    if (hasBelow) {
+                        minZ = std::min(minZ, srcMin[topLeft + srcTilesX]);
+                        maxZ = std::max(maxZ, srcMax[topLeft + srcTilesX]);
+                    }
+                    mMinZArrays[level][y * destTilesX + x] = minZ;
+                    mMaxZArrays[level][y * destTilesX + x] = maxZ;
+                }
+            }
+        }
+    }
+
+    ~MinMaxZTree() {
+        for (int i = 0; i < mLevels; ++i) {
+            lAlignedFree(mMinZArrays[i]);
+            lAlignedFree(mMaxZArrays[i]);
+        }
+        lAlignedFree(mMinZArrays);
+        lAlignedFree(mMaxZArrays); 
+    }
+
+    int Levels() const { return mLevels; }
+
+    // These round UP, so beware that the last tile for a given level may not be completely full
+    // TODO: Verify this...
+    int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
+    int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
+    int TileWidth(int level = 0) const { return (mTileWidth << level); }
+    int TileHeight(int level = 0) const { return (mTileHeight << level); }
+
+    // Bounds lookup; no range checking is done on level, tileX or tileY.
+    float MinZ(int level, int tileX, int tileY) const {
+        return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
+    }
+    float MaxZ(int level, int tileX, int tileY) const {
+        return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
+    }
+
+private:
+    // Declared but intentionally not defined: copying is disabled because
+    // the destructor frees the arrays below.
+    MinMaxZTree(const MinMaxZTree &);
+    MinMaxZTree &operator=(const MinMaxZTree &);
+
+    int mTileWidth;
+    int mTileHeight;
+    int mLevels;
+    int mNumTilesX;
+    int mNumTilesY;
+
+    // One array for each "level" in the tree
+    float **mMinZArrays;
+    float **mMaxZArrays;
+};
+
+// Depth-bounds tree used by the dynamic shading path.  Created by
+// InitDynamicC() and read by ShadeDynamicTileRecurse().
+static MinMaxZTree *gMinMaxZTree = 0;
+
+// (Re)creates the global tree for the framebuffer dimensions in `input`.
+void InitDynamicC(InputData *input) {
+    // Release the tree from any earlier call so repeated initialization
+    // doesn't leak it; deleting a null pointer is a no-op.
+    delete gMinMaxZTree;
+    gMinMaxZTree = 
+        new MinMaxZTree(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
+                        input->header.framebufferWidth, 
+                        input->header.framebufferHeight);
+}
+
+
+/* Redistributes a tile's light list over the four sub-tiles obtained by
+   splitting the tile at pixel (tileMidX, tileMidY).  Sub-tiles are numbered
+   00, 10, 01, 11 (indices 0..3).  For each sub-tile s, the surviving light
+   indices are written to subtileIndices[s * subtileIndicesPitch ...] and
+   their count to subtileNumLights[s]. */
+static void
+SplitTileMinMax(
+    int tileMidX, int tileMidY,
+    // Subtile data (00, 10, 01, 11)
+    float subtileMinZ[],
+    float subtileMaxZ[],
+    // G-buffer data
+    int gBufferWidth, int gBufferHeight,
+    // Camera data
+    float cameraProj_11, float cameraProj_22,
+    // Light Data
+    int lightIndices[],
+    int numLights,
+    float light_positionView_x_array[],
+    float light_positionView_y_array[],
+    float light_positionView_z_array[],
+    float light_attenuationEnd_array[],
+    // Outputs
+    int subtileIndices[],
+    int subtileIndicesPitch,
+    int subtileNumLights[]
+    )
+{
+    const float halfWidth = 0.5f * (float)gBufferWidth;
+    const float halfHeight = 0.5f * (float)gBufferHeight;
+
+    // Entry 0 is the splitting plane through tileMidX, entry 1 the one
+    // through tileMidY; both are normalized below.
+    float planeXY[2] = { -(cameraProj_11 * halfWidth),
+                          (cameraProj_22 * halfHeight) };
+    float planeZ[2] = { tileMidX - halfWidth,
+                        tileMidY - halfHeight };
+    for (int p = 0; p < 2; ++p) {
+        float invLength = 1.f / sqrtf(planeXY[p] * planeXY[p] +
+                                      planeZ[p] * planeZ[p]);
+        planeXY[p] *= invLength;
+        planeZ[p] *= invLength;
+    }
+
+    // Number of lights appended to each sub-tile's list so far
+    int counts[4] = { 0, 0, 0, 0 };
+
+    for (int n = 0; n < numLights; ++n) {
+        const int lightIndex = lightIndices[n];
+
+        const float lightX = light_positionView_x_array[lightIndex];
+        const float lightY = light_positionView_y_array[lightIndex];
+        const float lightZ = light_positionView_z_array[lightIndex];
+        const float radius = light_attenuationEnd_array[lightIndex];
+        const float negRadius = -radius;
+
+        // Signed distances from the light to the two splitting planes
+        const float dx = lightZ * planeZ[0] + lightX * planeXY[0];
+        const float dy = lightZ * planeZ[1] + lightY * planeXY[1];
+
+        // A light farther from a plane than its attenuation end can only
+        // touch the sub-tiles on its own side of that plane.
+        const bool oneSideX = fabsf(dx) > radius;
+        const bool oneSideY = fabsf(dy) > radius;
+        const bool positiveX = dx > 0.0f;
+        const bool positiveY = dy > 0.0f;
+
+        for (int s = 0; s < 4; ++s) {
+            // Test the light again against this sub-tile's z bounds
+            bool keep = (lightZ - subtileMinZ[s] >= negRadius) &&
+                (subtileMaxZ[s] - lightZ >= negRadius);
+
+            // Sub-tiles 0 and 2 (00, 01) lie on the dx > 0 side,
+            // sub-tiles 1 and 3 (10, 11) on the other.
+            if (oneSideX)
+                keep = keep && (((s & 1) == 0) == positiveX);
+            // Sub-tiles 0 and 1 (00, 10) lie on the dy > 0 side,
+            // sub-tiles 2 and 3 (01, 11) on the other.
+            if (oneSideY)
+                keep = keep && (((s & 2) == 0) == positiveY);
+
+            if (keep) {
+                subtileIndices[s * subtileIndicesPitch + counts[s]] = lightIndex;
+                ++counts[s];
+            }
+        }
+    }
+
+    for (int s = 0; s < 4; ++s)
+        subtileNumLights[s] = counts[s];
+}
+
+
+// Dot product of the 3-vectors (x, y, z) and (a, b, c).
+static inline float
+dot3(float x, float y, float z, float a, float b, float c) {
+    return (x*a + y*b + z*c);
+}
+
+
+// Writes (x, y, z) scaled to unit length into (ox, oy, oz).  The inputs are
+// taken by value, so the outputs may be the same variables as the inputs.
+// A zero-length input is not special-cased.
+static inline void
+normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
+    const float invLength = 1.f / sqrtf(x*x + y*y + z*z);
+    ox = x * invLength;
+    oy = y * invLength;
+    oz = z * invLength;
+}
+
+
+// Maps an 8-bit unsigned-normalized value (0..255) to a float by scaling
+// it with 1/255.
+static inline float
+Unorm8ToFloat32(uint8_t u) {
+    const float scale = 1.0f / 255.0f;
+    return static_cast<float>(u) * scale;
+}
+
+
+// Scales f by 255 and truncates the result to 8 bits.  No clamping is done
+// here, so the caller is expected to pass a value already in [0, 1].
+static inline uint8_t
+Float32ToUnorm8(float f) {
+    const float scaled = f * 255.0f;
+    return static_cast<uint8_t>(scaled);
+}
+
+
+// Fast 16-bit half -> 32-bit float conversion.  The sign and mantissa bits
+// are moved into place and the exponent is re-biased from 15 to 127; there
+// are no branches, so zero, denormal, infinity and NaN inputs receive no
+// special treatment (e.g. a half with all bits clear comes out as 2^-15,
+// not 0).
+static inline float
+half_to_float_fast(uint16_t h) {
+    uint32_t hs = h & 0x8000u;  // Pick off sign bit
+    uint32_t he = h & 0x7C00u;  // Pick off exponent bits
+    uint32_t hm = h & 0x03FFu;  // Pick off mantissa bits
+
+    // sign
+    uint32_t xs = hs << 16;
+    // Exponent: unbias the halfp, then bias the single
+    int32_t xes = ((int32_t) (he >> 10)) - 15 + 127;
+    // Exponent (xes is always in [112, 143], so the conversion is lossless)
+    uint32_t xe = ((uint32_t) xes) << 23;
+    // Mantissa
+    uint32_t xm = hm << 13;
+
+    uint32_t bits = (xs | xe | xm);
+    // Copy the bit pattern rather than reading it through a float*, which
+    // would break the strict-aliasing rule and may be miscompiled.
+    float result;
+    memcpy(&result, &bits, sizeof(result));
+    return result;
+}
+
+
+// Shades the pixels [tileStartX, tileEndX) x [tileStartY, tileEndY) and
+// writes 8-bit results to framebuffer_r/g/b at index y * gBufferWidth + x.
+//
+// If the tile has no lights, or visualizeLightCount is set, every pixel is
+// filled with the gray level min(tileNumLights * 4, 255).  Otherwise each
+// pixel's view-space position and normal are reconstructed from the
+// G-buffer arrays in inputData, the contribution of every light listed in
+// tileLightIndices[0 .. tileNumLights) is accumulated, and the sum is
+// clamped to [0, 1], gamma corrected and stored.
+static void
+ShadeTileC(
+    int32_t tileStartX, int32_t tileEndX,
+    int32_t tileStartY, int32_t tileEndY,
+    int32_t gBufferWidth, int32_t gBufferHeight,
+    const ispc::InputDataArrays &inputData,
+    // Camera data
+    float cameraProj_11, float cameraProj_22,
+    float cameraProj_33, float cameraProj_43,
+    // Light list
+    int32_t tileLightIndices[],
+    int32_t tileNumLights,
+    // UI
+    bool visualizeLightCount,
+    // Output
+    uint8_t framebuffer_r[],
+    uint8_t framebuffer_g[],
+    uint8_t framebuffer_b[]
+    )
+{
+    if (tileNumLights == 0 || visualizeLightCount) {
+        // Flat fill: gray level grows by 4 per light and saturates at 255
+        uint8_t c = (uint8_t)(std::min(tileNumLights << 2, 255));
+        for (int32_t y = tileStartY; y < tileEndY; ++y) {
+            for (int32_t x = tileStartX; x < tileEndX; ++x) {
+                int32_t framebufferIndex = (y * gBufferWidth + x);
+                framebuffer_r[framebufferIndex] = c;
+                framebuffer_g[framebufferIndex] = c;
+                framebuffer_b[framebufferIndex] = c;
+            }
+        }
+    } else {
+        float twoOverGBufferWidth = 2.0f / gBufferWidth;
+        float twoOverGBufferHeight = 2.0f / gBufferHeight;
+        
+        for (int32_t y = tileStartY; y < tileEndY; ++y) {
+            // Pixel center mapped to [-1, 1]; the sign is flipped for Y
+            float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
+
+            for (int32_t x = tileStartX; x < tileEndX; ++x) {
+                int32_t gBufferOffset = y * gBufferWidth + x;
+                
+                // Reconstruct position and (negative) view vector from G-buffer
+                float surface_positionView_x, surface_positionView_y, surface_positionView_z;
+                float Vneg_x, Vneg_y, Vneg_z;
+
+                float z = inputData.zBuffer[gBufferOffset];
+
+                // Compute screen/clip-space position
+                // NOTE: Mind DX11 viewport transform and pixel center!
+                float positionScreen_x = (0.5f + (float)(x)) * 
+                    twoOverGBufferWidth - 1.0f;
+
+                // Unproject depth buffer Z value into view space
+                surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
+                surface_positionView_x = positionScreen_x * surface_positionView_z / 
+                    cameraProj_11;
+                surface_positionView_y = positionScreen_y * surface_positionView_z / 
+                    cameraProj_22;
+                
+                // We actually end up with a vector pointing *at* the
+                // surface (i.e. the negative view vector)
+                normalize3(surface_positionView_x, surface_positionView_y, 
+                           surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
+
+                // Reconstruct normal from G-buffer
+                // (the normal is stored as two half-float components; the
+                // third is derived from them below)
+                float surface_normal_x, surface_normal_y, surface_normal_z;
+                float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
+                float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
+                    
+                float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
+                float m = sqrtf(4.0f * f - 1.0f);
+                    
+                surface_normal_x = m * (4.0f * normal_x - 2.0f);
+                surface_normal_y = m * (4.0f * normal_y - 2.0f);
+                surface_normal_z = 3.0f - 8.0f * f;
+
+                // Load other G-buffer parameters
+                float surface_specularAmount = 
+                    half_to_float_fast(inputData.specularAmount[gBufferOffset]);
+                float surface_specularPower  = 
+                    half_to_float_fast(inputData.specularPower[gBufferOffset]);
+                float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
+                float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
+                float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
+                
+                // Accumulated lighting for this pixel
+                float lit_x = 0.0f;
+                float lit_y = 0.0f;
+                float lit_z = 0.0f;
+                for (int32_t tileLightIndex = 0; tileLightIndex < tileNumLights; 
+                     ++tileLightIndex) {
+                    int32_t lightIndex = tileLightIndices[tileLightIndex];
+                                        
+                    // Gather light data relevant to initial culling
+                    float light_positionView_x = 
+                        inputData.lightPositionView_x[lightIndex];
+                    float light_positionView_y = 
+                        inputData.lightPositionView_y[lightIndex];
+                    float light_positionView_z = 
+                        inputData.lightPositionView_z[lightIndex];
+                    float light_attenuationEnd = 
+                        inputData.lightAttenuationEnd[lightIndex];
+                    
+                    // Compute light vector
+                    float L_x = light_positionView_x - surface_positionView_x;
+                    float L_y = light_positionView_y - surface_positionView_y;
+                    float L_z = light_positionView_z - surface_positionView_z;
+
+                    float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
+                    
+                    // Clip at end of attenuation
+                    // (compared in squared distance, so the sqrt is only
+                    // paid for lights that pass)
+                    float light_attenutaionEnd2 = light_attenuationEnd * light_attenuationEnd;
+
+                    if (distanceToLight2 < light_attenutaionEnd2) {                    
+                        float distanceToLight = sqrtf(distanceToLight2);
+
+                        float distanceToLightRcp = 1.f / distanceToLight;
+                        L_x *= distanceToLightRcp;
+                        L_y *= distanceToLightRcp;
+                        L_z *= distanceToLightRcp;
+
+                        // Start computing brdf
+                        float NdotL = dot3(surface_normal_x, surface_normal_y, 
+                                           surface_normal_z, L_x, L_y, L_z);
+                    
+                        // Clip back facing
+                        if (NdotL > 0.0f) {
+                            float light_attenuationBegin = 
+                                inputData.lightAttenuationBegin[lightIndex];
+
+                            // Light distance attenuation (linstep)
+                            float lightRange = (light_attenuationEnd - light_attenuationBegin);
+                            float falloffPosition = (light_attenuationEnd - distanceToLight);
+                            float attenuation = std::min(falloffPosition / lightRange, 1.0f);
+
+                            // Half vector between the light direction and
+                            // the view direction (-Vneg)
+                            float H_x = (L_x - Vneg_x);
+                            float H_y = (L_y - Vneg_y);
+                            float H_z = (L_z - Vneg_z);
+                            normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
+                    
+                            float NdotH = dot3(surface_normal_x, surface_normal_y, 
+                                               surface_normal_z, H_x, H_y, H_z);
+                            NdotH = std::max(NdotH, 0.0f);
+
+                            float specular = powf(NdotH, surface_specularPower);
+                            float specularNorm = (surface_specularPower + 2.0f) * 
+                                (1.0f / 8.0f);
+                            float specularContrib = surface_specularAmount * 
+                                specularNorm * specular;
+
+                            float k = attenuation * NdotL * (1.0f + specularContrib);
+                    
+                            float light_color_x = inputData.lightColor_x[lightIndex];
+                            float light_color_y = inputData.lightColor_y[lightIndex];
+                            float light_color_z = inputData.lightColor_z[lightIndex];
+
+                            float lightContrib_x = surface_albedo_x * light_color_x;
+                            float lightContrib_y = surface_albedo_y * light_color_y;
+                            float lightContrib_z = surface_albedo_z * light_color_z;
+
+                            lit_x += lightContrib_x * k;
+                            lit_y += lightContrib_y * k;
+                            lit_z += lightContrib_z * k;
+                        }
+                    }
+                }
+
+                // Gamma correct
+                // (values are clamped to [0, 1] first, which also keeps the
+                // 8-bit conversion below in range)
+                float gamma = 1.0 / 2.2f;
+                lit_x = powf(std::min(std::max(lit_x, 0.0f), 1.0f), gamma);
+                lit_y = powf(std::min(std::max(lit_y, 0.0f), 1.0f), gamma);
+                lit_z = powf(std::min(std::max(lit_z, 0.0f), 1.0f), gamma);
+                
+                framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
+                framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
+                framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
+            }
+        }
+    }
+}
+
+
+// Shades tile (tileX, tileY) of tree level `level` using the light list
+// lightIndices[0 .. numLights).  At level 0, or when the tile has fewer than
+// DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE lights, the tile is shaded directly with
+// ShadeTileC(); otherwise its lights are re-culled against the four
+// sub-tiles one level down and the function recurses into each of them.
+// Depth bounds come from the global gMinMaxZTree.
+void
+ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY, 
+                        int *lightIndices, int numLights, 
+                        Framebuffer *framebuffer) {
+    const MinMaxZTree *minMaxZTree = gMinMaxZTree;
+    
+    // If we have few enough lights or this is the base case (last level),
+    // shade this full tile directly
+    if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
+        int width = minMaxZTree->TileWidth(level);
+        int height = minMaxZTree->TileHeight(level);
+        int startX = tileX * width;
+        int startY = tileY * height;
+        // Clamp to the framebuffer so partially offscreen tiles are cropped
+        int endX = std::min(input->header.framebufferWidth, startX + width);
+        int endY = std::min(input->header.framebufferHeight, startY + height);
+        
+        // Skip entirely offscreen tiles
+        if (endX > startX && endY > startY) {
+            ShadeTileC(startX, endX, startY, endY,
+                       input->header.framebufferWidth, input->header.framebufferHeight,
+                       input->arrays,
+                       input->header.cameraProj[0][0], input->header.cameraProj[1][1], 
+                       input->header.cameraProj[2][2], input->header.cameraProj[3][2],
+                       lightIndices, numLights, VISUALIZE_LIGHT_COUNT, 
+                       framebuffer->r, framebuffer->g, framebuffer->b);
+        }
+    } 
+    else {
+        // Otherwise, subdivide and 4-way recurse using X and Y splitting planes
+        // Move down a level in the tree
+        // (from here on tileX/tileY name the 00 sub-tile at the finer level)
+        --level;
+        tileX <<= 1;
+        tileY <<= 1;
+        int width = minMaxZTree->TileWidth(level);
+        int height = minMaxZTree->TileHeight(level);
+
+        // Work out splitting coords
+        int midX = (tileX + 1) * width;
+        int midY = (tileY + 1) * height;
+
+        // Read subtile min/max data
+        // NOTE: We must be sure to handle out-of-bounds access here since
+        // sometimes we'll only have 1 or 2 subtiles for non-pow-2
+        // framebuffer sizes.
+        bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level));
+        bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level));
+
+        // NOTE: Order is 00, 10, 01, 11
+        // Set defaults up to cull all lights if the tile doesn't exist (offscreen)
+        float minZ[4] = {input->header.cameraFar, input->header.cameraFar, 
+                         input->header.cameraFar, input->header.cameraFar};
+        float maxZ[4] = {input->header.cameraNear, input->header.cameraNear, 
+                         input->header.cameraNear, input->header.cameraNear};
+
+        // The 00 sub-tile is read unconditionally
+        minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
+        maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY);
+        if (rightTileExists) {
+            minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY);
+            maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY);
+            if (bottomTileExists) {
+                minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1);
+                maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1);
+            }
+        }
+        if (bottomTileExists) {
+            minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1);
+            maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1);
+        }
+
+        // Cull lights into subtile lists
+        // (one MAX_LIGHTS-sized list per sub-tile, held on the stack of each
+        // recursion level; the alignment attribute is spelled per compiler)
+#ifdef ISPC_IS_WINDOWS
+        __declspec(align(ALIGNMENT_BYTES)) 
+#endif
+            int subtileLightIndices[4][MAX_LIGHTS]
+#ifndef ISPC_IS_WINDOWS
+            __attribute__ ((aligned(ALIGNMENT_BYTES)))
+#endif
+;
+        int subtileNumLights[4];
+        SplitTileMinMax(midX, midY, minZ, maxZ,
+            input->header.framebufferWidth, input->header.framebufferHeight, 
+            input->header.cameraProj[0][0], input->header.cameraProj[1][1],
+            lightIndices, numLights, input->arrays.lightPositionView_x, 
+            input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, 
+            input->arrays.lightAttenuationEnd,
+            subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
+        
+        // Recurse into subtiles
+        // (all four calls are made even when a right/bottom sub-tile does
+        // not exist)
+        ShadeDynamicTileRecurse(input, level, tileX    , tileY, 
+                                subtileLightIndices[0], subtileNumLights[0],
+                                framebuffer);
+        ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
+                                subtileLightIndices[1], subtileNumLights[1],
+                                framebuffer);
+        ShadeDynamicTileRecurse(input, level, tileX    , tileY + 1,
+                                subtileLightIndices[2], subtileNumLights[2],
+                                framebuffer);
+        ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1,
+                                subtileLightIndices[3], subtileNumLights[3],
+                                framebuffer);
+    }
+}
+
+
+// Scalar light cull for one screen tile.
+//
+// Builds four normalized side planes from the tile's pixel extent and the
+// two projection terms, then keeps every light whose signed distance to
+// the near/far depth bounds and to each side plane is no smaller than
+// minus its attenuation-end distance.
+//
+//   tileStartX/tileEndX, tileStartY/tileEndY  tile extent in pixels
+//   minZ, maxZ                                depth bounds of the tile
+//   gBufferWidth, gBufferHeight               G-buffer size in pixels
+//   cameraProj_11, cameraProj_22              projection terms for the X and Y planes
+//   numLights                                 number of array entries to test
+//   light_*_array                             per-light position and attenuation end
+//   tileLightIndices                          output: indices of surviving lights, in
+//                                             ascending order; the caller must provide
+//                                             room for up to numLights entries
+//
+// Returns the number of indices written to tileLightIndices.
+static int
+IntersectLightsWithTileMinMax(
+    int tileStartX, int tileEndX,
+    int tileStartY, int tileEndY,
+    // Tile data
+    float minZ,
+    float maxZ,
+    // G-buffer data
+    int gBufferWidth, int gBufferHeight,
+    // Camera data
+    float cameraProj_11, float cameraProj_22,
+    // Light Data
+    int numLights,
+    float light_positionView_x_array[],
+    float light_positionView_y_array[],
+    float light_positionView_z_array[],
+    float light_attenuationEnd_array[],
+    // Output
+    int tileLightIndices[]
+    )
+{
+    const float halfWidth = 0.5f * (float)gBufferWidth;
+    const float halfHeight = 0.5f * (float)gBufferHeight;
+
+    // Plane order: left/right (tested against light X), then the two
+    // planes tested against light Y.  The XY terms are constant over the
+    // whole screen; only the Z terms depend on the tile.
+    float planeXY[4] = { -(cameraProj_11 * halfWidth),
+                          (cameraProj_11 * halfWidth),
+                          (cameraProj_22 * halfHeight),
+                         -(cameraProj_22 * halfHeight) };
+    float planeZ[4] = {  tileEndX - halfWidth,
+                        -tileStartX + halfWidth,
+                         tileEndY - halfHeight,
+                        -tileStartY + halfHeight };
+
+    // Normalize each plane so the dot products below are true distances
+    for (int p = 0; p < 4; ++p) {
+        const float invLength = 1.f / sqrtf(planeXY[p] * planeXY[p] +
+                                            planeZ[p] * planeZ[p]);
+        planeXY[p] *= invLength;
+        planeZ[p] *= invLength;
+    }
+
+    int kept = 0;
+    for (int lightIndex = 0; lightIndex < numLights; ++lightIndex) {
+        const float lightZ = light_positionView_z_array[lightIndex];
+        const float negReach = -light_attenuationEnd_array[lightIndex];
+
+        // Depth slab test first: it needs only Z and rejects early
+        const bool pastNear = (lightZ - minZ >= negReach);
+        const bool beforeFar = (maxZ - lightZ >= negReach);
+        if (!(pastNear && beforeFar))
+            continue;
+
+        const float lightX = light_positionView_x_array[lightIndex];
+        const float lightY = light_positionView_y_array[lightIndex];
+
+        // Planes 0 and 1 use the X coordinate, planes 2 and 3 use Y
+        bool inside = true;
+        for (int p = 0; p < 4; ++p) {
+            const float lateral = (p < 2) ? lightX : lightY;
+            const float dist = lightZ * planeZ[p] + lateral * planeXY[p];
+            inside = inside && (dist >= negReach);
+        }
+
+        // Pack and store intersecting lights
+        if (inside)
+            tileLightIndices[kept++] = lightIndex;
+    }
+
+    return kept;
+}
+
+
+// Entry point for one root tile of the min/max Z tree: culls the light
+// list against the tile's screen extent and depth range, then hands the
+// surviving indices to the recursive subdivide-and-shade pass.
+//
+// NOTE(review): MAX_LIGHTS entries are tested rather than a per-scene
+// light count - confirm the light arrays always hold MAX_LIGHTS entries.
+void
+ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
+                 Framebuffer *framebuffer) {
+    const MinMaxZTree *tree = gMinMaxZTree;
+
+    // Tile size at this level and the tile's depth bounds
+    const int tileW = tree->TileWidth(level);
+    const int tileH = tree->TileHeight(level);
+    const float tileMinZ = tree->MinZ(level, tileX, tileY);
+    const float tileMaxZ = tree->MaxZ(level, tileX, tileY);
+
+    // Pixel extent, clamped so edge tiles do not run off the framebuffer
+    const int x0 = tileX * tileW;
+    const int y0 = tileY * tileH;
+    const int x1 = std::min(input->header.framebufferWidth, x0 + tileW);
+    const int y1 = std::min(input->header.framebufferHeight, y0 + tileH);
+
+    // This is a root tile, so first do a full 6-plane cull
+#ifdef ISPC_IS_WINDOWS
+    __declspec(align(ALIGNMENT_BYTES)) 
+#endif
+        int lightIndices[MAX_LIGHTS]
+#ifndef ISPC_IS_WINDOWS
+        __attribute__ ((aligned(ALIGNMENT_BYTES)))
+#endif
+;
+    const int survivors = IntersectLightsWithTileMinMax(
+        x0, x1, y0, y1, tileMinZ, tileMaxZ,
+        input->header.framebufferWidth, input->header.framebufferHeight,
+        input->header.cameraProj[0][0], input->header.cameraProj[1][1],
+        MAX_LIGHTS,
+        input->arrays.lightPositionView_x,
+        input->arrays.lightPositionView_y,
+        input->arrays.lightPositionView_z,
+        input->arrays.lightAttenuationEnd,
+        lightIndices);
+
+    // Now kick off the recursive process for this tile
+    ShadeDynamicTileRecurse(input, level, tileX, tileY,
+                            lightIndices, survivors, framebuffer);
+}
+
+
+// Serial driver: refreshes the min/max Z tree from the current depth
+// buffer, then shades every tile of the tree's coarsest level in
+// row-major order.
+void
+DispatchDynamicC(InputData *input, Framebuffer *framebuffer)
+{
+    MinMaxZTree *tree = gMinMaxZTree;
+
+    // Update min/max Z tree
+    tree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
+                 input->header.cameraProj[2][2], input->header.cameraProj[3][2],
+                 input->header.cameraNear, input->header.cameraFar);
+
+    // The coarsest level holds the root tiles
+    const int topLevel = tree->Levels() - 1;
+    const int tilesAcross = tree->NumTilesX(topLevel);
+    const int tilesDown = tree->NumTilesY(topLevel);
+
+    for (int row = 0; row < tilesDown; ++row) {
+        for (int col = 0; col < tilesAcross; ++col)
+            ShadeDynamicTile(input, topLevel, col, row, framebuffer);
+    }
+}
--- a/examples/deferred/dynamic_cilk.cpp
+++ b/examples/deferred/dynamic_cilk.cpp
@@ -0,0 +1,398 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef __cilk
+
+#include "deferred.h"
+#include "kernels_ispc.h"
+#include <algorithm>
+#include <assert.h>
+
+#ifdef _MSC_VER
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#ifdef ISPC_IS_LINUX
+#include <malloc.h>
+#endif // ISPC_IS_LINUX
+
+// Currently tile widths must be a multiple of SIMD width (i.e. 8 for ispc sse4x2)!
+#define MIN_TILE_WIDTH 16
+#define MIN_TILE_HEIGHT 16
+
+
+#define DYNAMIC_TREE_LEVELS 5
+// If this is set to 1 then the result will be identical to the static version
+#define DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE 1
+
+// Allocates `size` bytes at an address that is a multiple of `alignment`.
+// Memory obtained here must be released with lAlignedFree().
+//
+// Windows and Linux delegate to _aligned_malloc() / memalign().  The Apple
+// path over-allocates with malloc(), rounds the address up and stashes the
+// raw malloc() pointer in the slot just below the returned address so that
+// lAlignedFree() can recover it.  That path masks with (alignment - 1), so
+// `alignment` must be a power of two; it returns 0 if malloc() fails.
+static void *
+lAlignedMalloc(size_t size, int32_t alignment) {
+#ifdef ISPC_IS_WINDOWS
+    return _aligned_malloc(size, alignment);
+#endif
+#ifdef ISPC_IS_LINUX
+    return memalign(alignment, size);
+#endif
+#ifdef ISPC_IS_APPLE
+    // Room for the payload, up to (alignment-1) bytes of padding and the
+    // stashed raw pointer.
+    void *mem = malloc(size + (alignment-1) + sizeof(void*));
+    if (mem == 0)
+        return 0;
+    char *amem = ((char*)mem) + sizeof(void*);
+    // Round up to the next multiple of alignment.  The trailing mask makes
+    // an already-aligned address advance by 0 rather than by a full
+    // `alignment`, which would push the payload one byte past the end of
+    // the allocation.
+    amem = amem + uint32_t((alignment - (reinterpret_cast<uint64_t>(amem) &
+                                         (alignment - 1))) &
+                           (alignment - 1));
+    ((void**)amem)[-1] = mem;
+    return amem;
+#endif
+}
+
+
+// Releases memory obtained from lAlignedMalloc().
+//
+// On the Apple path the pointer handed to the caller is not the one that
+// malloc() returned; the raw pointer is read back from the slot just below
+// `ptr`.  A null `ptr` is ignored there, matching free(), instead of
+// reading from an invalid address.
+static void
+lAlignedFree(void *ptr) {
+#ifdef ISPC_IS_WINDOWS
+    _aligned_free(ptr);
+#endif
+#ifdef ISPC_IS_LINUX
+    free(ptr);
+#endif
+#ifdef ISPC_IS_APPLE
+    if (ptr != 0)
+        free(((void**)ptr)[-1]);
+#endif
+}
+
+
+// Mip-style hierarchy of per-tile depth bounds.
+//
+// Level 0 stores one min and one max Z per base tile; every higher level
+// stores the bounds of up to 2x2 tiles of the level below.  Update()
+// refills the whole hierarchy from a depth buffer, using Cilk to spread
+// the rows across workers.
+//
+// The object owns the buffers it allocates and frees them in its
+// destructor, so it is deliberately not copyable.
+class MinMaxZTreeCilk
+{
+public:
+    // Currently (min) tile dimensions must divide gBuffer dimensions evenly
+    // Levels must be small enough that neither dimension goes below one tile
+    //
+    //   tileWidth, tileHeight        size of a level-0 tile in pixels
+    //   levels                       number of levels in the hierarchy
+    //   gBufferWidth, gBufferHeight  size of the depth buffer in pixels
+    MinMaxZTreeCilk(
+        int tileWidth, int tileHeight, int levels,
+        int gBufferWidth, int gBufferHeight)
+        : mTileWidth(tileWidth), mTileHeight(tileHeight), mLevels(levels)
+    {
+        mNumTilesX = gBufferWidth / mTileWidth;
+        mNumTilesY = gBufferHeight / mTileHeight;
+        
+        // Allocate arrays
+        mMinZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
+        mMaxZArrays = (float **)lAlignedMalloc(sizeof(float *) * mLevels, 16);
+        for (int i = 0; i < mLevels; ++i) {
+            int x = NumTilesX(i);
+            int y = NumTilesY(i);
+            assert(x > 0);
+            assert(y > 0);
+            // NOTE: If the following two asserts fire it probably means that
+            // the base tile dimensions do not evenly divide the G-buffer dimensions
+            assert(x * (mTileWidth << i) >= gBufferWidth);
+            assert(y * (mTileHeight << i) >= gBufferHeight);
+            mMinZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
+            mMaxZArrays[i] = (float *)lAlignedMalloc(sizeof(float) * x * y, 16);
+        }
+    }
+
+    // Recomputes every level from `zBuffer`.
+    //
+    //   zBuffer                 depth buffer to read
+    //   gBufferPitchInElements  row pitch of zBuffer, in elements
+    //   cameraProj_33/_43,
+    //   cameraNear, cameraFar   forwarded unchanged to ispc::ComputeZBoundsRow
+    void Update(float *zBuffer, int gBufferPitchInElements,
+        float cameraProj_33, float cameraProj_43,
+        float cameraNear, float cameraFar)
+    {
+        // Compute level 0 in parallel.  The outer loop over tile rows lives
+        // here, with one ispc call per row, so that Cilk can distribute rows.
+        _Cilk_for (int tileY = 0; tileY < mNumTilesY; ++tileY) {
+            ispc::ComputeZBoundsRow(tileY,
+                mTileWidth, mTileHeight, mNumTilesX, mNumTilesY,
+                zBuffer, gBufferPitchInElements,
+                cameraProj_33, cameraProj_43, cameraNear, cameraFar,
+                mMinZArrays[0] + (tileY * mNumTilesX),
+                mMaxZArrays[0] + (tileY * mNumTilesX));
+        }
+
+        // Generate other levels
+        // NOTE: We currently don't use ispc here since it's sort of an
+        // awkward gather-based reduction.  Using SSE odd pack/unpack
+        // instructions might actually work here when we need to optimize
+        for (int level = 1; level < mLevels; ++level) {
+            int destTilesX = NumTilesX(level);
+            int destTilesY = NumTilesY(level);
+            int srcLevel = level - 1;
+            int srcTilesX = NumTilesX(srcLevel);
+            int srcTilesY = NumTilesY(srcLevel);
+            // Each destination tile depends only on the level below, so the
+            // rows of one level are independent of each other.
+            _Cilk_for (int y = 0; y < destTilesY; ++y) {
+                for (int x = 0; x < destTilesX; ++x) {
+                    // Top-left source tile of the (up to) 2x2 group
+                    int srcX = x << 1;
+                    int srcY = y << 1;
+                    // NOTE: Ugly branches to deal with non-multiple dimensions at some levels
+                    // TODO: SSE branchless min/max is probably better...
+                    float minZ = mMinZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
+                    float maxZ = mMaxZArrays[srcLevel][(srcY) * srcTilesX + (srcX)];
+                    if (srcX + 1 < srcTilesX) {
+                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY) * srcTilesX + 
+                                                                    (srcX + 1)]);
+                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY) * srcTilesX +
+                                                                    (srcX + 1)]);
+                        if (srcY + 1 < srcTilesY) {
+                            minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                        (srcX + 1)]);
+                            maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                        (srcX + 1)]);
+                        }
+                    }
+                    if (srcY + 1 < srcTilesY) {
+                        minZ = std::min(minZ, mMinZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                    (srcX    )]);
+                        maxZ = std::max(maxZ, mMaxZArrays[srcLevel][(srcY + 1) * srcTilesX +
+                                                                    (srcX    )]);
+                    }
+                    mMinZArrays[level][y * destTilesX + x] = minZ;
+                    mMaxZArrays[level][y * destTilesX + x] = maxZ;
+                }
+            }
+        }
+    }
+
+    // Frees the per-level arrays and then the two pointer tables.
+    ~MinMaxZTreeCilk() {
+        for (int i = 0; i < mLevels; ++i) {
+            lAlignedFree(mMinZArrays[i]);
+            lAlignedFree(mMaxZArrays[i]);
+        }
+        lAlignedFree(mMinZArrays);
+        lAlignedFree(mMaxZArrays); 
+    }
+
+    // Number of levels in the hierarchy (level 0 is the finest).
+    int Levels() const { return mLevels; }
+
+    // These round UP, so beware that the last tile for a given level may not be completely full
+    // TODO: Verify this...
+    int NumTilesX(int level = 0) const { return (mNumTilesX + (1 << level) - 1) >> level; }
+    int NumTilesY(int level = 0) const { return (mNumTilesY + (1 << level) - 1) >> level; }
+    // Tile size in pixels; doubles with every level.
+    int TileWidth(int level = 0) const { return (mTileWidth << level); }
+    int TileHeight(int level = 0) const { return (mTileHeight << level); }
+
+    // Depth bounds of tile (tileX, tileY) at `level`.  No bounds checking:
+    // the caller must pass coordinates below NumTilesX/NumTilesY(level).
+    float MinZ(int level, int tileX, int tileY) const {
+        return mMinZArrays[level][tileY * NumTilesX(level) + tileX];
+    }
+    float MaxZ(int level, int tileX, int tileY) const {
+        return mMaxZArrays[level][tileY * NumTilesX(level) + tileX];
+    }
+
+private:
+    // Not copyable: a member-wise copy would share the raw buffers and both
+    // destructors would free them.  Declared but intentionally not defined.
+    MinMaxZTreeCilk(const MinMaxZTreeCilk &);
+    MinMaxZTreeCilk &operator=(const MinMaxZTreeCilk &);
+
+    int mTileWidth;    // level-0 tile width in pixels
+    int mTileHeight;   // level-0 tile height in pixels
+    int mLevels;       // number of levels
+    int mNumTilesX;    // level-0 tile count in X
+    int mNumTilesY;    // level-0 tile count in Y
+
+    // One array for each "level" in the tree
+    float **mMinZArrays;
+    float **mMaxZArrays;
+};
+
+static MinMaxZTreeCilk *gMinMaxZTreeCilk = 0;
+
+// Creates the global min/max Z tree sized for the framebuffer described by
+// `input`.  DispatchDynamicCilk() dereferences that tree, so this must be
+// called first.  Calling it again replaces the previous tree and frees it
+// rather than leaking it.
+void InitDynamicCilk(InputData *input) {
+    // Build the replacement before releasing the old tree so the global
+    // never refers to freed memory if construction fails.
+    MinMaxZTreeCilk *fresh = 
+        new MinMaxZTreeCilk(MIN_TILE_WIDTH, MIN_TILE_HEIGHT, DYNAMIC_TREE_LEVELS,
+                            input->header.framebufferWidth, 
+                            input->header.framebufferHeight);
+    delete gMinMaxZTreeCilk;   // deleting a null pointer is a no-op
+    gMinMaxZTreeCilk = fresh;
+}
+
+
+// Recursive subdivide-and-shade step for one tile of the min/max Z tree.
+//
+// Either shades tile (tileX, tileY) of `level` directly with the given
+// light list, or splits it into up to four child tiles one level down,
+// builds a light list per child and recurses, spawning three of the four
+// children as Cilk tasks.
+//
+//   input         scene header and data arrays
+//   level         tree level of the tile (0 is the finest)
+//   tileX, tileY  tile coordinates at `level`
+//   lightIndices  indices of the lights that may affect this tile
+//   numLights     number of valid entries in lightIndices
+//   framebuffer   destination color planes (r, g, b)
+static void
+ShadeDynamicTileRecurse(InputData *input, int level, int tileX, int tileY, 
+                        int *lightIndices, int numLights, 
+                        Framebuffer *framebuffer) {
+    const MinMaxZTreeCilk *minMaxZTree = gMinMaxZTreeCilk;
+    
+    // If we have few enough lights or this is the base case (last level),
+    // shade this full tile directly
+    if (level == 0 || numLights < DYNAMIC_MIN_LIGHTS_TO_SUBDIVIDE) {
+        int width = minMaxZTree->TileWidth(level);
+        int height = minMaxZTree->TileHeight(level);
+        int startX = tileX * width;
+        int startY = tileY * height;
+        // Clamp to the framebuffer; tile counts round up, so edge tiles
+        // may be partially or entirely outside it
+        int endX = std::min(input->header.framebufferWidth, startX + width);
+        int endY = std::min(input->header.framebufferHeight, startY + height);
+        
+        // Skip entirely offscreen tiles
+        if (endX > startX && endY > startY) {
+            ispc::ShadeTile(
+                startX, endX, startY, endY,
+                input->header.framebufferWidth, input->header.framebufferHeight,
+                &input->arrays,
+                input->header.cameraProj[0][0], input->header.cameraProj[1][1], 
+                input->header.cameraProj[2][2], input->header.cameraProj[3][2],
+                lightIndices, numLights, VISUALIZE_LIGHT_COUNT, 
+                framebuffer->r, framebuffer->g, framebuffer->b);
+        }
+    } 
+    else {
+        // Otherwise, subdivide and 4-way recurse using X and Y splitting planes
+        // Move down a level in the tree; from here on level/tileX/tileY
+        // refer to the top-left child tile
+        --level;
+        tileX <<= 1;
+        tileY <<= 1;
+        int width = minMaxZTree->TileWidth(level);
+        int height = minMaxZTree->TileHeight(level);
+
+        // Work out splitting coords (pixel position of the boundary
+        // between the child tiles)
+        int midX = (tileX + 1) * width;
+        int midY = (tileY + 1) * height;
+
+        // Read subtile min/max data
+        // NOTE: We must be sure to handle out-of-bounds access here since
+        // sometimes we'll only have 1 or 2 subtiles for non-pow-2
+        // framebuffer sizes.
+        bool rightTileExists = (tileX + 1 < minMaxZTree->NumTilesX(level));
+        bool bottomTileExists = (tileY + 1 < minMaxZTree->NumTilesY(level));
+
+        // NOTE: Order is 00, 10, 01, 11
+        // Set defaults up to cull all lights if the tile doesn't exist (offscreen)
+        float minZ[4] = {input->header.cameraFar, input->header.cameraFar, 
+                         input->header.cameraFar, input->header.cameraFar};
+        float maxZ[4] = {input->header.cameraNear, input->header.cameraNear, 
+                         input->header.cameraNear, input->header.cameraNear};
+
+        // The top-left child is read unconditionally
+        minZ[0] = minMaxZTree->MinZ(level, tileX, tileY);
+        maxZ[0] = minMaxZTree->MaxZ(level, tileX, tileY);
+        if (rightTileExists) {
+            minZ[1] = minMaxZTree->MinZ(level, tileX + 1, tileY);
+            maxZ[1] = minMaxZTree->MaxZ(level, tileX + 1, tileY);
+            if (bottomTileExists) {
+                minZ[3] = minMaxZTree->MinZ(level, tileX + 1, tileY + 1);
+                maxZ[3] = minMaxZTree->MaxZ(level, tileX + 1, tileY + 1);
+            }
+        }
+        if (bottomTileExists) {
+            minZ[2] = minMaxZTree->MinZ(level, tileX, tileY + 1);
+            maxZ[2] = minMaxZTree->MaxZ(level, tileX, tileY + 1);
+        }
+
+        // Cull lights into subtile lists
+        // (the aligned-array declaration is spelled per compiler: MSVC
+        // takes the attribute before the declaration, others after it)
+#ifdef ISPC_IS_WINDOWS
+        __declspec(align(ALIGNMENT_BYTES)) 
+#endif
+            int subtileLightIndices[4][MAX_LIGHTS]
+#ifndef ISPC_IS_WINDOWS
+            __attribute__ ((aligned(ALIGNMENT_BYTES)))
+#endif
+;
+        int subtileNumLights[4];
+        // Receives the first row of subtileLightIndices plus MAX_LIGHTS,
+        // presumably the stride between the four per-subtile lists -
+        // confirm against the SplitTileMinMax declaration
+        ispc::SplitTileMinMax(midX, midY, minZ, maxZ,
+            input->header.framebufferWidth, input->header.framebufferHeight, 
+            input->header.cameraProj[0][0], input->header.cameraProj[1][1],
+            lightIndices, numLights, input->arrays.lightPositionView_x, 
+            input->arrays.lightPositionView_y, input->arrays.lightPositionView_z, 
+            input->arrays.lightAttenuationEnd,
+            subtileLightIndices[0], MAX_LIGHTS, subtileNumLights);
+        
+        // Recurse into subtiles.  Three children are spawned and the fourth
+        // runs on this strand.  The spawned children read the stack arrays
+        // declared above; this relies on Cilk's implicit sync before a
+        // spawning function returns to keep them alive.
+        _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX    , tileY, 
+                                            subtileLightIndices[0], subtileNumLights[0],
+                                            framebuffer);
+        _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX + 1, tileY,
+                                            subtileLightIndices[1], subtileNumLights[1],
+                                            framebuffer);
+        _Cilk_spawn ShadeDynamicTileRecurse(input, level, tileX    , tileY + 1,
+                                            subtileLightIndices[2], subtileNumLights[2],
+                                            framebuffer);
+        ShadeDynamicTileRecurse(input, level, tileX + 1, tileY + 1,
+                                subtileLightIndices[3], subtileNumLights[3],
+                                framebuffer);
+    }
+}
+
+
+// Entry point for one root tile of the Cilk min/max Z tree: culls the
+// light list against the tile with the ispc kernel, then starts the
+// recursive subdivide-and-shade pass with the surviving indices.
+//
+// NOTE(review): MAX_LIGHTS entries are tested rather than a per-scene
+// light count - confirm the light arrays always hold MAX_LIGHTS entries.
+static void
+ShadeDynamicTile(InputData *input, int level, int tileX, int tileY,
+                 Framebuffer *framebuffer) {
+    const MinMaxZTreeCilk *tree = gMinMaxZTreeCilk;
+
+    // Tile size at this level and the tile's depth bounds
+    const int tileW = tree->TileWidth(level);
+    const int tileH = tree->TileHeight(level);
+    const float tileMinZ = tree->MinZ(level, tileX, tileY);
+    const float tileMaxZ = tree->MaxZ(level, tileX, tileY);
+
+    // Pixel extent, clamped so edge tiles do not run off the framebuffer
+    const int x0 = tileX * tileW;
+    const int y0 = tileY * tileH;
+    const int x1 = std::min(input->header.framebufferWidth, x0 + tileW);
+    const int y1 = std::min(input->header.framebufferHeight, y0 + tileH);
+
+    // This is a root tile, so first do a full 6-plane cull
+#ifdef ISPC_IS_WINDOWS
+    __declspec(align(ALIGNMENT_BYTES)) 
+#endif
+        int lightIndices[MAX_LIGHTS]
+#ifndef ISPC_IS_WINDOWS
+        __attribute__ ((aligned(ALIGNMENT_BYTES)))
+#endif
+;
+    const int survivors = ispc::IntersectLightsWithTileMinMax(
+        x0, x1, y0, y1, tileMinZ, tileMaxZ,
+        input->header.framebufferWidth, input->header.framebufferHeight,
+        input->header.cameraProj[0][0], input->header.cameraProj[1][1],
+        MAX_LIGHTS,
+        input->arrays.lightPositionView_x,
+        input->arrays.lightPositionView_y,
+        input->arrays.lightPositionView_z,
+        input->arrays.lightAttenuationEnd,
+        lightIndices);
+
+    // Now kick off the recursive process for this tile
+    ShadeDynamicTileRecurse(input, level, tileX, tileY,
+                            lightIndices, survivors, framebuffer);
+}
+
+
+// Parallel driver: refreshes the min/max Z tree from the current depth
+// buffer, then shades all tiles of the tree's coarsest level with a
+// single flat Cilk loop.
+void
+DispatchDynamicCilk(InputData *input, Framebuffer *framebuffer)
+{
+    MinMaxZTreeCilk *tree = gMinMaxZTreeCilk;
+
+    // Update min/max Z tree
+    tree->Update(input->arrays.zBuffer, input->header.framebufferWidth,
+                 input->header.cameraProj[2][2], input->header.cameraProj[3][2],
+                 input->header.cameraNear, input->header.cameraFar);
+
+    // Launch the "root" tiles.  Ideally these should at least fill the
+    // machine... at the moment we have a static number of "levels" to the
+    // mip tree but it might make sense to compute it based on the width of
+    // the machine.
+    const int topLevel = tree->Levels() - 1;
+    const int tilesAcross = tree->NumTilesX(topLevel);
+    const int tilesDown = tree->NumTilesY(topLevel);
+    const int tileCount = tilesAcross * tilesDown;
+
+    _Cilk_for (int g = 0; g < tileCount; ++g) {
+        // Unflatten the row-major tile index
+        const int row = g / tilesAcross;
+        const int col = g - row * tilesAcross;
+        ShadeDynamicTile(input, topLevel, col, row, framebuffer);
+    }
+}
+
+#endif // __cilk
--- a/examples/deferred/kernels.ispc
+++ b/examples/deferred/kernels.ispc
@@ -0,0 +1,717 @@
+/*
+  Copyright (c) 2010-2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#include "deferred.h"
+
+// Collection of the per-frame input arrays: G-buffer planes followed by
+// per-light attributes, each stored as a separate array.
+// The application passes a pointer to its own `arrays` member where
+// ShadeTile expects a reference to this type, so the member order here
+// presumably has to mirror the application-side declaration - confirm
+// against deferred.h before reordering.
+struct InputDataArrays
+{
+    // G-buffer planes
+    uniform float zBuffer[];                  // depth buffer values
+    uniform unsigned int16 normalEncoded_x[]; // half float
+    uniform unsigned int16 normalEncoded_y[]; // half float
+    uniform unsigned int16 specularAmount[]; // half float
+    uniform unsigned int16 specularPower[]; // half float
+    uniform unsigned int8 albedo_x[]; // unorm8
+    uniform unsigned int8 albedo_y[]; // unorm8
+    uniform unsigned int8 albedo_z[]; // unorm8
+    // Per-light attributes, indexed by light index
+    uniform float lightPositionView_x[];
+    uniform float lightPositionView_y[];
+    uniform float lightPositionView_z[];
+    uniform float lightAttenuationBegin[];
+    uniform float lightColor_x[];
+    uniform float lightColor_y[];
+    uniform float lightColor_z[];
+    uniform float lightAttenuationEnd[];      // used as the culling distance by the light/tile tests
+};
+
+// Per-scene constants that accompany InputDataArrays: camera parameters,
+// framebuffer size and bookkeeping for the packed input data.
+struct InputHeader
+{
+    uniform float cameraProj[4][4];   // projection matrix; callers read [0][0], [1][1], [2][2] and [3][2]
+    uniform float cameraNear;         // near bound used when computing Z bounds
+    uniform float cameraFar;          // far bound used when computing Z bounds
+
+    uniform int32 framebufferWidth;   // in pixels
+    uniform int32 framebufferHeight;  // in pixels
+    uniform int32 numLights;
+    uniform int32 inputDataChunkSize;
+    // One entry per input array; idaNum is not defined in this file
+    // (presumably it comes from deferred.h)
+    uniform int32 inputDataArrayOffsets[idaNum];
+};
+
+
+///////////////////////////////////////////////////////////////////////////
+// Common utility routines
+
+// Dot product of the 3-vectors (x, y, z) and (a, b, c).
+static inline float
+dot3(float x, float y, float z, float a, float b, float c) {
+    const float xa = x * a;
+    const float yb = y * b;
+    const float zc = z * c;
+    return xa + yb + zc;
+}
+
+
+// Scales (x, y, z) by the reciprocal square root of its squared length and
+// writes the result to (ox, oy, oz).  A zero-length input is not special-cased.
+static inline void
+normalize3(float x, float y, float z, reference float ox, 
+           reference float oy, reference float oz) {
+    const float lengthSquared = x*x + y*y + z*z;
+    const float invLength = rsqrt(lengthSquared);
+    ox = x * invLength;
+    oy = y * invLength;
+    oz = z * invLength;
+}
+
+
+// Maps an 8-bit unsigned-normalized value (0..255) to a float in 0..1.
+static inline float
+Unorm8ToFloat32(unsigned int8 u) {
+    const uniform float invMax = 1.0f / 255.0f;
+    const float asFloat = (float)u;
+    return asFloat * invMax;
+}
+
+
+// Scales a float by 255 and converts it to an 8-bit unsigned value.
+// The conversion is a plain cast: no rounding and no clamping of inputs
+// outside 0..1.
+static inline unsigned int8
+Float32ToUnorm8(float f) {
+    const float scaled = f * 255.0f;
+    return (unsigned int8)scaled;
+}
+
+
+// tile width must be a multiple of programCount (SIMD size)
+//
+// Computes the smallest and largest unprojected Z over the pixels of one
+// tile.  Pixels whose unprojected Z falls outside [cameraNear, cameraFar)
+// are ignored; if no pixel qualifies the results stay at their seeds
+// (minZ == cameraFar, maxZ == cameraNear).
+static void
+ComputeZBounds(
+    uniform int32 tileStartX, uniform int32 tileEndX,
+    uniform int32 tileStartY, uniform int32 tileEndY,
+    // G-buffer data
+    uniform float zBuffer[],
+    uniform int32 gBufferWidth,
+    // Camera data
+    uniform float cameraProj_33, uniform float cameraProj_43,
+    uniform float cameraNear, uniform float cameraFar,
+    // Output
+    reference uniform float minZ,
+    reference uniform float maxZ
+    )
+{
+    // Per-lane running bounds, seeded so that any valid sample replaces them
+    float nearest = cameraFar;
+    float farthest = cameraNear;
+
+    for (uniform int32 row = tileStartY; row < tileEndY; ++row) {
+        for (uniform int32 col = tileStartX; col < tileEndX; col += programCount) {
+            // Each lane reads one pixel of the current span and unprojects
+            // its depth buffer value
+            float stored = zBuffer[(row * gBufferWidth + col) + programIndex];
+            float unprojected = cameraProj_43 / (stored - cameraProj_33);
+
+            // Avoid considering skybox/background or otherwise invalid pixels
+            if ((unprojected < cameraFar) && (unprojected >= cameraNear)) {
+                nearest = min(nearest, unprojected);
+                farthest = max(farthest, unprojected);
+            }
+        }
+    }
+
+    // Collapse the per-lane bounds into the uniform results
+    minZ = reduce_min(nearest);
+    maxZ = reduce_max(farthest);
+}
+
+
+// tile width must be a multiple of programCount (SIMD size)
+// numLights must currently be a multiple of programCount (SIMD size)
+//
+// Vectorized light cull for one screen tile.  Builds four normalized side
+// planes from the tile's pixel extent and the two projection terms, then
+// tests programCount lights at a time against the [minZ, maxZ] depth range
+// and the four planes.  A light is kept when every signed distance is at
+// least minus its attenuation-end distance.
+//
+// The indices of the kept lights are written to tileLightIndices, which
+// the caller must size for up to numLights entries.  Returns the number of
+// indices written.
+export uniform int32
+IntersectLightsWithTileMinMax(
+    uniform int32 tileStartX, uniform int32 tileEndX,
+    uniform int32 tileStartY, uniform int32 tileEndY,
+    // Tile data
+    uniform float minZ,
+    uniform float maxZ,
+    // G-buffer data
+    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
+    // Camera data
+    uniform float cameraProj_11, uniform float cameraProj_22,
+    // Light Data
+    uniform int32 numLights,
+    uniform float light_positionView_x_array[],
+    uniform float light_positionView_y_array[],
+    uniform float light_positionView_z_array[],
+    uniform float light_attenuationEnd_array[],
+    // Output
+    reference uniform int32 tileLightIndices[]
+    )
+{
+    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
+    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
+        
+    // Parallelize across frustum planes.
+    // We really only have four side planes here, but write the code to
+    // handle programCount > 4 robustly
+    uniform float frustumPlanes_xy[programCount];
+    uniform float frustumPlanes_z[programCount];
+
+    // TODO: If programIndex < 4 here? Don't care about masking off the
+    // rest but if interleaving ("x2" modes) the other lanes should ideally
+    // not be emitted...
+    {
+        // One plane per lane: lanes 0-3 are filled in below.  Any further
+        // lanes are never assigned and their results are never read.
+        // This one is totally constant over the whole screen... worth pulling it up at all?
+        float frustumPlanes_xy_v;
+        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
+        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1,  (cameraProj_11 * gBufferScale_x));
+        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 2,  (cameraProj_22 * gBufferScale_y));
+        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 3, -(cameraProj_22 * gBufferScale_y));
+    
+        // The Z terms are the only part that depends on the tile extent
+        float frustumPlanes_z_v;
+        frustumPlanes_z_v = insert(frustumPlanes_z_v, 0,  tileEndX - gBufferScale_x);
+        frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, -tileStartX + gBufferScale_x);
+        frustumPlanes_z_v = insert(frustumPlanes_z_v, 2,  tileEndY - gBufferScale_y);
+        frustumPlanes_z_v = insert(frustumPlanes_z_v, 3, -tileStartY + gBufferScale_y);
+
+        // Normalize
+        float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v + 
+                           frustumPlanes_z_v * frustumPlanes_z_v);
+            frustumPlanes_xy_v *= norm;
+            frustumPlanes_z_v *= norm;
+
+        // Save out for uniform use later
+        frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
+        frustumPlanes_z[programIndex] = frustumPlanes_z_v;
+    }
+
+    uniform int32 tileNumLights = 0;
+
+    // Each iteration handles programCount consecutive lights, one per lane
+    for (uniform int32 baseLightIndex = 0; baseLightIndex < numLights; 
+         baseLightIndex += programCount) {
+        int32 lightIndex = baseLightIndex + programIndex;
+        float light_positionView_z = light_positionView_z_array[lightIndex];
+        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
+        float light_attenuationEndNeg = -light_attenuationEnd;
+
+        // Depth-range test against minZ and maxZ
+        float d = light_positionView_z - minZ;
+        bool inFrustum = (d >= light_attenuationEndNeg);
+
+        d = maxZ - light_positionView_z;
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+        
+        // This seems better than cif(!inFrustum) ccontinue; here since we
+        // don't actually need to mask the rest of this function - this is
+        // just a greedy early-out.  Could also structure all of this as
+        // nested if() statements, but this is a bit easier to read
+        if (!any(inFrustum)) 
+            continue;
+
+        float light_positionView_x = light_positionView_x_array[lightIndex];
+        float light_positionView_y = light_positionView_y_array[lightIndex];
+
+        // Planes 0 and 1 are tested against the light's X coordinate
+        d = light_positionView_z * frustumPlanes_z[0] + 
+            light_positionView_x * frustumPlanes_xy[0];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+
+        d = light_positionView_z * frustumPlanes_z[1] + 
+            light_positionView_x * frustumPlanes_xy[1];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+
+        // Planes 2 and 3 are tested against the light's Y coordinate
+        d = light_positionView_z * frustumPlanes_z[2] + 
+            light_positionView_y * frustumPlanes_xy[2];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+
+        d = light_positionView_z * frustumPlanes_z[3] + 
+            light_positionView_y * frustumPlanes_xy[3];
+        inFrustum = inFrustum && (d >= light_attenuationEndNeg);
+        
+        // Pack and store intersecting lights: only the lanes that passed
+        // every test are active inside the cif, and the value returned by
+        // packed_store_active advances the output count
+        cif (inFrustum) {
+            tileNumLights += packed_store_active(tileLightIndices, tileNumLights, 
+                                                 lightIndex);
+        }
+    }
+
+    return tileNumLights;
+}
+
+
+// tile width must be a multiple of programCount (SIMD size)
+// numLights must currently be a multiple of programCount (SIMD size)
+//
+// Convenience wrapper: derives the tile's depth range from the Z buffer
+// with ComputeZBounds() and then culls the lights against the tile with
+// IntersectLightsWithTileMinMax().  Returns the number of light indices
+// written to tileLightIndices.
+//
+// NOTE(review): the numLights argument is not used in the body; MAX_LIGHTS
+// is forwarded as the light count instead - confirm this is intended.
+static uniform int32
+IntersectLightsWithTile(
+    uniform int32 tileStartX, uniform int32 tileEndX,
+    uniform int32 tileStartY, uniform int32 tileEndY,
+    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
+    // G-buffer data
+    uniform float zBuffer[],
+    // Camera data
+    uniform float cameraProj_11, uniform float cameraProj_22,
+    uniform float cameraProj_33, uniform float cameraProj_43,
+    uniform float cameraNear, uniform float cameraFar,
+    // Light Data
+    uniform int32 numLights,
+    uniform float light_positionView_x_array[],
+    uniform float light_positionView_y_array[],
+    uniform float light_positionView_z_array[],
+    uniform float light_attenuationEnd_array[],
+    // Output
+    reference uniform int32 tileLightIndices[]
+    )
+{
+    // Step 1: depth range of the tile
+    uniform float tileMinZ, tileMaxZ;
+    ComputeZBounds(tileStartX, tileEndX, tileStartY, tileEndY,
+                   zBuffer, gBufferWidth,
+                   cameraProj_33, cameraProj_43, cameraNear, cameraFar,
+                   tileMinZ, tileMaxZ);
+
+    // Step 2: cull against the depth range and the tile's side planes
+    return IntersectLightsWithTileMinMax(
+        tileStartX, tileEndX, tileStartY, tileEndY,
+        tileMinZ, tileMaxZ,
+        gBufferWidth, gBufferHeight,
+        cameraProj_11, cameraProj_22,
+        MAX_LIGHTS,
+        light_positionView_x_array, light_positionView_y_array,
+        light_positionView_z_array, light_attenuationEnd_array,
+        tileLightIndices);
+}
+
+
+// Shades one tile from the G-buffer; tile width must be a multiple of programCount (SIMD size)
+export void
+ShadeTile(
+    uniform int32 tileStartX, uniform int32 tileEndX,
+    uniform int32 tileStartY, uniform int32 tileEndY,
+    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
+    reference uniform InputDataArrays inputData,
+    // Camera data
+    uniform float cameraProj_11, uniform float cameraProj_22,
+    uniform float cameraProj_33, uniform float cameraProj_43,
+    // Light list: the first tileNumLights entries index the light arrays in inputData
+    reference uniform int32 tileLightIndices[],
+    uniform int32 tileNumLights,
+    // UI: when set, write a gray level derived from tileNumLights instead of shading
+    uniform bool visualizeLightCount,
+    // Output: one 8-bit value per pixel and channel, at index y * gBufferWidth + x
+    reference uniform unsigned int8 framebuffer_r[],
+    reference uniform unsigned int8 framebuffer_g[],
+    reference uniform unsigned int8 framebuffer_b[]
+    )
+{
+    if (tileNumLights == 0 || visualizeLightCount) {
+        uniform unsigned int8 c = (unsigned int8)(min(tileNumLights << 2, 255));
+        for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
+            for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
+                int32 framebufferIndex = (y * gBufferWidth + x) + programIndex;
+                framebuffer_r[framebufferIndex] = c;
+                framebuffer_g[framebufferIndex] = c;
+                framebuffer_b[framebufferIndex] = c;
+            }
+        }
+    } else {
+        uniform float twoOverGBufferWidth = 2.0f / gBufferWidth;
+        uniform float twoOverGBufferHeight = 2.0f / gBufferHeight;
+
+        for (uniform int32 y = tileStartY; y < tileEndY; ++y) {
+            uniform float positionScreen_y = -(((0.5f + y) * twoOverGBufferHeight) - 1.f);
+
+            for (uniform int32 x = tileStartX; x < tileEndX; x += programCount) {
+                uniform int32 gBufferOffsetBase = y * gBufferWidth + x;
+                int32 gBufferOffset = gBufferOffsetBase + programIndex;
+
+                // Reconstruct position and (negative) view vector from G-buffer
+                float surface_positionView_x, surface_positionView_y, surface_positionView_z;
+                float Vneg_x, Vneg_y, Vneg_z;
+
+                float z = inputData.zBuffer[gBufferOffset];
+
+                // Compute screen/clip-space position
+                // NOTE: Mind DX11 viewport transform and pixel center!
+                float positionScreen_x = (0.5f + (float)(x + programIndex)) * 
+                    twoOverGBufferWidth - 1.0f;
+
+                // Unproject depth buffer Z value into view space
+                surface_positionView_z = cameraProj_43 / (z - cameraProj_33);
+                surface_positionView_x = positionScreen_x * surface_positionView_z / 
+                    cameraProj_11;
+                surface_positionView_y = positionScreen_y * surface_positionView_z / 
+                    cameraProj_22;
+
+                // We actually end up with a vector pointing *at* the
+                // surface (i.e. the negative view vector)
+                normalize3(surface_positionView_x, surface_positionView_y, 
+                           surface_positionView_z, Vneg_x, Vneg_y, Vneg_z);
+
+                // Reconstruct normal from G-buffer
+                float surface_normal_x, surface_normal_y, surface_normal_z;
+                float normal_x = half_to_float_fast(inputData.normalEncoded_x[gBufferOffset]);
+                float normal_y = half_to_float_fast(inputData.normalEncoded_y[gBufferOffset]);
+
+                float f = (normal_x - normal_x * normal_x) + (normal_y - normal_y * normal_y);
+                float m = sqrt(4.0f * f - 1.0f);
+
+                surface_normal_x = m * (4.0f * normal_x - 2.0f);
+                surface_normal_y = m * (4.0f * normal_y - 2.0f);
+                surface_normal_z = 3.0f - 8.0f * f;
+
+                // Load other G-buffer parameters
+                float surface_specularAmount = 
+                    half_to_float_fast(inputData.specularAmount[gBufferOffset]);
+                float surface_specularPower  = 
+                    half_to_float_fast(inputData.specularPower[gBufferOffset]);
+                float surface_albedo_x = Unorm8ToFloat32(inputData.albedo_x[gBufferOffset]);
+                float surface_albedo_y = Unorm8ToFloat32(inputData.albedo_y[gBufferOffset]);
+                float surface_albedo_z = Unorm8ToFloat32(inputData.albedo_z[gBufferOffset]);
+
+                float lit_x = 0.0f;
+                float lit_y = 0.0f;
+                float lit_z = 0.0f;
+                for (uniform int32 tileLightIndex = 0; tileLightIndex < tileNumLights; 
+                     ++tileLightIndex) {
+                    uniform int32 lightIndex = tileLightIndices[tileLightIndex];
+
+                    // Gather light data relevant to initial culling
+                    uniform float light_positionView_x = 
+                        inputData.lightPositionView_x[lightIndex];
+                    uniform float light_positionView_y = 
+                        inputData.lightPositionView_y[lightIndex];
+                    uniform float light_positionView_z = 
+                        inputData.lightPositionView_z[lightIndex];
+                    uniform float light_attenuationEnd = 
+                        inputData.lightAttenuationEnd[lightIndex];
+
+                    // Compute light vector
+                    float L_x = light_positionView_x - surface_positionView_x;
+                    float L_y = light_positionView_y - surface_positionView_y;
+                    float L_z = light_positionView_z - surface_positionView_z;
+
+                    float distanceToLight2 = dot3(L_x, L_y, L_z, L_x, L_y, L_z);
+
+                    // Clip at end of attenuation (per-light value, so it stays uniform)
+                    uniform float light_attenuationEnd2 = light_attenuationEnd * light_attenuationEnd;
+
+                    cif (distanceToLight2 < light_attenuationEnd2) {
+                        float distanceToLight = sqrt(distanceToLight2);
+
+                        // HLSL "rcp" is allowed to be fairly inaccurate
+                        float distanceToLightRcp = rcp(distanceToLight);
+                        L_x *= distanceToLightRcp;
+                        L_y *= distanceToLightRcp;
+                        L_z *= distanceToLightRcp;
+
+                        // Start computing brdf
+                        float NdotL = dot3(surface_normal_x, surface_normal_y, 
+                                           surface_normal_z, L_x, L_y, L_z);
+
+                        // Clip back facing
+                        cif (NdotL > 0.0f) {
+                            uniform float light_attenuationBegin = 
+                                inputData.lightAttenuationBegin[lightIndex];
+
+                            // Light distance attenuation (linstep)
+                            uniform float lightRange = (light_attenuationEnd - light_attenuationBegin);
+                            float falloffPosition = (light_attenuationEnd - distanceToLight);
+                            float attenuation = min(falloffPosition / lightRange, 1.0f);
+
+                            float H_x = (L_x - Vneg_x);
+                            float H_y = (L_y - Vneg_y);
+                            float H_z = (L_z - Vneg_z);
+                            normalize3(H_x, H_y, H_z, H_x, H_y, H_z);
+
+                            float NdotH = dot3(surface_normal_x, surface_normal_y, 
+                                               surface_normal_z, H_x, H_y, H_z);
+                            NdotH = max(NdotH, 0.0f);
+
+                            float specular = pow(NdotH, surface_specularPower);
+                            float specularNorm = (surface_specularPower + 2.0f) * 
+                                (1.0f / 8.0f);
+                            float specularContrib = surface_specularAmount * 
+                                specularNorm * specular;
+
+                            float k = attenuation * NdotL * (1.0f + specularContrib);
+
+                            uniform float light_color_x = inputData.lightColor_x[lightIndex];
+                            uniform float light_color_y = inputData.lightColor_y[lightIndex];
+                            uniform float light_color_z = inputData.lightColor_z[lightIndex];
+
+                            float lightContrib_x = surface_albedo_x * light_color_x;
+                            float lightContrib_y = surface_albedo_y * light_color_y;
+                            float lightContrib_z = surface_albedo_z * light_color_z;
+
+                            lit_x += lightContrib_x * k;
+                            lit_y += lightContrib_y * k;
+                            lit_z += lightContrib_z * k;
+                        }
+                    }
+                }
+
+                // Gamma correct
+                // These pows are pretty slow right now, but we can do
+                // something faster if really necessary to squeeze every
+                // last bit of performance out of it
+                float gamma = 1.0 / 2.2f;
+                lit_x = pow(clamp(lit_x, 0.0f, 1.0f), gamma);
+                lit_y = pow(clamp(lit_y, 0.0f, 1.0f), gamma);
+                lit_z = pow(clamp(lit_z, 0.0f, 1.0f), gamma);
+
+                framebuffer_r[gBufferOffset] = Float32ToUnorm8(lit_x);
+                framebuffer_g[gBufferOffset] = Float32ToUnorm8(lit_y);
+                framebuffer_b[gBufferOffset] = Float32ToUnorm8(lit_z);
+            }
+        }
+    }
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// Static decomposition
+
+// Task entry point: culls lights for, then shades, the tile selected by taskIndex.
+task void
+RenderTile(uniform int num_groups_x, uniform int num_groups_y,
+           reference uniform InputHeader inputHeader,
+           reference uniform InputDataArrays inputData,
+           uniform int visualizeLightCount,
+           // Output
+           reference uniform unsigned int8 framebuffer_r[],
+           reference uniform unsigned int8 framebuffer_g[],
+           reference uniform unsigned int8 framebuffer_b[]) {
+    // Tasks are numbered row-major over the num_groups_x-wide grid of tiles.
+    uniform int32 tileCol = taskIndex % num_groups_x;
+    uniform int32 tileRow = taskIndex / num_groups_x;
+    uniform int32 x0 = tileCol * MIN_TILE_WIDTH;
+    uniform int32 x1 = x0 + MIN_TILE_WIDTH;
+    uniform int32 y0 = tileRow * MIN_TILE_HEIGHT;
+    uniform int32 y1 = y0 + MIN_TILE_HEIGHT;
+
+    // Pull the scalars the kernels need out of the header.  The matrix is
+    // indexed 0-based here; the callees name the same entries 1-based (e.g. _11).
+    uniform int fbWidth = inputHeader.framebufferWidth;
+    uniform int fbHeight = inputHeader.framebufferHeight;
+    uniform float proj00 = inputHeader.cameraProj[0][0];
+    uniform float proj11 = inputHeader.cameraProj[1][1];
+    uniform float proj22 = inputHeader.cameraProj[2][2];
+    uniform float proj32 = inputHeader.cameraProj[3][2];
+
+    // Stage 1: collect the indices of the lights that illuminate this tile.
+    uniform int lightList[MAX_LIGHTS];
+    uniform int lightCount = IntersectLightsWithTile(
+        x0, x1, y0, y1,
+        fbWidth, fbHeight,
+        inputData.zBuffer,
+        proj00, proj11, proj22, proj32,
+        inputHeader.cameraNear, inputHeader.cameraFar,
+        MAX_LIGHTS,
+        inputData.lightPositionView_x, inputData.lightPositionView_y,
+        inputData.lightPositionView_z, inputData.lightAttenuationEnd,
+        lightList);
+
+    // Stage 2: shade the tile's pixels using only the lights in lightList.
+    ShadeTile(x0, x1, y0, y1,
+              fbWidth, fbHeight, inputData,
+              proj00, proj11, proj22, proj32,
+              lightList, lightCount, visualizeLightCount,
+              framebuffer_r, framebuffer_g, framebuffer_b);
+}
+
+
+// Static decomposition entry point: launches one RenderTile task per fixed-size tile.
+export void
+RenderStatic(reference uniform InputHeader inputHeader,
+             reference uniform InputDataArrays inputData,
+             uniform int visualizeLightCount,
+             // Output
+             reference uniform unsigned int8 framebuffer_r[],
+             reference uniform unsigned int8 framebuffer_g[],
+             reference uniform unsigned int8 framebuffer_b[]) {
+    uniform int tilesAcross =
+        (inputHeader.framebufferWidth + MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
+    uniform int tilesDown =
+        (inputHeader.framebufferHeight + MIN_TILE_HEIGHT - 1) / MIN_TILE_HEIGHT;
+    uniform int tileCount = tilesAcross * tilesDown;
+
+    // Tile counts are rounded up; each task renders one MIN_TILE_WIDTH x MIN_TILE_HEIGHT tile.
+    launch[tileCount] < RenderTile(tilesAcross, tilesDown,
+                                   inputHeader, inputData, visualizeLightCount,
+                                   framebuffer_r, framebuffer_g, framebuffer_b) >;
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// Routines for dynamic decomposition path
+
+// Fills minZArray/maxZArray with the z range of every tile in tile row tileY,
+// one entry per tile column.  Tile width must be a multiple of programCount.
+export void
+ComputeZBoundsRow(
+    uniform int32 tileY,
+    uniform int32 tileWidth, uniform int32 tileHeight,
+    uniform int32 numTilesX, uniform int32 numTilesY,
+    // G-buffer data
+    uniform float zBuffer[],
+    uniform int32 gBufferWidth,
+    // Camera data
+    uniform float cameraProj_33, uniform float cameraProj_43,
+    uniform float cameraNear, uniform float cameraFar,
+    // Output
+    reference uniform float minZArray[],
+    reference uniform float maxZArray[]
+    )
+{
+    uniform int32 rowStartY = tileY * tileHeight;
+    uniform int32 rowEndY = rowStartY + tileHeight;
+    for (uniform int32 col = 0; col < numTilesX; ++col) {
+        uniform int32 colStartX = col * tileWidth;
+        uniform float zLo, zHi;
+        ComputeZBounds(colStartX, colStartX + tileWidth, rowStartY, rowEndY,
+            zBuffer, gBufferWidth,
+            cameraProj_33, cameraProj_43, cameraNear, cameraFar, zLo, zHi);
+        minZArray[col] = zLo;
+        maxZArray[col] = zHi;
+    }
+}
+
+
+// Reclassifies the lights with respect to four sub-tiles when we refine a tile.
+// numLights need not be a multiple of programCount here, but the input and output arrays
+// should be able to handle programCount-sized load/stores.
+export void
+SplitTileMinMax(
+    uniform int32 tileMidX, uniform int32 tileMidY,
+    // Subtile data (00, 10, 01, 11)
+    uniform float subtileMinZ[],
+    uniform float subtileMaxZ[],
+    // G-buffer data
+    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
+    // Camera data
+    uniform float cameraProj_11, uniform float cameraProj_22,
+    // Light Data
+    reference uniform int32 lightIndices[],
+    uniform int32 numLights,
+    uniform float light_positionView_x_array[],
+    uniform float light_positionView_y_array[],
+    uniform float light_positionView_z_array[],
+    uniform float light_attenuationEnd_array[],
+    // Outputs
+    // TODO: ISPC doesn't currently like multidimensional arrays so we'll do the
+    // indexing math ourselves
+    reference uniform int32 subtileIndices[],
+    uniform int32 subtileIndicesPitch,
+    reference uniform int32 subtileNumLights[]
+    )
+{
+    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
+    uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
+
+    // Parallelize across frustum planes
+    // Only have 2 frustum split planes here so may not be worth it, but
+    // we'll do it for now for consistency
+    uniform float frustumPlanes_xy[programCount];
+    uniform float frustumPlanes_z[programCount];
+
+    // This one is totally constant over the whole screen... worth pulling it up at all?
+    float frustumPlanes_xy_v = 1.0f;  // defined in all lanes; only lanes 0 and 1 are used
+    frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
+    frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1,  (cameraProj_22 * gBufferScale_y));
+
+    float frustumPlanes_z_v = 0.0f;   // defined in all lanes; only lanes 0 and 1 are used
+    frustumPlanes_z_v = insert(frustumPlanes_z_v, 0, tileMidX - gBufferScale_x);
+    frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, tileMidY - gBufferScale_y);
+
+    // Normalize
+    float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v + 
+                       frustumPlanes_z_v * frustumPlanes_z_v);
+    frustumPlanes_xy_v *= norm;
+    frustumPlanes_z_v *= norm;
+
+    // Save out for uniform use later
+    frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
+    frustumPlanes_z[programIndex] = frustumPlanes_z_v;
+
+    // Initialize: subtile s writes its light list starting at s * subtileIndicesPitch
+    uniform int32 subtileLightOffset[4];
+    subtileLightOffset[0] = 0 * subtileIndicesPitch;
+    subtileLightOffset[1] = 1 * subtileIndicesPitch;
+    subtileLightOffset[2] = 2 * subtileIndicesPitch;
+    subtileLightOffset[3] = 3 * subtileIndicesPitch;
+
+    for (int32 i = programIndex; i < numLights; i += programCount) {
+        // TODO: ISPC says gather required here when it actually
+        // isn't... this could be fixed by nesting an if() within a
+        // uniform loop, but I'm not totally sure if that's a win
+        // overall. For now we'll just eat the perf cost for cleanliness
+        // since the below are real gathers anyways.
+        int32 lightIndex = lightIndices[i];
+
+        float light_positionView_x = light_positionView_x_array[lightIndex];
+        float light_positionView_y = light_positionView_y_array[lightIndex];
+        float light_positionView_z = light_positionView_z_array[lightIndex];
+        float light_attenuationEnd = light_attenuationEnd_array[lightIndex];
+        float light_attenuationEndNeg = -light_attenuationEnd;
+
+        // Test lights against subtile z bounds
+        bool inFrustum[4];
+        inFrustum[0] = (light_positionView_z - subtileMinZ[0] >= light_attenuationEndNeg) &&
+            (subtileMaxZ[0] - light_positionView_z >= light_attenuationEndNeg);
+        inFrustum[1] = (light_positionView_z - subtileMinZ[1] >= light_attenuationEndNeg) && 
+            (subtileMaxZ[1] - light_positionView_z >= light_attenuationEndNeg);
+        inFrustum[2] = (light_positionView_z - subtileMinZ[2] >= light_attenuationEndNeg) && 
+            (subtileMaxZ[2] - light_positionView_z >= light_attenuationEndNeg);
+        inFrustum[3] = (light_positionView_z - subtileMinZ[3] >= light_attenuationEndNeg) && 
+            (subtileMaxZ[3] - light_positionView_z >= light_attenuationEndNeg);
+
+        float dx = light_positionView_z * frustumPlanes_z[0] + 
+            light_positionView_x * frustumPlanes_xy[0];
+        float dy = light_positionView_z * frustumPlanes_z[1] +
+            light_positionView_y * frustumPlanes_xy[1];
+
+        cif (abs(dx) > light_attenuationEnd) {
+            bool positiveX = dx > 0.0f;
+            inFrustum[0] = inFrustum[0] &&  positiveX;    // 00 subtile
+            inFrustum[1] = inFrustum[1] && !positiveX;    // 10 subtile
+            inFrustum[2] = inFrustum[2] &&  positiveX;    // 01 subtile
+            inFrustum[3] = inFrustum[3] && !positiveX;    // 11 subtile
+        }
+        cif (abs(dy) > light_attenuationEnd) {
+            bool positiveY = dy > 0.0f;
+            inFrustum[0] = inFrustum[0] &&  positiveY;    // 00 subtile
+            inFrustum[1] = inFrustum[1] &&  positiveY;    // 10 subtile
+            inFrustum[2] = inFrustum[2] && !positiveY;    // 01 subtile
+            inFrustum[3] = inFrustum[3] && !positiveY;    // 11 subtile
+        }
+
+        // Pack and store intersecting lights
+        // TODO: Experiment with a loop here instead
+        cif (inFrustum[0])
+            subtileLightOffset[0] += packed_store_active(subtileIndices, 
+                                                         subtileLightOffset[0], 
+                                                         lightIndex);
+        cif (inFrustum[1])
+            subtileLightOffset[1] += packed_store_active(subtileIndices, 
+                                                         subtileLightOffset[1], 
+                                                         lightIndex);
+        cif (inFrustum[2])
+            subtileLightOffset[2] += packed_store_active(subtileIndices, 
+                                                         subtileLightOffset[2], 
+                                                         lightIndex);
+        cif (inFrustum[3])
+            subtileLightOffset[3] += packed_store_active(subtileIndices, 
+                                                         subtileLightOffset[3], 
+                                                         lightIndex);
+    }
+
+    subtileNumLights[0] = subtileLightOffset[0] - 0 * subtileIndicesPitch;
+    subtileNumLights[1] = subtileLightOffset[1] - 1 * subtileIndicesPitch;
+    subtileNumLights[2] = subtileLightOffset[2] - 2 * subtileIndicesPitch;
+    subtileNumLights[3] = subtileLightOffset[3] - 3 * subtileIndicesPitch;
+}
--- a/examples/deferred/main.cpp
+++ b/examples/deferred/main.cpp
@@ -0,0 +1,139 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+#ifdef _MSC_VER
+#define ISPC_IS_WINDOWS
+#define NOMINMAX
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
+#include <fcntl.h>
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <algorithm>
+#include <assert.h>
+#include <vector>
+#ifdef ISPC_IS_WINDOWS
+  #define WIN32_LEAN_AND_MEAN
+  #include <windows.h>
+#endif
+#include "deferred.h"
+#include "kernels_ispc.h"
+#include "../timing.h"
+
+///////////////////////////////////////////////////////////////////////////
+
+// Times each deferred-shading implementation on one input file and saves its image.
+int main(int argc, char** argv) {
+    if (argc != 2) {
+        printf("usage: deferred_shading <input_file (e.g. data/pp1280x720.bin)>\n");
+        return 1;
+    }
+    InputData *scene = CreateInputDataFromFile(argv[1]);
+    if (scene == NULL) {
+        printf("Failed to load input file \"%s\"!\n", argv[1]);
+        return 1;
+    }
+
+    Framebuffer fb(scene->header.framebufferWidth, scene->header.framebufferHeight);
+    InitDynamicC(scene);
+#ifdef __cilk
+    InitDynamicCilk(scene);
+#endif // __cilk
+
+    // Each figure below is the best of 5 trials, averaged over the frames of a trial.
+    const int framesPerTrial = 5;
+    double bestStatic = 1e30;
+    for (int trial = 0; trial < 5; ++trial) {
+        fb.clear();
+        reset_and_start_timer();
+        for (int frame = 0; frame != framesPerTrial; ++frame)
+            ispc::RenderStatic(&scene->header, &scene->arrays, VISUALIZE_LIGHT_COUNT,
+                               fb.r, fb.g, fb.b);
+        const double perFrame = get_elapsed_mcycles() / framesPerTrial;
+        if (perFrame < bestStatic)
+            bestStatic = perFrame;
+    }
+    printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render "
+           "%d x %d image\n", bestStatic,
+           scene->header.framebufferWidth, scene->header.framebufferHeight);
+    WriteFrame("deferred-ispc-static.ppm", scene, fb);
+
+#ifdef __cilk
+    double bestCilk = 1e30;
+    for (int trial = 0; trial < 5; ++trial) {
+        fb.clear();
+        reset_and_start_timer();
+        for (int frame = 0; frame != framesPerTrial; ++frame)
+            DispatchDynamicCilk(scene, &fb);
+        const double perFrame = get_elapsed_mcycles() / framesPerTrial;
+        if (perFrame < bestCilk)
+            bestCilk = perFrame;
+    }
+    printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles to render image\n", 
+           bestCilk);
+    WriteFrame("deferred-ispc-dynamic.ppm", scene, fb);
+#endif // __cilk
+
+    double bestSerial = 1e30;
+    for (int trial = 0; trial < 5; ++trial) {
+        fb.clear();
+        reset_and_start_timer();
+        for (int frame = 0; frame != framesPerTrial; ++frame)
+            DispatchDynamicC(scene, &fb);
+        const double perFrame = get_elapsed_mcycles() / framesPerTrial;
+        if (perFrame < bestSerial)
+            bestSerial = perFrame;
+    }
+    printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles to render image\n", 
+           bestSerial);
+    WriteFrame("deferred-serial-dynamic.ppm", scene, fb);
+
+#ifdef __cilk
+    printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n", 
+           bestSerial/bestStatic, bestSerial/bestCilk);
+#else
+    printf("\t\t\t\t(%.2fx speedup from ISPC)\n", bestSerial/bestStatic);
+#endif // __cilk
+
+    DeleteInputData(scene);
+    return 0;
+}
--- a/examples/examples.sln
+++ b/examples/examples.sln
@@ -18,8 +18,11 @@ EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "noise", "noise\noise.vcxproj", "{0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD}"
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "volume", "volume_rendering\volume.vcxproj", "{DEE5733A-E93E-449D-9114-9BFFCAEB4DF9}"
+EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "stencil", "stencil\stencil.vcxproj", "{2EF070A1-F62F-4E6A-944B-88D140945C3C}"
 EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "deferred_shading", "deferred\deferred_shading.vcxproj", "{87F53C53-957E-4E91-878A-BC27828FB9EB}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Win32 = Debug|Win32
@@ -108,6 +111,14 @@ Global
 		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|Win32.Build.0 = Release|Win32
 		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|x64.ActiveCfg = Release|x64
 		{2EF070A1-F62F-4E6A-944B-88D140945C3C}.Release|x64.Build.0 = Release|x64
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|Win32.ActiveCfg = Debug|Win32
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|Win32.Build.0 = Debug|Win32
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|x64.ActiveCfg = Debug|x64
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Debug|x64.Build.0 = Debug|x64
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.ActiveCfg = Release|Win32
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|Win32.Build.0 = Release|Win32
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.ActiveCfg = Release|x64
+		{87F53C53-957E-4E91-878A-BC27828FB9EB}.Release|x64.Build.0 = Release|x64
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
--- a/examples/mandelbrot/Makefile
+++ b/examples/mandelbrot/Makefile
@@ -2,7 +2,7 @@
 CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -O3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
+ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64

 default: mandelbrot

@@ -14,13 +14,17 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ mandelbrot

-mandelbrot: dirs objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc.o
-	$(CXX) $(CXXFLAGS) -o $@ objs/mandelbrot.o objs/mandelbrot_ispc.o objs/mandelbrot_serial.o -lm
+OBJS=objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc_sse2.o \
+	objs/mandelbrot_ispc_sse4.o objs/mandelbrot_ispc_avx.o \
+	objs/mandelbrot_ispc.o
+
+mandelbrot: dirs $(OBJS)
+	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm

 objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@

 objs/mandelbrot.o: objs/mandelbrot_ispc.h 

-objs/%_ispc.h objs/%_ispc.o: %.ispc
+objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
 	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/mandelbrot/mandelbrot.cpp
+++ b/examples/mandelbrot/mandelbrot.cpp
@@ -41,7 +41,6 @@
 #include <stdio.h>
 #include <algorithm>
 #include "../timing.h"
-#include "../cpuid.h"
 #include "mandelbrot_ispc.h"
 using namespace ispc;

@@ -68,38 +67,6 @@ writePPM(int *buf, int width, int height, const char *fn) {
 }


-// Make sure that the vector ISA used during compilation is supported by
-// the processor.  The ISPC_TARGET_* macro is set in the ispc-generated
-// header file that we include above.
-static void
-ensureTargetISAIsSupported() {
-#if defined(ISPC_TARGET_SSE2)
-    bool isaSupported = CPUSupportsSSE2();
-    const char *target = "SSE2";
-#elif defined(ISPC_TARGET_SSE4)
-    bool isaSupported = CPUSupportsSSE4();
-    const char *target = "SSE4";
-#elif defined(ISPC_TARGET_AVX)
-    bool isaSupported = CPUSupportsAVX();
-    const char *target = "AVX";
-#else
-#error "Unknown ISPC_TARGET_* value"
-#endif
-    if (!isaSupported) {
-        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
-                "set, which isn't\n***        supported by this computer's CPU!\n", target);
-        fprintf(stderr, "***\n***        Please modify the "
-#ifdef _MSC_VER
-                "MSVC project file "
-#else
-                "Makefile "
-#endif
-                "to select another target (e.g. sse2)\n***\n");
-        exit(1);
-    }
-}
-
-
 int main() {
    unsigned int width = 768;
    unsigned int height = 512;
@@ -111,8 +78,6 @@ int main() {
    int maxIterations = 256;
    int *buf = new int[width*height];

-    ensureTargetISAIsSupported();
-
    //
    // Compute the image using the ispc implementation; report the minimum
    // time of three runs.
--- a/examples/mandelbrot/mandelbrot.vcxproj
+++ b/examples/mandelbrot/mandelbrot.vcxproj
@@ -64,15 +64,19 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -81,6 +85,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -96,6 +101,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -113,6 +119,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -131,6 +138,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -147,18 +155,18 @@
  <ItemGroup>
    <CustomBuild Include="mandelbrot.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
    </CustomBuild>
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
--- a/examples/mandelbrot_tasks/Makefile
+++ b/examples/mandelbrot_tasks/Makefile
@@ -1,20 +1,18 @@

 ARCH = $(shell uname)

-TASK_CXX=../tasks_pthreads.cpp
+TASK_CXX=../tasksys.cpp
 TASK_LIB=-lpthread
-
-ifeq ($(ARCH), Darwin)
-  TASK_CXX=../tasks_gcd.cpp
-  TASK_LIB=
-endif
-
 TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))

 CXX=g++
 CXXFLAGS=-Iobjs/ -O3 -Wall -m64
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
+ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
+
+OBJS=objs/mandelbrot.o objs/mandelbrot_serial.o $(TASK_OBJ) \
+	objs/mandelbrot_ispc.o objs/mandelbrot_ispc_sse2.o \
+	objs/mandelbrot_ispc_sse4.o objs/mandelbrot_ispc_avx.o 

 default: mandelbrot

@@ -26,8 +24,8 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ mandelbrot

-mandelbrot: dirs objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc.o $(TASK_OBJ)
-	$(CXX) $(CXXFLAGS) -o $@ objs/mandelbrot.o objs/mandelbrot_ispc.o objs/mandelbrot_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
+mandelbrot: dirs $(OBJS)
+	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)

 objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@
@@ -37,5 +35,5 @@ objs/%.o: ../%.cpp

 objs/mandelbrot.o: objs/mandelbrot_ispc.h 

-objs/%_ispc.h objs/%_ispc.o: %.ispc
+objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
 	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/mandelbrot_tasks/mandelbrot.cpp
+++ b/examples/mandelbrot_tasks/mandelbrot.cpp
@@ -40,8 +40,8 @@

 #include <stdio.h>
 #include <algorithm>
+#include <string.h>
 #include "../timing.h"
-#include "../cpuid.h"
 #include "mandelbrot_ispc.h"
 using namespace ispc;

@@ -68,39 +68,12 @@ writePPM(int *buf, int width, int height, const char *fn) {
 }


-// Make sure that the vector ISA used during compilation is supported by
-// the processor.  The ISPC_TARGET_* macro is set in the ispc-generated
-// header file that we include above.
-static void
-ensureTargetISAIsSupported() {
-#if defined(ISPC_TARGET_SSE2)
-    bool isaSupported = CPUSupportsSSE2();
-    const char *target = "SSE2";
-#elif defined(ISPC_TARGET_SSE4)
-    bool isaSupported = CPUSupportsSSE4();
-    const char *target = "SSE4";
-#elif defined(ISPC_TARGET_AVX)
-    bool isaSupported = CPUSupportsAVX();
-    const char *target = "AVX";
-#else
-#error "Unknown ISPC_TARGET_* value"
-#endif
-    if (!isaSupported) {
-        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
-                "set, which isn't\n***        supported by this computer's CPU!\n", target);
-        fprintf(stderr, "***\n***        Please modify the "
-#ifdef _MSC_VER
-                "MSVC project file "
-#else
-                "Makefile "
-#endif
-                "to select another target (e.g. sse2)\n***\n");
+static void usage() {
+    fprintf(stderr, "usage: mandelbrot [--scale=<factor>]\n");
    exit(1);
-    }
 }

-
-int main() {
+int main(int argc, char *argv[]) {
    unsigned int width = 1536;
    unsigned int height = 1024;
    float x0 = -2;
@@ -108,7 +81,24 @@ int main() {
    float y0 = -1;
    float y1 = 1;

-    ensureTargetISAIsSupported();
+    if (argc == 1)
+        ;
+    else if (argc == 2) {
+        if (strncmp(argv[1], "--scale=", 8) == 0) {
+            float scale = atof(argv[1] + 8);
+            if (!(scale > 0.f))  // rejects 0 (atof's no-conversion result), negatives and NaN
+                usage();
+            width *= scale;   // product is converted back to unsigned int
+            height *= scale;
+            // round up to multiples of 16
+            width = (width + 0xf) & ~0xf;
+            height = (height + 0xf) & ~0xf;
+        }
+        else 
+            usage();
+    }
+    else
+        usage();

    int maxIterations = 512;
    int *buf = new int[width*height];
@@ -119,6 +109,9 @@ int main() {
    //
    double minISPC = 1e30;
    for (int i = 0; i < 3; ++i) {
+        // Clear out the buffer
+        for (unsigned int i = 0; i < width * height; ++i)
+            buf[i] = 0;
        reset_and_start_timer();
        mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
        double dt = get_elapsed_mcycles();
@@ -128,9 +121,6 @@ int main() {
    printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC);
    writePPM(buf, width, height, "mandelbrot-ispc.ppm");

-    // Clear out the buffer
-    for (unsigned int i = 0; i < width * height; ++i)
-        buf[i] = 0;

    // 
    // And run the serial implementation 3 times, again reporting the
@@ -138,6 +128,9 @@ int main() {
    //
    double minSerial = 1e30;
    for (int i = 0; i < 3; ++i) {
+        // Clear out the buffer
+        for (unsigned int i = 0; i < width * height; ++i)
+            buf[i] = 0;
        reset_and_start_timer();
        mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf);
        double dt = get_elapsed_mcycles();
--- a/examples/mandelbrot_tasks/mandelbrot.ispc
+++ b/examples/mandelbrot_tasks/mandelbrot.ispc
@@ -53,11 +53,14 @@ mandel(float c_re, float c_im, int count) {
   [ystart,yend).
 */
 task void
-mandelbrot_scanlines(uniform int ystart, uniform int yend,
+mandelbrot_scanlines(uniform int ybase, uniform int span,
                     uniform float x0, uniform float dx, 
                     uniform float y0, uniform float dy,
                     uniform int width, uniform int maxIterations,
                     reference uniform int output[]) {
+    uniform int ystart = ybase + taskIndex * span;
+    uniform int yend = ystart + span;
+
    for (uniform int j = ystart; j < yend; ++j) {
        for (uniform int i = 0; i < width; i += programCount) {
            float x = x0 + (programIndex + i) * dx;
@@ -70,6 +73,20 @@ mandelbrot_scanlines(uniform int ystart, uniform int yend,
 }
                               

+task void
+mandelbrot_chunk(uniform float x0, uniform float dx,
+                 uniform float y0, uniform float dy,
+                 uniform int width, uniform int height,
+                 uniform int maxIterations, reference uniform int output[]) {
+    uniform int ystart = (taskIndex * height) / taskCount;
+    uniform int yend = ((taskIndex+1) * height) / taskCount;
+    uniform int span = 1;
+    // Multiply before dividing: chunks tile [0,height) even if height % taskCount != 0.
+    launch[(yend-ystart)/span] < mandelbrot_scanlines(ystart, span, x0, dx, y0, dy,
+                                                      width, maxIterations, output) >;
+}
+
+
 export void
 mandelbrot_ispc(uniform float x0, uniform float y0, 
                uniform float x1, uniform float y1,
@@ -78,9 +95,6 @@ mandelbrot_ispc(uniform float x0, uniform float y0,
    uniform float dx = (x1 - x0) / width;
    uniform float dy = (y1 - y0) / height;

-    /* Launch task to compute results for spans of 'span' scanlines. */
-    uniform int span = 2;
-    for (uniform int j = 0; j < height; j += span)
-        launch < mandelbrot_scanlines(j, j+span, x0, dx, y0, dy, width,
+    launch[32] < mandelbrot_chunk(x0, dx, y0, dy, width, height,
                                  maxIterations, output) >;
 }
--- a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
+++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
@@ -64,15 +64,19 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -81,6 +85,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -96,6 +101,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -113,6 +119,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -131,6 +138,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -143,23 +151,23 @@
  <ItemGroup>
    <ClCompile Include="mandelbrot.cpp" />
    <ClCompile Include="mandelbrot_serial.cpp" />
-    <ClCompile Include="../tasks_concrt.cpp" />
+    <ClCompile Include="../tasksys.cpp" />
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="mandelbrot.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
    </CustomBuild>
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
--- a/examples/noise/Makefile
+++ b/examples/noise/Makefile
@@ -2,7 +2,10 @@
 CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -O3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse4 --arch=x86-64
+ISPCFLAGS=-O2 --target=sse2,sse4,avx-x2 --arch=x86-64
+
+OBJS=objs/noise.o objs/noise_serial.o objs/noise_ispc.o objs/noise_ispc_sse2.o \
+	objs/noise_ispc_sse4.o objs/noise_ispc_avx.o 

 default: noise

@@ -14,13 +17,13 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ noise

-noise: dirs objs/noise.o objs/noise_serial.o objs/noise_ispc.o
-	$(CXX) $(CXXFLAGS) -o $@ objs/noise.o objs/noise_ispc.o objs/noise_serial.o -lm
+noise: dirs $(OBJS)
+	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm

 objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@

 objs/noise.o: objs/noise_ispc.h 

-objs/%_ispc.h objs/%_ispc.o: %.ispc
+objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
 	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/noise/noise.cpp
+++ b/examples/noise/noise.cpp
@@ -41,7 +41,6 @@
 #include <stdio.h>
 #include <algorithm>
 #include "../timing.h"
-#include "../cpuid.h"
 #include "noise_ispc.h"
 using namespace ispc;

@@ -66,38 +65,6 @@ writePPM(float *buf, int width, int height, const char *fn) {
 }


-// Make sure that the vector ISA used during compilation is supported by
-// the processor.  The ISPC_TARGET_* macro is set in the ispc-generated
-// header file that we include above.
-static void
-ensureTargetISAIsSupported() {
-#if defined(ISPC_TARGET_SSE2)
-    bool isaSupported = CPUSupportsSSE2();
-    const char *target = "SSE2";
-#elif defined(ISPC_TARGET_SSE4)
-    bool isaSupported = CPUSupportsSSE4();
-    const char *target = "SSE4";
-#elif defined(ISPC_TARGET_AVX)
-    bool isaSupported = CPUSupportsAVX();
-    const char *target = "AVX";
-#else
-#error "Unknown ISPC_TARGET_* value"
-#endif
-    if (!isaSupported) {
-        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
-                "set, which isn't\n***        supported by this computer's CPU!\n", target);
-        fprintf(stderr, "***\n***        Please modify the "
-#ifdef _MSC_VER
-                "MSVC project file "
-#else
-                "Makefile "
-#endif
-                "to select another target (e.g. sse2)\n***\n");
-        exit(1);
-    }
-}
-
-
 int main() {
    unsigned int width = 768;
    unsigned int height = 768;
@@ -108,8 +75,6 @@ int main() {

    float *buf = new float[width*height];

-    ensureTargetISAIsSupported();
-
    //
    // Compute the image using the ispc implementation; report the minimum
    // time of three runs.
--- a/examples/noise/noise.ispc
+++ b/examples/noise/noise.ispc
@@ -131,11 +131,11 @@ static float Noise(float x, float y, float z) {
 }


-static float Turbulence(float x, float y, float z, int octaves) {
+static float Turbulence(float x, float y, float z, uniform int octaves) {
    float omega = 0.6;

    float sum = 0., lambda = 1., o = 1.;
-    for (int i = 0; i < octaves; ++i) {
+    for (uniform int i = 0; i < octaves; ++i) {
        sum += abs(o * Noise(lambda * x, lambda * y, lambda * z));
        lambda *= 1.99f;
        o *= omega;
--- a/examples/noise/noise.vcxproj
+++ b/examples/noise/noise.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -64,15 +64,19 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -81,6 +85,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -96,6 +101,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -113,6 +119,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -131,6 +138,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -147,18 +155,18 @@
  <ItemGroup>
    <CustomBuild Include="noise.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4,avx-x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4,avx-x2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
    </CustomBuild>
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
--- a/examples/options/Makefile
+++ b/examples/options/Makefile
@@ -2,7 +2,11 @@
 CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -g -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
+ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx-x2 --arch=x86-64
+
+OBJS=objs/options.o objs/options_serial.o objs/options_ispc.o \
+	objs/options_ispc_sse2.o objs/options_ispc_sse4.o \
+	objs/options_ispc_avx.o

 default: options

@@ -14,13 +18,13 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ options

-options: dirs objs/options.o objs/options_serial.o objs/options_ispc.o
-	$(CXX) $(CXXFLAGS) -o $@ objs/options.o objs/options_ispc.o objs/options_serial.o -lm
+options: dirs $(OBJS)
+	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm

 objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@

 objs/options.o: objs/options_ispc.h options_defs.h

-objs/%_ispc.h objs/%_ispc.o: %.ispc options_defs.h
+objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc options_defs.h
 	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/options/options.cpp
+++ b/examples/options/options.cpp
@@ -41,7 +41,6 @@ using std::max;

 #include "options_defs.h"
 #include "../timing.h"
-#include "../cpuid.h"

 #include "options_ispc.h"
 using namespace ispc;
@@ -54,41 +53,7 @@ extern void binomial_put_serial(float Sa[], float Xa[], float Ta[],
                                float ra[], float va[], 
                                float result[], int count);

-// Make sure that the vector ISA used during compilation is supported by
-// the processor.  The ISPC_TARGET_* macro is set in the ispc-generated
-// header file that we include above.
-static void
-ensureTargetISAIsSupported() {
-#if defined(ISPC_TARGET_SSE2)
-    bool isaSupported = CPUSupportsSSE2();
-    const char *target = "SSE2";
-#elif defined(ISPC_TARGET_SSE4)
-    bool isaSupported = CPUSupportsSSE4();
-    const char *target = "SSE4";
-#elif defined(ISPC_TARGET_AVX)
-    bool isaSupported = CPUSupportsAVX();
-    const char *target = "AVX";
-#else
-#error "Unknown ISPC_TARGET_* value"
-#endif
-    if (!isaSupported) {
-        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
-                "set, which isn't\n***        supported by this computer's CPU!\n", target);
-        fprintf(stderr, "***\n***        Please modify the "
-#ifdef _MSC_VER
-                "MSVC project file "
-#else
-                "Makefile "
-#endif
-                "to select another target (e.g. sse2)\n***\n");
-        exit(1);
-    }
-}
-
-
 int main() {
-    ensureTargetISAIsSupported();
-    
    float *S = new float[N_OPTIONS];
    float *X = new float[N_OPTIONS];
    float *T = new float[N_OPTIONS];
--- a/examples/options/options.vcxproj
+++ b/examples/options/options.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -64,15 +64,19 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -81,6 +85,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <DisableSpecificWarnings>4305</DisableSpecificWarnings>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
@@ -97,6 +102,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <DisableSpecificWarnings>4305</DisableSpecificWarnings>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
@@ -115,6 +121,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <DisableSpecificWarnings>4305</DisableSpecificWarnings>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -134,6 +141,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <DisableSpecificWarnings>4305</DisableSpecificWarnings>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -151,18 +159,18 @@
  <ItemGroup>
    <CustomBuild Include="options.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
--- a/examples/rt/Makefile
+++ b/examples/rt/Makefile
@@ -1,20 +1,17 @@

 ARCH = $(shell uname)

-TASK_CXX=../tasks_pthreads.cpp
+TASK_CXX=../tasksys.cpp
 TASK_LIB=-lpthread
-
-ifeq ($(ARCH), Darwin)
-  TASK_CXX=../tasks_gcd.cpp
-  TASK_LIB=
-endif
-
 TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))

 CXX=g++
 CXXFLAGS=-Iobjs/ -O3 -Wall -m64
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
+ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx --arch=x86-64
+
+OBJS=objs/rt.o objs/rt_serial.o $(TASK_OBJ) objs/rt_ispc.o objs/rt_ispc_sse2.o \
+	objs/rt_ispc_sse4.o objs/rt_ispc_avx.o

 default: rt

@@ -26,8 +23,8 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ rt

-rt: dirs objs/rt.o objs/rt_serial.o objs/rt_ispc.o $(TASK_OBJ)
-	$(CXX) $(CXXFLAGS) -o $@ objs/rt.o objs/rt_ispc.o objs/rt_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
+rt: dirs $(OBJS)
+	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)

 objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@
@@ -37,5 +34,5 @@ objs/%.o: ../%.cpp

 objs/rt.o: objs/rt_ispc.h 

-objs/%_ispc.h objs/%_ispc.o: %.ispc
+objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
 	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/rt/rt.cpp
+++ b/examples/rt/rt.cpp
@@ -45,14 +45,14 @@
 #include <string.h>
 #include <sys/types.h>
 #include "../timing.h"
-#include "../cpuid.h"
 #include "rt_ispc.h"

 using namespace ispc;

 typedef unsigned int uint;

-extern void raytrace_serial(int width, int height, const float raster2camera[4][4], 
+extern void raytrace_serial(int width, int height, int baseWidth, int baseHeight,
+                            const float raster2camera[4][4], 
                            const float camera2world[4][4], float image[],
                            int id[], const LinearBVHNode nodes[],
                            const Triangle triangles[]);
@@ -95,45 +95,28 @@ static void writeImage(int *idImage, float *depthImage, int width, int height,
 }


-// Make sure that the vector ISA used during compilation is supported by
-// the processor.  The ISPC_TARGET_* macro is set in the ispc-generated
-// header file that we include above.
-static void
-ensureTargetISAIsSupported() {
-#if defined(ISPC_TARGET_SSE2)
-    bool isaSupported = CPUSupportsSSE2();
-    const char *target = "SSE2";
-#elif defined(ISPC_TARGET_SSE4)
-    bool isaSupported = CPUSupportsSSE4();
-    const char *target = "SSE4";
-#elif defined(ISPC_TARGET_AVX)
-    bool isaSupported = CPUSupportsAVX();
-    const char *target = "AVX";
-#else
-#error "Unknown ISPC_TARGET_* value"
-#endif
-    if (!isaSupported) {
-        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
-                "set, which isn't\n***        supported by this computer's CPU!\n", target);
-        fprintf(stderr, "***\n***        Please modify the "
-#ifdef _MSC_VER
-                "MSVC project file "
-#else
-                "Makefile "
-#endif
-                "to select another target (e.g. sse2)\n***\n");
+static void usage() {
+    fprintf(stderr, "rt [--scale=<factor>] <scene name base>\n");
    exit(1);
-    }
 }


 int main(int argc, char *argv[]) {
-    if (argc != 2) {
-        fprintf(stderr, "usage: rt <filename base>\n");
-        exit(1);
+    float scale = 1.f;
+    const char *filename = NULL;
+    for (int i = 1; i < argc; ++i) {
+        if (strncmp(argv[i], "--scale=", 8) == 0) {
+            scale = atof(argv[i] + 8);
+            if (scale == 0.f)
+                usage();
        }
-
-    ensureTargetISAIsSupported();
+        else if (filename != NULL)
+            usage();
+        else
+            filename = argv[i];
+    }
+    if (filename == NULL)
+        usage();

 #define READ(var, n)                                            \
    if (fread(&(var), sizeof(var), n, f) != (unsigned int)n) {  \
@@ -145,10 +128,10 @@ int main(int argc, char *argv[]) {
    // Read the camera specification information from the camera file
    //
    char fnbuf[1024];
-    sprintf(fnbuf, "%s.camera", argv[1]);
+    sprintf(fnbuf, "%s.camera", filename);
    FILE *f = fopen(fnbuf, "rb");
    if (!f) {
-        perror(argv[1]);
+        perror(fnbuf);
        return 1;
    }

@@ -156,20 +139,20 @@ int main(int argc, char *argv[]) {
    // Nothing fancy, and trouble if we run on a big-endian system, just
    // fread in the bits
    //
-    int width, height;
+    int baseWidth, baseHeight;
    float camera2world[4][4], raster2camera[4][4];
-    READ(width, 1);
-    READ(height, 1);
+    READ(baseWidth, 1);
+    READ(baseHeight, 1);
    READ(camera2world[0][0], 16);
    READ(raster2camera[0][0], 16);

    //
    // Read in the serialized BVH 
    //
-    sprintf(fnbuf, "%s.bvh", argv[1]);
+    sprintf(fnbuf, "%s.bvh", filename);
    f = fopen(fnbuf, "rb");
    if (!f) {
-        perror(argv[2]);
+        perror(fnbuf);
        return 1;
    }

@@ -185,12 +168,12 @@ int main(int argc, char *argv[]) {
        // of node, the total number of int it if a leaf node, etc.
        float b[6];
        READ(b[0], 6);
-        nodes[i].bounds[0].v[0] = b[0];
-        nodes[i].bounds[0].v[1] = b[1];
-        nodes[i].bounds[0].v[2] = b[2];
-        nodes[i].bounds[1].v[0] = b[3];
-        nodes[i].bounds[1].v[1] = b[4];
-        nodes[i].bounds[1].v[2] = b[5];
+        nodes[i].bounds[0][0] = b[0];
+        nodes[i].bounds[0][1] = b[1];
+        nodes[i].bounds[0][2] = b[2];
+        nodes[i].bounds[1][0] = b[3];
+        nodes[i].bounds[1][1] = b[4];
+        nodes[i].bounds[1][2] = b[5];
        READ(nodes[i].offset, 1);
        READ(nodes[i].nPrimitives, 1);
        READ(nodes[i].splitAxis, 1);
@@ -207,19 +190,19 @@ int main(int argc, char *argv[]) {
        READ(v[0], 9);
        float *vp = v;
        for (int j = 0; j < 3; ++j) {
-            triangles[i].p[j].v[0] = *vp++;
-            triangles[i].p[j].v[1] = *vp++;
-            triangles[i].p[j].v[2] = *vp++;
+            triangles[i].p[j][0] = *vp++;
+            triangles[i].p[j][1] = *vp++;
+            triangles[i].p[j][2] = *vp++;
        }
        // And create an object id
        triangles[i].id = i+1;
    }
    fclose(f);

-    // round image resolution up to multiple of 4 to make things easy for
+    // round image resolution up to multiple of 16 to make things easy for
    // the code that assigns pixels to ispc program instances
-    height = (height + 3) & ~3;
-    width = (width + 3) & ~3;
+    int height = (int(baseHeight * scale) + 0xf) & ~0xf;
+    int width = (int(baseWidth * scale) + 0xf) & ~0xf;

    // allocate images; one to hold hit object ids, one to hold depth to
    // the first interseciton
@@ -232,8 +215,8 @@ int main(int argc, char *argv[]) {
    double minTimeISPC = 1e30;
    for (int i = 0; i < 3; ++i) {
        reset_and_start_timer();
-        raytrace_ispc(width, height, raster2camera, camera2world, 
-                      image, id, nodes, triangles);
+        raytrace_ispc(width, height, baseWidth, baseHeight, raster2camera, 
+                      camera2world, image, id, nodes, triangles);
        double dt = get_elapsed_mcycles();
        minTimeISPC = std::min(dt, minTimeISPC);
    }
@@ -251,8 +234,8 @@ int main(int argc, char *argv[]) {
    double minTimeISPCtasks = 1e30;
    for (int i = 0; i < 3; ++i) {
        reset_and_start_timer();
-        raytrace_ispc_tasks(width, height, raster2camera, camera2world, 
-                            image, id, nodes, triangles);
+        raytrace_ispc_tasks(width, height, baseWidth, baseHeight, raster2camera,
+                            camera2world, image, id, nodes, triangles);
        double dt = get_elapsed_mcycles();
        minTimeISPCtasks = std::min(dt, minTimeISPCtasks);
    }
@@ -271,8 +254,8 @@ int main(int argc, char *argv[]) {
    double minTimeSerial = 1e30;
    for (int i = 0; i < 3; ++i) {
        reset_and_start_timer();
-        raytrace_serial(width, height, raster2camera, camera2world, 
-                        image, id, nodes, triangles);
+        raytrace_serial(width, height, baseWidth, baseHeight, raster2camera, 
+                        camera2world, image, id, nodes, triangles);
        double dt = get_elapsed_mcycles();
        minTimeSerial = std::min(dt, minTimeSerial);
    }
--- a/examples/rt/rt.ispc
+++ b/examples/rt/rt.ispc
@@ -43,12 +43,13 @@ struct Ray {
 };

 struct Triangle {
-    uniform float3 p[3];
+    uniform float p[3][4];
    uniform int id;
+    uniform int pad[3];
 };

 struct LinearBVHNode {
-    uniform float3 bounds[2];
+    uniform float bounds[2][3];
    uniform unsigned int offset;     // num primitives for leaf, second child for interior
    uniform unsigned int8 nPrimitives;
    uniform unsigned int8 splitAxis;
@@ -103,14 +104,16 @@ static void generateRay(uniform const float raster2camera[4][4],
 }


-static inline bool BBoxIntersect(const reference uniform float3 bounds[2], 
+static inline bool BBoxIntersect(const uniform float bounds[2][3], 
                                 const reference Ray ray) {
+    uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
+    uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
    float t0 = ray.mint, t1 = ray.maxt;

    // Check all three axis-aligned slabs.  Don't try to early out; it's
    // not worth the trouble
-    float3 tNear = (bounds[0] - ray.origin) * ray.invDir;
-    float3 tFar  = (bounds[1] - ray.origin) * ray.invDir;
+    float3 tNear = (bounds0 - ray.origin) * ray.invDir;
+    float3 tFar  = (bounds1 - ray.origin) * ray.invDir;
    if (tNear.x > tFar.x) {
        float tmp = tNear.x;
        tNear.x = tFar.x;
@@ -141,8 +144,11 @@ static inline bool BBoxIntersect(const reference uniform float3 bounds[2],


 static inline bool TriIntersect(const reference Triangle tri, reference Ray ray) {
-    uniform float3 e1 = tri.p[1] - tri.p[0];
-    uniform float3 e2 = tri.p[2] - tri.p[0];
+    uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
+    uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
+    uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
+    uniform float3 e1 = p1 - p0;
+    uniform float3 e2 = p2 - p0;

    float3 s1 = Cross(ray.dir, e2);
    float divisor = Dot(s1, e1);
@@ -153,7 +159,7 @@ static inline bool TriIntersect(const reference Triangle tri, reference Ray ray)
    float invDivisor = 1.f / divisor;

    // Compute first barycentric coordinate
-    float3 d = ray.origin - tri.p[0];
+    float3 d = ray.origin - p0;
    float b1 = Dot(d, s1) * invDivisor;
    if (b1 < 0. || b1 > 1.)
        hit = false;
@@ -227,12 +233,17 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],


 static void raytrace_tile(uniform int x0, uniform int x1,
-                          uniform int y0, uniform int y1, uniform int width,
+                          uniform int y0, uniform int y1, 
+                          uniform int width, uniform int height,
+                          uniform int baseWidth, uniform int baseHeight,
                          const uniform float raster2camera[4][4], 
                          const uniform float camera2world[4][4],
                          uniform float image[], uniform int id[],
                          const LinearBVHNode nodes[],
                          const Triangle triangles[]) {
+    uniform float widthScale = (float)(baseWidth) / (float)(width);
+    uniform float heightScale = (float)(baseHeight) / (float)(height);
+
    static const uniform float udx[16] = { 0, 1, 0, 1, 2, 3, 2, 3, 
                                           0, 1, 0, 1, 2, 3, 2, 3 };
    static const uniform float udy[16] = { 0, 0, 1, 1, 0, 0, 1, 1, 
@@ -252,7 +263,8 @@ static void raytrace_tile(uniform int x0, uniform int x1,
                const float dy = udy[o * programCount + programIndex];

                Ray ray;
-                generateRay(raster2camera, camera2world, x+dx, y+dy, ray);
+                generateRay(raster2camera, camera2world, (x+dx)*widthScale,
+                            (y+dy)*heightScale, ray);
                BVHIntersect(nodes, triangles, ray);

                int offset = (y + (int)dy) * width + (x + (int)dx);
@@ -265,42 +277,51 @@ static void raytrace_tile(uniform int x0, uniform int x1,


 export void raytrace_ispc(uniform int width, uniform int height,
+                          uniform int baseWidth, uniform int baseHeight,
                          const uniform float raster2camera[4][4], 
                          const uniform float camera2world[4][4],
                          uniform float image[], uniform int id[],
                          const LinearBVHNode nodes[],
                          const Triangle triangles[]) {
-    raytrace_tile(0, width, 0, height, width, raster2camera, camera2world, image,
+    raytrace_tile(0, width, 0, height, width, height, baseWidth, baseHeight,
+                  raster2camera, camera2world, image,
                  id, nodes, triangles);
 }


-task void raytrace_tile_task(uniform int x0, uniform int x1,
-                             uniform int y0, uniform int y1, uniform int width,
+task void raytrace_tile_task(uniform int width, uniform int height,
+                             uniform int baseWidth, uniform int baseHeight,
                             const uniform float raster2camera[4][4], 
                             const uniform float camera2world[4][4],
                             uniform float image[], uniform int id[],
                             const LinearBVHNode nodes[],
                             const Triangle triangles[]) {
-    raytrace_tile(x0, x1, y0, y1, width, raster2camera, camera2world, image,
+    uniform int dx = 16, dy = 16; // must match dx, dy below
+    uniform int xBuckets = (width + (dx-1)) / dx;
+    uniform int x0 = (taskIndex % xBuckets) * dx;
+    uniform int x1 = min(x0 + dx, width);
+    uniform int y0 = (taskIndex / xBuckets) * dy;
+    uniform int y1 = min(y0 + dy, height);
+                             
+    raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight, 
+                  raster2camera, camera2world, image,
                  id, nodes, triangles);
 }


 export void raytrace_ispc_tasks(uniform int width, uniform int height,
+                                uniform int baseWidth, uniform int baseHeight,
                                const uniform float raster2camera[4][4], 
                                const uniform float camera2world[4][4],
                                uniform float image[], uniform int id[],
                                const LinearBVHNode nodes[],
                                const Triangle triangles[]) {
    uniform int dx = 16, dy = 16;
-    for (uniform int y = 0; y < height; y += dy) {
-        uniform int y1 = min(y + dy, height);
-        for (uniform int x = 0; x < width; x += dx) {
-            uniform int x1 = min(x + dx, width);
-            launch < raytrace_tile_task(x, x1, y, y1, width, raster2camera, 
-                                        camera2world, image, id, nodes,
-                                        triangles) >;
-         }
-    }
+    uniform int xBuckets = (width + (dx-1)) / dx;
+    uniform int yBuckets = (height + (dy-1)) / dy;
+    uniform int nTasks = xBuckets * yBuckets;
+    launch[nTasks] < raytrace_tile_task(width, height, baseWidth, baseHeight, 
+                                        raster2camera, camera2world, 
+                                        image, id, nodes, triangles) >;
 }
+
--- a/examples/rt/rt.vcxproj
+++ b/examples/rt/rt.vcxproj
@@ -64,15 +64,19 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -81,6 +85,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -96,6 +101,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -113,6 +119,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -131,6 +138,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -144,27 +152,27 @@
    <CustomBuild Include="rt.ispc">
      <FileType>Document</FileType>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
 </Command>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
 </Command>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="rt.cpp" />
    <ClCompile Include="rt_serial.cpp" />
-    <ClCompile Include="../tasks_concrt.cpp" />
+    <ClCompile Include="../tasksys.cpp" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
--- a/examples/rt/rt_serial.cpp
+++ b/examples/rt/rt_serial.cpp
@@ -75,12 +75,13 @@ struct Ray {
 // Declare these in a namespace so the mangling matches
 namespace ispc {
    struct Triangle {
-        float3 p[3];
+        float p[3][4]; // extra float pad after each vertex
        int32_t id;
+        int32_t pad[3]; // make 16 x 32-bits
    };

    struct LinearBVHNode {
-        float3 bounds[2];
+        float bounds[2][3];
        int32_t offset;     // primitives for leaf, second child for interior
        uint8_t nPrimitives;
        uint8_t splitAxis;
@@ -140,12 +141,14 @@ static void generateRay(const float raster2camera[4][4],
 }


-static inline bool BBoxIntersect(const float3 bounds[2], 
+static inline bool BBoxIntersect(const float bounds[2][3], 
                                 const Ray &ray) {
+    float3 bounds0(bounds[0][0], bounds[0][1], bounds[0][2]);
+    float3 bounds1(bounds[1][0], bounds[1][1], bounds[1][2]);
    float t0 = ray.mint, t1 = ray.maxt;

-    float3 tNear = (bounds[0] - ray.origin) * ray.invDir;
-    float3 tFar  = (bounds[1] - ray.origin) * ray.invDir;
+    float3 tNear = (bounds0 - ray.origin) * ray.invDir;
+    float3 tFar  = (bounds1 - ray.origin) * ray.invDir;
    if (tNear.x > tFar.x) {
        float tmp = tNear.x;
        tNear.x = tFar.x;
@@ -176,8 +179,11 @@ static inline bool BBoxIntersect(const float3 bounds[2],


 inline bool TriIntersect(const Triangle &tri, Ray &ray) {
-    float3 e1 = tri.p[1] - tri.p[0];
-    float3 e2 = tri.p[2] - tri.p[0];
+    float3 p0(tri.p[0][0], tri.p[0][1], tri.p[0][2]);
+    float3 p1(tri.p[1][0], tri.p[1][1], tri.p[1][2]);
+    float3 p2(tri.p[2][0], tri.p[2][1], tri.p[2][2]);
+    float3 e1 = p1 - p0;
+    float3 e2 = p2 - p0;

    float3 s1 = Cross(ray.dir, e2);
    float divisor = Dot(s1, e1);
@@ -187,7 +193,7 @@ inline bool TriIntersect(const Triangle &tri, Ray &ray) {
    float invDivisor = 1.f / divisor;

    // Compute first barycentric coordinate
-    float3 d = ray.origin - tri.p[0];
+    float3 d = ray.origin - p0;
    float b1 = Dot(d, s1) * invDivisor;
    if (b1 < 0. || b1 > 1.)
        return false;
@@ -258,17 +264,21 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
 }


-void raytrace_serial(int width, int height,
+void raytrace_serial(int width, int height, int baseWidth, int baseHeight,
                     const float raster2camera[4][4], 
                     const float camera2world[4][4],
                     float image[],
                     int id[],
                     const LinearBVHNode nodes[],
                     const Triangle triangles[]) {
+    float widthScale = float(baseWidth) / float(width);
+    float heightScale = float(baseHeight) / float(height);
+
    for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x) {
                Ray ray;
-                generateRay(raster2camera, camera2world, x, y, ray);
+                generateRay(raster2camera, camera2world, x * widthScale,
+                            y * heightScale, ray);
                BVHIntersect(nodes, triangles, ray);

                int offset = y * width + x;
--- a/examples/simple/Makefile
+++ b/examples/simple/Makefile
@@ -2,7 +2,7 @@
 CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -O3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --arch=x86-64
+ISPCFLAGS=-O2 --arch=x86-64 --target=sse2

 default: simple

--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -33,47 +33,12 @@

 #include <stdio.h>
 #include <stdlib.h>
-#include "../cpuid.h"

 // Include the header file that the ispc compiler generates
 #include "simple_ispc.h"
 using namespace ispc;

-// Make sure that the vector ISA used during compilation is supported by
-// the processor.  The ISPC_TARGET_* macro is set in the ispc-generated
-// header file that we include above.
-static void
-ensureTargetISAIsSupported() {
-#if defined(ISPC_TARGET_SSE2)
-    bool isaSupported = CPUSupportsSSE2();
-    const char *target = "SSE2";
-#elif defined(ISPC_TARGET_SSE4)
-    bool isaSupported = CPUSupportsSSE4();
-    const char *target = "SSE4";
-#elif defined(ISPC_TARGET_AVX)
-    bool isaSupported = CPUSupportsAVX();
-    const char *target = "AVX";
-#else
-#error "Unknown ISPC_TARGET_* value"
-#endif
-    if (!isaSupported) {
-        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
-                "set, which isn't\n***        supported by this computer's CPU!\n", target);
-        fprintf(stderr, "***\n***        Please modify the "
-#ifdef _MSC_VER
-                "MSVC project file "
-#else
-                "Makefile "
-#endif
-                "to select another target (e.g. sse2)\n***\n");
-        exit(1);
-    }
-}
-
-
 int main() {
-    ensureTargetISAIsSupported();
-
    float vin[16], vout[16];

    // Initialize input buffer
--- a/examples/simple/simple.vcxproj
+++ b/examples/simple/simple.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -25,21 +25,21 @@
    <CustomBuild Include="simple.ispc">
      <FileType>Document</FileType>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2
 </Command>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-ispc -O2 %(Filename).ispco %(Filename).obj -h %(Filename)_ispc.h
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2
 </Command>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
    </CustomBuild>
  </ItemGroup>
  <PropertyGroup Label="Globals">
@@ -88,15 +88,19 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -105,6 +109,7 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -118,6 +123,7 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -133,6 +139,7 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -150,6 +157,7 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
--- a/examples/stencil/Makefile
+++ b/examples/stencil/Makefile
@@ -1,20 +1,18 @@

 ARCH = $(shell uname)

-TASK_CXX=../tasks_pthreads.cpp
+TASK_CXX=../tasksys.cpp
 TASK_LIB=-lpthread
-
-ifeq ($(ARCH), Darwin)
-  TASK_CXX=../tasks_gcd.cpp
-  TASK_LIB=
-endif
-
 TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))

 CXX=g++
 CXXFLAGS=-Iobjs/ -O3 -Wall -m64
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
+ISPCFLAGS=-O2 --target=sse2,sse4-x2,avx --arch=x86-64
+
+OBJS=objs/stencil.o objs/stencil_serial.o $(TASK_OBJ) objs/stencil_ispc.o \
+	objs/stencil_ispc_sse2.o objs/stencil_ispc_sse4.o \
+	objs/stencil_ispc_avx.o

 default: stencil

@@ -26,8 +24,8 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ stencil

-stencil: dirs objs/stencil.o objs/stencil_serial.o objs/stencil_ispc.o $(TASK_OBJ)
-	$(CXX) $(CXXFLAGS) -o $@ objs/stencil.o objs/stencil_ispc.o objs/stencil_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
+stencil: dirs $(OBJS)
+	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)

 objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@
@@ -37,5 +35,5 @@ objs/%.o: ../%.cpp

 objs/stencil.o: objs/stencil_ispc.h 

-objs/%_ispc.h objs/%_ispc.o: %.ispc
+objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc
 	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/stencil/stencil.cpp
+++ b/examples/stencil/stencil.cpp
@@ -42,43 +42,10 @@
 #include <algorithm>
 #include <math.h>
 #include "../timing.h"
-#include "../cpuid.h"
 #include "stencil_ispc.h"
 using namespace ispc;


-// Make sure that the vector ISA used during compilation is supported by
-// the processor.  The ISPC_TARGET_* macro is set in the ispc-generated
-// header file that we include above.
-static void
-ensureTargetISAIsSupported() {
-#if defined(ISPC_TARGET_SSE2)
-    bool isaSupported = CPUSupportsSSE2();
-    const char *target = "SSE2";
-#elif defined(ISPC_TARGET_SSE4)
-    bool isaSupported = CPUSupportsSSE4();
-    const char *target = "SSE4";
-#elif defined(ISPC_TARGET_AVX)
-    bool isaSupported = CPUSupportsAVX();
-    const char *target = "AVX";
-#else
-#error "Unknown ISPC_TARGET_* value"
-#endif
-    if (!isaSupported) {
-        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
-                "set, which isn't\n***        supported by this computer's CPU!\n", target);
-        fprintf(stderr, "***\n***        Please modify the "
-#ifdef _MSC_VER
-                "MSVC project file "
-#else
-                "Makefile "
-#endif
-                "to select another target (e.g. sse2)\n***\n");
-        exit(1);
-    }
-}
-
-
 extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
                                int y0, int y1, int z0, int z1,
                                int Nx, int Ny, int Nz,
@@ -100,8 +67,6 @@ void InitData(int Nx, int Ny, int Nz, float *A[2], float *vsq) {


 int main() {
-    ensureTargetISAIsSupported();
-
    int Nx = 256, Ny = 256, Nz = 256;
    int width = 4;
    float *Aserial[2], *Aispc[2];
--- a/examples/stencil/stencil.vcxproj
+++ b/examples/stencil/stencil.vcxproj
@@ -64,15 +64,19 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -81,6 +85,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -96,6 +101,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -113,6 +119,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -131,6 +138,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -144,27 +152,27 @@
    <CustomBuild Include="stencil.ispc">
      <FileType>Document</FileType>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
 </Command>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx
 </Command>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h
+ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="stencil.cpp" />
    <ClCompile Include="stencil_serial.cpp" />
-    <ClCompile Include="../tasks_concrt.cpp" />
+    <ClCompile Include="../tasksys.cpp" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
--- a/examples/taskinfo.h
+++ b/examples/taskinfo.h
@@ -1,180 +0,0 @@
-/*
-  Copyright (c) 2011, Intel Corporation
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions are
-  met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived from
-      this software without specific prior written permission.
-
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
-*/
-
-#ifndef TASKINFO_H
-#define TASKINFO_H 1
-
-#ifdef _MSC_VER
-#define ISPC_IS_WINDOWS
-#elif defined(__linux__)
-#define ISPC_IS_LINUX
-#elif defined(__APPLE__)
-#define ISPC_IS_APPLE
-#endif
-
-#ifdef ISPC_IS_WINDOWS
-#define NOMINMAX
-#include <windows.h>
-#include <concrt.h>
-using namespace Concurrency;
-#endif // ISPC_IS_WINDOWS
-
-#if (__SIZEOF_POINTER__ == 4) || defined(__i386__) || defined(_WIN32)
-#define ISPC_POINTER_BYTES 4
-#elif (__SIZEOF_POINTER__ == 8) || defined(__x86_64__) || defined(__amd64__) || defined(_WIN64)
-#define ISPC_POINTER_BYTES 8
-#else
-#error "Pointer size unknown!"
-#endif // __SIZEOF_POINTER__
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <assert.h>
-
-typedef struct TaskInfo {
-    void *func;
-    void *data;
-#if defined(ISPC_IS_WINDOWS)
-    event taskEvent;
-#endif
-} TaskInfo;
-
-
-#ifndef ISPC_IS_WINDOWS
-static int32_t 
-lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) {
-    int32_t result;
-    __asm__ __volatile__("lock\ncmpxchgl %2,%1"
-                          : "=a"(result), "=m"(*v)
-                          : "q"(newValue), "0"(oldValue)
-                          : "memory");
-    __asm__ __volatile__("mfence":::"memory");
-    return result;
-}
-#endif // !ISPC_IS_WINDOWS
-
-
-static void *
-lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
-#ifdef ISPC_IS_WINDOWS
-	return InterlockedCompareExchangePointer(v, newValue, oldValue);
-#else
-    void *result;
-#if (ISPC_POINTER_BYTES == 4)
-    __asm__ __volatile__("lock\ncmpxchgd %2,%1"
-                          : "=a"(result), "=m"(*v)
-                          : "q"(newValue), "0"(oldValue)
-                          : "memory");
-#else
-    __asm__ __volatile__("lock\ncmpxchgq %2,%1"
-                          : "=a"(result), "=m"(*v)
-                          : "q"(newValue), "0"(oldValue)
-                          : "memory");
-#endif // ISPC_POINTER_BYTES
-    __asm__ __volatile__("mfence":::"memory");
-    return result;
-#endif // ISPC_IS_WINDOWS
-}
-
-
-#ifndef ISPC_IS_WINDOWS
-static int32_t 
-lAtomicAdd32(volatile int32_t *v, int32_t delta) {
-    // Do atomic add with gcc x86 inline assembly
-    int32_t origValue;
-    __asm__ __volatile__("lock\n"
-                         "xaddl %0,%1"
-                         : "=r"(origValue), "=m"(*v) : "0"(delta)
-                         : "memory");
-    return origValue;
-}
-#endif
-
-#define LOG_TASK_QUEUE_CHUNK_SIZE 13
-#define MAX_TASK_QUEUE_CHUNKS 1024
-#define TASK_QUEUE_CHUNK_SIZE (1<<LOG_TASK_QUEUE_CHUNK_SIZE)
-
-#define MAX_LAUNCHED_TASKS (MAX_TASK_QUEUE_CHUNKS * TASK_QUEUE_CHUNK_SIZE)
-
-typedef void (*TaskFuncType)(void *, int, int);
-
-#ifdef ISPC_IS_WINDOWS
-static volatile LONG nextTaskInfoCoordinate;
-#else
-static volatile int nextTaskInfoCoordinate;
-#endif
-
-static TaskInfo *taskInfo[MAX_TASK_QUEUE_CHUNKS];
-
-static inline void
-lInitTaskInfo() {
-    taskInfo[0] = new TaskInfo[TASK_QUEUE_CHUNK_SIZE];
-}
-
-
-static inline TaskInfo *
-lGetTaskInfo() {
-#ifdef ISPC_IS_WINDOWS
-    int myCoord = InterlockedAdd(&nextTaskInfoCoordinate, 1)-1;
-#else
-    int myCoord = lAtomicAdd32(&nextTaskInfoCoordinate, 1);
-#endif
-	int index = (myCoord >> LOG_TASK_QUEUE_CHUNK_SIZE);
-    int offset = myCoord & (TASK_QUEUE_CHUNK_SIZE-1);
-    if (index == MAX_TASK_QUEUE_CHUNKS) {
-        fprintf(stderr, "A total of %d tasks have been launched--the simple "
-                "built-in task system can handle no more. Exiting.", myCoord);
-        exit(1);
-    }
-
-    if (taskInfo[index] == NULL) {
-        TaskInfo *newChunk = new TaskInfo[TASK_QUEUE_CHUNK_SIZE];
-        if (lAtomicCompareAndSwapPointer((void **)&taskInfo[index], newChunk, 
-                                         NULL) != NULL) {
-            // failure--someone else got it, but that's cool
-            assert(taskInfo[index] != NULL);
-            free(newChunk);
-        }
-    }
-
-    return &taskInfo[index][offset];
-}
-
-
-static inline void
-lResetTaskInfo() {
-    nextTaskInfoCoordinate = 0;
-}
-
-#endif // TASKINFO_H
--- a/examples/tasks_concrt.cpp
+++ b/examples/tasks_concrt.cpp
@@ -1,104 +0,0 @@
-/*
-  Copyright (c) 2011, Intel Corporation
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions are
-  met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived from
-      this software without specific prior written permission.
-
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
-*/
-
-#include "taskinfo.h"
-
-/* Simple task system implementation for ispc based on Microsoft's
-   Concurrency Runtime. */
-
-#include <windows.h>
-#include <concrt.h>
-using namespace Concurrency;
-#include <stdint.h>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <algorithm>
-
-// ispc expects these functions to have C linkage / not be mangled
-extern "C" { 
-    void ISPCLaunch(void *f, void *data);
-    void ISPCSync();
-    void *ISPCMalloc(int64_t size, int32_t alignment);
-    void ISPCFree(void *ptr);
-}
-
-
-void __cdecl
-lRunTask(LPVOID param) {
-    TaskInfo *ti = (TaskInfo *)param;
-    
-    // Actually run the task. 
-    // FIXME: like the GCD implementation for OS X, this is passing bogus
-    // values for the threadIndex and threadCount builtins, which in turn
-    // will cause bugs in code that uses those.
-    int threadIndex = 0;
-    int threadCount = 1;
-    TaskFuncType func = (TaskFuncType)ti->func;
-    func(ti->data, threadIndex, threadCount);
-
-    // Signal the event that this task is done
-    ti->taskEvent.set();
-}
-
-
-void
-ISPCLaunch(void *func, void *data) {
-    TaskInfo *ti = lGetTaskInfo();
-    ti->func = (TaskFuncType)func;
-    ti->data = data;
-	ti->taskEvent.reset();
-    CurrentScheduler::ScheduleTask(lRunTask, ti);
-}
-
-
-void ISPCSync() {
-    for (int i = 0; i < nextTaskInfoCoordinate; ++i) {
-		int index = (i >> LOG_TASK_QUEUE_CHUNK_SIZE);
-		int offset = i & (TASK_QUEUE_CHUNK_SIZE-1);
-		taskInfo[index][offset].taskEvent.wait();
-		taskInfo[index][offset].taskEvent.reset();
-    }
-
-    lResetTaskInfo();
-}
-
-
-void *ISPCMalloc(int64_t size, int32_t alignment) {
-    return _aligned_malloc(size, alignment);
-}
-
-
-void ISPCFree(void *ptr) {
-    _aligned_free(ptr);
-}
--- a/examples/tasks_gcd.cpp
+++ b/examples/tasks_gcd.cpp
@@ -1,99 +0,0 @@
-/*
-  Copyright (c) 2011, Intel Corporation
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions are
-  met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived from
-      this software without specific prior written permission.
-
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
-*/
-
-#include "taskinfo.h"
-
-/* A simple task system for ispc programs based on Apple's Grand Central
-   Dispatch. */
-#include <dispatch/dispatch.h>
-#include <stdio.h>
-
-static int initialized = 0;
-static volatile int32_t lock = 0;
-static dispatch_queue_t gcdQueue;
-static dispatch_group_t gcdGroup;
-
-// ispc expects these functions to have C linkage / not be mangled
-extern "C" { 
-    void ISPCLaunch(void *f, void *data);
-    void ISPCSync();
-}
-
-
-static void
-lRunTask(void *ti) {
-    TaskInfo *taskInfo = (TaskInfo *)ti;
-    // FIXME: these are bogus values; may cause bugs in code that depends
-    // on them having unique values in different threads.
-    int threadIndex = 0;
-    int threadCount = 1;
-    TaskFuncType func = (TaskFuncType)(taskInfo->func);
-
-    // Actually run the task
-    func(taskInfo->data, threadIndex, threadCount);
-}
-
-
-void ISPCLaunch(void *func, void *data) {
-    if (!initialized) {
-        while (1) {
-            if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {
-                if (!initialized) {
-                    gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
-                    gcdGroup = dispatch_group_create();
-                    lInitTaskInfo();
-                    __asm__ __volatile__("mfence":::"memory");
-                    initialized = 1;
-                }
-                lock = 0;
-                break;
-            }
-        }
-    }
-
-    TaskInfo *ti = lGetTaskInfo();
-    ti->func = func;
-    ti->data = data;
-    dispatch_group_async_f(gcdGroup, gcdQueue, ti, lRunTask);
-}
-
-
-void ISPCSync() {
-    if (!initialized)
-        return;
-
-    // Wait for all of the tasks in the group to complete before returning
-    dispatch_group_wait(gcdGroup, DISPATCH_TIME_FOREVER);
-
-    lResetTaskInfo();
-}
--- a/examples/tasks_pthreads.cpp
+++ b/examples/tasks_pthreads.cpp
@@ -1,294 +0,0 @@
-/*
-  Copyright (c) 2011, Intel Corporation
-  All rights reserved.
-
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions are
-  met:
-
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its
-      contributors may be used to endorse or promote products derived from
-      this software without specific prior written permission.
-
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
-*/
-
-#include "taskinfo.h"
-#include <pthread.h>
-#include <semaphore.h>
-#include <string.h>
-#include <unistd.h>
-#include <assert.h>
-#include <stdio.h>
-#include <fcntl.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/param.h>
-#include <sys/sysctl.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <errno.h>
-
-static int initialized = 0;
-static volatile int32_t lock = 0;
-
-static int nThreads;
-static pthread_t *threads;
-static pthread_mutex_t taskQueueMutex;
-static int nextTaskToRun;
-static sem_t *workerSemaphore;
-static uint32_t numUnfinishedTasks;
-static pthread_mutex_t tasksRunningConditionMutex;
-static pthread_cond_t tasksRunningCondition;
-
-// ispc expects these functions to have C linkage / not be mangled
-extern "C" { 
-    void ISPCLaunch(void *f, void *data);
-    void ISPCSync();
-}
-
-static void *lTaskEntry(void *arg);
-
-/** Figure out how many CPU cores there are in the system
- */
-static int
-lNumCPUCores() {
-    return sysconf(_SC_NPROCESSORS_ONLN);
-}
-
-
-static void
-lTasksInit() {
-    nThreads = lNumCPUCores();
-
-    threads = (pthread_t *)malloc(nThreads * sizeof(pthread_t));
-
-    int err;
-    if ((err = pthread_mutex_init(&taskQueueMutex, NULL)) != 0) {
-        fprintf(stderr, "Error creating mutex: %s\n", strerror(err));
-        exit(1);
-    }
-
-    char name[32];
-    sprintf(name, "ispc_task.%d", (int)getpid());
-    workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0);
-    if (!workerSemaphore) {
-        fprintf(stderr, "Error creating semaphore: %s\n", strerror(err));
-        exit(1);
-    }
-
-    if ((err = pthread_cond_init(&tasksRunningCondition, NULL)) != 0) {
-        fprintf(stderr, "Error creating condition variable: %s\n", strerror(err));
-        exit(1);
-    }
-
-    if ((err = pthread_mutex_init(&tasksRunningConditionMutex, NULL)) != 0) {
-        fprintf(stderr, "Error creating mutex: %s\n", strerror(err));
-        exit(1);
-    }
-
-    for (int i = 0; i < nThreads; ++i) {
-        err = pthread_create(&threads[i], NULL, &lTaskEntry, (void *)(i));
-        if (err != 0) {
-            fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err));
-            exit(1);
-        }
-    }
-}
-
-
-void
-ISPCLaunch(void *f, void *d) {
-    int err;
-
-    if (!initialized) {
-        while (1) {
-            if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {
-                if (!initialized) {
-                    lTasksInit();
-                    __asm__ __volatile__("mfence":::"memory");
-                    initialized = 1;
-                }
-                lock = 0;
-                break;
-            }
-        }
-    }
-
-    //
-    // Acquire mutex, add task
-    //
-    if ((err = pthread_mutex_lock(&taskQueueMutex)) != 0) {
-        fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
-        exit(1);
-    }
-
-    // Need a mutex here to ensure we get this filled in before a worker
-    // grabs it and starts running...
-    TaskInfo *ti = lGetTaskInfo();
-    ti->func = f;
-    ti->data = d;
-
-    if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
-        fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
-        exit(1);
-    }
-
-    //
-    // Update count of number of tasks left to run
-    //
-    if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) {
-        fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
-        exit(1);
-    }
-
-    // FIXME: is this redundant with nextTaskInfoCoordinate?
-    ++numUnfinishedTasks;
-
-    if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) {
-        fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
-        exit(1);
-    }
-
-    //
-    // Post to the worker semaphore to wake up worker threads that are
-    // sleeping waiting for tasks to show up
-    //
-    if ((err = sem_post(workerSemaphore)) != 0) {
-        fprintf(stderr, "Error from sem_post: %s\n", strerror(err));
-        exit(1);
-    }
-}
-
-
-static void *
-lTaskEntry(void *arg) {
-    int threadIndex = (int)((int64_t)arg);
-    int threadCount = nThreads;
-    TaskFuncType func;
-
-    while (1) {
-        int err;
-        if ((err = sem_wait(workerSemaphore)) != 0) {
-            fprintf(stderr, "Error from sem_wait: %s\n", strerror(err));
-            exit(1);
-        }
-
-        //
-        // Acquire mutex, get task
-        //
-        if ((err = pthread_mutex_lock(&taskQueueMutex)) != 0) {
-            fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
-            exit(1);
-        }
-
-        if (nextTaskToRun == nextTaskInfoCoordinate) {
-            //
-            // Task queue is empty, go back and wait on the semaphore
-            //
-            if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
-                fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
-                exit(1);
-            }
-            continue;
-        }
-
-        int runCoord = nextTaskToRun++;
-        int index = (runCoord >> LOG_TASK_QUEUE_CHUNK_SIZE);
-        int offset = runCoord & (TASK_QUEUE_CHUNK_SIZE-1);
-        TaskInfo *myTask = &taskInfo[index][offset];
-
-        if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) {
-            fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
-            exit(1);
-        }
-
-        //
-        // Do work for _myTask_
-        //
-        func = (TaskFuncType)myTask->func;
-        func(myTask->data, threadIndex, threadCount);
-
-        //
-        // Decrement the number of unfinished tasks counter
-        //
-        if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) {
-            fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
-            exit(1);
-        }
-
-        // FIXME: can this be a comparison of (nextTaskToRun == nextTaskInfoCoordinate)?
-        // (I don't think so--think there is a race...)
-        int unfinished = --numUnfinishedTasks;
-        if (unfinished == 0) {
-            //
-            // Signal the "no more tasks are running" condition if all of
-            // them are done.
-            //
-            int err;
-            if ((err = pthread_cond_signal(&tasksRunningCondition)) != 0) {
-                fprintf(stderr, "Error from pthread_cond_signal: %s\n", strerror(err));
-                exit(1);
-            }
-        }
-
-        if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) {
-            fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
-            exit(1);
-        }
-    }
-
-    pthread_exit(NULL);
-    return 0;
-}
-
-
-void ISPCSync() {
-    int err;
-    if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) {
-        fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
-        exit(1);
-    }
-
-    // As long as there are tasks running, wait on the condition variable;
-    // doing so causes this thread to go to sleep until someone signals on
-    // the tasksRunningCondition condition variable.
-    while (numUnfinishedTasks > 0) {
-        if ((err = pthread_cond_wait(&tasksRunningCondition, 
-                                     &tasksRunningConditionMutex)) != 0) {
-            fprintf(stderr, "Error from pthread_cond_wait: %s\n", strerror(err));
-            exit(1);
-        }
-    }
-    
-    lResetTaskInfo();
-    nextTaskToRun = 0;
-
-    // We acquire ownership of the condition variable mutex when the above
-    // pthread_cond_wait returns.
-    // FIXME: is there a lurking issue here if numUnfinishedTasks gets back
-    // to zero by the time we get to ISPCSync() and thence we're trying to
-    // unlock a mutex we don't have a lock on?
-    if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) {
-        fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
-        exit(1);
-    }
-}
--- a/examples/tasksys.cpp
+++ b/examples/tasksys.cpp
@@ -0,0 +1,865 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/*
+  This file implements simple task systems that provide the three
+  entrypoints used by ispc-generated code to handle 'launch' and 'sync'
+  statements in ispc programs.  See the section "Task Parallelism: Language
+  Syntax" in the ispc documentation for information about using task
+  parallelism in ispc programs, and see the section "Task Parallelism:
+  Runtime Requirements" for information about the task-related entrypoints
+  that are implemented here.
+
+  There are three task systems in this file: one built using Microsoft's
+  Concurrency Runtime, one built with Apple's Grand Central Dispatch, and
+  one built on top of bare pthreads.
+*/
+
+#if defined(_WIN32) || defined(_WIN64)
+  #define ISPC_IS_WINDOWS
+  #define ISPC_USE_CONCRT
+#elif defined(__linux__)
+  #define ISPC_IS_LINUX
+  #define ISPC_USE_PTHREADS
+#elif defined(__APPLE__)
+  #define ISPC_IS_APPLE
+  #define ISPC_USE_GCD
+#endif
+
+#define DBG(x) 
+
+#ifdef ISPC_IS_WINDOWS
+  #define NOMINMAX
+  #include <windows.h>
+#endif // ISPC_IS_WINDOWS
+#ifdef ISPC_USE_CONCRT
+  #include <concrt.h>
+  using namespace Concurrency;
+#endif // ISPC_USE_CONCRT
+#ifdef ISPC_USE_GCD
+  #include <dispatch/dispatch.h>
+  #include <pthread.h>
+#endif // ISPC_USE_GCD
+#ifdef ISPC_USE_PTHREADS
+  #include <pthread.h>
+  #include <semaphore.h>
+  #include <unistd.h>
+  #include <fcntl.h>
+  #include <errno.h>
+  #include <sys/types.h>
+  #include <sys/stat.h>
+  #include <sys/param.h>
+  #include <sys/sysctl.h>
+  #include <vector>
+  #include <algorithm>
+#endif // ISPC_USE_PTHREADS
+#ifdef ISPC_IS_LINUX
+  #include <malloc.h>
+#endif // ISPC_IS_LINUX
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <algorithm>
+
+// Signature of ispc-generated 'task' functions
+typedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount,
+                             int taskIndex, int taskCount);
+
+// Small structure used to hold the data for each task
+struct TaskInfo {
+    TaskFuncType func;
+    void *data;
+    int taskIndex, taskCount;
+#if defined(ISPC_IS_WINDOWS)
+    event taskEvent;
+#endif
+};
+
+///////////////////////////////////////////////////////////////////////////
+// TaskGroupBase
+
+#define LOG_TASK_QUEUE_CHUNK_SIZE 14
+#define MAX_TASK_QUEUE_CHUNKS 8
+#define TASK_QUEUE_CHUNK_SIZE (1<<LOG_TASK_QUEUE_CHUNK_SIZE)
+
+#define MAX_LAUNCHED_TASKS (MAX_TASK_QUEUE_CHUNKS * TASK_QUEUE_CHUNK_SIZE)
+
+#define NUM_MEM_BUFFERS 16
+
+class TaskGroup;
+
+/** The TaskGroupBase structure provides common functionality for "task
+    groups"; a task group is the set of tasks launched from within a single
+    ispc function.  When the function is ready to return, it waits for all
+    of the tasks in its task group to finish before it actually returns.
+ */
+class TaskGroupBase {
+public:
+    void Reset();
+
+    int AllocTaskInfo(int count);
+    TaskInfo *GetTaskInfo(int index);
+
+    void *AllocMemory(int64_t size, int32_t alignment);
+
+protected:
+    TaskGroupBase();
+    ~TaskGroupBase();
+
+    int nextTaskInfoIndex;
+
+private:
+    /* We allocate blocks of TASK_QUEUE_CHUNK_SIZE TaskInfo structures as
+       needed by the calling function.  We hold up to MAX_TASK_QUEUE_CHUNKS
+       of these (and then exit at runtime if more than this many tasks are
+       launched.)
+     */
+    TaskInfo *taskInfo[MAX_TASK_QUEUE_CHUNKS];
+
+    /* We also allocate chunks of memory to service ISPCAlloc() calls.  The
+       memBuffers[] array holds pointers to this memory.  The first element
+       of this array is initialized to point to mem and then any subsequent
+       elements required are initialized with dynamic allocation.
+     */
+    int curMemBuffer, curMemBufferOffset;
+    int memBufferSize[NUM_MEM_BUFFERS];
+    char *memBuffers[NUM_MEM_BUFFERS];
+    char mem[256];
+};
+
+
+inline TaskGroupBase::TaskGroupBase() { 
+    nextTaskInfoIndex = 0; 
+
+    curMemBuffer = 0; 
+    curMemBufferOffset = 0;
+    memBuffers[0] = mem;
+    memBufferSize[0] = sizeof(mem) / sizeof(mem[0]);
+    for (int i = 1; i < NUM_MEM_BUFFERS; ++i) {
+        memBuffers[i] = NULL;
+        memBufferSize[i] = 0;
+    }
+
+    for (int i = 0; i < MAX_TASK_QUEUE_CHUNKS; ++i)
+        taskInfo[i] = NULL;
+}
+
+// Free the heap buffers (memBuffers[0] aliases "mem"; skip it) and TaskInfo chunks.
+inline TaskGroupBase::~TaskGroupBase() {
+    for (int i = 1; i < NUM_MEM_BUFFERS; ++i)
+        delete[] memBuffers[i];
+    for (int i = 0; i < MAX_TASK_QUEUE_CHUNKS; ++i)
+        delete[] taskInfo[i];
+}
+
+// Rewinds the task index and the memory allocator; allocated storage is kept.
+inline void
+TaskGroupBase::Reset() {
+    nextTaskInfoIndex = 0; 
+    curMemBuffer = 0; 
+    curMemBufferOffset = 0;
+}
+
+// Reserves _count_ consecutive task indices and returns the first of them.
+inline int
+TaskGroupBase::AllocTaskInfo(int count) {
+    int ret = nextTaskInfoIndex;
+    nextTaskInfoIndex += count;
+    return ret;
+}
+
+// Returns the TaskInfo for task _index_, allocating its chunk on first use.
+inline TaskInfo *
+TaskGroupBase::GetTaskInfo(int index) {
+    int chunk = (index >> LOG_TASK_QUEUE_CHUNK_SIZE);
+    int offset = index & (TASK_QUEUE_CHUNK_SIZE-1);
+
+    if (chunk >= MAX_TASK_QUEUE_CHUNKS) {
+        fprintf(stderr, "A total of %d tasks have been launched from the "
+                "current function--the simple built-in task system can handle "
+                "no more. You can increase the values of TASK_QUEUE_CHUNK_SIZE "
+                "and LOG_TASK_QUEUE_CHUNK_SIZE to work around this limitation.  "
+                "Sorry!  Exiting.\n", index);
+        exit(1);
+    }
+
+    if (taskInfo[chunk] == NULL)
+        taskInfo[chunk] = new TaskInfo[TASK_QUEUE_CHUNK_SIZE];
+    return &taskInfo[chunk][offset];
+}
+
+// Returns _size_ bytes aligned to _alignment_, carved from the group's buffers.
+inline void *
+TaskGroupBase::AllocMemory(int64_t size, int32_t alignment) {
+    char *basePtr = memBuffers[curMemBuffer];
+    int64_t iptr = (int64_t)(basePtr + curMemBufferOffset);
+    // Round up; the mask only works when alignment is a power of two.
+    iptr = (iptr + (alignment-1)) & ~(alignment-1);
+    int newOffset = int(iptr + size - (int64_t)basePtr);
+    if (newOffset < memBufferSize[curMemBuffer]) {
+        curMemBufferOffset = newOffset;
+        return (char *)iptr;
+    }
+    ++curMemBuffer;  // the current buffer is full; move on to the next one
+    curMemBufferOffset = 0;
+    assert(curMemBuffer < NUM_MEM_BUFFERS);
+    // Reset() keeps buffers from earlier use: reuse a big enough one, else replace it.
+    int allocSize = std::max(int(size+alignment), 1 << (12 + curMemBuffer));
+    if (memBufferSize[curMemBuffer] < allocSize) {
+        delete[] memBuffers[curMemBuffer];
+        memBuffers[curMemBuffer] = new char[allocSize];
+        memBufferSize[curMemBuffer] = allocSize;
+    }
+    return AllocMemory(size, alignment);
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// Atomics and the like
+
+#ifndef ISPC_IS_WINDOWS
+static inline void
+lMemFence() {
+    __asm__ __volatile__("mfence":::"memory");
+}
+#endif // !ISPC_IS_WINDOWS
+
+// Pointer width; _WIN32 is also defined on 64-bit Windows, so exclude _WIN64.
+#if (__SIZEOF_POINTER__ == 4) || defined(__i386__) || (defined(_WIN32) && !defined(_WIN64))
+#define ISPC_POINTER_BYTES 4
+#elif (__SIZEOF_POINTER__ == 8) || defined(__x86_64__) || defined(__amd64__) || defined(_WIN64)
+#define ISPC_POINTER_BYTES 8
+#else
+#error "Pointer size unknown!"
+#endif // __SIZEOF_POINTER__
+
+// Atomically stores newValue into *v if *v equals oldValue; returns the prior *v.
+static void *
+lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
+#ifdef ISPC_IS_WINDOWS
+    return InterlockedCompareExchangePointer(v, newValue, oldValue);
+#else
+    void *result;
+#if (ISPC_POINTER_BYTES == 4)
+    __asm__ __volatile__("lock\ncmpxchgl %2,%1"
+                          : "=a"(result), "=m"(*v)
+                          : "q"(newValue), "0"(oldValue)
+                          : "memory");
+#else
+    __asm__ __volatile__("lock\ncmpxchgq %2,%1"
+                          : "=a"(result), "=m"(*v)
+                          : "q"(newValue), "0"(oldValue)
+                          : "memory");
+#endif // ISPC_POINTER_BYTES
+    lMemFence();
+    return result;
+#endif // ISPC_IS_WINDOWS
+}
+
+
+// Atomically stores newValue into *v if *v equals oldValue; returns the prior *v.
+#ifndef ISPC_IS_WINDOWS
+static int32_t 
+lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) {
+    int32_t result;
+    __asm__ __volatile__("lock\ncmpxchgl %2,%1"
+                          : "=a"(result), "=m"(*v)
+                          : "q"(newValue), "0"(oldValue)
+                          : "memory");
+    lMemFence();
+    return result;
+}
+#endif // !ISPC_IS_WINDOWS
+
+
+///////////////////////////////////////////////////////////////////////////
+
+#ifdef ISPC_USE_CONCRT
+// With ConcRT, we don't need to extend TaskGroupBase at all.
+class TaskGroup : public TaskGroupBase {
+public:
+    void Launch(int baseIndex, int count);
+    void Sync();
+};
+#endif // ISPC_USE_CONCRT
+
+#ifdef ISPC_USE_GCD
+/* With Grand Central Dispatch, we associate a GCD dispatch group with each
+   task group.  (We'll later wait on this dispatch group when we need to
+   wait on all of the tasks in the group to finish.)
+ */
+class TaskGroup : public TaskGroupBase {
+public:
+    TaskGroup() { gcdGroup = dispatch_group_create(); }
+    // Drop our reference to the dispatch group when the task group is deleted.
+    ~TaskGroup() { dispatch_release(gcdGroup); }
+
+    void Launch(int baseIndex, int count);
+    void Sync();
+
+private:
+    dispatch_group_t gcdGroup;
+};
+#endif // ISPC_USE_GCD
+
+#ifdef ISPC_USE_PTHREADS
+static void *lTaskEntry(void *arg);
+
+class TaskGroup : public TaskGroupBase {
+public:
+    TaskGroup() {
+        numUnfinishedTasks = 0;
+        waitingTasks.reserve(128);
+        inActiveList = false;
+    }
+
+    void Reset() {
+        TaskGroupBase::Reset();
+        numUnfinishedTasks = 0;
+        assert(inActiveList == false);
+        lMemFence();
+    }
+
+    void Launch(int baseIndex, int count);
+    void Sync();
+
+private:
+    friend void *lTaskEntry(void *arg);
+
+    int32_t numUnfinishedTasks;
+    int32_t pad[3];
+    std::vector<int> waitingTasks;
+    bool inActiveList;
+};
+
+#endif // ISPC_USE_PTHREADS
+
+
+///////////////////////////////////////////////////////////////////////////
+// Grand Central Dispatch
+
+#ifdef ISPC_USE_GCD
+
+/* A simple task system for ispc programs based on Apple's Grand Central
+   Dispatch. */
+
+static dispatch_queue_t gcdQueue;
+static volatile int32_t lock = 0;
+
+static void
+InitTaskSystem() {
+    if (gcdQueue != NULL)
+        return;
+
+    while (1) {
+        if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {
+            if (gcdQueue == NULL) {
+                gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
+                assert(gcdQueue != NULL);
+                lMemFence();
+            }
+            lock = 0;
+            break;
+        }
+    }
+}
+
+
+static void
+lRunTask(void *ti) {
+    TaskInfo *taskInfo = (TaskInfo *)ti;
+    // FIXME: these are bogus values; may cause bugs in code that depends
+    // on them having unique values in different threads.
+    int threadIndex = 0;
+    int threadCount = 1;
+
+    // Actually run the task
+    taskInfo->func(taskInfo->data, threadIndex, threadCount, 
+                   taskInfo->taskIndex, taskInfo->taskCount);
+}
+
+
+inline void
+TaskGroup::Launch(int baseIndex, int count) {
+    for (int i = 0; i < count; ++i) {
+        TaskInfo *ti = GetTaskInfo(baseIndex + i);
+        dispatch_group_async_f(gcdGroup, gcdQueue, ti, lRunTask);
+    }
+}
+
+
+inline void
+TaskGroup::Sync() {
+    dispatch_group_wait(gcdGroup, DISPATCH_TIME_FOREVER);
+}
+
+#endif // ISPC_USE_GCD
+
+///////////////////////////////////////////////////////////////////////////
+// Concurrency Runtime
+
+#ifdef ISPC_USE_CONCRT
+
+static void
+InitTaskSystem() {
+    // No initialization needed
+}
+
+
+static void __cdecl
+lRunTask(LPVOID param) {
+    TaskInfo *ti = (TaskInfo *)param;
+    
+    // Actually run the task. 
+    // FIXME: like the GCD implementation for OS X, this is passing bogus
+    // values for the threadIndex and threadCount builtins, which in turn
+    // will cause bugs in code that uses those.
+    int threadIndex = 0;
+    int threadCount = 1;
+    ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount);
+
+    // Signal the event that this task is done
+    ti->taskEvent.set();
+}
+
+
+inline void
+TaskGroup::Launch(int baseIndex, int count) {
+    for (int i = 0; i < count; ++i)
+        CurrentScheduler::ScheduleTask(lRunTask, GetTaskInfo(baseIndex + i));
+}
+
+
+inline void
+TaskGroup::Sync() {
+    for (int i = 0; i < nextTaskInfoIndex; ++i) {
+        TaskInfo *ti = GetTaskInfo(i);
+        ti->taskEvent.wait();
+        ti->taskEvent.reset();
+    }
+}
+
+#endif // ISPC_USE_CONCRT
+
+///////////////////////////////////////////////////////////////////////////
+// pthreads
+
+#ifdef ISPC_USE_PTHREADS
+
+static volatile int32_t lock = 0;
+
+static int nThreads;
+static pthread_t *threads = NULL;
+
+static pthread_mutex_t taskSysMutex;
+static std::vector<TaskGroup *> activeTaskGroups;
+static sem_t *workerSemaphore;
+
+// Atomically adds _delta_ to *v and returns the value *v held beforehand.
+static inline int32_t 
+lAtomicAdd(int32_t *v, int32_t delta) {
+    int32_t origValue;
+    __asm__ __volatile__("lock\n"
+                         "xaddl %0,%1"
+                         : "=r"(origValue), "=m"(*v) : "0"(delta)
+                         : "memory");
+    return origValue;
+}
+
+
+static void *
+lTaskEntry(void *arg) {
+    int threadIndex = (int)((int64_t)arg);
+    int threadCount = nThreads;
+
+    while (1) {
+        int err;
+        //
+        // Wait on the semaphore until we're woken up due to the arrival of
+        // more work.  (sem_wait() returns -1 and reports the cause in errno.)
+        //
+        if ((err = sem_wait(workerSemaphore)) != 0) {
+            fprintf(stderr, "Error from sem_wait: %s\n", strerror(errno));
+            exit(1);
+        }
+
+        //
+        // Acquire the mutex
+        //
+        if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) {
+            fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
+            exit(1);
+        }
+
+        if (activeTaskGroups.size() == 0) {
+            //
+            // Task queue is empty, go back and wait on the semaphore
+            //
+            if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
+                fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
+                exit(1);
+            }
+            continue;
+        }
+
+        //
+        // Get the last task group on the active list and the last task
+        // from its waiting tasks list.
+        //
+        TaskGroup *tg = activeTaskGroups.back();
+        assert(tg->waitingTasks.size() > 0);
+        int taskNumber = tg->waitingTasks.back();
+        tg->waitingTasks.pop_back();
+
+        if (tg->waitingTasks.size() == 0) {
+            // We just took the last task from this task group, so remove
+            // it from the active list.
+            activeTaskGroups.pop_back();
+            tg->inActiveList = false;
+        }
+    
+        if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
+            fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
+            exit(1);
+        }
+
+        //
+        // And now actually run the task
+        //
+        DBG(fprintf(stderr, "running task %d from group %p\n", taskNumber, tg));
+        TaskInfo *myTask = tg->GetTaskInfo(taskNumber);
+        myTask->func(myTask->data, threadIndex, threadCount, myTask->taskIndex,
+                     myTask->taskCount);
+
+        //
+        // Decrement the "number of unfinished tasks" counter in the task
+        // group.
+        //
+        lMemFence();
+        lAtomicAdd(&tg->numUnfinishedTasks, -1);
+    }
+
+    pthread_exit(NULL);
+    return 0;
+}
+
+
+static void
+InitTaskSystem() {
+    if (threads == NULL) {
+        while (1) {
+            if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) {
+                if (threads == NULL) {
+                    // We launch one fewer thread than there are cores,
+                    // since the main thread here will also grab jobs from
+                    // the task queue itself.
+                    nThreads = sysconf(_SC_NPROCESSORS_ONLN) - 1;
+
+                    int err;
+                    if ((err = pthread_mutex_init(&taskSysMutex, NULL)) != 0) {
+                        fprintf(stderr, "Error creating mutex: %s\n", strerror(err));
+                        exit(1);
+                    }
+
+                    char name[32];
+                    sprintf(name, "ispc_task.%d", (int)getpid());
+                    workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0);
+                    if (workerSemaphore == SEM_FAILED) {
+                        fprintf(stderr, "Error creating semaphore: %s\n", strerror(errno));
+                        exit(1);
+                    }
+
+                    activeTaskGroups.reserve(64);
+                    // Set _threads_ last: callers test it without holding the lock.
+                    threads = (pthread_t *)malloc(nThreads * sizeof(pthread_t));
+                    for (int i = 0; i < nThreads; ++i) {
+                        err = pthread_create(&threads[i], NULL, &lTaskEntry, (void *)(intptr_t)i);
+                        if (err != 0) {
+                            fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err));
+                            exit(1);
+                        }
+                    }
+                }
+
+                // Make sure all of the above goes to memory before we
+                // clear the lock.
+                lMemFence();
+                lock = 0;
+                break;
+            }
+        }
+    }
+}
+
+// Queues tasks [baseCoord, baseCoord+count) of this group and wakes the workers.
+inline void
+TaskGroup::Launch(int baseCoord, int count) {
+    //
+    // Acquire mutex, add task
+    //
+    int err;
+    if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) {
+        fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
+        exit(1);
+    }
+
+    // Add the corresponding set of tasks to the waiting-to-be-run list for
+    // this task group.
+    //
+    // FIXME: it's a little ugly to hold a global mutex for this when we
+    // only need to make sure no one else is accessing this task group's
+    // waitingTasks list.  (But a small experiment in switching to a
+    // per-TaskGroup mutex showed worse performance!)
+    for (int i = 0; i < count; ++i)
+        waitingTasks.push_back(baseCoord + i);
+
+    // Add the task group to the global active list if it isn't there
+    // already.
+    if (inActiveList == false) {
+        activeTaskGroups.push_back(this);
+        inActiveList = true;
+    }
+
+    if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
+        fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
+        exit(1);
+    }
+
+    //
+    // Update the count of the number of tasks left to run in this task
+    // group.
+    //
+    lMemFence();
+    lAtomicAdd(&numUnfinishedTasks, count);
+
+    //
+    // Post to the worker semaphore to wake up worker threads that are
+    // sleeping waiting for tasks to show up
+    //
+    for (int i = 0; i < count; ++i)
+        if ((err = sem_post(workerSemaphore)) != 0) {
+            fprintf(stderr, "Error from sem_post: %s\n", strerror(errno));
+            exit(1);
+        }
+}
+
+// Runs queued tasks on the calling thread until this group has none unfinished.
+inline void
+TaskGroup::Sync() {
+    DBG(fprintf(stderr, "syncing %p - %d unfinished\n", this, numUnfinishedTasks));
+
+    while (numUnfinishedTasks > 0) {
+        // All of the tasks in this group aren't finished yet.  We'll try
+        // to help out here since we don't have anything else to do...
+
+        DBG(fprintf(stderr, "while syncing %p - %d unfinished\n", this, 
+                    numUnfinishedTasks));
+
+        //
+        // Acquire the global task system mutex to grab a task to work on
+        //
+        int err;
+        if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) {
+            fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err));
+            exit(1);
+        }
+
+        TaskInfo *myTask = NULL;
+        TaskGroup *runtg = this;
+        if (waitingTasks.size() > 0) {
+            int taskNumber = waitingTasks.back();
+            waitingTasks.pop_back();
+
+            if (waitingTasks.size() == 0) {
+                // There's nothing left to start running from this group,
+                // so remove it from the active task list.
+                activeTaskGroups.erase(std::find(activeTaskGroups.begin(),
+                                                 activeTaskGroups.end(), this));
+                inActiveList = false;
+            }
+            myTask = GetTaskInfo(taskNumber);
+            DBG(fprintf(stderr, "running task %d from group %p in sync\n", taskNumber, this));
+        }
+        else {
+            // Other threads are already working on all of the tasks in
+            // this group, so we can't help out by running one ourself.
+            // We'll try to run one from another group to make ourselves
+            // useful here.
+            if (activeTaskGroups.size() == 0) {
+                // No active task groups left--there's nothing for us to do.
+                if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
+                    fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
+                    exit(1);
+                }
+                // FIXME: We basically end up busy-waiting here, which is
+                // extra wasteful in a world with hyperthreading.  It would
+                // be much better to put this thread to sleep on a
+                // condition variable that was signaled when the last task
+                // in this group was finished.
+                sleep(0);
+                continue;
+            }
+
+            // Get a task to run from another task group.
+            runtg = activeTaskGroups.back();
+            assert(runtg->waitingTasks.size() > 0);
+
+            int taskNumber = runtg->waitingTasks.back();
+            runtg->waitingTasks.pop_back();
+            if (runtg->waitingTasks.size() == 0) {
+                // There's nothing left to start running from this group, so
+                // remove it from the active task list.
+                activeTaskGroups.pop_back();
+                runtg->inActiveList = false;
+            }
+            myTask = runtg->GetTaskInfo(taskNumber);
+            DBG(fprintf(stderr, "running task %d from other group %p in sync\n", 
+                        taskNumber, runtg));
+        }
+
+        if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) {
+            fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err));
+            exit(1);
+        }
+    
+        //
+        // Do work for _myTask_
+        //
+        // FIXME: bogus values for thread index/thread count here as well..
+        myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount);
+
+        //
+        // Decrement the number of unfinished tasks counter
+        //
+        lMemFence();
+        lAtomicAdd(&runtg->numUnfinishedTasks, -1);
+    }
+    DBG(fprintf(stderr, "sync for %p done!\n", this));
+}
+
+#endif // ISPC_USE_PTHREADS
+
+///////////////////////////////////////////////////////////////////////////
+
+#define MAX_FREE_TASK_GROUPS 64
+static TaskGroup *freeTaskGroups[MAX_FREE_TASK_GROUPS];
+// Claims a TaskGroup from the free list, or allocates one if none can be claimed.
+static inline TaskGroup *
+AllocTaskGroup() {
+    for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) {
+        TaskGroup *tg = freeTaskGroups[i];
+        if (tg != NULL) {
+            void *ptr = lAtomicCompareAndSwapPointer((void **)(&freeTaskGroups[i]), NULL, tg);
+            if (ptr == tg) {
+                // The slot still held tg, so the swap succeeded and we own it.
+                return tg;
+            }
+        }
+    }
+
+    return new TaskGroup;
+}
+
+// Resets _tg_ and parks it in an empty free-list slot; deletes it if none is claimed.
+static inline void
+FreeTaskGroup(TaskGroup *tg) {
+    tg->Reset();
+
+    for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) {
+        if (freeTaskGroups[i] == NULL) {
+            void *ptr = lAtomicCompareAndSwapPointer((void **)&freeTaskGroups[i], tg, NULL);
+            if (ptr == NULL)  // the slot was still empty, so it now holds tg
+                return;
+        }
+    }
+
+    delete tg;
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+// ispc expects these functions to have C linkage / not be mangled
+extern "C" { 
+    void ISPCLaunch(void **handlePtr, void *f, void *data, int count);
+    void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
+    void ISPCSync(void *handle);
+}
+// Launches _count_ tasks running _func_ on _data_; creates the task group if *taskGroupPtr is NULL.
+void
+ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count) {
+    TaskGroup *taskGroup;
+    if (*taskGroupPtr == NULL) {
+        InitTaskSystem();
+        taskGroup = AllocTaskGroup();
+        *taskGroupPtr = taskGroup;
+    }
+    else
+        taskGroup = (TaskGroup *)(*taskGroupPtr);
+
+    // Fill in one TaskInfo per task before handing the range to Launch().
+    int baseIndex = taskGroup->AllocTaskInfo(count);
+    for (int i = 0; i < count; ++i) {
+        TaskInfo *ti = taskGroup->GetTaskInfo(baseIndex+i);
+        ti->func = (TaskFuncType)func;
+        ti->data = data;
+        ti->taskIndex = i;
+        ti->taskCount = count;
+    }
+    taskGroup->Launch(baseIndex, count);
+}
+
+// Waits on task group _h_ and then recycles it; does nothing when _h_ is NULL.
+void
+ISPCSync(void *h) {
+    TaskGroup *taskGroup = (TaskGroup *)h;
+    if (taskGroup != NULL) {
+        taskGroup->Sync();
+        FreeTaskGroup(taskGroup);
+    }
+}
+
+// Allocates _size_ bytes from the task group; creates the group if *taskGroupPtr is NULL.
+void *
+ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment) {
+    TaskGroup *taskGroup;
+    if (*taskGroupPtr == NULL) {
+        InitTaskSystem();
+        taskGroup = AllocTaskGroup();
+        *taskGroupPtr = taskGroup;
+    }
+    else
+        taskGroup = (TaskGroup *)(*taskGroupPtr);
+
+    return taskGroup->AllocMemory(size, alignment);
+}
--- a/examples/volume_rendering/Makefile
+++ b/examples/volume_rendering/Makefile
@@ -1,20 +1,17 @@

 ARCH = $(shell uname)

-TASK_CXX=../tasks_pthreads.cpp
+TASK_CXX=../tasksys.cpp
 TASK_LIB=-lpthread
-
-ifeq ($(ARCH), Darwin)
-  TASK_CXX=../tasks_gcd.cpp
-  TASK_LIB=
-endif
-
 TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))

 CXX=g++
 CXXFLAGS=-Iobjs/ -O3 -Wall -m64
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
+ISPCFLAGS=-O2 --target=sse2,sse4-x2 --arch=x86-64
+
+OBJS=objs/volume.o objs/volume_serial.o $(TASK_OBJ) objs/volume_ispc.o \
+	objs/volume_ispc_sse2.o objs/volume_ispc_sse4.o

 default: volume

@@ -26,8 +23,8 @@ dirs:
 clean:
 	/bin/rm -rf objs *~ volume

-volume: dirs objs/volume.o objs/volume_serial.o objs/volume_ispc.o $(TASK_OBJ)
-	$(CXX) $(CXXFLAGS) -o $@ objs/volume.o objs/volume_ispc.o objs/volume_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
+volume: dirs $(OBJS)
+	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASK_LIB)

 objs/%.o: %.cpp
 	$(CXX) $< $(CXXFLAGS) -c -o $@
@@ -37,5 +34,5 @@ objs/%.o: ../%.cpp

 objs/volume.o: objs/volume_ispc.h 

-objs/%_ispc.h objs/%_ispc.o: %.ispc
+objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o: %.ispc
 	$(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h
--- a/examples/volume_rendering/volume.cpp
+++ b/examples/volume_rendering/volume.cpp
@@ -41,7 +41,6 @@
 #include <stdio.h>
 #include <algorithm>
 #include "../timing.h"
-#include "../cpuid.h"
 #include "volume_ispc.h"
 using namespace ispc;

@@ -70,37 +69,6 @@ writePPM(float *buf, int width, int height, const char *fn) {
 }


-// Make sure that the vector ISA used during compilation is supported by
-// the processor.  The ISPC_TARGET_* macro is set in the ispc-generated
-// header file that we include above.
-static void
-ensureTargetISAIsSupported() {
-#if defined(ISPC_TARGET_SSE2)
-    bool isaSupported = CPUSupportsSSE2();
-    const char *target = "SSE2";
-#elif defined(ISPC_TARGET_SSE4)
-    bool isaSupported = CPUSupportsSSE4();
-    const char *target = "SSE4";
-#elif defined(ISPC_TARGET_AVX)
-    bool isaSupported = CPUSupportsAVX();
-    const char *target = "AVX";
-#else
-#error "Unknown ISPC_TARGET_* value"
-#endif
-    if (!isaSupported) {
-        fprintf(stderr, "***\n*** Error: the ispc-compiled code uses the %s instruction "
-                "set, which isn't\n***        supported by this computer's CPU!\n", target);
-        fprintf(stderr, "***\n***        Please modify the "
-#ifdef _MSC_VER
-                "MSVC project file "
-#else
-                "Makefile "
-#endif
-                "to select another target (e.g. sse2)\n***\n");
-        exit(1);
-    }
-}
-
 /* Load image and viewing parameters from a camera data file.
   FIXME: we should add support to be able to specify viewing parameters
   in the program here directly. */
@@ -172,8 +140,6 @@ int main(int argc, char *argv[]) {
        return 1;
    }

-    ensureTargetISAIsSupported();
-
    //
    // Load viewing data and the volume density data
    //
--- a/examples/volume_rendering/volume.ispc
+++ b/examples/volume_rendering/volume.ispc
@@ -343,11 +343,20 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1,


 task void
-volume_task(uniform int x0, uniform int y0, uniform int x1,
-            uniform int y1, uniform float density[], uniform int nVoxels[3], 
+volume_task(uniform float density[], uniform int nVoxels[3], 
            const uniform float raster2camera[4][4],
            const uniform float camera2world[4][4], 
            uniform int width, uniform int height, uniform float image[]) {
+    uniform int dx = 8, dy = 8; // must match value in volume_ispc_tasks
+    uniform int xbuckets = (width + (dx-1)) / dx;
+    uniform int ybuckets = (height + (dy-1)) / dy;
+
+    uniform int x0 = (taskIndex % xbuckets) * dx;
+    uniform int y0 = (taskIndex / xbuckets) * dy;
+    uniform int x1 = x0 + dx, y1 = y0 + dy;
+    x1 = min(x1, width);
+    y1 = min(y1, height);
+
    volume_tile(x0, y0, x1, y1, density, nVoxels, raster2camera,
                 camera2world, width, height, image);
 }
@@ -370,9 +379,7 @@ volume_ispc_tasks(uniform float density[], uniform int nVoxels[3],
                  uniform int width, uniform int height, uniform float image[]) {
    // Launch tasks to work on (dx,dy)-sized tiles of the image
    uniform int dx = 8, dy = 8;
-    for (uniform int y = 0; y < height; y += dy)
-        for (uniform int x = 0; x < width; x += dx)
-            launch < volume_task(x, y, x+dx, y+dy, density, nVoxels, 
-                                 raster2camera, camera2world, width, height, 
-                                 image) >;
+    uniform int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy);
+    launch[nTasks] < volume_task(density, nVoxels, raster2camera, camera2world, 
+                                 width, height, image) >;
 }
--- a/examples/volume_rendering/volume.vcxproj
+++ b/examples/volume_rendering/volume.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -64,15 +64,19 @@
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
+    <ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
@@ -81,6 +85,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -96,6 +101,7 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
@@ -113,6 +119,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -131,6 +138,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
@@ -143,23 +151,23 @@
  <ItemGroup>
    <ClCompile Include="volume.cpp" />
    <ClCompile Include="volume_serial.cpp" />
-    <ClCompile Include="../tasks_concrt.cpp" />
+    <ClCompile Include="../tasksys.cpp" />
  </ItemGroup>
  <ItemGroup>
    <CustomBuild Include="volume.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2
 </Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj;%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
    </CustomBuild>
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
--- a/examples/volume_rendering/volume_serial.cpp
+++ b/examples/volume_rendering/volume_serial.cpp
@@ -36,9 +36,6 @@
 #include <algorithm>

 // Just enough of a float3 class to do what we need in this file.
-#ifdef _MSC_VER
-__declspec(align(16)) 
-#endif
 struct float3 {
    float3() { }
    float3(float xx, float yy, float zz) { x = xx; y = yy; z = zz; }
@@ -298,7 +295,7 @@ volume_serial(float density[], int nVoxels[3], const float raster2camera[4][4],
    for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x, ++offset) {
            Ray ray;
-            generateRay(raster2camera, camera2world, x, y, ray);
+            generateRay(raster2camera, camera2world, (float)x, (float)y, ray);
            image[offset] = raymarch(density, nVoxels, ray);
        }
    }
--- a/expr.cpp
+++ b/expr.cpp
@@ -741,6 +741,12 @@ UnaryExpr::TypeCheck() {
 }


+int
+UnaryExpr::EstimateCost() const {
+    return (expr ? expr->EstimateCost() : 0) + COST_SIMPLE_ARITH_LOGIC_OP;
+}
+
+
 void
 UnaryExpr::Print() const {
    if (!expr || !GetType())
@@ -1183,10 +1189,10 @@ BinaryExpr::Optimize() {
                    m->symbolTable->LookupFunction("rcp");
                if (rcpFuns != NULL) {
                    assert(rcpFuns->size() == 2);
-                    Expr *rcpSymExpr = new FunctionSymbolExpr(rcpFuns, pos);
+                    Expr *rcpSymExpr = new FunctionSymbolExpr("rcp", rcpFuns, pos);
                    ExprList *args = new ExprList(arg1, arg1->pos);
                    Expr *rcpCall = new FunctionCallExpr(rcpSymExpr, args, 
-                                                         arg1->pos, false);
+                                                         arg1->pos);
                    rcpCall = rcpCall->TypeCheck();
                    if (rcpCall == NULL)
                        return NULL;
@@ -1299,6 +1305,17 @@ BinaryExpr::TypeCheck() {
    if (type0 == NULL || type1 == NULL)
        return NULL;

+    if (dynamic_cast<const ReferenceType *>(type0) != NULL) {
+        arg0 = new DereferenceExpr(arg0, arg0->pos);
+        type0 = arg0->GetType();
+        assert(type0 != NULL);
+    }
+    if (dynamic_cast<const ReferenceType *>(type1) != NULL) {
+        arg1 = new DereferenceExpr(arg1, arg1->pos);
+        type1 = arg1->GetType();
+        assert(type1 != NULL);
+    }
+
    switch (op) {
    case Shl:
    case Shr:
@@ -1445,6 +1462,15 @@ BinaryExpr::TypeCheck() {
 }


+int
+BinaryExpr::EstimateCost() const {
+    // Operand costs plus the operator: div/mod are charged as complex ops.
+    const bool isComplexOp = (op == Div || op == Mod);
+    return ((arg0 ? arg0->EstimateCost() : 0) + (arg1 ? arg1->EstimateCost() : 0) +
+            (isComplexOp ? COST_COMPLEX_ARITH_OP : COST_SIMPLE_ARITH_LOGIC_OP));
+}
+
+
 void
 BinaryExpr::Print() const {
    if (!arg0 || !arg1 || !GetType())
@@ -1471,7 +1497,7 @@ lStoreAssignResult(llvm::Value *rv, llvm::Value *lv, const Type *type,
    assert(baseSym->varyingCFDepth <= ctx->VaryingCFDepth());
    if (!g->opt.disableMaskedStoreToStore &&
        baseSym->varyingCFDepth == ctx->VaryingCFDepth() &&
-        baseSym->isStatic == false &&
+        baseSym->storageClass != SC_STATIC &&
        dynamic_cast<const ReferenceType *>(baseSym->type) == NULL) {
        // If the variable is declared at the same varying control flow
        // depth as where it's being assigned, then we don't need to do any
@@ -1696,6 +1722,20 @@ AssignExpr::TypeCheck() {
 }


+int
+AssignExpr::EstimateCost() const {
+    // Every assignment pays for both operands and the store itself; any op
+    // other than plain Assign additionally pays for its arithmetic.
+    int cost = (lvalue ? lvalue->EstimateCost() : 0);
+    cost += (rvalue ? rvalue->EstimateCost() : 0);
+    cost += COST_ASSIGN;
+    if (op != Assign)
+        cost += (op == DivAssign || op == ModAssign) ? COST_COMPLEX_ARITH_OP :
+                                                       COST_SIMPLE_ARITH_LOGIC_OP;
+    return cost;
+}
+
+
 void
 AssignExpr::Print() const {
    if (!lvalue || !rvalue || !GetType())
@@ -1944,6 +1984,12 @@ SelectExpr::TypeCheck() {
 }


+int
+SelectExpr::EstimateCost() const {
+    return COST_SELECT;
+}
+
+
 void
 SelectExpr::Print() const {
    if (!test || !expr1 || !expr2 || !GetType())
@@ -2167,7 +2213,7 @@ FunctionCallExpr::tryResolve(bool (*matchFunc)(Expr *, const Type *)) {


 void
-FunctionCallExpr::resolveFunctionOverloads() {
+FunctionCallExpr::resolveFunctionOverloads(bool exactMatchOnly) {
    FunctionSymbolExpr *fse = dynamic_cast<FunctionSymbolExpr *>(func);
    if (!fse) 
        // error will be issued later if not calling an actual function
@@ -2181,6 +2227,7 @@ FunctionCallExpr::resolveFunctionOverloads() {
    if (tryResolve(lExactMatch))
        return;

+    if (!exactMatchOnly) {
        // Try to find a single match ignoring references
        if (tryResolve(lMatchIgnoringReferences))
            return;
@@ -2201,73 +2248,34 @@ FunctionCallExpr::resolveFunctionOverloads() {
        // Last chance: try to find a match via arbitrary type conversion.
        if (tryResolve(lMatchWithTypeConv))
            return;
+    }

    // failure :-(
    const char *funName = fse->candidateFunctions->front()->name.c_str();
-    Error(pos, "Unable to find matching overload for call to function \"%s\".",
-          funName);
+    Error(pos, "Unable to find matching overload for call to function \"%s\"%s.",
+          funName, exactMatchOnly ? " only considering exact matches" : "");
    fprintf(stderr, "Candidates are:\n");
    lPrintFunctionOverloads(*fse->candidateFunctions);
    lPrintPassedTypes(funName, args->exprs);
 }


-FunctionCallExpr::FunctionCallExpr(Expr *f, ExprList *a, SourcePos p, bool il) 
-    : Expr(p) {
+FunctionCallExpr::FunctionCallExpr(Expr *f, ExprList *a, SourcePos p, 
+                                   bool il, Expr *lce) 
+    : Expr(p), isLaunch(il) {
    func = f;
    args = a;
-    isLaunch = il;
+    launchCountExpr = lce;

-    resolveFunctionOverloads();
-}
-
-
-/** Starting from the function initialFunction, we're calling into
-    calledFunc.  The question is: is this a recursive call back to
-    initialFunc?  If it definitely is or if it may be, then return true.
-    Return false if it definitely is not.
- */
-static bool
-lMayBeRecursiveCall(llvm::Function *calledFunc, 
-                    llvm::Function *initialFunc,
-                    std::set<llvm::Function *> &seenFuncs) {
-    // Easy case: intrinsics aren't going to call functions themselves
-    if (calledFunc->isIntrinsic())
-        return false;
-
-    std::string name = calledFunc->getName();
-    if (name.size() > 2 && name[0] == '_' && name[1] == '_')
-        // builtin stdlib function; none of these are recursive...
-        return false;
-
-    if (calledFunc->isDeclaration())
-        // There's visibility into what the called function does without a
-        // definition, so we have to be conservative
-        return true;
-
-    if (calledFunc == initialFunc)
-        // hello recursive call
-        return true;
-
-    // Otherwise iterate over all of the instructions in the function.  If
-    // any of them is a function call then check recursively..
-    llvm::inst_iterator iter;
-    for (iter = llvm::inst_begin(calledFunc); 
-         iter != llvm::inst_end(calledFunc); ++iter) {
-        llvm::Instruction *inst = &*iter;
-        llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst);
-        if (ci != NULL) {
-            llvm::Function *nextCalledFunc = ci->getCalledFunction();
-            // Don't repeatedly test functions we've seen before 
-            if (seenFuncs.find(nextCalledFunc) == seenFuncs.end()) {
-                seenFuncs.insert(nextCalledFunc);
-                if (lMayBeRecursiveCall(nextCalledFunc, initialFunc, 
-                                        seenFuncs))
-                    return true;
-            }
-        }
-    }
-    return false;
+    FunctionSymbolExpr *fse = dynamic_cast<FunctionSymbolExpr *>(func);
+    // Functions with names that start with "__" should only be various
+    // builtins.  For those, we'll demand an exact match, since we'll
+    // expect whichever function in stdlib.ispc is calling out to one of
+    // those to be matching the argument types exactly; this is to be a bit
+    // extra safe to be sure that the expected builtin is in fact being
+    // called.
+    bool exactMatchOnly = (fse != NULL) && (fse->name.substr(0,2) == "__");
+    resolveFunctionOverloads(exactMatchOnly);
 }


@@ -2391,47 +2399,18 @@ FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const {
        }
    }

-    // We sometimes need to check to see if the mask is all off here;
-    // specifically, if the mask is all off and we call a recursive
-    // function, then we will probably have an unsesirable infinite loop.
-    ctx->SetDebugPos(pos);
-    llvm::BasicBlock *bDoCall = ctx->CreateBasicBlock("funcall_mask_ok");
-    llvm::BasicBlock *bSkip = ctx->CreateBasicBlock("funcall_mask_off");
-    llvm::BasicBlock *bAfter = ctx->CreateBasicBlock("after_funcall");
-    llvm::Function *currentFunc = ctx->GetCurrentBasicBlock()->getParent();
-
-    // If we need to check the mask (it may be a recursive call, possibly
-    // transitively), or we're launching a task, which is expensive and
-    // thus probably always worth checking, then use the mask to choose
-    // whether to go to the bDoCallBlock or the bSkip block
-    std::set<llvm::Function *> seenFuncs;
-    seenFuncs.insert(currentFunc);
-    if (ft->isTask || lMayBeRecursiveCall(callee, currentFunc, seenFuncs)) {
-        Debug(pos, "Checking mask before function call \"%s\".", funSym->name.c_str());
-        ctx->BranchIfMaskAny(bDoCall, bSkip);
-    }
-    else
-        // If we don't need to check the mask, then always to the call;
-        // just jump to bDoCall
-        ctx->BranchInst(bDoCall);
-    
-    // And the bSkip block just jumps immediately to bAfter.  So why do we
-    // need it?  So the phi node below can easily tell what paths are
-    // going into it
-    ctx->SetCurrentBasicBlock(bSkip);
-    ctx->BranchInst(bAfter);
-
-    // Emit the code to do the function call
-    ctx->SetCurrentBasicBlock(bDoCall);
-
    llvm::Value *retVal = NULL;
    ctx->SetDebugPos(pos);
-    if (ft->isTask)
-        ctx->LaunchInst(callee, argVals);
+    if (ft->isTask) {
+        assert(launchCountExpr != NULL);
+        llvm::Value *launchCount = launchCountExpr->GetValue(ctx);
+        if (launchCount != NULL)
+            ctx->LaunchInst(callee, argVals, launchCount);
+    }
    else {
        // Most of the time, the mask is passed as the last argument.  this
-        // isn't the case for things like SSE intrinsics and extern "C"
-        // functions from the application.
+        // isn't the case for things like intrinsics, builtins, and extern
+        // "C" functions from the application.
        assert(callargs.size() + 1 == callee->arg_size() ||
               callargs.size() == callee->arg_size());

@@ -2458,22 +2437,10 @@ FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const {
        }
    }

-    // And jump out to the 'after funciton call' basic block
-    ctx->BranchInst(bAfter);
-    ctx->SetCurrentBasicBlock(bAfter);
-
    if (isVoidFunc)
        return NULL;
-
-    // The return value for the non-void case is either undefined or the
-    // function return value, depending on whether we actually ran the code
-    // path that called the function or not.
-    LLVM_TYPE_CONST llvm::Type *lrType = ft->GetReturnType()->LLVMType(g->ctx);
-    llvm::PHINode *ret = ctx->PhiNode(lrType, 2, "fun_ret");
-    assert(retVal != NULL);
-    ret->addIncoming(llvm::UndefValue::get(lrType), bSkip);
-    ret->addIncoming(retVal, bDoCall);
-    return ret;
+    else
+        return retVal;
 }


@@ -2515,10 +2482,21 @@ FunctionCallExpr::TypeCheck() {
                    if (!isLaunch)
                        Error(pos, "\"launch\" expression needed to call function "
                              "with \"task\" qualifier.");
+                    if (!launchCountExpr)
+                        return NULL;
+
+                    launchCountExpr = 
+                        launchCountExpr->TypeConv(AtomicType::UniformInt32,
+                                                  "task launch count");
+                    if (!launchCountExpr)
+                        return NULL;
                }
-                else if (isLaunch)
+                else {
+                    if (isLaunch)
                        Error(pos, "\"launch\" expression illegal with non-\"task\"-"
                              "qualified function.");
+                    assert(launchCountExpr == NULL);
+                }
            }
            else
                Error(pos, "Valid function name must be used for function call.");
@@ -2534,6 +2512,13 @@ FunctionCallExpr::TypeCheck() {
 }


+int
+FunctionCallExpr::EstimateCost() const {
+    int argCost = args ? args->EstimateCost() : 0;  // NULL args cost nothing
+    return argCost + (isLaunch ? COST_TASK_LAUNCH : COST_FUNCALL);
+}
+
+
 void
 FunctionCallExpr::Print() const {
    if (!func || !args || !GetType())
@@ -2622,7 +2607,7 @@ ExprList::GetConstant(const Type *type) const {
    }

    if (dynamic_cast<const StructType *>(type) != NULL) {
-#if defined(LLVM_2_8) || defined(LLVM_2_9)
+#if defined(LLVM_2_9)
        return llvm::ConstantStruct::get(*g->ctx, cv, false);
 #else
        LLVM_TYPE_CONST llvm::StructType *llvmStructType =
@@ -2645,6 +2630,17 @@ ExprList::GetConstant(const Type *type) const {
 }


+int
+ExprList::EstimateCost() const {
+    // Sum the estimated cost of every non-NULL expression in the list.
+    int total = 0;
+    std::vector<Expr *>::const_iterator iter;
+    for (iter = exprs.begin(); iter != exprs.end(); ++iter)
+        total += (*iter != NULL) ? (*iter)->EstimateCost() : 0;
+    return total;
+}
+
+
 void
 ExprList::Print() const {
    printf("expr list (");
@@ -2775,6 +2771,22 @@ IndexExpr::GetLValue(FunctionEmitContext *ctx) const {
    if (!basePtr)
        return NULL;

+    // If the array index is a compile time constant, check to see if it
+    // may lead to an out-of-bounds access.
+    ConstExpr *ce = dynamic_cast<ConstExpr *>(index);
+    const SequentialType *seqType = dynamic_cast<const SequentialType *>(type);
+    assert(seqType != NULL);
+    int nElements = seqType->GetElementCount();
+    if (ce != NULL && nElements > 0) {
+        int32_t indices[ISPC_MAX_NVEC];
+        int count = ce->AsInt32(indices);
+        for (int i = 0; i < count; ++i) {
+            if (indices[i] < 0 || indices[i] >= nElements)
+                Warning(index->pos, "Array index \"%d\" may be out of bounds for "
+                        "\"%d\" element array.", indices[i], nElements);
+        }
+    }
+
    basePtr = lCastUniformVectorBasePtr(basePtr, ctx);

    ctx->SetDebugPos(pos);
@@ -2827,6 +2839,16 @@ IndexExpr::TypeCheck() {
 }


+int
+IndexExpr::EstimateCost() const {
+    // Be pessimistic about varying indices; GetType() may be NULL on errors.
+    const Type *indexType = index ? index->GetType() : NULL;
+    if (indexType != NULL && indexType->IsVaryingType())
+        return COST_GATHER;
+    return COST_LOAD;
+}
+
+
 void
 IndexExpr::Print() const {
    if (!arrayOrVector || !index || !GetType())
@@ -3126,6 +3148,7 @@ MemberExpr::create(Expr *e, const char *id, SourcePos p, SourcePos idpos) {
    return new MemberExpr(e, id, p, idpos);
 }

+
 MemberExpr::MemberExpr(Expr *e, const char *id, SourcePos p, SourcePos idpos) 
    : Expr(p), identifierPos(idpos) {
    expr = e;
@@ -3222,6 +3245,14 @@ MemberExpr::Optimize() {
 }


+int
+MemberExpr::EstimateCost() const {
+    // FIXME: return gather cost when we can tell a gather is going to be
+    // needed
+    return COST_SIMPLE_ARITH_LOGIC_OP;
+}
+
+
 void
 MemberExpr::Print() const {
    if (!expr || !GetType())
@@ -4017,6 +4048,12 @@ ConstExpr::TypeCheck() {
 }


+int
+ConstExpr::EstimateCost() const {
+    return 0;
+}
+
+
 void
 ConstExpr::Print() const {
    printf("[%s] (", GetType()->GetString().c_str());
@@ -4103,7 +4140,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
        case AtomicType::TYPE_BOOL:
            if (fromType->IsVaryingType() && 
                LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
-                // If we have a bool vector of i32 element,s first truncate
+                // If we have a bool vector of i32 elements, first truncate
                // down to a single bit
                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
            // And then do an unisgned int->float cast
@@ -4163,9 +4200,6 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
        case AtomicType::TYPE_UINT16:
        case AtomicType::TYPE_UINT32:
        case AtomicType::TYPE_UINT64:
-            if (fromType->IsVaryingType())
-                PerformanceWarning(pos, "Conversion from unsigned int64 to float is slow. "
-                                   "Use \"int64\" if possible");
            cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int
                                 exprVal, targetType, "uint2double");
            break;
@@ -4937,6 +4971,13 @@ TypeCastExpr::Optimize() {
 }


+int
+TypeCastExpr::EstimateCost() const {
+    // FIXME: return COST_TYPECAST_COMPLEX when appropriate
+    return COST_TYPECAST_SIMPLE;
+}
+
+
 void
 TypeCastExpr::Print() const {
    printf("[%s] type cast (", GetType()->GetString().c_str());
@@ -5002,6 +5043,12 @@ ReferenceExpr::TypeCheck() {
 }


+int
+ReferenceExpr::EstimateCost() const {
+    return 0;
+}
+
+
 void
 ReferenceExpr::Print() const {
    if (expr == NULL || GetType() == NULL)
@@ -5080,6 +5127,12 @@ DereferenceExpr::Optimize() {
 }


+int
+DereferenceExpr::EstimateCost() const {
+    return COST_DEREF;
+}
+
+
 void
 DereferenceExpr::Print() const {
    if (expr == NULL || GetType() == NULL)
@@ -5151,6 +5204,15 @@ SymbolExpr::Optimize() {
 }


+int
+SymbolExpr::EstimateCost() const {
+    // Symbols with a known constant value are free; a missing symbol (which
+    // Print() also tolerates) and everything else is charged as a load.
+    if (symbol != NULL && symbol->constValue != NULL)
+        return 0;
+    return COST_LOAD;
+}
+
+
 void
 SymbolExpr::Print() const {
    if (symbol == NULL || GetType() == NULL)
@@ -5165,9 +5227,11 @@ SymbolExpr::Print() const {
 ///////////////////////////////////////////////////////////////////////////
 // FunctionSymbolExpr

-FunctionSymbolExpr::FunctionSymbolExpr(std::vector<Symbol *> *candidates,
+FunctionSymbolExpr::FunctionSymbolExpr(const char *n,
+                                       std::vector<Symbol *> *candidates,
                                       SourcePos p) 
  : Expr(p) {
+    name = n;
    matchingFunc = NULL;
    candidateFunctions = candidates;
 }
@@ -5204,6 +5268,12 @@ FunctionSymbolExpr::Optimize() {
 }


+int
+FunctionSymbolExpr::EstimateCost() const {
+    return 0;
+}
+
+
 void
 FunctionSymbolExpr::Print() const {
    if (!matchingFunc || !GetType())
@@ -5227,14 +5297,14 @@ SyncExpr::GetType() const {
 llvm::Value *
 SyncExpr::GetValue(FunctionEmitContext *ctx) const {
    ctx->SetDebugPos(pos);
-    std::vector<llvm::Value *> noArg;
-    llvm::Function *fsync = m->module->getFunction("ISPCSync");
-    if (fsync == NULL) {
-        FATAL("Couldn't find ISPCSync declaration?!");
+    ctx->SyncInst();
    return NULL;
-    }
+}

-    return ctx->CallInst(fsync, noArg, "");
+
+int
+SyncExpr::EstimateCost() const {
+    return COST_SYNC;
 }


--- a/expr.h
+++ b/expr.h
@@ -39,6 +39,7 @@
 #define ISPC_EXPR_H 1

 #include "ispc.h"
+#include "ast.h"
 #include "type.h"

 class FunctionSymbolExpr;
@@ -121,8 +122,8 @@ public:
    void Print() const;
    Expr *Optimize();
    Expr *TypeCheck();
+    int EstimateCost() const;

-private:
    const Op op;
    Expr *expr;
 };
@@ -164,8 +165,8 @@ public:

    Expr *Optimize();
    Expr *TypeCheck();
+    int EstimateCost() const;

-private:
    const Op op;
    Expr *arg0, *arg1;
 };
@@ -196,8 +197,8 @@ public:

    Expr *Optimize();
    Expr *TypeCheck();
+    int EstimateCost() const;

-private:
    const Op op;
    Expr *lvalue, *rvalue;
 };
@@ -217,8 +218,8 @@ public:

    Expr *Optimize();
    Expr *TypeCheck();
+    int EstimateCost() const;

-private:
    Expr *test, *expr1, *expr2;
 };

@@ -240,6 +241,7 @@ public:
    llvm::Constant *GetConstant(const Type *type) const;
    ExprList *Optimize();
    ExprList *TypeCheck();
+    int EstimateCost() const;

    std::vector<Expr *> exprs;
 };
@@ -249,7 +251,8 @@ public:
 */
 class FunctionCallExpr : public Expr {
 public:
-    FunctionCallExpr(Expr *func, ExprList *args, SourcePos p, bool isLaunch);
+    FunctionCallExpr(Expr *func, ExprList *args, SourcePos p, 
+                     bool isLaunch = false, Expr *launchCountExpr = NULL);

    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
    const Type *GetType() const;
@@ -257,13 +260,15 @@ public:

    Expr *Optimize();
    Expr *TypeCheck();
+    int EstimateCost() const;

-private:
    Expr *func;
    ExprList *args;
    bool isLaunch;
+    Expr *launchCountExpr;

-    void resolveFunctionOverloads();
+private:
+    void resolveFunctionOverloads(bool exactMatchOnly);
    bool tryResolve(bool (*matchFunc)(Expr *, const Type *));
 };

@@ -285,8 +290,8 @@ public:

    Expr *Optimize();
    Expr *TypeCheck();
+    int EstimateCost() const;

-private:
    Expr *arrayOrVector, *index;
 };

@@ -303,16 +308,17 @@ public:
    MemberExpr(Expr *expr, const char *identifier, SourcePos pos, 
               SourcePos identifierPos);

-    virtual llvm::Value *GetValue(FunctionEmitContext *ctx) const;
-    virtual llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
-    virtual const Type *GetType() const;
-    virtual Symbol *GetBaseSymbol() const;
-    virtual void Print() const;
-    virtual Expr *Optimize();
-    virtual Expr *TypeCheck();
+    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
+    llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
+    const Type *GetType() const;
+    Symbol *GetBaseSymbol() const;
+    void Print() const;
+    Expr *Optimize();
+    Expr *TypeCheck();
+    int EstimateCost() const;
+
    virtual int getElementNumber() const;

-protected:
    std::string getCandidateNearMatches() const;

    Expr *expr;
@@ -392,6 +398,7 @@ public:

    Expr *TypeCheck();
    Expr *Optimize();
+    int EstimateCost() const;

    /** Return the ConstExpr's values as booleans, doing type conversion
        from the actual type if needed.  If forceVarying is true, then type
@@ -495,8 +502,8 @@ public:
    void Print() const;
    Expr *TypeCheck();
    Expr *Optimize();
+    int EstimateCost() const;

-private:
    const Type *type;
    Expr *expr;
 };
@@ -514,8 +521,8 @@ public:
    void Print() const;
    Expr *TypeCheck();
    Expr *Optimize();
+    int EstimateCost() const;

-private:
    Expr *expr;
 };

@@ -533,8 +540,8 @@ public:
    void Print() const;
    Expr *TypeCheck();
    Expr *Optimize();
+    int EstimateCost() const;

-private:
    Expr *expr;
 };

@@ -551,6 +558,7 @@ public:
    Expr *TypeCheck();
    Expr *Optimize();
    void Print() const;
+    int EstimateCost() const;

 private:
    Symbol *symbol;
@@ -562,7 +570,7 @@ private:
 */    
 class FunctionSymbolExpr : public Expr {
 public:
-    FunctionSymbolExpr(std::vector<Symbol *> *candidateFunctions, 
+    FunctionSymbolExpr(const char *name, std::vector<Symbol *> *candidateFunctions,
                       SourcePos pos);

    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
@@ -571,10 +579,14 @@ public:
    Expr *TypeCheck();
    Expr *Optimize();
    void Print() const;
+    int EstimateCost() const;

 private:
    friend class FunctionCallExpr;

+    /** Name of the function that is being called. */
+    std::string name;
+
    /** All of the functions with the name given in the function call;
        there may be more then one, in which case we need to resolve which
        overload is the best match. */
@@ -597,6 +609,7 @@ public:
    Expr *TypeCheck();
    Expr *Optimize();
    void Print() const;
+    int EstimateCost() const;
 };

 #endif // ISPC_EXPR_H
--- a/func.cpp
+++ b/func.cpp
@@ -0,0 +1,643 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file func.cpp
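+    @brief Implementation of the Function class: function symbol setup and LLVM IR generation for function definitions.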
+*/
+
+#include "func.h"
+#include "ctx.h"
+#include "decl.h"
+#include "expr.h"
+#include "llvmutil.h"
+#include "module.h"
+#include "type.h"
+#include "stmt.h"
+#include "sym.h"
+#include "util.h"
+#include <stdio.h>
+
+#include <llvm/LLVMContext.h>
+#include <llvm/Module.h>
+#include <llvm/Type.h>
+#include <llvm/DerivedTypes.h>
+#include <llvm/Instructions.h>
+#include <llvm/Intrinsics.h>
+#include <llvm/PassManager.h>
+#include <llvm/PassRegistry.h>
+#include <llvm/Transforms/IPO.h>
+#include <llvm/Support/FormattedStream.h>
+#include <llvm/Support/FileUtilities.h>
+#include <llvm/Target/TargetMachine.h>
+#include <llvm/Target/TargetOptions.h>
+#include <llvm/Target/TargetData.h>
+#include <llvm/PassManager.h>
+#include <llvm/Analysis/Verifier.h>
+#include <llvm/Support/CFG.h>
+#include <llvm/Support/ToolOutputFile.h>
+#include <llvm/Assembly/PrintModulePass.h>
+
+Function::Function(DeclSpecs *ds, Declarator *decl, Stmt *c) {
+    // Builds a Function from its declaration and its (possibly NULL) body.
+    code = c;
+    maskSymbol = m->symbolTable->LookupVariable("__mask");
+    assert(maskSymbol != NULL);
+
+    if (code) {
+        code = code->TypeCheck();
+        if (code)
+            code = code->Optimize();
+    }
+
+    if (g->debugPrint) {
+        printf("Add Function\n");
+        ds->Print();
+        printf("\n");
+        decl->Print();
+        printf("\n");
+        if (code != NULL)  // no body was given, or type checking nulled it
+            code->Print(0);
+        printf("\n\n\n");
+    }
+
+    // Get the function's symbol from the symbol table.  (It should already
+    // have been added by AddGlobal() by the time we get here.)
+    type = dynamic_cast<const FunctionType *>(decl->GetType(ds));
+    assert(type != NULL);
+    sym = m->symbolTable->LookupFunction(decl->sym->name.c_str(), type);
+    assert(sym != NULL);
+    sym->pos = decl->pos;
+
+    isExported = (ds->storageClass == SC_EXPORT);
+
+    if (decl->functionArgs != NULL) {
+        for (unsigned int i = 0; i < decl->functionArgs->size(); ++i) {
+            Declaration *pdecl = (*decl->functionArgs)[i];
+            assert(pdecl->declarators.size() == 1);
+            args.push_back(pdecl->declarators[0]->sym);
+        }
+    }
+
+    if (type->isTask) {
+        threadIndexSym = m->symbolTable->LookupVariable("threadIndex");
+        assert(threadIndexSym);
+        threadCountSym = m->symbolTable->LookupVariable("threadCount");
+        assert(threadCountSym);
+        taskIndexSym = m->symbolTable->LookupVariable("taskIndex");
+        assert(taskIndexSym);
+        taskCountSym = m->symbolTable->LookupVariable("taskCount");
+        assert(taskCountSym);
+    }
+    else
+        threadIndexSym = threadCountSym = taskIndexSym = taskCountSym = NULL;
+}
+
+
+/** Reports whether a type is varying, either directly (judged on its base
+    type) or through any member of a struct type, searched recursively.
+*/
+static bool
+lRecursiveCheckVarying(const Type *t) {
+    const Type *baseType = t->GetBaseType();
+    if (baseType->IsVaryingType())
+        return true;
+    const StructType *structType = dynamic_cast<const StructType *>(baseType);
+    if (structType == NULL)
+        return false;  // not a struct: there are no members to look into
+    for (int member = 0; member < structType->GetElementCount(); ++member)
+        if (lRecursiveCheckVarying(structType->GetElementType(member)))
+            return true;
+    return false;
+}
+
+
+/** Issues an error if the given function parameter's type is varying or
+    contains varying members; the message distinguishes struct parameters
+    from all others.  (This should only be called for parameters of
+    'export'ed functions, where varying parameters are illegal.)
+ */
+static void
+lCheckForVaryingParameter(Symbol *sym) {
+    if (!lRecursiveCheckVarying(sym->type))
+        return;
+    const bool isStruct =
+        (dynamic_cast<const StructType *>(sym->type->GetBaseType()) != NULL);
+    if (isStruct)
+        Error(sym->pos, "Struct parameter \"%s\" with varying member(s) is illegal "
+              "in an exported function.", sym->name.c_str());
+    else
+        Error(sym->pos, "Varying parameter \"%s\" is illegal in an exported function.",
+              sym->name.c_str());
+}
+
+
+/** Issues a single error if any parameter of the given function type is a
+    StructType, since passing structs to or from application code is
+    currently broken.
+
+    @todo Fix passing structs from C/C++ to ispc functions.
+ */
+static void
+lCheckForStructParameters(const FunctionType *ftype, SourcePos pos) {
+    const std::vector<const Type *> &paramTypes = ftype->GetArgumentTypes();
+    std::vector<const Type *>::const_iterator iter = paramTypes.begin();
+    for (; iter != paramTypes.end(); ++iter) {
+        if (dynamic_cast<const StructType *>(*iter) == NULL)
+            continue;
+        Error(pos, "Passing structs to/from application functions is currently broken. "
+              "Use a reference or const reference instead for now.");
+        return;
+    }
+}
+
+
+/** Processes a function declaration: creates the corresponding
+    llvm::Function, performs various sanity checks, and adds the function's
+    symbol to the symbol table.  Returns that Symbol upon success (or the
+    previously-declared one for a matching extern "C" redeclaration), and
+    NULL if the declaration was rejected or is a redefinition.
+ */
+Symbol *
+Function::InitFunctionSymbol(DeclSpecs *ds, Declarator *decl) {
+    // Make sure that we've got what we expect here
+    Symbol *funSym = decl->sym;
+    assert(decl->isFunction);
+    assert(decl->arraySize.size() == 0);
+
+    // So far, so good.  Go ahead and set the type of the function symbol
+    funSym->type = decl->GetType(ds);
+
+    // If a global variable with the same name has already been declared
+    // issue an error.
+    if (m->symbolTable->LookupVariable(funSym->name.c_str()) != NULL) {
+        Error(decl->pos, "Function \"%s\" shadows previously-declared global variable. "
+              "Ignoring this definition.",
+              funSym->name.c_str());
+        return NULL;
+    }
+
+    if (ds->storageClass == SC_EXTERN_C) {
+        // Make sure the user hasn't supplied both an 'extern "C"' and a
+        // 'task' qualifier with the function
+        if (ds->typeQualifier & TYPEQUAL_TASK) {
+            Error(funSym->pos, "\"task\" qualifier is illegal with C-linkage extern "
+                  "function \"%s\".  Ignoring this function.", funSym->name.c_str());
+            return NULL;
+        }
+        std::vector<Symbol *> *funcs;
+        funcs = m->symbolTable->LookupFunction(decl->sym->name.c_str());
+        if (funcs != NULL) {
+            if (funcs->size() > 1) {
+                // Multiple functions with this name have already been declared; 
+                // can't overload here
+                Error(funSym->pos, "Can't overload extern \"C\" function \"%s\"; "
+                      "%d functions with the same name have already been declared.",
+                      funSym->name.c_str(), (int)funcs->size());
+                return NULL;
+            }
+
+            // One function with the same name has been declared; see if it
+            // has the same type as this one, in which case it's ok.
+            if (Type::Equal((*funcs)[0]->type, funSym->type))
+                return (*funcs)[0];
+            else {
+                Error(funSym->pos, "Can't overload extern \"C\" function \"%s\".",
+                      funSym->name.c_str());
+                return NULL;
+            }
+        }
+    }
+
+    // We should have gotten a FunctionType back from the GetType() call above.
+    const FunctionType *functionType = 
+        dynamic_cast<const FunctionType *>(funSym->type);
+    assert(functionType != NULL);
+
+    // Get the LLVM FunctionType
+    bool includeMask = (ds->storageClass != SC_EXTERN_C);
+    LLVM_TYPE_CONST llvm::FunctionType *llvmFunctionType = 
+        functionType->LLVMFunctionType(g->ctx, includeMask);
+    if (llvmFunctionType == NULL)
+        return NULL;
+
+    // And create the llvm::Function
+    llvm::GlobalValue::LinkageTypes linkage = ds->storageClass == SC_STATIC ?
+        llvm::GlobalValue::InternalLinkage : llvm::GlobalValue::ExternalLinkage;
+    std::string functionName = ((ds->storageClass == SC_EXTERN_C) ?
+                                funSym->name : funSym->MangledName());
+    if (g->mangleFunctionsWithTarget)
+        functionName += g->target.GetISAString();
+    llvm::Function *function = 
+        llvm::Function::Create(llvmFunctionType, linkage, functionName.c_str(), m->module);
+
+    // Set function attributes: we never throw exceptions, and want to
+    // inline everything we can
+    function->setDoesNotThrow(true);
+    if (!(ds->storageClass == SC_EXTERN_C) && !g->generateDebuggingSymbols &&
+        (ds->typeQualifier & TYPEQUAL_INLINE))
+        function->addFnAttr(llvm::Attribute::AlwaysInline);
+    if (functionType->isTask)
+        // This also applies transitively to members I think? 
+        function->setDoesNotAlias(1, true);
+
+    // Make sure that the return type isn't 'varying' if the function is
+    // 'export'ed.
+    if (ds->storageClass == SC_EXPORT && 
+        lRecursiveCheckVarying(functionType->GetReturnType()))
+        Error(decl->pos, "Illegal to return a \"varying\" type from exported function \"%s\"",
+              funSym->name.c_str());
+
+    if (functionType->isTask && (functionType->GetReturnType() != AtomicType::Void))
+        Error(funSym->pos, "Task-qualified functions must have void return type.");
+
+    if (functionType->isExported || functionType->isExternC)
+        lCheckForStructParameters(functionType, funSym->pos);
+
+    // Loop over all of the arguments; process default values if present
+    // and do other checks and parameter attribute setting.
+    bool seenDefaultArg = false;
+    std::vector<ConstExpr *> argDefaults;
+    int nArgs = decl->functionArgs ? decl->functionArgs->size() : 0;
+    for (int i = 0; i < nArgs; ++i) {
+        Declaration *pdecl = (*decl->functionArgs)[i];
+        assert(pdecl->declarators.size() == 1);
+        Symbol *sym = pdecl->declarators[0]->sym;
+
+        // If the function is exported, make sure that the parameter
+        // doesn't have any varying stuff going on in it.
+        if (ds->storageClass == SC_EXPORT)
+            lCheckForVaryingParameter(sym);
+
+        // ISPC assumes that all memory passed in is aligned to the native
+        // width and that no pointers alias.  (It should be possible to
+        // specify when this is not the case, but this should be the
+        // default.)  Set parameter attributes accordingly.
+        if (!functionType->isTask && dynamic_cast<const ReferenceType *>(sym->type) != NULL) {
+            // NOTE: LLVM indexes function parameters starting from 1.
+            // This is unintuitive.
+            function->setDoesNotAlias(i+1, true);
+            int align = 4 * RoundUpPow2(g->target.nativeVectorWidth);
+            function->addAttribute(i+1, llvm::Attribute::constructAlignmentFromInt(align));
+        }
+
+        if (m->symbolTable->LookupFunction(sym->name.c_str()) != NULL)
+            Warning(sym->pos, "Function parameter \"%s\" shadows a function "
+                    "declared in global scope.", sym->name.c_str());
+        
+        // See if a default argument value was provided with the parameter
+        Expr *defaultValue = pdecl->declarators[0]->initExpr;
+        if (defaultValue != NULL) {
+            // If we have one, make sure it's a compile-time constant
+            seenDefaultArg = true;
+            defaultValue = defaultValue->TypeCheck();
+            if (defaultValue != NULL)  // guard against a NULL TypeCheck() result
+                defaultValue = defaultValue->Optimize();
+            defaultValue = dynamic_cast<ConstExpr *>(defaultValue);
+            if (!defaultValue) {
+                Error(sym->pos, "Default value for parameter \"%s\" must be "
+                      "a compile-time constant.", sym->name.c_str());
+                return NULL;
+            }
+        }
+        else if (seenDefaultArg) {
+            // Once one parameter has provided a default value, then all of
+            // the following ones must have them as well.
+            Error(sym->pos, "Parameter \"%s\" is missing default: all parameters after "
+                  "the first parameter with a default value must have default values "
+                  "as well.", sym->name.c_str());
+        }
+
+        // Add the default value to argDefaults.  This is done for every
+        // parameter; a NULL entry is stored when no default was provided,
+        // so that the i'th entry of argDefaults always corresponds to the
+        // i'th parameter.
+        argDefaults.push_back(dynamic_cast<ConstExpr *>(defaultValue));
+    }
+
+    // And only now can we set the default values in the FunctionType
+    functionType->SetArgumentDefaults(argDefaults);
+
+    // If llvm gave us back a Function * with a different name than the one
+    // we asked for, then there's already a function with that same
+    // (mangled) name in the llvm::Module.  In that case, erase the one we
+    // tried to add and just work with the one it already had.
+    if (function->getName() != functionName) {
+        function->eraseFromParent();
+        function = m->module->getFunction(functionName);
+    }
+    funSym->function = function;
+
+    // But if that function has a definition, we don't want to redefine it.
+    if (!function->empty()) {
+        Warning(funSym->pos, "Ignoring redefinition of function \"%s\".", 
+                funSym->name.c_str());
+        return NULL;
+    }
+
+    // Finally, we know all is good and we can add the function to the
+    // symbol table
+    bool ok = m->symbolTable->AddFunction(funSym);
+    assert(ok);
+    return funSym;
+}
+
+
+const Type *
+Function::GetReturnType() const {  // return type recorded in the function's FunctionType
+    return type->GetReturnType();
+}
+
+
+const FunctionType *
+Function::GetType() const {  // the FunctionType resolved from the declaration in the constructor
+    return type;
+}
+
+
+/** Parameters for tasks are stored in a big structure; this utility
+    function emits code to copy the i'th value out of the task structure
+    into a local stack-allocated variable.  (We expect that LLVM's
+    'mem2reg' pass will in turn promote these to SSA registers.)
+ */
+static void
+lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const std::vector<Symbol *> &args,
+                     FunctionEmitContext *ctx) {
+    // We expect the argument structure to come in as a pointer to a
+    // structure.  Confirm and figure out its type here.
+    const llvm::Type *structArgType = structArgPtr->getType();
+    assert(llvm::isa<llvm::PointerType>(structArgType));
+    const llvm::PointerType *pt = llvm::dyn_cast<const llvm::PointerType>(structArgType);
+    assert(llvm::isa<llvm::StructType>(pt->getElementType()));
+    const llvm::StructType *argStructType = 
+        llvm::dyn_cast<const llvm::StructType>(pt->getElementType());
+
+    // Get the type of the argument we're copying in and its Symbol pointer
+    LLVM_TYPE_CONST llvm::Type *argType = argStructType->getElementType(i);
+    Symbol *sym = args[i];
+
+    // Allocate space to copy the parameter into
+    sym->storagePtr = ctx->AllocaInst(argType, sym->name.c_str());
+
+    // Get a pointer to the i'th value in the struct
+    llvm::Value *ptr = ctx->GetElementPtrInst(structArgPtr, 0, i, sym->name.c_str());
+
+    // Load the value from the struct, store it into the local alloca'ed
+    // memory, and emit debug info for the parameter
+    llvm::Value *ptrval = ctx->LoadInst(ptr, NULL, sym->name.c_str());
+    ctx->StoreInst(ptrval, sym->storagePtr);
+    ctx->EmitFunctionParameterDebugInfo(sym);
+}
+
+
+/** Given the statements implementing a function, emit the code that
+    implements the function into the given llvm::Function.  Most of the
+    work to be done here just involves wiring up the function parameter
+    values (and the execution mask) to be available in the body code.
+ */
+void 
+Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, 
+                   SourcePos firstStmtPos) {
+    llvm::Value *maskPtr = ctx->AllocaInst(LLVMTypes::MaskType, "mask_memory");
+    ctx->StoreInst(LLVMMaskAllOn, maskPtr);
+    maskSymbol->storagePtr = maskPtr;
+    ctx->SetMaskPointer(maskPtr);
+
+    // add debugging info for __mask, programIndex, ...
+    maskSymbol->pos = firstStmtPos;
+    ctx->EmitVariableDebugInfo(maskSymbol);
+
+#if 0
+    llvm::BasicBlock *entryBBlock = ctx->GetCurrentBasicBlock();
+#endif
+    if (type->isTask == true) {
+        // For tasks, there should always be five parameters: the pointer
+        // to the structure that holds all of the arguments, followed by
+        // the thread index, thread count, task index, and task count.
+        llvm::Function::arg_iterator argIter = function->arg_begin();
+        llvm::Value *structParamPtr = argIter++;
+        llvm::Value *threadIndex = argIter++;
+        llvm::Value *threadCount = argIter++;
+        llvm::Value *taskIndex = argIter++;
+        llvm::Value *taskCount = argIter++;
+
+        // Copy the function parameter values from the structure into local
+        // storage
+        for (unsigned int i = 0; i < args.size(); ++i)
+            lCopyInTaskParameter(i, structParamPtr, args, ctx);
+
+        // Copy in the mask as well.
+        int nArgs = (int)args.size();
+        // The mask is the last parameter in the argument structure
+        llvm::Value *ptr = ctx->GetElementPtrInst(structParamPtr, 0, nArgs,
+                                                  "task_struct_mask");
+        llvm::Value *ptrval = ctx->LoadInst(ptr, NULL, "mask");
+        ctx->SetEntryMask(ptrval);
+
+        // Copy threadIndex and threadCount into stack-allocated storage so
+        // that their symbols point to something reasonable.
+        threadIndexSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "threadIndex");
+        ctx->StoreInst(threadIndex, threadIndexSym->storagePtr);
+
+        threadCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "threadCount");
+        ctx->StoreInst(threadCount, threadCountSym->storagePtr);
+
+        // Copy taskIndex and taskCount into stack-allocated storage so
+        // that their symbols point to something reasonable.
+        taskIndexSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex");
+        ctx->StoreInst(taskIndex, taskIndexSym->storagePtr);
+
+        taskCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount");
+        ctx->StoreInst(taskCount, taskCountSym->storagePtr);
+    }
+    else {
+        // Regular, non-task function
+        llvm::Function::arg_iterator argIter = function->arg_begin(); 
+        for (unsigned int i = 0; i < args.size(); ++i, ++argIter) {
+            Symbol *sym = args[i];
+            argIter->setName(sym->name.c_str());
+
+            // Allocate stack storage for the parameter and emit code
+            // to store its value there.
+            sym->storagePtr = ctx->AllocaInst(argIter->getType(), sym->name.c_str());
+            ctx->StoreInst(argIter, sym->storagePtr);
+            ctx->EmitFunctionParameterDebugInfo(sym);
+        }
+
+        // If the number of actual function arguments equals the number of
+        // declared arguments, then we don't have a mask parameter, so set
+        // the entry mask to be all on.  This happens for example with
+        // 'export'ed functions that the app calls.
+        if (argIter == function->arg_end())
+            ctx->SetEntryMask(LLVMMaskAllOn);
+        else {
+            // Otherwise use the mask to set the entry mask value
+            argIter->setName("__mask");
+            assert(argIter->getType() == LLVMTypes::MaskType);
+            ctx->SetEntryMask(argIter);
+            ++argIter;  // kept out of the assert so it has no side effects
+            assert(argIter == function->arg_end());
+        }
+    }
+
+    // Finally, we can generate code for the function
+    if (code != NULL) {
+        int costEstimate = code->EstimateCost();
+        bool checkMask = (type->isTask == true) || 
+            ((function->hasFnAttr(llvm::Attribute::AlwaysInline) == false) &&
+             costEstimate > CHECK_MASK_AT_FUNCTION_START_COST);
+        Debug(code->pos, "Estimated cost for function \"%s\" = %d\n", 
+              sym->name.c_str(), costEstimate);
+        // If the body of the function is non-trivial, then we wrap the
+        // entire thing in a varying "cif (true)" test in order to reap the
+        // side-effect benefit of checking whether the execution mask is
+        // all on and having a specialized code path for that case.  (For a
+        // simple function this isn't worth the code bloat / overhead.)
+        // The wrapper is only held in a local: emitCode() runs a second
+        // time for exported functions and must not wrap the body twice.
+        Stmt *body = code;
+        if (checkMask) {
+            bool allTrue[ISPC_MAX_NVEC];
+            for (int i = 0; i < g->target.vectorWidth; ++i)
+                allTrue[i] = true;
+            Expr *trueExpr = new ConstExpr(AtomicType::VaryingBool, allTrue, code->pos);
+            body = new IfStmt(trueExpr, code, NULL, true, code->pos);
+        }
+        ctx->SetDebugPos(body->pos);
+        ctx->AddInstrumentationPoint("function entry");
+        body->EmitCode(ctx);
+    }
+
+    if (ctx->GetCurrentBasicBlock()) {
+        // FIXME: We'd like to issue a warning if we've reached the end of
+        // the function without a return statement (for non-void
+        // functions).  But the test below isn't right, since we can have
+        // (with 'x' a varying test) "if (x) return a; else return b;", in
+        // which case we have a valid basic block but it's unreachable so ok
+        // to not have return statement.
+#if 0
+        // If the bblock has no predecessors, then it doesn't matter if it
+        // doesn't have a return; it'll never be reached.  If it does,
+        // issue a warning.  Also need to warn if it's the entry block for
+        // the function (in which case it will not have predecessors but is
+        // still reachable.)
+        if (type->GetReturnType() != AtomicType::Void &&
+            (pred_begin(ec.bblock) != pred_end(ec.bblock) || (ec.bblock == entryBBlock)))
+            Warning(sym->pos, "Missing return statement in function returning \"%s\".",
+                    type->rType->GetString().c_str());
+#endif
+
+        // FIXME: would like to set the context's current position to
+        // e.g. the end of the function code
+
+        // if bblock is non-NULL, it hasn't been terminated by e.g. a
+        // return instruction.  Need to add a return instruction.
+        ctx->ReturnInst();
+    }
+}
+
+
+void
+Function::GenerateIR() {  // emits the function's IR, plus an app-callable copy if exported
+    llvm::Function *function = sym->function;
+    assert(function != NULL);
+
+    // Figure out a reasonable source file position for the start of the
+    // function body.  If possible, get the position of the first actual
+    // non-StmtList statement; otherwise fall back to the symbol's position.
+    SourcePos firstStmtPos = sym->pos;
+    if (code) {
+        StmtList *sl = dynamic_cast<StmtList *>(code);
+        if (sl && sl->GetStatements().size() > 0 && 
+            sl->GetStatements()[0] != NULL)
+            firstStmtPos = sl->GetStatements()[0]->pos;
+        else
+            firstStmtPos = code->pos;
+    }
+
+    // And we can now go ahead and emit the code (in its own scope)
+    {
+        FunctionEmitContext ec(this, sym, function, firstStmtPos);
+        emitCode(&ec, function, firstStmtPos);
+    }
+
+    if (m->errorCount == 0) {
+        // A true result is treated as a verification failure here
+        if (llvm::verifyFunction(*function, llvm::ReturnStatusAction) == true) {
+            if (g->debugPrint) {
+                llvm::PassManager ppm;
+                ppm.add(llvm::createPrintModulePass(&llvm::outs()));
+                ppm.run(*m->module);
+            }
+            FATAL("Function verificication failed");
+        }
+
+        // If the function is 'export'-qualified, emit a second version of
+        // it without a mask parameter and without name mangling so that
+        // the application can call it
+        if (isExported) {
+            if (!type->isTask) {
+                LLVM_TYPE_CONST llvm::FunctionType *ftype = 
+                    type->LLVMFunctionType(g->ctx);
+                llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::ExternalLinkage;
+                std::string functionName = sym->name;
+                if (g->mangleFunctionsWithTarget)
+                    functionName += std::string("_") + g->target.GetISAString();
+                llvm::Function *appFunction = 
+                    llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module);
+                appFunction->setDoesNotThrow(true);
+
+                if (appFunction->getName() != functionName) {
+                    // this was a redefinition for which we already emitted an
+                    // error, so don't worry about this one...
+                    appFunction->eraseFromParent();
+                }
+                else {
+                    // And emit the code again, this time into appFunction
+                    FunctionEmitContext ec(this, sym, appFunction, firstStmtPos);
+                    emitCode(&ec, appFunction, firstStmtPos);
+                    if (m->errorCount == 0) {
+                        sym->exportedFunction = appFunction;
+                        if (llvm::verifyFunction(*appFunction, 
+                                                 llvm::ReturnStatusAction) == true) {
+                            if (g->debugPrint) {
+                                llvm::PassManager ppm;
+                                ppm.add(llvm::createPrintModulePass(&llvm::outs()));
+                                ppm.run(*m->module);
+                            }
+                            FATAL("Function verificication failed");
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
--- a/func.h
+++ b/func.h
@@ -0,0 +1,70 @@
+/*
+  Copyright (c) 2011, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
+*/
+
+/** @file func.h
+    @brief Representation of a function in a source file.
+*/
+
+#ifndef ISPC_FUNC_H
+#define ISPC_FUNC_H 1
+
+#include "ispc.h"
+#include <vector>
+
+class Function {
+public:
+    Function(DeclSpecs *ds, Declarator *decl, Stmt *code);
+    /** Sets up the symbol and llvm::Function for a function declaration. */
+    static Symbol *InitFunctionSymbol(DeclSpecs *ds, Declarator *decl);
+
+    const Type *GetReturnType() const;
+    const FunctionType *GetType() const;
+
+    /** Generate LLVM IR for the function into the current module. */
+    void GenerateIR();
+
+private:
+    void emitCode(FunctionEmitContext *ctx, llvm::Function *function, 
+                  SourcePos firstStmtPos);
+    // The members below are all initialized by the constructor.
+    Symbol *sym;                     // the function's entry in the symbol table
+    const FunctionType *type;        // function type obtained from the declarator
+    std::vector<Symbol *> args;      // parameter symbols, in declaration order
+    Stmt *code;                      // type-checked, optimized body; may be NULL
+    bool isExported;                 // storage class is SC_EXPORT
+    Symbol *maskSymbol;              // the "__mask" variable
+    Symbol *threadIndexSym, *threadCountSym;  // task functions only; else NULL
+    Symbol *taskIndexSym, *taskCountSym;      // task functions only; else NULL
+};
+
+#endif // ISPC_FUNC_H
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -46,9 +46,7 @@
 #endif
 #include <llvm/LLVMContext.h>
 #include <llvm/Module.h>
-#ifndef LLVM_2_8
 #include <llvm/Analysis/DIBuilder.h>
-#endif
 #include <llvm/Analysis/DebugInfo.h>
 #include <llvm/Support/Dwarf.h>
 #include <llvm/Target/TargetMachine.h>
@@ -72,7 +70,7 @@ Module *m;

 bool
 Target::GetTarget(const char *arch, const char *cpu, const char *isa,
-                  Target *t) {
+                  bool pic, Target *t) {
    if (cpu == NULL) {
        std::string hostCPU = llvm::sys::getHostCPUName();
        if (hostCPU.size() > 0)
@@ -87,7 +85,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
    if (isa == NULL) {
        if (!strcasecmp(cpu, "atom"))
            isa = "sse2";
-#if defined(LLVM_3_0) || defined(LLVM_3_0_svn)
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
        else if (!strcasecmp(cpu, "sandybridge") ||
                 !strcasecmp(cpu, "corei7-avx"))
            isa = "avx";
@@ -100,6 +98,8 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,

    bool error = false;

+    t->generatePIC = pic;
+
    // Make sure the target architecture is a known one; print an error
    // with the valid ones otherwise.
    t->target = NULL;
@@ -135,7 +135,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
        t->vectorWidth = 4;
        t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov";
    }
-    else if (!strcasecmp(isa, "sse4x2")) {
+    else if (!strcasecmp(isa, "sse4x2") || !strcasecmp(isa, "sse4-x2")) {
        t->isa = Target::SSE4;
        t->nativeVectorWidth = 4;
        t->vectorWidth = 8;
@@ -174,7 +174,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
 const char *
 Target::SupportedTargetCPUs() {
    return "atom, barcelona, core2, corei7, "
-#if defined(LLVM_3_0) || defined(LLVM_3_0_svn)
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
        "corei7-avx, "
 #endif
        "istanbul, nocona, penryn, "
@@ -193,8 +193,8 @@ Target::SupportedTargetArchs() {

 const char *
 Target::SupportedTargetISAs() {
-    return "sse2, sse4, sse4x2"
-#if defined(LLVM_3_0) || defined(LLVM_3_0_svn)
+    return "sse2, sse4, sse4-x2"
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
        ", avx, avx-x2"
 #endif
        ;
@@ -228,14 +228,22 @@ llvm::TargetMachine *
 Target::GetTargetMachine() const {
    std::string triple = GetTripleString();

+    llvm::Reloc::Model relocModel = generatePIC ? llvm::Reloc::PIC_ : 
+                                                  llvm::Reloc::Default;
 #if defined(LLVM_3_0svn) || defined(LLVM_3_0)
    std::string featuresString = attributes;
    llvm::TargetMachine *targetMachine = 
-        target->createTargetMachine(triple, cpu, featuresString);
+        target->createTargetMachine(triple, cpu, featuresString, relocModel);
 #else
+#ifdef ISPC_IS_APPLE
+    relocModel = llvm::Reloc::PIC_;
+#endif // ISPC_IS_APPLE
    std::string featuresString = cpu + std::string(",") + attributes;
    llvm::TargetMachine *targetMachine = 
        target->createTargetMachine(triple, featuresString);
+#ifndef ISPC_IS_WINDOWS
+    targetMachine->setRelocationModel(relocModel);
+#endif // !ISPC_IS_WINDOWS
 #endif
    assert(targetMachine != NULL);

@@ -244,12 +252,31 @@ Target::GetTargetMachine() const {
 }


+const char *
+Target::GetISAString() const {
+    switch (isa) {
+    case Target::SSE2:
+        return "sse2";
+    case Target::SSE4:
+        return "sse4";
+    case Target::AVX:
+        return "avx";
+        break;
+    default:
+        FATAL("Unhandled target in GetISAString()");
+    }
+    return "";
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 // Opt

 Opt::Opt() {
    level = 1;
    fastMath = false;
+    fastMaskedVload = false;
+    unrollLoops = true;
    disableBlendedMaskedStores = false;
    disableCoherentControlFlow = false;
    disableUniformControlFlow = false;
@@ -273,6 +300,7 @@ Globals::Globals() {
    emitPerfWarnings = true;
    emitInstrumentation = false;
    generateDebuggingSymbols = false;
+    mangleFunctionsWithTarget = false;

    ctx = new llvm::LLVMContext;

@@ -283,12 +311,6 @@ Globals::Globals() {
 #endif
 }

-///////////////////////////////////////////////////////////////////////////
-// ASTNode
-
-ASTNode::~ASTNode() {
-}
-
 ///////////////////////////////////////////////////////////////////////////
 // SourcePos

@@ -299,13 +321,9 @@ SourcePos::SourcePos(const char *n, int l, int c) {
 }

 llvm::DIFile SourcePos::GetDIFile() const {
-#ifdef LLVM_2_8
-    return llvm::DIFile();
-#else
    std::string directory, filename;
    GetDirectoryAndFileName(g->currentDirectory, name, &directory, &filename);
    return m->diBuilder->createFile(filename, directory);
-#endif // LLVM_2_8
 }


--- a/ispc.h
+++ b/ispc.h
@@ -90,8 +90,8 @@ class Declarator;
 class FunctionEmitContext;
 class Expr;
 class ExprList;
+class Function;
 class FunctionType;
-class GatherBuffer;
 class Module;
 class Stmt;
 class Symbol;
@@ -124,35 +124,6 @@ struct SourcePos {
 };


-/** @brief Abstract base class for nodes in the abstract syntax tree (AST).
-
-    This class defines a basic interface that all abstract syntax tree
-    (AST) nodes must implement.  The base classes for both expressions
-    (Expr) and statements (Stmt) inherit from this class.
-*/
-class ASTNode {
-public:
-    ASTNode(SourcePos p) : pos(p) { }
-    virtual ~ASTNode();
-
-    /** The Optimize() method should perform any appropriate early-stage
-        optimizations on the node (e.g. constant folding).  The caller
-        should use the returned ASTNode * in place of the original node.
-        This method may return NULL if an error is encountered during
-        optimization. */
-    virtual ASTNode *Optimize() = 0;
-
-    /** Type checking should be performed by the node when this method is
-        called.  In the event of an error, a NULL value may be returned.
-        As with ASTNode::Optimize(), the caller should store the returned
-        pointer in place of the original ASTNode *. */
-    virtual ASTNode *TypeCheck() = 0;
-
-    /** All AST nodes must track the file position where they are
-        defined. */
-    const SourcePos pos;
-};
-
 /** @brief Structure that defines a compilation target 

    This structure defines a compilation target for the ispc compiler.
@@ -162,7 +133,7 @@ struct Target {
        name, if the name is a known target.  Returns true if the
        target was initialized and false if the name is unknown. */
    static bool GetTarget(const char *arch, const char *cpu, const char *isa,
-                          Target *);
+                          bool pic, Target *);

    /** Returns a comma-delimited string giving the names of the currently
        supported target ISAs. */
@@ -184,12 +155,19 @@ struct Target {
        target. */
    llvm::TargetMachine *GetTargetMachine() const;
    
+    /** Returns a string like "avx" encoding the target. */
+    const char *GetISAString() const;
+
    /** llvm Target object representing this target. */
    const llvm::Target *target;

    /** Enumerator giving the instruction sets that the compiler can
-        target. */
-    enum ISA { SSE2, SSE4, AVX };
+        target.  These should be ordered from "worse" to "better" in that
+        if a processor supports multiple target ISAs, then the most
+        flexible/performant of them will appear last in the enumerant.  Note
+        also that __best_available_isa() needs to be updated if ISAs are
+        added or the enumerant values are reordered.  */
+    enum ISA { SSE2, SSE4, AVX, NUM_ISAS };

    /** Instruction set being compiled to. */
    ISA isa;
@@ -215,8 +193,12 @@ struct Target {
        integer multiple of the native vector width, for example if we're
        "doubling up" and compiling 8-wide on a 4-wide SSE system. */
    int vectorWidth;
+
+    /** Indicates whether position independent code should be generated. */
+    bool generatePIC;
 };

+
 /** @brief Structure that collects optimization options

    This structure collects all of the options related to optimization of
@@ -234,6 +216,16 @@ struct Opt {
        should be performed.  This is false by default. */
    bool fastMath;

+    /** Indicates whether a vector load should be issued for masked loads
+        on platforms that don't have a native masked vector load.  (This may
+        lead to accessing memory up to programCount-1 elements past the end of
+        arrays, so is unsafe in general.) */
+    bool fastMaskedVload;
+
+    /** Indicates whether loops should be unrolled (when doing so seems like
+        it will make sense). */
+    bool unrollLoops;
+
    /** On targets that don't have a masked store instruction but do have a
        blending instruction, by default, we simulate masked stores by
        loading the old value, blending, and storing the result.  This can
@@ -339,6 +331,10 @@ struct Globals {
        program in its output. */
    bool generateDebuggingSymbols;
   
+    /** If true, function names are mangled by appending the target ISA and
+        vector width to them. */
+    bool mangleFunctionsWithTarget;
+
    /** Global LLVMContext object */
    llvm::LLVMContext *ctx;

@@ -351,6 +347,29 @@ struct Globals {
    std::vector<std::string> cppArgs;
 };

+enum {
+    COST_ASSIGN = 1,
+    COST_COHERENT_BREAK_CONTINE = 4,
+    COST_COMPLEX_ARITH_OP = 4,
+    COST_DEREF = 4,
+    COST_FUNCALL = 4,
+    COST_GATHER = 8,
+    COST_LOAD = 2,
+    COST_REGULAR_BREAK_CONTINUE = 2,
+    COST_RETURN = 4,
+    COST_SELECT = 4,
+    COST_SIMPLE_ARITH_LOGIC_OP = 1,
+    COST_SYNC = 32,
+    COST_TASK_LAUNCH = 16,
+    COST_TYPECAST_COMPLEX = 4,
+    COST_TYPECAST_SIMPLE = 1,
+    COST_UNIFORM_LOOP = 4,
+    COST_VARYING_LOOP = 6,
+
+    CHECK_MASK_AT_FUNCTION_START_COST = 16,
+    PREDICATE_SAFE_IF_STATEMENT_COST = 6,
+};
+
 extern Globals *g;
 extern Module *m;

--- a/ispc.vcxproj
+++ b/ispc.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -11,32 +11,43 @@
    </ProjectConfiguration>
  </ItemGroup>
  <ItemGroup>
+    <ClCompile Include="ast.cpp" />
    <ClCompile Include="builtins.cpp" />
    <ClCompile Include="ctx.cpp" />
    <ClCompile Include="decl.cpp" />
    <ClCompile Include="expr.cpp" />
+    <ClCompile Include="func.cpp" />
    <ClCompile Include="gen-bitcode-avx.cpp" />
    <ClCompile Include="gen-bitcode-avx-x2.cpp" />
    <ClCompile Include="gen-bitcode-c-32.cpp" />
    <ClCompile Include="gen-bitcode-c-64.cpp" />
+    <ClCompile Include="gen-bitcode-dispatch.cpp" />
    <ClCompile Include="gen-bitcode-sse2.cpp" />
    <ClCompile Include="gen-bitcode-sse4.cpp" />
-    <ClCompile Include="gen-bitcode-sse4x2.cpp" />
+    <ClCompile Include="gen-bitcode-sse4-x2.cpp" />
    <ClCompile Include="gen-stdlib.cpp" />
    <ClCompile Include="ispc.cpp" />
-    <ClCompile Include="lex.cc" />
+    <ClCompile Include="lex.cc">
+      <DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">4146;4800;4996;4355;4624;4005;4003;4018</DisableSpecificWarnings>
+      <DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">4146;4800;4996;4355;4624;4005;4003;4018</DisableSpecificWarnings>
+    </ClCompile>
    <ClCompile Include="llvmutil.cpp" />
    <ClCompile Include="module.cpp" />
    <ClCompile Include="main.cpp" />
    <ClCompile Include="opt.cpp" />
-    <ClCompile Include="parse.cc" />
+    <ClCompile Include="parse.cc">
+      <DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">4146;4800;4996;4355;4624;4005;4065</DisableSpecificWarnings>
+      <DisableSpecificWarnings Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">4146;4800;4996;4355;4624;4005;4065</DisableSpecificWarnings>
+    </ClCompile>
    <CustomBuild Include="builtins-c.c">
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c &gt; gen-bitcode-c-32.cpp</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c &gt; gen-bitcode-c-32.cpp;
+%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c &gt; gen-bitcode-c-64.cpp</Command>
      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clang builtins-c.c</Message>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c &gt; gen-bitcode-c-32.cpp</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c &gt; gen-bitcode-c-32.cpp;
+%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c &gt; gen-bitcode-c-64.cpp</Command>
      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clang builtins-c.c</Message>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-c.cpp</Outputs>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-c.cpp</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-c-32.cpp;gen-bitcode-c-64.cpp</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-c-32.cpp;gen-bitcode-c-64.cpp</Outputs>
    </CustomBuild>
    <ClCompile Include="stmt.cpp" />
    <ClCompile Include="sym.cpp" />
@@ -44,10 +55,12 @@
    <ClCompile Include="util.cpp" />
  </ItemGroup>
  <ItemGroup>
+    <ClInclude Include="ast.h" />
    <ClInclude Include="builtins.h" />
    <ClInclude Include="ctx.h" />
    <ClInclude Include="decl.h" />
    <ClInclude Include="expr.h" />
+    <ClInclude Include="func.h" />
    <ClInclude Include="ispc.h" />
    <ClInclude Include="llvmutil.h" />
    <ClInclude Include="module.h" />
@@ -61,9 +74,9 @@
  <ItemGroup>
    <CustomBuild Include="stdlib.ispc">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py &gt; gen-stdlib.cpp</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py &gt; gen-stdlib.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-stdlib.cpp</Outputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py &gt; gen-stdlib.cpp</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py &gt; gen-stdlib.cpp</Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-stdlib.cpp</Outputs>
      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-stdlib.cpp</Message>
      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-stdlib.cpp</Message>
@@ -83,16 +96,29 @@
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
-    <CustomBuild Include="builtins-sse4x2.ll">
+    <CustomBuild Include="builtins-dispatch.ll">
      <FileType>Document</FileType>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4x2.ll | python bitcode2cpp.py builtins-sse4x2.ll &gt; gen-bitcode-sse4x2.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4x2.cpp</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-dispatch.ll | python bitcode2cpp.py builtins-dispatch.ll &gt; gen-bitcode-dispatch.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-dispatch.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-dispatch.ll | python bitcode2cpp.py builtins-dispatch.ll &gt; gen-bitcode-dispatch.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-dispatch.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-dispatch.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-dispatch.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="builtins-sse4-x2.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll &gt; gen-bitcode-sse4-x2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-sse4-x2.cpp</Outputs>
      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4x2.ll | python bitcode2cpp.py builtins-sse4x2.ll &gt; gen-bitcode-sse4x2.cpp</Command>
-      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4x2.cpp</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll &gt; gen-bitcode-sse4-x2.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-sse4-x2.cpp</Outputs>
      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins.m4;builtins-sse.ll</AdditionalInputs>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4x2.cpp</Message>
-      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4x2.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-sse4-x2.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-sse4-x2.cpp</Message>
    </CustomBuild>
  </ItemGroup>
  <ItemGroup>
@@ -194,7 +220,7 @@
      <PrecompiledHeader>NotUsing</PrecompiledHeader>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>NOMINMAX;LLVM_2_9</PreprocessorDefinitions>
+      <PreprocessorDefinitions>NOMINMAX;LLVM_3_0</PreprocessorDefinitions>
      <AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
      <DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
    </ClCompile>
@@ -202,7 +228,7 @@
      <SubSystem>Console</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
-      <AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMDebugInfo.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCDisassembler.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
@@ -212,7 +238,7 @@
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>NOMINMAX;LLVM_2_9</PreprocessorDefinitions>
+      <PreprocessorDefinitions>NOMINMAX;LLVM_3_0</PreprocessorDefinitions>
      <AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
      <DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
    </ClCompile>
@@ -222,7 +248,7 @@
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
      <AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
-      <AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMDebugInfo.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCDisassembler.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
--- a/ispc_test.cpp
+++ b/ispc_test.cpp
@@ -33,12 +33,25 @@

 #define _CRT_SECURE_NO_WARNINGS

+#if defined(_WIN32) || defined(_WIN64)
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
 #ifdef ISPC_IS_WINDOWS
 #define NOMINMAX
 #include <windows.h>
 #endif
 #include <stdio.h>
 #include <stdint.h>
+#include <stdlib.h>
+#include <memory.h>
+#ifdef ISPC_IS_LINUX
+#include <malloc.h>
+#endif

 #ifdef ISPC_HAVE_SVML
 #include <xmmintrin.h>
@@ -62,7 +75,6 @@ extern "C" {
 #include <llvm/Instructions.h>
 #include <llvm/ExecutionEngine/ExecutionEngine.h>
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn)
-  #include <llvm/ExecutionEngine/MCJIT.h>
  #include <llvm/Support/TargetRegistry.h>
  #include <llvm/Support/TargetSelect.h>
 #else
@@ -81,40 +93,48 @@ extern "C" {
 #include <llvm/Support/raw_ostream.h>
 #include <llvm/Bitcode/ReaderWriter.h>
 #include <llvm/Support/MemoryBuffer.h>
-#ifndef LLVM_2_8
 #include <llvm/Support/system_error.h>
-#endif

 bool shouldFail = false;

 extern "C" { 
-    void ISPCLaunch(void *, void *);
-    void ISPCSync();
-    void *ISPCMalloc(int64_t size, int32_t alignment);
-    void ISPCFree(void *ptr);
+    void ISPCLaunch(void **, void *, void *, int32_t);
+    void ISPCSync(void *);
+    void *ISPCAlloc(void **, int64_t size, int32_t alignment);
 }

-void ISPCLaunch(void *func, void *data) {
-    typedef void (*TaskFuncType)(void *, int, int);
+void ISPCLaunch(void **handle, void *func, void *data, int32_t count) {
+    *handle = (void *)0xdeadbeef;
+    typedef void (*TaskFuncType)(void *, int, int, int, int);
    TaskFuncType tft = (TaskFuncType)(func);
-    tft(data, 0, 1);
+    for (int i = 0; i < count; ++i)
+        tft(data, 0, 1, i, count);
 }


-void ISPCSync() {
+void ISPCSync(void *) {
 }


+void *ISPCAlloc(void **handle, int64_t size, int32_t alignment) {
+    *handle = (void *)0xdeadbeef;
+    // leak time!
 #ifdef ISPC_IS_WINDOWS
-void *ISPCMalloc(int64_t size, int32_t alignment) {
-    return _aligned_malloc(size, alignment);
-}
-
-
-void ISPCFree(void *ptr) {
-    _aligned_free(ptr);
-}
+    return _aligned_malloc((size_t)size, alignment);
 #endif
+#ifdef ISPC_IS_LINUX
+    return memalign(alignment, size);
+#endif
+#ifdef ISPC_IS_APPLE
+    void *mem = malloc(size + alignment + sizeof(void*)); // pad below may be a full 'alignment'
+    char *amem = ((char*)mem) + sizeof(void*);
+    amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
+                                        (alignment - 1)));
+    ((void**)amem)[-1] = mem;
+    return amem;
+#endif
+}
+

 static void usage(int ret) {
    fprintf(stderr, "usage: ispc_test\n");
@@ -145,17 +165,6 @@ double Log(double x) { return log(x); }
 static bool lRunTest(const char *fn) {
    llvm::LLVMContext *ctx = new llvm::LLVMContext;

-#ifdef LLVM_2_8
-    std::string err;
-    llvm::MemoryBuffer *buf = llvm::MemoryBuffer::getFileOrSTDIN(fn, &err);
-    if (!buf) {
-        fprintf(stderr, "Unable to open file \"%s\": %s\n", fn, err.c_str());
-        delete ctx;
-        return false;
-    }
-    std::string bcErr;
-    llvm::Module *module = llvm::ParseBitcodeFile(buf, *ctx, &bcErr);
-#else
    llvm::OwningPtr<llvm::MemoryBuffer> buf;
    llvm::error_code err = llvm::MemoryBuffer::getFileOrSTDIN(fn, buf);
    if (err) {
@@ -165,7 +174,6 @@ static bool lRunTest(const char *fn) {
    }
    std::string bcErr;
    llvm::Module *module = llvm::ParseBitcodeFile(buf.get(), *ctx, &bcErr);
-#endif

    if (!module) {
        fprintf(stderr, "Bitcode reader failed for \"%s\": %s\n", fn, bcErr.c_str());
@@ -200,10 +208,7 @@ static bool lRunTest(const char *fn) {
        ee->addGlobalMapping(func, (void *)FUNC)
    DO_FUNC(ISPCLaunch, "ISPCLaunch");
    DO_FUNC(ISPCSync, "ISPCSync");
-#ifdef ISPC_IS_WINDOWS
-    DO_FUNC(ISPCMalloc, "ISPCMalloc");
-    DO_FUNC(ISPCFree, "ISPCFree");
-#endif // ISPC_IS_WINDOWS
+    DO_FUNC(ISPCAlloc, "ISPCAlloc");
    DO_FUNC(putchar, "putchar");
    DO_FUNC(printf, "printf");
    DO_FUNC(fflush, "fflush");
@@ -357,8 +362,6 @@ static bool lRunTest(const char *fn) {
 int main(int argc, char *argv[]) {
    llvm::InitializeNativeTarget();
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn)
-    llvm::InitializeAllTargetMCs();
-    LLVMLinkInMCJIT();
    LLVMLinkInJIT();
 #endif

--- a/ispc_test.vcxproj
+++ b/ispc_test.vcxproj
@@ -52,14 +52,15 @@
      </PrecompiledHeader>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>ISPC_IS_WINDOWS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>LLVM_3_0;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
+      <DisableSpecificWarnings>4146;4355;4800</DisableSpecificWarnings>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)/lib</AdditionalLibraryDirectories>
-      <AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Desc.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
@@ -70,8 +71,9 @@
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>ISPC_IS_WINDOWS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>LLVM_3_0;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
+      <DisableSpecificWarnings>4146;4355;4800</DisableSpecificWarnings>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -79,7 +81,7 @@
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
      <AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)/lib</AdditionalLibraryDirectories>
-      <AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Desc.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
--- a/main.cpp
+++ b/main.cpp
@@ -40,11 +40,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <llvm/Support/PrettyStackTrace.h>
-#ifdef LLVM_2_8
-  #include <llvm/System/Signals.h>
-#else
-  #include <llvm/Support/Signals.h>
-#endif
+#include <llvm/Support/Signals.h>
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn)
  #include <llvm/Support/TargetRegistry.h>
  #include <llvm/Support/TargetSelect.h>
@@ -73,7 +69,6 @@ static void usage(int ret) {
    printf("    [--emit-asm]\t\t\tGenerate assembly language file as output\n");
    printf("    [--emit-llvm]\t\t\tEmit LLVM bitode file as output\n");
    printf("    [--emit-obj]\t\t\tGenerate object file file as output (default)\n");
-    printf("    [--fast-math]\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n");
    printf("    [-g]\t\t\t\tGenerate debugging information\n");
    printf("    [--help]\t\t\t\tPrint help\n");
    printf("    [-h <name>/--header-outfile=<name>]\tOutput filename for header\n");
@@ -87,8 +82,11 @@ static void usage(int ret) {
    printf("    [--nocpp]\t\t\t\tDon't run the C preprocessor\n");
    printf("    [-o <name>/--outfile=<name>]\tOutput filename (may be \"-\" for standard output)\n");
    printf("    [-O0/-O1]\t\t\t\tSet optimization level (-O1 is default)\n");
-#if 0
    printf("    [--opt=<option>]\t\t\tSet optimization option\n");
+    printf("        disable-loop-unroll\t\tDisable loop unrolling.\n");
+    printf("        fast-masked-vload\t\tFaster masked vector loads on SSE (may go past end of array)\n");
+    printf("        fast-math\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n");
+#if 0
    printf("        disable-blended-masked-stores\t\tScalarize masked stores on SSE (vs. using vblendps)\n");
    printf("        disable-coherent-control-flow\t\tDisable coherent control flow optimizations\n");
    printf("        disable-uniform-control-flow\t\tDisable uniform control flow optimizations\n");
@@ -98,6 +96,9 @@ static void usage(int ret) {
    printf("        disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n");
    printf("        disable-masked-store-optimizations\tDisable lowering to regular stores when possible\n");
 #endif
+#ifndef ISPC_IS_WINDOWS
+    printf("    [--pic]\t\t\t\tGenerate position-independent code\n");
+#endif // !ISPC_IS_WINDOWS
    printf("    [--target=<isa>]\t\t\tSelect target ISA. <isa>={%s}\n", Target::SupportedTargetISAs());
    printf("    [--version]\t\t\t\tPrint ispc version\n");
    printf("    [--woff]\t\t\t\tDisable warnings\n");
@@ -184,8 +185,9 @@ int main(int Argc, char *Argv[]) {

    bool debugSet = false, optSet = false;
    Module::OutputType ot = Module::Object;
-
+    bool generatePIC = false;
    const char *arch = NULL, *cpu = NULL, *target = NULL;
+
    for (int i = 1; i < argc; ++i) {
        if (!strcmp(argv[i], "--help"))
            usage(0);
@@ -195,8 +197,15 @@ int main(int Argc, char *Argv[]) {
            arch = argv[i] + 7;
        else if (!strncmp(argv[i], "--cpu=", 6))
            cpu = argv[i] + 6;
-        else if (!strcmp(argv[i], "--fast-math"))
-            g->opt.fastMath = true;
+        else if (!strcmp(argv[i], "--fast-math")) {
+            fprintf(stderr, "--fast-math option has been renamed to --opt=fast-math!\n");
+            usage(1);
+        }
+        else if (!strcmp(argv[i], "--fast-masked-vload")) {
+            fprintf(stderr, "--fast-masked-vload option has been renamed to "
+                    "--opt=fast-masked-vload!\n");
+            usage(1);
+        }
        else if (!strcmp(argv[i], "--debug"))
            g->debugPrint = true;
        else if (!strcmp(argv[i], "--instrument"))
@@ -233,7 +242,16 @@ int main(int Argc, char *Argv[]) {
        }
        else if (!strncmp(argv[i], "--opt=", 6)) {
            const char *opt = argv[i] + 6;
-            if (!strcmp(opt, "disable-blended-masked-stores"))
+            if (!strcmp(opt, "fast-math"))
+                g->opt.fastMath = true;
+            else if (!strcmp(opt, "fast-masked-vload"))
+                g->opt.fastMaskedVload = true;
+            else if (!strcmp(opt, "disable-loop-unroll"))
+                g->opt.unrollLoops = false;
+
+            // These are only used for performance tests of specific
+            // optimizations
+            else if (!strcmp(opt, "disable-blended-masked-stores"))
                g->opt.disableBlendedMaskedStores = true;
            else if (!strcmp(opt, "disable-coherent-control-flow"))
                g->opt.disableCoherentControlFlow = true;
@@ -286,6 +304,10 @@ int main(int Argc, char *Argv[]) {
            g->includeStdlib = false;
        else if (!strcmp(argv[i], "--nocpp"))
            g->runCPP = false;
+#ifndef ISPC_IS_WINDOWS
+        else if (!strcmp(argv[i], "--pic"))
+            generatePIC = true;
+#endif // !ISPC_IS_WINDOWS
        else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version")) {
            printf("Intel(r) SPMD Program Compiler (ispc) build %s (%s)\n", 
                   BUILD_DATE, BUILD_VERSION);
@@ -307,20 +329,6 @@ int main(int Argc, char *Argv[]) {
    if (debugSet && !optSet)
        g->opt.level = 0;

-    if (!Target::GetTarget(arch, cpu, target, &g->target))
-        usage(1);
-
-    m = new Module(file);
-    if (m->CompileFile() == 0) {
-        if (outFileName != NULL)
-            if (!m->WriteOutput(ot, outFileName))
-                return 1;
-        if (headerFileName != NULL)
-            if (!m->WriteOutput(Module::Header, headerFileName))
-                return 1;
-    }
-    int errorCount = m->errorCount;
-    delete m;
-
-    return errorCount > 0;
+    return Module::CompileAndOutput(file, arch, cpu, target, generatePIC,
+                                    ot, outFileName, headerFileName);
 }
--- a/module.cpp
+++ b/module.cpp
--- a/module.h
+++ b/module.h
@@ -40,10 +40,11 @@
 #define ISPC_MODULE_H 1

 #include "ispc.h"
+#include "ast.h"

 namespace llvm
 {
-class raw_string_ostream;
+    class raw_string_ostream;
 }

 class Module {
@@ -75,11 +76,37 @@ public:
                                    variables, and the types used by them. */
    };

-    /** Write the corresponding output type to the given file.  Returns
-        true on success, false if there has been an error.  The given
-        filename may be NULL, indicating that output should go to standard
-        output. */
-    bool WriteOutput(OutputType ot, const char *filename);
+    /** Compile the given source file, generating assembly, object file, or
+        LLVM bitcode output, as well as (optionally) a header file with
+        declarations of functions and types used in the ispc/application
+        interface.
+        @param srcFile      Pathname to ispc source file to compile
+        @param arch         Target architecture (e.g. "x86-64")
+        @param cpu          Target CPU (e.g. "core-i7")
+        @param targets      Target ISAs; this parameter may give a single target
+                            ISA, or may give a comma-separated list of them in
+                            case we are compiling to multiple ISAs.
+        @param generatePIC  Indicates whether position-independent code should
+                            be generated.
+        @param outputType   Type of output to generate (object files, assembly,
+                            LLVM bitcode.)
+        @param outFileName  Base name of output filename for object files, etc.
+                            If for example the multiple targets "sse2" and "avx"
+                            are specified in the "targets" parameter and if this
+                            parameter is "foo.o", then we'll generate multiple
+                            output files, like "foo.o", "foo_sse2.o", "foo_avx.o".
+        @param headerFileName If non-NULL, emit a header file suitable for
+                              inclusion from C/C++ code with declarations of
+                              types and functions exported from the given ispc
+                              source file.
+        @return             Number of errors encountered when compiling
+                            srcFile.
+     */
+    static int CompileAndOutput(const char *srcFile, const char *arch, 
+                                const char *cpu, const char *targets, 
+                                bool generatePIC, OutputType outputType, 
+                                const char *outFileName, 
+                                const char *headerFileName);

    /** Total number of errors encountered during compilation. */
    int errorCount;
@@ -91,30 +118,26 @@ public:
    /** llvm Module object into which globals and functions are added. */
    llvm::Module *module; 

-#ifndef LLVM_2_8
-    /** The diBuilder manages generating debugging information (only
-        supported in LLVM 2.9 and beyond...) */
+    /** The diBuilder manages generating debugging information */
    llvm::DIBuilder *diBuilder;
-#endif
-
-    GatherBuffer *gatherBuffer;

 private:
    const char *filename;
+    AST *ast;

-    /** This member records the global variables that have been defined
-        with 'extern' linkage, so that it's easy to include their
-        declarations in generated header files.
-
-        @todo FIXME: it would be nice to eliminate this and then query the
-        symbol table or the llvm Module for them when/if we need them.
-     */
-    std::vector<Symbol *> externGlobals;
-
+    /** Write the corresponding output type to the given file.  Returns
+        true on success, false if there has been an error.  The given
+        filename may be NULL, indicating that output should go to standard
+        output. */
+    bool writeOutput(OutputType ot, const char *filename);
    bool writeHeader(const char *filename);
    bool writeObjectFileOrAssembly(OutputType outputType, const char *filename);
-    void execPreprocessor(const char *infilename, llvm::raw_string_ostream* ostream) const;
+    static bool writeObjectFileOrAssembly(llvm::TargetMachine *targetMachine,
+                                          llvm::Module *module, OutputType outputType, 
+                                          const char *outFileName);
+    static bool writeBitcode(llvm::Module *module, const char *outFileName);

+    void execPreprocessor(const char *infilename, llvm::raw_string_ostream* ostream) const;
 };

 #endif // ISPC_MODULE_H
--- a/opt.cpp
+++ b/opt.cpp
@@ -56,13 +56,11 @@
 #include <llvm/Intrinsics.h>
 #include <llvm/Constants.h>
 #include <llvm/Analysis/ConstantFolding.h>
-#ifndef LLVM_2_8
-    #include <llvm/Target/TargetLibraryInfo.h>
-    #ifdef LLVM_2_9
+#include <llvm/Target/TargetLibraryInfo.h>
+#ifdef LLVM_2_9
    #include <llvm/Support/StandardPasses.h>
-    #else
+#else
    #include <llvm/Transforms/IPO/PassManagerBuilder.h>
-    #endif // LLVM_2_9
 #endif // LLVM_2_8
 #include <llvm/ADT/Triple.h>
 #include <llvm/Transforms/Scalar.h>
@@ -73,11 +71,15 @@
 #include <llvm/Target/TargetMachine.h>
 #include <llvm/Analysis/Verifier.h>
 #include <llvm/Support/raw_ostream.h>
-#ifndef LLVM_2_8
 #include <llvm/Analysis/DIBuilder.h>
-#endif
 #include <llvm/Analysis/DebugInfo.h>
 #include <llvm/Support/Dwarf.h>
+#ifdef ISPC_IS_LINUX
+  #include <alloca.h>
+#elif defined(ISPC_IS_WINDOWS)
+  #include <malloc.h>
+  #define alloca _alloca
+#endif // ISPC_IS_WINDOWS

 static llvm::Pass *CreateIntrinsicsOptPass();
 static llvm::Pass *CreateGatherScatterFlattenPass();
@@ -180,25 +182,29 @@ Optimize(llvm::Module *module, int optLevel) {
    llvm::PassManager optPM;
    llvm::FunctionPassManager funcPM(module);

-#ifndef LLVM_2_8
    llvm::TargetLibraryInfo *targetLibraryInfo =
        new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple()));
    optPM.add(targetLibraryInfo);
-#endif
    optPM.add(new llvm::TargetData(module));

+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
+    optPM.add(llvm::createIndVarSimplifyPass());
+#endif
+
    if (optLevel == 0) {
        // This is more or less the minimum set of optimizations that we
        // need to do to generate code that will actually run.  (We can't
        // run absolutely no optimizations, since the front-end needs us to
        // take the various __pseudo_* functions it has emitted and turn
        // them into something that can actually execute.
+        optPM.add(llvm::createPromoteMemoryToRegisterPass());
        optPM.add(CreateGatherScatterFlattenPass());
        optPM.add(CreateLowerGatherScatterPass());
        optPM.add(CreateLowerMaskedStorePass());
        optPM.add(CreateIsCompileTimeConstantPass(true));
        optPM.add(llvm::createFunctionInliningPass());
        optPM.add(CreateMakeInternalFuncsStaticPass());
+        optPM.add(llvm::createCFGSimplificationPass());
        optPM.add(llvm::createGlobalDCEPass());
    }
    else {
@@ -213,7 +219,6 @@ Optimize(llvm::Module *module, int optLevel) {
        // only later in the optimization process as things like constant
        // propagation have done their thing, and then when they do kick
        // in, they can often open up new opportunities for optimization...
-#ifndef LLVM_2_8
        llvm::PassRegistry *registry = llvm::PassRegistry::getPassRegistry();
        llvm::initializeCore(*registry);
        llvm::initializeScalarOpts(*registry);
@@ -224,7 +229,7 @@ Optimize(llvm::Module *module, int optLevel) {
        llvm::initializeInstCombine(*registry);
        llvm::initializeInstrumentation(*registry);
        llvm::initializeTarget(*registry);
-#endif
+
        // Early optimizations to try to reduce the total amount of code to
        // work with if we can
        optPM.add(CreateGatherScatterFlattenPass());
@@ -281,13 +286,11 @@ Optimize(llvm::Module *module, int optLevel) {
        optPM.add(llvm::createConstantPropagationPass());
        optPM.add(CreateIntrinsicsOptPass());

-#if defined(LLVM_2_8)
-        optPM.add(CreateIsCompileTimeConstantPass(true));
-#elif defined(LLVM_2_9)
+#if defined(LLVM_2_9)
        llvm::createStandardModulePasses(&optPM, 3, 
                                         false /* opt size */,
                                         true /* unit at a time */, 
-                                         false /* unroll loops */,
+                                         g->opt.unrollLoops,
                                         true /* simplify lib calls */,
                                         false /* may have exceptions */,
                                         llvm::createFunctionInliningPass());
@@ -302,7 +305,7 @@ Optimize(llvm::Module *module, int optLevel) {
        llvm::createStandardModulePasses(&optPM, 3, 
                                         false /* opt size */,
                                         true /* unit at a time */, 
-                                         false /* unroll loops */,
+                                         g->opt.unrollLoops,
                                         true /* simplify lib calls */,
                                         false /* may have exceptions */,
                                         llvm::createFunctionInliningPass());
@@ -311,6 +314,8 @@ Optimize(llvm::Module *module, int optLevel) {
        llvm::PassManagerBuilder builder;
        builder.OptLevel = 3;
        builder.Inliner = llvm::createFunctionInliningPass();
+        if (g->opt.unrollLoops == false)
+            builder.DisableUnrollLoops = true;
        builder.populateFunctionPassManager(funcPM);
        builder.populateModulePassManager(optPM);
        optPM.add(CreateIsCompileTimeConstantPass(true));
@@ -423,8 +428,11 @@ IntrinsicsOpt::IntrinsicsOpt()
    blendInstructions.push_back(BlendInstruction(
        llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse41_blendvps),
        0xf, 0, 1, 2));
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
    blendInstructions.push_back(BlendInstruction(
-        m->module->getFunction("llvm.x86.avx.blendvps"), 0xff, 0, 1, 2));
+        llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_blendv_ps_256),
+        0xff, 0, 1, 2));
+#endif
 }


@@ -671,7 +679,8 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
 bool
 IntrinsicsOpt::matchesMaskInstruction(llvm::Function *function) {
    for (unsigned int i = 0; i < maskInstructions.size(); ++i)
-        if (function == maskInstructions[i].function)
+        if (maskInstructions[i].function != NULL &&
+            function == maskInstructions[i].function)
            return true;
    return false;
 }
@@ -680,7 +689,8 @@ IntrinsicsOpt::matchesMaskInstruction(llvm::Function *function) {
 IntrinsicsOpt::BlendInstruction *
 IntrinsicsOpt::matchingBlendInstruction(llvm::Function *function) {
    for (unsigned int i = 0; i < blendInstructions.size(); ++i)
-        if (function == blendInstructions[i].function)
+        if (blendInstructions[i].function != NULL &&
+            function == blendInstructions[i].function)
            return &blendInstructions[i];
    return NULL;
 }
@@ -1140,7 +1150,8 @@ GatherScatterFlattenOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
            continue;
        GSInfo *info = NULL;
        for (int i = 0; i < numGSFuncs; ++i)
-            if (callInst->getCalledFunction() == gsFuncs[i].func) {
+            if (gsFuncs[i].func != NULL &&
+                callInst->getCalledFunction() == gsFuncs[i].func) {
                info = &gsFuncs[i];
                break;
            }
@@ -1281,7 +1292,7 @@ MaskedStoreOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
        int nMSFuncs = sizeof(msInfo) / sizeof(msInfo[0]);
        MSInfo *info = NULL;
        for (int i = 0; i < nMSFuncs; ++i) {
-            if (called == msInfo[i].func) {
+            if (msInfo[i].func != NULL && called == msInfo[i].func) {
                info = &msInfo[i];
                break;
            }
@@ -1421,7 +1432,8 @@ LowerMaskedStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
            continue;
        LMSInfo *info = NULL;
        for (unsigned int i = 0; i < sizeof(msInfo) / sizeof(msInfo[0]); ++i) {
-            if (callInst->getCalledFunction() == msInfo[i].pseudoFunc) {
+            if (msInfo[i].pseudoFunc != NULL &&
+                callInst->getCalledFunction() == msInfo[i].pseudoFunc) {
                info = &msInfo[i];
                break;
            }
@@ -1433,16 +1445,12 @@ LowerMaskedStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
        llvm::Value *rvalue  = callInst->getArgOperand(1);
        llvm::Value *mask = callInst->getArgOperand(2);

-        // On SSE, we need to choose between doing the load + blend + store
-        // trick, or serializing the masked store.  On targets with a
-        // native masked store instruction, the implementations of
-        // __masked_store_blend_* should be the same as __masked_store_*,
-        // so this doesn't matter.  On SSE, blending is generally more
-        // efficient and is always safe to do on stack-allocated values.(?)
-        bool doBlend = (g->target.isa != Target::AVX &&
+        // We need to choose between doing the load + blend + store trick,
+        // or serializing the masked store.  Even on targets with a native
+        // masked store instruction, this is preferable since it lets us
+        // keep values in registers rather than going out to the stack.
+        bool doBlend = (!g->opt.disableBlendedMaskedStores ||
                        lIsStackVariablePointer(lvalue));
-        if (g->target.isa == Target::SSE4 || g->target.isa == Target::SSE2)
-            doBlend |= !g->opt.disableBlendedMaskedStores;

        // Generate the call to the appropriate masked store function and
        // replace the __pseudo_* one with it.
@@ -1520,8 +1528,8 @@ static void lPrintVector(const char *info, llvm::Value *elements[ISPC_MAX_NVEC])


 /** Given an LLVM vector in vec, return a 'scalarized' version of the
-    vector in the provided offsets[] array.  For example, if the vector
-    value passed in is:  
+    vector in the provided scalarizedVector[] array.  For example, if the
+    vector value passed in is:

    add <4 x i32> %a_smear, <4 x i32> <4, 8, 12, 16>,

@@ -1542,28 +1550,39 @@ static void lPrintVector(const char *info, llvm::Value *elements[ISPC_MAX_NVEC])
    @param vec               Vector to be scalarized
    @param scalarizedVector  Array in which to store the individual vector 
                             elements
+    @param vectorLength      Number of elements in the given vector. (The
+                             passed scalarizedVector array must be at least
+                             this length.)
    @returns                 True if the vector was successfully scalarized and
                             the values in offsets[] are valid; false otherwise
 */
 static bool
-lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC]) {
+lScalarizeVector(llvm::Value *vec, llvm::Value **scalarizedVector,
+                 int vectorLength) {
    // First initialize the values of scalarizedVector[] to NULL.
-    for (int i = 0; i < g->target.vectorWidth; ++i)
+    for (int i = 0; i < vectorLength; ++i)
        scalarizedVector[i] = NULL;
    
+    // It may be ok for the vector to be an undef vector; these come up for
+    // example in shufflevector instructions.  As long as elements of the
+    // undef vector aren't referenced by the shuffle indices, this is fine.
+    if (llvm::isa<llvm::UndefValue>(vec))
+        return true;
+
    // ConstantVectors are easy; just pull out the individual constant
    // element values
    llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(vec);
    if (cv != NULL) {
-        for (int i = 0; i < g->target.vectorWidth; ++i)
+        for (int i = 0; i < vectorLength; ++i)
            scalarizedVector[i] = cv->getOperand(i);
        return true;
    }

    // It's also easy if it's just a vector of all zeros
-    llvm::ConstantAggregateZero *caz = llvm::dyn_cast<llvm::ConstantAggregateZero>(vec);
-    if (caz) {
-        for (int i = 0; i < g->target.vectorWidth; ++i)
+    llvm::ConstantAggregateZero *caz = 
+        llvm::dyn_cast<llvm::ConstantAggregateZero>(vec);
+    if (caz != NULL) {
+        for (int i = 0; i < vectorLength; ++i)
            scalarizedVector[i] = LLVMInt32(0);
        return true;
    }
@@ -1575,13 +1594,16 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
        // scalar values we return from here are synthesized with scalar
        // versions of the original vector binary operator
        llvm::Instruction::BinaryOps opcode = bo->getOpcode();
-        llvm::Value *v0[ISPC_MAX_NVEC], *v1[ISPC_MAX_NVEC];
+        llvm::Value **v0 = 
+            (llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));
+        llvm::Value **v1 = 
+            (llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));

-        if (!lScalarizeVector(bo->getOperand(0), v0) || 
-            !lScalarizeVector(bo->getOperand(1), v1))
+        if (!lScalarizeVector(bo->getOperand(0), v0, vectorLength) || 
+            !lScalarizeVector(bo->getOperand(1), v1, vectorLength))
            return false;

-        for (int i = 0; i < g->target.vectorWidth; ++i) {
+        for (int i = 0; i < vectorLength; ++i) {
            scalarizedVector[i] = 
                llvm::BinaryOperator::Create(opcode, v0[i], v1[i], "flat_bop", bo);
            lCopyMetadata(scalarizedVector[i], bo);
@@ -1606,7 +1628,7 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
        // vaue in scalarizedVector[] based on the value being inserted.
        while (ie != NULL) {
            uint64_t iOffset = lGetIntValue(ie->getOperand(2));
-            assert((int)iOffset < g->target.vectorWidth);
+            assert((int)iOffset < vectorLength);
            assert(scalarizedVector[iOffset] == NULL);

            scalarizedVector[iOffset] = ie->getOperand(1);
@@ -1620,15 +1642,17 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
    }

    llvm::CastInst *ci = llvm::dyn_cast<llvm::CastInst>(vec);
-    if (ci) {
+    if (ci != NULL) {
        // Casts are similar to BinaryOperators in that we attempt to
        // scalarize the vector being cast and if successful, we apply
        // equivalent scalar cast operators to each of the values in the
        // scalarized vector.
        llvm::Instruction::CastOps op = ci->getOpcode();

-        llvm::Value *scalarizedTarget[ISPC_MAX_NVEC];
-        if (!lScalarizeVector(ci->getOperand(0), scalarizedTarget))
+        llvm::Value **scalarizedTarget = 
+            (llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));
+        if (!lScalarizeVector(ci->getOperand(0), scalarizedTarget,
+                              vectorLength))
            return false;

        LLVM_TYPE_CONST llvm::Type *destType = ci->getDestTy();
@@ -1637,7 +1661,7 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
        assert(vectorDestType != NULL);
        LLVM_TYPE_CONST llvm::Type *elementType = vectorDestType->getElementType();

-        for (int i = 0; i < g->target.vectorWidth; ++i) {
+        for (int i = 0; i < vectorLength; ++i) {
            scalarizedVector[i] = 
                llvm::CastInst::Create(op, scalarizedTarget[i], elementType,
                                       "cast", ci);
@@ -1647,16 +1671,11 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
    }

    llvm::ShuffleVectorInst *svi = llvm::dyn_cast<llvm::ShuffleVectorInst>(vec);
-    if (svi) {
-        // Note that the code for shufflevector instructions is untested.
-        // (We haven't yet had a case where it needs to run).  Therefore,
-        // an assert at the bottom of this routien will hit the first time
-        // it runs as a reminder that this needs to be tested further.
-
+    if (svi != NULL) {
        LLVM_TYPE_CONST llvm::VectorType *svInstType = 
            llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(svi->getType());
        assert(svInstType != NULL);
-        assert((int)svInstType->getNumElements() == g->target.vectorWidth);
+        assert((int)svInstType->getNumElements() == vectorLength);

        // Scalarize the two vectors being shuffled.  First figure out how
        // big they are.
@@ -1671,27 +1690,21 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
        int n0 = vectorType0->getNumElements();
        int n1 = vectorType1->getNumElements();

-        // FIXME: It's actually totally legitimate for these two to have
-        // different sizes; the final result just needs to have the native
-        // vector width.  To handle this, not only do we need to
-        // potentially dynamically allocate space for the arrays passed
-        // into lScalarizeVector, but we need to change the rest of its
-        // implementation to not key off g->target.vectorWidth everywhere
-        // to get the sizes of the arrays to iterate over, etc.
-        assert(n0 == g->target.vectorWidth && n1 == g->target.vectorWidth);
-
        // Go ahead and scalarize the two input vectors now.
-        // FIXME: it's ok if some or all of the values of these two vectors
-        // have undef values, so long as we don't try to access undef
-        // values with the vector indices provided to the instruction.
-        // Should fix lScalarizeVector so that it doesn't return false in
-        // this case and just leaves the elements of the arrays with undef
-        // values as NULL.
-        llvm::Value *v0[ISPC_MAX_NVEC], *v1[ISPC_MAX_NVEC];
-        if (!lScalarizeVector(svi->getOperand(0), v0) ||
-            !lScalarizeVector(svi->getOperand(1), v1))
+        llvm::Value **v0 = (llvm::Value **)alloca(n0 * sizeof(llvm::Value *));
+        llvm::Value **v1 = (llvm::Value **)alloca(n1 * sizeof(llvm::Value *));
+
+        if (!lScalarizeVector(svi->getOperand(0), v0, n0) ||
+            !lScalarizeVector(svi->getOperand(1), v1, n1))
            return false;

+        llvm::ConstantAggregateZero *caz = 
+            llvm::dyn_cast<llvm::ConstantAggregateZero>(svi->getOperand(2));
+        if (caz != NULL) {
+            for (int i = 0; i < vectorLength; ++i)
+                scalarizedVector[i] = v0[0];
+        }
+        else {
            llvm::ConstantVector *shuffleIndicesVector = 
                llvm::dyn_cast<llvm::ConstantVector>(svi->getOperand(2));
            // I think this has to be a ConstantVector.  If this ever hits,
@@ -1702,15 +1715,15 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
            // Get the integer indices for each element of the returned vector
            llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> shuffleIndices;
            shuffleIndicesVector->getVectorElements(shuffleIndices);
-        assert((int)shuffleIndices.size() == g->target.vectorWidth);
+            assert((int)shuffleIndices.size() == vectorLength);

            // And loop over the indices, setting the i'th element of the
            // result vector with the source vector element that corresponds to
            // the i'th shuffle index value.
            for (unsigned int i = 0; i < shuffleIndices.size(); ++i) {
-            if (!llvm::isa<llvm::ConstantInt>(shuffleIndices[i]))
                // I'm not sure when this case would ever happen, though..
-                return false;
+                assert(llvm::isa<llvm::ConstantInt>(shuffleIndices[i]));
+
                int offset = (int)lGetIntValue(shuffleIndices[i]);
                assert(offset >= 0 && offset < n0+n1);

@@ -1722,7 +1735,45 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
                    // vector
                    scalarizedVector[i] = v1[offset - n0];
            }
-        FATAL("the above code is untested so far; check now that it's actually running");
+        }
+        return true;
+    }
+
+    llvm::LoadInst *li = llvm::dyn_cast<llvm::LoadInst>(vec);
+    if (li != NULL) {
+        llvm::Value *baseAddr = li->getOperand(0);
+        llvm::Value *baseInt = new llvm::PtrToIntInst(baseAddr, LLVMTypes::Int64Type,
+                                                      "base2int", li);
+        lCopyMetadata(baseInt, li);
+
+        LLVM_TYPE_CONST llvm::PointerType *ptrType = 
+            llvm::dyn_cast<llvm::PointerType>(baseAddr->getType());
+        assert(ptrType != NULL);
+        LLVM_TYPE_CONST llvm::VectorType *vecType = 
+            llvm::dyn_cast<llvm::VectorType>(ptrType->getElementType());
+        assert(vecType != NULL);
+        LLVM_TYPE_CONST llvm::Type *elementType = vecType->getElementType();
+        uint64_t elementSize;
+        bool sizeKnown = lSizeOfIfKnown(elementType, &elementSize);
+        assert(sizeKnown == true);
+
+        LLVM_TYPE_CONST llvm::Type *eltPtrType = llvm::PointerType::get(elementType, 0);
+
+        for (int i = 0; i < vectorLength; ++i) {
+            llvm::Value *intPtrOffset = 
+                llvm::BinaryOperator::Create(llvm::Instruction::Add, baseInt,
+                                             LLVMInt64(i * elementSize), "baseoffset",
+                                             li);
+            lCopyMetadata(intPtrOffset, li);
+            llvm::Value *scalarLoadPtr = 
+                new llvm::IntToPtrInst(intPtrOffset, eltPtrType, "int2ptr", li);
+            lCopyMetadata(scalarLoadPtr, li);
+
+            llvm::Instruction *scalarLoad = 
+                new llvm::LoadInst(scalarLoadPtr, "loadelt", li);
+            lCopyMetadata(scalarLoad, li);
+            scalarizedVector[i] = scalarLoad;
+        }
        return true;
    }

@@ -2105,13 +2156,15 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
        GatherImpInfo *gatherInfo = NULL;
        ScatterImpInfo *scatterInfo = NULL;
        for (unsigned int i = 0; i < sizeof(gInfo) / sizeof(gInfo[0]); ++i) {
-            if (calledFunc == gInfo[i].pseudoFunc) {
+            if (gInfo[i].pseudoFunc != NULL &&
+                calledFunc == gInfo[i].pseudoFunc) {
                gatherInfo = &gInfo[i];
                break;
            }
        }
        for (unsigned int i = 0; i < sizeof(sInfo) / sizeof(sInfo[0]); ++i) {
-            if (calledFunc == sInfo[i].pseudoFunc) {
+            if (sInfo[i].pseudoFunc != NULL &&
+                calledFunc == sInfo[i].pseudoFunc) {
                scatterInfo = &sInfo[i];
                break;
            }
@@ -2134,11 +2187,18 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
        if (ce && ce->getOpcode() == llvm::Instruction::BitCast)
            base = ce->getOperand(0);

-        // Try to out the offsets; the i'th element of the offsetElements
-        // array should be an i32 with the value of the offset for the i'th
-        // vector lane.  This may fail; if so, just give up.
+        // Try to find out the offsets; the i'th element of the
+        // offsetElements array should be an i32 with the value of the
+        // offset for the i'th vector lane.  This may fail; if so, just
+        // give up.
+        llvm::Value *vecValue = callInst->getArgOperand(1);
+        LLVM_TYPE_CONST llvm::VectorType *vt = 
+            llvm::dyn_cast<llvm::VectorType>(vecValue->getType());
+        assert(vt != NULL);
+        int vecLength = vt->getNumElements();
+        assert(vecLength == g->target.vectorWidth);
        llvm::Value *offsetElements[ISPC_MAX_NVEC];
-        if (!lScalarizeVector(callInst->getArgOperand(1), offsetElements))
+        if (!lScalarizeVector(vecValue, offsetElements, vecLength))
            continue;

        llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 2 : 3);
@@ -2349,7 +2409,8 @@ LowerGSPass::runOnBasicBlock(llvm::BasicBlock &bb) {
        llvm::Function *calledFunc = callInst->getCalledFunction();
        LowerGSInfo *info = NULL;
        for (unsigned int i = 0; i < sizeof(lgsInfo) / sizeof(lgsInfo[0]); ++i) {
-            if (calledFunc == lgsInfo[i].pseudoFunc) {
+            if (lgsInfo[i].pseudoFunc != NULL &&
+                calledFunc == lgsInfo[i].pseudoFunc) {
                info = &lgsInfo[i];
                break;
            }
@@ -2435,7 +2496,7 @@ IsCompileTimeConstantPass::runOnBasicBlock(llvm::BasicBlock &bb) {
        int j;
        int nFuncs = sizeof(funcs) / sizeof(funcs[0]);
        for (j = 0; j < nFuncs; ++j) {
-            if (callInst->getCalledFunction() == funcs[j]) 
+            if (funcs[j] != NULL && callInst->getCalledFunction() == funcs[j]) 
                break;
        }
        if (j == nFuncs)
@@ -2515,7 +2576,7 @@ llvm::RegisterPass<MakeInternalFuncsStaticPass>
 bool
 MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
    const char *names[] = {
-        "__do_print",
+        "__do_print", "__fast_masked_vload", "__num_cores",
        "__gather_base_offsets_i8", "__gather_base_offsets_i16",
        "__gather_base_offsets_i32", "__gather_base_offsets_i64",
        "__gather_elt_8", "__gather_elt_16", 
@@ -2540,7 +2601,7 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
    for (int i = 0; i < count; ++i) {
        llvm::Function *f = m->module->getFunction(names[i]);
        if (f != NULL) {
-            f->setLinkage(llvm::GlobalValue::PrivateLinkage);
+            f->setLinkage(llvm::GlobalValue::InternalLinkage);
            modifiedAny = true;
        }
    }
--- a/parse.yy
+++ b/parse.yy
@@ -165,7 +165,7 @@ static const char *lParamListTokens[] = {
 %token TOKEN_CBREAK TOKEN_CCONTINUE TOKEN_CRETURN TOKEN_SYNC TOKEN_PRINT

 %type <expr> primary_expression postfix_expression
-%type <expr> unary_expression cast_expression
+%type <expr> unary_expression cast_expression launch_expression
 %type <expr> multiplicative_expression additive_expression shift_expression
 %type <expr> relational_expression equality_expression and_expression
 %type <expr> exclusive_or_expression inclusive_or_expression
@@ -177,6 +177,7 @@ static const char *lParamListTokens[] = {
 %type <stmt> statement labeled_statement compound_statement for_init_statement
 %type <stmt> expression_statement selection_statement iteration_statement
 %type <stmt> jump_statement statement_list declaration_statement print_statement
+%type <stmt> sync_statement

 %type <declaration> declaration parameter_declaration
 %type <declarators> init_declarator_list 
@@ -221,7 +222,7 @@ primary_expression
        else {
            std::vector<Symbol *> *funs = m->symbolTable->LookupFunction(name);
            if (funs)
-                $$ = new FunctionSymbolExpr(funs, @1);
+                $$ = new FunctionSymbolExpr(name, funs, @1);
        }
        if ($$ == NULL) {
            std::vector<std::string> alternates = 
@@ -256,18 +257,32 @@ primary_expression
    | '(' expression ')' { $$ = $2; }
    ;

+launch_expression  /* every alternative builds a FunctionCallExpr whose 4th argument is true */
+    : TOKEN_LAUNCH '<' postfix_expression '(' argument_expression_list ')' '>'
+      { // no bracketed expression here: a uniform int32 constant 1 is passed as the last argument
+          ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @3);
+          $$ = new FunctionCallExpr($3, $5, @3, true, oneExpr);
+      }
+    | TOKEN_LAUNCH '<' postfix_expression '(' ')' '>'
+      { // same as the previous alternative, but with an empty argument list
+          ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @3);
+          $$ = new FunctionCallExpr($3, new ExprList(@3), @3, true, oneExpr);
+       }
+    | TOKEN_LAUNCH '[' expression ']' '<' postfix_expression '(' argument_expression_list ')' '>'
+      { $$ = new FunctionCallExpr($6, $8, @6, true, $3); }  /* bracketed expression is the last argument */
+    | TOKEN_LAUNCH '[' expression ']' '<' postfix_expression '(' ')' '>'
+      { $$ = new FunctionCallExpr($6, new ExprList(@6), @6, true, $3); }  /* as above, empty argument list */
+    ;
+
 postfix_expression
    : primary_expression
    | postfix_expression '[' expression ']'
      { $$ = new IndexExpr($1, $3, @1); }
    | postfix_expression '(' ')'
-      { $$ = new FunctionCallExpr($1, new ExprList(@1), @1, false); }
+      { $$ = new FunctionCallExpr($1, new ExprList(@1), @1); }
    | postfix_expression '(' argument_expression_list ')'
-      { $$ = new FunctionCallExpr($1, $3, @1, false); }
-    | TOKEN_LAUNCH '<' postfix_expression '(' argument_expression_list ')' '>'
-      { $$ = new FunctionCallExpr($3, $5, @3, true); }
-    | TOKEN_LAUNCH '<' postfix_expression '(' ')' '>'
-      { $$ = new FunctionCallExpr($3, new ExprList(@3), @3, true); }
+      { $$ = new FunctionCallExpr($1, $3, @1); }
+    | launch_expression
    | postfix_expression '.' TOKEN_IDENTIFIER
      { $$ = MemberExpr::create($1, yytext, @1, @3); }
 /*    | postfix_expression TOKEN_PTR_OP TOKEN_IDENTIFIER
@@ -436,8 +451,6 @@ assignment_expression

 expression
    : assignment_expression
-    | TOKEN_SYNC 
-      { $$ = new SyncExpr(@1); }
    | expression ',' assignment_expression
      { $$ = new BinaryExpr(BinaryExpr::Comma, $1, $3, @2); }
    ;
@@ -928,9 +941,13 @@ parameter_list
            builtinTokens.push_back(*token);
            ++token;
        }
+        if (strlen(yytext) == 0)
+            Error(@1, "Syntax error--premature end of file.");
+        else {
            std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
            std::string alts = lGetAlternates(alternates);
-        Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
+            Error(@1, "Syntax error--token \"%s\" unexpected.%s", yytext, alts.c_str());
+        }
        $$ = NULL;
    }
    ;
@@ -1019,6 +1036,7 @@ statement
    | jump_statement
    | declaration_statement
    | print_statement
+    | sync_statement
    | error
    {
        std::vector<std::string> builtinTokens;
@@ -1027,9 +1045,13 @@ statement
            builtinTokens.push_back(*token);
            ++token;
        }
+        if (strlen(yytext) == 0)
+            Error(@1, "Syntax error--premature end of file.");
+        else {
            std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
            std::string alts = lGetAlternates(alternates);
-        Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
+            Error(@1, "Syntax error--token \"%s\" unexpected.%s", yytext, alts.c_str());
+        }
        $$ = NULL;
    }
    ;
@@ -1155,6 +1177,11 @@ jump_statement
      { $$ = new ReturnStmt($2, true, @1); }
    ;

+sync_statement
+    : TOKEN_SYNC 
+      { $$ = new ExprStmt(new SyncExpr(@1), @1); }  /* the SyncExpr is wrapped in an ExprStmt to yield a stmt value */
+    ;
+
 print_statement
    : TOKEN_PRINT '(' string_constant ')'
      {
@@ -1177,9 +1204,13 @@ translation_unit
            builtinTokens.push_back(*token);
            ++token;
        }
+        if (strlen(yytext) == 0)
+            Error(@1, "Syntax error--premature end of file.");
+        else {
            std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
            std::string alts = lGetAlternates(alternates);
-        Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
+            Error(@1, "Syntax error--token \"%s\" unexpected.%s", yytext, alts.c_str());
+        }
    }
    ;

@@ -1266,6 +1297,12 @@ static void lAddThreadIndexCountToSymbolTable(SourcePos pos) {

    Symbol *threadCountSym = new Symbol("threadCount", pos, AtomicType::UniformConstUInt32);
    m->symbolTable->AddVariable(threadCountSym);
+
+    Symbol *taskIndexSym = new Symbol("taskIndex", pos, AtomicType::UniformConstUInt32);
+    m->symbolTable->AddVariable(taskIndexSym);
+
+    Symbol *taskCountSym = new Symbol("taskCount", pos, AtomicType::UniformConstUInt32);
+    m->symbolTable->AddVariable(taskCountSym);
 }


--- a/run_tests.py
+++ b/run_tests.py
@@ -17,6 +17,7 @@ import random
 import string
 import mutex
 import subprocess
+import platform

 parser = OptionParser()
 parser.add_option("-r", "--random-shuffle", dest="random", help="Randomly order tests",
@@ -25,11 +26,13 @@ parser.add_option("-s", "--static-exe", dest="static_exe",
                  help="Create and run a regular executable for each test (rather than using the LLVM JIT).",
                  default=False, action="store_true")
 parser.add_option('-t', '--target', dest='target',
-                  help='Set compilation target (sse2, sse4, sse4x2, avx, avx-x2)',
+                  help='Set compilation target (sse2, sse4, sse4-x2, avx, avx-x2)',
                  default="sse4")
 parser.add_option('-a', '--arch', dest='arch',
                  help='Set architecture (x86, x86-64)',
                  default="x86-64")
+parser.add_option('-o', '--no-opt', dest='no_opt', help='Disable optimization',
+                  default=False, action="store_true")

 (options, args) = parser.parse_args()

@@ -129,12 +132,16 @@ def run_tasks_from_queue(queue):
                exe_name = "%s.run" % filename
                ispc_cmd = "ispc --woff %s -o %s --arch=%s --target=%s" % \
                    (filename, obj_name, options.arch, options.target)
+                if options.no_opt:
+                    ispc_cmd += " -O0" 
                if options.arch == 'x86':
                    gcc_arch = '-m32'
                else:
                    gcc_arch = '-m64'
-                gcc_cmd = "g++ -Wl,-no_pie %s test_static.cpp -DTEST_SIG=%d %s.o -o %s" % \
+                gcc_cmd = "g++ %s test_static.cpp -DTEST_SIG=%d %s.o -o %s" % \
                    (gcc_arch, match, filename, exe_name)
+                if platform.system() == 'Darwin':
+                    gcc_cmd += ' -Wl,-no_pie'
                if should_fail:
                    gcc_cmd += " -DEXPECT_FAILURE"
                    
@@ -152,6 +159,8 @@ def run_tasks_from_queue(queue):
            bitcode_file = "%s.bc" % filename
            compile_cmd = "ispc --woff --emit-llvm %s --target=%s -o %s" % \
                (filename, options.target, bitcode_file)
+            if options.no_opt:
+                compile_cmd += " -O0"
            test_cmd = "ispc_test %s" % bitcode_file

            error_count += run_cmds([compile_cmd, test_cmd], filename, should_fail)
--- a/simple.vcxproj
+++ b/simple.vcxproj
@@ -0,0 +1,164 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="morph.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="morph.vo">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+cl /E /TP %(Filename).vo | volta -O2 - -o %(Filename).obj -h %(Filename).h --arch=x86
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+cl /E /TP %(Filename).vo | volta -O2 - -o %(Filename).obj -h %(Filename).h
+</Command><!-- like the other three configurations, passes '-' as the input operand for the source piped from cl -->
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(Filename).obj</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+cl /E /TP %(Filename).vo | volta -O2 - -o %(Filename).obj -h %(Filename).h --arch=x86
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+cl /E /TP %(Filename).vo | volta -O2 - -o %(Filename).obj -h %(Filename).h
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(Filename).obj</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{947C5311-8B78-4D05-BEE4-BCF342D4B367}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>morph</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -369,7 +369,7 @@ static inline uniform float reduce_min(float v) {
 static inline uniform float reduce_max(float v) {
    // For the lanes where the mask is off, replace the given value with
    // negative infinity, so that it doesn't affect the result.
-    const uniform int iflt_neg_max = 0xff800000; // -infinity
+    const int iflt_neg_max = 0xff800000; // -infinity
    // Must use __floatbits_varying_int32, not floatbits(), since with the
    // latter the current mask enters into the returned result...
    return __reduce_max_float(__mask ? v : __floatbits_varying_int32(iflt_neg_max));
@@ -427,7 +427,7 @@ static inline uniform double reduce_min(double v) {
 }

 static inline uniform double reduce_max(double v) {
-    const uniform int64 iflt_neg_max = 0xfff0000000000000; // -infinity
+    const int64 iflt_neg_max = 0xfff0000000000000; // -infinity
    // Must use __doublebits_varying_int64, not doublebits(), since with the
    // latter the current mask enters into the returned result...
    return __reduce_max_double(__mask ? v : __doublebits_varying_int64(iflt_neg_max));
@@ -471,21 +471,21 @@ static inline uniform unsigned int64 reduce_max(unsigned int64 v) {
    return __reduce_max_uint64(__mask ? v : 0);
 }

-#define REDUCE_EQUAL(TYPE, FUNCTYPE)                               \
+#define REDUCE_EQUAL(TYPE, FUNCTYPE, MASKTYPE)                     \
 static inline uniform bool reduce_equal(TYPE v) {                  \
    uniform TYPE unusedValue;                                      \
-    return __reduce_equal_##FUNCTYPE(v, unusedValue, (int32)__mask); \
+    return __reduce_equal_##FUNCTYPE(v, unusedValue, (MASKTYPE)__mask); \
 }                                                                  \
 static inline uniform bool reduce_equal(TYPE v, reference uniform TYPE value) { \
-    return __reduce_equal_##FUNCTYPE(v, value, (int32)__mask);       \
+    return __reduce_equal_##FUNCTYPE(v, value, (MASKTYPE)__mask);       \
 }

-REDUCE_EQUAL(int32, int32)
-REDUCE_EQUAL(unsigned int32, int32)
-REDUCE_EQUAL(float, float)
-REDUCE_EQUAL(int64, int64)
-REDUCE_EQUAL(unsigned int64, int64)
-REDUCE_EQUAL(double, double)
+REDUCE_EQUAL(int32, int32, int32)
+REDUCE_EQUAL(unsigned int32, int32, unsigned int32)
+REDUCE_EQUAL(float, float, int32)
+REDUCE_EQUAL(int64, int64, int32)
+REDUCE_EQUAL(unsigned int64, int64, unsigned int32)
+REDUCE_EQUAL(double, double, int32)

 static int32 exclusive_scan_add(int32 v) {
    return __exclusive_scan_add_i32(v, (int32)__mask);
@@ -549,23 +549,32 @@ static unsigned int64 exclusive_scan_or(unsigned int64 v) {
 static inline uniform int 
 packed_load_active(uniform unsigned int a[], uniform int start,
                   reference unsigned int vals) {
-    return __packed_load_active(a, start, vals, __mask);
+    return __packed_load_active(a, (unsigned int)start, vals,
+                                (unsigned int32)__mask);
 }

 static inline uniform int
 packed_store_active(uniform unsigned int a[], uniform int start,
                    unsigned int vals) {
-    return __packed_store_active(a, start, vals, __mask);
+    return __packed_store_active(a, (unsigned int)start, vals,
+                                 (unsigned int32)__mask);
 }

 static inline uniform int packed_load_active(uniform int a[], uniform int start,
                                             reference int vals) {
-    return __packed_load_active(a, start, vals, __mask);
+    return __packed_load_active(a, start, vals, (int32)__mask);
 }

 static inline uniform int packed_store_active(uniform int a[], uniform int start,
                                              int vals) {
-    return __packed_store_active(a, start, vals, __mask);
+    return __packed_store_active(a, start, vals, (int32)__mask);
+}
+
+///////////////////////////////////////////////////////////////////////////
+// System information
+
+static inline int num_cores() {
+    return __num_cores();
 }

 ///////////////////////////////////////////////////////////////////////////
@@ -581,24 +590,38 @@ static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \
    TA ret = __atomic_##OPB##_##TB##_global(ref, value, (MASKTYPE)__mask); \
    memory_barrier();                                                   \
    return ret;                                                         \
+}                                                                       \
+static inline uniform TA atomic_##OPA##_global(uniform reference TA ref, \
+                                               uniform TA value) {      \
+    memory_barrier();                                                   \
+    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ref, value, (MASKTYPE)__mask); \
+    memory_barrier();                                                   \
+    return ret;                                                         \
 }

-#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB)                          \
+#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB, MASKTYPE)                \
 static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \
    uniform TA oneval = reduce_##OPA(value);                            \
    TA ret;                                                             \
    if (lanemask() != 0) {                                              \
        memory_barrier();                                               \
-        ret = __atomic_##OPB##_##TB##_global(ref, oneval, __mask);      \
+        ret = __atomic_##OPB##_uniform_##TB##_global(ref, oneval, (MASKTYPE)__mask); \
        memory_barrier();                                               \
    }                                                                   \
    return ret;                                                         \
+}                                                                       \
+static inline uniform TA atomic_##OPA##_global(uniform reference TA ref, \
+                                               uniform TA value) {      \
+    memory_barrier();                                                   \
+    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ref, value, (MASKTYPE)__mask); \
+    memory_barrier();                                                   \
+    return ret;                                                         \
 }

 DEFINE_ATOMIC_OP(int32,int32,add,add,int32)
 DEFINE_ATOMIC_OP(int32,int32,subtract,sub,int32)
-DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min)
-DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max)
+DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,int32)
+DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,int32)
 DEFINE_ATOMIC_OP(int32,int32,and,and,int32)
 DEFINE_ATOMIC_OP(int32,int32,or,or,int32)
 DEFINE_ATOMIC_OP(int32,int32,xor,xor,int32)
@@ -606,56 +629,63 @@ DEFINE_ATOMIC_OP(int32,int32,swap,swap,int32)

 // For everything but atomic min and max, we can use the same
 // implementations for unsigned as for signed.
-DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,int32)
-DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,int32)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax)
-DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,int32)
-DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,int32)
-DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,int32)
-DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,int32)
+DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,unsigned int32)
+DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,unsigned int32)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,unsigned int32)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,unsigned int32)
+DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,unsigned int32)
+DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,unsigned int32)
+DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,unsigned int32)
+DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,unsigned int32)

 DEFINE_ATOMIC_OP(float,float,swap,swap,int32)

-DEFINE_ATOMIC_OP(int64,int64,add,add,int64)
-DEFINE_ATOMIC_OP(int64,int64,subtract,sub,int64)
-DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min)
-DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max)
-DEFINE_ATOMIC_OP(int64,int64,and,and,int64)
-DEFINE_ATOMIC_OP(int64,int64,or,or,int64)
-DEFINE_ATOMIC_OP(int64,int64,xor,xor,int64)
+DEFINE_ATOMIC_OP(int64,int64,add,add,int32)
+DEFINE_ATOMIC_OP(int64,int64,subtract,sub,int32)
+DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,int32)
+DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,int32)
+DEFINE_ATOMIC_OP(int64,int64,and,and,int32)
+DEFINE_ATOMIC_OP(int64,int64,or,or,int32)
+DEFINE_ATOMIC_OP(int64,int64,xor,xor,int32)
 DEFINE_ATOMIC_OP(int64,int64,swap,swap,int32)

 // For everything but atomic min and max, we can use the same
 // implementations for unsigned as for signed.
-DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,int64)
-DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,int64)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax)
-DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,int64)
-DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,int64)
-DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,int64)
-DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,int32)
+DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,unsigned int32)
+DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,unsigned int32)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,unsigned int32)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,unsigned int32)
+DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,unsigned int32)
+DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,unsigned int32)
+DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,unsigned int32)
+DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,unsigned int32)

 DEFINE_ATOMIC_OP(double,double,swap,swap,int32)

 #undef DEFINE_ATOMIC_OP

-#define ATOMIC_DECL_CMPXCHG(TA, TB)                                        \
+#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE)                           \
 static inline TA atomic_compare_exchange_global(                           \
         uniform reference TA ref, TA oldval, TA newval) {                 \
    memory_barrier();                                                      \
-    TA ret = __atomic_compare_exchange_##TB##_global(ref, oldval, newval, __mask); \
+    TA ret = __atomic_compare_exchange_##TB##_global(ref, oldval, newval, (MASKTYPE)__mask); \
+    memory_barrier();                                                      \
+    return ret;                                                            \
+} \
+static inline uniform TA atomic_compare_exchange_global(               \
+         uniform reference TA ref, uniform TA oldval, uniform TA newval) {                 \
+    memory_barrier();                                                   \
+    uniform TA ret = __atomic_compare_exchange_uniform_##TB##_global(ref, oldval, newval, (MASKTYPE)__mask); \
    memory_barrier();                                                   \
    return ret;                                                         \
 }

-ATOMIC_DECL_CMPXCHG(int32, int32)
-ATOMIC_DECL_CMPXCHG(unsigned int32, int32)
-ATOMIC_DECL_CMPXCHG(float, float)
-ATOMIC_DECL_CMPXCHG(int64, int64)
-ATOMIC_DECL_CMPXCHG(unsigned int64, int64)
-ATOMIC_DECL_CMPXCHG(double, double)
+ATOMIC_DECL_CMPXCHG(int32, int32, int32)
+ATOMIC_DECL_CMPXCHG(unsigned int32, int32, unsigned int32)
+ATOMIC_DECL_CMPXCHG(float, float, int32)
+ATOMIC_DECL_CMPXCHG(int64, int64, int32)
+ATOMIC_DECL_CMPXCHG(unsigned int64, int64, unsigned int32)
+ATOMIC_DECL_CMPXCHG(double, double, int32)

 #undef ATOMIC_DECL_CMPXCHG

@@ -2862,6 +2892,12 @@ static inline void seed_rng(reference uniform RNGState state, uniform unsigned i
    seed = __seed4(state, 0, seed);
    if (programCount == 8)
        __seed4(state, 4, seed ^ 0xbeeff00d);
+    if (programCount == 16) {
+        __seed4(state, 4,  seed ^ 0xbeeff00d);
+        __seed4(state, 8,  ((seed & 0xffff) << 16) | (seed >> 16));
+        __seed4(state, 12, (((seed & 0xff) << 24) | ((seed & 0xff00)  << 8) |
+                            ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24));
+    }
 }

 static inline void fastmath() {
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -107,6 +107,12 @@ ExprStmt::Print(int indent) const {
 }


+/** Returns the estimated cost of the wrapped expression; an ExprStmt
+    without an expression costs nothing.
+ */
+int
+ExprStmt::EstimateCost() const {
+    if (expr == NULL)
+        return 0;
+    return expr->EstimateCost();
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 // DeclStmt

@@ -399,12 +405,25 @@ DeclStmt::Print(int indent) const {
 }


+/** Sums the estimated costs of the initializer expressions of all of the
+    declarators in the declaration; a declarator with no initializer
+    contributes nothing.
+ */
+int
+DeclStmt::EstimateCost() const {
+    int total = 0;
+    for (unsigned int d = 0; d < declaration->declarators.size(); ++d) {
+        Expr *init = declaration->declarators[d]->initExpr;
+        if (init != NULL)
+            total += init->EstimateCost();
+    }
+    return total;
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 // IfStmt

-IfStmt::IfStmt(Expr *t, Stmt *ts, Stmt *fs, bool doUnif, SourcePos p) 
+IfStmt::IfStmt(Expr *t, Stmt *ts, Stmt *fs, bool checkCoherence, SourcePos p) 
     : Stmt(p), test(t), trueStmts(ts), falseStmts(fs), 
-      doCoherentCheck(doUnif && !g->opt.disableCoherentControlFlow) {
+      // The "all" check is only done when requested by the caller and not
+      // disabled globally.
+      doAllCheck(checkCoherence &&
+                 !g->opt.disableCoherentControlFlow),
+      // The "any" check applies to varying tests.  Both the test
+      // expression and its type may be NULL (IfStmt::EstimateCost()
+      // treats 'test' as optional), so guard both before dereferencing.
+      doAnyCheck(test != NULL && test->GetType() != NULL &&
+                 test->GetType()->IsVaryingType()) {
 }


@@ -436,23 +455,26 @@ IfStmt::EmitCode(FunctionEmitContext *ctx) const {

    ctx->SetDebugPos(pos);
    bool isUniform = testType->IsUniformType();
+
+    llvm::Value *testValue = test->GetValue(ctx);
+    if (testValue == NULL)
+        return;
+
    if (isUniform) {
        ctx->StartUniformIf(ctx->GetMask());
-        if (doCoherentCheck)
-            Warning(test->pos, "Uniform condition supplied to cif statement.");
+        if (doAllCheck)
+            Warning(test->pos, "Uniform condition supplied to \"cif\" statement.");

        // 'If' statements with uniform conditions are relatively
        // straightforward.  We evaluate the condition and then jump to
        // either the 'then' or 'else' clause depending on its value.
-        llvm::Value *vtest = test->GetValue(ctx);
-        if (vtest != NULL) {
        llvm::BasicBlock *bthen = ctx->CreateBasicBlock("if_then");
        llvm::BasicBlock *belse = ctx->CreateBasicBlock("if_else");
        llvm::BasicBlock *bexit = ctx->CreateBasicBlock("if_exit");

        // Jump to the appropriate basic block based on the value of
        // the 'if' test
-            ctx->BranchInst(bthen, belse, vtest);
+        ctx->BranchInst(bthen, belse, testValue);

        // Emit code for the 'true' case
        ctx->SetCurrentBasicBlock(bthen);
@@ -469,29 +491,10 @@ IfStmt::EmitCode(FunctionEmitContext *ctx) const {
        // Set the active basic block to the newly-created exit block
        // so that subsequent emitted code starts there.
        ctx->SetCurrentBasicBlock(bexit);
-        }
        ctx->EndIf();
    }
-    else {
-        // Code for 'If' statemnts with 'varying' conditions can be
-        // generated in two ways; one takes some care to see if all of the
-        // active program instances want to follow only the 'true' or
-        // 'false' cases, and the other always runs both cases but sets the
-        // mask appropriately.  The first case is handled by the
-        // IfStmt::emitCoherentTests() call, and the second is handled by
-        // IfStmt::emitMaskedTrueAndFalse().
-        llvm::Value *testValue = test->GetValue(ctx);
-        if (testValue) {
-            if (doCoherentCheck) 
-                emitCoherentTests(ctx, testValue);
-            else {
-                llvm::Value *oldMask = ctx->GetMask();
-                ctx->StartVaryingIf(oldMask);
-                emitMaskedTrueAndFalse(ctx, oldMask, testValue);
-                ctx->EndIf();
-            }
-        }
-    }
+    else
+        emitVaryingIf(ctx, testValue);
 }


@@ -535,9 +538,17 @@ Stmt *IfStmt::TypeCheck() {
 }


+/** Returns the sum of the estimated costs of the test expression and of
+    the 'true' and 'false' statements; any of the three that is absent
+    contributes nothing.
+ */
+int
+IfStmt::EstimateCost() const {
+    int cost = 0;
+    if (test != NULL)
+        cost += test->EstimateCost();
+    if (trueStmts != NULL)
+        cost += trueStmts->EstimateCost();
+    if (falseStmts != NULL)
+        cost += falseStmts->EstimateCost();
+    return cost;
+}
+
+
 void
 IfStmt::Print(int indent) const {
-    printf("%*cIf Stmt %s", indent, ' ', doCoherentCheck ? "DO COHERENT CHECK" : "");
+    printf("%*cIf Stmt %s", indent, ' ', doAllCheck ? "DO ALL CHECK" : "");
    pos.Print();
    printf("\n%*cTest: ", indent+4, ' ');
    test->Print();
@@ -554,7 +565,7 @@ IfStmt::Print(int indent) const {


 /** Emit code to run both the true and false statements for the if test,
-    with the mask set appropriately before runnign each one. 
+    with the mask set appropriately before running each one. 
 */
 void
 IfStmt::emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask, 
@@ -574,11 +585,185 @@ IfStmt::emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask,
 }


+/** Similar to the Stmt variant of this function, this conservatively
+    checks to see if it's safe to run the code for the given Expr even if
+    the mask is 'all off'.
+ */
+static bool
+lSafeToRunWithAllLanesOff(Expr *expr) {
+    // A missing expression is conservatively reported as unsafe.
+    if (expr == NULL)
+        return false;
+
+    // The checks below try each concrete Expr subclass in turn via
+    // dynamic_cast; composite expressions are safe only if all of the
+    // sub-expressions examined here are safe.
+    UnaryExpr *ue;
+    if ((ue = dynamic_cast<UnaryExpr *>(expr)) != NULL)
+        return lSafeToRunWithAllLanesOff(ue->expr);
+
+    BinaryExpr *be;
+    if ((be = dynamic_cast<BinaryExpr *>(expr)) != NULL)
+        return (lSafeToRunWithAllLanesOff(be->arg0) &&
+                lSafeToRunWithAllLanesOff(be->arg1));
+
+    AssignExpr *ae;
+    if ((ae = dynamic_cast<AssignExpr *>(expr)) != NULL)
+        return (lSafeToRunWithAllLanesOff(ae->lvalue) &&
+                lSafeToRunWithAllLanesOff(ae->rvalue));
+
+    SelectExpr *se;
+    if ((se = dynamic_cast<SelectExpr *>(expr)) != NULL)
+        return (lSafeToRunWithAllLanesOff(se->test) && 
+                lSafeToRunWithAllLanesOff(se->expr1) && 
+                lSafeToRunWithAllLanesOff(se->expr2));
+
+    ExprList *el;
+    if ((el = dynamic_cast<ExprList *>(expr)) != NULL) {
+        for (unsigned int i = 0; i < el->exprs.size(); ++i)
+            if (!lSafeToRunWithAllLanesOff(el->exprs[i]))
+                return false;
+        return true;
+    }
+
+    FunctionCallExpr *fce;
+    if ((fce = dynamic_cast<FunctionCallExpr *>(expr)) != NULL)
+        // FIXME: If we could somehow determine that the function being
+        // called was safe (and all of the args Exprs were safe), then it'd
+        // be nice to be able to return true here.  (Consider a call to
+        // e.g. floatbits() in the stdlib.)  Unfortunately for now we just
+        // have to be conservative.
+        return false;
+
+    IndexExpr *ie;
+    if ((ie = dynamic_cast<IndexExpr *>(expr)) != NULL) {
+        // If we can determine at compile time the size of the array/vector
+        // and if the indices are compile-time constants, then we may be
+        // able to safely run this under a predicated if statement.
+        // NOTE(review): only the index is examined here; the
+        // arrayOrVector expression itself is not recursively checked.
+        if (ie->arrayOrVector == NULL)
+            return false;
+
+        const Type *type = ie->arrayOrVector->GetType();
+        ConstExpr *ce = dynamic_cast<ConstExpr *>(ie->index);
+        // Unknown type or a non-constant index: can't prove it's in bounds.
+        if (type == NULL || ce == NULL)
+            return false;
+        // Look through a reference to the type being referred to.
+        if (dynamic_cast<const ReferenceType *>(type) != NULL)
+            type = type->GetReferenceTarget();
+
+        const SequentialType *seqType = 
+            dynamic_cast<const SequentialType *>(type);
+        assert(seqType != NULL);
+        int nElements = seqType->GetElementCount();
+        if (nElements == 0)
+            // Unsized array, so we can't be sure
+            return false;
+
+        // Every constant index value must lie within [0, nElements).
+        // (AsInt32() presumably returns how many entries of 'indices'
+        // it filled in -- confirm against ConstExpr.)
+        int32_t indices[ISPC_MAX_NVEC];
+        int count = ce->AsInt32(indices);
+        for (int i = 0; i < count; ++i)
+            if (indices[i] < 0 || indices[i] >= nElements)
+                return false;
+
+        // All indices are in-bounds
+        return true;
+    }
+
+    MemberExpr *me;
+    if ((me = dynamic_cast<MemberExpr *>(expr)) != NULL)
+        return lSafeToRunWithAllLanesOff(me->expr);
+
+    // Constants are always safe.
+    if (dynamic_cast<ConstExpr *>(expr) != NULL)
+        return true;
+
+    TypeCastExpr *tce;
+    if ((tce = dynamic_cast<TypeCastExpr *>(expr)) != NULL)
+        return lSafeToRunWithAllLanesOff(tce->expr);
+
+    ReferenceExpr *re;
+    if ((re = dynamic_cast<ReferenceExpr *>(expr)) != NULL)
+        return lSafeToRunWithAllLanesOff(re->expr);
+
+    DereferenceExpr *dre;
+    if ((dre = dynamic_cast<DereferenceExpr *>(expr)) != NULL)
+        return lSafeToRunWithAllLanesOff(dre->expr);
+
+    // These leaf expression kinds are treated as unconditionally safe.
+    if (dynamic_cast<SymbolExpr *>(expr) != NULL ||
+        dynamic_cast<FunctionSymbolExpr *>(expr) != NULL ||
+        dynamic_cast<SyncExpr *>(expr) != NULL)
+        return true;
+
+    // Any Expr subclass not handled above is a bug in this function; new
+    // Expr types need a case added here.
+    FATAL("Unknown Expr type in lSafeToRunWithAllLanesOff()");
+    return false;
+}
+
+
+/** Given an arbitrary statement, this function conservatively tests to see
+    if it's safe to run the code for the statement even if the mask is all
+    off.  Here we just need to determine which kind of statement we have
+    and recursively traverse it and/or the expressions inside of it.
+ */
+static bool
+lSafeToRunWithAllLanesOff(Stmt *stmt) {
+    // An absent statement (e.g. a missing 'else' clause) is trivially safe.
+    if (stmt == NULL)
+        return true;
+
+    // Try each concrete Stmt subclass in turn; a compound statement is
+    // safe only if every expression and statement inside of it is.
+    ExprStmt *es;
+    if ((es = dynamic_cast<ExprStmt *>(stmt)) != NULL)
+        return lSafeToRunWithAllLanesOff(es->expr);
+
+    DeclStmt *ds;
+    if ((ds = dynamic_cast<DeclStmt *>(stmt)) != NULL) {
+        // NOTE(review): a declarator with a NULL initExpr goes to the
+        // Expr overload, which returns false for NULL, so a declaration
+        // without an initializer is reported as unsafe -- confirm that
+        // this conservatism is intended.
+        for (unsigned int i = 0; i < ds->declaration->declarators.size(); ++i)
+            if (!lSafeToRunWithAllLanesOff(ds->declaration->declarators[i]->initExpr))
+                return false;
+        return true;
+    }
+
+    IfStmt *is;
+    if ((is = dynamic_cast<IfStmt *>(stmt)) != NULL)
+        return (lSafeToRunWithAllLanesOff(is->test) &&
+                lSafeToRunWithAllLanesOff(is->trueStmts) &&
+                lSafeToRunWithAllLanesOff(is->falseStmts));
+
+    DoStmt *dos;
+    if ((dos = dynamic_cast<DoStmt *>(stmt)) != NULL)
+        return (lSafeToRunWithAllLanesOff(dos->testExpr) &&
+                lSafeToRunWithAllLanesOff(dos->bodyStmts));
+
+    ForStmt *fs;
+    if ((fs = dynamic_cast<ForStmt *>(stmt)) != NULL)
+        return (lSafeToRunWithAllLanesOff(fs->init) &&
+                lSafeToRunWithAllLanesOff(fs->test) &&
+                lSafeToRunWithAllLanesOff(fs->step) &&
+                lSafeToRunWithAllLanesOff(fs->stmts));
+
+    // 'break' and 'continue' are treated as unconditionally safe.
+    if (dynamic_cast<BreakStmt *>(stmt) != NULL ||
+        dynamic_cast<ContinueStmt *>(stmt) != NULL)
+        return true;
+
+    ReturnStmt *rs;
+    if ((rs = dynamic_cast<ReturnStmt *>(stmt)) != NULL)
+        return lSafeToRunWithAllLanesOff(rs->val);
+
+    StmtList *sl;
+    if ((sl = dynamic_cast<StmtList *>(stmt)) != NULL) {
+        const std::vector<Stmt *> &sls = sl->GetStatements();
+        for (unsigned int i = 0; i < sls.size(); ++i)
+            if (!lSafeToRunWithAllLanesOff(sls[i]))
+                return false;
+        return true;
+    }
+
+    // A print statement is as safe as the values passed to it.
+    PrintStmt *ps;
+    if ((ps = dynamic_cast<PrintStmt *>(stmt)) != NULL)
+        return lSafeToRunWithAllLanesOff(ps->values);
+
+    // Any Stmt subclass not handled above is a bug in this function; new
+    // Stmt types need a case added here.
+    FATAL("Unexpected stmt type in lSafeToRunWithAllLanesOff()");
+    return false;
+}
+
+
 /** Emit code for an if test that checks the mask and the test values and
    tries to be smart about jumping over code that doesn't need to be run.
 */
 void
-IfStmt::emitCoherentTests(FunctionEmitContext *ctx, llvm::Value *ltest) const {
+IfStmt::emitVaryingIf(FunctionEmitContext *ctx, llvm::Value *ltest) const {
    llvm::Value *oldMask = ctx->GetMask();
    if (oldMask == LLVMMaskAllOn) {
        // We can tell that the mask is on statically at compile time; just
@@ -587,7 +772,7 @@ IfStmt::emitCoherentTests(FunctionEmitContext *ctx, llvm::Value *ltest) const {
        emitMaskAllOn(ctx, ltest, bDone);
        ctx->SetCurrentBasicBlock(bDone);
    }
-    else {
+    else if (doAllCheck) {
        // We can't tell if the mask going into the if is all on at the
        // compile time.  Emit code to check for this and then either run
        // the code for the 'all on' or the 'mixed' case depending on the
@@ -619,6 +804,43 @@ IfStmt::emitCoherentTests(FunctionEmitContext *ctx, llvm::Value *ltest) const {
        // paths above jump to when they're done.
        ctx->SetCurrentBasicBlock(bDone);
    }
+    else if (trueStmts != NULL || falseStmts != NULL) {
+        // If there is nothing that is potentially unsafe to run with all
+        // lanes off in the true and false statements and if the total
+        // complexity of those two is relatively simple, then we'll go
+        // ahead and emit straightline code that runs both sides, updating
+        // the mask accordingly.  This is useful for efficiently compiling
+        // things like:
+        //
+        // if (foo) x = 0;
+        // else     ++x;
+        //
+        // Where the overhead of checking if any of the program instances wants
+        // to run one side or the other is more than the actual computation.
+        // The lSafeToRunWithAllLanesOff() checks to make sure that we don't do this
+        // for potentially dangerous code like:
+        //
+        // if (index < count) array[index] = 0;
+        //
+        // where our use of blend for conditional assignments doesn't check
+        // for the 'all lanes off' case.
+        if (lSafeToRunWithAllLanesOff(trueStmts) &&
+            lSafeToRunWithAllLanesOff(falseStmts) &&
+            (((trueStmts ? trueStmts->EstimateCost() : 0) + 
+              (falseStmts ? falseStmts->EstimateCost() : 0)) < 
+             PREDICATE_SAFE_IF_STATEMENT_COST)) {
+            ctx->StartVaryingIf(oldMask);
+            emitMaskedTrueAndFalse(ctx, oldMask, ltest);
+            assert(ctx->GetCurrentBasicBlock());
+            ctx->EndIf();
+        }
+        else {
+            assert(doAnyCheck);
+            llvm::BasicBlock *bDone = ctx->CreateBasicBlock("if_done");
+            emitMaskMixed(ctx, oldMask, ltest, bDone);
+            ctx->SetCurrentBasicBlock(bDone);
+        }
+    }
 }


@@ -677,69 +899,50 @@ IfStmt::emitMaskAllOn(FunctionEmitContext *ctx, llvm::Value *ltest,
 }


-/** Emits code that checks to see if for all of the lanes where the mask is
-    on, the test has the value true.
- */
-static llvm::Value *
-lTestMatchesMask(FunctionEmitContext *ctx, llvm::Value *test, llvm::Value *mask) {
-    llvm::Value *testAndMask = ctx->BinaryOperator(llvm::Instruction::And, test,
-                                                   mask, "test&mask");
-    return ctx->MasksAllEqual(testAndMask, mask);
-}
-
-
 /** Emit code for an 'if' test where the lane mask is known to be mixed
    on/off going into it.
 */
 void
 IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask, 
                      llvm::Value *ltest, llvm::BasicBlock *bDone) const {
-    // First, see if, for all of the lanes where the mask is on, if the
-    // value of the test is on.  (i.e. (test&mask) == mask).  In this case,
-    // we only need to run the 'true' case code, since the lanes where the
-    // test was false aren't supposed to be running here anyway.
-     llvm::Value *testAllEqual = lTestMatchesMask(ctx, ltest, oldMask);
-    llvm::BasicBlock *bTestAll = ctx->CreateBasicBlock("cif_mixed_test_all");
-    llvm::BasicBlock *bTestAnyCheck = ctx->CreateBasicBlock("cif_mixed_test_any_check");
-    ctx->BranchInst(bTestAll, bTestAnyCheck, testAllEqual);
+    ctx->StartVaryingIf(oldMask);
+    llvm::BasicBlock *bNext = ctx->CreateBasicBlock("safe_if_after_true");
+    if (trueStmts != NULL) {
+        llvm::BasicBlock *bRunTrue = ctx->CreateBasicBlock("safe_if_run_true");
+        ctx->MaskAnd(oldMask, ltest);

-    // Emit code for the (test&mask)==mask case.  Not only do we only need
-    // to emit code for the true statements, but we don't need to modify
-    // the mask's value; it's already correct.
-    ctx->SetCurrentBasicBlock(bTestAll);
-    ctx->StartVaryingIf(ctx->GetMask());
-    lEmitIfStatements(ctx, trueStmts, "cif: all running lanes want just true stmts");
+        // Do any of the program instances want to run the 'true'
+        // block?  If not, jump ahead to bNext.
+        llvm::Value *maskAnyQ = ctx->Any(ctx->GetMask());
+        ctx->BranchInst(bRunTrue, bNext, maskAnyQ);
+
+        // Emit statements for true
+        ctx->SetCurrentBasicBlock(bRunTrue);
+        lEmitIfStatements(ctx, trueStmts, "if: expr mixed, true statements");
        assert(ctx->GetCurrentBasicBlock()); 
-    ctx->EndIf();
-    ctx->BranchInst(bDone);
+        ctx->BranchInst(bNext);
+        ctx->SetCurrentBasicBlock(bNext);
+    }
+    if (falseStmts != NULL) {
+        llvm::BasicBlock *bRunFalse = ctx->CreateBasicBlock("safe_if_run_false");
+        bNext = ctx->CreateBasicBlock("safe_if_after_false");
+        ctx->MaskAndNot(oldMask, ltest);

-    // Next, see if the active lanes only need to run the false case--i.e. if
-    // (~test & mask) == mask.
-    ctx->SetCurrentBasicBlock(bTestAnyCheck);
-    llvm::Value *notTest = ctx->BinaryOperator(llvm::Instruction::Xor, LLVMMaskAllOn,
-                                               ltest, "~test");
-    llvm::Value *notMatchesMask = lTestMatchesMask(ctx, notTest, oldMask);
-    llvm::BasicBlock *bTestAllNot = ctx->CreateBasicBlock("cif_mixed_test_none");
-    llvm::BasicBlock *bTestMixed = ctx->CreateBasicBlock("cif_mixed_test_mixed");
-    ctx->BranchInst(bTestAllNot, bTestMixed, notMatchesMask);
+        // Similarly, check to see if any of the instances want to
+        // run the 'false' block...
+        llvm::Value *maskAnyQ = ctx->Any(ctx->GetMask());
+        ctx->BranchInst(bRunFalse, bNext, maskAnyQ);

-    // Emit code for the (~test & mask) == mask case.  We only need the
-    // 'false' statements and again don't need to modify the value of the
-    // mask.
-    ctx->SetCurrentBasicBlock(bTestAllNot);
-    ctx->StartVaryingIf(ctx->GetMask());
-    lEmitIfStatements(ctx, falseStmts, "cif: all running lanes want just false stmts");
+        // Emit code for false
+        ctx->SetCurrentBasicBlock(bRunFalse);
+        lEmitIfStatements(ctx, falseStmts, "if: expr mixed, false statements");
        assert(ctx->GetCurrentBasicBlock());
-    ctx->EndIf();
+        ctx->BranchInst(bNext);
+        ctx->SetCurrentBasicBlock(bNext);
+    }
    ctx->BranchInst(bDone);
-
-    // It's mixed; we need to run both the true and false cases and also do
-    // mask update stuff.
-    ctx->SetCurrentBasicBlock(bTestMixed);
-    ctx->StartVaryingIf(ctx->GetMask());
-    emitMaskedTrueAndFalse(ctx, oldMask, ltest);
+    ctx->SetCurrentBasicBlock(bDone);
    ctx->EndIf();
-    ctx->BranchInst(bDone);
 }


@@ -955,6 +1158,13 @@ DoStmt::TypeCheck() {
 }


+/** Returns the sum of the estimated costs of the loop's test expression
+    and of its body; either one contributes nothing if it is absent.
+ */
+int
+DoStmt::EstimateCost() const {
+    int cost = 0;
+    if (testExpr != NULL)
+        cost += testExpr->EstimateCost();
+    if (bodyStmts != NULL)
+        cost += bodyStmts->EstimateCost();
+    return cost;
+}
+
+
 void
 DoStmt::Print(int indent) const {
    printf("%*cDo Stmt", indent, ' ');
@@ -1162,6 +1372,20 @@ ForStmt::TypeCheck() {
 }


+/** Estimates the cost of the loop as the sum of the costs of its
+    initializer, test, step and body (each counted once), plus a fixed
+    per-loop charge that depends on whether the loop is treated as
+    uniform (COST_UNIFORM_LOOP) or varying (COST_VARYING_LOOP).
+ */
+int
+ForStmt::EstimateCost() const {
+    bool uniformTest;
+    if (test != NULL) {
+        // GetType() can return NULL (other code in this file checks for
+        // that), so don't dereference it blindly; a test whose type is
+        // unknown is charged as a varying loop.
+        const Type *testType = test->GetType();
+        uniformTest = (testType != NULL && testType->IsUniformType());
+    }
+    else
+        // With no test expression, the loop counts as uniform unless
+        // uniform control flow is disabled or the body contains a
+        // varying break/continue.
+        uniformTest = (!g->opt.disableUniformControlFlow &&
+                       !lHasVaryingBreakOrContinue(stmts));
+
+    return ((init ? init->EstimateCost() : 0) +
+            (test ? test->EstimateCost() : 0) +
+            (step ? step->EstimateCost() : 0) +
+            (stmts ? stmts->EstimateCost() : 0) +
+            (uniformTest ? COST_UNIFORM_LOOP : COST_VARYING_LOOP));
+}
+
+
 void
 ForStmt::Print(int indent) const {
    printf("%*cFor Stmt", indent, ' ');
@@ -1216,6 +1440,13 @@ BreakStmt::TypeCheck() {
 }


+/** Returns a fixed cost for the 'break', chosen by whether this statement
+    does the coherence check.
+ */
+int
+BreakStmt::EstimateCost() const {
+    if (doCoherenceCheck)
+        return COST_COHERENT_BREAK_CONTINE;
+    return COST_REGULAR_BREAK_CONTINUE;
+}
+
+
 void
 BreakStmt::Print(int indent) const {
    printf("%*c%sBreak Stmt", indent, ' ', doCoherenceCheck ? "Coherent " : "");
@@ -1254,6 +1485,13 @@ ContinueStmt::TypeCheck() {
 }


+/** Returns a fixed cost for the 'continue', chosen by whether this
+    statement does the coherence check.
+ */
+int
+ContinueStmt::EstimateCost() const {
+    if (doCoherenceCheck)
+        return COST_COHERENT_BREAK_CONTINE;
+    return COST_REGULAR_BREAK_CONTINUE;
+}
+
+
 void
 ContinueStmt::Print(int indent) const {
    printf("%*c%sContinue Stmt", indent, ' ', doCoherenceCheck ? "Coherent " : "");
@@ -1300,6 +1538,12 @@ ReturnStmt::TypeCheck() {
 }


+/** Returns COST_RETURN plus the estimated cost of the returned value's
+    expression, if there is one.
+ */
+int
+ReturnStmt::EstimateCost() const {
+    int cost = COST_RETURN;
+    if (val != NULL)
+        cost += val->EstimateCost();
+    return cost;
+}
+
+
 void
 ReturnStmt::Print(int indent) const {
    printf("%*c%sReturn Stmt", indent, ' ', doCoherenceCheck ? "Coherent " : "");
@@ -1345,6 +1589,16 @@ StmtList::TypeCheck() {
 }


+/** Returns the sum of the estimated costs of all of the non-NULL
+    statements in the list.
+ */
+int
+StmtList::EstimateCost() const {
+    int total = 0;
+    for (std::vector<Stmt *>::const_iterator iter = stmts.begin();
+         iter != stmts.end(); ++iter) {
+        if (*iter != NULL)
+            total += (*iter)->EstimateCost();
+    }
+    return total;
+}
+
+
 void
 StmtList::Print(int indent) const {
    printf("%*cStmt List", indent, ' ');
@@ -1545,3 +1799,11 @@ PrintStmt::TypeCheck() {
        values = values->TypeCheck();
    return this;
 }
+
+
+/** Returns COST_FUNCALL plus the estimated cost of the expression holding
+    the values to print, if there is one.
+ */
+int
+PrintStmt::EstimateCost() const {
+    int cost = COST_FUNCALL;
+    if (values != NULL)
+        cost += values->EstimateCost();
+    return cost;
+}
+
+
--- a/stmt.h
+++ b/stmt.h
@@ -39,6 +39,7 @@
 #define ISPC_STMT_H 1

 #include "ispc.h"
+#include "ast.h"

 /** @brief Interface class for statements in the ispc language.

@@ -75,8 +76,8 @@ public:

    Stmt *Optimize();
    Stmt *TypeCheck();
+    int EstimateCost() const;

-private:
    Expr *expr;
 };

@@ -92,8 +93,8 @@ public:

    Stmt *Optimize();
    Stmt *TypeCheck();
+    int EstimateCost() const;

-private:
    Declaration *declaration;
 };

@@ -103,13 +104,14 @@ private:
 class IfStmt : public Stmt {
 public:
    IfStmt(Expr *testExpr, Stmt *trueStmts, Stmt *falseStmts,
-           bool doCoherentCheck, SourcePos pos);
+           bool doAllCheck, SourcePos pos);

    void EmitCode(FunctionEmitContext *ctx) const;
    void Print(int indent) const;

    Stmt *Optimize();
    Stmt *TypeCheck();
+    int EstimateCost() const;

    // @todo these are only public for lHasVaryingBreakOrContinue(); would
    // be nice to clean that up...
@@ -125,11 +127,12 @@ private:
        source and thus, if the emitted code should check to see if all
        active program instances want to follow just one of the 'true' or
        'false' blocks. */
-    const bool doCoherentCheck;
+    const bool doAllCheck;
+    const bool doAnyCheck;

    void emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask, 
                                llvm::Value *test) const;
-    void emitCoherentTests(FunctionEmitContext *ctx, llvm::Value *test) const;
+    void emitVaryingIf(FunctionEmitContext *ctx, llvm::Value *test) const;
    void emitMaskAllOn(FunctionEmitContext *ctx,
                       llvm::Value *test, llvm::BasicBlock *bDone) const;
    void emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask, 
@@ -150,8 +153,8 @@ public:

    Stmt *Optimize();
    Stmt *TypeCheck();
+    int EstimateCost() const;

-private:
    Expr *testExpr;
    Stmt *bodyStmts;
    const bool doCoherentCheck;
@@ -171,8 +174,8 @@ public:

    Stmt *Optimize();
    Stmt *TypeCheck();
+    int EstimateCost() const;

-private:
    /** 'for' statment initializer; may be NULL, indicating no intitializer */
    Stmt *init;
    /** expression that returns a value indicating whether the loop should
@@ -198,6 +201,7 @@ public:

    Stmt *Optimize();
    Stmt *TypeCheck();
+    int EstimateCost() const;

 private:
    /** This indicates whether the generated code will check to see if no
@@ -219,6 +223,7 @@ public:

    Stmt *Optimize();
    Stmt *TypeCheck();
+    int EstimateCost() const;

 private:
    /** This indicates whether the generated code will check to see if no
@@ -240,8 +245,8 @@ public:

    Stmt *Optimize();
    Stmt *TypeCheck();
+    int EstimateCost() const;

-private:
    Expr *val;
    /** This indicates whether the generated code will check to see if no
        more program instances are currently running after the return, in
@@ -262,6 +267,7 @@ public:

    Stmt *Optimize();
    Stmt *TypeCheck();
+    int EstimateCost() const;

    void Add(Stmt *s) { if (s) stmts.push_back(s); }
    const std::vector<Stmt *> &GetStatements() { return stmts; }
@@ -289,8 +295,8 @@ public:

    Stmt *Optimize();
    Stmt *TypeCheck();
+    int EstimateCost() const;

-private:
    /** Format string for the print() statement. */
    const std::string format;
    /** This holds the arguments passed to the print() statement.  If more
--- a/sym.cpp
+++ b/sym.cpp
@@ -43,13 +43,14 @@
 ///////////////////////////////////////////////////////////////////////////
 // Symbol

-Symbol::Symbol(const std::string &n, SourcePos p, const Type *t) 
+Symbol::Symbol(const std::string &n, SourcePos p, const Type *t, 
+               StorageClass sc) 
  : pos(p), name(n) {
    storagePtr = NULL;
-    function = NULL;
+    function = exportedFunction = NULL;
    type = t;
    constValue = NULL;
-    isStatic = false;
+    storageClass = sc;
    varyingCFDepth = 0;
 }

--- a/sym.h
+++ b/sym.h
@@ -41,6 +41,7 @@
 #define ISPC_SYM_H

 #include "ispc.h"
+#include "decl.h"
 #include <map>

 class StructType;
@@ -63,7 +64,8 @@ class Symbol {
 public:
    /** The Symbol constructor takes the name of the symbol, its
        position in a source file, and its type (if known). */
-    Symbol(const std::string &name, SourcePos pos, const Type *t = NULL);
+    Symbol(const std::string &name, SourcePos pos, const Type *t = NULL,
+           StorageClass sc = SC_NONE);

    /** This method should only be called for function symbols; for them,
        it returns a mangled version of the function name with the argument
@@ -81,6 +83,11 @@ public:
    llvm::Function *function; /*!< For symbols that represent functions,
                                   this stores the LLVM Function value for
                                   the symbol once it has been created. */ 
+    llvm::Function *exportedFunction;
+                              /*!< For symbols that represent functions with
+                                   'export' qualifiers, this points to the LLVM
+                                   Function for the application-callable version
+                                   of the function. */
    const Type *type;         /*!< The type of the symbol; if not set by the
                                   constructor, this is set after the
                                   declaration around the symbol has been parsed.  */
@@ -93,8 +100,8 @@ public:
                                   storagePtr member will be its constant value.  (This
                                   messiness is due to needing an ispc ConstExpr for the early 
                                   constant folding optimizations). */
-    bool isStatic;            /*!< Records whether this symbol had a static qualifier in
-                                   its declaration. */
+    StorageClass storageClass;/*!< Records the storage class (if any) provided with the
+                                   symbol's declaration. */
    int varyingCFDepth;       /*!< This member records the number of levels of nested 'varying' 
                                   control flow within which the symbol was declared.  Having
                                   this value available makes it possible to avoid performing
@@ -186,6 +193,14 @@ public:
        void GetMatchingFunctions(Predicate pred, 
                                  std::vector<Symbol *> *matches) const;

+    /** Returns all of the variable symbols in the symbol table that match
+        the given predicate.  The predicate is defined as in the
+        GetMatchingFunctions() method.
+     */
+    template <typename Predicate> 
+        void GetMatchingVariables(Predicate pred, 
+                                  std::vector<Symbol *> *matches) const;
+
    /** Adds the named type to the symbol table.  This is used for both
        struct definitions (where <tt>struct Foo</tt> causes type \c Foo to
        be added to the symbol table) as well as for <tt>typedef</tt>s.
@@ -251,8 +266,8 @@ private:
 };


-template <typename Predicate> 
-void SymbolTable::GetMatchingFunctions(Predicate pred, 
+template <typename Predicate> void
+SymbolTable::GetMatchingFunctions(Predicate pred, 
                                  std::vector<Symbol *> *matches) const {
    // Iterate through all function symbols and apply the given predicate.
    // If it returns true, add the Symbol * to the provided vector.
@@ -266,4 +281,14 @@ void SymbolTable::GetMatchingFunctions(Predicate pred,
    }
 }

+
+// Walks every entry of every container held in 'variables', in order, and
+// appends to 'matches' each symbol for which the predicate returns true.
+template <typename Predicate> void
+SymbolTable::GetMatchingVariables(Predicate pred, 
+                                  std::vector<Symbol *> *matches) const {
+    for (unsigned int level = 0; level < variables.size(); ++level) {
+        for (unsigned int k = 0; k < variables[level]->size(); ++k) {
+            Symbol *sym = (*variables[level])[k];
+            if (pred(sym))
+                matches->push_back(sym);
+        }
+    }
+}
+
 #endif // ISPC_SYM_H
--- a/test_static.cpp
+++ b/test_static.cpp
@@ -31,9 +31,21 @@
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 */

+#if defined(_WIN32) || defined(_WIN64)
+#define ISPC_IS_WINDOWS
+#elif defined(__linux__)
+#define ISPC_IS_LINUX
+#elif defined(__APPLE__)
+#define ISPC_IS_APPLE
+#endif
+
 #include <string.h>
 #include <stdio.h>
 #include <assert.h>
+#include <stdint.h>
+#ifdef ISPC_IS_LINUX
+#include <malloc.h>
+#endif

 extern "C" {
    extern int width();
@@ -46,20 +58,44 @@ extern "C" {
    extern void f_di(float *result, double *a, int *b);
    extern void result(float *val);
    
-    void ISPCLaunch(void *f, void *d);
-    void ISPCSync();
+    void ISPCLaunch(void **handlePtr, void *f, void *d, int);
+    void ISPCSync(void *handle);
+    void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment);
 }

-void ISPCLaunch(void *f, void *d) {
-    typedef void (*TaskFuncType)(void *, int, int);
+void ISPCLaunch(void **handle, void *f, void *d, int count) {
+    *handle = (void *)0xdeadbeef;
+    typedef void (*TaskFuncType)(void *, int, int, int, int);
    TaskFuncType func = (TaskFuncType)f;
-    func(d, 0, 1);
+    for (int i = 0; i < count; ++i)
+        func(d, 0, 1, i, count);
 }

-void ISPCSync() {
+void ISPCSync(void *) {
 }


+/** Test-harness implementation of the allocation entry point declared in
+    the extern "C" block above.  Stores a dummy non-NULL value in *handle
+    and returns a block of at least \c size bytes aligned to \c alignment
+    (assumed to be a power of two), or NULL if allocation fails.  The
+    memory is deliberately never freed.
+ */
+void *ISPCAlloc(void **handle, int64_t size, int32_t alignment) {
+    *handle = (void *)0xdeadbeef;
+    // and now, we leak...
+#if defined(ISPC_IS_WINDOWS)
+    return _aligned_malloc(size, alignment);
+#elif defined(ISPC_IS_LINUX)
+    return memalign(alignment, size);
+#else
+    // OS X and any other platform: over-allocate with malloc() and align
+    // by hand.  The offset added to amem below is in [1, alignment], so a
+    // full 'alignment' bytes of slack (not alignment-1) is needed on top
+    // of the slot that records the original pointer.
+    void *mem = malloc(size + alignment + sizeof(void*));
+    if (mem == NULL)
+        return NULL;
+    char *amem = ((char*)mem) + sizeof(void*);
+    amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
+                                        (alignment - 1)));
+    // Stash the pointer malloc() returned just below the aligned block.
+    ((void**)amem)[-1] = mem;
+    return amem;
+#endif
+}
+
+
+
 int main(int argc, char *argv[]) {
    int w = width();
    assert(w <= 16);
--- a/tests/array-1.ispc
+++ b/tests/array-1.ispc
@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }



-static float x[2][1];
+static float x[1][2];

 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
--- a/tests/array-scatter-vary.ispc
+++ b/tests/array-scatter-vary.ispc
@@ -13,7 +13,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 }

    
-export void result(uniform float RET[4]) { 
+export void result(uniform float RET[]) { 
    RET[programIndex] = 0;
    RET[3] = 4;
    RET[4] = 5;
--- a/tests/atomics-1.ispc
+++ b/tests/atomics-1.ispc
@@ -5,7 +5,8 @@ uniform unsigned int32 s = 0;

 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
-    float b = atomic_add_global(s, 1);
+    float delta = 1;
+    float b = atomic_add_global(s, delta);
    RET[programIndex] = reduce_add(b);
 }

--- a/tests/atomics-10.ispc
+++ b/tests/atomics-10.ispc
@@ -6,8 +6,9 @@ uniform unsigned int32 s = 0;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
    float b = 0;
+    float delta = 1;
    if (programIndex < 2)
-        b = atomic_add_global(s, 1);
+        b = atomic_add_global(s, delta);
    RET[programIndex] = s;
 }

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Matt Pharr	790dba2558	Doxygen bump and release notes for v1.0.11	2011-10-07 09:57:55 -07:00
Matt Pharr	ce7355f9ed	Windows: fix examples build to look for ispc.exe in ../.. as well	2011-10-09 07:40:18 -07:00
Matt Pharr	6b4459d402	Windows: fix some compiler warnings during build	2011-10-09 07:40:17 -07:00
Matt Pharr	4a2cbf2c4e	Fix regression from AST checkin that caused perf. warnings to be issued for stdlib code.	2011-10-07 09:20:48 -07:00
Matt Pharr	53dd65fa2e	Add ispc_test to buildall.bat script	2011-10-08 17:17:05 -07:00
Matt Pharr	f5afa52fd9	Add missing header	2011-10-06 17:10:30 -07:00
Matt Pharr	f9c67ff806	Explicit representation of ASTs for all the functions in a compile unit. Added AST and Function classes. Now, we parse the whole file and build up the AST for all of the functions in the Module before we emit IR for the functions (vs. before, when we generated IR along the way as we parsed the source file.)	2011-10-06 15:35:27 -07:00
Matt Pharr	ec5e627e56	Mark internal stdlib functions as "internal" linkage, not "private". This fixes print() statements on OSX. (http://llvm.org/bugs/show_bug.cgi?id=11080)	2011-10-06 13:32:20 -07:00
Matt Pharr	ff2a43ac19	Run the CFG simplification pass even when optimization is disabled. This fixes an issue with undefined SVML symbols with code that called transcendental functions in the standard library, even when the SVML math library hadn't been selected.	2011-10-06 09:20:50 -07:00
Matt Pharr	9feea32471	Fix errors in documentation for some of the reduce_* stdlib functions	2011-10-06 07:52:10 -07:00
Matt Pharr	bedaec2295	Update examples for multi-target compilation. Makefile and vcxproj file updates. Also modified vcxproj files so that the various files ispc generates go into $(TargetDir), not the current directory. Modified the ray tracer example to not have uniform short-vector types in its app-visible datatypes (these are laid out differently on SSE vs AVX); there was an existing lurking bug in the way this was done before.	2011-10-04 16:01:56 -07:00
Matt Pharr	a68d137df6	Documentation update for multi-target compilation.	2011-10-04 16:01:56 -07:00
Matt Pharr	59caa3d4e1	Various small Windows fixes. Also fixed some tabs/spaces and compiler warning issues.	2011-10-04 16:01:56 -07:00
Matt Pharr	06975bc7ab	Add support for compiling to multiple targets. If a flag along the lines of "--target=sse4,avx-x2" is provided on the command-line, then the program will be compiled for each of the given targets, with a separate output file generated for each one. Further, an output file with dispatch functions that check the current system's CPU and then chooses the best available variant is also created. Issue #11.	2011-10-04 16:01:55 -07:00
Matt Pharr	880cbb18cc	Remove checks to see if system's processor matches the target the code was compiled for. (Preparation for multi-target output.)	2011-10-04 16:01:55 -07:00
Matt Pharr	686d9975b6	Add Symbol::exportedFunction member to hold llvm::Function * for app-callable version of function.	2011-10-04 15:56:54 -07:00
Matt Pharr	9b7f55a28e	Add buildall.bat script for Windows. Also various example build fixes for Windows	2011-10-04 11:42:04 -07:00
Matt Pharr	e4d224a0f1	Use __cilk to detect Cilk support	2011-10-04 11:16:42 -07:00
Matt Pharr	0933a77c1b	Improve task decomposition in ray tracing example. Specifically, launch all of the tasks in one statement, rather than still looping over spans in y and launching a collection of tasks across x for each span. This seems to give a few percent better performance.	2011-10-04 09:33:59 -07:00
Matt Pharr	5f78edf07a	Fix bug with screen decomposition in volume rendering example	2011-10-04 09:30:02 -07:00
Matt Pharr	a6fc657b40	Remove 'externGlobals' member from Module; instead find them when needed via new SymbolTable::GetMatchingVariables method.	2011-10-04 06:36:31 -07:00
Matt Pharr	fa5050d5c7	Error reporting improvements. Don't print more than 3 lines of source file context with errors. (Any more than that is almost certainly not the Right Thing to do.) Make some parsing error messages more clear.	2011-10-03 21:09:04 -07:00
Matt Pharr	d5a48d9a1e	Fix incorrect LLVM_3_0svn #ifdefs	2011-10-03 08:29:19 -07:00
Matt Pharr	2df9da2524	Be careful to not inadvertently match NULL functions in optimization passes.	2011-10-01 08:34:11 -07:00
Matt Pharr	0b02f94988	Task system performance tweaks. Switch back to GCD on OSX. Increase TaskInfo allocation count. This fixes the regression with deferred on AVX (from 17x to 25x again with 4 cores.)	2011-10-01 08:04:09 -07:00
Matt Pharr	65c50b60fc	Cleanups to deferred shading workload	2011-09-30 20:35:42 -07:00
Matt Pharr	9de34eb22c	Release notes and doxygen bump for v1.0.10	2011-09-30 19:42:14 -07:00
Matt Pharr	f8f25a11b6	Added deferred shading workload	2011-09-30 19:42:14 -07:00
Matt Pharr	cb7976bbf6	Added updated task launch implementation that now tracks task groups. Within each function that launches tasks, we now can easily track which tasks that function launched, so that the sync at the end of the function can just sync on the tasks launched by that function (not all tasks launched by all functions.) Implementing this led to a rework of the task system API that ispc generates code to call; the example task systems in examples/tasksys.cpp have been updated to conform to this API. (The updated API is also documented in the ispc user's guide.) As part of this, "launch[n]" syntax was added to launch a number of tasks in a single launch statement, rather than requiring a loop over 'n' to launch n tasks. This commit thus fixes issue #84 (enhancement to launch multiple tasks from a single launch statement) as well as issue #105 (recursive task launches were broken).	2011-09-30 11:20:53 -07:00
Matt Pharr	5ee4d7fce8	Add comment	2011-09-30 11:11:52 -07:00
Matt Pharr	8f3e46f67e	Use InterlockedExchangeAdd on Windows	2011-09-29 16:19:59 -07:00
Matt Pharr	9ed07ff2b5	Fix __num_cores() definition on Windows to not cause unresolved symbols	2011-09-29 13:35:50 -07:00
Matt Pharr	32a0a30cf5	Only allow exact matches for function overload resolution for builtins. The intent is that the code in stdlib.ispc that is calling out to the built-ins should match argument types exactly (using explicit casts as needed), just for maximal clarity/safety.	2011-09-28 17:20:31 -07:00
Matt Pharr	6d39d5fc3e	Small cleanups. Add __num_cores() to the list of symbols to remove from the module at the end. Fix declarations of mask type for 64-bit atomics to silence warnings.	2011-09-28 16:26:35 -07:00
Matt Pharr	c999c8a237	Add num_cores() stdlib routine. Issue #102 .	2011-09-28 16:16:58 -07:00
Matt Pharr	aad269fdf4	Added support for 'uniform' global atomics. Issue #93.	2011-09-28 16:06:07 -07:00
Matt Pharr	d45c536c47	Fix Windows debug build of simple example	2011-09-28 14:11:32 -07:00
Matt Pharr	f1b8e5b1bf	Release notes and doxygen bump for 1.0.9 release	2011-09-26 16:21:32 -07:00
Matt Pharr	e7a70b05af	Fix statically-linked tests on Linux	2011-09-26 16:11:45 -07:00
Matt Pharr	cf73286938	More small Windows build fixes. Also switch to LLVM 3.0 libs	2011-09-26 16:07:23 -07:00
Matt Pharr	e6f80c0adc	Remove stale include of MCJIT.h	2011-09-26 16:04:52 -07:00
Matt Pharr	5e31d7b6d0	Windows build: use LLVM_INSTALL_DIR to find clang.exe	2011-09-26 16:04:50 -07:00
Matt Pharr	649f2ad7b7	Update parser to make 'sync' a statement, not an expr.	2011-09-23 20:33:24 -07:00
Matt Pharr	fade1cdf1d	Pretty much all conversions to varying double are slow, so don't bother warning about them.	2011-09-23 16:03:35 -07:00
Matt Pharr	d261105a86	Error/warning reporting improvements. - Don't suggest matches when given an empty string or a single, non-alpha character. - Also fixed the parser to be a bit less confusing when it encounters an unexpected EOF.	2011-09-23 15:51:23 -07:00
Matt Pharr	b3d3e8987b	Provide a properly initialized TextDiagnosticPrinter to clang's preprocessor. Fixes issue #100 (crash when the preprocessor was trying to emit a diagnostic about a mismatched #if/#endif).	2011-09-23 15:50:18 -07:00
Matt Pharr	4e91f3777a	Fix BinaryExpr to handle reference-typed operands. Fixes issue #101.	2011-09-23 15:19:14 -07:00
Matt Pharr	5584240c7f	Fix crash with function declarations with unnamed parameters. Fixes issue #103. Previously, we were inadvertently grabbing the function's return type for the parameter, rather than the actual parameter type.	2011-09-23 15:05:59 -07:00
Matt Pharr	7126a39092	Disable PIC on Windows	2011-09-19 15:32:43 -07:00
Matt Pharr	8ad28a3f6f	update doxygen, release notes for 1.0.8 release	2011-09-19 15:22:25 -07:00
Matt Pharr	9921b8e530	Predicated 'if' statement performance improvements. Go back to running both sides of 'if' statements with masking and without branching if we can determine that the code is relatively simple (as per the simple cost model), and is safe to run even if the mask is 'all off'. This gives a bit of a performance improvement for some of the examples (most notably, the ray tracer), and is the code that one wants generated in this case anyhow.	2011-09-19 09:54:09 -07:00
Matt Pharr	9052d4b10b	Linux build fixes	2011-09-17 13:42:46 -07:00
Matt Pharr	2405dae8e6	Use malloc() to get space for task arguments when compiling to AVX. This is to work around the LLVM bug/limitation discussed in LLVM bug 10841 (http://llvm.org/bugs/show_bug.cgi?id=10841).	2011-09-17 13:38:51 -07:00
Matt Pharr	3607f3e045	Remove support for building with LLVM 2.8. Fixes issue #66 . Both 2.9 and top-of-tree generate substantially better code than LLVM 2.8 did, so it's not worth fixing the 2.8 build.	2011-09-17 13:18:59 -07:00
Matt Pharr	de84acfa5d	On OSX with LLVM 2.9, always generate position-independent code. Fixes Issue #99.	2011-09-17 13:03:51 -07:00
Matt Pharr	a501ab1aa6	Fix parenthesization bugs in cost estimates. Also added the debugging print that helped find these issues. Revert inlining some functions in examples	2011-09-16 19:07:07 -07:00
Matt Pharr	cdc850f98c	Inline some functions in examples	2011-09-16 17:02:21 -07:00
Matt Pharr	ca87579f23	Add a very simple cost model to estimate runtime cost of running code. This is currently only used to decide whether it's worth doing an "are all lanes running" check at the start of functions--for small functions, it's not worth the overhead. The cost is estimated relatively early in compilation (e.g. before we know if an array access is a scatter/gather or not, before constant folding, etc.), so there are many known shortcomings.	2011-09-16 15:09:17 -07:00
Matt Pharr	38fc13d1ab	Remove now unused function.	2011-09-16 14:21:13 -07:00
Matt Pharr	cf9d9f717e	Logic simplification to 'mixed true/false' case for coherent ifs. Use the approach from `173632f446` here as well.	2011-09-16 14:10:55 -07:00
Matt Pharr	173632f446	Generate more efficient code for regular varying 'if' statements. For the case where we have a regular (i.e. non-'cif') 'if' statement, the generated code just simply checks to see if any program instance is running before running the corresponding statements. This is a lighter-weight check than IfStmt::emitMaskMixed() was performing.	2011-09-16 12:03:42 -07:00
Matt Pharr	1dedd88132	Improve implementation of 'are both masks equal' check for AVX. Previously, we did a vector equal compare and then a movmsk, the result of which we checked to see if it was on for all lanes. Because masks are vectors of i32s, under AVX, the vector equal compare required two 4-wide SSE compares and some shuffling. Now, we do a movmsk of both masks first and then a scalar equality comparison of those two values, which seems to generate overall better code.	2011-09-15 06:25:02 -07:00
Matt Pharr	0848c2cc19	Actually make all 'if' statements check for 'all off' mask. Contrary to claims in `0c2048385`, that checkin didn't include the changes to not run if/else blocks if none of the program instances wanted to be running them. This checkin fixes that and thus actually fixes issue #74.	2011-09-13 19:48:04 -07:00
Matt Pharr	e2a88d491f	Mark the internal __fast_masked_vload function as static	2011-09-13 15:43:48 -07:00
Matt Pharr	30f9dcd4f5	Unroll loops by default, add --opt=disable-loop-unroll to disable. Issue #78.	2011-09-13 15:37:18 -07:00
Matt Pharr	0c344b6755	Fix Linux build of mandelbrot_tasks example	2011-09-13 15:17:30 -07:00
Matt Pharr	6734021520	Issue warning when compile-time constant out-of-bounds array index is used. Issue #98. Also fixes two examples that had bugs of this type that this warning uncovered!	2011-09-13 14:42:20 -07:00
Matt Pharr	dd153d3c5c	Handle more instruction types when flattening offset vectors. Generalize the lScalarizeVector() utility routine (used in determining when we can change gathers/scatters into vector loads/stores, respectively) to handle vector shuffles and vector loads. This fixes issue #79, which provided a case where a gather was being performed even though a vector load was possible.	2011-09-13 09:43:56 -07:00
Matt Pharr	9ca7541d52	Remove check for any program instances running before function calls. Given the change in `0c20483853`, this is no longer necessary, since we know that one instance will always be running if we're executing a given block of code.	2011-09-13 06:26:16 -07:00
Matt Pharr	0c20483853	Make all "if" statements "coherent" ifs. Workaround for issue #74 . Using blend to do masked stores is unsafe if all of the lanes are off: it may read from or write to invalid memory. For now, this workaround transforms all 'if' statements into coherent 'if's, ensuring that an instruction only runs if at least one program instance wants to be running it. One nice thing about this change is that a number of implementations of various builtins can be simplified, since they no longer need to confirm that at least one program instance is running. It might be nice to re-enable regular if statements in a future checkin, but we'd want to make sure they don't have any masked loads or blended masked stores in their statement lists. There isn't a performance impact for any of the examples with this change, so it's unclear if this is important. Note that this only impacts 'if' statements with a varying condition.	2011-09-12 16:25:08 -07:00
Matt Pharr	9d4ff1bc06	Fix alignment in usage message	2011-09-12 15:06:41 -07:00
Matt Pharr	83f22f1939	Add experimental --fast-masked-vload flag for SSE.	2011-09-12 12:29:33 -07:00
Matt Pharr	6375ed9224	AVX: Fix bug with misdeclaration of blend intrinsic. This was preventing the "convert an all-on blend to one of the operand values" optimization from kicking on in AVX.	2011-09-12 06:42:38 -07:00
Matt Pharr	cf23cf9ef4	Fix typo in user guide. Issue #96	2011-09-12 05:24:32 -07:00
Matt Pharr	1147b53dcd	Add #define with target vector width in emitted headers	2011-09-09 09:33:56 -07:00
Matt Pharr	4cf831a651	When --fast-math is enabled, tell LLVM about it, too.	2011-09-09 09:32:59 -07:00
Matt Pharr	785d8a29d3	Run mem2reg pass even when doing -O0 compiles	2011-09-09 09:24:43 -07:00
Matt Pharr	46d2bad231	Fix malformed program crash	2011-09-09 09:24:43 -07:00
Matt Pharr	32da8e11b4	Fix crash with varying global vector types when emitting header file.	2011-09-09 09:16:59 -07:00
Matt Pharr	5dedb6f836	Add --scale command line argument to mandelbrot and rt examples. This applies a floating-point scale factor to the image resolution; it's useful for experiments with many-core systems where the base image resolution may not give enough work for good load-balancing with tasks.	2011-09-07 20:07:51 -07:00
Matt Pharr	2ea6d249d5	Fix mapping to 8, 16 program instances in AO bench example. With this, we now compute a correct image with AVX.	2011-09-07 11:34:24 -07:00
Matt Pharr	c86128e8ee	AVX: go back to using blend (vs. masked store) when possible. All of the masked store calls were inhibiting putting values into registers, which in turn led to a lot of unnecessary stack traffic. This approach seems to give better code in the end.	2011-09-07 11:26:49 -07:00
Matt Pharr	375f1cb8e8	Make octaves and octaves loop uniform in noise example	2011-09-07 10:34:23 -07:00
Matt Pharr	3ca7b6b078	Remove MCJIT stuff from ispc_test (fix Linux build)	2011-09-07 09:44:27 -07:00
Matt Pharr	effe901890	Add task-parallel version of aobench	2011-09-07 05:43:21 -07:00
Matt Pharr	4f451bd041	More AVX fixes Fix RNG state initialization for 16-wide targets Fix a number of bugs in reduce_add builtin implementations for AVX. Fix some tests that had incorrect expected results for the 16-wide case.	2011-09-06 15:53:11 -07:00
Matt Pharr	c76ef7b174	Add command-line option to specify position-independent codegen	2011-09-06 11:12:43 -07:00
Matt Pharr	743d82e935	Various documentation updates.	2011-09-06 09:51:02 -07:00
Matt Pharr	18546e9c6d	Add option to disable optimizations to test running script	2011-09-04 18:09:00 -07:00