52 Commits

Author SHA1 Message Date
Matt Pharr
f1b8e5b1bf Release notes and doxygen bump for 1.0.9 release 2011-09-26 16:21:32 -07:00
Matt Pharr
e7a70b05af Fix statically-linked tests on Linux 2011-09-26 16:11:45 -07:00
Matt Pharr
cf73286938 More small Windows build fixes. Also switch to LLVM 3.0 libs 2011-09-26 16:07:23 -07:00
Matt Pharr
e6f80c0adc Remove stale include of MCJIT.h 2011-09-26 16:04:52 -07:00
Matt Pharr
5e31d7b6d0 Windows build: use LLVM_INSTALL_DIR to find clang.exe 2011-09-26 16:04:50 -07:00
Matt Pharr
649f2ad7b7 Update parser to make 'sync' a statement, not an expr. 2011-09-23 20:33:24 -07:00
Matt Pharr
fade1cdf1d Pretty much all conversions to varying double are slow, so don't bother warning about them. 2011-09-23 16:03:35 -07:00
Matt Pharr
d261105a86 Error/warning reporting improvements.
- Don't suggest matches when given an empty string or a single, non-alpha
  character.
- Also fixed the parser to be a bit less confusing when it encounters an
  unexpected EOF.
2011-09-23 15:51:23 -07:00
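A minimal sketch of the kind of guard the first fix implies (the helper
name and exact logic here are hypothetical, not ispc's actual code):

    #include <cctype>
    #include <string>

    // Only offer "did you mean ...?" suggestions when the unknown
    // identifier is substantial enough to match meaningfully.
    static bool shouldSuggestMatches(const std::string &ident) {
        if (ident.empty())
            return false;
        if (ident.size() == 1 && !isalpha((unsigned char)ident[0]))
            return false;
        return true;
    }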
Matt Pharr
b3d3e8987b Provide a properly initialized TextDiagnosticPrinter to clang's preprocessor.
Fixes issue #100 (crash when the preprocessor was trying to emit a diagnostic
about a mismatched #if/#endif).
2011-09-23 15:50:18 -07:00
Matt Pharr
4e91f3777a Fix BinaryExpr to handle reference-typed operands.
Fixes issue #101.
2011-09-23 15:19:14 -07:00
Matt Pharr
5584240c7f Fix crash with function declarations with unnamed parameters.
Fixes issue #103.
Previously, we were inadvertently grabbing the function's return type
  for the parameter, rather than the actual parameter type.
2011-09-23 15:05:59 -07:00
Matt Pharr
7126a39092 Disable PIC on Windows 2011-09-19 15:32:43 -07:00
Matt Pharr
8ad28a3f6f update doxygen, release notes for 1.0.8 release 2011-09-19 15:22:25 -07:00
Matt Pharr
9921b8e530 Predicated 'if' statement performance improvements.
Go back to running both sides of 'if' statements with masking and without
branching if we can determine that the code is relatively simple (as per
the simple cost model), and is safe to run even if the mask is 'all off'.
This gives a bit of a performance improvement for some of the examples
(most notably, the ray tracer), and is the code that one wants generated
in this case anyhow.
2011-09-19 09:54:09 -07:00
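A rough illustration of what "running both sides with masking and without
branching" means at the instruction level (a hand-written SSE sketch, not
ispc's actual output; each 32-bit mask lane is assumed to be all-ones or
all-zeros):

    #include <emmintrin.h>

    // Evaluate both branches unconditionally, then select per lane with
    // the mask -- no branch, which is the cheaper shape for simple code.
    static __m128i predicatedSelect(__m128i mask, __m128i thenVal,
                                    __m128i elseVal) {
        __m128i t = _mm_and_si128(mask, thenVal);     // lanes where the condition held
        __m128i e = _mm_andnot_si128(mask, elseVal);  // lanes where it did not
        return _mm_or_si128(t, e);
    }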
Matt Pharr
9052d4b10b Linux build fixes 2011-09-17 13:42:46 -07:00
Matt Pharr
2405dae8e6 Use malloc() to get space for task arguments when compiling to AVX.
This is to work around the LLVM bug/limitation discussed in LLVM bug
10841 (http://llvm.org/bugs/show_bug.cgi?id=10841).
2011-09-17 13:38:51 -07:00
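The runtime side of this (see the tasks_gcd.cpp hunk at the end of these
diffs) over-allocates with plain malloc() and aligns by hand; a
self-contained sketch of that pattern, including the bookkeeping needed to
free the block later:

    #include <cstdint>
    #include <cstdlib>

    // Allocate size bytes aligned to alignment (a power of two), stashing
    // the pointer malloc() returned just below the aligned address.
    static void *alignedMalloc(size_t size, size_t alignment) {
        void *mem = malloc(size + alignment + sizeof(void *));
        if (mem == NULL)
            return NULL;
        uintptr_t raw = (uintptr_t)mem + sizeof(void *);
        uintptr_t aligned = (raw + alignment - 1) & ~(uintptr_t)(alignment - 1);
        ((void **)aligned)[-1] = mem;   // remember the original pointer
        return (void *)aligned;
    }

    static void alignedFree(void *ptr) {
        if (ptr != NULL)
            free(((void **)ptr)[-1]);
    }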
Matt Pharr
3607f3e045 Remove support for building with LLVM 2.8. Fixes issue #66.
Both 2.9 and top-of-tree generate substantially better code than
LLVM 2.8 did, so it's not worth fixing the 2.8 build.
2011-09-17 13:18:59 -07:00
Matt Pharr
de84acfa5d On OSX with LLVM 2.9, always generate position-independent code.
Fixes Issue #99.
2011-09-17 13:03:51 -07:00
Matt Pharr
a501ab1aa6 Fix parenthesization bugs in cost estimates.
Also added the debugging print that helped find these issues.
Revert inlining some functions in examples
2011-09-16 19:07:07 -07:00
Matt Pharr
cdc850f98c Inline some functions in examples 2011-09-16 17:02:21 -07:00
Matt Pharr
ca87579f23 Add a very simple cost model to estimate runtime cost of running code.
This is currently only used to decide whether it's worth doing an
"are all lanes running" check at the start of functions--for small
functions, it's not worth the overhead.

The cost is estimated relatively early in compilation (e.g. before
we know if an array access is a scatter/gather or not, before
constant folding, etc.), so there are many known shortcomings.
2011-09-16 15:09:17 -07:00
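In spirit, such a cost model is a recursive walk that sums rough per-node
costs and compares the total against a threshold; a hypothetical sketch
(node layout, costs, and threshold all invented for illustration):

    #include <vector>

    struct ASTNode {
        int approxCost;                   // rough guess for this node alone
        std::vector<ASTNode *> children;
    };

    static int estimateCost(const ASTNode *node) {
        int cost = node->approxCost;
        for (const ASTNode *child : node->children)
            cost += estimateCost(child);
        return cost;
    }

    // Only emit the "are all lanes running?" prologue when the function
    // body is expensive enough for the early-out to pay for itself.
    static bool worthAllOnCheck(const ASTNode *body, int threshold) {
        return estimateCost(body) > threshold;
    }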
Matt Pharr
38fc13d1ab Remove now unused function. 2011-09-16 14:21:13 -07:00
Matt Pharr
cf9d9f717e Logic simplification to 'mixed true/false' case for coherent ifs.
Use the approach from 173632f446 here as
well.
2011-09-16 14:10:55 -07:00
Matt Pharr
173632f446 Generate more efficient code for regular varying 'if' statements.
For the case where we have a regular (i.e. non-'cif') 'if' statement,
the generated code simply checks whether any program instance
is running before running the corresponding statements.  This is a
lighter-weight check than IfStmt::emitMaskMixed() was performing.
2011-09-16 12:03:42 -07:00
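The lighter-weight test amounts to a single movmsk and a scalar compare
against zero (an SSE sketch; mask lanes again all-ones or all-zeros):

    #include <xmmintrin.h>

    // True if any program instance's lane is on.
    static bool anyLaneRunning(__m128 mask) {
        return _mm_movemask_ps(mask) != 0;
    }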
Matt Pharr
1dedd88132 Improve implementation of 'are both masks equal' check for AVX.
Previously, we did a vector equal compare and then a movmsk, the
result of which we checked to see if it was on for all lanes.
Because masks are vectors of i32s, under AVX, the vector equal
compare required two 4-wide SSE compares and some shuffling.
Now, we do a movmsk of both masks first and then a scalar
equality comparison of those two values, which seems to generate
overall better code.
2011-09-15 06:25:02 -07:00
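An intrinsics-level sketch of the new scheme for an 8-wide AVX mask (each
lane is all-ones or all-zeros, so its sign bit carries the lane's value):

    #include <immintrin.h>

    // One bit per lane from each mask, then a scalar compare -- instead
    // of two 4-wide SSE compares plus shuffles followed by a movmsk.
    static bool masksAllEqual(__m256 m1, __m256 m2) {
        int mm1 = _mm256_movemask_ps(m1);
        int mm2 = _mm256_movemask_ps(m2);
        return mm1 == mm2;
    }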
Matt Pharr
0848c2cc19 Actually make all 'if' statements check for 'all off' mask.
Contrary to claims in 0c2048385, that checkin didn't include the changes
to not run if/else blocks if none of the program instances wanted to be
running them.  This checkin fixes that and thus actually fixes issue #74.
2011-09-13 19:48:04 -07:00
Matt Pharr
e2a88d491f Mark the internal __fast_masked_vload function as static 2011-09-13 15:43:48 -07:00
Matt Pharr
30f9dcd4f5 Unroll loops by default, add --opt=disable-loop-unroll to disable.
Issue #78.
2011-09-13 15:37:18 -07:00
Matt Pharr
0c344b6755 Fix Linux build of mandelbrot_tasks example 2011-09-13 15:17:30 -07:00
Matt Pharr
6734021520 Issue warning when compile-time constant out-of-bounds array index is used.
Issue #98.
Also fixes two examples that had bugs of this type that this warning
  uncovered!
2011-09-13 14:42:20 -07:00
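The kind of bug this catches, in a C++ analogue where both the index and
the array bound are compile-time constants:

    static float example() {
        float a[4] = { 0, 1, 2, 3 };
        return a[4];   // one past the end; detectable statically
    }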
Matt Pharr
dd153d3c5c Handle more instruction types when flattening offset vectors.
Generalize the lScalarizeVector() utility routine (used in determining
when we can change gathers/scatters into vector loads/stores, respectively)
to handle vector shuffles and vector loads.  This fixes issue #79, which
provided a case where a gather was being performed even though a vector
load was possible.
2011-09-13 09:43:56 -07:00
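The payoff, sketched with intrinsics (assuming float data on a 4-wide
target): once the flattened per-lane offsets are known to be consecutive,
the gather collapses to a single unaligned vector load.

    #include <xmmintrin.h>

    // Lanes want data[i+0] .. data[i+3]: one movups instead of four
    // scalar loads plus inserts.
    static __m128 loadContiguous(const float *data, int i) {
        return _mm_loadu_ps(data + i);
    }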
Matt Pharr
9ca7541d52 Remove check for any program instances running before function calls.
Given the change in 0c20483853, this is no longer necessary, since
we know that one instance will always be running if we're executing a
given block of code.
2011-09-13 06:26:16 -07:00
Matt Pharr
0c20483853 Make all "if" statements "coherent" ifs. Workaround for issue #74.
Using blend to do masked stores is unsafe if all of the lanes are off:
it may read from or write to invalid memory.  For now, this workaround
transforms all 'if' statements into coherent 'if's, ensuring that an
instruction only runs if at least one program instance wants to be running
it.

One nice thing about this change is that a number of implementations of
various builtins can be simplified, since they no longer need to confirm
that at least one program instance is running.

It might be nice to re-enable regular if statements in a future checkin,
but we'd want to make sure they don't have any masked loads or blended
masked stores in their statement lists.  There isn't a performance
impact for any of the examples with this change, so it's unclear if
this is important.

Note that this only impacts 'if' statements with a varying condition.
2011-09-12 16:25:08 -07:00
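Why a blend-based masked store is unsafe under an all-off mask: it is a
read-modify-write of the whole vector, so the address is touched no matter
what the mask says. A minimal SSE sketch of the pattern (mirroring the
blend implementations in the builtins diffs below):

    #include <emmintrin.h>

    // Note the unconditional load and store -- even if no lane is on,
    // *ptr must be valid memory.
    static void maskedStoreBlend32(__m128i *ptr, __m128i newVal, __m128i mask) {
        __m128i old = _mm_load_si128(ptr);                    // unconditional read
        __m128i blended = _mm_or_si128(_mm_and_si128(mask, newVal),
                                       _mm_andnot_si128(mask, old));
        _mm_store_si128(ptr, blended);                        // unconditional write
    }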
Matt Pharr
9d4ff1bc06 Fix alignment in usage message 2011-09-12 15:06:41 -07:00
Matt Pharr
83f22f1939 Add experimental --fast-masked-vload flag for SSE. 2011-09-12 12:29:33 -07:00
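Context for the flag (visible in the load_masked hunk in the stdlib diff
below): by default a masked load only becomes a full vector load when the
first and last lanes of the mask are on, since any fault the vector load
could take would already be taken by those end lanes; the experimental
flag skips that conservatism and always issues the vector load. A scalar
sketch of the default test:

    // movmskBits has one bit per lane (as produced by movmsk); width is
    // the vector width of the target.
    static bool endLanesOn(int movmskBits, int width) {
        int endLanes = 1 | (1 << (width - 1));
        return (movmskBits & endLanes) == endLanes;
    }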
Matt Pharr
6375ed9224 AVX: Fix bug with misdeclaration of blend intrinsic.
This was preventing the "convert an all-on blend to one of the
  operand values" optimization from kicking on in AVX.
2011-09-12 06:42:38 -07:00
Matt Pharr
cf23cf9ef4 Fix typo in user guide. Issue #96 2011-09-12 05:24:32 -07:00
Matt Pharr
1147b53dcd Add #define with target vector width in emitted headers 2011-09-09 09:33:56 -07:00
Matt Pharr
4cf831a651 When --fast-math is enabled, tell LLVM about it, too. 2011-09-09 09:32:59 -07:00
Matt Pharr
785d8a29d3 Run mem2reg pass even when doing -O0 compiles 2011-09-09 09:24:43 -07:00
Matt Pharr
46d2bad231 Fix malformed program crash 2011-09-09 09:24:43 -07:00
Matt Pharr
32da8e11b4 Fix crash with varying global vector types when emitting header file. 2011-09-09 09:16:59 -07:00
Matt Pharr
5dedb6f836 Add --scale command line argument to mandelbrot and rt examples.
This applies a floating-point scale factor to the image resolution;
it's useful for experiments with many-core systems where the 
base image resolution may not give enough work for good load-balancing
with tasks.
2011-09-07 20:07:51 -07:00
Matt Pharr
2ea6d249d5 Fix mapping to 8, 16 program instances in AO bench example.
With this, we now compute a correct image with AVX.
2011-09-07 11:34:24 -07:00
Matt Pharr
c86128e8ee AVX: go back to using blend (vs. masked store) when possible.
All of the masked store calls were inhibiting putting values into
registers, which in turn led to a lot of unnecessary stack traffic.
This approach seems to give better code in the end.
2011-09-07 11:26:49 -07:00
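The contrast in intrinsics terms (illustrative): a blend is a
register-to-register select that the optimizer can keep entirely in ymm
registers, while a true masked store forces a round trip through memory.

    #include <immintrin.h>

    static __m256 selectInRegisters(__m256 oldV, __m256 newV, __m256 mask) {
        return _mm256_blendv_ps(oldV, newV, mask);   // no memory access
    }

    static void storeThroughMemory(float *ptr, __m256i mask, __m256 newV) {
        _mm256_maskstore_ps(ptr, mask, newV);        // always touches memory
    }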
Matt Pharr
375f1cb8e8 Make octaves and octaves loop uniform in noise example 2011-09-07 10:34:23 -07:00
Matt Pharr
3ca7b6b078 Remove MCJIT stuff from ispc_test (fix Linux build) 2011-09-07 09:44:27 -07:00
Matt Pharr
effe901890 Add task-parallel version of aobench 2011-09-07 05:43:21 -07:00
Matt Pharr
4f451bd041 More AVX fixes
Fix RNG state initialization for 16-wide targets
Fix a number of bugs in reduce_add builtin implementations for AVX.
Fix some tests that had incorrect expected results for the 16-wide
  case.
2011-09-06 15:53:11 -07:00
Matt Pharr
c76ef7b174 Add command-line option to specify position-independent codegen 2011-09-06 11:12:43 -07:00
Matt Pharr
743d82e935 Various documentation updates. 2011-09-06 09:51:02 -07:00
Matt Pharr
18546e9c6d Add option to disable optimizations to test running script 2011-09-04 18:09:00 -07:00
50 changed files with 1675 additions and 698 deletions

View File

@@ -15,8 +15,8 @@ code.
ispc is an open source compiler under the BSD license; see the file
LICENSE.txt. ispc supports Windows, Mac, and Linux, with both x86 and
x86-64 targets. It currently supports the SSE2 and SSE4 instruction sets,
though support for AVX should be available soon.
x86-64 targets. It currently supports the SSE2, SSE4, and AVX instruction
sets.
For more information and examples, as well as a wiki and the bug database,
see the ispc distribution site, http://ispc.github.com.

View File

@@ -4,6 +4,8 @@ import sys
import string
import re
import subprocess
import platform
import os
length=0
@@ -14,8 +16,12 @@ target = re.sub("\.ll$", "", target)
target = re.sub("\.c$", "", target)
target = re.sub("-", "_", target)
llvm_as="llvm-as"
if platform.system() == 'Windows' or string.find(platform.system(), "CYGWIN_NT") != -1:
llvm_as = os.getenv("LLVM_INSTALL_DIR").replace("\\", "/") + "/bin/" + llvm_as
try:
as_out=subprocess.Popen([ "llvm-as", "-", "-o", "-"], stdout=subprocess.PIPE)
as_out=subprocess.Popen([llvm_as, "-", "-o", "-"], stdout=subprocess.PIPE)
except IOError:
print >> sys.stderr, "Couldn't open " + src
sys.exit(1)

View File

@@ -232,8 +232,8 @@ define internal float @__reduce_add_float(<16 x float>) nounwind readonly always
%v1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %va, <8 x float> %vb)
%v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1)
%v3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v2, <8 x float> %v2)
%scalar1 = extractelement <8 x float> %v2, i32 0
%scalar2 = extractelement <8 x float> %v2, i32 4
%scalar1 = extractelement <8 x float> %v3, i32 0
%scalar2 = extractelement <8 x float> %v3, i32 4
%sum = fadd float %scalar1, %scalar2
ret float %sum
}
@@ -316,7 +316,9 @@ define internal double @__reduce_add_double(<16 x double>) nounwind readonly alw
%sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %vab, <4 x double> %vcd)
%sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
%sum = extractelement <4 x double> %sum1, i32 0
%final0 = extractelement <4 x double> %sum1, i32 0
%final1 = extractelement <4 x double> %sum1, i32 2
%sum = fadd double %final0, %final1
ret double %sum
}
@@ -521,35 +523,104 @@ define void @__masked_store_64(<16 x i64>* nocapture, <16 x i64>,
}
;; FIXME: various code elsewhere in the builtins implementations makes
;; calls to the 32/64 bit versions of these, basically assuming that doing
;; so is faster than doing a full call to an actual masked store, which
;; isn't likely to be the case on AVX. So here we provide those functions
;; but then don't actually do what the caller asked for...
masked_store_blend_8_16_by_16()
declare void @llvm.trap()
define void @__masked_store_blend_8(<8 x i8>* nocapture, <8 x i8>,
<8 x i32>) nounwind alwaysinline {
call void @llvm.trap()
ret void
}
define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>,
<8 x i32>) nounwind alwaysinline {
call void @llvm.trap()
ret void
}
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
<8 x float>) nounwind readnone
define void @__masked_store_blend_32(<16 x i32>* nocapture, <16 x i32>,
<16 x i32>) nounwind alwaysinline {
call void @__masked_store_32(<16 x i32> * %0, <16 x i32> %1, <16 x i32> %2)
%maskAsFloat = bitcast <16 x i32> %2 to <16 x float>
%oldValue = load <16 x i32>* %0, align 4
%oldAsFloat = bitcast <16 x i32> %oldValue to <16 x float>
%newAsFloat = bitcast <16 x i32> %1 to <16 x float>
%old0 = shufflevector <16 x float> %oldAsFloat, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%old1 = shufflevector <16 x float> %oldAsFloat, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%new0 = shufflevector <16 x float> %newAsFloat, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%new1 = shufflevector <16 x float> %newAsFloat, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%mask0 = shufflevector <16 x float> %maskAsFloat, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%mask1 = shufflevector <16 x float> %maskAsFloat, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%blend0 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old0,
<8 x float> %new0,
<8 x float> %mask0)
%blend1 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old1,
<8 x float> %new1,
<8 x float> %mask1)
%blend = shufflevector <8 x float> %blend0, <8 x float> %blend1,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%blendAsInt = bitcast <16 x float> %blend to <16 x i32>
store <16 x i32> %blendAsInt, <16 x i32>* %0, align 4
ret void
}
define void @__masked_store_blend_64(<16 x i64>* nocapture, <16 x i64>,
<16 x i32>) nounwind alwaysinline {
call void @__masked_store_64(<16 x i64> * %0, <16 x i64> %1, <16 x i32> %2)
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
<4 x double>) nounwind readnone
define void @__masked_store_blend_64(<16 x i64>* nocapture %ptr, <16 x i64> %newi64,
<16 x i32> %mask) nounwind alwaysinline {
%oldValue = load <16 x i64>* %ptr, align 8
%old = bitcast <16 x i64> %oldValue to <16 x double>
%old0d = shufflevector <16 x double> %old, <16 x double> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%old1d = shufflevector <16 x double> %old, <16 x double> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%old2d = shufflevector <16 x double> %old, <16 x double> undef,
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
%old3d = shufflevector <16 x double> %old, <16 x double> undef,
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
%new = bitcast <16 x i64> %newi64 to <16 x double>
%new0d = shufflevector <16 x double> %new, <16 x double> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%new1d = shufflevector <16 x double> %new, <16 x double> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%new2d = shufflevector <16 x double> %new, <16 x double> undef,
<4 x i32> <i32 8, i32 9, i32 10, i32 11>
%new3d = shufflevector <16 x double> %new, <16 x double> undef,
<4 x i32> <i32 12, i32 13, i32 14, i32 15>
%mask0 = shufflevector <16 x i32> %mask, <16 x i32> undef,
<8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
%mask1 = shufflevector <16 x i32> %mask, <16 x i32> undef,
<8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
%mask2 = shufflevector <16 x i32> %mask, <16 x i32> undef,
<8 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11>
%mask3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
<8 x i32> <i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15>
%mask0d = bitcast <8 x i32> %mask0 to <4 x double>
%mask1d = bitcast <8 x i32> %mask1 to <4 x double>
%mask2d = bitcast <8 x i32> %mask2 to <4 x double>
%mask3d = bitcast <8 x i32> %mask3 to <4 x double>
%result0d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old0d,
<4 x double> %new0d, <4 x double> %mask0d)
%result1d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old1d,
<4 x double> %new1d, <4 x double> %mask1d)
%result2d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old2d,
<4 x double> %new2d, <4 x double> %mask2d)
%result3d = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %old3d,
<4 x double> %new3d, <4 x double> %mask3d)
%result01 = shufflevector <4 x double> %result0d, <4 x double> %result1d,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%result23 = shufflevector <4 x double> %result2d, <4 x double> %result3d,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%result = shufflevector <8 x double> %result01, <8 x double> %result23,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%result64 = bitcast <16 x double> %result to <16 x i64>
store <16 x i64> %result64, <16 x i64> * %ptr
ret void
}

View File

@@ -294,10 +294,12 @@ define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwa
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v1 = shufflevector <8 x double> %0, <8 x double> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%sum01 = fadd <4 x double> %v0, %v1
%red0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum01, <4 x double> %sum01)
%red1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %red0, <4 x double> %red0)
%sum = extractelement <4 x double> %red1, i32 0
%sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
%sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
%final0 = extractelement <4 x double> %sum1, i32 0
%final1 = extractelement <4 x double> %sum1, i32 2
%sum = fadd double %final0, %final1
ret double %sum
}
@@ -448,38 +450,74 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
}
;; FIXME: various code elsewhere in the builtins implementations makes
;; calls to the 32/64 bit versions of these, basically assuming that doing
;; so is faster than doing a full call to an actual masked store, which
;; isn't likely to be the case on AVX. So here we provide those functions
;; but then don't actually do what the caller asked for...
declare void @llvm.trap()
define void @__masked_store_blend_8(<8 x i8>* nocapture, <8 x i8>,
<8 x i32>) nounwind alwaysinline {
call void @llvm.trap()
ret void
}
define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>,
<8 x i32>) nounwind alwaysinline {
call void @llvm.trap()
ret void
}
masked_store_blend_8_16_by_8()
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
<8 x float>) nounwind readnone
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32>) nounwind alwaysinline {
call void @__masked_store_32(<8 x i32> * %0, <8 x i32> %1, <8 x i32> %2)
%mask_as_float = bitcast <8 x i32> %2 to <8 x float>
%oldValue = load <8 x i32>* %0, align 4
%oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
%newAsFloat = bitcast <8 x i32> %1 to <8 x float>
%blend = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %oldAsFloat,
<8 x float> %newAsFloat,
<8 x float> %mask_as_float)
%blendAsInt = bitcast <8 x float> %blend to <8 x i32>
store <8 x i32> %blendAsInt, <8 x i32>* %0, align 4
ret void
}
define void @__masked_store_blend_64(<8 x i64>* nocapture, <8 x i64>,
<8 x i32>) nounwind alwaysinline {
call void @__masked_store_64(<8 x i64> * %0, <8 x i64> %1, <8 x i32> %2)
define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
<8 x i32> %i32mask) nounwind alwaysinline {
%oldValue = load <8 x i64>* %ptr, align 8
%mask = bitcast <8 x i32> %i32mask to <8 x float>
; Do 4x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values
; are actually bitcast <4 x i64> values
;
; set up the first four 64-bit values
%old01 = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%old01f = bitcast <4 x i64> %old01 to <8 x float>
%new01 = shufflevector <8 x i64> %new, <8 x i64> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%new01f = bitcast <4 x i64> %new01 to <8 x float>
; compute mask--note that the indices are all doubled-up
%mask01 = shufflevector <8 x float> %mask, <8 x float> undef,
<8 x i32> <i32 0, i32 0, i32 1, i32 1,
i32 2, i32 2, i32 3, i32 3>
; and blend them
%result01f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old01f,
<8 x float> %new01f,
<8 x float> %mask01)
%result01 = bitcast <8 x float> %result01f to <4 x i64>
; and again
%old23 = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%old23f = bitcast <4 x i64> %old23 to <8 x float>
%new23 = shufflevector <8 x i64> %new, <8 x i64> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%new23f = bitcast <4 x i64> %new23 to <8 x float>
; compute mask--note that the values are doubled-up...
%mask23 = shufflevector <8 x float> %mask, <8 x float> undef,
<8 x i32> <i32 4, i32 4, i32 5, i32 5,
i32 6, i32 6, i32 7, i32 7>
; and blend them
%result23f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old23f,
<8 x float> %new23f,
<8 x float> %mask23)
%result23 = bitcast <8 x float> %result23f to <4 x i64>
; reconstruct the final <8 x i64> vector
%final = shufflevector <4 x i64> %result01, <4 x i64> %result23,
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7>
store <8 x i64> %final, <8 x i64> * %ptr, align 8
ret void
}

View File

@@ -389,6 +389,27 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
}
static void
lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
SymbolTable *symbolTable) {
std::vector<const Type *> args;
FunctionType *ft = new FunctionType(AtomicType::UniformInt32, args, SourcePos());
Symbol *sym = new Symbol(name, SourcePos(), ft);
sym->isStatic = true;
llvm::Function *func = module->getFunction(name);
assert(func != NULL); // it should be declared already...
func->addFnAttr(llvm::Attribute::AlwaysInline);
llvm::BasicBlock *bblock = llvm::BasicBlock::Create(*g->ctx, "entry", func, 0);
llvm::ReturnInst::Create(*g->ctx, LLVMInt32(val), bblock);
sym->function = func;
symbolTable->AddVariable(sym);
}
static void
lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
Symbol *pidx = new Symbol("programIndex", SourcePos(),
@@ -492,6 +513,8 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
symbolTable);
lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module,
symbolTable);
lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
symbolTable);
if (includeStdlibISPC) {
// If the user wants the standard library to be included, parse the

View File

@@ -851,6 +851,8 @@ define internal void @__prefetch_read_nt_$1($2 *) alwaysinline {
define(`stdlib_core', `
declare i32 @__fast_masked_vload()
declare i8* @ISPCMalloc(i64, i32) nounwind
declare i8* @ISPCFree(i8*) nounwind
declare void @ISPCLaunch(i8*, i8*) nounwind
@@ -1344,12 +1346,6 @@ i64minmax($1,max,uint64,ugt)
define(`load_and_broadcast', `
define <$1 x $2> @__load_and_broadcast_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
; must not load if the mask is all off; the address may be invalid
%mm = call i32 @__movmsk(<$1 x i32> %mask)
%any_on = icmp ne i32 %mm, 0
br i1 %any_on, label %load, label %skip
load:
%ptr = bitcast i8 * %0 to $2 *
%val = load $2 * %ptr
@@ -1357,9 +1353,6 @@ load:
forloop(i, 1, eval($1-1), `
%ret`'i = insertelement <$1 x $2> %ret`'eval(i-1), $2 %val, i32 i')
ret <$1 x $2> %ret`'eval($1-1)
skip:
ret <$1 x $2> undef
}
')
@@ -1375,14 +1368,20 @@ define(`load_masked', `
define <$1 x $2> @__load_masked_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
entry:
%mm = call i32 @__movmsk(<$1 x i32> %mask)
; if the first lane and the last lane are on, then it is safe to do a vector load
; of the whole thing--what the lanes in the middle want turns out to not matter...
%mm_and = and i32 %mm, eval(1 | (1<<($1-1)))
%can_vload = icmp eq i32 %mm_and, eval(1 | (1<<($1-1)))
%fast32 = call i32 @__fast_masked_vload()
%fast_i1 = trunc i32 %fast32 to i1
%can_vload_maybe_fast = or i1 %fast_i1, %can_vload
; if we are not able to do a single vload, we will accumulate lanes in this memory...
%retptr = alloca <$1 x $2>
%retptr32 = bitcast <$1 x $2> * %retptr to $2 *
br i1 %can_vload, label %load, label %loop
br i1 %can_vload_maybe_fast, label %load, label %loop
load:
%ptr = bitcast i8 * %0 to <$1 x $2> *
@@ -1517,6 +1516,46 @@ define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>,
')
define(`masked_store_blend_8_16_by_16', `
define void @__masked_store_blend_8(<16 x i8>* nocapture, <16 x i8>,
<16 x i32>) nounwind alwaysinline {
%old = load <16 x i8> * %0
%old128 = bitcast <16 x i8> %old to i128
%new128 = bitcast <16 x i8> %1 to i128
%mask8 = trunc <16 x i32> %2 to <16 x i8>
%mask128 = bitcast <16 x i8> %mask8 to i128
%notmask128 = xor i128 %mask128, -1
%newmasked = and i128 %new128, %mask128
%oldmasked = and i128 %old128, %notmask128
%result = or i128 %newmasked, %oldmasked
%resultvec = bitcast i128 %result to <16 x i8>
store <16 x i8> %resultvec, <16 x i8> * %0
ret void
}
define void @__masked_store_blend_16(<16 x i16>* nocapture, <16 x i16>,
<16 x i32>) nounwind alwaysinline {
%old = load <16 x i16> * %0
%old256 = bitcast <16 x i16> %old to i256
%new256 = bitcast <16 x i16> %1 to i256
%mask16 = trunc <16 x i32> %2 to <16 x i16>
%mask256 = bitcast <16 x i16> %mask16 to i256
%notmask256 = xor i256 %mask256, -1
%newmasked = and i256 %new256, %mask256
%oldmasked = and i256 %old256, %notmask256
%result = or i256 %newmasked, %oldmasked
%resultvec = bitcast i256 %result to <16 x i16>
store <16 x i16> %resultvec, <16 x i16> * %0
ret void
}
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; packed load and store functions
;;
@@ -1544,7 +1583,7 @@ entry:
known_mask:
%allon = icmp eq i32 %mask, eval((1 << $1) -1)
br i1 %allon, label %all_on, label %not_all_on
br i1 %allon, label %all_on, label %unknown_mask
all_on:
;; everyone wants to load, so just load an entire vector width in a single
@@ -1554,14 +1593,6 @@ all_on:
store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr, align 4
ret i32 $1
not_all_on:
%alloff = icmp eq i32 %mask, 0
br i1 %alloff, label %all_off, label %unknown_mask
all_off:
;; no one wants to load
ret i32 0
unknown_mask:
br label %loop
@@ -1608,20 +1639,13 @@ entry:
known_mask:
%allon = icmp eq i32 %mask, eval((1 << $1) -1)
br i1 %allon, label %all_on, label %not_all_on
br i1 %allon, label %all_on, label %unknown_mask
all_on:
%vecptr = bitcast i32 *%startptr to <$1 x i32> *
store <$1 x i32> %vals, <$1 x i32> * %vecptr, align 4
ret i32 $1
not_all_on:
%alloff = icmp eq i32 %mask, 0
br i1 %alloff, label %all_off, label %unknown_mask
all_off:
ret i32 0
unknown_mask:
br label %loop
@@ -1671,14 +1695,6 @@ entry:
br i1 %allon, label %check_neighbors, label %domixed
domixed:
; the mask is mixed on/off. First see if the lanes are all off
%alloff = icmp eq i32 %mm, 0
br i1 %alloff, label %doalloff, label %actuallymixed
doalloff:
ret i1 false ;; this seems safest
actuallymixed:
; First, figure out which lane is the first active one
%first = call i32 @llvm.cttz.i32(i32 %mm)
%baseval = extractelement <$1 x $2> %v, i32 %first
@@ -1701,7 +1717,7 @@ actuallymixed:
br label %check_neighbors
check_neighbors:
%vec = phi <$1 x $2> [ %blendvec, %actuallymixed ], [ %v, %entry ]
%vec = phi <$1 x $2> [ %blendvec, %domixed ], [ %v, %entry ]
ifelse($6, `32', `
; For 32-bit elements, we rotate once and compare with the vector, which ends
; up comparing each element to its neighbor on the right. Then see if
@@ -1833,7 +1849,7 @@ pl_known_mask:
;; the mask is known at compile time; see if it is something we can
;; handle more efficiently
%pl_is_allon = icmp eq i32 %pl_mask, eval((1<<$1)-1)
br i1 %pl_is_allon, label %pl_all_on, label %pl_not_all_on
br i1 %pl_is_allon, label %pl_all_on, label %pl_unknown_mask
pl_all_on:
;; the mask is all on--just expand the code for each lane sequentially
@@ -1841,19 +1857,14 @@ pl_all_on:
`patsubst(`$3', `ID\|LANE', i)')
br label %pl_done
pl_not_all_on:
;; not all on--see if it is all off or mixed
;; for the mixed case, we just run the general case, though we could
pl_unknown_mask:
;; we just run the general case, though we could
;; try to be smart and just emit the code based on what it actually is,
;; for example by emitting the code straight-line without a loop and doing
;; the lane tests explicitly, leaving later optimization passes to eliminate
;; the stuff that is definitely not needed. Not clear if we will frequently
;; encounter a mask that is known at compile-time but is not either all on or
;; all off...
%pl_alloff = icmp eq i32 %pl_mask, 0
br i1 %pl_alloff, label %pl_done, label %pl_unknown_mask
pl_unknown_mask:
br label %pl_loop
pl_loop:
@@ -1909,20 +1920,6 @@ define internal <$1 x $2> @__gather_elt_$2(i8 * %ptr, <$1 x i32> %offsets, <$1 x
define <$1 x $2> @__gather_base_offsets_$2(i8 * %ptr, <$1 x i32> %offsets,
<$1 x i32> %vecmask) nounwind readonly alwaysinline {
entry:
%mask = call i32 @__movmsk(<$1 x i32> %vecmask)
%maskKnown = call i1 @__is_compile_time_constant_mask(<$1 x i32> %vecmask)
br i1 %maskKnown, label %known_mask, label %unknown_mask
known_mask:
%alloff = icmp eq i32 %mask, 0
br i1 %alloff, label %gather_all_off, label %unknown_mask
gather_all_off:
ret <$1 x $2> undef
unknown_mask:
; We can be clever and avoid the per-lane stuff for gathers if we are willing
; to require that the 0th element of the array being gathered from is always
; legal to read from (and we do indeed require that, given the benefits!)

ctx.cpp
View File

@@ -153,7 +153,6 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
StoreInst(llvm::Constant::getNullValue(ftype), returnValuePtr);
}
#ifndef LLVM_2_8
if (m->diBuilder) {
/* If debugging is enabled, tell the debug information emission
code about this new function */
@@ -174,7 +173,6 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
/* And start a scope representing the initial function scope */
StartScope();
}
#endif // LLVM_2_8
launchedTasks = false;
@@ -183,7 +181,6 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
assert(maskSymbol != NULL);
maskSymbol->storagePtr = maskPtr;
#ifndef LLVM_2_8
// add debugging info for __mask, programIndex, ...
if (m->diBuilder) {
maskSymbol->pos = funcStartPos;
@@ -208,15 +205,12 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio
true /* static */,
programCountSymbol->storagePtr);
}
#endif
}
FunctionEmitContext::~FunctionEmitContext() {
assert(controlFlowInfo.size() == 0);
#ifndef LLVM_2_8
assert(debugScopes.size() == (m->diBuilder ? 1 : 0));
#endif
}
@@ -704,6 +698,7 @@ FunctionEmitContext::LaneMask(llvm::Value *v) {
llvm::Value *
FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
#if 0
// Compare the two masks to get a vector of i1s
llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
v1, v2, "v1==v2");
@@ -711,6 +706,12 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
cmp = I1VecToBoolVec(cmp);
// And see if it's all on
return All(cmp);
#else
llvm::Value *mm1 = LaneMask(v1);
llvm::Value *mm2 = LaneMask(v2);
return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mm1, mm2,
"v1==v2");
#endif
}
@@ -850,7 +851,6 @@ FunctionEmitContext::GetDebugPos() const {
void
FunctionEmitContext::AddDebugPos(llvm::Value *value, const SourcePos *pos,
llvm::DIScope *scope) {
#ifndef LLVM_2_8
llvm::Instruction *inst = llvm::dyn_cast<llvm::Instruction>(value);
if (inst != NULL && m->diBuilder) {
SourcePos p = pos ? *pos : currentPos;
@@ -861,13 +861,11 @@ FunctionEmitContext::AddDebugPos(llvm::Value *value, const SourcePos *pos,
inst->setDebugLoc(llvm::DebugLoc::get(p.first_line, p.first_column,
scope ? *scope : GetDIScope()));
}
#endif
}
void
FunctionEmitContext::StartScope() {
#ifndef LLVM_2_8
if (m->diBuilder != NULL) {
llvm::DIScope parentScope;
if (debugScopes.size() > 0)
@@ -881,18 +879,15 @@ FunctionEmitContext::StartScope() {
currentPos.first_column);
debugScopes.push_back(lexicalBlock);
}
#endif
}
void
FunctionEmitContext::EndScope() {
#ifndef LLVM_2_8
if (m->diBuilder != NULL) {
assert(debugScopes.size() > 0);
debugScopes.pop_back();
}
#endif
}
@@ -905,7 +900,6 @@ FunctionEmitContext::GetDIScope() const {
void
FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) {
#ifndef LLVM_2_8
if (m->diBuilder == NULL)
return;
@@ -921,13 +915,11 @@ FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) {
llvm::Instruction *declareInst =
m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
AddDebugPos(declareInst, &sym->pos, &scope);
#endif
}
void
FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) {
#ifndef LLVM_2_8
if (m->diBuilder == NULL)
return;
@@ -943,7 +935,6 @@ FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) {
llvm::Instruction *declareInst =
m->diBuilder->insertDeclare(sym->storagePtr, var, bblock);
AddDebugPos(declareInst, &sym->pos, &scope);
#endif
}
@@ -1501,27 +1492,15 @@ FunctionEmitContext::gather(llvm::Value *lvalue, const Type *type,
void
FunctionEmitContext::addGSMetadata(llvm::Instruction *inst, SourcePos pos) {
llvm::Value *str = llvm::MDString::get(*g->ctx, pos.name);
#ifdef LLVM_2_8
llvm::MDNode *md = llvm::MDNode::get(*g->ctx, &str, 1);
#else
llvm::MDNode *md = llvm::MDNode::get(*g->ctx, str);
#endif
inst->setMetadata("filename", md);
llvm::Value *line = LLVMInt32(pos.first_line);
#ifdef LLVM_2_8
md = llvm::MDNode::get(*g->ctx, &line, 1);
#else
md = llvm::MDNode::get(*g->ctx, line);
#endif
inst->setMetadata("line", md);
llvm::Value *column = LLVMInt32(pos.first_column);
#ifdef LLVM_2_8
md = llvm::MDNode::get(*g->ctx, &column, 1);
#else
md = llvm::MDNode::get(*g->ctx, column);
#endif
inst->setMetadata("column", md);
}
@@ -1838,9 +1817,9 @@ llvm::PHINode *
FunctionEmitContext::PhiNode(LLVM_TYPE_CONST llvm::Type *type, int count,
const char *name) {
llvm::PHINode *pn = llvm::PHINode::Create(type,
#if !defined(LLVM_2_8) && !defined(LLVM_2_9)
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
count,
#endif // !LLVM_2_8 && !LLVM_2_9
#endif // LLVM_3_0
name ? name : "phi", bblock);
AddDebugPos(pn);
return pn;
@@ -1982,17 +1961,26 @@ FunctionEmitContext::LaunchInst(llvm::Function *callee,
assert(argStructType->getNumElements() == argVals.size() + 1);
int align = 4 * RoundUpPow2(g->target.nativeVectorWidth);
llvm::Value *argmem;
#ifdef ISPC_IS_WINDOWS
// Use malloc() to allocate storage on Windows, since the stack is
// generally not big enough there to do enough allocations for lots of
// tasks and then things crash horribly...
llvm::Value *argmem = EmitMalloc(argStructType, align);
argmem = EmitMalloc(argStructType, align);
#else
// Use alloca for space for the task args on OSX And Linux. KEY
// DETAIL: pass false to the call of FunctionEmitContext::AllocaInst so
// that the alloca doesn't happen just once at the top of the function,
// but happens each time the enclosing basic block executes.
llvm::Value *argmem = AllocaInst(argStructType, "argmem", align, false);
// Otherwise, use alloca for space for the task args, ** unless we're
// compiling to AVX, in which case we use malloc after all **. (See
// http://llvm.org/bugs/show_bug.cgi?id=10841 for details. There are
// limitations in LLVM with respect to dynamic allocas of this sort
// when the stack also has to be 32-byte aligned...).
if (g->target.isa == Target::AVX)
argmem = EmitMalloc(argStructType, align);
else
// KEY DETAIL: pass false to the call of
// FunctionEmitContext::AllocaInst so that the alloca doesn't
// happen just once at the top of the function, but happens each
// time the enclosing basic block executes.
argmem = AllocaInst(argStructType, "argmem", align, false);
#endif // ISPC_IS_WINDOWS
llvm::Value *voidmem = BitCastInst(argmem, LLVMTypes::VoidPointerType);

View File

@@ -237,7 +237,7 @@ Declarator::GetType(DeclSpecs *ds) const {
sprintf(buf, "__anon_parameter_%d", i);
sym = new Symbol(buf, pos);
Declarator *declarator = new Declarator(sym, sym->pos);
sym->type = declarator->GetType(ds);
sym->type = declarator->GetType(d->declSpecs);
d->declarators.push_back(declarator);
}
else {

View File

@@ -1,3 +1,42 @@
=== v1.0.9 === (26 September 2011)
The binary release of v1.0.9 is the first that supports AVX code
generation. Two targets are provided: "avx", which runs with a
programCount of 8, and "avx-x2" which runs 16 program instances
simultaneously. (This binary is also built using the in-progress LLVM 3.0
development libraries, while previous ones have been built with the
released 2.9 version of LLVM.)
This release has no other significant changes beyond a number of small
bugfixes (https://github.com/ispc/ispc/issues/100,
https://github.com/ispc/ispc/issues/101, and https://github.com/ispc/ispc/issues/103).
=== v1.0.8 === (19 September 2011)
A number of improvements have been made to handling of 'if' statements in
the language:
- A bug was fixed where invalid memory could be incorrectly accessed even
if none of the running program instances wanted to execute the
corresponding instructions (https://github.com/ispc/ispc/issues/74).
- The code generated for 'if' statements is a bit simpler and thus more
efficient.
There is now a '--pic' command-line argument that causes position-independent
code to be generated (Linux and OSX only).
A number of additional performance improvements:
- Loops are now unrolled by default; the --opt=disable-loop-unroll
command-line argument can be used to disable this behavior.
(https://github.com/ispc/ispc/issues/78)
- A few more cases are now handled where gathers/scatters can be determined
at compile time to actually access contiguous memory locations.
(https://github.com/ispc/ispc/issues/79)
Finally, warnings are now issued (if possible) when it can be determined
at compile-time that an out-of-bounds array index is being used.
(https://github.com/ispc/ispc/issues/98).
=== v1.0.7 === (3 September 2011)
The various atomic_*_global() standard library functions are generally

View File

@@ -33,6 +33,17 @@ The main goals behind ``ispc`` are to:
number of non-trivial workloads that aren't handled well by other
compilation approaches (e.g. loop auto-vectorization.)
**We are very interested in your feedback and comments about ispc and
in hearing your experiences using the system. We are especially interested
in hearing if you try using ispc but see results that are not as you
were expecting or hoping for.** We encourage you to send a note with your
experiences or comments to the `ispc-users`_ mailing list or to file bug or
feature requests with the ``ispc`` `bug tracker`_. (Thanks!)
.. _ispc-users: http://groups.google.com/group/ispc-users
.. _bug tracker: https://github.com/ispc/ispc/issues?state=open
Contents:
* `Recent Changes to ISPC`_
@@ -102,6 +113,8 @@ Contents:
+ `Small Performance Tricks`_
+ `Instrumenting Your ISPC Programs`_
+ `Using Scan Operations For Variable Output`_
+ `Application-Supplied Execution Masks`_
+ `Explicit Vector Programming With Uniform Short Vector Types`_
* `Disclaimer and Legal Information`_
@@ -1174,7 +1187,7 @@ This code implicitly assumes that ``programCount`` evenly divides
::
for (uniform int i = 0; i < count; i += programCount) {
if (i + programIndex < programCount) {
if (i + programIndex < count) {
float d = data[i + programIndex];
...
@@ -2209,14 +2222,14 @@ Both the ``foo`` and ``bar`` global variables can be accessed on each
side.
``ispc`` code can also call back to C/C++. On the ``ispc`` side, any
application functions to be called must be declared with the ``export "C"``
application functions to be called must be declared with the ``extern "C"``
qualifier.
::
extern "C" void foo(uniform float f, uniform float g);
Unlike in C++, ``export "C"`` doesn't take braces to delineate
Unlike in C++, ``extern "C"`` doesn't take braces to delineate
multiple functions to be declared; thus, multiple C functions to be called
from ``ispc`` must be declared as follows:
@@ -2843,6 +2856,91 @@ values to ``outArray[1]`` and ``outArray[2]``, and so forth. The
``reduce_add`` call at the end returns the total number of values that the
program instances have written to the array.
Application-Supplied Execution Masks
------------------------------------
Recall that when execution transitions from the application code to an
``ispc`` function, all of the program instances are initially executing.
In some cases, it may be desired that only some of them are running, based on
a data-dependent condition computed in the application program. This
situation can easily be handled via an additional parameter from the
application.
As a simple example, consider a case where the application code has an
array of ``float`` values and we'd like the ``ispc`` code to update
just specific values in that array, where the set of values to be
updated has been determined by the application. In C++ code, we might
have:
::
int count = ...;
float *array = new float[count];
bool *shouldUpdate = new bool[count];
// initialize array and shouldUpdate
ispc_func(array, shouldUpdate, count);
Then, the ``ispc`` code could process this update as:
::
export void ispc_func(uniform float array[], uniform bool update[],
uniform int count) {
for (uniform int i = 0; i < count; i += programCount) {
cif (update[i+programIndex] == true)
// update array[i+programIndex]...
}
}
(In this case a "coherent" if statement is likely to be worthwhile if the
``update`` array will tend to have sections that are either all-true or
all-false.)
Explicit Vector Programming With Uniform Short Vector Types
-----------------------------------------------------------
The typical model for programming in ``ispc`` is an *implicit* parallel
model, where one writes a program that is apparently doing scalar
computation on values and the program is then vectorized to run in parallel
across the SIMD lanes of a processor. However, ``ispc`` also has some
support for explicit vector unit programming, where the vectorization is
explicit. Some computations may be more effectively described in the
explicit model rather than the implicit model.
This support is provided via ``uniform`` instances of short vectors
(as were introduced in the `Short Vector Types`_ section). Specifically,
if this short program
::
export uniform float<8> madd(uniform float<8> a,
uniform float<8> b, uniform float<8> c) {
return a + b * c;
}
is compiled with the AVX target, ``ispc`` generates the following assembly:
::
_madd:
vmulps %ymm2, %ymm1, %ymm1
vaddps %ymm0, %ymm1, %ymm0
ret
(And similarly, if compiled with a 4-wide SSE target, two ``mulps`` and two
``addps`` instructions are generated, and so forth.)
Note that ``ispc`` doesn't currently support control-flow based on
``uniform`` short vector types; it is thus not possible to write code like:
::
export uniform int<8> count(uniform float<8> a, uniform float<8> b) {
uniform int<8> sum = 0;
while (a++ < b)
++sum;
}
Disclaimer and Legal Information
================================

View File

@@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler"
# This could be handy for archiving the generated documentation or
# if some version control system is used.
PROJECT_NUMBER = 1.0.7
PROJECT_NUMBER = 1.0.9
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
# base path where the generated documentation will be put.

View File

@@ -1,8 +1,20 @@
CXX=g++ -m64
CXXFLAGS=-Iobjs/ -O3 -Wall
ARCH = $(shell uname)
TASK_CXX=../tasks_pthreads.cpp
TASK_LIB=-lpthread
ifeq ($(ARCH), Darwin)
TASK_CXX=../tasks_gcd.cpp
TASK_LIB=
endif
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
CXX=g++
CXXFLAGS=-Iobjs/ -O3 -Wall -m64
ISPC=ispc
ISPCFLAGS=-O2 --fast-math --arch=x86-64
ISPCFLAGS=-O2 --target=sse4 --arch=x86-64
default: ao
@@ -14,12 +26,15 @@ dirs:
clean:
/bin/rm -rf objs *~ ao
ao: dirs objs/ao.o objs/ao_serial.o objs/ao_ispc.o
$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/ao_serial.o -lm -lpthread
ao: dirs objs/ao.o objs/ao_serial.o objs/ao_ispc.o $(TASK_OBJ)
$(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/ao_serial.o $(TASK_OBJ) -lm $(TASK_LIB)
objs/%.o: %.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/%.o: ../%.cpp
$(CXX) $< $(CXXFLAGS) -c -o $@
objs/ao.o: objs/ao_ispc.h
objs/%_ispc.h objs/%_ispc.o: %.ispc

View File

@@ -173,10 +173,30 @@ int main(int argc, char **argv)
}
// Report results and save image
printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n", minTimeISPC,
width, height);
printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n",
minTimeISPC, width, height);
savePPM("ao-ispc.ppm", width, height);
//
// Run the ispc + tasks path, test_iterations times, and report the
// minimum time for any of them.
//
double minTimeISPCTasks = 1e30;
for (unsigned int i = 0; i < test_iterations; i++) {
memset((void *)fimg, 0, sizeof(float) * width * height * 3);
assert(NSUBSAMPLES == 2);
reset_and_start_timer();
ao_ispc_tasks(width, height, NSUBSAMPLES, fimg);
double t = get_elapsed_mcycles();
minTimeISPCTasks = std::min(minTimeISPCTasks, t);
}
// Report results and save image
printf("[aobench ispc + tasks]:\t\t[%.3f] M cycles (%d x %d image)\n",
minTimeISPCTasks, width, height);
savePPM("ao-ispc-tasks.ppm", width, height);
//
// Run the serial path, again test_iteration times, and report the
// minimum time.
@@ -193,7 +213,8 @@ int main(int argc, char **argv)
// Report more results, save another image...
printf("[aobench serial]:\t\t[%.3f] M cycles (%d x %d image)\n", minTimeSerial,
width, height);
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minTimeSerial / minTimeISPC);
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
savePPM("ao-serial.ppm", width, height);
return 0;

View File

@@ -203,8 +203,9 @@ ambient_occlusion(reference Isect isect, reference Plane plane,
/* Compute the image for the scanlines from [y0,y1), for an overall image
of width w and height h.
*/
void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
uniform int nsubsamples, reference uniform float image[]) {
static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
uniform int h, uniform int nsubsamples,
reference uniform float image[]) {
static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
static Sphere spheres[3] = {
{ { -2.0f, 0.0f, -3.5f }, 0.5f },
@@ -231,6 +232,9 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
// direction we do per iteration and ny the number in y.
uniform int nx = 1, ny = 1;
// FIXME: We actually need ny to be 1 regardless of the decomposition,
// since the task decomposition is one scanline high.
if (programCount == 8) {
// Do two pixels at once in the x direction
nx = 2;
@@ -239,19 +243,21 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
++du;
}
else if (programCount == 16) {
// Two at once in both x and y
nx = ny = 2;
if ((programIndex >= 4 && programIndex < 8) || programIndex >= 12)
nx = 4;
ny = 1;
if (programIndex >= 4 && programIndex < 8)
++du;
if (programIndex >= 8)
++dv;
if (programIndex >= 8 && programIndex < 12)
du += 2;
if (programIndex >= 12)
du += 3;
}
// Now loop over all of the pixels, stepping in x and y as calculated
// above. (Assumes that ny divides y and nx divides x...)
for (uniform int y = y0; y < y1; y += ny) {
for (uniform int x = 0; x < w; x += nx) {
// Figur out x,y pixel in NDC
// Figure out x,y pixel in NDC
float px = (x + du - (w / 2.0f)) / (w / 2.0f);
float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
float ret = 0.f;
@@ -293,7 +299,7 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
// offset to the first pixel in the image
uniform int offset = 3 * (y * w + x);
for (uniform int p = 0; p < programCount; p += 4, ++offset) {
for (uniform int p = 0; p < programCount; p += 4, offset += 3) {
// Get the four sample values for this pixel
uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
retArray[p+3];
@@ -315,3 +321,18 @@ export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
uniform float image[]) {
ao_scanlines(0, h, w, h, nsubsamples, image);
}
static void task ao_task(uniform int y0, uniform int y1, uniform int width,
uniform int height, uniform int nsubsamples,
uniform float image[]) {
ao_scanlines(y0, y1, width, height, nsubsamples, image);
}
export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples,
uniform float image[]) {
uniform int dy = 1;
for (uniform int y = 0; y < h; y += dy)
launch < ao_task(y, y+dy, w, h, nsubsamples, image) >;
}

View File

@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
@@ -21,6 +21,7 @@
<ItemGroup>
<ClCompile Include="ao.cpp" />
<ClCompile Include="ao_serial.cpp" />
<ClCompile Include="../tasks_concrt.cpp" />
</ItemGroup>
<ItemGroup>
<CustomBuild Include="ao.ispc">

View File

@@ -2,7 +2,7 @@
CXX=g++ -m64
CXXFLAGS=-Iobjs/ -g3 -Wall
ISPC=ispc
ISPCFLAGS=-O2 --fast-math --instrument --arch=x86-64
ISPCFLAGS=-O2 --instrument --arch=x86-64
default: ao

View File

@@ -40,6 +40,7 @@
#include <stdio.h>
#include <algorithm>
#include <string.h>
#include "../timing.h"
#include "../cpuid.h"
#include "mandelbrot_ispc.h"
@@ -99,8 +100,12 @@ ensureTargetISAIsSupported() {
}
}
static void usage() {
fprintf(stderr, "usage: mandelbrot [--scale=<factor]\n");
exit(1);
}
int main() {
int main(int argc, char *argv[]) {
unsigned int width = 1536;
unsigned int height = 1024;
float x0 = -2;
@@ -108,6 +113,25 @@ int main() {
float y0 = -1;
float y1 = 1;
if (argc == 1)
;
else if (argc == 2) {
if (strncmp(argv[1], "--scale=", 8) == 0) {
float scale = atof(argv[1] + 8);
if (scale == 0.f)
usage();
width *= scale;
height *= scale;
// round up to multiples of 16
width = (width + 0xf) & ~0xf;
height = (height + 0xf) & ~0xf;
}
else
usage();
}
else
usage();
ensureTargetISAIsSupported();
int maxIterations = 512;

View File

@@ -131,11 +131,11 @@ static float Noise(float x, float y, float z) {
}
static float Turbulence(float x, float y, float z, int octaves) {
static float Turbulence(float x, float y, float z, uniform int octaves) {
float omega = 0.6;
float sum = 0., lambda = 1., o = 1.;
for (int i = 0; i < octaves; ++i) {
for (uniform int i = 0; i < octaves; ++i) {
sum += abs(o * Noise(lambda * x, lambda * y, lambda * z));
lambda *= 1.99f;
o *= omega;

View File

@@ -52,7 +52,8 @@ using namespace ispc;
typedef unsigned int uint;
extern void raytrace_serial(int width, int height, const float raster2camera[4][4],
extern void raytrace_serial(int width, int height, int baseWidth, int baseHeight,
const float raster2camera[4][4],
const float camera2world[4][4], float image[],
int id[], const LinearBVHNode nodes[],
const Triangle triangles[]);
@@ -127,11 +128,28 @@ ensureTargetISAIsSupported() {
}
static void usage() {
fprintf(stderr, "rt [--scale=<factor>] <scene name base>\n");
exit(1);
}
int main(int argc, char *argv[]) {
if (argc != 2) {
fprintf(stderr, "usage: rt <filename base>\n");
exit(1);
float scale = 1.f;
const char *filename = NULL;
for (int i = 1; i < argc; ++i) {
if (strncmp(argv[i], "--scale=", 8) == 0) {
scale = atof(argv[i] + 8);
if (scale == 0.f)
usage();
}
else if (filename != NULL)
usage();
else
filename = argv[i];
}
if (filename == NULL)
usage();
ensureTargetISAIsSupported();
@@ -145,10 +163,10 @@ int main(int argc, char *argv[]) {
// Read the camera specification information from the camera file
//
char fnbuf[1024];
sprintf(fnbuf, "%s.camera", argv[1]);
sprintf(fnbuf, "%s.camera", filename);
FILE *f = fopen(fnbuf, "rb");
if (!f) {
perror(argv[1]);
perror(fnbuf);
return 1;
}
@@ -156,20 +174,20 @@ int main(int argc, char *argv[]) {
// Nothing fancy, and trouble if we run on a big-endian system, just
// fread in the bits
//
int width, height;
int baseWidth, baseHeight;
float camera2world[4][4], raster2camera[4][4];
READ(width, 1);
READ(height, 1);
READ(baseWidth, 1);
READ(baseHeight, 1);
READ(camera2world[0][0], 16);
READ(raster2camera[0][0], 16);
//
// Read in the serialized BVH
//
sprintf(fnbuf, "%s.bvh", argv[1]);
sprintf(fnbuf, "%s.bvh", filename);
f = fopen(fnbuf, "rb");
if (!f) {
perror(argv[2]);
perror(fnbuf);
return 1;
}
@@ -216,10 +234,10 @@ int main(int argc, char *argv[]) {
}
fclose(f);
// round image resolution up to multiple of 4 to make things easy for
// round image resolution up to multiple of 16 to make things easy for
// the code that assigns pixels to ispc program instances
height = (height + 3) & ~3;
width = (width + 3) & ~3;
int height = (int(baseHeight * scale) + 0xf) & ~0xf;
int width = (int(baseWidth * scale) + 0xf) & ~0xf;
// allocate images; one to hold hit object ids, one to hold depth to
// the first intersection
@@ -232,8 +250,8 @@ int main(int argc, char *argv[]) {
double minTimeISPC = 1e30;
for (int i = 0; i < 3; ++i) {
reset_and_start_timer();
raytrace_ispc(width, height, raster2camera, camera2world,
image, id, nodes, triangles);
raytrace_ispc(width, height, baseWidth, baseHeight, raster2camera,
camera2world, image, id, nodes, triangles);
double dt = get_elapsed_mcycles();
minTimeISPC = std::min(dt, minTimeISPC);
}
@@ -251,8 +269,8 @@ int main(int argc, char *argv[]) {
double minTimeISPCtasks = 1e30;
for (int i = 0; i < 3; ++i) {
reset_and_start_timer();
raytrace_ispc_tasks(width, height, raster2camera, camera2world,
image, id, nodes, triangles);
raytrace_ispc_tasks(width, height, baseWidth, baseHeight, raster2camera,
camera2world, image, id, nodes, triangles);
double dt = get_elapsed_mcycles();
minTimeISPCtasks = std::min(dt, minTimeISPCtasks);
}
@@ -271,8 +289,8 @@ int main(int argc, char *argv[]) {
double minTimeSerial = 1e30;
for (int i = 0; i < 3; ++i) {
reset_and_start_timer();
raytrace_serial(width, height, raster2camera, camera2world,
image, id, nodes, triangles);
raytrace_serial(width, height, baseWidth, baseHeight, raster2camera,
camera2world, image, id, nodes, triangles);
double dt = get_elapsed_mcycles();
minTimeSerial = std::min(dt, minTimeSerial);
}

View File

@@ -227,12 +227,17 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
static void raytrace_tile(uniform int x0, uniform int x1,
uniform int y0, uniform int y1, uniform int width,
uniform int y0, uniform int y1,
uniform int width, uniform int height,
uniform int baseWidth, uniform int baseHeight,
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
uniform float image[], uniform int id[],
const LinearBVHNode nodes[],
const Triangle triangles[]) {
uniform float widthScale = (float)(baseWidth) / (float)(width);
uniform float heightScale = (float)(baseHeight) / (float)(height);
static const uniform float udx[16] = { 0, 1, 0, 1, 2, 3, 2, 3,
0, 1, 0, 1, 2, 3, 2, 3 };
static const uniform float udy[16] = { 0, 0, 1, 1, 0, 0, 1, 1,
@@ -252,7 +257,8 @@ static void raytrace_tile(uniform int x0, uniform int x1,
const float dy = udy[o * programCount + programIndex];
Ray ray;
generateRay(raster2camera, camera2world, x+dx, y+dy, ray);
generateRay(raster2camera, camera2world, (x+dx)*widthScale,
(y+dy)*heightScale, ray);
BVHIntersect(nodes, triangles, ray);
int offset = (y + (int)dy) * width + (x + (int)dx);
@@ -265,29 +271,35 @@ static void raytrace_tile(uniform int x0, uniform int x1,
export void raytrace_ispc(uniform int width, uniform int height,
+uniform int baseWidth, uniform int baseHeight,
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
uniform float image[], uniform int id[],
const LinearBVHNode nodes[],
const Triangle triangles[]) {
-raytrace_tile(0, width, 0, height, width, raster2camera, camera2world, image,
+raytrace_tile(0, width, 0, height, width, height, baseWidth, baseHeight,
+raster2camera, camera2world, image,
id, nodes, triangles);
}
task void raytrace_tile_task(uniform int x0, uniform int x1,
-uniform int y0, uniform int y1, uniform int width,
+uniform int y0, uniform int y1,
+uniform int width, uniform int height,
+uniform int baseWidth, uniform int baseHeight,
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
uniform float image[], uniform int id[],
const LinearBVHNode nodes[],
const Triangle triangles[]) {
-raytrace_tile(x0, x1, y0, y1, width, raster2camera, camera2world, image,
+raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight,
+raster2camera, camera2world, image,
id, nodes, triangles);
}
export void raytrace_ispc_tasks(uniform int width, uniform int height,
+uniform int baseWidth, uniform int baseHeight,
const uniform float raster2camera[4][4],
const uniform float camera2world[4][4],
uniform float image[], uniform int id[],
@@ -298,9 +310,9 @@ export void raytrace_ispc_tasks(uniform int width, uniform int height,
uniform int y1 = min(y + dy, height);
for (uniform int x = 0; x < width; x += dx) {
uniform int x1 = min(x + dx, width);
-launch < raytrace_tile_task(x, x1, y, y1, width, raster2camera,
-camera2world, image, id, nodes,
-triangles) >;
+launch < raytrace_tile_task(x, x1, y, y1, width, height, baseWidth,
+baseHeight, raster2camera, camera2world,
+image, id, nodes, triangles) >;
}
}
}
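The baseWidth/baseHeight plumbing above exists so rays are still generated in the raster space the serialized camera matrices were built for, while the framebuffer can be rendered at a scaled width x height. A minimal sketch of the remapping (names mirror the code; an illustration, not the actual implementation):

    // Map a pixel (x, y) in the scaled framebuffer back into the
    // baseWidth x baseHeight raster space that raster2camera expects.
    struct RasterPos { float x, y; };
    static RasterPos toBaseRaster(int x, int y, int width, int height,
                                  int baseWidth, int baseHeight) {
        float widthScale = (float)baseWidth / (float)width;
        float heightScale = (float)baseHeight / (float)height;
        RasterPos p = { x * widthScale, y * heightScale };
        return p;
    }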


@@ -258,17 +258,21 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
}
-void raytrace_serial(int width, int height,
+void raytrace_serial(int width, int height, int baseWidth, int baseHeight,
const float raster2camera[4][4],
const float camera2world[4][4],
float image[],
int id[],
const LinearBVHNode nodes[],
const Triangle triangles[]) {
+float widthScale = float(baseWidth) / float(width);
+float heightScale = float(baseHeight) / float(height);
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; ++x) {
Ray ray;
-generateRay(raster2camera, camera2world, x, y, ray);
+generateRay(raster2camera, camera2world, x * widthScale,
+y * heightScale, ray);
BVHIntersect(nodes, triangles, ray);
int offset = y * width + x;


@@ -33,10 +33,20 @@
#include "taskinfo.h"
#if defined(_WIN32) || defined(_WIN64)
#define ISPC_IS_WINDOWS
#elif defined(__linux__)
#define ISPC_IS_LINUX
#elif defined(__APPLE__)
#define ISPC_IS_APPLE
#endif
/* A simple task system for ispc programs based on Apple's Grand Central
Dispatch. */
#include <dispatch/dispatch.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
static int initialized = 0;
static volatile int32_t lock = 0;
@@ -47,6 +57,8 @@ static dispatch_group_t gcdGroup;
extern "C" {
void ISPCLaunch(void *f, void *data);
void ISPCSync();
void *ISPCMalloc(int64_t size, int32_t alignment);
void ISPCFree(void *ptr);
}
@@ -97,3 +109,18 @@ void ISPCSync() {
lResetTaskInfo();
}
void *ISPCMalloc(int64_t size, int32_t alignment) {
void *mem = malloc(size + (alignment-1) + sizeof(void*));
char *amem = ((char*)mem) + sizeof(void*);
amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
(alignment - 1)));
((void**)amem)[-1] = mem;
return amem;
}
void ISPCFree(void *ptr) {
free(((void**)ptr)[-1]);
}


@@ -31,6 +31,14 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if defined(_WIN32) || defined(_WIN64)
#define ISPC_IS_WINDOWS
#elif defined(__linux__)
#define ISPC_IS_LINUX
#elif defined(__APPLE__)
#define ISPC_IS_APPLE
#endif
#include "taskinfo.h"
#include <pthread.h>
#include <semaphore.h>
@@ -46,6 +54,9 @@
#include <stdint.h>
#include <stdlib.h>
#include <errno.h>
#ifdef ISPC_IS_LINUX
#include <malloc.h>
#endif
static int initialized = 0;
static volatile int32_t lock = 0;
@@ -63,6 +74,8 @@ static pthread_cond_t tasksRunningCondition;
extern "C" {
void ISPCLaunch(void *f, void *data);
void ISPCSync();
void *ISPCMalloc(int64_t size, int32_t alignment);
void ISPCFree(void *ptr);
}
static void *lTaskEntry(void *arg);
@@ -292,3 +305,35 @@ void ISPCSync() {
exit(1);
}
}
void *ISPCMalloc(int64_t size, int32_t alignment) {
#ifdef ISPC_IS_WINDOWS
return _aligned_malloc(size, alignment);
#endif
#ifdef ISPC_IS_LINUX
return memalign(alignment, size);
#endif
#ifdef ISPC_IS_APPLE
void *mem = malloc(size + (alignment-1) + sizeof(void*));
char *amem = ((char*)mem) + sizeof(void*);
amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
(alignment - 1)));
((void**)amem)[-1] = mem;
return amem;
#endif
}
void ISPCFree(void *ptr) {
#ifdef ISPC_IS_WINDOWS
_aligned_free(ptr);
#endif
#ifdef ISPC_IS_LINUX
free(ptr);
#endif
#ifdef ISPC_IS_APPLE
free(((void**)ptr)[-1]);
#endif
}
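The ISPCMalloc/ISPCFree pairs above implement aligned allocation by over-allocating, bumping the pointer up to the next alignment boundary, and stashing malloc()'s original pointer in the word just below the returned address. One caution (my reading, not a change made by the commit): when mem + sizeof(void*) already lands on the boundary, the adjustment adds a full alignment bytes, so size + alignment + sizeof(void*) is the safe amount to request; size + (alignment-1) + sizeof(void*) can come up one byte short. A sketch with that bound, assuming a power-of-two alignment:

    #include <stdint.h>
    #include <stdlib.h>

    void *alignedAlloc(int64_t size, int32_t alignment) {
        // Room for the worst-case bump plus the stashed pointer.
        void *mem = malloc(size + alignment + sizeof(void*));
        char *amem = (char *)mem + sizeof(void*);
        amem += alignment - ((uintptr_t)amem & (alignment - 1));
        ((void**)amem)[-1] = mem;   // original pointer, recovered by free
        return amem;
    }

    void alignedFree(void *ptr) {
        free(((void**)ptr)[-1]);
    }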

expr.cpp

@@ -741,6 +741,12 @@ UnaryExpr::TypeCheck() {
}
int
UnaryExpr::EstimateCost() const {
return (expr ? expr->EstimateCost() : 0) + COST_SIMPLE_ARITH_LOGIC_OP;
}
void
UnaryExpr::Print() const {
if (!expr || !GetType())
@@ -1299,6 +1305,17 @@ BinaryExpr::TypeCheck() {
if (type0 == NULL || type1 == NULL)
return NULL;
if (dynamic_cast<const ReferenceType *>(type0) != NULL) {
arg0 = new DereferenceExpr(arg0, arg0->pos);
type0 = arg0->GetType();
assert(type0 != NULL);
}
if (dynamic_cast<const ReferenceType *>(type1) != NULL) {
arg1 = new DereferenceExpr(arg1, arg1->pos);
type1 = arg1->GetType();
assert(type1 != NULL);
}
switch (op) {
case Shl:
case Shr:
@@ -1445,6 +1462,15 @@ BinaryExpr::TypeCheck() {
}
int
BinaryExpr::EstimateCost() const {
return ((arg0 ? arg0->EstimateCost() : 0) +
(arg1 ? arg1->EstimateCost() : 0) +
((op == Div || op == Mod) ? COST_COMPLEX_ARITH_OP :
COST_SIMPLE_ARITH_LOGIC_OP));
}
void
BinaryExpr::Print() const {
if (!arg0 || !arg1 || !GetType())
@@ -1696,6 +1722,20 @@ AssignExpr::TypeCheck() {
}
int
AssignExpr::EstimateCost() const {
int cost = ((lvalue ? lvalue->EstimateCost() : 0) +
(rvalue ? rvalue->EstimateCost() : 0));
cost += COST_ASSIGN;
if (op == Assign)
return cost;
if (op == DivAssign || op == ModAssign)
return cost + COST_COMPLEX_ARITH_OP;
else
return cost + COST_SIMPLE_ARITH_LOGIC_OP;
}
void
AssignExpr::Print() const {
if (!lvalue || !rvalue || !GetType())
@@ -1944,6 +1984,12 @@ SelectExpr::TypeCheck() {
}
int
SelectExpr::EstimateCost() const {
return COST_SELECT;
}
void
SelectExpr::Print() const {
if (!test || !expr1 || !expr2 || !GetType())
@@ -2222,55 +2268,6 @@ FunctionCallExpr::FunctionCallExpr(Expr *f, ExprList *a, SourcePos p, bool il)
}
/** Starting from the function initialFunction, we're calling into
calledFunc. The question is: is this a recursive call back to
initialFunc? If it definitely is or if it may be, then return true.
Return false if it definitely is not.
*/
static bool
lMayBeRecursiveCall(llvm::Function *calledFunc,
llvm::Function *initialFunc,
std::set<llvm::Function *> &seenFuncs) {
// Easy case: intrinsics aren't going to call functions themselves
if (calledFunc->isIntrinsic())
return false;
std::string name = calledFunc->getName();
if (name.size() > 2 && name[0] == '_' && name[1] == '_')
// builtin stdlib function; none of these are recursive...
return false;
if (calledFunc->isDeclaration())
// There's visibility into what the called function does without a
// definition, so we have to be conservative
return true;
if (calledFunc == initialFunc)
// hello recursive call
return true;
// Otherwise iterate over all of the instructions in the function. If
// any of them is a function call, then check recursively.
llvm::inst_iterator iter;
for (iter = llvm::inst_begin(calledFunc);
iter != llvm::inst_end(calledFunc); ++iter) {
llvm::Instruction *inst = &*iter;
llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst);
if (ci != NULL) {
llvm::Function *nextCalledFunc = ci->getCalledFunction();
// Don't repeatedly test functions we've seen before
if (seenFuncs.find(nextCalledFunc) == seenFuncs.end()) {
seenFuncs.insert(nextCalledFunc);
if (lMayBeRecursiveCall(nextCalledFunc, initialFunc,
seenFuncs))
return true;
}
}
}
return false;
}
llvm::Value *
FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const {
if (!func || !args)
@@ -2391,47 +2388,14 @@ FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const {
}
}
// We sometimes need to check to see if the mask is all off here;
// specifically, if the mask is all off and we call a recursive
// function, then we will probably have an undesirable infinite loop.
ctx->SetDebugPos(pos);
llvm::BasicBlock *bDoCall = ctx->CreateBasicBlock("funcall_mask_ok");
llvm::BasicBlock *bSkip = ctx->CreateBasicBlock("funcall_mask_off");
llvm::BasicBlock *bAfter = ctx->CreateBasicBlock("after_funcall");
llvm::Function *currentFunc = ctx->GetCurrentBasicBlock()->getParent();
// If we need to check the mask (it may be a recursive call, possibly
// transitively), or we're launching a task, which is expensive and
// thus probably always worth checking, then use the mask to choose
// whether to go to the bDoCallBlock or the bSkip block
std::set<llvm::Function *> seenFuncs;
seenFuncs.insert(currentFunc);
if (ft->isTask || lMayBeRecursiveCall(callee, currentFunc, seenFuncs)) {
Debug(pos, "Checking mask before function call \"%s\".", funSym->name.c_str());
ctx->BranchIfMaskAny(bDoCall, bSkip);
}
else
// If we don't need to check the mask, then always do the call;
// just jump to bDoCall
ctx->BranchInst(bDoCall);
// And the bSkip block just jumps immediately to bAfter. So why do we
// need it? So the phi node below can easily tell what paths are
// going into it
ctx->SetCurrentBasicBlock(bSkip);
ctx->BranchInst(bAfter);
// Emit the code to do the function call
ctx->SetCurrentBasicBlock(bDoCall);
llvm::Value *retVal = NULL;
ctx->SetDebugPos(pos);
if (ft->isTask)
ctx->LaunchInst(callee, argVals);
else {
// Most of the time, the mask is passed as the last argument. This
-// isn't the case for things like SSE intrinsics and extern "C"
-// functions from the application.
+// isn't the case for things like intrinsics, builtins, and extern
+// "C" functions from the application.
assert(callargs.size() + 1 == callee->arg_size() ||
callargs.size() == callee->arg_size());
@@ -2458,22 +2422,10 @@ FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const {
}
}
// And jump out to the 'after function call' basic block
ctx->BranchInst(bAfter);
ctx->SetCurrentBasicBlock(bAfter);
if (isVoidFunc)
return NULL;
// The return value for the non-void case is either undefined or the
// function return value, depending on whether we actually ran the code
// path that called the function or not.
LLVM_TYPE_CONST llvm::Type *lrType = ft->GetReturnType()->LLVMType(g->ctx);
llvm::PHINode *ret = ctx->PhiNode(lrType, 2, "fun_ret");
assert(retVal != NULL);
ret->addIncoming(llvm::UndefValue::get(lrType), bSkip);
ret->addIncoming(retVal, bDoCall);
return ret;
else
return retVal;
}
@@ -2534,6 +2486,13 @@ FunctionCallExpr::TypeCheck() {
}
int
FunctionCallExpr::EstimateCost() const {
return ((args ? args->EstimateCost() : 0) +
(isLaunch ? COST_TASK_LAUNCH : COST_FUNCALL));
}
void
FunctionCallExpr::Print() const {
if (!func || !args || !GetType())
@@ -2622,7 +2581,7 @@ ExprList::GetConstant(const Type *type) const {
}
if (dynamic_cast<const StructType *>(type) != NULL) {
#if defined(LLVM_2_8) || defined(LLVM_2_9)
#if defined(LLVM_2_9)
return llvm::ConstantStruct::get(*g->ctx, cv, false);
#else
LLVM_TYPE_CONST llvm::StructType *llvmStructType =
@@ -2645,6 +2604,17 @@ ExprList::GetConstant(const Type *type) const {
}
int
ExprList::EstimateCost() const {
int cost = 0;
for (unsigned int i = 0; i < exprs.size(); ++i) {
if (exprs[i] != NULL)
cost += exprs[i]->EstimateCost();
}
return cost;
}
void
ExprList::Print() const {
printf("expr list (");
@@ -2775,6 +2745,22 @@ IndexExpr::GetLValue(FunctionEmitContext *ctx) const {
if (!basePtr)
return NULL;
// If the array index is a compile time constant, check to see if it
// may lead to an out-of-bounds access.
ConstExpr *ce = dynamic_cast<ConstExpr *>(index);
const SequentialType *seqType = dynamic_cast<const SequentialType *>(type);
assert(seqType != NULL);
int nElements = seqType->GetElementCount();
if (ce != NULL && nElements > 0) {
int32_t indices[ISPC_MAX_NVEC];
int count = ce->AsInt32(indices);
for (int i = 0; i < count; ++i) {
if (indices[i] < 0 || indices[i] >= nElements)
Warning(index->pos, "Array index \"%d\" may be out of bounds for "
"\"%d\" element array.", indices[i], nElements);
}
}
basePtr = lCastUniformVectorBasePtr(basePtr, ctx);
ctx->SetDebugPos(pos);
@@ -2827,6 +2813,16 @@ IndexExpr::TypeCheck() {
}
int
IndexExpr::EstimateCost() const {
// be pessimistic
if (index && index->GetType()->IsVaryingType())
return COST_GATHER;
else
return COST_LOAD;
}
void
IndexExpr::Print() const {
if (!arrayOrVector || !index || !GetType())
@@ -3126,6 +3122,7 @@ MemberExpr::create(Expr *e, const char *id, SourcePos p, SourcePos idpos) {
return new MemberExpr(e, id, p, idpos);
}
MemberExpr::MemberExpr(Expr *e, const char *id, SourcePos p, SourcePos idpos)
: Expr(p), identifierPos(idpos) {
expr = e;
@@ -3222,6 +3219,14 @@ MemberExpr::Optimize() {
}
int
MemberExpr::EstimateCost() const {
// FIXME: return gather cost when we can tell a gather is going to be
// needed
return COST_SIMPLE_ARITH_LOGIC_OP;
}
void
MemberExpr::Print() const {
if (!expr || !GetType())
@@ -4017,6 +4022,12 @@ ConstExpr::TypeCheck() {
}
int
ConstExpr::EstimateCost() const {
return 0;
}
void
ConstExpr::Print() const {
printf("[%s] (", GetType()->GetString().c_str());
@@ -4103,7 +4114,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
case AtomicType::TYPE_BOOL:
if (fromType->IsVaryingType() &&
LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
-// If we have a bool vector of i32 element,s first truncate
+// If we have a bool vector of i32 elements, first truncate
// down to a single bit
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
// And then do an unsigned int->float cast
@@ -4163,9 +4174,6 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
case AtomicType::TYPE_UINT16:
case AtomicType::TYPE_UINT32:
case AtomicType::TYPE_UINT64:
if (fromType->IsVaryingType())
PerformanceWarning(pos, "Conversion from unsigned int64 to float is slow. "
"Use \"int64\" if possible");
cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int
exprVal, targetType, "uint2double");
break;
@@ -4937,6 +4945,13 @@ TypeCastExpr::Optimize() {
}
int
TypeCastExpr::EstimateCost() const {
// FIXME: return COST_TYPECAST_COMPLEX when appropriate
return COST_TYPECAST_SIMPLE;
}
void
TypeCastExpr::Print() const {
printf("[%s] type cast (", GetType()->GetString().c_str());
@@ -5002,6 +5017,12 @@ ReferenceExpr::TypeCheck() {
}
int
ReferenceExpr::EstimateCost() const {
return 0;
}
void
ReferenceExpr::Print() const {
if (expr == NULL || GetType() == NULL)
@@ -5080,6 +5101,12 @@ DereferenceExpr::Optimize() {
}
int
DereferenceExpr::EstimateCost() const {
return COST_DEREF;
}
void
DereferenceExpr::Print() const {
if (expr == NULL || GetType() == NULL)
@@ -5151,6 +5178,15 @@ SymbolExpr::Optimize() {
}
int
SymbolExpr::EstimateCost() const {
if (symbol->constValue != NULL)
return 0;
else
return COST_LOAD;
}
void
SymbolExpr::Print() const {
if (symbol == NULL || GetType() == NULL)
@@ -5204,6 +5240,12 @@ FunctionSymbolExpr::Optimize() {
}
int
FunctionSymbolExpr::EstimateCost() const {
return 0;
}
void
FunctionSymbolExpr::Print() const {
if (!matchingFunc || !GetType())
@@ -5238,6 +5280,12 @@ SyncExpr::GetValue(FunctionEmitContext *ctx) const {
}
int
SyncExpr::EstimateCost() const {
return COST_SYNC;
}
void
SyncExpr::Print() const {
printf("sync");

expr.h

@@ -121,8 +121,8 @@ public:
void Print() const;
Expr *Optimize();
Expr *TypeCheck();
int EstimateCost() const;
private:
const Op op;
Expr *expr;
};
@@ -164,8 +164,8 @@ public:
Expr *Optimize();
Expr *TypeCheck();
int EstimateCost() const;
private:
const Op op;
Expr *arg0, *arg1;
};
@@ -196,8 +196,8 @@ public:
Expr *Optimize();
Expr *TypeCheck();
int EstimateCost() const;
private:
const Op op;
Expr *lvalue, *rvalue;
};
@@ -217,8 +217,8 @@ public:
Expr *Optimize();
Expr *TypeCheck();
int EstimateCost() const;
private:
Expr *test, *expr1, *expr2;
};
@@ -240,6 +240,7 @@ public:
llvm::Constant *GetConstant(const Type *type) const;
ExprList *Optimize();
ExprList *TypeCheck();
int EstimateCost() const;
std::vector<Expr *> exprs;
};
@@ -257,12 +258,13 @@ public:
Expr *Optimize();
Expr *TypeCheck();
int EstimateCost() const;
private:
Expr *func;
ExprList *args;
bool isLaunch;
private:
void resolveFunctionOverloads();
bool tryResolve(bool (*matchFunc)(Expr *, const Type *));
};
@@ -285,8 +287,8 @@ public:
Expr *Optimize();
Expr *TypeCheck();
int EstimateCost() const;
private:
Expr *arrayOrVector, *index;
};
@@ -303,16 +305,17 @@ public:
MemberExpr(Expr *expr, const char *identifier, SourcePos pos,
SourcePos identifierPos);
-virtual llvm::Value *GetValue(FunctionEmitContext *ctx) const;
-virtual llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
-virtual const Type *GetType() const;
-virtual Symbol *GetBaseSymbol() const;
-virtual void Print() const;
-virtual Expr *Optimize();
-virtual Expr *TypeCheck();
+llvm::Value *GetValue(FunctionEmitContext *ctx) const;
+llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
+const Type *GetType() const;
+Symbol *GetBaseSymbol() const;
+void Print() const;
+Expr *Optimize();
+Expr *TypeCheck();
+int EstimateCost() const;
virtual int getElementNumber() const;
protected:
std::string getCandidateNearMatches() const;
Expr *expr;
@@ -392,6 +395,7 @@ public:
Expr *TypeCheck();
Expr *Optimize();
int EstimateCost() const;
/** Return the ConstExpr's values as booleans, doing type conversion
from the actual type if needed. If forceVarying is true, then type
@@ -495,8 +499,8 @@ public:
void Print() const;
Expr *TypeCheck();
Expr *Optimize();
int EstimateCost() const;
private:
const Type *type;
Expr *expr;
};
@@ -514,8 +518,8 @@ public:
void Print() const;
Expr *TypeCheck();
Expr *Optimize();
int EstimateCost() const;
private:
Expr *expr;
};
@@ -533,8 +537,8 @@ public:
void Print() const;
Expr *TypeCheck();
Expr *Optimize();
int EstimateCost() const;
private:
Expr *expr;
};
@@ -551,6 +555,7 @@ public:
Expr *TypeCheck();
Expr *Optimize();
void Print() const;
int EstimateCost() const;
private:
Symbol *symbol;
@@ -571,6 +576,7 @@ public:
Expr *TypeCheck();
Expr *Optimize();
void Print() const;
int EstimateCost() const;
private:
friend class FunctionCallExpr;
@@ -597,6 +603,7 @@ public:
Expr *TypeCheck();
Expr *Optimize();
void Print() const;
int EstimateCost() const;
};
#endif // ISPC_EXPR_H


@@ -46,9 +46,7 @@
#endif
#include <llvm/LLVMContext.h>
#include <llvm/Module.h>
#ifndef LLVM_2_8
#include <llvm/Analysis/DIBuilder.h>
#endif
#include <llvm/Analysis/DebugInfo.h>
#include <llvm/Support/Dwarf.h>
#include <llvm/Target/TargetMachine.h>
@@ -72,7 +70,7 @@ Module *m;
bool
Target::GetTarget(const char *arch, const char *cpu, const char *isa,
-Target *t) {
+bool pic, Target *t) {
if (cpu == NULL) {
std::string hostCPU = llvm::sys::getHostCPUName();
if (hostCPU.size() > 0)
@@ -100,6 +98,8 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
bool error = false;
t->generatePIC = pic;
// Make sure the target architecture is a known one; print an error
// with the valid ones otherwise.
t->target = NULL;
@@ -228,14 +228,22 @@ llvm::TargetMachine *
Target::GetTargetMachine() const {
std::string triple = GetTripleString();
llvm::Reloc::Model relocModel = generatePIC ? llvm::Reloc::PIC_ :
llvm::Reloc::Default;
#if defined(LLVM_3_0svn) || defined(LLVM_3_0)
std::string featuresString = attributes;
llvm::TargetMachine *targetMachine =
-target->createTargetMachine(triple, cpu, featuresString);
+target->createTargetMachine(triple, cpu, featuresString, relocModel);
#else
#ifdef ISPC_IS_APPLE
relocModel = llvm::Reloc::PIC_;
#endif // ISPC_IS_APPLE
std::string featuresString = cpu + std::string(",") + attributes;
llvm::TargetMachine *targetMachine =
target->createTargetMachine(triple, featuresString);
#ifndef ISPC_IS_WINDOWS
targetMachine->setRelocationModel(relocModel);
#endif // !ISPC_IS_WINDOWS
#endif
assert(targetMachine != NULL);
@@ -250,6 +258,8 @@ Target::GetTargetMachine() const {
Opt::Opt() {
level = 1;
fastMath = false;
fastMaskedVload = false;
unrollLoops = true;
disableBlendedMaskedStores = false;
disableCoherentControlFlow = false;
disableUniformControlFlow = false;
@@ -299,13 +309,9 @@ SourcePos::SourcePos(const char *n, int l, int c) {
}
llvm::DIFile SourcePos::GetDIFile() const {
#ifdef LLVM_2_8
return llvm::DIFile();
#else
std::string directory, filename;
GetDirectoryAndFileName(g->currentDirectory, name, &directory, &filename);
return m->diBuilder->createFile(filename, directory);
#endif // LLVM_2_8
}

ispc.h

@@ -148,6 +148,8 @@ public:
pointer in place of the original ASTNode *. */
virtual ASTNode *TypeCheck() = 0;
virtual int EstimateCost() const = 0;
/** All AST nodes must track the file position where they are
defined. */
const SourcePos pos;
@@ -162,7 +164,7 @@ struct Target {
name, if the name is a known target. Returns true if the
target was initialized and false if the name is unknown. */
static bool GetTarget(const char *arch, const char *cpu, const char *isa,
-Target *);
+bool pic, Target *);
/** Returns a comma-delimited string giving the names of the currently
supported target ISAs. */
@@ -215,8 +217,12 @@ struct Target {
integer multiple of the native vector width, for example if we're
"doubling up" and compiling 8-wide on a 4-wide SSE system. */
int vectorWidth;
/** Indicates whether position independent code should be generated. */
bool generatePIC;
};
/** @brief Structure that collects optimization options
This structure collects all of the options related to optimization of
@@ -234,6 +240,16 @@ struct Opt {
should be performed. This is false by default. */
bool fastMath;
/** Indicates whether a vector load should be issued for masked loads
on platforms that don't have a native masked vector load. (This may
lead to accessing memory up to programCount-1 elements past the end of
arrays, so is unsafe in general.) */
bool fastMaskedVload;
/** Indicates when loops should be unrolled (when doing so seems like
it will make sense). */
bool unrollLoops;
/** On targets that don't have a masked store instruction but do have a
blending instruction, by default, we simulate masked stores by
loading the old value, blending, and storing the result. This can
@@ -351,6 +367,29 @@ struct Globals {
std::vector<std::string> cppArgs;
};
enum {
COST_ASSIGN = 1,
COST_COHERENT_BREAK_CONTINE = 4,
COST_COMPLEX_ARITH_OP = 4,
COST_DEREF = 4,
COST_FUNCALL = 4,
COST_GATHER = 8,
COST_LOAD = 2,
COST_REGULAR_BREAK_CONTINUE = 2,
COST_RETURN = 4,
COST_SELECT = 4,
COST_SIMPLE_ARITH_LOGIC_OP = 1,
COST_SYNC = 32,
COST_TASK_LAUNCH = 16,
COST_TYPECAST_COMPLEX = 4,
COST_TYPECAST_SIMPLE = 1,
COST_UNIFORM_LOOP = 4,
COST_VARYING_LOOP = 6,
CHECK_MASK_AT_FUNCTION_START_COST = 16,
PREDICATE_SAFE_IF_STATEMENT_COST = 6,
};
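To see how these constants compose, consider a worked example (not part of the commit) for the statement "x = a / b", using the EstimateCost() methods added in expr.cpp above:

    // SymbolExpr a, SymbolExpr b:  COST_LOAD + COST_LOAD    = 2 + 2
    // BinaryExpr a / b:            + COST_COMPLEX_ARITH_OP  = + 4
    // SymbolExpr x (lvalue):       + COST_LOAD              = + 2
    // AssignExpr x = ...:          + COST_ASSIGN            = + 1
    //                                                 total = 11

Since 11 does not exceed CHECK_MASK_AT_FUNCTION_START_COST (16), a function whose body is just this statement skips the "are all lanes running" check at entry; see the checkMask computation in lEmitFunctionCode later in this change.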
extern Globals *g;
extern Module *m;


@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
@@ -31,12 +31,14 @@
<ClCompile Include="opt.cpp" />
<ClCompile Include="parse.cc" />
<CustomBuild Include="builtins-c.c">
-<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c &gt; gen-bitcode-c-32.cpp</Command>
+<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c &gt; gen-bitcode-c-32.cpp;
+%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c &gt; gen-bitcode-c-64.cpp</Command>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clang builtins-c.c</Message>
-<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c &gt; gen-bitcode-c-32.cpp</Command>
+<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c &gt; gen-bitcode-c-32.cpp;
+%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c &gt; gen-bitcode-c-64.cpp</Command>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clang builtins-c.c</Message>
-<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-c.cpp</Outputs>
-<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-c.cpp</Outputs>
+<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-c-32.cpp;gen-bitcore-c-64.cpp</Outputs>
+<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-c-32.cpp;gen-bitcore-c-64.cpp</Outputs>
</CustomBuild>
<ClCompile Include="stmt.cpp" />
<ClCompile Include="sym.cpp" />
@@ -61,9 +63,9 @@
<ItemGroup>
<CustomBuild Include="stdlib.ispc">
<FileType>Document</FileType>
-<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py &gt; gen-stdlib.cpp</Command>
+<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py &gt; gen-stdlib.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-stdlib.cpp</Outputs>
-<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py &gt; gen-stdlib.cpp</Command>
+<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py &gt; gen-stdlib.cpp</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-stdlib.cpp</Outputs>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-stdlib.cpp</Message>
<Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-stdlib.cpp</Message>
@@ -194,7 +196,7 @@
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
-<PreprocessorDefinitions>NOMINMAX;LLVM_2_9</PreprocessorDefinitions>
+<PreprocessorDefinitions>NOMINMAX;LLVM_3_0</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
</ClCompile>
@@ -202,7 +204,7 @@
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
-<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMDebugInfo.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCDisassembler.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
@@ -212,7 +214,7 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
-<PreprocessorDefinitions>NOMINMAX;LLVM_2_9</PreprocessorDefinitions>
+<PreprocessorDefinitions>NOMINMAX;LLVM_3_0</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
</ClCompile>
@@ -222,10 +224,10 @@
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
-<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
+<AdditionalDependencies>clangFrontend.lib;clangDriver.lib;clangSerialization.lib;clangParse.lib;clangSema.lib;clangAnalysis.lib;clangAST.lib;clangLex.lib;clangBasic.lib;LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmParser.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMDebugInfo.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCDisassembler.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Desc.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;LLVMipa.lib;LLVMipo.lib;shlwapi.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>


@@ -33,12 +33,25 @@
#define _CRT_SECURE_NO_WARNINGS
#if defined(_WIN32) || defined(_WIN64)
#define ISPC_IS_WINDOWS
#elif defined(__linux__)
#define ISPC_IS_LINUX
#elif defined(__APPLE__)
#define ISPC_IS_APPLE
#endif
#ifdef ISPC_IS_WINDOWS
#define NOMINMAX
#include <windows.h>
#endif
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <memory.h>
#ifdef ISPC_IS_LINUX
#include <malloc.h>
#endif
#ifdef ISPC_HAVE_SVML
#include <xmmintrin.h>
@@ -62,7 +75,6 @@ extern "C" {
#include <llvm/Instructions.h>
#include <llvm/ExecutionEngine/ExecutionEngine.h>
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
#include <llvm/ExecutionEngine/MCJIT.h>
#include <llvm/Support/TargetRegistry.h>
#include <llvm/Support/TargetSelect.h>
#else
@@ -81,9 +93,7 @@ extern "C" {
#include <llvm/Support/raw_ostream.h>
#include <llvm/Bitcode/ReaderWriter.h>
#include <llvm/Support/MemoryBuffer.h>
#ifndef LLVM_2_8
#include <llvm/Support/system_error.h>
#endif
bool shouldFail = false;
@@ -105,16 +115,35 @@ void ISPCSync() {
}
-#ifdef ISPC_IS_WINDOWS
void *ISPCMalloc(int64_t size, int32_t alignment) {
#ifdef ISPC_IS_WINDOWS
return _aligned_malloc(size, alignment);
#endif
#ifdef ISPC_IS_LINUX
return memalign(alignment, size);
#endif
#ifdef ISPC_IS_APPLE
void *mem = malloc(size + (alignment-1) + sizeof(void*));
char *amem = ((char*)mem) + sizeof(void*);
amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
(alignment - 1)));
((void**)amem)[-1] = mem;
return amem;
#endif
}
void ISPCFree(void *ptr) {
#ifdef ISPC_IS_WINDOWS
_aligned_free(ptr);
-}
#endif
#ifdef ISPC_IS_LINUX
free(ptr);
#endif
#ifdef ISPC_IS_APPLE
free(((void**)ptr)[-1]);
#endif
}
static void usage(int ret) {
fprintf(stderr, "usage: ispc_test\n");
@@ -145,17 +174,6 @@ double Log(double x) { return log(x); }
static bool lRunTest(const char *fn) {
llvm::LLVMContext *ctx = new llvm::LLVMContext;
#ifdef LLVM_2_8
std::string err;
llvm::MemoryBuffer *buf = llvm::MemoryBuffer::getFileOrSTDIN(fn, &err);
if (!buf) {
fprintf(stderr, "Unable to open file \"%s\": %s\n", fn, err.c_str());
delete ctx;
return false;
}
std::string bcErr;
llvm::Module *module = llvm::ParseBitcodeFile(buf, *ctx, &bcErr);
#else
llvm::OwningPtr<llvm::MemoryBuffer> buf;
llvm::error_code err = llvm::MemoryBuffer::getFileOrSTDIN(fn, buf);
if (err) {
@@ -165,7 +183,6 @@ static bool lRunTest(const char *fn) {
}
std::string bcErr;
llvm::Module *module = llvm::ParseBitcodeFile(buf.get(), *ctx, &bcErr);
#endif
if (!module) {
fprintf(stderr, "Bitcode reader failed for \"%s\": %s\n", fn, bcErr.c_str());
@@ -200,10 +217,8 @@ static bool lRunTest(const char *fn) {
ee->addGlobalMapping(func, (void *)FUNC)
DO_FUNC(ISPCLaunch, "ISPCLaunch");
DO_FUNC(ISPCSync, "ISPCSync");
#ifdef ISPC_IS_WINDOWS
DO_FUNC(ISPCMalloc, "ISPCMalloc");
DO_FUNC(ISPCFree, "ISPCFree");
#endif // ISPC_IS_WINDOWS
DO_FUNC(putchar, "putchar");
DO_FUNC(printf, "printf");
DO_FUNC(fflush, "fflush");
@@ -357,8 +372,6 @@ static bool lRunTest(const char *fn) {
int main(int argc, char *argv[]) {
llvm::InitializeNativeTarget();
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
llvm::InitializeAllTargetMCs();
-LLVMLinkInMCJIT();
+LLVMLinkInJIT();
#endif


@@ -52,14 +52,14 @@
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
-<PreprocessorDefinitions>ISPC_IS_WINDOWS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+<PreprocessorDefinitions>LLVM_3_0;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)/lib</AdditionalLibraryDirectories>
-<AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
+<AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Desc.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
@@ -70,7 +70,7 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
-<PreprocessorDefinitions>ISPC_IS_WINDOWS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+<PreprocessorDefinitions>LLVM_3_0;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)/include</AdditionalIncludeDirectories>
</ClCompile>
<Link>
@@ -79,10 +79,10 @@
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalLibraryDirectories>$(LLVM_INSTALL_DIR)/lib</AdditionalLibraryDirectories>
-<AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
+<AdditionalDependencies>LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Desc.lib;LLVMX86Info.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>


@@ -40,11 +40,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <llvm/Support/PrettyStackTrace.h>
#ifdef LLVM_2_8
#include <llvm/System/Signals.h>
#else
#include <llvm/Support/Signals.h>
#endif
#include <llvm/Support/Signals.h>
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
#include <llvm/Support/TargetRegistry.h>
#include <llvm/Support/TargetSelect.h>
@@ -73,7 +69,6 @@ static void usage(int ret) {
printf(" [--emit-asm]\t\t\tGenerate assembly language file as output\n");
printf(" [--emit-llvm]\t\t\tEmit LLVM bitode file as output\n");
printf(" [--emit-obj]\t\t\tGenerate object file file as output (default)\n");
printf(" [--fast-math]\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n");
printf(" [-g]\t\t\t\tGenerate debugging information\n");
printf(" [--help]\t\t\t\tPrint help\n");
printf(" [-h <name>/--header-outfile=<name>]\tOutput filename for header\n");
@@ -87,8 +82,11 @@ static void usage(int ret) {
printf(" [--nocpp]\t\t\t\tDon't run the C preprocessor\n");
printf(" [-o <name>/--outfile=<name>]\tOutput filename (may be \"-\" for standard output)\n");
printf(" [-O0/-O1]\t\t\t\tSet optimization level (-O1 is default)\n");
-#if 0
printf(" [--opt=<option>]\t\t\tSet optimization option\n");
+printf(" disable-loop-unroll\t\tDisable loop unrolling.\n");
+printf(" fast-masked-vload\t\tFaster masked vector loads on SSE (may go past end of array)\n");
+printf(" fast-math\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n");
+#if 0
printf(" disable-blended-masked-stores\t\tScalarize masked stores on SSE (vs. using vblendps)\n");
printf(" disable-coherent-control-flow\t\tDisable coherent control flow optimizations\n");
printf(" disable-uniform-control-flow\t\tDisable uniform control flow optimizations\n");
@@ -98,6 +96,9 @@ static void usage(int ret) {
printf(" disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n");
printf(" disable-masked-store-optimizations\tDisable lowering to regular stores when possible\n");
#endif
#ifndef ISPC_IS_WINDOWS
printf(" [--pic]\t\t\t\tGenerate position-independent code\n");
#endif // !ISPC_IS_WINDOWS
printf(" [--target=<isa>]\t\t\tSelect target ISA. <isa>={%s}\n", Target::SupportedTargetISAs());
printf(" [--version]\t\t\t\tPrint ispc version\n");
printf(" [--woff]\t\t\t\tDisable warnings\n");
@@ -184,8 +185,9 @@ int main(int Argc, char *Argv[]) {
bool debugSet = false, optSet = false;
Module::OutputType ot = Module::Object;
bool generatePIC = false;
const char *arch = NULL, *cpu = NULL, *target = NULL;
for (int i = 1; i < argc; ++i) {
if (!strcmp(argv[i], "--help"))
usage(0);
@@ -195,8 +197,15 @@ int main(int Argc, char *Argv[]) {
arch = argv[i] + 7;
else if (!strncmp(argv[i], "--cpu=", 6))
cpu = argv[i] + 6;
else if (!strcmp(argv[i], "--fast-math"))
g->opt.fastMath = true;
else if (!strcmp(argv[i], "--fast-math")) {
fprintf(stderr, "--fast-math option has been renamed to --opt=fast-math!\n");
usage(1);
}
else if (!strcmp(argv[i], "--fast-masked-vload")) {
fprintf(stderr, "--fast-masked-vload option has been renamed to "
"--opt=fast-masked-vload!\n");
usage(1);
}
else if (!strcmp(argv[i], "--debug"))
g->debugPrint = true;
else if (!strcmp(argv[i], "--instrument"))
@@ -233,7 +242,16 @@ int main(int Argc, char *Argv[]) {
}
else if (!strncmp(argv[i], "--opt=", 6)) {
const char *opt = argv[i] + 6;
if (!strcmp(opt, "disable-blended-masked-stores"))
if (!strcmp(opt, "fast-math"))
g->opt.fastMath = true;
else if (!strcmp(opt, "fast-masked-vload"))
g->opt.fastMaskedVload = true;
else if (!strcmp(opt, "disable-loop-unroll"))
g->opt.unrollLoops = false;
// These are only used for performance tests of specific
// optimizations
else if (!strcmp(opt, "disable-blended-masked-stores"))
g->opt.disableBlendedMaskedStores = true;
else if (!strcmp(opt, "disable-coherent-control-flow"))
g->opt.disableCoherentControlFlow = true;
@@ -286,6 +304,10 @@ int main(int Argc, char *Argv[]) {
g->includeStdlib = false;
else if (!strcmp(argv[i], "--nocpp"))
g->runCPP = false;
#ifndef ISPC_IS_WINDOWS
else if (!strcmp(argv[i], "--pic"))
generatePIC = true;
#endif // !ISPC_IS_WINDOWS
else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version")) {
printf("Intel(r) SPMD Program Compiler (ispc) build %s (%s)\n",
BUILD_DATE, BUILD_VERSION);
@@ -307,7 +329,7 @@ int main(int Argc, char *Argv[]) {
if (debugSet && !optSet)
g->opt.level = 0;
-if (!Target::GetTarget(arch, cpu, target, &g->target))
+if (!Target::GetTarget(arch, cpu, target, generatePIC, &g->target))
usage(1);
m = new Module(file);
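For reference, the renamed and added options above are invoked as follows (a usage sketch; other arguments unchanged):

    ispc --opt=fast-math ...            (formerly --fast-math)
    ispc --opt=fast-masked-vload ...    (formerly --fast-masked-vload)
    ispc --opt=disable-loop-unroll ...
    ispc --pic ...                      (not available on Windows)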


@@ -78,14 +78,11 @@
#include <llvm/Analysis/Verifier.h>
#include <llvm/Support/CFG.h>
#include <clang/Frontend/CompilerInstance.h>
#include <clang/Frontend/TextDiagnosticPrinter.h>
#include <clang/Frontend/Utils.h>
#include <clang/Basic/TargetInfo.h>
#ifndef LLVM_2_8
#include <llvm/Support/ToolOutputFile.h>
#include <llvm/Support/Host.h>
#else // !LLVM_2_8
#include <llvm/System/Host.h>
#endif // LLVM_2_8
#include <llvm/Support/ToolOutputFile.h>
#include <llvm/Support/Host.h>
#include <llvm/Assembly/PrintModulePass.h>
#include <llvm/Support/raw_ostream.h>
#include <llvm/Bitcode/ReaderWriter.h>
@@ -106,14 +103,11 @@ Module::Module(const char *fn) {
module->setTargetTriple(g->target.GetTripleString());
#ifndef LLVM_2_8
if (g->generateDebuggingSymbols)
diBuilder = new llvm::DIBuilder(*module);
else
diBuilder = NULL;
#endif // LLVM_2_8
#ifndef LLVM_2_8
// If we're generating debugging symbols, let the DIBuilder know that
// we're starting a new compilation unit.
if (diBuilder != NULL) {
@@ -139,7 +133,6 @@ Module::Module(const char *fn) {
0 /* run time version */);
}
}
#endif // LLVM_2_8
}
@@ -153,6 +146,9 @@ extern void yy_delete_buffer(YY_BUFFER_STATE);
int
Module::CompileFile() {
if (g->opt.fastMath == true)
llvm::UnsafeFPMath = true;
// FIXME: it'd be nice to do this in the Module constructor, but this
// function ends up calling into routines that expect the global
// variable 'm' to be initialized and available (which it isn't until
@@ -457,6 +453,10 @@ Module::AddGlobal(DeclSpecs *ds, Declarator *decl) {
// declarations, typedefs, and global variables declarations /
// definitions. Figure out what we've got and take care of it.
if (ds == NULL || decl == NULL)
// Error happened earlier during parsing
return;
if (decl->isFunction) {
// function declaration
const Type *t = decl->GetType(ds);
@@ -557,7 +557,6 @@ Module::AddGlobal(DeclSpecs *ds, Declarator *decl) {
decl->sym->name.c_str());
m->symbolTable->AddVariable(decl->sym);
#ifndef LLVM_2_8
if (diBuilder && (ds->storageClass != SC_EXTERN)) {
llvm::DIFile file = decl->pos.GetDIFile();
diBuilder->createGlobalVariable(decl->sym->name,
@@ -567,7 +566,6 @@ Module::AddGlobal(DeclSpecs *ds, Declarator *decl) {
(ds->storageClass == SC_STATIC),
decl->sym->storagePtr);
}
#endif // LLVM_2_8
}
}
@@ -662,6 +660,11 @@ lEmitFunctionCode(FunctionEmitContext *ctx, llvm::Function *function,
// the code to free that memory, now that we've copied the
// parameter values out of the structure.
ctx->EmitFree(structParamPtr);
#else
// We also do this for AVX... (See discussion in
// FunctionEmitContext::LaunchInst().)
if (g->target.isa == Target::AVX)
ctx->EmitFree(structParamPtr);
#endif // ISPC_IS_WINDOWS
}
else {
@@ -700,8 +703,18 @@ lEmitFunctionCode(FunctionEmitContext *ctx, llvm::Function *function,
// Finally, we can generate code for the function
if (code != NULL) {
int costEstimate = code->EstimateCost();
bool checkMask = (ft->isTask == true) ||
(function->hasFnAttr(llvm::Attribute::AlwaysInline) == false);
((function->hasFnAttr(llvm::Attribute::AlwaysInline) == false) &&
costEstimate > CHECK_MASK_AT_FUNCTION_START_COST);
Debug(code->pos, "Estimated cost for function \"%s\" = %d\n",
funSym->name.c_str(), costEstimate);
// If the body of the function is non-trivial, then we wrap the
// entire thing in a varying "cif (true)" test in order to reap
// the side-effect benefit of checking to see if the execution mask
// is all on and thence having a specialized code path for that
// case. If this is a simple function, then this isn't worth the
// code bloat / overhead.
if (checkMask) {
bool allTrue[ISPC_MAX_NVEC];
for (int i = 0; i < g->target.vectorWidth; ++i)
@@ -914,12 +927,7 @@ Module::WriteOutput(OutputType outputType, const char *outFileName) {
return true;
}
else {
#ifdef LLVM_2_8
fprintf(stderr, "Direct object file emission not supported in this build.\n");
return false;
#else
return writeObjectFileOrAssembly(outputType, outFileName);
#endif // LLVM_2_8
}
}
}
@@ -1121,6 +1129,12 @@ lEmitVectorTypedefs(const std::vector<const VectorType *> &types, FILE *file) {
for (unsigned int i = 0; i < types.size(); ++i) {
std::string baseDecl;
const VectorType *vt = types[i]->GetAsNonConstType();
if (!vt->IsUniformType())
// Varying stuff shouldn't be visible to / used by the
// application, so at least make it not simple to access it by
// not declaring the type here...
continue;
int size = vt->GetElementCount();
baseDecl = vt->GetBaseType()->GetCDeclaration("");
@@ -1293,6 +1307,7 @@ Module::writeHeader(const char *fn) {
default:
FATAL("Unhandled target in header emission");
}
fprintf(f, "#define ISPC_TARGET_VECTOR_WIDTH %d\n", g->target.vectorWidth);
fprintf(f, "#ifdef __cplusplus\nnamespace ispc {\n#endif // __cplusplus\n\n");
@@ -1374,23 +1389,26 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre
std::string error;
inst.createFileManager();
-inst.createDiagnostics(0, NULL);
-clang::TargetOptions& options = inst.getTargetOpts();
+llvm::raw_fd_ostream stderrRaw(2, false);
+clang::TextDiagnosticPrinter *diagPrinter =
+new clang::TextDiagnosticPrinter(stderrRaw, clang::DiagnosticOptions());
+inst.createDiagnostics(0, NULL, diagPrinter);
+clang::TargetOptions &options = inst.getTargetOpts();
llvm::Triple triple(module->getTargetTriple());
if (triple.getTriple().empty())
triple.setTriple(llvm::sys::getHostTriple());
options.Triple = triple.getTriple();
-clang::TargetInfo* target
-= clang::TargetInfo::CreateTargetInfo(inst.getDiagnostics(), options);
+clang::TargetInfo *target =
+clang::TargetInfo::CreateTargetInfo(inst.getDiagnostics(), options);
inst.setTarget(target);
inst.createSourceManager(inst.getFileManager());
inst.InitializeSourceManager(infilename);
-clang::PreprocessorOptions& opts = inst.getPreprocessorOpts();
+clang::PreprocessorOptions &opts = inst.getPreprocessorOpts();
//Add defs for ISPC and PI
opts.addMacroDef("ISPC");
@@ -1403,7 +1421,10 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre
}
}
inst.createPreprocessor();
clang::LangOptions langOptions;
diagPrinter->BeginSourceFile(langOptions, &inst.getPreprocessor());
clang::DoPrintPreprocessedInput(inst.getPreprocessor(),
ostream, inst.getPreprocessorOutputOpts());
diagPrinter->EndSourceFile();
}


@@ -91,11 +91,8 @@ public:
/** llvm Module object into which globals and functions are added. */
llvm::Module *module;
#ifndef LLVM_2_8
-/** The diBuilder manages generating debugging information (only
-supported in LLVM 2.9 and beyond...) */
+/** The diBuilder manages generating debugging information */
llvm::DIBuilder *diBuilder;
#endif
GatherBuffer *gatherBuffer;

opt.cpp

@@ -56,13 +56,11 @@
#include <llvm/Intrinsics.h>
#include <llvm/Constants.h>
#include <llvm/Analysis/ConstantFolding.h>
-#ifndef LLVM_2_8
-#include <llvm/Target/TargetLibraryInfo.h>
-#ifdef LLVM_2_9
-#include <llvm/Support/StandardPasses.h>
-#else
-#include <llvm/Transforms/IPO/PassManagerBuilder.h>
-#endif // LLVM_2_9
+#include <llvm/Target/TargetLibraryInfo.h>
+#ifdef LLVM_2_9
+#include <llvm/Support/StandardPasses.h>
+#else
+#include <llvm/Transforms/IPO/PassManagerBuilder.h>
+#endif // LLVM_2_8
#include <llvm/ADT/Triple.h>
#include <llvm/Transforms/Scalar.h>
@@ -73,11 +71,15 @@
#include <llvm/Target/TargetMachine.h>
#include <llvm/Analysis/Verifier.h>
#include <llvm/Support/raw_ostream.h>
#ifndef LLVM_2_8
#include <llvm/Analysis/DIBuilder.h>
#endif
#include <llvm/Analysis/DebugInfo.h>
#include <llvm/Support/Dwarf.h>
#ifdef ISPC_IS_LINUX
#include <alloca.h>
#elif defined(ISPC_IS_WINDOWS)
#include <malloc.h>
#define alloca _alloca
#endif // ISPC_IS_WINDOWS
static llvm::Pass *CreateIntrinsicsOptPass();
static llvm::Pass *CreateGatherScatterFlattenPass();
@@ -180,19 +182,22 @@ Optimize(llvm::Module *module, int optLevel) {
llvm::PassManager optPM;
llvm::FunctionPassManager funcPM(module);
#ifndef LLVM_2_8
llvm::TargetLibraryInfo *targetLibraryInfo =
new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple()));
optPM.add(targetLibraryInfo);
#endif
optPM.add(new llvm::TargetData(module));
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
optPM.add(llvm::createIndVarSimplifyPass());
#endif
if (optLevel == 0) {
// This is more or less the minimum set of optimizations that we
// need to do to generate code that will actually run. (We can't
// run absolutely no optimizations, since the front-end needs us to
// take the various __pseudo_* functions it has emitted and turn
// them into something that can actually execute.)
optPM.add(llvm::createPromoteMemoryToRegisterPass());
optPM.add(CreateGatherScatterFlattenPass());
optPM.add(CreateLowerGatherScatterPass());
optPM.add(CreateLowerMaskedStorePass());
@@ -213,7 +218,6 @@ Optimize(llvm::Module *module, int optLevel) {
// only later in the optimization process as things like constant
// propagation have done their thing, and then when they do kick
// in, they can often open up new opportunities for optimization...
#ifndef LLVM_2_8
llvm::PassRegistry *registry = llvm::PassRegistry::getPassRegistry();
llvm::initializeCore(*registry);
llvm::initializeScalarOpts(*registry);
@@ -224,7 +228,7 @@ Optimize(llvm::Module *module, int optLevel) {
llvm::initializeInstCombine(*registry);
llvm::initializeInstrumentation(*registry);
llvm::initializeTarget(*registry);
#endif
// Early optimizations to try to reduce the total amount of code to
// work with if we can
optPM.add(CreateGatherScatterFlattenPass());
@@ -281,13 +285,11 @@ Optimize(llvm::Module *module, int optLevel) {
optPM.add(llvm::createConstantPropagationPass());
optPM.add(CreateIntrinsicsOptPass());
-#if defined(LLVM_2_8)
-optPM.add(CreateIsCompileTimeConstantPass(true));
-#elif defined(LLVM_2_9)
+#if defined(LLVM_2_9)
llvm::createStandardModulePasses(&optPM, 3,
false /* opt size */,
true /* unit at a time */,
-false /* unroll loops */,
+g->opt.unrollLoops,
true /* simplify lib calls */,
false /* may have exceptions */,
llvm::createFunctionInliningPass());
@@ -302,7 +304,7 @@ Optimize(llvm::Module *module, int optLevel) {
llvm::createStandardModulePasses(&optPM, 3,
false /* opt size */,
true /* unit at a time */,
-false /* unroll loops */,
+g->opt.unrollLoops,
true /* simplify lib calls */,
false /* may have exceptions */,
llvm::createFunctionInliningPass());
@@ -311,6 +313,8 @@ Optimize(llvm::Module *module, int optLevel) {
llvm::PassManagerBuilder builder;
builder.OptLevel = 3;
builder.Inliner = llvm::createFunctionInliningPass();
if (g->opt.unrollLoops == false)
builder.DisableUnrollLoops = true;
builder.populateFunctionPassManager(funcPM);
builder.populateModulePassManager(optPM);
optPM.add(CreateIsCompileTimeConstantPass(true));
@@ -423,8 +427,11 @@ IntrinsicsOpt::IntrinsicsOpt()
blendInstructions.push_back(BlendInstruction(
llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse41_blendvps),
0xf, 0, 1, 2));
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
blendInstructions.push_back(BlendInstruction(
-m->module->getFunction("llvm.x86.avx.blendvps"), 0xff, 0, 1, 2));
+llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_blendv_ps_256),
+0xff, 0, 1, 2));
#endif
}
@@ -1433,16 +1440,12 @@ LowerMaskedStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
llvm::Value *rvalue = callInst->getArgOperand(1);
llvm::Value *mask = callInst->getArgOperand(2);
-// On SSE, we need to choose between doing the load + blend + store
-// trick, or serializing the masked store. On targets with a
-// native masked store instruction, the implementations of
-// __masked_store_blend_* should be the same as __masked_store_*,
-// so this doesn't matter. On SSE, blending is generally more
-// efficient and is always safe to do on stack-allocated values.(?)
-bool doBlend = (g->target.isa != Target::AVX &&
+// We need to choose between doing the load + blend + store trick,
+// or serializing the masked store. Even on targets with a native
+// masked store instruction, this is preferable since it lets us
+// keep values in registers rather than going out to the stack.
+bool doBlend = (!g->opt.disableBlendedMaskedStores ||
lIsStackVariablePointer(lvalue));
-if (g->target.isa == Target::SSE4 || g->target.isa == Target::SSE2)
-doBlend |= !g->opt.disableBlendedMaskedStores;
// Generate the call to the appropriate masked store function and
// replace the __pseudo_* one with it.
@@ -1520,8 +1523,8 @@ static void lPrintVector(const char *info, llvm::Value *elements[ISPC_MAX_NVEC])
/** Given an LLVM vector in vec, return a 'scalarized' version of the
-vector in the provided offsets[] array. For example, if the vector
-value passed in is:
+vector in the provided scalarizedVector[] array. For example, if the
+vector value passed in is:
add <4 x i32> %a_smear, <4 x i32> <4, 8, 12, 16>,
@@ -1542,28 +1545,39 @@ static void lPrintVector(const char *info, llvm::Value *elements[ISPC_MAX_NVEC])
@param vec Vector to be scalarized
@param scalarizedVector Array in which to store the individual vector
elements
@param vectorLength Number of elements in the given vector. (The
passed scalarizedVector array must be at least
this length.)
@returns True if the vector was successfully scalarized and
the values in scalarizedVector[] are valid; false otherwise
*/
static bool
-lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC]) {
+lScalarizeVector(llvm::Value *vec, llvm::Value **scalarizedVector,
+int vectorLength) {
+// First initialize the values of scalarizedVector[] to NULL.
-for (int i = 0; i < g->target.vectorWidth; ++i)
+for (int i = 0; i < vectorLength; ++i)
scalarizedVector[i] = NULL;
// It may be ok for the vector to be an undef vector; these come up for
// example in shufflevector instructions. As long as elements of the
// undef vector aren't referenced by the shuffle indices, this is fine.
if (llvm::isa<llvm::UndefValue>(vec))
return true;
// ConstantVectors are easy; just pull out the individual constant
// element values
llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(vec);
if (cv != NULL) {
for (int i = 0; i < g->target.vectorWidth; ++i)
for (int i = 0; i < vectorLength; ++i)
scalarizedVector[i] = cv->getOperand(i);
return true;
}
// It's also easy if it's just a vector of all zeros
llvm::ConstantAggregateZero *caz = llvm::dyn_cast<llvm::ConstantAggregateZero>(vec);
if (caz) {
for (int i = 0; i < g->target.vectorWidth; ++i)
llvm::ConstantAggregateZero *caz =
llvm::dyn_cast<llvm::ConstantAggregateZero>(vec);
if (caz != NULL) {
for (int i = 0; i < vectorLength; ++i)
scalarizedVector[i] = LLVMInt32(0);
return true;
}
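All of the cases that follow share the same recursive shape: decompose each operand into per-lane values, then rebuild the result lane by lane. A toy stand-in for the constant and binary-operator cases, with simplified types assumed in place of LLVM's:

    #include <vector>

    // Toy node standing in for an LLVM vector value: either a vector
    // of constants or an elementwise add of two other nodes.
    struct VecNode {
        enum Kind { ConstVec, Add } kind;
        std::vector<int> constants;    // used when kind == ConstVec
        const VecNode *op0, *op1;      // used when kind == Add
    };

    // Fill 'out' with one scalar per lane; return false if the vector
    // can't be decomposed.
    static bool scalarize(const VecNode *vec, std::vector<int> &out,
                          int width) {
        if (vec->kind == VecNode::ConstVec) {
            out = vec->constants;
            return (int)out.size() == width;
        }
        // Binary operator: scalarize both operands, then synthesize a
        // scalar operation per lane from the results.
        std::vector<int> a, b;
        if (!scalarize(vec->op0, a, width) || !scalarize(vec->op1, b, width))
            return false;
        out.resize(width);
        for (int i = 0; i < width; ++i)
            out[i] = a[i] + b[i];
        return true;
    }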
@@ -1575,13 +1589,16 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
// scalar values we return from here are synthesized with scalar
// versions of the original vector binary operator
llvm::Instruction::BinaryOps opcode = bo->getOpcode();
llvm::Value *v0[ISPC_MAX_NVEC], *v1[ISPC_MAX_NVEC];
llvm::Value **v0 =
(llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));
llvm::Value **v1 =
(llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));
if (!lScalarizeVector(bo->getOperand(0), v0) ||
!lScalarizeVector(bo->getOperand(1), v1))
if (!lScalarizeVector(bo->getOperand(0), v0, vectorLength) ||
!lScalarizeVector(bo->getOperand(1), v1, vectorLength))
return false;
for (int i = 0; i < g->target.vectorWidth; ++i) {
for (int i = 0; i < vectorLength; ++i) {
scalarizedVector[i] =
llvm::BinaryOperator::Create(opcode, v0[i], v1[i], "flat_bop", bo);
lCopyMetadata(scalarizedVector[i], bo);
@@ -1606,7 +1623,7 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
// value in scalarizedVector[] based on the value being inserted.
while (ie != NULL) {
uint64_t iOffset = lGetIntValue(ie->getOperand(2));
assert((int)iOffset < g->target.vectorWidth);
assert((int)iOffset < vectorLength);
assert(scalarizedVector[iOffset] == NULL);
scalarizedVector[iOffset] = ie->getOperand(1);
@@ -1620,15 +1637,17 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
}
llvm::CastInst *ci = llvm::dyn_cast<llvm::CastInst>(vec);
if (ci) {
if (ci != NULL) {
// Casts are similar to BinaryOperators in that we attempt to
// scalarize the vector being cast and if successful, we apply
// equivalent scalar cast operators to each of the values in the
// scalarized vector.
llvm::Instruction::CastOps op = ci->getOpcode();
llvm::Value *scalarizedTarget[ISPC_MAX_NVEC];
if (!lScalarizeVector(ci->getOperand(0), scalarizedTarget))
llvm::Value **scalarizedTarget =
(llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));
if (!lScalarizeVector(ci->getOperand(0), scalarizedTarget,
vectorLength))
return false;
LLVM_TYPE_CONST llvm::Type *destType = ci->getDestTy();
@@ -1637,7 +1656,7 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
assert(vectorDestType != NULL);
LLVM_TYPE_CONST llvm::Type *elementType = vectorDestType->getElementType();
for (int i = 0; i < g->target.vectorWidth; ++i) {
for (int i = 0; i < vectorLength; ++i) {
scalarizedVector[i] =
llvm::CastInst::Create(op, scalarizedTarget[i], elementType,
"cast", ci);
@@ -1647,16 +1666,11 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
}
llvm::ShuffleVectorInst *svi = llvm::dyn_cast<llvm::ShuffleVectorInst>(vec);
if (svi) {
// Note that the code for shufflevector instructions is untested.
// (We haven't yet had a case where it needs to run). Therefore,
// an assert at the bottom of this routine will hit the first time
// it runs as a reminder that this needs to be tested further.
if (svi != NULL) {
LLVM_TYPE_CONST llvm::VectorType *svInstType =
llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(svi->getType());
assert(svInstType != NULL);
assert((int)svInstType->getNumElements() == g->target.vectorWidth);
assert((int)svInstType->getNumElements() == vectorLength);
// Scalarize the two vectors being shuffled. First figure out how
// big they are.
@@ -1671,58 +1685,90 @@ lScalarizeVector(llvm::Value *vec, llvm::Value *scalarizedVector[ISPC_MAX_NVEC])
int n0 = vectorType0->getNumElements();
int n1 = vectorType1->getNumElements();
// FIXME: It's actually totally legitimate for these two to have
// different sizes; the final result just needs to have the native
// vector width. To handle this, not only do we need to
// potentially dynamically allocate space for the arrays passed
// into lScalarizeVector, but we need to change the rest of its
// implementation to not key off g->target.vectorWidth everywhere
// to get the sizes of the arrays to iterate over, etc.
assert(n0 == g->target.vectorWidth && n1 == g->target.vectorWidth);
// Go ahead and scalarize the two input vectors now.
// FIXME: it's ok if some or all of the values of these two vectors
// have undef values, so long as we don't try to access undef
// values with the vector indices provided to the instruction.
// Should fix lScalarizeVector so that it doesn't return false in
// this case and just leaves the elements of the arrays with undef
// values as NULL.
llvm::Value *v0[ISPC_MAX_NVEC], *v1[ISPC_MAX_NVEC];
if (!lScalarizeVector(svi->getOperand(0), v0) ||
!lScalarizeVector(svi->getOperand(1), v1))
llvm::Value **v0 = (llvm::Value **)alloca(n0 * sizeof(llvm::Value *));
llvm::Value **v1 = (llvm::Value **)alloca(n1 * sizeof(llvm::Value *));
if (!lScalarizeVector(svi->getOperand(0), v0, n0) ||
!lScalarizeVector(svi->getOperand(1), v1, n1))
return false;
llvm::ConstantVector *shuffleIndicesVector =
llvm::dyn_cast<llvm::ConstantVector>(svi->getOperand(2));
// I think this has to be a ConstantVector. If this ever hits,
// we'll dig into what we got instead and figure out how to handle
// that...
assert(shuffleIndicesVector != NULL);
// Get the integer indices for each element of the returned vector
llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> shuffleIndices;
shuffleIndicesVector->getVectorElements(shuffleIndices);
assert((int)shuffleIndices.size() == g->target.vectorWidth);
// And loop over the indices, setting the i'th element of the
// result vector with the source vector element that corresponds to
// the i'th shuffle index value.
for (unsigned int i = 0; i < shuffleIndices.size(); ++i) {
if (!llvm::isa<llvm::ConstantInt>(shuffleIndices[i]))
// I'm not sure when this case would ever happen, though..
return false;
int offset = (int)lGetIntValue(shuffleIndices[i]);
assert(offset >= 0 && offset < n0+n1);
if (offset < n0)
// Offsets from 0 to n0-1 index into the first vector
scalarizedVector[i] = v0[offset];
else
// And offsets from n0 to (n0+n1-1) index into the second
// vector
scalarizedVector[i] = v1[offset - n0];
llvm::ConstantAggregateZero *caz =
llvm::dyn_cast<llvm::ConstantAggregateZero>(svi->getOperand(2));
if (caz != NULL) {
for (int i = 0; i < vectorLength; ++i)
scalarizedVector[i] = v0[0];
}
else {
llvm::ConstantVector *shuffleIndicesVector =
llvm::dyn_cast<llvm::ConstantVector>(svi->getOperand(2));
// I think this has to be a ConstantVector. If this ever hits,
// we'll dig into what we got instead and figure out how to handle
// that...
assert(shuffleIndicesVector != NULL);
// Get the integer indices for each element of the returned vector
llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> shuffleIndices;
shuffleIndicesVector->getVectorElements(shuffleIndices);
assert((int)shuffleIndices.size() == vectorLength);
// And loop over the indices, setting the i'th element of the
// result vector with the source vector element that corresponds to
// the i'th shuffle index value.
for (unsigned int i = 0; i < shuffleIndices.size(); ++i) {
// I'm not sure when this case would ever happen, though..
assert(llvm::isa<llvm::ConstantInt>(shuffleIndices[i]));
int offset = (int)lGetIntValue(shuffleIndices[i]);
assert(offset >= 0 && offset < n0+n1);
if (offset < n0)
// Offsets from 0 to n0-1 index into the first vector
scalarizedVector[i] = v0[offset];
else
// And offsets from n0 to (n0+n1-1) index into the second
// vector
scalarizedVector[i] = v1[offset - n0];
}
}
return true;
}
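The index rule implemented above, restated as a small C++ sketch: shuffle indices 0..n0-1 select from the first source vector, and n0..n0+n1-1 from the second.

    #include <vector>

    static std::vector<int> shuffleEmulate(const std::vector<int> &v0,
                                           const std::vector<int> &v1,
                                           const std::vector<int> &indices) {
        std::vector<int> result(indices.size());
        for (size_t i = 0; i < indices.size(); ++i) {
            int offset = indices[i];
            // Offsets below n0 index the first vector; the rest index
            // the second, after subtracting n0.
            result[i] = (offset < (int)v0.size())
                            ? v0[offset]
                            : v1[offset - (int)v0.size()];
        }
        return result;
    }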
llvm::LoadInst *li = llvm::dyn_cast<llvm::LoadInst>(vec);
if (li != NULL) {
llvm::Value *baseAddr = li->getOperand(0);
llvm::Value *baseInt = new llvm::PtrToIntInst(baseAddr, LLVMTypes::Int64Type,
"base2int", li);
lCopyMetadata(baseInt, li);
LLVM_TYPE_CONST llvm::PointerType *ptrType =
llvm::dyn_cast<llvm::PointerType>(baseAddr->getType());
assert(ptrType != NULL);
LLVM_TYPE_CONST llvm::VectorType *vecType =
llvm::dyn_cast<llvm::VectorType>(ptrType->getElementType());
assert(vecType != NULL);
LLVM_TYPE_CONST llvm::Type *elementType = vecType->getElementType();
uint64_t elementSize;
bool sizeKnown = lSizeOfIfKnown(elementType, &elementSize);
assert(sizeKnown == true);
LLVM_TYPE_CONST llvm::Type *eltPtrType = llvm::PointerType::get(elementType, 0);
for (int i = 0; i < vectorLength; ++i) {
llvm::Value *intPtrOffset =
llvm::BinaryOperator::Create(llvm::Instruction::Add, baseInt,
LLVMInt64(i * elementSize), "baseoffset",
li);
lCopyMetadata(intPtrOffset, li);
llvm::Value *scalarLoadPtr =
new llvm::IntToPtrInst(intPtrOffset, eltPtrType, "int2ptr", li);
lCopyMetadata(scalarLoadPtr, li);
llvm::Instruction *scalarLoad =
new llvm::LoadInst(scalarLoadPtr, "loadelt", li);
lCopyMetadata(scalarLoad, li);
scalarizedVector[i] = scalarLoad;
}
FATAL("the above code is untested so far; check now that it's actually running");
return true;
}
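The load case above (flagged as untested by the FATAL) expands one vector load into per-element scalar loads via integer pointer arithmetic; the same arithmetic in plain C++, with float assumed as the element type for illustration:

    #include <cstdint>

    // One scalar load per lane from base + i * elementSize, mirroring
    // the ptrtoint / add / inttoptr sequence emitted above.
    static void loadElements(const float *vecBase, float *out, int width) {
        uintptr_t base = reinterpret_cast<uintptr_t>(vecBase); // "base2int"
        for (int i = 0; i < width; ++i) {
            uintptr_t addr = base + i * sizeof(float);         // "baseoffset"
            out[i] = *reinterpret_cast<const float *>(addr);   // "int2ptr" + "loadelt"
        }
    }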
@@ -2134,11 +2180,18 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
if (ce && ce->getOpcode() == llvm::Instruction::BitCast)
base = ce->getOperand(0);
// Try to out the offsets; the i'th element of the offsetElements
// array should be an i32 with the value of the offset for the i'th
// vector lane. This may fail; if so, just give up.
// Try to find out the offsets; the i'th element of the
// offsetElements array should be an i32 with the value of the
// offset for the i'th vector lane. This may fail; if so, just
// give up.
llvm::Value *vecValue = callInst->getArgOperand(1);
LLVM_TYPE_CONST llvm::VectorType *vt =
llvm::dyn_cast<llvm::VectorType>(vecValue->getType());
assert(vt != NULL);
int vecLength = vt->getNumElements();
assert(vecLength == g->target.vectorWidth);
llvm::Value *offsetElements[ISPC_MAX_NVEC];
if (!lScalarizeVector(callInst->getArgOperand(1), offsetElements))
if (!lScalarizeVector(vecValue, offsetElements, vecLength))
continue;
llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 2 : 3);
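For reference, here is what the scalarized offsets feed: conceptually, a gather is one scalar load per active lane from base + offset[i]. A scalar C++ sketch (not the runtime's actual __gather_* implementation):

    #include <cstdint>

    static void gatherBaseOffsets(const uint8_t *base, const int32_t *offsets,
                                  int32_t *result, const bool *mask,
                                  int width) {
        for (int i = 0; i < width; ++i)
            if (mask[i])   // only active lanes load
                result[i] =
                    *reinterpret_cast<const int32_t *>(base + offsets[i]);
    }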
@@ -2515,7 +2568,7 @@ llvm::RegisterPass<MakeInternalFuncsStaticPass>
bool
MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
const char *names[] = {
"__do_print",
"__do_print", "__fast_masked_vload",
"__gather_base_offsets_i8", "__gather_base_offsets_i16",
"__gather_base_offsets_i32", "__gather_base_offsets_i64",
"__gather_elt_8", "__gather_elt_16",


@@ -177,6 +177,7 @@ static const char *lParamListTokens[] = {
%type <stmt> statement labeled_statement compound_statement for_init_statement
%type <stmt> expression_statement selection_statement iteration_statement
%type <stmt> jump_statement statement_list declaration_statement print_statement
%type <stmt> sync_statement
%type <declaration> declaration parameter_declaration
%type <declarators> init_declarator_list
@@ -436,8 +437,6 @@ assignment_expression
expression
: assignment_expression
| TOKEN_SYNC
{ $$ = new SyncExpr(@1); }
| expression ',' assignment_expression
{ $$ = new BinaryExpr(BinaryExpr::Comma, $1, $3, @2); }
;
@@ -928,9 +927,13 @@ parameter_list
builtinTokens.push_back(*token);
++token;
}
std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
std::string alts = lGetAlternates(alternates);
Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
if (strlen(yytext) == 0)
Error(@1, "Syntax error--premature end of file.");
else {
std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
std::string alts = lGetAlternates(alternates);
Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
}
$$ = NULL;
}
;
@@ -1019,6 +1022,7 @@ statement
| jump_statement
| declaration_statement
| print_statement
| sync_statement
| error
{
std::vector<std::string> builtinTokens;
@@ -1027,9 +1031,13 @@ statement
builtinTokens.push_back(*token);
++token;
}
std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
std::string alts = lGetAlternates(alternates);
Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
if (strlen(yytext) == 0)
Error(@1, "Syntax error--premature end of file.");
else {
std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
std::string alts = lGetAlternates(alternates);
Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
}
$$ = NULL;
}
;
@@ -1155,6 +1163,11 @@ jump_statement
{ $$ = new ReturnStmt($2, true, @1); }
;
sync_statement
: TOKEN_SYNC
{ $$ = new ExprStmt(new SyncExpr(@1), @1); }
;
print_statement
: TOKEN_PRINT '(' string_constant ')'
{
@@ -1177,9 +1190,13 @@ translation_unit
builtinTokens.push_back(*token);
++token;
}
std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
std::string alts = lGetAlternates(alternates);
Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
if (strlen(yytext) == 0)
Error(@1, "Syntax error--premature end of file.");
else {
std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
std::string alts = lGetAlternates(alternates);
Error(@1, "Syntax error--token \"%s\" unknown.%s", yytext, alts.c_str());
}
}
;


@@ -17,6 +17,7 @@ import random
import string
import mutex
import subprocess
import platform
parser = OptionParser()
parser.add_option("-r", "--random-shuffle", dest="random", help="Randomly order tests",
@@ -30,6 +31,8 @@ parser.add_option('-t', '--target', dest='target',
parser.add_option('-a', '--arch', dest='arch',
help='Set architecture (x86, x86-64)',
default="x86-64")
parser.add_option('-o', '--no-opt', dest='no_opt', help='Disable optimization',
default=False, action="store_true")
(options, args) = parser.parse_args()
@@ -129,12 +132,16 @@ def run_tasks_from_queue(queue):
exe_name = "%s.run" % filename
ispc_cmd = "ispc --woff %s -o %s --arch=%s --target=%s" % \
(filename, obj_name, options.arch, options.target)
if options.no_opt:
ispc_cmd += " -O0"
if options.arch == 'x86':
gcc_arch = '-m32'
else:
gcc_arch = '-m64'
gcc_cmd = "g++ -Wl,-no_pie %s test_static.cpp -DTEST_SIG=%d %s.o -o %s" % \
gcc_cmd = "g++ %s test_static.cpp -DTEST_SIG=%d %s.o -o %s" % \
(gcc_arch, match, filename, exe_name)
if platform.system() == 'Darwin':
gcc_cmd += ' -Wl,-no_pie'
if should_fail:
gcc_cmd += " -DEXPECT_FAILURE"
@@ -152,6 +159,8 @@ def run_tasks_from_queue(queue):
bitcode_file = "%s.bc" % filename
compile_cmd = "ispc --woff --emit-llvm %s --target=%s -o %s" % \
(filename, options.target, bitcode_file)
if options.no_opt:
compile_cmd += " -O0"
test_cmd = "ispc_test %s" % bitcode_file
error_count += run_cmds([compile_cmd, test_cmd], filename, should_fail)


@@ -2862,6 +2862,12 @@ static inline void seed_rng(reference uniform RNGState state, uniform unsigned i
seed = __seed4(state, 0, seed);
if (programCount == 8)
__seed4(state, 4, seed ^ 0xbeeff00d);
if (programCount == 16) {
__seed4(state, 4, seed ^ 0xbeeff00d);
__seed4(state, 8, ((seed & 0xffff) << 16) | (seed >> 16));
__seed4(state, 12, (((seed & 0xff) << 24) | ((seed & 0xff00) << 8) |
((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24));
}
}
static inline void fastmath() {
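The two swizzles applied to the seed above are a 16-bit rotate (lanes 8..11) and a full 32-bit byte reverse (lanes 12..15); as standalone C++ helpers sketching the arithmetic:

    #include <cstdint>

    static inline uint32_t rotate16(uint32_t seed) {
        // Swap the high and low halfwords.
        return ((seed & 0xffff) << 16) | (seed >> 16);
    }

    static inline uint32_t byteReverse(uint32_t seed) {
        // Reverse the order of the four bytes.
        return ((seed & 0x000000ff) << 24) | ((seed & 0x0000ff00) << 8) |
               ((seed & 0x00ff0000) >> 8)  | ((seed & 0xff000000) >> 24);
    }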

stmt.cpp

@@ -107,6 +107,12 @@ ExprStmt::Print(int indent) const {
}
int
ExprStmt::EstimateCost() const {
return expr ? expr->EstimateCost() : 0;
}
///////////////////////////////////////////////////////////////////////////
// DeclStmt
@@ -399,12 +405,25 @@ DeclStmt::Print(int indent) const {
}
int
DeclStmt::EstimateCost() const {
int cost = 0;
for (unsigned int i = 0; i < declaration->declarators.size(); ++i)
if (declaration->declarators[i]->initExpr)
cost += declaration->declarators[i]->initExpr->EstimateCost();
return cost;
}
///////////////////////////////////////////////////////////////////////////
// IfStmt
IfStmt::IfStmt(Expr *t, Stmt *ts, Stmt *fs, bool doUnif, SourcePos p)
IfStmt::IfStmt(Expr *t, Stmt *ts, Stmt *fs, bool checkCoherence, SourcePos p)
: Stmt(p), test(t), trueStmts(ts), falseStmts(fs),
doCoherentCheck(doUnif && !g->opt.disableCoherentControlFlow) {
doAllCheck(checkCoherence &&
!g->opt.disableCoherentControlFlow),
doAnyCheck(test->GetType() != NULL &&
test->GetType()->IsVaryingType()) {
}
@@ -436,62 +455,46 @@ IfStmt::EmitCode(FunctionEmitContext *ctx) const {
ctx->SetDebugPos(pos);
bool isUniform = testType->IsUniformType();
llvm::Value *testValue = test->GetValue(ctx);
if (testValue == NULL)
return;
if (isUniform) {
ctx->StartUniformIf(ctx->GetMask());
if (doCoherentCheck)
Warning(test->pos, "Uniform condition supplied to cif statement.");
if (doAllCheck)
Warning(test->pos, "Uniform condition supplied to \"cif\" statement.");
// 'If' statements with uniform conditions are relatively
// straightforward. We evaluate the condition and then jump to
// either the 'then' or 'else' clause depending on its value.
llvm::Value *vtest = test->GetValue(ctx);
if (vtest != NULL) {
llvm::BasicBlock *bthen = ctx->CreateBasicBlock("if_then");
llvm::BasicBlock *belse = ctx->CreateBasicBlock("if_else");
llvm::BasicBlock *bexit = ctx->CreateBasicBlock("if_exit");
llvm::BasicBlock *bthen = ctx->CreateBasicBlock("if_then");
llvm::BasicBlock *belse = ctx->CreateBasicBlock("if_else");
llvm::BasicBlock *bexit = ctx->CreateBasicBlock("if_exit");
// Jump to the appropriate basic block based on the value of
// the 'if' test
ctx->BranchInst(bthen, belse, vtest);
// Jump to the appropriate basic block based on the value of
// the 'if' test
ctx->BranchInst(bthen, belse, testValue);
// Emit code for the 'true' case
ctx->SetCurrentBasicBlock(bthen);
lEmitIfStatements(ctx, trueStmts, "true");
if (ctx->GetCurrentBasicBlock())
ctx->BranchInst(bexit);
// Emit code for the 'true' case
ctx->SetCurrentBasicBlock(bthen);
lEmitIfStatements(ctx, trueStmts, "true");
if (ctx->GetCurrentBasicBlock())
ctx->BranchInst(bexit);
// Emit code for the 'false' case
ctx->SetCurrentBasicBlock(belse);
lEmitIfStatements(ctx, falseStmts, "false");
if (ctx->GetCurrentBasicBlock())
ctx->BranchInst(bexit);
// Emit code for the 'false' case
ctx->SetCurrentBasicBlock(belse);
lEmitIfStatements(ctx, falseStmts, "false");
if (ctx->GetCurrentBasicBlock())
ctx->BranchInst(bexit);
// Set the active basic block to the newly-created exit block
// so that subsequent emitted code starts there.
ctx->SetCurrentBasicBlock(bexit);
}
// Set the active basic block to the newly-created exit block
// so that subsequent emitted code starts there.
ctx->SetCurrentBasicBlock(bexit);
ctx->EndIf();
}
else {
// Code for 'If' statements with 'varying' conditions can be
// generated in two ways; one takes some care to see if all of the
// active program instances want to follow only the 'true' or
// 'false' cases, and the other always runs both cases but sets the
// mask appropriately. The first case is handled by the
// IfStmt::emitCoherentTests() call, and the second is handled by
// IfStmt::emitMaskedTrueAndFalse().
llvm::Value *testValue = test->GetValue(ctx);
if (testValue) {
if (doCoherentCheck)
emitCoherentTests(ctx, testValue);
else {
llvm::Value *oldMask = ctx->GetMask();
ctx->StartVaryingIf(oldMask);
emitMaskedTrueAndFalse(ctx, oldMask, testValue);
ctx->EndIf();
}
}
}
else
emitVaryingIf(ctx, testValue);
}
@@ -535,9 +538,17 @@ Stmt *IfStmt::TypeCheck() {
}
int
IfStmt::EstimateCost() const {
return ((test ? test->EstimateCost() : 0) +
(trueStmts ? trueStmts->EstimateCost() : 0) +
(falseStmts ? falseStmts->EstimateCost() : 0));
}
void
IfStmt::Print(int indent) const {
printf("%*cIf Stmt %s", indent, ' ', doCoherentCheck ? "DO COHERENT CHECK" : "");
printf("%*cIf Stmt %s", indent, ' ', doAllCheck ? "DO ALL CHECK" : "");
pos.Print();
printf("\n%*cTest: ", indent+4, ' ');
test->Print();
@@ -554,7 +565,7 @@ IfStmt::Print(int indent) const {
/** Emit code to run both the true and false statements for the if test,
with the mask set appropriately before runnign each one.
with the mask set appropriately before running each one.
*/
void
IfStmt::emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask,
@@ -574,11 +585,185 @@ IfStmt::emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask,
}
/** Similar to the Stmt variant of this function, this conservatively
checks to see if it's safe to run the code for the given Expr even if
the mask is 'all off'.
*/
static bool
lSafeToRunWithAllLanesOff(Expr *expr) {
if (expr == NULL)
return false;
UnaryExpr *ue;
if ((ue = dynamic_cast<UnaryExpr *>(expr)) != NULL)
return lSafeToRunWithAllLanesOff(ue->expr);
BinaryExpr *be;
if ((be = dynamic_cast<BinaryExpr *>(expr)) != NULL)
return (lSafeToRunWithAllLanesOff(be->arg0) &&
lSafeToRunWithAllLanesOff(be->arg1));
AssignExpr *ae;
if ((ae = dynamic_cast<AssignExpr *>(expr)) != NULL)
return (lSafeToRunWithAllLanesOff(ae->lvalue) &&
lSafeToRunWithAllLanesOff(ae->rvalue));
SelectExpr *se;
if ((se = dynamic_cast<SelectExpr *>(expr)) != NULL)
return (lSafeToRunWithAllLanesOff(se->test) &&
lSafeToRunWithAllLanesOff(se->expr1) &&
lSafeToRunWithAllLanesOff(se->expr2));
ExprList *el;
if ((el = dynamic_cast<ExprList *>(expr)) != NULL) {
for (unsigned int i = 0; i < el->exprs.size(); ++i)
if (!lSafeToRunWithAllLanesOff(el->exprs[i]))
return false;
return true;
}
FunctionCallExpr *fce;
if ((fce = dynamic_cast<FunctionCallExpr *>(expr)) != NULL)
// FIXME: If we could somehow determine that the function being
// called was safe (and all of the args Exprs were safe, then it'd
// be nice to be able to return true here. (Consider a call to
// e.g. floatbits() in the stdlib.) Unfortunately for now we just
// have to be conservative.
return false;
IndexExpr *ie;
if ((ie = dynamic_cast<IndexExpr *>(expr)) != NULL) {
// If we can determine at compile time the size of the array/vector
// and if the indices are compile-time constants, then we may be
// able to safely run this under a predicated if statement.
if (ie->arrayOrVector == NULL)
return false;
const Type *type = ie->arrayOrVector->GetType();
ConstExpr *ce = dynamic_cast<ConstExpr *>(ie->index);
if (type == NULL || ce == NULL)
return false;
if (dynamic_cast<const ReferenceType *>(type) != NULL)
type = type->GetReferenceTarget();
const SequentialType *seqType =
dynamic_cast<const SequentialType *>(type);
assert(seqType != NULL);
int nElements = seqType->GetElementCount();
if (nElements == 0)
// Unsized array, so we can't be sure
return false;
int32_t indices[ISPC_MAX_NVEC];
int count = ce->AsInt32(indices);
for (int i = 0; i < count; ++i)
if (indices[i] < 0 || indices[i] >= nElements)
return false;
// All indices are in-bounds
return true;
}
MemberExpr *me;
if ((me = dynamic_cast<MemberExpr *>(expr)) != NULL)
return lSafeToRunWithAllLanesOff(me->expr);
if (dynamic_cast<ConstExpr *>(expr) != NULL)
return true;
TypeCastExpr *tce;
if ((tce = dynamic_cast<TypeCastExpr *>(expr)) != NULL)
return lSafeToRunWithAllLanesOff(tce->expr);
ReferenceExpr *re;
if ((re = dynamic_cast<ReferenceExpr *>(expr)) != NULL)
return lSafeToRunWithAllLanesOff(re->expr);
DereferenceExpr *dre;
if ((dre = dynamic_cast<DereferenceExpr *>(expr)) != NULL)
return lSafeToRunWithAllLanesOff(dre->expr);
if (dynamic_cast<SymbolExpr *>(expr) != NULL ||
dynamic_cast<FunctionSymbolExpr *>(expr) != NULL ||
dynamic_cast<SyncExpr *>(expr) != NULL)
return true;
FATAL("Unknown Expr type in lSafeToRunWithAllLanesOff()");
return false;
}
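The IndexExpr case above is the interesting one: a varying access is safe under an all-off mask only when every compile-time index is provably within the array. The core of that test, as a toy C++ version:

    #include <cstdint>

    // True only if every lane's constant index falls inside the array;
    // an unsized array (nElements == 0) can never be proven safe.
    static bool allIndicesInBounds(const int32_t *indices, int count,
                                   int nElements) {
        if (nElements == 0)
            return false;
        for (int i = 0; i < count; ++i)
            if (indices[i] < 0 || indices[i] >= nElements)
                return false;
        return true;
    }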
/** Given an arbitrary statement, this function conservatively tests to see
if it's safe to run the code for the statement even if the mask is all
off. Here we just need to determine which kind of statement we have
and recursively traverse it and/or the expressions inside of it.
*/
static bool
lSafeToRunWithAllLanesOff(Stmt *stmt) {
if (stmt == NULL)
return true;
ExprStmt *es;
if ((es = dynamic_cast<ExprStmt *>(stmt)) != NULL)
return lSafeToRunWithAllLanesOff(es->expr);
DeclStmt *ds;
if ((ds = dynamic_cast<DeclStmt *>(stmt)) != NULL) {
for (unsigned int i = 0; i < ds->declaration->declarators.size(); ++i)
if (!lSafeToRunWithAllLanesOff(ds->declaration->declarators[i]->initExpr))
return false;
return true;
}
IfStmt *is;
if ((is = dynamic_cast<IfStmt *>(stmt)) != NULL)
return (lSafeToRunWithAllLanesOff(is->test) &&
lSafeToRunWithAllLanesOff(is->trueStmts) &&
lSafeToRunWithAllLanesOff(is->falseStmts));
DoStmt *dos;
if ((dos = dynamic_cast<DoStmt *>(stmt)) != NULL)
return (lSafeToRunWithAllLanesOff(dos->testExpr) &&
lSafeToRunWithAllLanesOff(dos->bodyStmts));
ForStmt *fs;
if ((fs = dynamic_cast<ForStmt *>(stmt)) != NULL)
return (lSafeToRunWithAllLanesOff(fs->init) &&
lSafeToRunWithAllLanesOff(fs->test) &&
lSafeToRunWithAllLanesOff(fs->step) &&
lSafeToRunWithAllLanesOff(fs->stmts));
if (dynamic_cast<BreakStmt *>(stmt) != NULL ||
dynamic_cast<ContinueStmt *>(stmt) != NULL)
return true;
ReturnStmt *rs;
if ((rs = dynamic_cast<ReturnStmt *>(stmt)) != NULL)
return lSafeToRunWithAllLanesOff(rs->val);
StmtList *sl;
if ((sl = dynamic_cast<StmtList *>(stmt)) != NULL) {
const std::vector<Stmt *> &sls = sl->GetStatements();
for (unsigned int i = 0; i < sls.size(); ++i)
if (!lSafeToRunWithAllLanesOff(sls[i]))
return false;
return true;
}
PrintStmt *ps;
if ((ps = dynamic_cast<PrintStmt *>(stmt)) != NULL)
return lSafeToRunWithAllLanesOff(ps->values);
FATAL("Unexpected stmt type in lSafeToRunWithAllLanesOff()");
return false;
}
/** Emit code for an if test that checks the mask and the test values and
tries to be smart about jumping over code that doesn't need to be run.
*/
void
IfStmt::emitCoherentTests(FunctionEmitContext *ctx, llvm::Value *ltest) const {
IfStmt::emitVaryingIf(FunctionEmitContext *ctx, llvm::Value *ltest) const {
llvm::Value *oldMask = ctx->GetMask();
if (oldMask == LLVMMaskAllOn) {
// We can tell that the mask is on statically at compile time; just
@@ -587,7 +772,7 @@ IfStmt::emitCoherentTests(FunctionEmitContext *ctx, llvm::Value *ltest) const {
emitMaskAllOn(ctx, ltest, bDone);
ctx->SetCurrentBasicBlock(bDone);
}
else {
else if (doAllCheck) {
// We can't tell if the mask going into the if is all on at the
// compile time. Emit code to check for this and then either run
// the code for the 'all on' or the 'mixed' case depending on the
@@ -619,6 +804,43 @@ IfStmt::emitCoherentTests(FunctionEmitContext *ctx, llvm::Value *ltest) const {
// paths above jump to when they're done.
ctx->SetCurrentBasicBlock(bDone);
}
else if (trueStmts != NULL || falseStmts != NULL) {
// If there is nothing that is potentially unsafe to run with all
// lanes off in the true and false statements and if the total
// cost of those two is relatively low, then we'll go
// ahead and emit straightline code that runs both sides, updating
// the mask accordingly. This is useful for efficiently compiling
// things like:
//
// if (foo) x = 0;
// else ++x;
//
// Where the overhead of checking if any of the program instances wants
// to run one side or the other is more than the actual computation.
// The lSafeToRunWithAllLanesOff() check makes sure that we don't do this
// for potentially dangerous code like:
//
// if (index < count) array[index] = 0;
//
// where our use of blend for conditional assignments doesn't check
// for the 'all lanes' off case.
if (lSafeToRunWithAllLanesOff(trueStmts) &&
lSafeToRunWithAllLanesOff(falseStmts) &&
(((trueStmts ? trueStmts->EstimateCost() : 0) +
(falseStmts ? falseStmts->EstimateCost() : 0)) <
PREDICATE_SAFE_IF_STATEMENT_COST)) {
ctx->StartVaryingIf(oldMask);
emitMaskedTrueAndFalse(ctx, oldMask, ltest);
assert(ctx->GetCurrentBasicBlock());
ctx->EndIf();
}
else {
assert(doAnyCheck);
llvm::BasicBlock *bDone = ctx->CreateBasicBlock("if_done");
emitMaskMixed(ctx, oldMask, ltest, bDone);
ctx->SetCurrentBasicBlock(bDone);
}
}
}
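For the example in the comment above, the straightline form that this path emits behaves like the following scalar C++ (a sketch of the semantics, not the emitted IR): every lane computes both sides and a blend keeps the right result, so nothing may fault even when the whole mask is off.

    static void predicatedIf(const bool *test, int *x, int width) {
        for (int i = 0; i < width; ++i) {
            int trueVal = 0;           // "if (foo) x = 0;"
            int falseVal = x[i] + 1;   // "else ++x;"
            // Blend: no branches; the mask decides which value survives.
            x[i] = test[i] ? trueVal : falseVal;
        }
    }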
@@ -677,69 +899,50 @@ IfStmt::emitMaskAllOn(FunctionEmitContext *ctx, llvm::Value *ltest,
}
/** Emits code that checks to see if for all of the lanes where the mask is
on, the test has the value true.
*/
static llvm::Value *
lTestMatchesMask(FunctionEmitContext *ctx, llvm::Value *test, llvm::Value *mask) {
llvm::Value *testAndMask = ctx->BinaryOperator(llvm::Instruction::And, test,
mask, "test&mask");
return ctx->MasksAllEqual(testAndMask, mask);
}
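A worked example of the (test & mask) == mask check on scalar bitmasks, with hypothetical lane values for illustration:

    #include <cassert>
    #include <cstdint>

    static bool testMatchesMask(uint32_t test, uint32_t mask) {
        return (test & mask) == mask;
    }

    int main() {
        // mask 0x6: lanes 1 and 2 are active. test 0xE: lanes 1..3 true.
        // Every active lane's test is true, so only the 'true' path runs.
        assert(testMatchesMask(0xE, 0x6));
        // test 0x8: lane 1 is active but its test is false, so both
        // paths are needed.
        assert(!testMatchesMask(0x8, 0x6));
        return 0;
    }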
/** Emit code for an 'if' test where the lane mask is known to be mixed
on/off going into it.
*/
void
IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask,
llvm::Value *ltest, llvm::BasicBlock *bDone) const {
// First, see if, for all of the lanes where the mask is on, the
// value of the test is also on (i.e. (test&mask) == mask). In this case,
// we only need to run the 'true' case code, since the lanes where the
// test was false aren't supposed to be running here anyway.
llvm::Value *testAllEqual = lTestMatchesMask(ctx, ltest, oldMask);
llvm::BasicBlock *bTestAll = ctx->CreateBasicBlock("cif_mixed_test_all");
llvm::BasicBlock *bTestAnyCheck = ctx->CreateBasicBlock("cif_mixed_test_any_check");
ctx->BranchInst(bTestAll, bTestAnyCheck, testAllEqual);
ctx->StartVaryingIf(oldMask);
llvm::BasicBlock *bNext = ctx->CreateBasicBlock("safe_if_after_true");
if (trueStmts != NULL) {
llvm::BasicBlock *bRunTrue = ctx->CreateBasicBlock("safe_if_run_true");
ctx->MaskAnd(oldMask, ltest);
// Emit code for the (test&mask)==mask case. Not only do we only need
// to emit code for the true statements, but we don't need to modify
// the mask's value; it's already correct.
ctx->SetCurrentBasicBlock(bTestAll);
ctx->StartVaryingIf(ctx->GetMask());
lEmitIfStatements(ctx, trueStmts, "cif: all running lanes want just true stmts");
assert(ctx->GetCurrentBasicBlock());
ctx->EndIf();
// Do any of the program instances want to run the 'true'
// block? If not, jump ahead to bNext.
llvm::Value *maskAnyQ = ctx->Any(ctx->GetMask());
ctx->BranchInst(bRunTrue, bNext, maskAnyQ);
// Emit statements for true
ctx->SetCurrentBasicBlock(bRunTrue);
lEmitIfStatements(ctx, trueStmts, "if: expr mixed, true statements");
assert(ctx->GetCurrentBasicBlock());
ctx->BranchInst(bNext);
ctx->SetCurrentBasicBlock(bNext);
}
if (falseStmts != NULL) {
llvm::BasicBlock *bRunFalse = ctx->CreateBasicBlock("safe_if_run_false");
bNext = ctx->CreateBasicBlock("safe_if_after_false");
ctx->MaskAndNot(oldMask, ltest);
// Similarly, check to see if any of the instances want to
// run the 'false' block...
llvm::Value *maskAnyQ = ctx->Any(ctx->GetMask());
ctx->BranchInst(bRunFalse, bNext, maskAnyQ);
// Emit code for false
ctx->SetCurrentBasicBlock(bRunFalse);
lEmitIfStatements(ctx, falseStmts, "if: expr mixed, false statements");
assert(ctx->GetCurrentBasicBlock());
ctx->BranchInst(bNext);
ctx->SetCurrentBasicBlock(bNext);
}
ctx->BranchInst(bDone);
// Next, see if the active lanes only need to run the false case--i.e. if
// (~test & mask) == mask.
ctx->SetCurrentBasicBlock(bTestAnyCheck);
llvm::Value *notTest = ctx->BinaryOperator(llvm::Instruction::Xor, LLVMMaskAllOn,
ltest, "~test");
llvm::Value *notMatchesMask = lTestMatchesMask(ctx, notTest, oldMask);
llvm::BasicBlock *bTestAllNot = ctx->CreateBasicBlock("cif_mixed_test_none");
llvm::BasicBlock *bTestMixed = ctx->CreateBasicBlock("cif_mixed_test_mixed");
ctx->BranchInst(bTestAllNot, bTestMixed, notMatchesMask);
// Emit code for the (~test & mask) == mask case. We only need the
// 'false' statements and again don't need to modify the value of the
// mask.
ctx->SetCurrentBasicBlock(bTestAllNot);
ctx->StartVaryingIf(ctx->GetMask());
lEmitIfStatements(ctx, falseStmts, "cif: all running lanes want just false stmts");
assert(ctx->GetCurrentBasicBlock());
ctx->SetCurrentBasicBlock(bDone);
ctx->EndIf();
ctx->BranchInst(bDone);
// It's mixed; we need to run both the true and false cases and also do
// mask update stuff.
ctx->SetCurrentBasicBlock(bTestMixed);
ctx->StartVaryingIf(ctx->GetMask());
emitMaskedTrueAndFalse(ctx, oldMask, ltest);
ctx->EndIf();
ctx->BranchInst(bDone);
}
@@ -955,6 +1158,13 @@ DoStmt::TypeCheck() {
}
int
DoStmt::EstimateCost() const {
return ((testExpr ? testExpr->EstimateCost() : 0) +
(bodyStmts ? bodyStmts->EstimateCost() : 0));
}
void
DoStmt::Print(int indent) const {
printf("%*cDo Stmt", indent, ' ');
@@ -1162,6 +1372,20 @@ ForStmt::TypeCheck() {
}
int
ForStmt::EstimateCost() const {
bool uniformTest = test ? test->GetType()->IsUniformType() :
(!g->opt.disableUniformControlFlow &&
!lHasVaryingBreakOrContinue(stmts));
return ((init ? init->EstimateCost() : 0) +
(test ? test->EstimateCost() : 0) +
(step ? step->EstimateCost() : 0) +
(stmts ? stmts->EstimateCost() : 0) +
(uniformTest ? COST_UNIFORM_LOOP : COST_VARYING_LOOP));
}
void
ForStmt::Print(int indent) const {
printf("%*cFor Stmt", indent, ' ');
@@ -1216,6 +1440,13 @@ BreakStmt::TypeCheck() {
}
int
BreakStmt::EstimateCost() const {
return doCoherenceCheck ? COST_COHERENT_BREAK_CONTINE :
COST_REGULAR_BREAK_CONTINUE;
}
void
BreakStmt::Print(int indent) const {
printf("%*c%sBreak Stmt", indent, ' ', doCoherenceCheck ? "Coherent " : "");
@@ -1254,6 +1485,13 @@ ContinueStmt::TypeCheck() {
}
int
ContinueStmt::EstimateCost() const {
return doCoherenceCheck ? COST_COHERENT_BREAK_CONTINE :
COST_REGULAR_BREAK_CONTINUE;
}
void
ContinueStmt::Print(int indent) const {
printf("%*c%sContinue Stmt", indent, ' ', doCoherenceCheck ? "Coherent " : "");
@@ -1300,6 +1538,12 @@ ReturnStmt::TypeCheck() {
}
int
ReturnStmt::EstimateCost() const {
return COST_RETURN + (val ? val->EstimateCost() : 0);
}
void
ReturnStmt::Print(int indent) const {
printf("%*c%sReturn Stmt", indent, ' ', doCoherenceCheck ? "Coherent " : "");
@@ -1345,6 +1589,16 @@ StmtList::TypeCheck() {
}
int
StmtList::EstimateCost() const {
int cost = 0;
for (unsigned int i = 0; i < stmts.size(); ++i)
if (stmts[i])
cost += stmts[i]->EstimateCost();
return cost;
}
void
StmtList::Print(int indent) const {
printf("%*cStmt List", indent, ' ');
@@ -1545,3 +1799,11 @@ PrintStmt::TypeCheck() {
values = values->TypeCheck();
return this;
}
int
PrintStmt::EstimateCost() const {
return COST_FUNCALL + (values ? values->EstimateCost() : 0);
}

stmt.h

@@ -75,8 +75,8 @@ public:
Stmt *Optimize();
Stmt *TypeCheck();
int EstimateCost() const;
private:
Expr *expr;
};
@@ -92,8 +92,8 @@ public:
Stmt *Optimize();
Stmt *TypeCheck();
int EstimateCost() const;
private:
Declaration *declaration;
};
@@ -103,13 +103,14 @@ private:
class IfStmt : public Stmt {
public:
IfStmt(Expr *testExpr, Stmt *trueStmts, Stmt *falseStmts,
bool doCoherentCheck, SourcePos pos);
bool doAllCheck, SourcePos pos);
void EmitCode(FunctionEmitContext *ctx) const;
void Print(int indent) const;
Stmt *Optimize();
Stmt *TypeCheck();
int EstimateCost() const;
// @todo these are only public for lHasVaryingBreakOrContinue(); would
// be nice to clean that up...
@@ -125,11 +126,12 @@ private:
source and thus, if the emitted code should check to see if all
active program instances want to follow just one of the 'true' or
'false' blocks. */
const bool doCoherentCheck;
const bool doAllCheck;
const bool doAnyCheck;
void emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask,
llvm::Value *test) const;
void emitCoherentTests(FunctionEmitContext *ctx, llvm::Value *test) const;
void emitVaryingIf(FunctionEmitContext *ctx, llvm::Value *test) const;
void emitMaskAllOn(FunctionEmitContext *ctx,
llvm::Value *test, llvm::BasicBlock *bDone) const;
void emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask,
@@ -150,8 +152,8 @@ public:
Stmt *Optimize();
Stmt *TypeCheck();
int EstimateCost() const;
private:
Expr *testExpr;
Stmt *bodyStmts;
const bool doCoherentCheck;
@@ -171,8 +173,8 @@ public:
Stmt *Optimize();
Stmt *TypeCheck();
int EstimateCost() const;
private:
/** 'for' statement initializer; may be NULL, indicating no initializer */
Stmt *init;
/** expression that returns a value indicating whether the loop should
@@ -198,6 +200,7 @@ public:
Stmt *Optimize();
Stmt *TypeCheck();
int EstimateCost() const;
private:
/** This indicates whether the generated code will check to see if no
@@ -219,6 +222,7 @@ public:
Stmt *Optimize();
Stmt *TypeCheck();
int EstimateCost() const;
private:
/** This indicates whether the generated code will check to see if no
@@ -240,8 +244,8 @@ public:
Stmt *Optimize();
Stmt *TypeCheck();
int EstimateCost() const;
private:
Expr *val;
/** This indicates whether the generated code will check to see if no
more program instances are currently running after the return, in
@@ -262,6 +266,7 @@ public:
Stmt *Optimize();
Stmt *TypeCheck();
int EstimateCost() const;
void Add(Stmt *s) { if (s) stmts.push_back(s); }
const std::vector<Stmt *> &GetStatements() { return stmts; }
@@ -289,8 +294,8 @@ public:
Stmt *Optimize();
Stmt *TypeCheck();
int EstimateCost() const;
private:
/** Format string for the print() statement. */
const std::string format;
/** This holds the arguments passed to the print() statement. If more


@@ -31,9 +31,21 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if defined(_WIN32) || defined(_WIN64)
#define ISPC_IS_WINDOWS
#elif defined(__linux__)
#define ISPC_IS_LINUX
#elif defined(__APPLE__)
#define ISPC_IS_APPLE
#endif
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <stdint.h>
#ifdef ISPC_IS_LINUX
#include <malloc.h>
#endif
extern "C" {
extern int width();
@@ -48,6 +60,8 @@ extern "C" {
void ISPCLaunch(void *f, void *d);
void ISPCSync();
void *ISPCMalloc(int64_t size, int32_t alignment);
void ISPCFree(void *ptr);
}
void ISPCLaunch(void *f, void *d) {
@@ -60,6 +74,37 @@ void ISPCSync() {
}
void *ISPCMalloc(int64_t size, int32_t alignment) {
#ifdef ISPC_IS_WINDOWS
return _aligned_malloc(size, alignment);
#endif
#ifdef ISPC_IS_LINUX
return memalign(alignment, size);
#endif
#ifdef ISPC_IS_APPLE
void *mem = malloc(size + (alignment-1) + sizeof(void*));
char *amem = ((char*)mem) + sizeof(void*);
amem = amem + uint32_t(alignment - (reinterpret_cast<uint64_t>(amem) &
(alignment - 1)));
((void**)amem)[-1] = mem;
return amem;
#endif
}
void ISPCFree(void *ptr) {
#ifdef ISPC_IS_WINDOWS
_aligned_free(ptr);
#endif
#ifdef ISPC_IS_LINUX
free(ptr);
#endif
#ifdef ISPC_IS_APPLE
free(((void**)ptr)[-1]);
#endif
}
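The Apple path above implements aligned allocation by over-allocating and stashing the raw malloc() pointer in the slot just below the aligned address, so ISPCFree can recover it. The same trick as a standalone, commented sketch:

    #include <cstdint>
    #include <cstdlib>

    static void *alignedAlloc(size_t size, size_t alignment) {
        // Room for the data, worst-case padding, and one stashed pointer.
        void *raw = malloc(size + (alignment - 1) + sizeof(void *));
        if (raw == NULL)
            return NULL;
        uintptr_t p = reinterpret_cast<uintptr_t>(raw) + sizeof(void *);
        // Round up to the next alignment boundary (a full 'alignment'
        // is added when p is already aligned, matching the code above).
        p += alignment - (p & (alignment - 1));
        reinterpret_cast<void **>(p)[-1] = raw;  // stash for alignedFree()
        return reinterpret_cast<void *>(p);
    }

    static void alignedFree(void *ptr) {
        free(reinterpret_cast<void **>(ptr)[-1]);
    }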
int main(int argc, char *argv[]) {
int w = width();
assert(w <= 16);


@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }
static float x[2][1];
static float x[1][2];
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];


@@ -13,7 +13,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
}
export void result(uniform float RET[4]) {
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[3] = 4;
RET[4] = 5;


@@ -11,5 +11,5 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
export void result(uniform float RET[]) {
RET[programIndex] = 10;
RET[programIndex] = max(10, 1 + programIndex);
}


@@ -9,7 +9,10 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
}
export void result(uniform float RET[]) {
uniform int pc[16] = { 1, 1, 2, 1, 2, 2, 3, 1, 1, 2, 2, 3, 2, 3, 3, 4 };
uniform int pc[16] = { 1, 1, 2, 1,
2, 2, 3, 1,
2, 2, 3, 2,
3, 3, 4, 1 };
RET[programIndex] = pc[programIndex];
}


@@ -15,7 +15,7 @@ export void result(uniform float RET[]) {
uniform int x = -1234;
if (programCount == 4) x = 10;
else if (programCount == 8) x = 36;
else if (programCount == 16) x = 124;
else if (programCount == 16) x = 136;
RET[programIndex] = x;
}


@@ -15,7 +15,7 @@ export void result(uniform float RET[]) {
uniform int x = -1234;
if (programCount == 4) x = 10;
else if (programCount == 8) x = 36;
else if (programCount == 16) x = 124;
else if (programCount == 16) x = 136;
RET[programIndex] = x;
}


@@ -9,17 +9,6 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
uniform float<5> z = c ? x : y;
RET[programIndex] = z[programIndex];
}
/*CO return x[y];*/
/*CO int index = aFOO[programIndex];*/
/*CO index = min(index, 3);*/
/*CO return x[index];*/
/*CO return x << 1;*/
/*CO return c[0] ? 1 : 0;*/
/*CO x = b;*/
/*CO y = b;*/
/*CO return x+y;*/
}
export void result(uniform float RET[]) {


@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
int tmp1 = shuffle(programIndex, 0, programIndex);
RET[programIndex] = 10;
if (programIndex < 1) {
uniform int foo = extract(tmp1, 0);
RET[programIndex] = aFOO[foo + programIndex];
}
}
export void result(uniform float RET[]) {
RET[programIndex] = 10;
RET[0] = 1;
}


@@ -45,9 +45,7 @@
#include <stdio.h>
#include <llvm/Value.h>
#include <llvm/Module.h>
#ifndef LLVM_2_8
#include <llvm/Analysis/DIBuilder.h>
#endif
#include <llvm/Analysis/DebugInfo.h>
#include <llvm/Support/Dwarf.h>
@@ -414,10 +412,6 @@ AtomicType::LLVMType(llvm::LLVMContext *ctx) const {
llvm::DIType
AtomicType::GetDIType(llvm::DIDescriptor scope) const {
#ifdef LLVM_2_8
FATAL("debug info not supported in llvm 2.8");
return llvm::DIType();
#else
if (isUniform) {
switch (basicType) {
case TYPE_VOID:
@@ -484,7 +478,6 @@ AtomicType::GetDIType(llvm::DIDescriptor scope) const {
uint64_t align = unifType.getAlignInBits() * g->target.vectorWidth;
return m->diBuilder->createVectorType(size, align, unifType, subArray);
}
#endif // LLVM_2_8
}
@@ -645,10 +638,6 @@ EnumType::LLVMType(llvm::LLVMContext *ctx) const {
llvm::DIType
EnumType::GetDIType(llvm::DIDescriptor scope) const {
#ifdef LLVM_2_8
FATAL("debug info not supported in llvm 2.8");
return llvm::DIType();
#else
std::vector<llvm::Value *> enumeratorDescriptors;
for (unsigned int i = 0; i < enumerators.size(); ++i) {
unsigned int enumeratorValue;
@@ -688,7 +677,6 @@ EnumType::GetDIType(llvm::DIDescriptor scope) const {
uint64_t size = diType.getSizeInBits() * g->target.vectorWidth;
uint64_t align = diType.getAlignInBits() * g->target.vectorWidth;
return m->diBuilder->createVectorType(size, align, diType, subArray);
#endif // !LLVM_2_8
}
@@ -893,10 +881,6 @@ ArrayType::TotalElementCount() const {
llvm::DIType
ArrayType::GetDIType(llvm::DIDescriptor scope) const {
#ifdef LLVM_2_8
FATAL("debug info not supported in llvm 2.8");
return llvm::DIType();
#else
if (!child)
return llvm::DIType();
@@ -923,7 +907,6 @@ ArrayType::GetDIType(llvm::DIDescriptor scope) const {
uint64_t align = eltType.getAlignInBits();
return m->diBuilder->createArrayType(size, align, eltType, subArray);
#endif // LLVM_2_8
}
@@ -1044,16 +1027,11 @@ SOAArrayType::LLVMType(llvm::LLVMContext *ctx) const {
llvm::DIType
SOAArrayType::GetDIType(llvm::DIDescriptor scope) const {
#ifdef LLVM_2_8
FATAL("debug info not supported in llvm 2.8");
return llvm::DIType();
#else
if (!child)
return llvm::DIType();
const Type *t = soaType();
return t->GetDIType(scope);
#endif
}
@@ -1217,10 +1195,6 @@ VectorType::LLVMType(llvm::LLVMContext *ctx) const {
llvm::DIType
VectorType::GetDIType(llvm::DIDescriptor scope) const {
#ifdef LLVM_2_8
FATAL("debug info not supported in llvm 2.8");
return llvm::DIType();
#else
llvm::DIType eltType = base->GetDIType(scope);
llvm::Value *sub = m->diBuilder->getOrCreateSubrange(0, numElements-1);
#ifdef LLVM_2_9
@@ -1240,7 +1214,6 @@ VectorType::GetDIType(llvm::DIDescriptor scope) const {
align = 4 * g->target.nativeVectorWidth;
return m->diBuilder->createVectorType(sizeBits, align, eltType, subArray);
#endif // LLVM_2_8
}
@@ -1443,10 +1416,6 @@ StructType::LLVMType(llvm::LLVMContext *ctx) const {
llvm::DIType
StructType::GetDIType(llvm::DIDescriptor scope) const {
#ifdef LLVM_2_8
FATAL("debug info not supported in llvm 2.8");
return llvm::DIType();
#else
uint64_t currentSize = 0, align = 0;
std::vector<llvm::Value *> elementLLVMTypes;
@@ -1500,7 +1469,6 @@ StructType::GetDIType(llvm::DIDescriptor scope) const {
llvm::DIFile diFile = pos.GetDIFile();
return m->diBuilder->createStructType(scope, name, diFile, pos.first_line, currentSize,
align, 0, elements);
#endif // LLVM_2_8
}
@@ -1698,13 +1666,8 @@ ReferenceType::LLVMType(llvm::LLVMContext *ctx) const {
llvm::DIType
ReferenceType::GetDIType(llvm::DIDescriptor scope) const {
#ifdef LLVM_2_8
FATAL("debug info not supported in llvm 2.8");
return llvm::DIType();
#else
llvm::DIType diTargetType = targetType->GetDIType(scope);
return m->diBuilder->createReferenceType(diTargetType);
#endif // LLVM_2_8
}
@@ -1870,6 +1833,7 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool includeMask) const {
for (unsigned int i = 0; i < argTypes.size(); ++i) {
if (!argTypes[i])
return NULL;
assert(argTypes[i] != AtomicType::Void);
LLVM_TYPE_CONST llvm::Type *t = argTypes[i]->LLVMType(ctx);
if (!t)


@@ -344,6 +344,10 @@ StringEditDistance(const std::string &str1, const std::string &str2, int maxDist
std::vector<std::string>
MatchStrings(const std::string &str, const std::vector<std::string> &options) {
if (str.size() == 0 || (str.size() == 1 && !isalpha(str[0])))
// don't even try...
return std::vector<std::string>();
const int maxDelta = 2;
std::vector<std::string> matches[maxDelta+1];
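Putting the new guard together with the distance test, the suggestion mechanism looks roughly like this (StringEditDistance() stood in for by a minimal Levenshtein implementation; an illustrative sketch, not the file's exact code):

    #include <algorithm>
    #include <cctype>
    #include <string>
    #include <vector>

    // Minimal Levenshtein distance, standing in for StringEditDistance().
    static int editDistance(const std::string &a, const std::string &b) {
        std::vector<int> prev(b.size() + 1), cur(b.size() + 1);
        for (size_t j = 0; j <= b.size(); ++j)
            prev[j] = (int)j;
        for (size_t i = 1; i <= a.size(); ++i) {
            cur[0] = (int)i;
            for (size_t j = 1; j <= b.size(); ++j)
                cur[j] = std::min(std::min(prev[j] + 1, cur[j - 1] + 1),
                                  prev[j - 1] + (a[i - 1] != b[j - 1] ? 1 : 0));
            prev.swap(cur);
        }
        return prev[b.size()];
    }

    static std::vector<std::string>
    suggestMatches(const std::string &str,
                   const std::vector<std::string> &options) {
        // The guard added above: nothing useful to suggest for an empty
        // string or a single non-alphabetic character.
        if (str.size() == 0 || (str.size() == 1 && !isalpha(str[0])))
            return std::vector<std::string>();
        const int maxDelta = 2;
        std::vector<std::string> result;
        for (size_t i = 0; i < options.size(); ++i)
            if (editDistance(str, options[i]) <= maxDelta)
                result.push_back(options[i]);
        return result;
    }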