From e612abe4ba26bd0c72474f94d407c92f0a20f6d6 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Thu, 26 Jan 2012 10:53:56 -0800
Subject: [PATCH 01/62] Fix parsing of 64-bit integer constants on Windows.

(i.e., use the 64-bit unsigned integer parsing function,
not the 64-bit signed one.)

Fixes bug #68.
---
 lex.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lex.ll b/lex.ll
index 1b6d382b..f21df180 100644
--- a/lex.ll
+++ b/lex.ll
@@ -156,7 +156,7 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
         yylval->intVal = lParseBinary(yytext+2, *yylloc, &endPtr);
     else {
 #if defined(ISPC_IS_WINDOWS) && !defined(__MINGW32__)
-        yylval->intVal = _strtoi64(yytext, &endPtr, 0);
+        yylval->intVal = _strtoui64(yytext, &endPtr, 0);
 #else
         // FIXME: should use strtouq and then issue an error if we can't
         // fit into 64 bits...

From 65f3252760983a6dcd40778f2df7f3e98ccc1859 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Thu, 26 Jan 2012 10:55:27 -0800
Subject: [PATCH 02/62] Various fixes to test running script for Windows.

Also, removed the --valgrind option and replaced it with a more
general --wrap-exe option, which can be used both for running
Valgrind and SDE.
---
 run_tests.py      | 42 +++++++++++++++++++++++++++---------------
 winstuff/stdint.h |  4 +++-
 2 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/run_tests.py b/run_tests.py
index e2ea66f9..aa88b2be 100755
--- a/run_tests.py
+++ b/run_tests.py
@@ -40,21 +40,16 @@ parser.add_option('-o', '--no-opt', dest='no_opt', help='Disable optimization',
                   default=False, action="store_true")
 parser.add_option('-v', '--verbose', dest='verbose', help='Enable verbose output',
                   default=False, action="store_true")
-if not is_windows:
-    parser.add_option('--valgrind', dest='valgrind', help='Run tests with valgrind',
-                      default=False, action="store_true")
+parser.add_option('--wrap-exe', dest='wrapexe',
+                  help='Executable to wrap test runs with (e.g. "valgrind")',
+                  default="")
 
 (options, args) = parser.parse_args()
 
-if not is_windows and options.valgrind:
-    valgrind_exe = "valgrind "
-else:
-    valgrind_exe = ""
-
 if not is_windows:
     ispc_exe = "./ispc"
 else:
-    ispc_exe = "Release/ispc.exe"
+    ispc_exe = "../Release/ispc.exe"
 
 is_generic_target = options.target.find("generic-") != -1
 if is_generic_target and options.include_file == None:
@@ -74,14 +69,31 @@ if options.compiler_exe == None:
     else:
         options.compiler_exe = "g++"
 
-# if no specific test files are specified, run all of the tests in tests/
-# and failing_tests/
+def fix_windows_paths(files):
+    ret = [ ]
+    for fn in files:
+        ret += [ string.replace(fn, '\\', '/') ]
+    return ret
+
+    
+# if no specific test files are specified, run all of the tests in tests/,
+# failing_tests/, and tests_errors/
 if len(args) == 0:
     files = glob.glob("tests/*ispc") + glob.glob("failing_tests/*ispc") + \
         glob.glob("tests_errors/*ispc")
+    files = fix_windows_paths(files)
 else:
+    if is_windows:
+        argfiles = [ ]
+        for f in args:
+            # we have to glob ourselves if this is being run under a DOS
+            # shell, which doesn't expand wildcards for us.
+            argfiles += glob.glob(f)
+    else:
+        argfiles = args
+        
     files = [ ]
-    for f in args:
+    for f in argfiles:
         if os.path.splitext(string.lower(f))[1] != ".ispc":
             print "Ignoring file %s, which doesn't have an .ispc extension." % f
         else:
@@ -101,6 +113,7 @@ finished_tests_counter_lock = multiprocessing.Lock()
 # utility routine to print an update on the number of tests that have been
 # finished.  Should be called with the lock held..
 def update_progress(fn):
+    global total_tests
     finished_tests_counter.value = finished_tests_counter.value + 1
     progress_str = " Done %d / %d [%s]" % (finished_tests_counter.value, total_tests, fn)
     # spaces to clear out detrius from previous printing...
@@ -218,7 +231,7 @@ def run_test(filename):
                     obj_name = "%s%s.obj" % (input_prefix, filename)
                 exe_name = "%s%s.exe" % (input_prefix, filename)
 
-                cc_cmd = "%s /I. /Iwinstuff /Zi /nologo /DTEST_SIG=%d %stest_static.cpp %s /Fe%s" % \
+                cc_cmd = "%s /I. /I../winstuff /Zi /nologo /DTEST_SIG=%d %stest_static.cpp %s /Fe%s" % \
                          (options.compiler_exe, match, input_prefix, obj_name, exe_name)
                 if should_fail:
                     cc_cmd += " /DEXPECT_FAILURE"
@@ -246,9 +259,8 @@ def run_test(filename):
                 ispc_cmd += " --emit-c++ --c++-include-file=%s" % options.include_file
 
         # compile the ispc code, make the executable, and run it...
-        global valgrind_exe
         (compile_error, run_error) = run_cmds([ispc_cmd, cc_cmd], 
-                                              valgrind_exe + " " + exe_name, \
+                                              options.wrapexe + " " + exe_name, \
                                               filename, should_fail)
 
         # clean up after running the test
diff --git a/winstuff/stdint.h b/winstuff/stdint.h
index 895f702a..5878843d 100644
--- a/winstuff/stdint.h
+++ b/winstuff/stdint.h
@@ -1,7 +1,9 @@
 #ifndef MY_STDINT_H
 #define MY_STDINT_H 1
 
+typedef unsigned __int32 uint32_t;
 typedef __int32 int32_t;
 typedef __int64 int64_t;
+typedef unsigned __int64 uint64_t;
 
-#endif // MY_STDINT_H
\ No newline at end of file
+#endif // MY_STDINT_H

From 177e6312b45c865f9cb40975a6dbf37dd58e9a46 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Fri, 27 Jan 2012 09:07:58 -0800
Subject: [PATCH 03/62] Fix build with LLVM ToT
 (ConstantVector::getVectorElements() is gone now).

---
 opt.cpp | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/opt.cpp b/opt.cpp
index f6eab8c6..a01452f1 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -632,7 +632,13 @@ lGetMask(llvm::Value *factor) {
     if (cv) {
         int mask = 0;
         llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
+        Assert((int)cv->getNumOperands() == g->target.vectorWidth);
+#ifdef LLVM_3_1svn
+        for (int i = 0; i < (int)cv->getNumOperands(); ++i)
+            elements.push_back(cv->getOperand(i));
+#else
         cv->getVectorElements(elements);
+#endif
 
         for (unsigned int i = 0; i < elements.size(); ++i) {
             llvm::APInt intMaskValue;
@@ -1125,7 +1131,12 @@ lGetBasePtrAndOffsets(llvm::Value *ptrs, llvm::Value **offsets,
         // Indexing into global arrays can lead to this form, with
         // ConstantVectors..
         llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
+#ifdef LLVM_3_1svn
+        for (int i = 0; i < (int)cv->getNumOperands(); ++i)
+            elements.push_back(cv->getOperand(i));
+#else
         cv->getVectorElements(elements);
+#endif
 
         llvm::Constant *delta[ISPC_MAX_NVEC];
         for (unsigned int i = 0; i < elements.size(); ++i) {
@@ -2143,7 +2154,12 @@ lVectorIsLinearConstantInts(llvm::ConstantVector *cv, int vectorLength,
                             int stride) {
     // Flatten the vector out into the elements array
     llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
+#ifdef LLVM_3_1svn
+    for (int i = 0; i < (int)cv->getNumOperands(); ++i)
+        elements.push_back(cv->getOperand(i));
+#else
     cv->getVectorElements(elements);
+#endif
     Assert((int)elements.size() == vectorLength);
 
     llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(elements[0]);

From 061e68bc77dae947149f531d956a72bc06fe46b4 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Thu, 26 Jan 2012 09:57:13 -0800
Subject: [PATCH 04/62] Fix compiler crash from malformed program.

---
 decl.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/decl.cpp b/decl.cpp
index 5ec58462..c62f0b6f 100644
--- a/decl.cpp
+++ b/decl.cpp
@@ -266,7 +266,8 @@ Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
         funArgs->push_back(sym);
     }
 
-    funSym->type = funSym->type->ResolveUnboundVariability(Type::Varying);
+    if (funSym != NULL)
+        funSym->type = funSym->type->ResolveUnboundVariability(Type::Varying);
 
     return funSym;
 }

From 56ffc78fa4f32126a4de2c38752cb33cde59b636 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Thu, 26 Jan 2012 12:08:07 -0800
Subject: [PATCH 05/62] Require semicolons after sync, assert, and print
 statements.

(Silly parser oversight.)
---
 parse.yy | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/parse.yy b/parse.yy
index 671e426f..4be1ab7e 100644
--- a/parse.yy
+++ b/parse.yy
@@ -1469,23 +1469,23 @@ jump_statement
     ;
 
 sync_statement
-    : TOKEN_SYNC 
+    : TOKEN_SYNC ';'
       { $$ = new ExprStmt(new SyncExpr(@1), @1); }
     ;
 
 print_statement
-    : TOKEN_PRINT '(' string_constant ')'
+    : TOKEN_PRINT '(' string_constant ')' ';'
       {
            $$ = new PrintStmt(*$3, NULL, @1); 
       }
-    | TOKEN_PRINT '(' string_constant ',' argument_expression_list ')'
+    | TOKEN_PRINT '(' string_constant ',' argument_expression_list ')' ';'
       {
            $$ = new PrintStmt(*$3, $5, @1); 
       }
     ;
 
 assert_statement
-    : TOKEN_ASSERT '(' string_constant ',' expression ')'
+    : TOKEN_ASSERT '(' string_constant ',' expression ')' ';'
       {
           $$ = new AssertStmt(*$3, $5, @1);
       }

From 24f58fa16a48a460087992eabed092386670df9c Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Thu, 26 Jan 2012 14:15:58 -0800
Subject: [PATCH 06/62] Update per_lane macro to not use ID for lane number in
 macro expansion

This was leading to unintended consequences if WIDTH was used in macro code
(the "ID" inside WIDTH was itself being replaced by the lane number), which
was undesirable.
---
 builtins/util.m4 | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/builtins/util.m4 b/builtins/util.m4
index 36882491..4e3fc85b 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -2187,9 +2187,9 @@ return:
 define(`gen_masked_store', `
 define void @__masked_store_$3(<$1 x $2>* nocapture, <$1 x $2>, <$1 x i32>) nounwind alwaysinline {
   per_lane($1, <$1 x i32> %2, `
-      %ptr_ID = getelementptr <$1 x $2> * %0, i32 0, i32 LANE
-      %storeval_ID = extractelement <$1 x $2> %1, i32 LANE
-      store $2 %storeval_ID, $2 * %ptr_ID')
+      %ptr_LANE_ID = getelementptr <$1 x $2> * %0, i32 0, i32 LANE
+      %storeval_LANE_ID = extractelement <$1 x $2> %1, i32 LANE
+      store $2 %storeval_LANE_ID, $2 * %ptr_LANE_ID')
   ret void
 }
 ')
@@ -2644,7 +2644,7 @@ pl_known_mask:
 pl_all_on:
   ;; the mask is all on--just expand the code for each lane sequentially
   forloop(i, 0, eval($1-1), 
-          `patsubst(`$3', `ID\|LANE', i)')
+          `patsubst(`$3', `LANE', i)')
   br label %pl_done
 
 pl_unknown_mask:
@@ -2806,11 +2806,11 @@ define <$1 x $2> @__gather32_$2(<$1 x i32> %ptrs,
                                 <$1 x i32> %vecmask) nounwind readonly alwaysinline {
   %ret_ptr = alloca <$1 x $2>
   per_lane($1, <$1 x i32> %vecmask, `
-  %iptr_ID = extractelement <$1 x i32> %ptrs, i32 LANE
-  %ptr_ID = inttoptr i32 %iptr_ID to $2 *
-  %val_ID = load $2 * %ptr_ID
-  %store_ptr_ID = getelementptr <$1 x $2> * %ret_ptr, i32 0, i32 LANE
-  store $2 %val_ID, $2 * %store_ptr_ID
+  %iptr_LANE_ID = extractelement <$1 x i32> %ptrs, i32 LANE
+  %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $2 *
+  %val_LANE_ID = load $2 * %ptr_LANE_ID
+  %store_ptr_LANE_ID = getelementptr <$1 x $2> * %ret_ptr, i32 0, i32 LANE
+  store $2 %val_LANE_ID, $2 * %store_ptr_LANE_ID
  ')
 
   %ret = load <$1 x $2> * %ret_ptr
@@ -2822,11 +2822,11 @@ define <$1 x $2> @__gather64_$2(<$1 x i64> %ptrs,
                                 <$1 x i32> %vecmask) nounwind readonly alwaysinline {
   %ret_ptr = alloca <$1 x $2>
   per_lane($1, <$1 x i32> %vecmask, `
-  %iptr_ID = extractelement <$1 x i64> %ptrs, i32 LANE
-  %ptr_ID = inttoptr i64 %iptr_ID to $2 *
-  %val_ID = load $2 * %ptr_ID
-  %store_ptr_ID = getelementptr <$1 x $2> * %ret_ptr, i32 0, i32 LANE
-  store $2 %val_ID, $2 * %store_ptr_ID
+  %iptr_LANE_ID = extractelement <$1 x i64> %ptrs, i32 LANE
+  %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $2 *
+  %val_LANE_ID = load $2 * %ptr_LANE_ID
+  %store_ptr_LANE_ID = getelementptr <$1 x $2> * %ret_ptr, i32 0, i32 LANE
+  store $2 %val_LANE_ID, $2 * %store_ptr_LANE_ID
  ')
 
   %ret = load <$1 x $2> * %ret_ptr
@@ -2910,10 +2910,10 @@ define void @__scatter_base_offsets64_$2(i8* %base, <$1 x i64> %offsets, i32 %of
 define void @__scatter32_$2(<$1 x i32> %ptrs, <$1 x $2> %values,
                             <$1 x i32> %mask) nounwind alwaysinline {
   per_lane($1, <$1 x i32> %mask, `
-  %iptr_ID = extractelement <$1 x i32> %ptrs, i32 LANE
-  %ptr_ID = inttoptr i32 %iptr_ID to $2 *
-  %val_ID = extractelement <$1 x $2> %values, i32 LANE
-  store $2 %val_ID, $2 * %ptr_ID
+  %iptr_LANE_ID = extractelement <$1 x i32> %ptrs, i32 LANE
+  %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $2 *
+  %val_LANE_ID = extractelement <$1 x $2> %values, i32 LANE
+  store $2 %val_LANE_ID, $2 * %ptr_LANE_ID
  ')
   ret void
 }
@@ -2922,10 +2922,10 @@ define void @__scatter32_$2(<$1 x i32> %ptrs, <$1 x $2> %values,
 define void @__scatter64_$2(<$1 x i64> %ptrs, <$1 x $2> %values,
                             <$1 x i32> %mask) nounwind alwaysinline {
   per_lane($1, <$1 x i32> %mask, `
-  %iptr_ID = extractelement <$1 x i64> %ptrs, i32 LANE
-  %ptr_ID = inttoptr i64 %iptr_ID to $2 *
-  %val_ID = extractelement <$1 x $2> %values, i32 LANE
-  store $2 %val_ID, $2 * %ptr_ID
+  %iptr_LANE_ID = extractelement <$1 x i64> %ptrs, i32 LANE
+  %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $2 *
+  %val_LANE_ID = extractelement <$1 x $2> %values, i32 LANE
+  store $2 %val_LANE_ID, $2 * %ptr_LANE_ID
  ')
   ret void
 }

From 5893a9c49d37dd828125b321d2f90d0cbfd6991c Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Fri, 27 Jan 2012 09:14:45 -0800
Subject: [PATCH 07/62] Remove incorrect assert

---
 opt.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/opt.cpp b/opt.cpp
index a01452f1..16401ba9 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -632,7 +632,6 @@ lGetMask(llvm::Value *factor) {
     if (cv) {
         int mask = 0;
         llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
-        Assert((int)cv->getNumOperands() == g->target.vectorWidth);
 #ifdef LLVM_3_1svn
         for (int i = 0; i < (int)cv->getNumOperands(); ++i)
             elements.push_back(cv->getOperand(i));

From 37cdc18639d88b14ed9927622d7ee212f142a515 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Fri, 27 Jan 2012 10:01:06 -0800
Subject: [PATCH 08/62] Issue error instead of crashing given attempted
 function call through non-function.

Fixes issue #163.
---
 expr.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/expr.cpp b/expr.cpp
index c424f3c2..b04c8319 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -2678,12 +2678,12 @@ FunctionCallExpr::TypeCheck() {
         const Type *fptrType = func->GetType();
         if (fptrType == NULL)
             return NULL;
-           
-        Assert(dynamic_cast<const PointerType *>(fptrType) != NULL);
-        const FunctionType *funcType = 
-            dynamic_cast<const FunctionType *>(fptrType->GetBaseType());
-        if (funcType == NULL) {
-            Error(pos, "Must provide function name or function pointer for "
+
+        // Make sure we do in fact have a function to call
+        const FunctionType *funcType;
+        if (dynamic_cast<const PointerType *>(fptrType) == NULL ||
+            (funcType = dynamic_cast<const FunctionType *>(fptrType->GetBaseType())) == NULL) {
+            Error(func->pos, "Must provide function name or function pointer for "
                   "function call expression.");
             return NULL;
         }

From b7f17d435f54b385dafbfe32f9482dde97d20a67 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Fri, 27 Jan 2012 12:59:18 -0800
Subject: [PATCH 09/62] Fix crash in gather/scatter optimization pass.

---
 opt.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/opt.cpp b/opt.cpp
index 16401ba9..b4570235 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -2200,8 +2200,12 @@ lCheckMulForLinear(llvm::Value *op0, llvm::Value *op1, int vectorLength,
     llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(op0);
     if (cv == NULL)
         return false;
-    llvm::ConstantInt *splat = 
-        llvm::dyn_cast<llvm::ConstantInt>(cv->getSplatValue());
+
+    llvm::Constant *csplat = cv->getSplatValue();
+    if (csplat == NULL)
+        return false;
+
+    llvm::ConstantInt *splat = llvm::dyn_cast<llvm::ConstantInt>(csplat);
     if (splat == NULL)
         return false;
 

From d9c0f9315ac0aae840d0dacb357306023a68cd3c Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Fri, 27 Jan 2012 13:02:03 -0800
Subject: [PATCH 10/62] Fix generic targets: half conversion functions weren't
 declared.

(Broken by 1867b5b31).
---
 builtins/target-generic-common.ll | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll
index 6e280ba6..e4c70aa4 100644
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -98,6 +98,14 @@ declare void @__aos_to_soa4_float(float * noalias %p, <WIDTH x float> * noalias
                                   <WIDTH x float> * noalias %out2,
                                   <WIDTH x float> * noalias %out3) nounwind
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; math
 

From bdba3cd97da3b040d265d5a8edb0a09136dbd056 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Fri, 27 Jan 2012 13:37:18 -0800
Subject: [PATCH 11/62] Bugfix: add per-lane offsets when accessing varying
 data through a pointer!

---
 expr.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/expr.cpp b/expr.cpp
index b04c8319..725a1edc 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -3065,8 +3065,10 @@ IndexExpr::GetLValue(FunctionEmitContext *ctx) const {
         if (baseValue == NULL || indexValue == NULL)
             return NULL;
         ctx->SetDebugPos(pos);
-        return ctx->GetElementPtrInst(baseValue, indexValue,
-                                      baseExprType, "ptr_offset");
+        llvm::Value *ptr = ctx->GetElementPtrInst(baseValue, indexValue,
+                                                  baseExprType, "ptr_offset");
+        ptr = lAddVaryingOffsetsIfNeeded(ctx, ptr, GetLValueType());
+        return ptr;
     }
 
     // Otherwise it's an array or vector

From 664dc3bdda0a6472338faf466aa9917b1e062310 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Fri, 27 Jan 2012 14:47:06 -0800
Subject: [PATCH 12/62] Add support for "new" and "delete" to the language.

Issue #139.
---
 ast.cpp                                      |  10 +
 builtins.cpp                                 |   5 +
 builtins/util.m4                             |  59 +++
 ctx.cpp                                      |   2 +-
 docs/ispc.rst                                | 115 +++++-
 expr.cpp                                     | 355 +++++++++++++++++++
 expr.h                                       |  48 +++
 ispc.h                                       |   2 +
 lex.ll                                       |   3 +
 parse.yy                                     |  63 +++-
 stmt.cpp                                     | 240 +++++--------
 stmt.h                                       |  17 +
 tests/new-delete-1.ispc                      |  15 +
 tests/new-delete-2.ispc                      |  15 +
 tests/new-delete-3.ispc                      |  17 +
 tests/new-delete-4.ispc                      |  14 +
 tests/new-delete-5.ispc                      |  17 +
 tests/new-delete-6.ispc                      |  17 +
 tests_errors/func-call-through-variable.ispc |  47 +++
 tests_errors/new-delete-1.ispc               |   5 +
 tests_errors/new-delete-2.ispc               |   5 +
 tests_errors/new-delete-3.ispc               |   5 +
 tests_errors/new-delete-4.ispc               |   7 +
 tests_errors/new-delete-5.ispc               |   5 +
 tests_errors/new-delete-6.ispc               |   5 +
 tests_errors/new-delete-7.ispc               |  12 +
 26 files changed, 938 insertions(+), 167 deletions(-)
 create mode 100644 tests/new-delete-1.ispc
 create mode 100644 tests/new-delete-2.ispc
 create mode 100644 tests/new-delete-3.ispc
 create mode 100644 tests/new-delete-4.ispc
 create mode 100644 tests/new-delete-5.ispc
 create mode 100644 tests/new-delete-6.ispc
 create mode 100644 tests_errors/func-call-through-variable.ispc
 create mode 100644 tests_errors/new-delete-1.ispc
 create mode 100644 tests_errors/new-delete-2.ispc
 create mode 100644 tests_errors/new-delete-3.ispc
 create mode 100644 tests_errors/new-delete-4.ispc
 create mode 100644 tests_errors/new-delete-5.ispc
 create mode 100644 tests_errors/new-delete-6.ispc
 create mode 100644 tests_errors/new-delete-7.ispc

diff --git a/ast.cpp b/ast.cpp
index 746bc0ec..5eaddeb3 100644
--- a/ast.cpp
+++ b/ast.cpp
@@ -98,6 +98,7 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
         StmtList *sl;
         PrintStmt *ps;
         AssertStmt *as;
+        DeleteStmt *dels;
 
         if ((es = dynamic_cast<ExprStmt *>(node)) != NULL)
             es->expr = (Expr *)WalkAST(es->expr, preFunc, postFunc, data);
@@ -160,6 +161,8 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
             ps->values = (Expr *)WalkAST(ps->values, preFunc, postFunc, data);
         else if ((as = dynamic_cast<AssertStmt *>(node)) != NULL)
             as->expr = (Expr *)WalkAST(as->expr, preFunc, postFunc, data);
+        else if ((dels = dynamic_cast<DeleteStmt *>(node)) != NULL)
+            dels->expr = (Expr *)WalkAST(dels->expr, preFunc, postFunc, data);
         else
             FATAL("Unhandled statement type in WalkAST()");
     }
@@ -180,6 +183,7 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
         DereferenceExpr *dre;
         SizeOfExpr *soe;
         AddressOfExpr *aoe;
+        NewExpr *newe;
 
         if ((ue = dynamic_cast<UnaryExpr *>(node)) != NULL)
             ue->expr = (Expr *)WalkAST(ue->expr, preFunc, postFunc, data);
@@ -223,6 +227,12 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
             soe->expr = (Expr *)WalkAST(soe->expr, preFunc, postFunc, data);
         else if ((aoe = dynamic_cast<AddressOfExpr *>(node)) != NULL)
             aoe->expr = (Expr *)WalkAST(aoe->expr, preFunc, postFunc, data);
+        else if ((newe = dynamic_cast<NewExpr *>(node)) != NULL) {
+            newe->countExpr = (Expr *)WalkAST(newe->countExpr, preFunc, 
+                                              postFunc, data);
+            newe->initExpr = (Expr *)WalkAST(newe->initExpr, preFunc, 
+                                             postFunc, data);
+        }
         else if (dynamic_cast<SymbolExpr *>(node) != NULL ||
                  dynamic_cast<ConstExpr *>(node) != NULL ||
                  dynamic_cast<FunctionSymbolExpr *>(node) != NULL ||
diff --git a/builtins.cpp b/builtins.cpp
index 76ebdfa7..8c3631a2 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -391,6 +391,8 @@ lSetInternalFunctions(llvm::Module *module) {
         "__count_trailing_zeros_i64",
         "__count_leading_zeros_i32",
         "__count_leading_zeros_i64",
+        "__delete_uniform",
+        "__delete_varying",
         "__do_assert_uniform",
         "__do_assert_varying",
         "__do_print", 
@@ -449,6 +451,9 @@ lSetInternalFunctions(llvm::Module *module) {
         "__min_varying_uint32",
         "__min_varying_uint64",
         "__movmsk",
+        "__new_uniform",
+        "__new_varying32",
+        "__new_varying64",
         "__num_cores",
         "__packed_load_active",
         "__packed_store_active",
diff --git a/builtins/util.m4 b/builtins/util.m4
index 4e3fc85b..563ee3e9 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -1805,6 +1805,65 @@ ok:
   ret void
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; new/delete
+
+declare i8 * @malloc(i64)
+declare void @free(i8 *)
+
+define i8 * @__new_uniform(i64 %size) {
+  %a = call i8 * @malloc(i64 %size)
+  ret i8 * %a
+}
+
+define <WIDTH x i64> @__new_varying32(<WIDTH x i32> %size, <WIDTH x MASK> %mask) {
+  %ret = alloca <WIDTH x i64>
+  store <WIDTH x i64> zeroinitializer, <WIDTH x i64> * %ret
+  %ret64 = bitcast <WIDTH x i64> * %ret to i64 *
+
+  per_lane(WIDTH, <WIDTH x MASK> %mask, `
+    %sz_LANE_ID = extractelement <WIDTH x i32> %size, i32 LANE
+    %sz64_LANE_ID = zext i32 %sz_LANE_ID to i64
+    %ptr_LANE_ID = call i8 * @malloc(i64 %sz64_LANE_ID)
+    %ptr_int_LANE_ID = ptrtoint i8 * %ptr_LANE_ID to i64
+    %store_LANE_ID = getelementptr i64 * %ret64, i32 LANE
+    store i64 %ptr_int_LANE_ID, i64 * %store_LANE_ID')
+
+  %r = load <WIDTH x i64> * %ret
+  ret <WIDTH x i64> %r
+}
+
+define <WIDTH x i64> @__new_varying64(<WIDTH x i64> %size, <WIDTH x MASK> %mask) {
+  %ret = alloca <WIDTH x i64>
+  store <WIDTH x i64> zeroinitializer, <WIDTH x i64> * %ret
+  %ret64 = bitcast <WIDTH x i64> * %ret to i64 *
+
+  per_lane(WIDTH, <WIDTH x MASK> %mask, `
+    %sz_LANE_ID = extractelement <WIDTH x i64> %size, i32 LANE
+    %ptr_LANE_ID = call i8 * @malloc(i64 %sz_LANE_ID)
+    %ptr_int_LANE_ID = ptrtoint i8 * %ptr_LANE_ID to i64
+    %store_LANE_ID = getelementptr i64 * %ret64, i32 LANE
+    store i64 %ptr_int_LANE_ID, i64 * %store_LANE_ID')
+
+  %r = load <WIDTH x i64> * %ret
+  ret <WIDTH x i64> %r
+}
+
+define void @__delete_uniform(i8 * %ptr) {
+  call void @free(i8 * %ptr)
+  ret void
+}
+
+define void @__delete_varying(<WIDTH x i64> %ptr, <WIDTH x MASK> %mask) {
+  per_lane(WIDTH, <WIDTH x MASK> %mask, `
+      %iptr_LANE_ID = extractelement <WIDTH x i64> %ptr, i32 LANE
+      %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8 *
+      call void @free(i8 * %ptr_LANE_ID)
+  ')
+  ret void
+}
+
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; read hw clock
 
diff --git a/ctx.cpp b/ctx.cpp
index 0a7dd6d0..8ac64fe5 100644
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -2923,7 +2923,7 @@ FunctionEmitContext::SyncInst() {
 
 
 /** When we gathering from or scattering to a varying atomic type, we need
-    to add an appropraite offset to the final address for each lane right
+    to add an appropriate offset to the final address for each lane right
     before we use it.  Given a varying pointer we're about to use and its
     type, this function determines whether these offsets are needed and
     returns an updated pointer that incorporates these offsets if needed.
diff --git a/docs/ispc.rst b/docs/ispc.rst
index c0dcd6df..5613cfa1 100644
--- a/docs/ispc.rst
+++ b/docs/ispc.rst
@@ -96,6 +96,9 @@ Contents:
 
   + `Declarations and Initializers`_
   + `Expressions`_
+
+    * `Dynamic Memory Allocation`_
+
   + `Control Flow`_
 
     * `Conditional Statements: "if"`_
@@ -1162,6 +1165,7 @@ in C:
 * The ``inline`` qualifier to indicate that a function should be inlined 
 * Function overloading by parameter type
 * Hexadecimal floating-point constants
+* Dynamic memory allocation with ``new`` and ``delete``.
 
 ``ispc`` also adds a number of new features that aren't in C89, C99, or
 C++:
@@ -1966,18 +1970,123 @@ operator also work as expected.
     fp->b = 1;
   
 
+Dynamic Memory Allocation
+-------------------------
+
+``ispc`` programs can dynamically allocate (and free) memory, using syntax
+based on C++'s ``new`` and ``delete`` operators:
+
+::
+
+   int count = ...;
+   int *ptr = new uniform int[count];
+   // use ptr...
+   delete[] ptr;
+
+In the above code, each program instance allocates its own ``count``-sized
+array of ``uniform int`` values, uses that memory, and then deallocates
+that memory.  Uses of ``new`` and ``delete`` in ``ispc`` programs are
+serviced by corresponding calls to the system C library's ``malloc()`` and
+``free()`` functions.
+
+After a pointer has been deleted, it is illegal to access the memory it
+points to.  However, note that deletion happens on a per-program-instance
+basis.  In other words, consider the following code:
+
+::
+
+    int *ptr = new uniform int[count];
+    // use ptr
+    if (count > 1000)
+        delete[] ptr;
+    // ...
+
+Here, the program instances where ``count`` is greater than 1000 have
+deleted the dynamically allocated memory pointed to by ``ptr``, but the
+other program instances have not.  As such, it's illegal for the former set
+of program instances to access ``*ptr``, but it's perfectly fine for the
+latter set to continue to use the memory ``ptr`` points to.  Note that it
+is illegal to delete a pointer value returned by ``new`` more than one
+time.
+ 
+Sometimes, it's useful to be able to do a single allocation for the entire
+gang of program instances.  A ``new`` statement can be qualified with
+``uniform`` to indicate a single memory allocation:
+
+::
+
+    float * uniform ptr = uniform new float[10];
+
+While a regular call to ``new`` returns a ``varying`` pointer (i.e. a
+distinct pointer to separately-allocated memory for each program instance),
+a ``uniform new`` performs a single allocation and returns a ``uniform``
+pointer.
+
+When using ``uniform new``, it's important to be aware of a subtlety; if
+the returned pointer is stored in a varying pointer variable (as may be
+appropriate and useful for the particular program being written), then the
+varying pointer may inadvertently be passed to a subsequent ``delete``
+statement, which is an error.  For example:
+
+::
+
+    float *ptr = uniform new float[10];
+    // use ptr...
+    delete ptr;  // ERROR: varying pointer is deleted
+
+In this case, ``ptr`` will be deleted multiple times, once for each
+executing program instance, which is an error (unless it happens that only
+a single program instance is active in the above code.)
+
+When using ``new`` statements, it's important to make an appropriate choice
+of ``uniform`` or ``varying`` (as always, the default), for both the
+``new`` operator itself as well as the type of data being allocated, based
+on the program's needs.  Consider the following four memory allocations:
+
+::
+
+    uniform float * uniform p1 = uniform new uniform float[10];
+    float * uniform p2 = uniform new float[10];
+    uniform float * p3 = new uniform float[10];
+    float * p4 = new float[10];
+
+Assuming that a ``float`` is 4 bytes in memory and if the gang size is 8
+program instances, then the first allocation represents a single allocation
+of 40 bytes, the second is a single allocation of 8*4*10 = 320 bytes, the
+third is 8 allocations of 40 bytes, and the last performs 8 allocations of
+320 bytes each.
+
+Note in particular that varying allocations of varying data types are rarely
+desirable in practice.  In that case, each program instance is performing a
+separate allocation of ``varying float`` memory.  In this case, it's likely
+that the program instances will only access a single element of each
+``varying float``, which is wasteful.
+
+Although ``ispc`` doesn't support constructors or destructors like C++, it
+is possible to provide initializer values with ``new`` statements:
+
+::
+
+    struct Point { float x, y, z; };
+    Point *pptr = new Point(10, 20, 30);
+
+Here for example, the "x" element of the returned ``Point`` is initialized
+to have the value 10 and so forth.  In general, the rules for how
+initializer values provided in ``new`` statements are used to initialize
+complex data types follow the same rules as initializers for variables
+described in `Declarations and Initializers`_.
+
 Control Flow
 ------------
 
 ``ispc`` supports most of C's control flow constructs, including ``if``,
-``for``, ``while``, ``do``.  It also supports variants of C's control flow
+``switch``, ``for``, ``while``, ``do``.  It has limited support for
+``goto``, detailed below.  It also supports variants of C's control flow
 constructs that provide hints about the expected runtime coherence of the
 control flow at that statement.  It also provides parallel looping
 constructs, ``foreach`` and ``foreach_tiled``, all of which will be
 detailed in this section.
 
-``ispc`` does not currently support ``switch`` statements or ``goto``.
-
 Conditional Statements: "if"
 ----------------------------
 
diff --git a/expr.cpp b/expr.cpp
index 725a1edc..b9872780 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -504,6 +504,153 @@ TypeConvertExpr(Expr *expr, const Type *toType, const char *errorMsgBase) {
 }
 
 
+bool
+PossiblyResolveFunctionOverloads(Expr *expr, const Type *type) {
+    FunctionSymbolExpr *fse = NULL;
+    const FunctionType *funcType = NULL;
+    if (dynamic_cast<const PointerType *>(type) != NULL &&
+        (funcType = dynamic_cast<const FunctionType *>(type->GetBaseType())) &&
+        (fse = dynamic_cast<FunctionSymbolExpr *>(expr)) != NULL) {
+        // We're initializing a function pointer with a function symbol,
+        // which in turn may represent an overloaded function.  So we need
+        // to try to resolve the overload based on the type of the symbol
+        // we're initializing here.
+        std::vector<const Type *> paramTypes;
+        for (int i = 0; i < funcType->GetNumParameters(); ++i)
+            paramTypes.push_back(funcType->GetParameterType(i));
+
+        if (fse->ResolveOverloads(expr->pos, paramTypes) == false)
+            return false;
+    }
+    return true;
+}
+
+
+
+/** Utility routine that emits code to initialize a symbol given an
+    initializer expression.
+
+    @param lvalue    Memory location of storage for the symbol's data
+    @param symType   Type of variable being initialized
+    @param initExpr  Expression for the initializer; may be NULL, in which
+                     case the storage is left uninitialized
+    @param ctx       FunctionEmitContext to use for generating instructions
+    @param pos       Source file position of the variable being initialized
+*/
+void
+InitSymbol(llvm::Value *lvalue, const Type *symType, Expr *initExpr, 
+           FunctionEmitContext *ctx, SourcePos pos) {
+    if (initExpr == NULL)
+        // leave it uninitialized
+        return;
+
+    // If the initializer is a straight up expression that isn't an
+    // ExprList, then we'll see if we can type convert it to the type of
+    // the variable.
+    if (dynamic_cast<ExprList *>(initExpr) == NULL) {
+        if (PossiblyResolveFunctionOverloads(initExpr, symType) == false)
+            return;
+        initExpr = TypeConvertExpr(initExpr, symType, "initializer");
+
+        if (initExpr != NULL) {
+            llvm::Value *initializerValue = initExpr->GetValue(ctx);
+            if (initializerValue != NULL)
+                // Bingo; store the value in the variable's storage
+                ctx->StoreInst(initializerValue, lvalue);
+            return;
+        }
+    }
+
+    // Atomic, enum, and pointer types can only be initialized from a
+    // { ... } expression list if it has exactly one element; otherwise
+    // print an error.  Either way, we're done here.
+    if (dynamic_cast<const AtomicType *>(symType) != NULL ||
+        dynamic_cast<const EnumType *>(symType) != NULL ||
+        dynamic_cast<const PointerType *>(symType) != NULL) {
+        ExprList *elist = dynamic_cast<ExprList *>(initExpr);
+        if (elist != NULL) {
+            if (elist->exprs.size() == 1)
+                InitSymbol(lvalue, symType, elist->exprs[0], ctx, pos);
+            else
+                Error(initExpr->pos, "Expression list initializers can't be used "
+                      "with type \"%s\".", symType->GetString().c_str());
+        }
+        return;
+    }
+
+    const ReferenceType *rt = dynamic_cast<const ReferenceType *>(symType);
+    if (rt) {
+        if (!Type::Equal(initExpr->GetType(), rt)) {
+            Error(initExpr->pos, "Initializer for reference type \"%s\" must have same "
+                  "reference type itself. \"%s\" is incompatible.", 
+                  rt->GetString().c_str(), initExpr->GetType()->GetString().c_str());
+            return;
+        }
+
+        llvm::Value *initializerValue = initExpr->GetValue(ctx);
+        if (initializerValue)
+            ctx->StoreInst(initializerValue, lvalue);
+        return;
+    }
+
+    // There are two cases for initializing structs, arrays and vectors;
+    // either a single initializer may be provided (float foo[3] = 0;), in
+    // which case all of the elements are initialized to the given value,
+    // or an initializer list may be provided (float foo[3] = { 1,2,3 }),
+    // in which case the elements are initialized with the corresponding
+    // values.
+    const CollectionType *collectionType = 
+        dynamic_cast<const CollectionType *>(symType);
+    if (collectionType != NULL) {
+        std::string name;
+        if (dynamic_cast<const StructType *>(symType) != NULL)
+            name = "struct";
+        else if (dynamic_cast<const ArrayType *>(symType) != NULL) 
+            name = "array";
+        else if (dynamic_cast<const VectorType *>(symType) != NULL) 
+            name = "vector";
+        else 
+            FATAL("Unexpected CollectionType in InitSymbol()");
+
+        ExprList *exprList = dynamic_cast<ExprList *>(initExpr);
+        if (exprList != NULL) {
+            // The { ... } case; make sure we have the same number of
+            // expressions in the ExprList as the collection has elements
+            int nInits = exprList->exprs.size();
+            if (nInits != collectionType->GetElementCount()) {
+                Error(initExpr->pos, "Initializer for %s type \"%s\" requires "
+                      "%d values; %d provided.", name.c_str(), 
+                      symType->GetString().c_str(),
+                      collectionType->GetElementCount(), nInits);
+                return;
+            }
+
+            // Initialize each element with the corresponding value from
+            // the ExprList
+            for (int i = 0; i < nInits; ++i) {
+                llvm::Value *ep;
+                if (dynamic_cast<const StructType *>(symType) != NULL)
+                    ep = ctx->AddElementOffset(lvalue, i, NULL, "element");
+                else
+                    ep = ctx->GetElementPtrInst(lvalue, LLVMInt32(0), LLVMInt32(i), 
+                                                PointerType::GetUniform(collectionType->GetElementType(i)), 
+                                                "gep");
+
+                InitSymbol(ep, collectionType->GetElementType(i), 
+                            exprList->exprs[i], ctx, pos);
+            }
+        }
+        else
+            Error(initExpr->pos, "Can't assign type \"%s\" to \"%s\".",
+                  initExpr->GetType()->GetString().c_str(),
+                  collectionType->GetString().c_str());
+        return;
+    }
+
+    FATAL("Unexpected Type in InitSymbol()");
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 
 /** Given an atomic or vector type, this returns a boolean type with the
@@ -6527,3 +6674,211 @@ NullPointerExpr::EstimateCost() const {
     return 0;
 }
 
+
+///////////////////////////////////////////////////////////////////////////
+// NewExpr
+
+NewExpr::NewExpr(int typeQual, const Type *t, Expr *init, Expr *count, 
+                 SourcePos tqPos, SourcePos p)
+    : Expr(p) {
+    allocType = t;
+    if (allocType != NULL && allocType->HasUnboundVariability())
+        allocType = allocType->ResolveUnboundVariability(Type::Varying);
+
+    initExpr = init;
+    countExpr = count;
+
+    /* (The below cases actually should be impossible, since the parser
+       doesn't allow more than a single type qualifier before a "new".) */
+    if ((typeQual & ~(TYPEQUAL_UNIFORM | TYPEQUAL_VARYING)) != 0) {
+        Error(tqPos, "Illegal type qualifiers in \"new\" expression (only "
+              "\"uniform\" and \"varying\" are allowed.");
+        isVarying = false;
+    }
+    else if ((typeQual & TYPEQUAL_UNIFORM) != 0 &&
+             (typeQual & TYPEQUAL_VARYING) != 0) {
+        Error(tqPos, "Illegal to provide both \"uniform\" and \"varying\" "
+              "qualifiers to \"new\" expression.");
+        isVarying = false;
+    }
+    else
+        // If no type qualifier is given before the 'new', treat it as a
+        // varying new.
+        isVarying = (typeQual == 0) || (typeQual & TYPEQUAL_VARYING);
+}
+
+
+llvm::Value *
+NewExpr::GetValue(FunctionEmitContext *ctx) const {
+    bool do32Bit = (g->target.is32Bit || g->opt.force32BitAddressing);
+
+    // Determine how many elements we need to allocate.  Note that this
+    // will be a varying value if this is a varying new.
+    llvm::Value *countValue;
+    if (countExpr != NULL) {
+        countValue = countExpr->GetValue(ctx);
+        if (countValue == NULL) {
+            Assert(m->errorCount > 0);
+            return NULL;
+        }
+    }
+    else {
+        if (isVarying) {
+            if (do32Bit) countValue = LLVMInt32Vector(1);
+            else         countValue = LLVMInt64Vector(1);
+        }
+        else {
+            if (do32Bit) countValue = LLVMInt32(1);
+            else         countValue = LLVMInt64(1);
+        }
+    }
+
+    // Compute the total amount of memory to allocate, allocSize, as the
+    // product of the number of elements to allocate and the size of a
+    // single element.
+    llvm::Value *eltSize = g->target.SizeOf(allocType->LLVMType(g->ctx), 
+                                            ctx->GetCurrentBasicBlock());
+    if (isVarying)
+        eltSize = ctx->SmearUniform(eltSize, "smear_size");
+    llvm::Value *allocSize = ctx->BinaryOperator(llvm::Instruction::Mul, countValue,
+                                                 eltSize, "alloc_size");
+
+    // Determine which allocation builtin function to call: uniform or
+    // varying, and taking 32-bit or 64-bit allocation counts.
+    llvm::Function *func;
+    if (isVarying) {
+        if (do32Bit)
+            func = m->module->getFunction("__new_varying32");
+        else
+            func = m->module->getFunction("__new_varying64");
+    }
+    else {
+        if (allocSize->getType() != LLVMTypes::Int64Type)
+            allocSize = ctx->SExtInst(allocSize, LLVMTypes::Int64Type,
+                                      "alloc_size64");
+        func = m->module->getFunction("__new_uniform");
+    }
+    Assert(func != NULL);
+
+    // Make the call for the actual allocation.
+    llvm::Value *ptrValue = ctx->CallInst(func, NULL, allocSize, "new");
+
+    // Now handle initializers and returning the right type for the result.
+    const Type *retType = GetType();
+    if (retType == NULL)
+        return NULL;
+    if (isVarying) {
+        if (g->target.is32Bit)
+            // Convert i64 vector values to i32 if we are compiling to a
+            // 32-bit target.
+            ptrValue = ctx->TruncInst(ptrValue, LLVMTypes::VoidPointerVectorType,
+                                      "ptr_to_32bit");
+
+        if (initExpr != NULL) {
+            // If we have an initializer expression, emit code that checks
+            // to see if each lane is active and if so, runs the code to do
+            // the initialization.  Note that we're taking advantage
+            // of the fact that the __new_varying*() functions are
+            // implemented to return NULL for program instances that aren't
+            // executing; more generally, we should be using the current
+            // execution mask for this...
+            for (int i = 0; i < g->target.vectorWidth; ++i) {
+                llvm::BasicBlock *bbInit = ctx->CreateBasicBlock("init_ptr");
+                llvm::BasicBlock *bbSkip = ctx->CreateBasicBlock("skip_init");
+                llvm::Value *p = ctx->ExtractInst(ptrValue, i);
+                llvm::Value *nullValue = g->target.is32Bit ? LLVMInt32(0) :
+                    LLVMInt64(0);
+                // Is the pointer for the current lane non-zero?
+                llvm::Value *nonNull = ctx->CmpInst(llvm::Instruction::ICmp,
+                                                    llvm::CmpInst::ICMP_NE,
+                                                    p, nullValue, "non_null");
+                ctx->BranchInst(bbInit, bbSkip, nonNull);
+
+                // Initialize the memory pointed to by the pointer for the
+                // current lane.
+                ctx->SetCurrentBasicBlock(bbInit);
+                LLVM_TYPE_CONST llvm::Type *ptrType = 
+                    retType->GetAsUniformType()->LLVMType(g->ctx);
+                llvm::Value *ptr = ctx->IntToPtrInst(p, ptrType);
+                InitSymbol(ptr, allocType, initExpr, ctx, pos);
+                ctx->BranchInst(bbSkip);
+
+                ctx->SetCurrentBasicBlock(bbSkip);
+            }
+        }
+
+        return ptrValue;
+    }
+    else {
+        // For uniform news, we just need to cast the void * to be a
+        // pointer of the return type and to run the code for initializers,
+        // if present.
+        LLVM_TYPE_CONST llvm::Type *ptrType = retType->LLVMType(g->ctx);
+        ptrValue = ctx->BitCastInst(ptrValue, ptrType, "cast_new_ptr");
+
+        if (initExpr != NULL)
+            InitSymbol(ptrValue, allocType, initExpr, ctx, pos);
+
+        return ptrValue;
+    }
+}
+
+
+const Type *
+NewExpr::GetType() const {
+    if (allocType == NULL)
+        return NULL;
+
+    return isVarying ? PointerType::GetVarying(allocType) :
+        PointerType::GetUniform(allocType);
+}
+
+
+Expr *
+NewExpr::TypeCheck() {
+    // Here we only need to make sure that if we have an expression giving
+    // a number of elements to allocate that it can be converted to an
+    // integer of the appropriate variability.
+    if (countExpr == NULL)
+        return this;
+
+    const Type *countType;
+    if ((countType = countExpr->GetType()) == NULL)
+        return NULL;
+
+    if (isVarying == false && countType->IsVaryingType()) {
+        Error(pos, "Illegal to provide \"varying\" allocation count with "
+              "\"uniform new\" expression.");
+        return NULL;
+    }
+
+    // Figure out the type that the allocation count should be
+    const Type *t = (g->target.is32Bit || g->opt.force32BitAddressing) ?
+        AtomicType::UniformUInt32 : AtomicType::UniformUInt64;
+    if (isVarying)
+        t = t->GetAsVaryingType();
+
+    countExpr = TypeConvertExpr(countExpr, t, "item count");
+    if (countExpr == NULL)
+        return NULL;
+
+    return this;
+}
+
+
+Expr *
+NewExpr::Optimize() {
+    return this;
+}
+
+
+void
+NewExpr::Print() const {
+    printf("new (%s)", allocType ? allocType->GetString().c_str() : "NULL");
+}
+
+
+int
+NewExpr::EstimateCost() const {
+    return COST_NEW;
+}
diff --git a/expr.h b/expr.h
index 48388475..3bc74d49 100644
--- a/expr.h
+++ b/expr.h
@@ -685,6 +685,38 @@ public:
 };
 
 
+/** An expression representing a "new" expression, used for dynamically
+    allocating memory. 
+*/
+class NewExpr : public Expr {
+public:
+    NewExpr(int typeQual, const Type *type, Expr *initializer, Expr *count, 
+            SourcePos tqPos, SourcePos p);
+
+    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
+    const Type *GetType() const;
+    Expr *TypeCheck();
+    Expr *Optimize();
+    void Print() const;
+    int EstimateCost() const;
+
+    /** Type of object to allocate storage for. */
+    const Type *allocType;
+    /** Expression giving the number of elements to allocate, when the 
+        "new Foo[expr]" form is used.  This may be NULL, in which case a
+        single element of the given type will be allocated. */
+    Expr *countExpr;
+    /** Optional initializer expression used to initialize the allocated
+        memory. */
+    Expr *initExpr;
+    /** Indicates whether this is a "varying new" or "uniform new"
+        (i.e. whether a separate allocation is performed per program
+        instance, or whether a single allocation is performed for the
+        entire gang of program instances.) */
+    bool isVarying;
+};
+
+
 /** This function indicates whether it's legal to convert from fromType to
     toType.  If the optional errorMsgBase and source position parameters
     are provided, then an error message is issued if the type conversion
@@ -703,4 +735,20 @@ bool CanConvertTypes(const Type *fromType, const Type *toType,
  */
 Expr *TypeConvertExpr(Expr *expr, const Type *toType, const char *errorMsgBase);
 
+/** Utility routine that emits code to initialize a symbol given an
+    initializer expression.
+
+    @param lvalue    Memory location of storage for the symbol's data
+    @param symType   Type of variable being initialized
+    @param initExpr  Expression for the initializer; may be NULL, in which
+                     case the storage is left uninitialized
+    @param ctx       FunctionEmitContext to use for generating instructions
+    @param pos       Source file position of the variable being initialized
+*/
+void
+InitSymbol(llvm::Value *lvalue, const Type *symType, Expr *initExpr,
+           FunctionEmitContext *ctx, SourcePos pos);
+
+bool PossiblyResolveFunctionOverloads(Expr *expr, const Type *type);
+
 #endif // ISPC_EXPR_H
diff --git a/ispc.h b/ispc.h
index 009470e2..9ebfef53 100644
--- a/ispc.h
+++ b/ispc.h
@@ -418,6 +418,7 @@ enum {
     COST_ASSIGN = 1,
     COST_COHERENT_BREAK_CONTINE = 4,
     COST_COMPLEX_ARITH_OP = 4,
+    COST_DELETE = 32,
     COST_DEREF = 4,
     COST_FUNCALL = 4,
     COST_FUNPTR_UNIFORM = 12,
@@ -425,6 +426,7 @@ enum {
     COST_GATHER = 8,
     COST_GOTO = 4,
     COST_LOAD = 2,
+    COST_NEW = 32,
     COST_REGULAR_BREAK_CONTINUE = 2,
     COST_RETURN = 4,
     COST_SELECT = 4,
diff --git a/lex.ll b/lex.ll
index f21df180..9797e4e5 100644
--- a/lex.ll
+++ b/lex.ll
@@ -93,6 +93,8 @@ continue { return TOKEN_CONTINUE; }
 creturn { return TOKEN_CRETURN; }
 default { return TOKEN_DEFAULT; }
 do { return TOKEN_DO; }
+delete { return TOKEN_DELETE; }
+delete\[\] { return TOKEN_DELETE; }
 double { return TOKEN_DOUBLE; }
 else { return TOKEN_ELSE; }
 enum { return TOKEN_ENUM; }
@@ -112,6 +114,7 @@ int16 { return TOKEN_INT16; }
 int32 { return TOKEN_INT; }
 int64 { return TOKEN_INT64; }
 launch { return TOKEN_LAUNCH; }
+new { return TOKEN_NEW; }
 NULL { return TOKEN_NULL; }
 print { return TOKEN_PRINT; }
 reference { Error(*yylloc, "\"reference\" qualifier is no longer supported; "
diff --git a/parse.yy b/parse.yy
index 4be1ab7e..de93bf77 100644
--- a/parse.yy
+++ b/parse.yy
@@ -106,13 +106,14 @@ static void lFinalizeEnumeratorSymbols(std::vector<Symbol *> &enums,
                                        const EnumType *enumType);
 
 static const char *lBuiltinTokens[] = {
-    "assert", "bool", "break", "case", "cbreak", "ccontinue", "cdo", "cfor",
-    "cif", "cwhile", "const", "continue", "creturn", "default", "do", "double", 
-    "else", "enum", "export", "extern", "false", "float", "for", "foreach",
-    "foreach_tiled", "goto", "if", "inline", "int", "int8", "int16",
-    "int32", "int64", "launch", "NULL", "print", "return", "signed", "sizeof",
-    "static", "struct", "switch", "sync", "task", "true", "typedef", "uniform",
-    "unsigned", "varying", "void", "while", NULL 
+    "assert", "bool", "break", "case", "cbreak", "ccontinue", "cdo",
+    "cfor", "cif", "cwhile", "const", "continue", "creturn", "default",
+    "do", "delete", "double", "else", "enum", "export", "extern", "false",
+    "float", "for", "foreach", "foreach_tiled", "goto", "if", "inline",
+    "int", "int8", "int16", "int32", "int64", "launch", "new", "NULL",
+    "print", "return", "signed", "sizeof", "static", "struct", "switch",
+    "sync", "task", "true", "typedef", "uniform", "unsigned", "varying",
+    "void", "while", NULL 
 };
 
 static const char *lParamListTokens[] = {
@@ -170,7 +171,7 @@ struct ForeachDimension {
 %token TOKEN_AND_OP TOKEN_OR_OP TOKEN_MUL_ASSIGN TOKEN_DIV_ASSIGN TOKEN_MOD_ASSIGN 
 %token TOKEN_ADD_ASSIGN TOKEN_SUB_ASSIGN TOKEN_LEFT_ASSIGN TOKEN_RIGHT_ASSIGN 
 %token TOKEN_AND_ASSIGN TOKEN_OR_ASSIGN TOKEN_XOR_ASSIGN
-%token TOKEN_SIZEOF
+%token TOKEN_SIZEOF TOKEN_NEW TOKEN_DELETE
 
 %token TOKEN_EXTERN TOKEN_EXPORT TOKEN_STATIC TOKEN_INLINE TOKEN_TASK 
 %token TOKEN_UNIFORM TOKEN_VARYING TOKEN_TYPEDEF TOKEN_SOA
@@ -189,7 +190,7 @@ struct ForeachDimension {
 %type <expr> multiplicative_expression additive_expression shift_expression
 %type <expr> relational_expression equality_expression and_expression
 %type <expr> exclusive_or_expression inclusive_or_expression
-%type <expr> logical_and_expression logical_or_expression 
+%type <expr> logical_and_expression logical_or_expression new_expression
 %type <expr> conditional_expression assignment_expression expression
 %type <expr> initializer constant_expression for_test
 %type <exprList> argument_expression_list initializer_list
@@ -197,7 +198,7 @@ struct ForeachDimension {
 %type <stmt> statement labeled_statement compound_statement for_init_statement
 %type <stmt> expression_statement selection_statement iteration_statement
 %type <stmt> jump_statement statement_list declaration_statement print_statement
-%type <stmt> assert_statement sync_statement
+%type <stmt> assert_statement sync_statement delete_statement
 
 %type <declaration> declaration parameter_declaration
 %type <declarators> init_declarator_list 
@@ -215,7 +216,7 @@ struct ForeachDimension {
 %type <enumType> enum_specifier
 
 %type <type> specifier_qualifier_list struct_or_union_specifier
-%type <type> type_specifier type_name
+%type <type> type_specifier type_name rate_qualified_new_type
 %type <type> short_vec_specifier
 %type <atomicType> atomic_var_type_specifier
 
@@ -225,7 +226,7 @@ struct ForeachDimension {
 
 %type <stringVal> string_constant
 %type <constCharPtr> struct_or_union_name enum_identifier goto_identifier
-%type <intVal> int_constant soa_width_specifier
+%type <intVal> int_constant soa_width_specifier rate_qualified_new
 
 %type <foreachDimension> foreach_dimension_specifier
 %type <foreachDimensionList> foreach_dimension_list
@@ -448,8 +449,36 @@ conditional_expression
       { $$ = new SelectExpr($1, $3, $5, Union(@1,@5)); }
     ;
 
-assignment_expression
+rate_qualified_new
+    : TOKEN_NEW { $$ = 0; }
+    | TOKEN_UNIFORM TOKEN_NEW { $$ = TYPEQUAL_UNIFORM; }
+    | TOKEN_VARYING TOKEN_NEW { $$ = TYPEQUAL_VARYING; }
+    ;
+
+rate_qualified_new_type
+    : type_specifier { $$ = $1; }
+    | TOKEN_UNIFORM type_specifier { $$ = $2->GetAsUniformType(); }
+    | TOKEN_VARYING type_specifier { $$ = $2->GetAsVaryingType(); }
+    ;
+
+new_expression
     : conditional_expression
+    | rate_qualified_new rate_qualified_new_type
+    {
+        $$ = new NewExpr($1, $2, NULL, NULL, @1, Union(@1, @2));
+    }
+    | rate_qualified_new rate_qualified_new_type '(' initializer_list ')'
+    {
+        $$ = new NewExpr($1, $2, $4, NULL, @1, Union(@1, @2));
+    }
+    | rate_qualified_new rate_qualified_new_type '[' expression ']'
+    {
+        $$ = new NewExpr($1, $2, NULL, $4, @1, Union(@1, @4));
+    }
+    ;
+
+assignment_expression
+    : new_expression
     | unary_expression '=' assignment_expression
       { $$ = new AssignExpr(AssignExpr::Assign, $1, $3, Union(@1, @3)); }
     | unary_expression TOKEN_MUL_ASSIGN assignment_expression
@@ -1240,6 +1269,7 @@ statement
     | print_statement
     | assert_statement
     | sync_statement
+    | delete_statement
     | error
     {
         std::vector<std::string> builtinTokens;
@@ -1473,6 +1503,13 @@ sync_statement
       { $$ = new ExprStmt(new SyncExpr(@1), @1); }
     ;
 
+delete_statement
+    : TOKEN_DELETE expression ';'
+    {
+        $$ = new DeleteStmt($2, Union(@1, @2));
+    }
+    ;
+
 print_statement
     : TOKEN_PRINT '(' string_constant ')' ';'
       {
diff --git a/stmt.cpp b/stmt.cpp
index fda693e5..5e5fe27d 100644
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -119,153 +119,6 @@ DeclStmt::DeclStmt(const std::vector<VariableDeclaration> &v, SourcePos p)
 }
 
 
-static bool
-lPossiblyResolveFunctionOverloads(Expr *expr, const Type *type) {
-    FunctionSymbolExpr *fse = NULL;
-    const FunctionType *funcType = NULL;
-    if (dynamic_cast<const PointerType *>(type) != NULL &&
-        (funcType = dynamic_cast<const FunctionType *>(type->GetBaseType())) &&
-        (fse = dynamic_cast<FunctionSymbolExpr *>(expr)) != NULL) {
-        // We're initializing a function pointer with a function symbol,
-        // which in turn may represent an overloaded function.  So we need
-        // to try to resolve the overload based on the type of the symbol
-        // we're initializing here.
-        std::vector<const Type *> paramTypes;
-        for (int i = 0; i < funcType->GetNumParameters(); ++i)
-            paramTypes.push_back(funcType->GetParameterType(i));
-
-        if (fse->ResolveOverloads(expr->pos, paramTypes) == false)
-            return false;
-    }
-    return true;
-}
-
-
-/** Utility routine that emits code to initialize a symbol given an
-    initializer expression.
-
-    @param lvalue    Memory location of storage for the symbol's data
-    @param symName   Name of symbol (used in error messages)
-    @param symType   Type of variable being initialized
-    @param initExpr  Expression for the initializer
-    @param ctx       FunctionEmitContext to use for generating instructions
-    @param pos       Source file position of the variable being initialized
-*/
-static void
-lInitSymbol(llvm::Value *lvalue, const char *symName, const Type *symType,
-            Expr *initExpr, FunctionEmitContext *ctx, SourcePos pos) {
-    if (initExpr == NULL)
-        // leave it uninitialized
-        return;
-
-    // If the initializer is a straight up expression that isn't an
-    // ExprList, then we'll see if we can type convert it to the type of
-    // the variable.
-    if (dynamic_cast<ExprList *>(initExpr) == NULL) {
-        if (lPossiblyResolveFunctionOverloads(initExpr, symType) == false)
-            return;
-        initExpr = TypeConvertExpr(initExpr, symType, "initializer");
-
-        if (initExpr != NULL) {
-            llvm::Value *initializerValue = initExpr->GetValue(ctx);
-            if (initializerValue != NULL)
-                // Bingo; store the value in the variable's storage
-                ctx->StoreInst(initializerValue, lvalue);
-            return;
-        }
-    }
-
-    // Atomic types and enums can't be initialized with { ... } initializer
-    // expressions, so print an error and return if that's what we've got
-    // here..
-    if (dynamic_cast<const AtomicType *>(symType) != NULL ||
-        dynamic_cast<const EnumType *>(symType) != NULL ||
-        dynamic_cast<const PointerType *>(symType) != NULL) {
-        ExprList *elist = dynamic_cast<ExprList *>(initExpr);
-        if (elist != NULL) {
-            if (elist->exprs.size() == 1)
-                lInitSymbol(lvalue, symName, symType, elist->exprs[0], ctx,
-                            pos);
-            else
-                Error(initExpr->pos, "Expression list initializers can't be used for "
-                      "variable \"%s\' with type \"%s\".", symName,
-                      symType->GetString().c_str());
-        }
-        return;
-    }
-
-    const ReferenceType *rt = dynamic_cast<const ReferenceType *>(symType);
-    if (rt) {
-        if (!Type::Equal(initExpr->GetType(), rt)) {
-            Error(initExpr->pos, "Initializer for reference type \"%s\" must have same "
-                  "reference type itself. \"%s\" is incompatible.", 
-                  rt->GetString().c_str(), initExpr->GetType()->GetString().c_str());
-            return;
-        }
-
-        llvm::Value *initializerValue = initExpr->GetValue(ctx);
-        if (initializerValue)
-            ctx->StoreInst(initializerValue, lvalue);
-        return;
-    }
-
-    // There are two cases for initializing structs, arrays and vectors;
-    // either a single initializer may be provided (float foo[3] = 0;), in
-    // which case all of the elements are initialized to the given value,
-    // or an initializer list may be provided (float foo[3] = { 1,2,3 }),
-    // in which case the elements are initialized with the corresponding
-    // values.
-    const CollectionType *collectionType = 
-        dynamic_cast<const CollectionType *>(symType);
-    if (collectionType != NULL) {
-        std::string name;
-        if (dynamic_cast<const StructType *>(symType) != NULL)
-            name = "struct";
-        else if (dynamic_cast<const ArrayType *>(symType) != NULL) 
-            name = "array";
-        else if (dynamic_cast<const VectorType *>(symType) != NULL) 
-            name = "vector";
-        else 
-            FATAL("Unexpected CollectionType in lInitSymbol()");
-
-        ExprList *exprList = dynamic_cast<ExprList *>(initExpr);
-        if (exprList != NULL) {
-            // The { ... } case; make sure we have the same number of
-            // expressions in the ExprList as we have struct members
-            int nInits = exprList->exprs.size();
-            if (nInits != collectionType->GetElementCount()) {
-                Error(initExpr->pos, "Initializer for %s \"%s\" requires "
-                      "%d values; %d provided.", name.c_str(), symName, 
-                      collectionType->GetElementCount(), nInits);
-                return;
-            }
-
-            // Initialize each element with the corresponding value from
-            // the ExprList
-            for (int i = 0; i < nInits; ++i) {
-                llvm::Value *ep;
-                if (dynamic_cast<const StructType *>(symType) != NULL)
-                    ep = ctx->AddElementOffset(lvalue, i, NULL, "element");
-                else
-                    ep = ctx->GetElementPtrInst(lvalue, LLVMInt32(0), LLVMInt32(i), 
-                                                PointerType::GetUniform(collectionType->GetElementType(i)), 
-                                                "gep");
-
-                lInitSymbol(ep, symName, collectionType->GetElementType(i), 
-                            exprList->exprs[i], ctx, pos);
-            }
-        }
-        else
-            Error(initExpr->pos, "Can't assign type \"%s\" to \"%s\".",
-                  initExpr->GetType()->GetString().c_str(),
-                  collectionType->GetString().c_str());
-        return;
-    }
-
-    FATAL("Unexpected Type in lInitSymbol()");
-}
-
-
 static bool
 lHasUnsizedArrays(const Type *type) {
     const ArrayType *at = dynamic_cast<const ArrayType *>(type);
@@ -333,7 +186,7 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
             // zero value.
             llvm::Constant *cinit = NULL;
             if (initExpr != NULL) {
-                if (lPossiblyResolveFunctionOverloads(initExpr, sym->type) == false)
+                if (PossiblyResolveFunctionOverloads(initExpr, sym->type) == false)
                     continue;
                 // FIXME: we only need this for function pointers; it was
                 // already done for atomic types and enums in
@@ -377,8 +230,7 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
 
             // And then get it initialized...
             sym->parentFunction = ctx->GetFunction();
-            lInitSymbol(sym->storagePtr, sym->name.c_str(), sym->type, 
-                        initExpr, ctx, sym->pos);
+            InitSymbol(sym->storagePtr, sym->type, initExpr, ctx, sym->pos);
         }
     }
 }
@@ -646,6 +498,15 @@ lCheckAllOffSafety(ASTNode *node, void *data) {
         return false;
     }
 
+    if (dynamic_cast<NewExpr *>(node) != NULL ||
+        dynamic_cast<DeleteStmt *>(node) != NULL) {
+        // We definitely don't want to run the uniform variants of these if
+        // the mask is all off.  It's also worth skipping the overhead of
+        // executing the varying versions of them in the all-off mask case.
+        *okPtr = false;
+        return false;
+    }
+
     if (g->target.allOffMaskIsSafe == true)
         // Don't worry about memory accesses if we have a target that can
         // safely run them with the mask all off
@@ -2880,3 +2741,82 @@ AssertStmt::EstimateCost() const {
     return COST_ASSERT;
 }
 
+
+///////////////////////////////////////////////////////////////////////////
+// DeleteStmt
+
+DeleteStmt::DeleteStmt(Expr *e, SourcePos p)
+    : Stmt(p),
+      expr(e) {  // keep the expression whose pointer value is to be deleted
+}
+
+
+void
+DeleteStmt::EmitCode(FunctionEmitContext *ctx) const {
+    const Type *exprType;
+    if (expr == NULL || ((exprType = expr->GetType()) == NULL)) {
+        Assert(m->errorCount > 0);  // expected only after an earlier reported error
+        return;
+    }
+
+    llvm::Value *exprValue = expr->GetValue(ctx);
+    if (exprValue == NULL) {
+        Assert(m->errorCount > 0);  // likewise: no value implies an earlier error
+        return;
+    }
+
+    // DeleteStmt::TypeCheck() rejects non-pointer operands, so this must hold.
+    Assert(dynamic_cast<const PointerType *>(exprType) != NULL);
+
+    if (exprType->IsUniformType()) {
+        // For deletion of a uniform pointer, we just need to cast the
+        // pointer type to a void pointer type, to match what
+        // __delete_uniform() from the builtins expects.
+        exprValue = ctx->BitCastInst(exprValue, LLVMTypes::VoidPointerType,
+                                     "ptr_to_void");
+        llvm::Function *func = m->module->getFunction("__delete_uniform");
+        Assert(func != NULL);
+
+        ctx->CallInst(func, NULL, exprValue, "");
+    }
+    else {
+        // Varying pointers are arrays of ints, and __delete_varying()
+        // takes a vector of i64s (even for 32-bit targets).  Therefore, we
+        // only need to extend to 64-bit values on 32-bit targets before
+        // calling it.
+        llvm::Function *func = m->module->getFunction("__delete_varying");
+        Assert(func != NULL);
+        if (g->target.is32Bit)
+            exprValue = ctx->ZExtInst(exprValue, LLVMTypes::Int64VectorType,
+                                      "ptr_to_64");
+        ctx->CallInst(func, NULL, exprValue, "");
+    }
+}
+
+
+void
+DeleteStmt::Print(int indent) const {
+    printf("%*cDelete Stmt", indent, ' ');  // "%*c" prints a space in a field `indent` wide, i.e. the indentation
+}
+
+
+Stmt *
+DeleteStmt::TypeCheck() {
+    // Without an expression, or without a type for it, there is nothing to check.
+    const Type *operandType = (expr != NULL) ? expr->GetType() : NULL;
+    if (operandType == NULL)
+        return NULL;
+    // Only pointer-typed operands may be deleted; those pass through unchanged.
+    if (dynamic_cast<const PointerType *>(operandType) != NULL)
+        return this;
+
+    Error(pos, "Illegal to delete non-pointer type \"%s\".",
+          operandType->GetString().c_str());
+    return NULL;
+}
+
+
+int
+DeleteStmt::EstimateCost() const {
+    return COST_DELETE;  // flat cost for the statement itself; nothing is added for the operand here
+}
diff --git a/stmt.h b/stmt.h
index 8b22603a..f557a3f3 100644
--- a/stmt.h
+++ b/stmt.h
@@ -442,4 +442,21 @@ public:
     Expr *expr;
 };
 
+
+/** Representation of a delete statement in the program; EmitCode() turns it
+    into a call to the __delete_uniform or __delete_varying builtin. */
+class DeleteStmt : public Stmt {
+public:
+    DeleteStmt(Expr *e, SourcePos p);
+
+    void EmitCode(FunctionEmitContext *ctx) const;
+    void Print(int indent) const;
+
+    Stmt *TypeCheck();
+    int EstimateCost() const;
+
+    /** Expression that gives the pointer value to be deleted. */
+    Expr *expr;
+};
+
 #endif // ISPC_STMT_H
diff --git a/tests/new-delete-1.ispc b/tests/new-delete-1.ispc
new file mode 100644
index 00000000..f8fc1599
--- /dev/null
+++ b/tests/new-delete-1.ispc
@@ -0,0 +1,15 @@
+
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    float * uniform buf = uniform new float[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        buf[i] = i;
+    RET[programIndex] = buf[a-1];
+    delete buf;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = programIndex;
+}
diff --git a/tests/new-delete-2.ispc b/tests/new-delete-2.ispc
new file mode 100644
index 00000000..bf24a0c4
--- /dev/null
+++ b/tests/new-delete-2.ispc
@@ -0,0 +1,15 @@
+
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    uniform float * uniform buf = uniform new uniform float[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        buf[i] = i;
+    RET[programIndex] = buf[a-1];
+    delete buf;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = programIndex;
+}
diff --git a/tests/new-delete-3.ispc b/tests/new-delete-3.ispc
new file mode 100644
index 00000000..676f9886
--- /dev/null
+++ b/tests/new-delete-3.ispc
@@ -0,0 +1,17 @@
+
+export uniform int width() { return programCount; }
+
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    float * uniform buf = uniform new float[programCount+1];
+    for (uniform int i = 0; i < programCount+1; ++i) {
+        buf[i] = i+a;
+    }
+    RET[programIndex] = buf[a];
+    delete buf;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 2 + 2*programIndex;
+}
diff --git a/tests/new-delete-4.ispc b/tests/new-delete-4.ispc
new file mode 100644
index 00000000..de5488b9
--- /dev/null
+++ b/tests/new-delete-4.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    float * uniform buf = uniform new float(2*b);
+    RET[programIndex] = buf[0];
+    delete[] buf;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 10;
+}
diff --git a/tests/new-delete-5.ispc b/tests/new-delete-5.ispc
new file mode 100644
index 00000000..ab99df2e
--- /dev/null
+++ b/tests/new-delete-5.ispc
@@ -0,0 +1,17 @@
+
+export uniform int width() { return programCount; }
+
+struct Point {
+    uniform float x, y, z;
+};
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    varying Point * uniform buf = uniform new varying Point(a, b, 1234.);
+    RET[programIndex] = buf->y;
+    delete buf;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 5;
+}
diff --git a/tests/new-delete-6.ispc b/tests/new-delete-6.ispc
new file mode 100644
index 00000000..90018b93
--- /dev/null
+++ b/tests/new-delete-6.ispc
@@ -0,0 +1,17 @@
+
+export uniform int width() { return programCount; }
+
+struct Point {
+    float x, y, z;
+};
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    Point * varying buf = new Point(0., b, a);
+    RET[programIndex] = buf->z;
+    delete buf;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1+programIndex;
+}
diff --git a/tests_errors/func-call-through-variable.ispc b/tests_errors/func-call-through-variable.ispc
new file mode 100644
index 00000000..3a857e79
--- /dev/null
+++ b/tests_errors/func-call-through-variable.ispc
@@ -0,0 +1,47 @@
+// Must provide function name or function pointer for function call expression
+
+export void saxpy_ispc(uniform int N,
+uniform float scale,
+uniform float X[],
+uniform float Y[],
+uniform float result[])
+{
+foreach (i = 0 ... N) {
+result[i] = scale * X[i] + Y[i];
+}
+}
+
+task void saxpy_ispc_task(uniform int N,
+uniform int span,
+uniform float scale,
+uniform float X[],
+uniform float Y[],
+uniform float result[])
+{
+uniform int indexStart;
+uniform int indexEnd;
+indexStart = (taskIndex * span);
+indexEnd = min(N, indexStart + (span)/8);
+foreach (i = indexStart ... indexEnd) {
+result[i] = scale * X[i] + Y[i];
+}
+uniform int k =0;
+for (k=0; k<8;k++) {
+indexStart = (((7-taskIndex-k)%8) * span) + k(span/8);
+indexEnd = min(N, indexStart + (span)/8);
+foreach (i = indexStart ... indexEnd) {
+result[i] = scale * X[i] + Y[i];
+}
+}
+}
+export void saxpy_ispc_withtasks(uniform int N,
+uniform float scale,
+uniform float X[],
+uniform float Y[],
+uniform float result[])
+{
+
+uniform int span = N / 8;  // 8 tasks
+
+launch[N/span] < saxpy_ispc_task(N, span, scale, X, Y, result) >;
+}
diff --git a/tests_errors/new-delete-1.ispc b/tests_errors/new-delete-1.ispc
new file mode 100644
index 00000000..551b66d7
--- /dev/null
+++ b/tests_errors/new-delete-1.ispc
@@ -0,0 +1,5 @@
+// Illegal to delete non-pointer type
+
+void func(int a) {
+    delete a;
+}
diff --git a/tests_errors/new-delete-2.ispc b/tests_errors/new-delete-2.ispc
new file mode 100644
index 00000000..fbbb1cd6
--- /dev/null
+++ b/tests_errors/new-delete-2.ispc
@@ -0,0 +1,5 @@
+// Syntax error
+
+int * func(int a) {
+    return const new int[a];
+}
diff --git a/tests_errors/new-delete-3.ispc b/tests_errors/new-delete-3.ispc
new file mode 100644
index 00000000..e34ea98d
--- /dev/null
+++ b/tests_errors/new-delete-3.ispc
@@ -0,0 +1,5 @@
+// Syntax error
+
+int * func(int a) {
+    return new int[a](10);
+}
diff --git a/tests_errors/new-delete-4.ispc b/tests_errors/new-delete-4.ispc
new file mode 100644
index 00000000..49fe6214
--- /dev/null
+++ b/tests_errors/new-delete-4.ispc
@@ -0,0 +1,7 @@
+// Type conversion only possible from atomic types
+
+struct P { int x; };
+
+int * func(P p) {
+    return new int[p];
+}
diff --git a/tests_errors/new-delete-5.ispc b/tests_errors/new-delete-5.ispc
new file mode 100644
index 00000000..d25518b5
--- /dev/null
+++ b/tests_errors/new-delete-5.ispc
@@ -0,0 +1,5 @@
+// Illegal to provide "varying" allocation count with "uniform new" expression
+
+int * func(int x) {
+    return uniform new int[x];
+}
diff --git a/tests_errors/new-delete-6.ispc b/tests_errors/new-delete-6.ispc
new file mode 100644
index 00000000..a148298b
--- /dev/null
+++ b/tests_errors/new-delete-6.ispc
@@ -0,0 +1,5 @@
+// Can't convert from varying type "int32 *" to uniform type "int32 * uniform" for return
+
+int * uniform func(int x) {
+    return new int[x];
+}
diff --git a/tests_errors/new-delete-7.ispc b/tests_errors/new-delete-7.ispc
new file mode 100644
index 00000000..12a3b79a
--- /dev/null
+++ b/tests_errors/new-delete-7.ispc
@@ -0,0 +1,12 @@
+// Can't convert from varying type "float" to uniform type "uniform float" for initializer
+
+struct Point {
+    uniform float x, y, z;
+};
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    uniform Point * uniform buf = uniform new uniform Point(a, b, 1234.);
+    RET[programIndex] = buf->y;
+    delete buf;
+}

From 0f01a5dcbee8d87e6dc311b41acee5f2fc482f9b Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Fri, 27 Jan 2012 16:48:14 -0800
Subject: [PATCH 13/62] Handle undef values in LLVMVectorValuesAllEqual()

---
 llvmutil.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llvmutil.cpp b/llvmutil.cpp
index 808babbc..4ae07b96 100644
--- a/llvmutil.cpp
+++ b/llvmutil.cpp
@@ -669,6 +669,10 @@ LLVMVectorValuesAllEqual(llvm::Value *v, int vectorLength,
         return true;
     }
 
+    if (llvm::isa<llvm::UndefValue>(v))
+        // Conservatively answer no for undef; it is an llvm::Constant, so it must not reach the Assert below.
+        return false;
+
     Assert(!llvm::isa<llvm::Constant>(v));
 
     if (llvm::isa<llvm::CallInst>(v) || llvm::isa<llvm::LoadInst>(v) ||

From 12dc3f5c2834baa432e84f140a7b82fed2c2d2dd Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Fri, 27 Jan 2012 16:49:09 -0800
Subject: [PATCH 14/62] Fixes to c++ backend for new and delete

Don't include declarations of malloc/free in the generated code (get
the standard ones from system headers instead).

Add a cast to (uint8_t *) before calls to malloc, which C++ requires,
since proper malloc returns a void *.
---
 cbackend.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/cbackend.cpp b/cbackend.cpp
index b800d4ac..314b53d6 100644
--- a/cbackend.cpp
+++ b/cbackend.cpp
@@ -2114,7 +2114,8 @@ bool CWriter::doInitialization(Module &M) {
         I->getName() == "memset" || I->getName() == "memset_pattern16" ||
         I->getName() == "puts" ||
         I->getName() == "printf" || I->getName() == "putchar" ||
-        I->getName() == "fflush")
+        I->getName() == "fflush" || I->getName() == "malloc" ||
+        I->getName() == "free")
       continue;
 
     // Don't redeclare ispc's own intrinsics
@@ -3437,6 +3438,9 @@ void CWriter::visitCallInst(CallInst &I) {
           Callee = RF;
         }
 
+    if (Callee->getName() == "malloc")
+        Out << "(uint8_t *)";
+
     if (NeedsCast) {
       // Ok, just cast the pointer type.
       Out << "((";

From bba02f87ea100db68973210f1520fff980f75b16 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Fri, 27 Jan 2012 16:49:41 -0800
Subject: [PATCH 15/62] Improve implementations of unsigned <=, >= in sse4
 intrinsics file.

---
 examples/intrinsics/sse4.h | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h
index 7a3af6ad..c6299893 100644
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -941,10 +941,8 @@ static FORCEINLINE __vec4_i1 __not_equal(__vec4_i32 a, __vec4_i32 b) {
 }
 
 static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i32 a, __vec4_i32 b) {
-    a.v = _mm_xor_si128(a.v, _mm_set1_epi32(0x80000000));
-    b.v = _mm_xor_si128(b.v, _mm_set1_epi32(0x80000000));
-    return _mm_or_si128(_mm_cmplt_epi32(a.v, b.v),
-                        _mm_cmpeq_epi32(a.v, b.v));
+    // a<=b == (min(a,b) == a)
+    return _mm_cmpeq_epi32(_mm_min_epu32(a.v, b.v), a.v);
 }
 
 static FORCEINLINE __vec4_i1 __signed_less_equal(__vec4_i32 a, __vec4_i32 b) {
@@ -953,10 +951,8 @@ static FORCEINLINE __vec4_i1 __signed_less_equal(__vec4_i32 a, __vec4_i32 b) {
 }
 
 static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i32 a, __vec4_i32 b) {
-    a.v = _mm_xor_si128(a.v, _mm_set1_epi32(0x80000000));
-    b.v = _mm_xor_si128(b.v, _mm_set1_epi32(0x80000000));
-    return _mm_or_si128(_mm_cmpgt_epi32(a.v, b.v),
-                        _mm_cmpeq_epi32(a.v, b.v));
+    // a>=b == (max(a,b) == a)
+    return _mm_cmpeq_epi32(_mm_max_epu32(a.v, b.v), a.v);
 }
 
 static FORCEINLINE __vec4_i1 __signed_greater_equal(__vec4_i32 a, __vec4_i32 b) {

From c96fef6bc89f7b54b32411aab55b4119b51e7129 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Fri, 27 Jan 2012 17:04:57 -0800
Subject: [PATCH 16/62] Fix silly error in generic-16.h example C++ bindings.

---
 examples/intrinsics/generic-16.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h
index 7418f5d6..861db2a4 100644
--- a/examples/intrinsics/generic-16.h
+++ b/examples/intrinsics/generic-16.h
@@ -1106,7 +1106,7 @@ GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64)
 
 // scatter
 
-#define SCATTER_BASE_VARYINGOFFSET(VTYPE, STYPE, OTYPE, FUNC)           \
+#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)                 \
 static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset,     \
                              uint32_t scale, OTYPE constOffset,         \
                              VTYPE val, __vec16_i1 mask) {              \

From c67a286aa643ba73a976c84802ff5a716789b8fd Mon Sep 17 00:00:00 2001
From: Gabe Weisz <gweisz@cs.cmu.edu>
Date: Sat, 28 Jan 2012 15:31:42 -0500
Subject: [PATCH 17/62] Add support for 1-wide scalar target.

Issue #40.
---
 Makefile                     |    2 +-
 builtins.cpp                 |    9 +-
 builtins/target-generic-1.ll | 1006 ++++++++++++++++++++++++++++++++++
 ispc.cpp                     |   10 +-
 parse.yy                     |    2 +-
 5 files changed, 1025 insertions(+), 4 deletions(-)
 create mode 100755 builtins/target-generic-1.ll

diff --git a/Makefile b/Makefile
index b83714c9..08e487f9 100644
--- a/Makefile
+++ b/Makefile
@@ -72,7 +72,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
 TARGETS=avx1 avx1-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 \
-	generic-16
+	generic-16 generic-1
 BUILTINS_SRC=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) \
 	builtins/dispatch.ll
 BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC:.ll=.o))) \
diff --git a/builtins.cpp b/builtins.cpp
index 8c3631a2..dd910c9a 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -799,6 +799,13 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
                                builtins_bitcode_generic_16_length, 
                                module, symbolTable);
             break;
+	case 1:
+            extern unsigned char builtins_bitcode_generic_1[];
+            extern int builtins_bitcode_generic_1_length;
+            AddBitcodeToModule(builtins_bitcode_generic_1, 
+                               builtins_bitcode_generic_1_length, 
+                               module, symbolTable);
+            break;
         default:
             FATAL("logic error in DefineStdlib");
         }
@@ -834,7 +841,7 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
         // If the user wants the standard library to be included, parse the
         // serialized version of the stdlib.ispc file to get its
         // definitions added.
-        if (g->target.isa == Target::GENERIC) {
+      if (g->target.isa == Target::GENERIC&&g->target.vectorWidth!=1) { // 1 wide uses x86 stdlib
             extern char stdlib_generic_code[];
             yy_scan_string(stdlib_generic_code);
             yyparse();
diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll
new file mode 100755
index 00000000..f63dc889
--- /dev/null
+++ b/builtins/target-generic-1.ll
@@ -0,0 +1,1006 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Define the standard library builtins for the NOVEC target
+define(`MASK',`i32')
+define(`WIDTH',`1')
+include(`util.m4')
+; Define some basics for a 1-wide target
+stdlib_core()
+packed_load_and_store()
+scans()
+int64minmax()
+aossoa()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+gen_masked_store(1, i8, 8)
+gen_masked_store(1, i16, 16)
+gen_masked_store(1, i32, 32)
+gen_masked_store(1, i64, 64)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+load_and_broadcast(1, i8, 8)
+load_and_broadcast(1, i16, 16)
+load_and_broadcast(1, i32, 32)
+load_and_broadcast(1, i64, 64)
+
+masked_load(1, i8,  8,  1)
+masked_load(1, i16, 16, 2)
+masked_load(1, i32, 32, 4)
+masked_load(1, i64, 64, 8)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather/scatter
+
+; define these with the macros from util.m4
+
+gen_gather(1, i8)
+gen_gather(1, i16)
+gen_gather(1, i32)
+gen_gather(1, i64)
+
+define  void @__scatter_elt_i8(i8 * %base, <1 x i32> %offsets, <1 x i8> %values,
+                                       i32 %lane) nounwind alwaysinline {
+  %offset32 = extractelement <1 x i32> %offsets, i32 %lane
+;  %offset64 = zext i32 %offset32 to i64
+;  %ptrdelta = add i64 %ptr64, %offset64
+;  %ptr = inttoptr i64 %ptrdelta to i8 *
+  %ptroffset = getelementptr i8 *%base, i32 %offset32
+  %ptr = bitcast i8 * %ptroffset to i8 *
+  %storeval = extractelement <1 x i8> %values, i32 %lane
+  store i8 %storeval, i8 * %ptr
+  ret void
+}
+
+define void @__scatter_base_offsets_i8(i8* %base, <1 x i32> %offsets, <1 x i8> %values,
+                                       <1 x i32> %mask) nounwind alwaysinline {
+  ;; One lane only, so store it through the element helper directly. NOTE for review: %mask is never read here - confirm.
+  ;%ptr64 = ptrtoint i8 * %base to i64
+  call void @__scatter_elt_i8(i8 *%base, <1 x i32> %offsets, <1 x i8> %values, i32 0)
+  ret void
+}
+
+define  void @__scatter_elt_i16(i8 * %base, <1 x i32> %offsets, <1 x i16> %values,
+                                       i32 %lane) nounwind alwaysinline {
+  %offset32 = extractelement <1 x i32> %offsets, i32 %lane
+;  %offset64 = zext i32 %offset32 to i64
+;  %ptrdelta = add i64 %ptr64, %offset64
+;  %ptr = inttoptr i64 %ptrdelta to i16 *
+  %ptroffset = getelementptr i8 *%base, i32 %offset32
+  %ptr = bitcast i8 * %ptroffset to i16 *
+  %storeval = extractelement <1 x i16> %values, i32 %lane
+  store i16 %storeval, i16 * %ptr
+  ret void
+}
+
+define void @__scatter_base_offsets_i16(i8* %base, <1 x i32> %offsets, <1 x i16> %values,
+                                       <1 x i32> %mask) nounwind alwaysinline {
+  ;; One lane only, so store it through the element helper directly. NOTE for review: %mask is never read here - confirm.
+  ;%ptr64 = ptrtoint i8 * %base to i64
+  call void @__scatter_elt_i16(i8 *%base, <1 x i32> %offsets, <1 x i16> %values, i32 0)
+  ret void
+}
+
+define  void @__scatter_elt_i32(i8 * %base, <1 x i32> %offsets, <1 x i32> %values,
+                                       i32 %lane) nounwind alwaysinline {
+  %offset32 = extractelement <1 x i32> %offsets, i32 %lane
+;  %offset64 = zext i32 %offset32 to i64
+;  %ptrdelta = add i64 %ptr64, %offset64
+;  %ptr = inttoptr i64 %ptrdelta to i32 *
+  %ptroffset = getelementptr i8 *%base, i32 %offset32
+  %ptr = bitcast i8 * %ptroffset to i32 *
+  %storeval = extractelement <1 x i32> %values, i32 %lane
+  store i32 %storeval, i32 * %ptr
+  ret void
+}
+
+define void @__scatter_base_offsets_i32(i8* %base, <1 x i32> %offsets, <1 x i32> %values,
+                                       <1 x i32> %mask) nounwind alwaysinline {
+  ;; One lane only, so store it through the element helper directly. NOTE for review: %mask is never read here - confirm.
+  ;%ptr64 = ptrtoint i8 * %base to i64
+  call void @__scatter_elt_i32(i8 *%base, <1 x i32> %offsets, <1 x i32> %values, i32 0)
+  ret void
+}
+
+define  void @__scatter_elt_i64(i8 * %base, <1 x i32> %offsets, <1 x i64> %values,
+                                       i32 %lane) nounwind alwaysinline {
+  %offset32 = extractelement <1 x i32> %offsets, i32 %lane
+;  %offset64 = zext i32 %offset32 to i64
+;  %ptrdelta = add i64 %ptr64, %offset64
+;  %ptr = inttoptr i64 %ptrdelta to i64 *
+  %ptroffset = getelementptr i8 *%base, i32 %offset32
+  %ptr = bitcast i8 * %ptroffset to i64 *
+  %storeval = extractelement <1 x i64> %values, i32 %lane
+  store i64 %storeval, i64 * %ptr
+  ret void
+}
+
+define void @__scatter_base_offsets_i64(i8* %base, <1 x i32> %offsets, <1 x i64> %values,
+                                       <1 x i32> %mask) nounwind alwaysinline {
+  ;; One lane only, so store it through the element helper directly. NOTE for review: %mask is never read here - confirm.
+  ;%ptr64 = ptrtoint i8 * %base to i64
+  call void @__scatter_elt_i64(i8 *%base, <1 x i32> %offsets, <1 x i64> %values, i32 0)
+  ret void
+}
+
+
+define  <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8> ,
+                                         <1 x i32> %mask) nounwind readnone alwaysinline {
+;  %mv = trunc <1 x i32> %mask to <1 x i8>
+;  %notmask = xor <1 x i8> %mv, <i8 -1>
+;  %cleared_old = and <1 x i8> %0, %notmask
+;  %masked_new = and <1 x i8> %1, %mv
+;  %new = or <1 x i8> %cleared_old, %masked_new
+;  ret <1 x i8> %new
+
+   ; not doing this the easy way because of problems with LLVM's scalarizer
+;   %cmp = icmp eq <1 x i32> %mask, <i32 0>
+;   %sel = select <1 x i1> %cmp, <1 x i8> %0, <1 x i8> %1
+    %m = extractelement <1 x i32> %mask, i32 0
+    %cmp = icmp eq i32 %m, 0
+    %d0 = extractelement <1 x i8> %0, i32 0
+    %d1 = extractelement <1 x i8> %1, i32 0
+    %sel = select i1 %cmp, i8 %d0, i8 %d1    
+    %r = insertelement <1 x i8> undef, i8 %sel, i32 0
+   ret <1 x i8> %r
+}
+
+define  <1 x i16> @__vselect_i16(<1 x i16>, <1 x i16> ,
+                                         <1 x i32> %mask) nounwind readnone alwaysinline {
+;  %mv = trunc <1 x i32> %mask to <1 x i16>
+;  %notmask = xor <1 x i16> %mv, <i16 -1>
+;  %cleared_old = and <1 x i16> %0, %notmask
+;  %masked_new = and <1 x i16> %1, %mv
+;  %new = or <1 x i16> %cleared_old, %masked_new
+;  ret <1 x i16> %new
+;   %cmp = icmp eq <1 x i32> %mask, <i32 0>
+;   %sel = select <1 x i1> %cmp, <1 x i16> %0, <1 x i16> %1
+    %m = extractelement <1 x i32> %mask, i32 0
+    %cmp = icmp eq i32 %m, 0
+    %d0 = extractelement <1 x i16> %0, i32 0
+    %d1 = extractelement <1 x i16> %1, i32 0
+    %sel = select i1 %cmp, i16 %d0, i16 %d1    
+    %r = insertelement <1 x i16> undef, i16 %sel, i32 0
+   ret <1 x i16> %r
+
+;   ret <1 x i16> %sel
+}
+
+
+define  <1 x i32> @__vselect_i32(<1 x i32>, <1 x i32> ,
+                                         <1 x i32> %mask) nounwind readnone alwaysinline {
+;  %notmask = xor <1 x i32> %mask, <i32 -1>
+;  %cleared_old = and <1 x i32> %0, %notmask
+;  %masked_new = and <1 x i32> %1, %mask
+;  %new = or <1 x i32> %cleared_old, %masked_new
+;  ret <1 x i32> %new
+;   %cmp = icmp eq <1 x i32> %mask, <i32 0>
+;   %sel = select <1 x i1> %cmp, <1 x i32> %0, <1 x i32> %1
+;   ret <1 x i32> %sel
+    %m = extractelement <1 x i32> %mask, i32 0
+    %cmp = icmp eq i32 %m, 0
+    %d0 = extractelement <1 x i32> %0, i32 0
+    %d1 = extractelement <1 x i32> %1, i32 0
+    %sel = select i1 %cmp, i32 %d0, i32 %d1    
+    %r = insertelement <1 x i32> undef, i32 %sel, i32 0
+   ret <1 x i32> %r
+
+}
+define  <1 x i64> @__vselect_i64(<1 x i64>, <1 x i64> ,
+                                         <1 x i32> %mask) nounwind readnone alwaysinline {
+;  %newmask = zext <1 x i32> %mask to <1 x i64>
+;  %notmask = xor <1 x i64> %newmask, <i64 -1>
+;  %cleared_old = and <1 x i64> %0, %notmask
+;  %masked_new = and <1 x i64> %1, %newmask
+;  %new = or <1 x i64> %cleared_old, %masked_new
+;  ret <1 x i64> %new
+;   %cmp = icmp eq <1 x i32> %mask, <i32 0>
+;   %sel = select <1 x i1> %cmp, <1 x i64> %0, <1 x i64> %1
+;   ret <1 x i64> %sel
+    %m = extractelement <1 x i32> %mask, i32 0
+    %cmp = icmp eq i32 %m, 0
+    %d0 = extractelement <1 x i64> %0, i32 0
+    %d1 = extractelement <1 x i64> %1, i32 0
+    %sel = select i1 %cmp, i64 %d0, i64 %d1    
+    %r = insertelement <1 x i64> undef, i64 %sel, i32 0
+   ret <1 x i64> %r
+
+}
+
+define  <1 x float> @__vselect_float(<1 x float>, <1 x float>,
+                                             <1 x i32> %mask) nounwind readnone alwaysinline {
+;  %v0 = bitcast <1 x float> %0 to <1 x i32>
+;  %v1 = bitcast <1 x float> %1 to <1 x i32>
+;  %r = call <1 x i32> @__vselect_i32(<1 x i32> %v0, <1 x i32> %v1, <1 x i32> %mask)
+;  %rf = bitcast <1 x i32> %r to <1 x float>
+;  ret <1 x float> %rf
+;   %cmp = icmp eq <1 x i32> %mask, <i32 0>
+;   %sel = select <1 x i1> %cmp, <1 x float> %0, <1 x float> %1
+;   ret <1 x float> %sel
+    %m = extractelement <1 x i32> %mask, i32 0
+    %cmp = icmp eq i32 %m, 0
+    %d0 = extractelement <1 x float> %0, i32 0
+    %d1 = extractelement <1 x float> %1, i32 0
+    %sel = select i1 %cmp, float %d0, float %d1    
+    %r = insertelement <1 x float> undef, float %sel, i32 0
+   ret <1 x float> %r
+
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+define void @__masked_store_blend_8(<1 x i8>* nocapture, <1 x i8>, 
+                                     <1 x i32> %mask) nounwind alwaysinline {
+  %val = load <1 x i8> * %0, align 1  ; read the old lane; i8 storage is only guaranteed byte alignment
+  %newval = call <1 x i8> @__vselect_i8(<1 x i8> %val, <1 x i8> %1, <1 x i32> %mask) ; old value when the mask is zero, new value otherwise
+  store <1 x i8> %newval, <1 x i8> * %0, align 1  ; write the blended lane back with the same alignment claim
+  ret void
+}
+define void @__masked_store_blend_16(<1 x i16>* nocapture, <1 x i16>, 
+                                     <1 x i32> %mask) nounwind alwaysinline {
+  %val = load <1 x i16> * %0, align 2  ; read the old lane; i16 storage is only guaranteed 2-byte alignment
+  %newval = call <1 x i16> @__vselect_i16(<1 x i16> %val, <1 x i16> %1, <1 x i32> %mask) ; old value when the mask is zero, new value otherwise
+  store <1 x i16> %newval, <1 x i16> * %0, align 2  ; write the blended lane back with the same alignment claim
+  ret void
+}
+
+
+define void @__masked_store_blend_32(<1 x i32>* nocapture, <1 x i32>, 
+                                     <1 x i32> %mask) nounwind alwaysinline {
+  %val = load <1 x i32> * %0, align 4
+  %newval = call <1 x i32> @__vselect_i32(<1 x i32> %val, <1 x i32> %1, <1 x i32> %mask) 
+  store <1 x i32> %newval, <1 x i32> * %0, align 4
+  ret void
+}
+
+define void @__masked_store_blend_64(<1 x i64>* nocapture, <1 x i64>,
+                                     <1 x i32> %mask) nounwind alwaysinline {
+  %val = load <1 x i64> * %0, align 4
+  %newval = call <1 x i64> @__vselect_i64(<1 x i64> %val, <1 x i64> %1, <1 x i32> %mask) 
+  store <1 x i64> %newval, <1 x i64> * %0, align 4
+  ret void
+}
+
+define  i32 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
+  %item = extractelement <1 x i32> %0, i32 0
+  %v = lshr i32 %item, 31  ; move the top bit of the single lane down to bit 0
+  ret i32 %v
+}
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding
+;;
+;; There are not any rounding instructions in SSE2, so we have to emulate
+;; the functionality with multiple instructions...
+
+; The code for __round_* is the result of compiling the following source
+; code.
+;
+; export float Round(float x) {
+;    unsigned int sign = signbits(x);
+;    unsigned int ix = intbits(x);
+;    ix ^= sign;
+;    x = floatbits(ix);
+;    x += 0x1.0p23f;
+;    x -= 0x1.0p23f;
+;    ix = intbits(x);
+;    ix ^= sign;
+;    x = floatbits(ix);
+;    return x;
+;}
+
+define  <1 x float> @__round_varying_float(<1 x float>) nounwind readonly alwaysinline {
+  %float_to_int_bitcast.i.i.i.i = bitcast <1 x float> %0 to <1 x i32>
+  %bitop.i.i = and <1 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648>
+  %bitop.i = xor <1 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
+  %int_to_float_bitcast.i.i40.i = bitcast <1 x i32> %bitop.i to <1 x float>
+  %binop.i = fadd <1 x float> %int_to_float_bitcast.i.i40.i, <float 8.388608e+06>
+  %binop21.i = fadd <1 x float> %binop.i, <float -8.388608e+06>
+  %float_to_int_bitcast.i.i.i = bitcast <1 x float> %binop21.i to <1 x i32>
+  %bitop31.i = xor <1 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
+  %int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop31.i to <1 x float>
+  ret <1 x float> %int_to_float_bitcast.i.i.i
+}
+
+;; Similarly, for implementations of the __floor* functions below, we have the
+;; bitcode from compiling the following source code...
+
+;export float Floor(float x) {
+;    float y = Round(x);
+;    unsigned int cmp = y > x ? 0xffffffff : 0;
+;    float delta = -1.f;
+;    unsigned int idelta = intbits(delta);
+;    idelta &= cmp;
+;    delta = floatbits(idelta);
+;    return y + delta;
+;}
+
+define  <1 x float> @__floor_varying_float(<1 x float>) nounwind readonly alwaysinline {
+  %calltmp.i = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
+  %bincmp.i = fcmp ogt <1 x float> %calltmp.i, %0
+  %val_to_boolvec32.i = sext <1 x i1> %bincmp.i to <1 x i32>
+  %bitop.i = and <1 x i32> %val_to_boolvec32.i, <i32 -1082130432>
+  %int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop.i to <1 x float>
+  %binop.i = fadd <1 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
+  ret <1 x float> %binop.i
+}
+
+;; And here is the code we compiled to get the __ceil* functions below
+;
+;export uniform float Ceil(uniform float x) {
+;    uniform float y = Round(x);
+;    uniform int yltx = y < x ? 0xffffffff : 0;
+;    uniform float delta = 1.f;
+;    uniform int idelta = intbits(delta);
+;    idelta &= yltx;
+;    delta = floatbits(idelta);
+;    return y + delta;
+;}
+
+define  <1 x float> @__ceil_varying_float(<1 x float>) nounwind readonly alwaysinline {
+  %calltmp.i = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
+  %bincmp.i = fcmp olt <1 x float> %calltmp.i, %0
+  %val_to_boolvec32.i = sext <1 x i1> %bincmp.i to <1 x i32>
+  %bitop.i = and <1 x i32> %val_to_boolvec32.i, <i32 1065353216>
+  %int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop.i to <1 x float>
+  %binop.i = fadd <1 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
+  ret <1 x float> %binop.i
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+; expecting math lib to provide this
+declare double @ceil (double) nounwind readnone
+declare double @floor (double) nounwind readnone
+declare double @round (double) nounwind readnone
+;declare float     @llvm.sqrt.f32(float %Val)
+declare double    @llvm.sqrt.f64(double %Val)
+declare float     @llvm.sin.f32(float %Val)
+declare float     @llvm.cos.f32(float %Val)
+declare float     @llvm.sqrt.f32(float %Val)
+declare float     @llvm.exp.f32(float %Val)
+declare float     @llvm.log.f32(float %Val)
+declare float     @llvm.pow.f32(float %f, float %e)
+
+
+
+
+;; stuff that could be in builtins ...
+
+define(`unary1to1', `
+  %v_0 = extractelement <1 x $1> %0, i32 0
+  %r_0 = call $1 $2($1 %v_0)
+  %ret_0 = insertelement <1 x $1> undef, $1 %r_0, i32 0
+  ret <1 x $1> %ret_0
+')
+
+
+
+;; dummy 1 wide vector ops
+define  void
+@__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2,
+        <1 x float> %v3, <1 x float> * noalias %out0, 
+        <1 x float> * noalias %out1, <1 x float> * noalias %out2, 
+        <1 x float> * noalias %out3) nounwind alwaysinline { 
+
+  store <1 x float> %v0, <1 x float > * %out0
+  store <1 x float> %v1, <1 x float > * %out1
+  store <1 x float> %v2, <1 x float > * %out2
+  store <1 x float> %v3, <1 x float > * %out3
+
+  ret void
+}
+
+define  void
+@__soa_to_aos4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2,
+        <1 x float> %v3, <1 x float> * noalias %out0, 
+        <1 x float> * noalias %out1, <1 x float> * noalias %out2, 
+        <1 x float> * noalias %out3) nounwind alwaysinline { 
+  call void @__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1, 
+    <1 x float> %v2, <1 x float> %v3, <1 x float> * %out0, 
+    <1 x float> * %out1, <1 x float> * %out2, <1 x float> * %out3)
+  ret void
+}
+
+;; 1-wide AOS->SOA of three values: nothing to transpose, each input is stored as-is.
+define  void @__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1,
+         <1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
+         <1 x float> * %out2) nounwind alwaysinline {
+  store <1 x float> %v0, <1 x float > * %out0
+  store <1 x float> %v1, <1 x float > * %out1
+  store <1 x float> %v2, <1 x float > * %out2
+
+  ret void
+}
+
+;; 1-wide SOA->AOS of three values: identical to the AOS->SOA case, so forward to it.
+define  void @__soa_to_aos3_float1(<1 x float> %v0, <1 x float> %v1,
+         <1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
+         <1 x float> * %out2) nounwind alwaysinline {
+  call void @__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1,
+         <1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
+         <1 x float> * %out2)
+  ret void
+}
+
+
+;; end builtins
+
+
+define  <1 x double> @__round_varying_double(<1 x double>) nounwind readonly alwaysinline {
+  unary1to1(double, @round)
+}
+
+define  <1 x double> @__floor_varying_double(<1 x double>) nounwind readonly alwaysinline {
+  unary1to1(double, @floor)
+}
+
+
+define  <1 x double> @__ceil_varying_double(<1 x double>) nounwind readonly alwaysinline {
+  unary1to1(double, @ceil)
+}
+
+; To do vector integer min and max, we do the vector compare and then sign
+; extend the i1 vector result to an i32 mask.  The __vselect does the
+; rest...
+
+define  <1 x i32> @__min_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
+  %c = icmp slt <1 x i32> %0, %1
+  %mask = sext <1 x i1> %c to <1 x i32>
+  %v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
+  ret <1 x i32> %v
+}
+
+define  i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  %c = icmp slt i32 %0, %1
+  %r = select i1 %c, i32 %0, i32 %1
+  ret i32 %r
+}
+
+define  <1 x i32> @__max_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
+  %c = icmp sgt <1 x i32> %0, %1
+  %mask = sext <1 x i1> %c to <1 x i32>
+  %v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
+  ret <1 x i32> %v
+}
+
+define  i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
+  %c = icmp sgt i32 %0, %1
+  %r = select i1 %c, i32 %0, i32 %1
+  ret i32 %r
+}
+
+; The functions for unsigned ints are similar, just with unsigned
+; comparison functions...
+
+define  <1 x i32> @__min_varying_uint32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
+  %c = icmp ult <1 x i32> %0, %1
+  %mask = sext <1 x i1> %c to <1 x i32>
+  %v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
+  ret <1 x i32> %v
+}
+
+define  i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  %c = icmp ult i32 %0, %1
+  %r = select i1 %c, i32 %0, i32 %1
+  ret i32 %r
+}
+
+define  <1 x i32> @__max_varying_uint32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
+  %c = icmp ugt <1 x i32> %0, %1
+  %mask = sext <1 x i1> %c to <1 x i32>
+  %v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
+  ret <1 x i32> %v
+}
+
+define  i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
+  %c = icmp ugt i32 %0, %1
+  %r = select i1 %c, i32 %0, i32 %1
+  ret i32 %r
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops / reductions
+
+declare i32 @llvm.ctpop.i32(i32) nounwind readnone
+
+define  i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
+  %call = call i32 @llvm.ctpop.i32(i32 %0)
+  ret i32 %call
+}
+
+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
+
+define  i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
+  %call = call i64 @llvm.ctpop.i64(i64 %0)
+  ret i64 %call
+}
+
+
+define  float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline {
+  %r = extractelement <1 x float> %v, i32 0
+  ret float %r
+}
+
+define  float @__reduce_min_float(<1 x float>) nounwind readnone {
+  %r = extractelement <1 x float> %0, i32 0
+  ret float %r
+}
+
+define  float @__reduce_max_float(<1 x float>) nounwind readnone {
+  %r = extractelement <1 x float> %0, i32 0
+  ret float %r
+}
+
+define  i32 @__reduce_add_int32(<1 x i32> %v) nounwind readnone {
+  %r = extractelement <1 x i32> %v, i32 0
+  ret i32 %r
+}
+
+define  i32 @__reduce_min_int32(<1 x i32>) nounwind readnone {
+  %r = extractelement <1 x i32> %0, i32 0
+  ret i32 %r
+}
+
+define  i32 @__reduce_max_int32(<1 x i32>) nounwind readnone {
+  %r = extractelement <1 x i32> %0, i32 0
+  ret i32 %r
+}
+
+define  i32 @__reduce_add_uint32(<1 x i32> %v) nounwind readnone {
+  %r = call i32 @__reduce_add_int32(<1 x i32> %v)
+  ret i32 %r
+}
+
+define  i32 @__reduce_min_uint32(<1 x i32>) nounwind readnone {
+  %r = extractelement <1 x i32> %0, i32 0
+  ret i32 %r
+}
+
+define  i32 @__reduce_max_uint32(<1 x i32>) nounwind readnone {
+  %r = extractelement <1 x i32> %0, i32 0
+  ret i32 %r
+ }
+
+
+define  double @__reduce_add_double(<1 x double>) nounwind readnone {
+  %m = extractelement <1 x double> %0, i32 0
+  ret double %m
+}
+
+define  double @__reduce_min_double(<1 x double>) nounwind readnone {
+  %m = extractelement <1 x double> %0, i32 0
+  ret double %m
+}
+
+define  double @__reduce_max_double(<1 x double>) nounwind readnone {
+  %m = extractelement <1 x double> %0, i32 0
+  ret double %m
+}
+
+define  i64 @__reduce_add_int64(<1 x i64>) nounwind readnone {
+  %m = extractelement <1 x i64> %0, i32 0
+  ret i64 %m
+}
+
+define  i64 @__reduce_min_int64(<1 x i64>) nounwind readnone {
+  %m = extractelement <1 x i64> %0, i32 0
+  ret i64 %m
+}
+
+define  i64 @__reduce_max_int64(<1 x i64>) nounwind readnone {
+  %m = extractelement <1 x i64> %0, i32 0
+  ret i64 %m
+}
+
+define  i64 @__reduce_min_uint64(<1 x i64>) nounwind readnone {
+  %m = extractelement <1 x i64> %0, i32 0
+  ret i64 %m
+}
+
+define  i64 @__reduce_max_uint64(<1 x i64>) nounwind readnone {
+  %m = extractelement <1 x i64> %0, i32 0
+  ret i64 %m
+}
+
+define  i1 @__reduce_equal_int32(<1 x i32> %vv, i32 * %samevalue,
+                                      <1 x i32> %mask) nounwind alwaysinline {
+  %v=extractelement <1 x i32> %vv, i32 0
+  store i32 %v, i32 * %samevalue
+  ret i1 true
+
+}
+
+define  i1 @__reduce_equal_float(<1 x float> %vv, float * %samevalue,
+                                      <1 x i32> %mask) nounwind alwaysinline {
+  %v=extractelement <1 x float> %vv, i32 0
+  store float %v, float * %samevalue
+  ret i1 true
+
+}
+
+define  i1 @__reduce_equal_int64(<1 x i64> %vv, i64 * %samevalue,
+                                      <1 x i32> %mask) nounwind alwaysinline {
+  %v=extractelement <1 x i64> %vv, i32 0
+  store i64 %v, i64 * %samevalue
+  ret i1 true
+
+}
+
+define  i1 @__reduce_equal_double(<1 x double> %vv, double * %samevalue,
+                                      <1 x i32> %mask) nounwind alwaysinline {
+  %v=extractelement <1 x double> %vv, i32 0
+  store double %v, double * %samevalue
+  ret i1 true
+
+}
+
+; extracting/reinserting elements because I want to be able to remove vectors later on
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+define  <1 x float> @__rcp_varying_float(<1 x float>) nounwind readonly alwaysinline {
+  ;%call = call <1 x float> @llvm.x86.sse.rcp.ps(<1 x float> %0)
+  ; do one N-R iteration to improve precision
+  ;  float iv = __rcp_v(v);
+  ;  return iv * (2. - v * iv);
+  ;%v_iv = fmul <1 x float> %0, %call
+  ;%two_minus = fsub <1 x float> <float 2., float 2., float 2., float 2.>, %v_iv  
+  ;%iv_mul = fmul <1 x float> %call, %two_minus
+  ;ret <1 x float> %iv_mul
+  %d = extractelement <1 x float> %0, i32 0
+  %r = fdiv float 1.,%d
+  %rv = insertelement <1 x float> undef, float %r, i32 0
+  ret <1 x float> %rv
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; sqrt
+
+define  <1 x float> @__sqrt_varying_float(<1 x float>) nounwind readonly alwaysinline {
+  ;%call = call <1 x float> @llvm.x86.sse.sqrt.ps(<1 x float> %0)
+  ;ret <1 x float> %call
+  %d = extractelement <1 x float> %0, i32 0
+  %r = call float @llvm.sqrt.f32(float %d)
+  %rv = insertelement <1 x float> undef, float %r, i32 0
+  ret <1 x float> %rv
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; rsqrt
+
+define  <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alwaysinline {
+  ;  float is = __rsqrt_v(v);
+  ;%is = call <1 x float> @llvm.x86.sse.rsqrt.ps(<1 x float> %v)
+  ; Newton-Raphson iteration to improve precision
+  ;  return 0.5 * is * (3. - (v * is) * is);
+  ;%v_is = fmul <1 x float> %v, %is
+  ;%v_is_is = fmul <1 x float> %v_is, %is
+  ;%three_sub = fsub <1 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
+  ;%is_mul = fmul <1 x float> %is, %three_sub
+  ;%half_scale = fmul <1 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
+  ;ret <1 x float> %half_scale
+  %s = call <1 x float> @__sqrt_varying_float(<1 x float> %v)
+  %r = call <1 x float> @__rcp_varying_float(<1 x float> %s)
+  ret <1 x float> %r
+  
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; svml stuff
+
+define  <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline {
+  ;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0)
+  ;ret <1 x float> %ret
+  ;%r = extractelement <1 x float> %0, i32 0
+  ;%s = call float @llvm.sin.f32(float %r)
+  ;%rv = insertelement <1 x float> undef, float %r, i32 0
+  ;ret <1 x float> %rv
+  unary1to1(float,@llvm.sin.f32)
+   
+}
+
+define  <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline {
+  ;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0)
+  ;ret <1 x float> %ret
+  ;%r = extractelement <1 x float> %0, i32 0
+  ;%s = call float @llvm.cos.f32(float %r)
+  ;%rv = insertelement <1 x float> undef, float %r, i32 0
+  ;ret <1 x float> %rv
+  unary1to1(float, @llvm.cos.f32)
+
+}
+
+define  void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind alwaysinline {
+;  Computes sine and cosine of %0 via __svml_sin and __svml_cos, then stores
+;  the sine through %1 and the cosine through %2.  Because it writes memory
+;  it must not be marked readnone (LLVM may delete calls to readnone code).
+   %sin = call <1 x float> @__svml_sin (<1 x float> %0)
+   %cos = call <1 x float> @__svml_cos (<1 x float> %0)
+   store <1 x float> %sin, <1 x float> * %1
+   store <1 x float> %cos, <1 x float> * %2
+   ret void
+}
+
+define  <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline {
+  ; No scalar tangent intrinsic is among the declarations this file relies
+  ; on, so compute tan(x) as sin(x) / cos(x) from the sin and cos intrinsics
+  ; instead of handing the argument back unchanged.
+  %x = extractelement <1 x float> %0, i32 0
+  %sinx = call float @llvm.sin.f32(float %x)
+  %cosx = call float @llvm.cos.f32(float %x)
+  %t = fdiv float %sinx, %cosx
+  %rv = insertelement <1 x float> undef, float %t, i32 0
+  ret <1 x float> %rv
+}
+
+define  <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline {
+;  %ret = call <1 x float> @__svml_atanf4(<1 x float> %0)
+;  ret <1 x float> %ret
+  ;%r = extractelement <1 x float> %0, i32 0
+  ;%s = call float @llvm_atan_f32(float %r)
+  ;%rv = insertelement <1 x float> undef, float %r, i32 0
+  ;ret <1 x float> %rv
+  ;unsary1to1(float,@llvm.atan.f32)
+  ;UNSUPPORTED!
+  ret <1 x float > %0
+
+}
+
+define  <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
+  ;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1)
+  ;ret <1 x float> %ret
+  ;%y = extractelement <1 x float> %0, i32 0
+  ;%x = extractelement <1 x float> %1, i32 0
+  ;%q = fdiv float %y, %x
+  ;%a = call float @llvm.atan.f32 (float %q)
+  ;%rv = insertelement <1 x float> undef, float %a, i32 0
+  ;ret <1 x float> %rv
+  ; UNSUPPORTED!
+  ret <1 x float > %0
+}
+
+define  <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline {
+  ;%ret = call <1 x float> @__svml_expf4(<1 x float> %0)
+  ;ret <1 x float> %ret
+  unary1to1(float, @llvm.exp.f32)
+}
+
+define  <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline {
+  ;%ret = call <1 x float> @__svml_logf4(<1 x float> %0)
+  ;ret <1 x float> %ret
+  unary1to1(float, @llvm.log.f32)
+}
+
+define  <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
+  ;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1)
+  ;ret <1 x float> %ret
+  %r = extractelement <1 x float> %0, i32 0
+  %e  = extractelement <1 x float> %1, i32 0
+  %s = call float @llvm.pow.f32(float %r,float %e)
+  %rv = insertelement <1 x float> undef, float %s, i32 0
+  ret <1 x float> %rv
+
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+define  <1 x float> @__max_varying_float(<1 x float>, <1 x float>) nounwind readonly alwaysinline {
+;  %call = call <1 x float> @llvm.x86.sse.max.ps(<1 x float> %0, <1 x float> %1)
+;  ret <1 x float> %call
+  %a = extractelement <1 x float> %0, i32 0
+  %b = extractelement <1 x float> %1, i32 0
+  %d = fcmp ogt float %a, %b  
+  %r = select i1 %d, float %a, float %b
+  %rv = insertelement <1 x float> undef, float %r, i32 0
+  ret <1 x float> %rv    
+}
+
+define  <1 x float> @__min_varying_float(<1 x float>, <1 x float>) nounwind readonly alwaysinline {
+;  %call = call <1 x float> @llvm.x86.sse.min.ps(<1 x float> %0, <1 x float> %1)
+;  ret <1 x float> %call
+  %a = extractelement <1 x float> %0, i32 0
+  %b = extractelement <1 x float> %1, i32 0
+  %d = fcmp olt float %a, %b  
+  %r = select i1 %d, float %a, float %b
+  %rv = insertelement <1 x float> undef, float %r, i32 0
+  ret <1 x float> %rv    
+
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+;declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+
+define  <1 x double> @__sqrt_varying_double(<1 x double>) nounwind alwaysinline {
+  ;unarya2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
+  ;ret <1 x double> %ret
+  unary1to1(double, @llvm.sqrt.f64)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+;declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
+;declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define  <1 x double> @__min_varying_double(<1 x double>, <1 x double>) nounwind readnone {
+  ;binarsy2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
+  ;ret <1 x double> %ret
+  %a = extractelement <1 x double> %0, i32 0
+  %b = extractelement <1 x double> %1, i32 0
+  %d = fcmp olt double %a, %b  
+  %r = select i1 %d, double %a, double %b
+  %rv = insertelement <1 x double> undef, double %r, i32 0
+  ret <1 x double> %rv    
+
+}
+
+define  <1 x double> @__max_varying_double(<1 x double>, <1 x double>) nounwind readnone {
+  ;binary2sto4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
+  ;ret <1 x double> %ret
+  %a = extractelement <1 x double> %0, i32 0
+  %b = extractelement <1 x double> %1, i32 0
+  %d = fcmp olt double %a, %b  
+  %r = select i1 %d, double %a, double %b
+  %rv = insertelement <1 x double> undef, double %r, i32 0
+  ret <1 x double> %rv    
+
+}
+
+
+define  float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
+;    uniform float iv = extract(__rcp_u(v), 0);
+;    return iv * (2. - v * iv);
+  %r = fdiv float 1.,%0
+  ret float %r
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding floats
+
+define  float @__round_uniform_float(float) nounwind readonly alwaysinline {
+  ; Rounds a uniform (scalar) float by reusing the 1-wide varying rounding
+  ; routine, so the scalar and varying paths share one implementation.
+  ;
+  ; Parameter:
+  ;   %0 - the scalar float value to round.
+  ; Returns:
+  ;   lane 0 of the vector that __round_varying_float returns for %0.
+  ;
+  ; Steps:
+  ;   1. place %0 in lane 0 of an otherwise undef <1 x float>,
+  ;   2. call __round_varying_float on that vector,
+  ;   3. pull lane 0 back out of the result.
+  ;
+  ; No SSE roundss intrinsic is involved on this target; the rounding mode
+  ; and tie handling are whatever __round_varying_float implements
+  ; (its body is outside this function).
+  %v = insertelement<1 x float> undef, float %0, i32 0
+  %rv = call <1 x float> @__round_varying_float(<1 x float> %v)
+  %r=extractelement <1 x float> %rv, i32 0
+  ret float %r
+
+}
+
+define  float @__floor_uniform_float(float) nounwind readonly alwaysinline {
+  %v = insertelement<1 x float> undef, float %0, i32 0
+  %rv = call <1 x float> @__floor_varying_float(<1 x float> %v)
+  %r=extractelement <1 x float> %rv, i32 0
+  ret float %r
+
+}
+
+define  float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
+  %v = insertelement<1 x float> undef, float %0, i32 0
+  %rv = call <1 x float> @__ceil_varying_float(<1 x float> %v)
+  %r=extractelement <1 x float> %rv, i32 0
+  ret float %r
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+
+define  double @__round_uniform_double(double) nounwind readonly alwaysinline {
+       %rs=call double @round(double %0)
+       ret double %rs
+}
+
+define  double @__floor_uniform_double(double) nounwind readonly alwaysinline {
+  %rs = call double @floor(double %0)
+  ret double %rs
+}
+
+define  double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
+  %rs = call double @ceil(double %0)
+  ret double %rs
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; sqrt
+
+
+define  float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
+  %ret = call float @llvm.sqrt.f32(float %0)
+  ret float %ret
+}
+
+define  double @__sqrt_uniform_double(double) nounwind readonly alwaysinline {
+  %ret = call double @llvm.sqrt.f64(double %0)
+  ret double %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
+
+define  float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
+  %s = call float @__sqrt_uniform_float(float %0)
+  %r = call float @__rcp_uniform_float(float %s)
+  ret float %r
+}
+
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; fastmath
+
+
+define  void @__fastmath() nounwind alwaysinline {
+ ; no-op
+  ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+
+define  float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
+  %d = fcmp ogt float %0, %1 
+  %r = select i1 %d, float %0, float %1
+  ret float %r
+
+}
+
+define  float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
+  %d = fcmp olt float %0, %1 
+  %r = select i1 %d, float %0, float %1
+  ret float %r
+
+}
+define  double @__max_uniform_double(double, double) nounwind readonly alwaysinline {
+  %d = fcmp ogt double %0, %1 
+  %r = select i1 %d, double %0, double %1
+  ret double %r
+
+}
+
+define  double @__min_uniform_double(double, double) nounwind readonly alwaysinline {
+  %d = fcmp olt double %0, %1 
+  %r = select i1 %d, double %0, double %1
+  ret double %r
+
+}
+
+define_shuffles()
+
+ctlztz()
+
+define_prefetches()
+
diff --git a/ispc.cpp b/ispc.cpp
index 7fbc5bc6..a817d17e 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -185,6 +185,14 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
         t->allOffMaskIsSafe = true;
         t->maskBitCount = 1;
     }
+    else if (!strcasecmp(isa, "generic-1")) {
+        t->isa = Target::GENERIC;
+        t->nativeVectorWidth = 1;
+        t->vectorWidth = 1;
+        t->maskingIsFree = false;
+        t->allOffMaskIsSafe = false;
+        t->maskBitCount = 32;
+    }
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
     else if (!strcasecmp(isa, "avx")) {
         t->isa = Target::AVX;
@@ -270,7 +278,7 @@ Target::SupportedTargetISAs() {
 #ifdef LLVM_3_1svn
         ", avx2, avx2-x2"
 #endif // LLVM_3_1svn
-        ", generic-4, generic-8, generic-16";
+        ", generic-4, generic-8, generic-16, generic-1";
 }
 
 
diff --git a/parse.yy b/parse.yy
index de93bf77..52dd6809 100644
--- a/parse.yy
+++ b/parse.yy
@@ -1659,7 +1659,7 @@ lAddFunctionParams(Declarator *decl) {
 
 /** Add a symbol for the built-in mask variable to the symbol table */
 static void lAddMaskToSymbolTable(SourcePos pos) {
-    const Type *t = g->target.isa == Target::GENERIC ?
+    const Type *t = g->target.maskBitCount == 1 ?
         AtomicType::VaryingConstBool : AtomicType::VaryingConstUInt32;
     Symbol *maskSymbol = new Symbol("__mask", pos, t);
     m->symbolTable->AddVariable(maskSymbol);

From 20dbf594202c0614ea7c2d1acf98841f29c26e11 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Sun, 29 Jan 2012 13:46:17 -0800
Subject: [PATCH 18/62] Don't lose source position when returning values of
 constant symbols.

---
 expr.cpp | 49 ++++++++++++++++++++++++++++++++++++++++++++++++-
 expr.h   |  4 ++++
 2 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/expr.cpp b/expr.cpp
index b9872780..afb1ec75 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -4191,6 +4191,53 @@ ConstExpr::ConstExpr(ConstExpr *old, double *v)
 }
 
 
+ConstExpr::ConstExpr(ConstExpr *old, SourcePos p)
+    : Expr(p) {
+    type = old->type;
+
+    AtomicType::BasicType basicType = getBasicType();
+
+    switch (basicType) {
+    case AtomicType::TYPE_BOOL:
+        memcpy(boolVal, old->boolVal, Count() * sizeof(bool));
+        break;
+    case AtomicType::TYPE_INT8:
+        memcpy(int8Val, old->int8Val, Count() * sizeof(int8_t));
+        break;
+    case AtomicType::TYPE_UINT8:
+        memcpy(uint8Val, old->uint8Val, Count() * sizeof(uint8_t));
+        break;
+    case AtomicType::TYPE_INT16:
+        memcpy(int16Val, old->int16Val, Count() * sizeof(int16_t));
+        break;
+    case AtomicType::TYPE_UINT16:
+        memcpy(uint16Val, old->uint16Val, Count() * sizeof(uint16_t));
+        break;
+    case AtomicType::TYPE_INT32:
+        memcpy(int32Val, old->int32Val, Count() * sizeof(int32_t));
+        break;
+    case AtomicType::TYPE_UINT32:
+        memcpy(uint32Val, old->uint32Val, Count() * sizeof(uint32_t));
+        break;
+    case AtomicType::TYPE_FLOAT:
+        memcpy(floatVal, old->floatVal, Count() * sizeof(float));
+        break;
+    case AtomicType::TYPE_DOUBLE:
+        memcpy(doubleVal, old->doubleVal, Count() * sizeof(double));
+        break;
+    case AtomicType::TYPE_INT64:
+        memcpy(int64Val, old->int64Val, Count() * sizeof(int64_t));
+        break;
+    case AtomicType::TYPE_UINT64:
+        memcpy(uint64Val, old->uint64Val, Count() * sizeof(uint64_t));
+        break;
+    default:
+        FATAL("unimplemented const type");
+    }
+    
+}
+
+
 AtomicType::BasicType
 ConstExpr::getBasicType() const {
     const AtomicType *at = dynamic_cast<const AtomicType *>(type);
@@ -6134,7 +6181,7 @@ SymbolExpr::Optimize() {
         return NULL;
     else if (symbol->constValue != NULL) {
         Assert(GetType()->IsConstType());
-        return symbol->constValue;
+        return new ConstExpr(symbol->constValue, pos);
     }
     else
         return this;
diff --git a/expr.h b/expr.h
index 3bc74d49..fad1d0bc 100644
--- a/expr.h
+++ b/expr.h
@@ -388,6 +388,10 @@ public:
         with values given by the "vales" parameter. */
     ConstExpr(ConstExpr *old, double *values);
 
+    /** Create ConstExpr with the same type and values as the given one,
+        but at the given position. */
+    ConstExpr(ConstExpr *old, SourcePos pos);
+
     llvm::Value *GetValue(FunctionEmitContext *ctx) const;
     const Type *GetType() const;
     void Print() const;

From f8a7120d9c3c25a2e515a1c3b458d79f46cdb68a Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Sun, 29 Jan 2012 13:46:38 -0800
Subject: [PATCH 19/62] Detect division by 0 during constant folding and issue
 a sensible error.

---
 expr.cpp | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/expr.cpp b/expr.cpp
index afb1ec75..be14fc1e 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -1594,7 +1594,8 @@ lConstFoldBinLogicalOp(BinaryExpr::Op op, const T *v0, const T *v1, ConstExpr *c
 /** Constant fold binary arithmetic ops.
  */
 template <typename T> static ConstExpr *
-lConstFoldBinArithOp(BinaryExpr::Op op, const T *v0, const T *v1, ConstExpr *carg0) {
+lConstFoldBinArithOp(BinaryExpr::Op op, const T *v0, const T *v1, ConstExpr *carg0,
+                     SourcePos pos) {
     T result[ISPC_MAX_NVEC];
     int count = carg0->Count();
 
@@ -1602,7 +1603,16 @@ lConstFoldBinArithOp(BinaryExpr::Op op, const T *v0, const T *v1, ConstExpr *car
         FOLD_OP(BinaryExpr::Add, +);
         FOLD_OP(BinaryExpr::Sub, -);
         FOLD_OP(BinaryExpr::Mul, *);
-        FOLD_OP(BinaryExpr::Div, /);
+    case BinaryExpr::Div:
+        for (int i = 0; i < count; ++i) {
+            if (v1[i] == 0) {
+                Error(pos, "Division by zero encountered in expression.");
+                result[i] = 0;
+            }
+            else
+                result[i] = (v0[i] / v1[i]);
+        }
+        break;
     default:
         return NULL;
     }
@@ -1718,7 +1728,7 @@ BinaryExpr::Optimize() {
         constArg0->AsFloat(v0);
         constArg1->AsFloat(v1);
         ConstExpr *ret;
-        if ((ret = lConstFoldBinArithOp(op, v0, v1, constArg0)) != NULL)
+        if ((ret = lConstFoldBinArithOp(op, v0, v1, constArg0, pos)) != NULL)
             return ret;
         else if ((ret = lConstFoldBinLogicalOp(op, v0, v1, constArg0)) != NULL)
             return ret;
@@ -1730,7 +1740,7 @@ BinaryExpr::Optimize() {
         constArg0->AsDouble(v0);
         constArg1->AsDouble(v1);
         ConstExpr *ret;
-        if ((ret = lConstFoldBinArithOp(op, v0, v1, constArg0)) != NULL)
+        if ((ret = lConstFoldBinArithOp(op, v0, v1, constArg0, pos)) != NULL)
             return ret;
         else if ((ret = lConstFoldBinLogicalOp(op, v0, v1, constArg0)) != NULL)
             return ret;
@@ -1742,7 +1752,7 @@ BinaryExpr::Optimize() {
         constArg0->AsInt32(v0);
         constArg1->AsInt32(v1);
         ConstExpr *ret;
-        if ((ret = lConstFoldBinArithOp(op, v0, v1, constArg0)) != NULL)
+        if ((ret = lConstFoldBinArithOp(op, v0, v1, constArg0, pos)) != NULL)
             return ret;
         else if ((ret = lConstFoldBinIntOp(op, v0, v1, constArg0)) != NULL)
             return ret;
@@ -1757,7 +1767,7 @@ BinaryExpr::Optimize() {
         constArg0->AsUInt32(v0);
         constArg1->AsUInt32(v1);
         ConstExpr *ret;
-        if ((ret = lConstFoldBinArithOp(op, v0, v1, constArg0)) != NULL)
+        if ((ret = lConstFoldBinArithOp(op, v0, v1, constArg0, pos)) != NULL)
             return ret;
         else if ((ret = lConstFoldBinIntOp(op, v0, v1, constArg0)) != NULL)
             return ret;
@@ -1943,7 +1953,8 @@ BinaryExpr::TypeCheck() {
             return NULL;
         }
 
-        const Type *promotedType = Type::MoreGeneralType(type0, type1, arg0->pos,
+        const Type *promotedType = Type::MoreGeneralType(type0, type1, 
+                                                         Union(arg0->pos, arg1->pos),
                                                          lOpString(op));
         if (promotedType == NULL)
             return NULL;

From b50f6f1730238eb5aea294565b0c2098c2a7f8a4 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Sun, 29 Jan 2012 13:46:57 -0800
Subject: [PATCH 20/62] Fix RNG seed code in stdlib for scalar target.

---
 stdlib.ispc | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/stdlib.ispc b/stdlib.ispc
index 6cb7e732..0fe5e8ea 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -3246,14 +3246,23 @@ static inline uniform unsigned int __seed4(RNGState * uniform state,
 }
 
 static inline void seed_rng(uniform RNGState * uniform state, uniform unsigned int seed) {
-    seed = __seed4(state, 0, seed);
-    if (programCount == 8)
-        __seed4(state, 4, seed ^ 0xbeeff00d);
-    if (programCount == 16) {
-        __seed4(state, 4,  seed ^ 0xbeeff00d);
-        __seed4(state, 8,  ((seed & 0xffff) << 16) | (seed >> 16));
-        __seed4(state, 12, (((seed & 0xff) << 24) | ((seed & 0xff00)  << 8) |
-                            ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24));
+    if (programCount == 1) {
+        state->z1 = seed;
+        state->z2 = seed ^ 0xbeeff00d;
+        state->z3 = ((seed & 0xffff) << 16) | (seed >> 16);
+        state->z4 = (((seed & 0xff) << 24) | ((seed & 0xff00)  << 8) |
+                     ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24);
+    }
+    else {
+        seed = __seed4(state, 0, seed);
+        if (programCount == 8)
+            __seed4(state, 4, seed ^ 0xbeeff00d);
+        if (programCount == 16) {
+            __seed4(state, 4,  seed ^ 0xbeeff00d);
+            __seed4(state, 8,  ((seed & 0xffff) << 16) | (seed >> 16));
+            __seed4(state, 12, (((seed & 0xff) << 24) | ((seed & 0xff00)  << 8) |
+                                ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24));
+        }
     }
 }
 

From f2fbc168af2c9a89053e6df79e510d002ae3e0a4 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Sun, 29 Jan 2012 13:47:44 -0800
Subject: [PATCH 21/62] Scalar target builtins bugfixes.

Typo in __max_varying_double.
Add declarations for half functions.
Use the gen_scatter macro to get the scatter functions.
---
 builtins/target-generic-1.ll | 97 +++++-------------------------------
 1 file changed, 13 insertions(+), 84 deletions(-)

diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll
index f63dc889..ad911e64 100755
--- a/builtins/target-generic-1.ll
+++ b/builtins/target-generic-1.ll
@@ -41,89 +41,10 @@ gen_gather(1, i16)
 gen_gather(1, i32)
 gen_gather(1, i64)
 
-define  void @__scatter_elt_i8(i8 * %base, <1 x i32> %offsets, <1 x i8> %values,
-                                       i32 %lane) nounwind alwaysinline {
-  %offset32 = extractelement <1 x i32> %offsets, i32 %lane
-;  %offset64 = zext i32 %offset32 to i64
-;  %ptrdelta = add i64 %ptr64, %offset64
-;  %ptr = inttoptr i64 %ptrdelta to i8 *
-  %ptroffset = getelementptr i8 *%base, i32 %offset32
-  %ptr = bitcast i8 * %ptroffset to i8 *
-  %storeval = extractelement <1 x i8> %values, i32 %lane
-  store i8 %storeval, i8 * %ptr
-  ret void
-}
-
-define void @__scatter_base_offsets_i8(i8* %base, <1 x i32> %offsets, <1 x i8> %values,
-                                       <1 x i32> %mask) nounwind alwaysinline {
-  ;; And use the `per_lane' macro to do all of the per-lane work for scatter...
-  ;%ptr64 = ptrtoint i8 * %base to i64
-  call void @__scatter_elt_i8(i8 *%base, <1 x i32> %offsets, <1 x i8> %values, i32 0)
-  ret void
-}
-
-define  void @__scatter_elt_i16(i8 * %base, <1 x i32> %offsets, <1 x i16> %values,
-                                       i32 %lane) nounwind alwaysinline {
-  %offset32 = extractelement <1 x i32> %offsets, i32 %lane
-;  %offset64 = zext i32 %offset32 to i64
-;  %ptrdelta = add i64 %ptr64, %offset64
-;  %ptr = inttoptr i64 %ptrdelta to i16 *
-  %ptroffset = getelementptr i8 *%base, i32 %offset32
-  %ptr = bitcast i8 * %ptroffset to i16 *
-  %storeval = extractelement <1 x i16> %values, i32 %lane
-  store i16 %storeval, i16 * %ptr
-  ret void
-}
-
-define void @__scatter_base_offsets_i16(i8* %base, <1 x i32> %offsets, <1 x i16> %values,
-                                       <1 x i32> %mask) nounwind alwaysinline {
-  ;; And use the `per_lane' macro to do all of the per-lane work for scatter...
-  ;%ptr64 = ptrtoint i8 * %base to i64
-  call void @__scatter_elt_i16(i8 *%base, <1 x i32> %offsets, <1 x i16> %values, i32 0)
-  ret void
-}
-
-define  void @__scatter_elt_i32(i8 * %base, <1 x i32> %offsets, <1 x i32> %values,
-                                       i32 %lane) nounwind alwaysinline {
-  %offset32 = extractelement <1 x i32> %offsets, i32 %lane
-;  %offset64 = zext i32 %offset32 to i64
-;  %ptrdelta = add i64 %ptr64, %offset64
-;  %ptr = inttoptr i64 %ptrdelta to i32 *
-  %ptroffset = getelementptr i8 *%base, i32 %offset32
-  %ptr = bitcast i8 * %ptroffset to i32 *
-  %storeval = extractelement <1 x i32> %values, i32 %lane
-  store i32 %storeval, i32 * %ptr
-  ret void
-}
-
-define void @__scatter_base_offsets_i32(i8* %base, <1 x i32> %offsets, <1 x i32> %values,
-                                       <1 x i32> %mask) nounwind alwaysinline {
-  ;; And use the `per_lane' macro to do all of the per-lane work for scatter...
-  ;%ptr64 = ptrtoint i8 * %base to i64
-  call void @__scatter_elt_i32(i8 *%base, <1 x i32> %offsets, <1 x i32> %values, i32 0)
-  ret void
-}
-
-define  void @__scatter_elt_i64(i8 * %base, <1 x i32> %offsets, <1 x i64> %values,
-                                       i32 %lane) nounwind alwaysinline {
-  %offset32 = extractelement <1 x i32> %offsets, i32 %lane
-;  %offset64 = zext i32 %offset32 to i64
-;  %ptrdelta = add i64 %ptr64, %offset64
-;  %ptr = inttoptr i64 %ptrdelta to i64 *
-  %ptroffset = getelementptr i8 *%base, i32 %offset32
-  %ptr = bitcast i8 * %ptroffset to i64 *
-  %storeval = extractelement <1 x i64> %values, i32 %lane
-  store i64 %storeval, i64 * %ptr
-  ret void
-}
-
-define void @__scatter_base_offsets_i64(i8* %base, <1 x i32> %offsets, <1 x i64> %values,
-                                       <1 x i32> %mask) nounwind alwaysinline {
-  ;; And use the `per_lane' macro to do all of the per-lane work for scatter...
-  ;%ptr64 = ptrtoint i8 * %base to i64
-  call void @__scatter_elt_i64(i8 *%base, <1 x i32> %offsets, <1 x i64> %values, i32 0)
-  ret void
-}
+gen_scatter(1, i8)
+gen_scatter(1, i16)
+gen_scatter(1, i32)
+gen_scatter(1, i64)
 
 
 define  <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8> ,
@@ -854,7 +775,7 @@ define  <1 x double> @__max_varying_double(<1 x double>, <1 x double>) nounwind
   ;ret <1 x double> %ret
   %a = extractelement <1 x double> %0, i32 0
   %b = extractelement <1 x double> %1, i32 0
-  %d = fcmp olt double %a, %b  
+  %d = fcmp ogt double %a, %b  
   %r = select i1 %d, double %a, double %b
   %rv = insertelement <1 x double> undef, double %r, i32 0
   ret <1 x double> %rv    
@@ -1004,3 +925,11 @@ ctlztz()
 
 define_prefetches()
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
+

From f6cd01f7cf47ac84d0a6f63c04adf8702c943e81 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Sun, 29 Jan 2012 13:48:01 -0800
Subject: [PATCH 22/62] Windows build support for scalar target.

---
 ispc.vcxproj | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/ispc.vcxproj b/ispc.vcxproj
index 38457518..6971ce9a 100755
--- a/ispc.vcxproj
+++ b/ispc.vcxproj
@@ -25,6 +25,7 @@
     <ClCompile Include="gen-bitcode-c-32.cpp" />
     <ClCompile Include="gen-bitcode-c-64.cpp" />
     <ClCompile Include="gen-bitcode-dispatch.cpp" />
+    <ClCompile Include="gen-bitcode-generic-1.cpp" />
     <ClCompile Include="gen-bitcode-generic-4.cpp" />
     <ClCompile Include="gen-bitcode-generic-8.cpp" />
     <ClCompile Include="gen-bitcode-generic-16.cpp" />
@@ -211,6 +212,19 @@
       <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-avx2-x2.cpp</Message>
     </CustomBuild>
   </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="builtins\target-generic-1.ll">
+      <FileType>Document</FileType>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll &gt; gen-bitcode-generic-1.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">gen-bitcode-generic-1.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll &gt; gen-bitcode-generic-1.cpp</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">gen-bitcode-generic-1.cpp</Outputs>
+      <AdditionalInputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">builtins\util.m4;builtins\target-generic-common.ll</AdditionalInputs>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Building gen-bitcode-generic-1.cpp</Message>
+      <Message Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Building gen-bitcode-generic-1.cpp</Message>
+    </CustomBuild>
+  </ItemGroup>
   <ItemGroup>
     <CustomBuild Include="builtins\target-generic-4.ll">
       <FileType>Document</FileType>

From 0575b1f38de68e69ea4dfa44e5a174edd8f3db13 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Sun, 29 Jan 2012 16:22:25 -0800
Subject: [PATCH 23/62] Update run_tests and examples makefile for scalar
 target.

Fixed a number of tests that didn't handle the programCount == 1
case correctly.
---
 examples/common.mk                 |  8 +++++++-
 run_tests.py                       |  5 +++--
 tests/array-gather-multi-unif.ispc |  4 +++-
 tests/array-pointer-duality-1.ispc |  8 ++++----
 tests/array-scatter-unif-2.ispc    |  2 +-
 tests/array-scatter-unif.ispc      |  4 +++-
 tests/atomics-10.ispc              |  2 +-
 tests/atomics-13.ispc              |  2 +-
 tests/atomics-6.ispc               |  2 +-
 tests/atomics-8.ispc               |  2 +-
 tests/atomics-9.ispc               |  2 +-
 tests/broadcast-1.ispc             |  2 +-
 tests/broadcast-2.ispc             |  2 +-
 tests/broadcast-3.ispc             |  2 +-
 tests/broadcast.ispc               |  2 +-
 tests/extract-1.ispc               |  4 ++--
 tests/packed-load-3.ispc           |  2 +-
 tests/packed-load-5.ispc           | 10 +++++++---
 tests/packed-store-3.ispc          |  2 +-
 tests/popcnt-3.ispc                |  2 +-
 tests/ptr-assign-lhs-math-1.ispc   |  6 ++++--
 tests/reduce-add-double-1.ispc     |  5 +++--
 tests/reduce-add-double-2.ispc     |  3 ++-
 tests/reduce-add-double.ispc       |  2 +-
 tests/reduce-add-float-1.ispc      |  3 ++-
 tests/reduce-add-float-2.ispc      |  3 ++-
 tests/reduce-add-float.ispc        |  2 +-
 tests/reduce-add-int-1.ispc        |  3 ++-
 tests/reduce-add-int.ispc          |  3 ++-
 tests/reduce-add-int64-1.ispc      |  3 ++-
 tests/reduce-add-int64.ispc        |  3 ++-
 tests/reduce-add-uint-1.ispc       |  3 ++-
 tests/reduce-add-uint.ispc         |  2 +-
 tests/reduce-add-uint64-1.ispc     |  3 ++-
 tests/reduce-add-uint64.ispc       |  4 +++-
 tests/reduce-equal-13.ispc         |  6 +++---
 tests/reduce-equal-3.ispc          |  2 +-
 tests/reduce-equal-4.ispc          |  2 +-
 tests/reduce-equal-9.ispc          |  2 +-
 tests/reduce-equal.ispc            |  2 +-
 tests/reduce-max-double.ispc       |  4 ++--
 tests/reduce-max-float.ispc        |  4 ++--
 tests/reduce-max-int.ispc          |  4 ++--
 tests/reduce-max-int64.ispc        |  4 ++--
 tests/reduce-max-uint.ispc         |  2 +-
 tests/reduce-max-uint64.ispc       |  2 +-
 tests/reduce-min-double.ispc       |  2 +-
 tests/reduce-min-float.ispc        |  2 +-
 tests/reduce-min-uint.ispc         |  4 ++--
 tests/reduce-min-uint64.ispc       |  4 ++--
 tests/shuffle-3.ispc               | 10 +++++++---
 tests/shuffle.ispc                 | 10 +++++++---
 tests/shuffle2-1.ispc              | 12 ++++++++----
 tests/shuffle2-2.ispc              | 12 ++++++++----
 tests/shuffle2-3.ispc              | 12 ++++++++----
 tests/shuffle2-7.ispc              | 12 ++++++++----
 tests/shuffle2-8.ispc              | 12 ++++++++----
 tests/shuffle2-9.ispc              | 12 ++++++++----
 tests/store-int16-1.ispc           |  4 ++--
 tests/store-int16.ispc             |  4 ++--
 tests/test-124.ispc                |  2 +-
 tests/test-127.ispc                |  2 +-
 tests/test-128.ispc                |  4 ++--
 tests/test-129.ispc                |  4 ++--
 64 files changed, 168 insertions(+), 107 deletions(-)

diff --git a/examples/common.mk b/examples/common.mk
index 5c5377c0..a79e3b93 100644
--- a/examples/common.mk
+++ b/examples/common.mk
@@ -14,7 +14,7 @@ CPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o) $(TASK_OBJ))
 
 default: $(EXAMPLE)
 
-all: $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16
+all: $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 $(EXAMPLE)-scalar
 
 .PHONY: dirs clean
 
@@ -57,3 +57,9 @@ objs/$(ISPC_SRC:.ispc=)_generic16.o: objs/$(ISPC_SRC:.ispc=)_generic16.cpp
 
 $(EXAMPLE)-generic16: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_generic16.o
 	$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
+
+objs/$(ISPC_SRC:.ispc=)_scalar.o: $(ISPC_SRC)
+	$(ISPC) $< -o $@ --target=generic-1
+
+$(EXAMPLE)-scalar: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_scalar.o
+	$(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS)
diff --git a/run_tests.py b/run_tests.py
index ccd10e60..724e1037 100755
--- a/run_tests.py
+++ b/run_tests.py
@@ -53,7 +53,8 @@ if not is_windows:
 else:
     ispc_exe = "../Release/ispc.exe"
 
-is_generic_target = options.target.find("generic-") != -1
+is_generic_target = (options.target.find("generic-") != -1 and
+                     options.target != "generic-1")
 if is_generic_target and options.include_file == None:
     if options.target == "generic-4":
         sys.stderr.write("No generics #include specified; using examples/intrinsics/sse4.h\n")
@@ -224,7 +225,7 @@ def run_test(filename):
                   "in test %s\n" % filename)
             return (1, 0)
         else:
-            is_generic_target = options.target.find("generic-") != -1
+            global is_generic_target
             if is_generic_target:
                 obj_name = "%s.cpp" % filename
 
diff --git a/tests/array-gather-multi-unif.ispc b/tests/array-gather-multi-unif.ispc
index bf0794da..d876f314 100644
--- a/tests/array-gather-multi-unif.ispc
+++ b/tests/array-gather-multi-unif.ispc
@@ -15,7 +15,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 }
 
 export void result(uniform float RET[]) { 
-    if (programCount == 4)
+    if (programCount == 1)
+        RET[programIndex] = 1;
+    else if (programCount == 4)
         RET[programIndex] = 5.; 
     else
         RET[programIndex] = 10.; 
diff --git a/tests/array-pointer-duality-1.ispc b/tests/array-pointer-duality-1.ispc
index 4982897e..b8ac18b3 100644
--- a/tests/array-pointer-duality-1.ispc
+++ b/tests/array-pointer-duality-1.ispc
@@ -3,13 +3,13 @@ export uniform int width() { return programCount; }
 
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
-    uniform float a[programCount];
-    for (unsigned int i = 0; i < programCount; ++i)
-        a[i] = aFOO[i];
+    uniform float a[programCount+4];
+    for (unsigned int i = 0; i < programCount+4; ++i)
+        a[i] = aFOO[min((int)i, programCount)];
 
     RET[programIndex] = *(a + 2);
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 3;
+    RET[programIndex] = (programCount == 1) ? 1 : 3;
 }
diff --git a/tests/array-scatter-unif-2.ispc b/tests/array-scatter-unif-2.ispc
index 2c989e96..ef0c659b 100644
--- a/tests/array-scatter-unif-2.ispc
+++ b/tests/array-scatter-unif-2.ispc
@@ -14,4 +14,4 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 }
 
     
-export void result(uniform float RET[]) { RET[programIndex] = 5; }
+export void result(uniform float RET[]) { RET[programIndex] = programCount == 1 ? 0 : 5; }
diff --git a/tests/array-scatter-unif.ispc b/tests/array-scatter-unif.ispc
index 61aaa24a..a553d703 100644
--- a/tests/array-scatter-unif.ispc
+++ b/tests/array-scatter-unif.ispc
@@ -14,7 +14,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
     
 export void result(uniform float RET[]) { 
-    if (programCount == 4)
+    if (programCount == 1)
+        RET[programIndex] = 0;
+    else if (programCount == 4)
         RET[programIndex] = 2;
     else
         RET[programIndex] = 4;
diff --git a/tests/atomics-10.ispc b/tests/atomics-10.ispc
index 1e941f81..194c0b0f 100644
--- a/tests/atomics-10.ispc
+++ b/tests/atomics-10.ispc
@@ -13,5 +13,5 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 2;
+    RET[programIndex] = programCount == 1 ? 1 : 2;
 }
diff --git a/tests/atomics-13.ispc b/tests/atomics-13.ispc
index 71413ed0..fe9a5d1e 100644
--- a/tests/atomics-13.ispc
+++ b/tests/atomics-13.ispc
@@ -12,5 +12,5 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = (programCount/2) - 1;
+    RET[programIndex] = programCount == 1 ? 0 : ((programCount/2) - 1);
 }
diff --git a/tests/atomics-6.ispc b/tests/atomics-6.ispc
index 778c5d01..a15d3e38 100644
--- a/tests/atomics-6.ispc
+++ b/tests/atomics-6.ispc
@@ -10,5 +10,5 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 3000;
+    RET[programIndex] = (programCount == 1) ? 2 : 3000;
 }
diff --git a/tests/atomics-8.ispc b/tests/atomics-8.ispc
index c81a7838..ea2df46e 100644
--- a/tests/atomics-8.ispc
+++ b/tests/atomics-8.ispc
@@ -12,5 +12,5 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = programCount;
+    RET[programIndex] = (programCount == 1) ? 0 : programCount;
 }
diff --git a/tests/atomics-9.ispc b/tests/atomics-9.ispc
index 52396ad1..18961e0b 100644
--- a/tests/atomics-9.ispc
+++ b/tests/atomics-9.ispc
@@ -13,5 +13,5 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 1;
+    RET[programIndex] = (programCount == 1) ? 0 : 1;
 }
diff --git a/tests/broadcast-1.ispc b/tests/broadcast-1.ispc
index 7cefce7a..27b88ecc 100644
--- a/tests/broadcast-1.ispc
+++ b/tests/broadcast-1.ispc
@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     int a = aFOO[programIndex]; 
-    int br = broadcast(a, (uniform int)b-2);
+    int br = (programCount == 1) ? 4 : broadcast(a, (uniform int)b-2);
     RET[programIndex] = br;
 }
 
diff --git a/tests/broadcast-2.ispc b/tests/broadcast-2.ispc
index 2efc98ed..1cc1d0c4 100644
--- a/tests/broadcast-2.ispc
+++ b/tests/broadcast-2.ispc
@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     int16 a = aFOO[programIndex]; 
-    int16 b = broadcast(a, 2);
+    int16 b = (programCount == 1) ? 3 : broadcast(a, 2);
     RET[programIndex] = b;
 }
 
diff --git a/tests/broadcast-3.ispc b/tests/broadcast-3.ispc
index e1b1308b..a456eb2f 100644
--- a/tests/broadcast-3.ispc
+++ b/tests/broadcast-3.ispc
@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     int8 a = aFOO[programIndex]; 
-    int8 br = broadcast(a, (uniform int)b-2);
+    int8 br = (programCount == 1) ? 4 : broadcast(a, (uniform int)b-2);
     RET[programIndex] = br;
 }
 
diff --git a/tests/broadcast.ispc b/tests/broadcast.ispc
index e45bbf90..1df835ae 100644
--- a/tests/broadcast.ispc
+++ b/tests/broadcast.ispc
@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex]; 
-    float b = broadcast(a, 2);
+    float b = (programCount == 1) ? 3 : broadcast(a, 2);
     RET[programIndex] = b;
 }
 
diff --git a/tests/extract-1.ispc b/tests/extract-1.ispc
index 220107cb..7e8d7804 100644
--- a/tests/extract-1.ispc
+++ b/tests/extract-1.ispc
@@ -3,9 +3,9 @@ export uniform int width() { return programCount; }
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     double a = programIndex;
-    RET[programIndex] = extract(a, 3); 
+    RET[programIndex] = extract(a, min(programCount-1, 3)); 
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 3;
+    RET[programIndex] = (programCount == 1) ? 0 : 3;
 }
diff --git a/tests/packed-load-3.ispc b/tests/packed-load-3.ispc
index 150fd428..9055cb65 100644
--- a/tests/packed-load-3.ispc
+++ b/tests/packed-load-3.ispc
@@ -12,5 +12,5 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 2;
+    RET[programIndex] = (programCount == 1) ? 1 : 2;
 }
diff --git a/tests/packed-load-5.ispc b/tests/packed-load-5.ispc
index ee3dae7b..2678b069 100644
--- a/tests/packed-load-5.ispc
+++ b/tests/packed-load-5.ispc
@@ -15,8 +15,12 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 }
 
 export void result(uniform float RET[]) {
-    for (uniform int i = 0; i < programCount/2; ++i) {
-        RET[2*i+1] = 10+i;
-        RET[2*i] = 10+programCount/2+i;
+    if (programCount == 1)
+        RET[0] = 10;
+    else {
+        for (uniform int i = 0; i < programCount/2; ++i) {
+            RET[2*i+1] = 10+i;
+            RET[2*i] = 10+programCount/2+i;
+        }
     }
 }
diff --git a/tests/packed-store-3.ispc b/tests/packed-store-3.ispc
index 8cec64e7..12a6368a 100644
--- a/tests/packed-store-3.ispc
+++ b/tests/packed-store-3.ispc
@@ -13,5 +13,5 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = programCount/2;
+    RET[programIndex] = (programCount == 1) ? 1 : programCount/2;
 }
diff --git a/tests/popcnt-3.ispc b/tests/popcnt-3.ispc
index c1553086..110bf5e4 100644
--- a/tests/popcnt-3.ispc
+++ b/tests/popcnt-3.ispc
@@ -8,4 +8,4 @@ export void f_f(uniform float RET[4], uniform float aFOO[]) {
     RET[programIndex] = popcnt(a < 3);
 }
 
-export void result(uniform float RET[]) { RET[programIndex] = 2; }
+export void result(uniform float RET[]) { RET[programIndex] = programCount == 1 ? 1 : 2; }
diff --git a/tests/ptr-assign-lhs-math-1.ispc b/tests/ptr-assign-lhs-math-1.ispc
index 2b34cbae..d28c17b0 100644
--- a/tests/ptr-assign-lhs-math-1.ispc
+++ b/tests/ptr-assign-lhs-math-1.ispc
@@ -2,8 +2,9 @@
 export uniform int width() { return programCount; }
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
-    uniform float a[programCount];
+    uniform float a[programCount+1];
     a[programIndex] = aFOO[programIndex];
+    a[programCount] = 1;
 
     uniform float * uniform ptr = a;
     *(ptr+1) = 0;
@@ -12,5 +13,6 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 
 export void result(uniform float RET[]) {
     RET[programIndex] = 1+programIndex;
-    RET[1] = 0;
+    if (programCount > 1)  // ">0" is always true; lane 1 only exists for widths > 1
+        RET[1] = 0;
 }
diff --git a/tests/reduce-add-double-1.ispc b/tests/reduce-add-double-1.ispc
index 08da8ea7..9ff50b2c 100644
--- a/tests/reduce-add-double-1.ispc
+++ b/tests/reduce-add-double-1.ispc
@@ -5,7 +5,7 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     double v = aFOO[programIndex];
-    uniform float m;
+    uniform float m = 42;
     int iv = (int)v;
     if (iv & 1)
         m = reduce_add((double)iv);
@@ -14,7 +14,8 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
 export void result(uniform float RET[]) { 
     uniform int x = -1234;
-    if (programCount == 4) x = 4;
+    if (programCount == 1) x = 1;
+    else if (programCount == 4) x = 4;
     else if (programCount == 8) x = 16;
     else if (programCount == 16) x = 64;
     RET[programIndex] = x;
diff --git a/tests/reduce-add-double-2.ispc b/tests/reduce-add-double-2.ispc
index 89c6a493..1be0d10b 100644
--- a/tests/reduce-add-double-2.ispc
+++ b/tests/reduce-add-double-2.ispc
@@ -13,7 +13,8 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
 export void result(uniform float RET[]) { 
     uniform int x = -1234;
-    if (programCount == 4) x = 10;
+    if (programCount == 1) x = 1;
+    else if (programCount == 4) x = 10;
     else if (programCount == 8) x = 36;
     else if (programCount == 16) x = 136;
     RET[programIndex] = x;
diff --git a/tests/reduce-add-double.ispc b/tests/reduce-add-double.ispc
index 350bf191..e3b47554 100644
--- a/tests/reduce-add-double.ispc
+++ b/tests/reduce-add-double.ispc
@@ -11,4 +11,4 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     RET[programIndex] = m;
 }
 
-export void result(uniform float RET[]) { RET[programIndex] = -3; }
+export void result(uniform float RET[]) { RET[programIndex] = (programCount == 1) ? -1 : -3; }
diff --git a/tests/reduce-add-float-1.ispc b/tests/reduce-add-float-1.ispc
index c01249fe..dd373849 100644
--- a/tests/reduce-add-float-1.ispc
+++ b/tests/reduce-add-float-1.ispc
@@ -14,7 +14,8 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
 export void result(uniform float RET[]) { 
     uniform int x = -1234;
-    if (programCount == 4) x = 4;
+    if (programCount == 1) x = 1;
+    else if (programCount == 4) x = 4;
     else if (programCount == 8) x = 16;
     else if (programCount == 16) x = 64;
     RET[programIndex] = x;
diff --git a/tests/reduce-add-float-2.ispc b/tests/reduce-add-float-2.ispc
index 6cae8b44..53aa85aa 100644
--- a/tests/reduce-add-float-2.ispc
+++ b/tests/reduce-add-float-2.ispc
@@ -13,7 +13,8 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
 export void result(uniform float RET[]) { 
     uniform int x = -1234;
-    if (programCount == 4) x = 10;
+    if (programCount == 1) x = 1;
+    else if (programCount == 4) x = 10;
     else if (programCount == 8) x = 36;
     else if (programCount == 16) x = 136;
     RET[programIndex] = x;
diff --git a/tests/reduce-add-float.ispc b/tests/reduce-add-float.ispc
index 91fb5689..7c68bf28 100644
--- a/tests/reduce-add-float.ispc
+++ b/tests/reduce-add-float.ispc
@@ -11,4 +11,4 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     RET[programIndex] = m;
 }
 
-export void result(uniform float RET[]) { RET[programIndex] = -3; }
+export void result(uniform float RET[]) { RET[programIndex] = (programCount == 1) ? -1 : -3; }
diff --git a/tests/reduce-add-int-1.ispc b/tests/reduce-add-int-1.ispc
index 18c2634a..9ac887c6 100644
--- a/tests/reduce-add-int-1.ispc
+++ b/tests/reduce-add-int-1.ispc
@@ -14,7 +14,8 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
 export void result(uniform float RET[]) { 
     uniform int x = -1234;
-    if (programCount == 4) x = 4;
+    if (programCount == 1) x = 1;
+    else if (programCount == 4) x = 4;
     else if (programCount == 8) x = 16;
     else if (programCount == 16) x = 64;
     RET[programIndex] = x;
diff --git a/tests/reduce-add-int.ispc b/tests/reduce-add-int.ispc
index 4ea4577d..01ff745c 100644
--- a/tests/reduce-add-int.ispc
+++ b/tests/reduce-add-int.ispc
@@ -14,7 +14,8 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
 export void result(uniform float RET[]) { 
     uniform int x = -1234;
-    if (programCount == 4) x = 10;
+    if (programCount == 1) x = 1;
+    else if (programCount == 4) x = 10;
     else if (programCount == 8) x = 36;
     else if (programCount == 16) x = 136;
     RET[programIndex] = x;
diff --git a/tests/reduce-add-int64-1.ispc b/tests/reduce-add-int64-1.ispc
index e60ca791..cdc88bc3 100644
--- a/tests/reduce-add-int64-1.ispc
+++ b/tests/reduce-add-int64-1.ispc
@@ -14,7 +14,8 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
 export void result(uniform float RET[]) { 
     uniform int x = -1234;
-    if (programCount == 4) x = 4;
+    if (programCount == 1) x = 1;
+    else if (programCount == 4) x = 4;
     else if (programCount == 8) x = 16;
     else if (programCount == 16) x = 64;
     RET[programIndex] = x;
diff --git a/tests/reduce-add-int64.ispc b/tests/reduce-add-int64.ispc
index 5ef3dfee..894dddea 100644
--- a/tests/reduce-add-int64.ispc
+++ b/tests/reduce-add-int64.ispc
@@ -13,7 +13,8 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
 export void result(uniform float RET[]) { 
     uniform int x = -1234;
-    if (programCount == 4) x = 10;
+    if (programCount == 1) x = 1;
+    else if (programCount == 4) x = 10;
     else if (programCount == 8) x = 36;
     else if (programCount == 16) x = 136;
     RET[programIndex] = x;
diff --git a/tests/reduce-add-uint-1.ispc b/tests/reduce-add-uint-1.ispc
index 7702a67c..291200a6 100644
--- a/tests/reduce-add-uint-1.ispc
+++ b/tests/reduce-add-uint-1.ispc
@@ -14,7 +14,8 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
 export void result(uniform float RET[]) { 
     uniform int x = -1234;
-    if (programCount == 4) x = 4;
+    if (programCount == 1) x = 1;
+    else if (programCount == 4) x = 4;
     else if (programCount == 8) x = 16;
     else if (programCount == 16) x = 64;
     RET[programIndex] = x;
diff --git a/tests/reduce-add-uint.ispc b/tests/reduce-add-uint.ispc
index 8becef36..70c40e2b 100644
--- a/tests/reduce-add-uint.ispc
+++ b/tests/reduce-add-uint.ispc
@@ -12,4 +12,4 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     RET[programIndex] = m;
 }
 
-export void result(uniform float RET[]) { RET[programIndex] = 10 * programCount/4; }
+export void result(uniform float RET[]) { RET[programIndex] = (programCount == 1) ? 1 : (10 * programCount/4); }
diff --git a/tests/reduce-add-uint64-1.ispc b/tests/reduce-add-uint64-1.ispc
index d33170f5..5469a898 100644
--- a/tests/reduce-add-uint64-1.ispc
+++ b/tests/reduce-add-uint64-1.ispc
@@ -14,7 +14,8 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
 export void result(uniform float RET[]) { 
     uniform int x = -1234;
-    if (programCount == 4) x = 4;
+    if (programCount == 1) x = 1;
+    else if (programCount == 4) x = 4;
     else if (programCount == 8) x = 16;
     else if (programCount == 16) x = 64;
     RET[programIndex] = x;
diff --git a/tests/reduce-add-uint64.ispc b/tests/reduce-add-uint64.ispc
index e637878a..e44e7903 100644
--- a/tests/reduce-add-uint64.ispc
+++ b/tests/reduce-add-uint64.ispc
@@ -11,4 +11,6 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     RET[programIndex] = m;
 }
 
-export void result(uniform float RET[]) { RET[programIndex] = 10 * programCount/4; }
+export void result(uniform float RET[]) { 
+    RET[programIndex] = (programCount == 1) ? 1 : (10 * programCount/4);
+}
diff --git a/tests/reduce-equal-13.ispc b/tests/reduce-equal-13.ispc
index e37d597e..66e97b62 100644
--- a/tests/reduce-equal-13.ispc
+++ b/tests/reduce-equal-13.ispc
@@ -4,8 +4,8 @@ export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     int a = aFOO[programIndex&1];
     RET[programIndex] = 1;
-    uniform bool re;
-    uniform int val;
+    uniform bool re = false;
+    uniform int val = 16;
     if (programIndex & 1) {
         re = reduce_equal(a, &val);
     }
@@ -13,5 +13,5 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 258;
+    RET[programIndex] = (programCount == 1) ? 16 : 258;
 }
diff --git a/tests/reduce-equal-3.ispc b/tests/reduce-equal-3.ispc
index ca158f82..62d75261 100644
--- a/tests/reduce-equal-3.ispc
+++ b/tests/reduce-equal-3.ispc
@@ -9,5 +9,5 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 0;
+    RET[programIndex] = (programCount == 1) ? 1 : 0;
 }
diff --git a/tests/reduce-equal-4.ispc b/tests/reduce-equal-4.ispc
index 4c6f9808..43e2d643 100644
--- a/tests/reduce-equal-4.ispc
+++ b/tests/reduce-equal-4.ispc
@@ -2,7 +2,7 @@
 export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    int a = aFOO[programIndex/(programCount/2)];
+    int a = aFOO[programIndex/max(1, (programCount/2))];
     RET[programIndex] = 0;
     if (programIndex >= programCount/2 && a < 4)
         RET[programIndex] = reduce_equal(a) ? 1 : 0; 
diff --git a/tests/reduce-equal-9.ispc b/tests/reduce-equal-9.ispc
index 24fde82c..dcafce20 100644
--- a/tests/reduce-equal-9.ispc
+++ b/tests/reduce-equal-9.ispc
@@ -11,5 +11,5 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 0;
+    RET[programIndex] = (programCount == 1) ? 1 : 0;
 }
diff --git a/tests/reduce-equal.ispc b/tests/reduce-equal.ispc
index f09ec940..38244afb 100644
--- a/tests/reduce-equal.ispc
+++ b/tests/reduce-equal.ispc
@@ -7,5 +7,5 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 0;
+    RET[programIndex] = (programCount == 1) ? 1 : 0;
 }
diff --git a/tests/reduce-max-double.ispc b/tests/reduce-max-double.ispc
index 672987ff..61908beb 100644
--- a/tests/reduce-max-double.ispc
+++ b/tests/reduce-max-double.ispc
@@ -5,10 +5,10 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     double v = aFOO[programIndex];
-    uniform float m;
+    uniform float m = 42;
     if (v >= 3)
         m = reduce_max(-v);
     RET[programIndex] = m;
 }
 
-export void result(uniform float RET[]) { RET[programIndex] = -3; }
+export void result(uniform float RET[]) { RET[programIndex] = (programCount == 1) ? 42 : -3; }
diff --git a/tests/reduce-max-float.ispc b/tests/reduce-max-float.ispc
index 7a914c72..97290aee 100644
--- a/tests/reduce-max-float.ispc
+++ b/tests/reduce-max-float.ispc
@@ -5,10 +5,10 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float v = aFOO[programIndex];
-    uniform float m;
+    uniform float m = 42;
     if (v >= 3)
         m = reduce_max(-v);
     RET[programIndex] = m;
 }
 
-export void result(uniform float RET[]) { RET[programIndex] = -3; }
+export void result(uniform float RET[]) { RET[programIndex] = (programCount == 1) ? 42 : -3; }
diff --git a/tests/reduce-max-int.ispc b/tests/reduce-max-int.ispc
index a66ac8b5..b7e98a3f 100644
--- a/tests/reduce-max-int.ispc
+++ b/tests/reduce-max-int.ispc
@@ -5,10 +5,10 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float v = aFOO[programIndex];
-    uniform float m;
+    uniform float m = 42;
     if (v >= 3)
         m = reduce_max(-(int)v);
     RET[programIndex] = m;
 }
 
-export void result(uniform float RET[]) { RET[programIndex] = -3; }
+export void result(uniform float RET[]) { RET[programIndex] = (programCount == 1) ? 42 : -3; }
diff --git a/tests/reduce-max-int64.ispc b/tests/reduce-max-int64.ispc
index 08641a9a..3e3ae020 100644
--- a/tests/reduce-max-int64.ispc
+++ b/tests/reduce-max-int64.ispc
@@ -5,10 +5,10 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float v = aFOO[programIndex];
-    uniform float m;
+    uniform float m = 42;
     if (v >= 3)
         m = reduce_max(-(int64)v);
     RET[programIndex] = m;
 }
 
-export void result(uniform float RET[]) { RET[programIndex] = -3; }
+export void result(uniform float RET[]) { RET[programIndex] = (programCount == 1) ? 42 : -3; }
diff --git a/tests/reduce-max-uint.ispc b/tests/reduce-max-uint.ispc
index ed0ef282..28b8fc5b 100644
--- a/tests/reduce-max-uint.ispc
+++ b/tests/reduce-max-uint.ispc
@@ -11,4 +11,4 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     RET[programIndex] = m;
 }
 
-export void result(uniform float RET[]) { RET[programIndex] = 2; }
+export void result(uniform float RET[]) { RET[programIndex] = (programCount == 1) ? 1 : 2; }
diff --git a/tests/reduce-max-uint64.ispc b/tests/reduce-max-uint64.ispc
index ce5e52d4..3b7f28d9 100644
--- a/tests/reduce-max-uint64.ispc
+++ b/tests/reduce-max-uint64.ispc
@@ -11,4 +11,4 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     RET[programIndex] = m;
 }
 
-export void result(uniform float RET[]) { RET[programIndex] = 2; }
+export void result(uniform float RET[]) { RET[programIndex] = (programCount == 1) ? 1 : 2; }
diff --git a/tests/reduce-min-double.ispc b/tests/reduce-min-double.ispc
index 866aa57e..5e0222a0 100644
--- a/tests/reduce-min-double.ispc
+++ b/tests/reduce-min-double.ispc
@@ -11,4 +11,4 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     RET[programIndex] = m;
 }
 
-export void result(uniform float RET[]) { RET[programIndex] = -2; }
+export void result(uniform float RET[]) { RET[programIndex] = (programCount == 1) ? -1 : -2; }
diff --git a/tests/reduce-min-float.ispc b/tests/reduce-min-float.ispc
index b0e890fb..5f88e4db 100644
--- a/tests/reduce-min-float.ispc
+++ b/tests/reduce-min-float.ispc
@@ -11,4 +11,4 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     RET[programIndex] = m;
 }
 
-export void result(uniform float RET[]) { RET[programIndex] = -2; }
+export void result(uniform float RET[]) { RET[programIndex] = (programCount == 1) ? -1 : -2; }
diff --git a/tests/reduce-min-uint.ispc b/tests/reduce-min-uint.ispc
index 85ac625b..4695ebae 100644
--- a/tests/reduce-min-uint.ispc
+++ b/tests/reduce-min-uint.ispc
@@ -5,10 +5,10 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float v = aFOO[programIndex];
-    uniform float m;
+    uniform float m = 42;
     if (v >= 3)
         m = reduce_min((unsigned int)v);
     RET[programIndex] = m;
 }
 
-export void result(uniform float RET[]) { RET[programIndex] = 3; }
+export void result(uniform float RET[]) { RET[programIndex] = (programCount == 1) ? 42 : 3; }
diff --git a/tests/reduce-min-uint64.ispc b/tests/reduce-min-uint64.ispc
index 2290dfd6..345b4bc2 100644
--- a/tests/reduce-min-uint64.ispc
+++ b/tests/reduce-min-uint64.ispc
@@ -5,10 +5,10 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float v = aFOO[programIndex];
-    uniform float m;
+    uniform float m = 42;
     if (v >= 3)
         m = reduce_min((unsigned int64)v);
     RET[programIndex] = m;
 }
 
-export void result(uniform float RET[]) { RET[programIndex] = 3; }
+export void result(uniform float RET[]) { RET[programIndex] = (programCount == 1) ? 42 : 3; }
diff --git a/tests/shuffle-3.ispc b/tests/shuffle-3.ispc
index afe0b066..fa23e311 100644
--- a/tests/shuffle-3.ispc
+++ b/tests/shuffle-3.ispc
@@ -2,9 +2,13 @@
 export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    int8 a = aFOO[programIndex]; 
-    int8 shuf = shuffle(a, 1);
-    RET[programIndex] = shuf;
+    if (programCount == 1)
+        RET[0] = 2;
+    else {
+        int8 a = aFOO[programIndex]; 
+        int8 shuf = shuffle(a, 1);
+        RET[programIndex] = shuf;
+    }
 }
 
 export void result(uniform float RET[]) {
diff --git a/tests/shuffle.ispc b/tests/shuffle.ispc
index a17b1309..77ac8c88 100644
--- a/tests/shuffle.ispc
+++ b/tests/shuffle.ispc
@@ -2,9 +2,13 @@
 export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    int32 a = aFOO[programIndex]; 
-    int32 shuf = shuffle(a, 1);
-    RET[programIndex] = shuf;
+    if (programCount == 1)
+        RET[0] = 2;
+    else {
+        int32 a = aFOO[programIndex]; 
+        int32 shuf = shuffle(a, 1);
+        RET[programIndex] = shuf;
+    }
 }
 
 export void result(uniform float RET[]) {
diff --git a/tests/shuffle2-1.ispc b/tests/shuffle2-1.ispc
index 5d33cdf9..9516f3f8 100644
--- a/tests/shuffle2-1.ispc
+++ b/tests/shuffle2-1.ispc
@@ -2,10 +2,14 @@
 export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    int32 aa = aFOO[programIndex]; 
-    int32 bb = aa + programCount;
-    int32 shuf = shuffle(aa, bb, programCount + 1);
-    RET[programIndex] = shuf;
+    if (programCount == 1)
+        RET[0] = 3;
+    else {
+        int32 aa = aFOO[programIndex]; 
+        int32 bb = aa + programCount;
+        int32 shuf = shuffle(aa, bb, programCount + 1);
+        RET[programIndex] = shuf;
+    }
 }
 
 export void result(uniform float RET[]) {
diff --git a/tests/shuffle2-2.ispc b/tests/shuffle2-2.ispc
index 56426201..a9ea697c 100644
--- a/tests/shuffle2-2.ispc
+++ b/tests/shuffle2-2.ispc
@@ -2,10 +2,14 @@
 export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    int32 aa = aFOO[programIndex]; 
-    int32 bb = aa + programCount;
-    int32 shuf = shuffle(aa, bb, programIndex + 2);
-    RET[programIndex] = shuf;
+    if (programCount == 1)
+        RET[0] = 3;
+    else {
+        int32 aa = aFOO[programIndex]; 
+        int32 bb = aa + programCount;
+        int32 shuf = shuffle(aa, bb, programIndex + 2);
+        RET[programIndex] = shuf;
+    }
 }
 
 export void result(uniform float RET[]) {
diff --git a/tests/shuffle2-3.ispc b/tests/shuffle2-3.ispc
index 97040bab..ac484198 100644
--- a/tests/shuffle2-3.ispc
+++ b/tests/shuffle2-3.ispc
@@ -2,10 +2,14 @@
 export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    float aa = aFOO[programIndex]; 
-    float bb = aa + programCount;
-    float shuf = shuffle(aa, bb, programIndex + 2 + (int)b - 5);
-    RET[programIndex] = shuf;
+    if (programCount == 1)
+        RET[0] = 3;
+    else {
+        float aa = aFOO[programIndex]; 
+        float bb = aa + programCount;
+        float shuf = shuffle(aa, bb, programIndex + 2 + (int)b - 5);
+        RET[programIndex] = shuf;
+    }
 }
 
 export void result(uniform float RET[]) {
diff --git a/tests/shuffle2-7.ispc b/tests/shuffle2-7.ispc
index 832c84cf..0eb3c521 100644
--- a/tests/shuffle2-7.ispc
+++ b/tests/shuffle2-7.ispc
@@ -2,10 +2,14 @@
 export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    int16 aa = aFOO[programIndex]; 
-    int16 bb = aa + programCount;
-    int16 shuf = shuffle(aa, bb, programCount + 1);
-    RET[programIndex] = shuf;
+    if (programCount == 1)
+        RET[0] = 3;
+    else {
+        int16 aa = aFOO[programIndex]; 
+        int16 bb = aa + programCount;
+        int16 shuf = shuffle(aa, bb, programCount + 1);
+        RET[programIndex] = shuf;
+    }
 }
 
 export void result(uniform float RET[]) {
diff --git a/tests/shuffle2-8.ispc b/tests/shuffle2-8.ispc
index 190c0d9d..c479ced5 100644
--- a/tests/shuffle2-8.ispc
+++ b/tests/shuffle2-8.ispc
@@ -2,10 +2,14 @@
 export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    int8 aa = aFOO[programIndex]; 
-    int8 bb = aa + programCount;
-    int8 shuf = shuffle(aa, bb, programIndex + 2);
-    RET[programIndex] = shuf;
+    if (programCount == 1)
+        RET[0] = 3;
+    else {
+        int8 aa = aFOO[programIndex]; 
+        int8 bb = aa + programCount;
+        int8 shuf = shuffle(aa, bb, programIndex + 2);
+        RET[programIndex] = shuf;
+    }
 }
 
 export void result(uniform float RET[]) {
diff --git a/tests/shuffle2-9.ispc b/tests/shuffle2-9.ispc
index 85bb9123..f56e5d9a 100644
--- a/tests/shuffle2-9.ispc
+++ b/tests/shuffle2-9.ispc
@@ -2,10 +2,14 @@
 export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    int16 aa = aFOO[programIndex]; 
-    int16 bb = aa + programCount;
-    int16 shuf = shuffle(aa, bb, programIndex + 2 + (int)b - 5);
-    RET[programIndex] = shuf;
+    if (programCount == 1)
+        RET[0] = 3;
+    else {
+        int16 aa = aFOO[programIndex]; 
+        int16 bb = aa + programCount;
+        int16 shuf = shuffle(aa, bb, programIndex + 2 + (int)b - 5);
+        RET[programIndex] = shuf;
+    }
 }
 
 export void result(uniform float RET[]) {
diff --git a/tests/store-int16-1.ispc b/tests/store-int16-1.ispc
index c333e29c..c107dbab 100644
--- a/tests/store-int16-1.ispc
+++ b/tests/store-int16-1.ispc
@@ -1,8 +1,8 @@
 export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    uniform unsigned int16 x[2*programCount];
-    for (uniform int i = 0; i < 2*programCount; ++i)
+    uniform unsigned int16 x[2*programCount+1];
+    for (uniform int i = 0; i < 2*programCount+1; ++i)
         x[i] = 0xffff;
     unsigned int16 val = aFOO[programIndex];
     x[2+programIndex] = val;
diff --git a/tests/store-int16.ispc b/tests/store-int16.ispc
index 2fcd690e..a85394d8 100644
--- a/tests/store-int16.ispc
+++ b/tests/store-int16.ispc
@@ -1,8 +1,8 @@
 export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    uniform int16 x[2*programCount];
-    for (uniform int i = 0; i < 2*programCount; ++i)
+    uniform int16 x[2*programCount+1];
+    for (uniform int i = 0; i < 2*programCount+1; ++i)
         x[i] = 0xffff;
     unsigned int8 val = aFOO[programIndex];
     x[2+programIndex] = val;
diff --git a/tests/test-124.ispc b/tests/test-124.ispc
index e9b3e336..2412e1c1 100644
--- a/tests/test-124.ispc
+++ b/tests/test-124.ispc
@@ -10,5 +10,5 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
 
 export void result(uniform float RET[4]) {
-    RET[programIndex] = 1;
+    RET[programIndex] = (programCount == 1) ? 0 : 1;
 }
diff --git a/tests/test-127.ispc b/tests/test-127.ispc
index 44598001..32aa371a 100644
--- a/tests/test-127.ispc
+++ b/tests/test-127.ispc
@@ -9,5 +9,5 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 10;
+    RET[programIndex] = (programCount == 1) ? 4 : 10;
 }
diff --git a/tests/test-128.ispc b/tests/test-128.ispc
index 24bdebe7..8c1a04f9 100644
--- a/tests/test-128.ispc
+++ b/tests/test-128.ispc
@@ -4,9 +4,9 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex];
-    RET[programIndex] = extract(a, 1);
+    RET[programIndex] = extract(a, min(1, programCount-1));
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 2.;
+    RET[programIndex] = (programCount == 1) ? 1 : 2.;
 }
diff --git a/tests/test-129.ispc b/tests/test-129.ispc
index 1c3e2dff..82f0af43 100644
--- a/tests/test-129.ispc
+++ b/tests/test-129.ispc
@@ -4,10 +4,10 @@ export uniform int width() { return programCount; }
 
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float a = aFOO[programIndex];
-    RET[programIndex] = extract(a, (uniform int)b-3);
+    RET[programIndex] = extract(a, min((uniform int)b-3, programCount-1));
 }
 
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 3;
+    RET[programIndex] = (programCount == 1) ? 1 : 3;
 }

From e19f4931d1deec0da9fde0f440236cf55163bb25 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Mon, 30 Jan 2012 05:58:41 -0800
Subject: [PATCH 24/62] Short-circuit evaluation of && and || operators.

We now follow C's approach of evaluating these: we don't evaluate
the second expression in the operator if the value of the first one
determines the overall result.  Thus, these can now be used
idiomatically like (index < limit && array[index] > 0) and such.

For varying expressions, the mask is set appropriately when evaluating
the second expression.

(For expressions that can be determined to be both simple and safe to
evaluate with the mask all off, we still evaluate both sides and compute
the logical op result directly, which saves a number of branches and tests.
However, the effect of this should never be visible to the programmer.)

Issue #4.
---
 ast.cpp                     | 113 +++++++++++++
 ast.h                       |   4 +
 docs/ispc.rst               |  13 +-
 expr.cpp                    | 309 +++++++++++++++++++++++++++++++++---
 stmt.cpp                    | 117 +-------------
 tests/short-circuit-1.ispc  |  21 +++
 tests/short-circuit-10.ispc |  24 +++
 tests/short-circuit-11.ispc |  25 +++
 tests/short-circuit-2.ispc  |  21 +++
 tests/short-circuit-3.ispc  |  21 +++
 tests/short-circuit-4.ispc  |  21 +++
 tests/short-circuit-5.ispc  |  21 +++
 tests/short-circuit-6.ispc  |  21 +++
 tests/short-circuit-7.ispc  |  21 +++
 tests/short-circuit-8.ispc  |  21 +++
 tests/short-circuit-9.ispc  |  21 +++
 16 files changed, 658 insertions(+), 136 deletions(-)
 create mode 100644 tests/short-circuit-1.ispc
 create mode 100644 tests/short-circuit-10.ispc
 create mode 100644 tests/short-circuit-11.ispc
 create mode 100644 tests/short-circuit-2.ispc
 create mode 100644 tests/short-circuit-3.ispc
 create mode 100644 tests/short-circuit-4.ispc
 create mode 100644 tests/short-circuit-5.ispc
 create mode 100644 tests/short-circuit-6.ispc
 create mode 100644 tests/short-circuit-7.ispc
 create mode 100644 tests/short-circuit-8.ispc
 create mode 100644 tests/short-circuit-9.ispc

diff --git a/ast.cpp b/ast.cpp
index 5eaddeb3..bfbc71f6 100644
--- a/ast.cpp
+++ b/ast.cpp
@@ -315,3 +315,116 @@ EstimateCost(ASTNode *root) {
     return cost;
 }
 
+
+/** Given an AST node, check to see if it's safe if we happen to run the
+    code for that node with the execution mask all off.
+ */
+static bool
+lCheckAllOffSafety(ASTNode *node, void *data) {
+    bool *okPtr = (bool *)data;
+
+    if (dynamic_cast<FunctionCallExpr *>(node) != NULL) {
+        // FIXME: If we could somehow determine that the function being
+        // called was safe (and all of the args Exprs were safe), then it'd
+        // be nice to be able to return true here.  (Consider a call to
+        // e.g. floatbits() in the stdlib.)  Unfortunately for now we just
+        // have to be conservative.
+        *okPtr = false;
+        return false;
+    }
+
+    if (dynamic_cast<AssertStmt *>(node) != NULL) {
+        // While it's fine to run the assert for varying tests, it's not
+        // desirable to check an assert on a uniform variable if all of the
+        // lanes are off.
+        *okPtr = false;
+        return false;
+    }
+
+    if (dynamic_cast<NewExpr *>(node) != NULL ||
+        dynamic_cast<DeleteStmt *>(node) != NULL) {
+        // We definitely don't want to run the uniform variants of these if
+        // the mask is all off.  It's also worth skipping the overhead of
+        // executing the varying versions of them in the all-off mask case.
+        *okPtr = false;
+        return false;
+    }
+
+    if (g->target.allOffMaskIsSafe == true)
+        // Don't worry about memory accesses if we have a target that can
+        // safely run them with the mask all off
+        return true;
+
+    IndexExpr *ie;
+    if ((ie = dynamic_cast<IndexExpr *>(node)) != NULL && ie->baseExpr != NULL) {
+        const Type *type = ie->baseExpr->GetType();
+        if (type == NULL)
+            return true;
+        if (dynamic_cast<const ReferenceType *>(type) != NULL)
+            type = type->GetReferenceTarget();
+
+        ConstExpr *ce = dynamic_cast<ConstExpr *>(ie->index);
+        if (ce == NULL) {
+            // indexing with a variable... -> not safe
+            *okPtr = false;
+            return false;
+        }
+
+        const PointerType *pointerType = 
+            dynamic_cast<const PointerType *>(type);
+        if (pointerType != NULL) {
+            // pointer[index] -> can't be sure -> not safe
+            *okPtr = false;
+            return false;
+        }
+
+        const SequentialType *seqType = 
+            dynamic_cast<const SequentialType *>(type);
+        Assert(seqType != NULL);
+        int nElements = seqType->GetElementCount();
+        if (nElements == 0) {
+            // Unsized array, so we can't be sure -> not safe
+            *okPtr = false;
+            return false;
+        }
+
+        int32_t indices[ISPC_MAX_NVEC];
+        int count = ce->AsInt32(indices);
+        for (int i = 0; i < count; ++i) {
+            if (indices[i] < 0 || indices[i] >= nElements) {
+                // Index is out of bounds -> not safe
+                *okPtr = false;
+                return false;
+            }
+        }
+
+        // All indices are in-bounds
+        return true;
+    }
+
+    MemberExpr *me;
+    if ((me = dynamic_cast<MemberExpr *>(node)) != NULL &&
+        me->dereferenceExpr) {
+        *okPtr = false;
+        return false;
+    }
+
+    DereferenceExpr *de;
+    if ((de = dynamic_cast<DereferenceExpr *>(node)) != NULL) {
+        const Type *exprType = de->expr->GetType();
+        if (dynamic_cast<const PointerType *>(exprType) != NULL) {
+            *okPtr = false;
+            return false;
+        }
+    }
+
+    return true;
+}
+
+
+bool
+SafeToRunWithMaskAllOff(ASTNode *root) {
+    bool safe = true;
+    WalkAST(root, lCheckAllOffSafety, NULL, &safe);
+    return safe;
+}
diff --git a/ast.h b/ast.h
index 0c3d4b64..0f73677b 100644
--- a/ast.h
+++ b/ast.h
@@ -144,4 +144,8 @@ extern Stmt *TypeCheck(Stmt *);
     the given root. */
 extern int EstimateCost(ASTNode *root);
 
+/** Returns true if it would be safe to run the given code with an "all
+    off" mask. */ 
+extern bool SafeToRunWithMaskAllOff(ASTNode *root);
+
 #endif // ISPC_AST_H
diff --git a/docs/ispc.rst b/docs/ispc.rst
index 5613cfa1..ee838ee0 100644
--- a/docs/ispc.rst
+++ b/docs/ispc.rst
@@ -1184,7 +1184,6 @@ C++:
 There are a number of features of C89 that are not supported in ``ispc``
 but are likely to be supported in future releases:
 
-* Short circuiting of logical operations
 * There are no types named ``char``, ``short``, or ``long`` (or ``long
   double``).  However, there are built-in ``int8``, ``int16``, and
   ``int64`` types
@@ -1969,6 +1968,18 @@ operator also work as expected.
     (*fp).a = 0;
     fp->b = 1;
   
+As in C and C++, evaluation of the ``||`` and ``&&`` logical operators is
+"short-circuited"; the right-hand side won't be evaluated if the value from
+the left-hand side determines the logical operator's value.  For example,
+in the following code, ``array[index]`` won't be evaluated for values of
+``index`` that are greater than or equal to ``NUM_ITEMS``.
+
+::
+
+    if (index < NUM_ITEMS && array[index] > 0) {
+        // ...
+    }
+
 
 Dynamic Memory Allocation
 -------------------------
diff --git a/expr.cpp b/expr.cpp
index be14fc1e..24df7fb3 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -1405,13 +1405,274 @@ BinaryExpr::BinaryExpr(Op o, Expr *a, Expr *b, SourcePos p)
 }
 
 
+/** Emit code for a && or || logical operator.  In particular, the code
+    here handles "short-circuit" evaluation, where the second expression
+    isn't evaluated if the value of the first one determines the value of
+    the result. 
+*/ 
+llvm::Value *
+lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1,
+               FunctionEmitContext *ctx, SourcePos pos) {
+
+    const Type *type0 = arg0->GetType(), *type1 = arg1->GetType();
+    if (type0 == NULL || type1 == NULL) {
+        Assert(m->errorCount > 0);
+        return NULL;
+    }
+
+    // There is overhead (branches, etc.) to short-circuiting, so if the
+    // right side of the expression is a) relatively simple, and b) can be
+    // safely executed with an all-off execution mask, then we just
+    // evaluate both sides and then the logical operator in that case.
+    // FIXME: not sure what we should do about vector types here...
+    bool shortCircuit = (EstimateCost(arg1) > PREDICATE_SAFE_IF_STATEMENT_COST ||
+                         SafeToRunWithMaskAllOff(arg1) == false ||
+                         dynamic_cast<const VectorType *>(type0) != NULL ||
+                         dynamic_cast<const VectorType *>(type1) != NULL);
+    if (shortCircuit == false) {
+        // If one of the operands is uniform but the other is varying,
+        // promote the uniform one to varying
+        if (type0->IsUniformType() && type1->IsVaryingType()) {
+            arg0 = TypeConvertExpr(arg0, AtomicType::VaryingBool, lOpString(op));
+            Assert(arg0 != NULL);
+        }
+        if (type1->IsUniformType() && type0->IsVaryingType()) {
+            arg1 = TypeConvertExpr(arg1, AtomicType::VaryingBool, lOpString(op));
+            Assert(arg1 != NULL);
+        }
+
+        llvm::Value *value0 = arg0->GetValue(ctx);
+        llvm::Value *value1 = arg1->GetValue(ctx);
+        if (value0 == NULL || value1 == NULL) {
+            Assert(m->errorCount > 0);
+            return NULL;
+        }
+
+        if (op == BinaryExpr::LogicalAnd)
+            return ctx->BinaryOperator(llvm::Instruction::And, value0, value1,
+                                       "logical_and");
+        else {
+            Assert(op == BinaryExpr::LogicalOr);
+            return ctx->BinaryOperator(llvm::Instruction::Or, value0, value1, 
+                                       "logical_or");
+        }
+    }
+
+    // Allocate temporary storage for the return value
+    const Type *retType = Type::MoreGeneralType(type0, type1, pos, lOpString(op));
+    LLVM_TYPE_CONST llvm::Type *llvmRetType = retType->LLVMType(g->ctx);
+    llvm::Value *retPtr = ctx->AllocaInst(llvmRetType, "logical_op_mem");
+
+    llvm::BasicBlock *bbSkipEvalValue1 = ctx->CreateBasicBlock("skip_eval_1");
+    llvm::BasicBlock *bbEvalValue1 = ctx->CreateBasicBlock("eval_1");
+    llvm::BasicBlock *bbLogicalDone = ctx->CreateBasicBlock("logical_op_done");
+
+    // Evaluate the first operand
+    llvm::Value *value0 = arg0->GetValue(ctx);
+    if (value0 == NULL) {
+        Assert(m->errorCount > 0);
+        return NULL;
+    }
+
+    if (type0->IsUniformType()) {
+        // Check to see if the value of the first operand is true or false
+        llvm::Value *value0True = 
+            ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
+                         value0, LLVMTrue);
+
+        if (op == BinaryExpr::LogicalOr) {
+            // For ||, if value0 is true, then we skip evaluating value1
+            // entirely.
+            ctx->BranchInst(bbSkipEvalValue1, bbEvalValue1, value0True);
+
+            // If value0 is true, the complete result is true (either
+            // uniform or varying)
+            ctx->SetCurrentBasicBlock(bbSkipEvalValue1);
+            llvm::Value *trueValue = retType->IsUniformType() ? LLVMTrue :
+                LLVMMaskAllOn;
+            ctx->StoreInst(trueValue, retPtr);
+            ctx->BranchInst(bbLogicalDone);
+        }
+        else {
+            Assert(op == BinaryExpr::LogicalAnd);
+
+            // Conversely, for &&, if value0 is false, we skip evaluating
+            // value1.
+            ctx->BranchInst(bbEvalValue1, bbSkipEvalValue1, value0True);
+
+            // In this case, the complete result is false (again, either a
+            // uniform or varying false).
+            ctx->SetCurrentBasicBlock(bbSkipEvalValue1);
+            llvm::Value *falseValue = retType->IsUniformType() ? LLVMFalse :
+                LLVMMaskAllOff;
+            ctx->StoreInst(falseValue, retPtr);
+            ctx->BranchInst(bbLogicalDone);
+        }
+
+        // Both || and && are in the same situation if the first operand's
+        // value didn't resolve the final result: they need to evaluate the
+        // value of the second operand, which in turn gives the value for
+        // the full expression.
+        ctx->SetCurrentBasicBlock(bbEvalValue1);
+        if (type1->IsUniformType() && retType->IsVaryingType()) {
+            arg1 = TypeConvertExpr(arg1, AtomicType::VaryingBool, "logical op");
+            Assert(arg1 != NULL);
+        }
+
+        llvm::Value *value1 = arg1->GetValue(ctx);
+        if (value1 == NULL) {
+            Assert(m->errorCount > 0);
+            return NULL;
+        }
+        ctx->StoreInst(value1, retPtr);
+        ctx->BranchInst(bbLogicalDone);
+
+        // In all cases, we end up at the bbLogicalDone basic block;
+        // loading the value stored in retPtr in turn gives the overall
+        // result.
+        ctx->SetCurrentBasicBlock(bbLogicalDone);
+        return ctx->LoadInst(retPtr);
+    }
+    else {
+        // Otherwise, the first operand is varying...  Save the current
+        // value of the mask so that we can restore it at the end.
+        llvm::Value *oldMask = ctx->GetInternalMask();
+
+        // Convert the second operand to be varying as well, so that we can
+        // perform logical vector ops with its value.
+        if (type1->IsUniformType()) {
+            arg1 = TypeConvertExpr(arg1, AtomicType::VaryingBool, "logical op");
+            Assert(arg1 != NULL);
+            type1 = arg1->GetType();
+        }
+
+        if (op == BinaryExpr::LogicalOr) {
+            // See if value0 is true for all currently executing
+            // lanes--i.e. if (value0 & mask) == mask.  If so, we don't
+            // need to evaluate the second operand of the expression.
+            llvm::Value *value0AndMask = 
+                ctx->BinaryOperator(llvm::Instruction::And, value0, oldMask, 
+                                    "op&mask");
+            llvm::Value *equalsMask =
+                ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
+                             value0AndMask, oldMask, "value0&mask==mask");
+            equalsMask = ctx->I1VecToBoolVec(equalsMask);
+            llvm::Value *allMatch = ctx->All(equalsMask);
+            ctx->BranchInst(bbSkipEvalValue1, bbEvalValue1, allMatch);
+
+            // value0 is true for all running lanes, so it can be used for
+            // the final result
+            ctx->SetCurrentBasicBlock(bbSkipEvalValue1);
+            ctx->StoreInst(value0, retPtr);
+            ctx->BranchInst(bbLogicalDone);
+
+            // Otherwise, we need to evaluate arg1. However, first we need
+            // to set the execution mask to be (oldMask & ~value0); in other
+            // words, only execute the instances where value0 is false.
+            // For the instances where value0 was true, we need to inhibit
+            // execution.
+            ctx->SetCurrentBasicBlock(bbEvalValue1);
+            llvm::Value *not0 = ctx->NotOperator(value0);
+            ctx->SetInternalMaskAnd(oldMask, not0);
+
+            llvm::Value *value1 = arg1->GetValue(ctx);
+            if (value1 == NULL) {
+                Assert(m->errorCount > 0);
+                return NULL;
+            }
+
+            // We need to compute the result carefully, since vector
+            // elements that were computed when the corresponding lane was
+            // disabled have undefined values:
+            // result = (value0 & old_mask) | (value1 & current_mask)
+            llvm::Value *value1AndMask =
+                ctx->BinaryOperator(llvm::Instruction::And, value1, 
+                                    ctx->GetInternalMask(), "op&mask");
+            llvm::Value *result =
+                ctx->BinaryOperator(llvm::Instruction::Or, value0AndMask, 
+                                    value1AndMask, "or_result");
+            ctx->StoreInst(result, retPtr);
+            ctx->BranchInst(bbLogicalDone);
+        }
+        else {
+            Assert(op == BinaryExpr::LogicalAnd);
+
+            // If value0 is false for all currently running lanes, the
+            // overall result must be false: this corresponds to checking
+            // if (mask & ~value0) == mask.
+            llvm::Value *notValue0 = ctx->NotOperator(value0, "not_value0");
+            llvm::Value *notValue0AndMask = 
+                ctx->BinaryOperator(llvm::Instruction::And, notValue0, oldMask, 
+                                    "not_value0&mask");
+            llvm::Value *equalsMask =
+                ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
+                             notValue0AndMask, oldMask, "not_value0&mask==mask");
+            equalsMask = ctx->I1VecToBoolVec(equalsMask);
+            llvm::Value *allMatch = ctx->All(equalsMask);
+            ctx->BranchInst(bbSkipEvalValue1, bbEvalValue1, allMatch);
+
+            // value0 was false for all running lanes, so use its value as
+            // the overall result.
+            ctx->SetCurrentBasicBlock(bbSkipEvalValue1);
+            ctx->StoreInst(value0, retPtr);
+            ctx->BranchInst(bbLogicalDone);
+
+            // Otherwise we need to evaluate value1, but again with the
+            // mask set to only be on for the lanes where value0 was true.
+            // For the lanes where value0 was false, execution needs to be
+            // disabled: mask = (mask & value0).
+            ctx->SetCurrentBasicBlock(bbEvalValue1);
+            ctx->SetInternalMaskAnd(oldMask, value0);
+
+            llvm::Value *value1 = arg1->GetValue(ctx);
+            if (value1 == NULL) {
+                Assert(m->errorCount > 0);
+                return NULL;
+            }
+
+            // And as in the || case, we compute the overall result by
+            // masking off the valid lanes before we AND them together:
+            // result = (value0 & old_mask) & (value1 & current_mask)
+            llvm::Value *value0AndMask = 
+                ctx->BinaryOperator(llvm::Instruction::And, value0, oldMask,
+                                    "op&mask");
+            llvm::Value *value1AndMask =
+                ctx->BinaryOperator(llvm::Instruction::And, value1,
+                                    ctx->GetInternalMask(), "value1&mask");
+            llvm::Value *result =
+                ctx->BinaryOperator(llvm::Instruction::And, value0AndMask, 
+                                    value1AndMask, "or_result");
+            ctx->StoreInst(result, retPtr);
+            ctx->BranchInst(bbLogicalDone);
+        }
+
+        // And finally we always end up in bbLogicalDone, where we restore
+        // the old mask and return the computed result
+        ctx->SetCurrentBasicBlock(bbLogicalDone);
+        ctx->SetInternalMask(oldMask);
+        return ctx->LoadInst(retPtr);
+    }
+}
+
+
 llvm::Value *
 BinaryExpr::GetValue(FunctionEmitContext *ctx) const {
-    if (!arg0 || !arg1)
+    if (!arg0 || !arg1) {
+        Assert(m->errorCount > 0);
         return NULL;
+    }
+
+    // Handle these specially, since we want to short-circuit their evaluation...
+    if (op == LogicalAnd || op == LogicalOr)
+        return lEmitLogicalOp(op, arg0, arg1, ctx, pos);
 
     llvm::Value *value0 = arg0->GetValue(ctx);
     llvm::Value *value1 = arg1->GetValue(ctx);
+    if (value0 == NULL || value1 == NULL) {
+        Assert(m->errorCount > 0);
+        return NULL;
+    }
+
     ctx->SetDebugPos(pos);
 
     switch (op) {
@@ -1441,12 +1702,6 @@ BinaryExpr::GetValue(FunctionEmitContext *ctx) const {
         return lEmitBinaryBitOp(op, value0, value1, 
                                 arg0->GetType()->IsUnsignedType(), ctx);
     }
-    case LogicalAnd:
-        return ctx->BinaryOperator(llvm::Instruction::And, value0, value1,
-                                   "logical_and");
-    case LogicalOr:
-        return ctx->BinaryOperator(llvm::Instruction::Or, value0, value1, 
-                                   "logical_or");
     case Comma:
         return value1;
     default:
@@ -2017,12 +2272,15 @@ BinaryExpr::TypeCheck() {
     }
     case LogicalAnd:
     case LogicalOr: {
-        // We need to type convert to a boolean type of the more general
-        // shape of the two types
-        bool isUniform = (type0->IsUniformType() && type1->IsUniformType());
-        const AtomicType *boolType = isUniform ? AtomicType::UniformBool : 
-                                                 AtomicType::VaryingBool;
-        const Type *destType = NULL;
+        // For now, we just type convert to boolean types, of the same
+        // variability as the original types.  (When generating code, it's
+        // useful to have preserved the uniform/varying distinction.)
+        const AtomicType *boolType0 = type0->IsUniformType() ? 
+            AtomicType::UniformBool : AtomicType::VaryingBool;
+        const AtomicType *boolType1 = type1->IsUniformType() ? 
+            AtomicType::UniformBool : AtomicType::VaryingBool;
+
+        const Type *destType0 = NULL, *destType1 = NULL;
         const VectorType *vtype0 = dynamic_cast<const VectorType *>(type0);
         const VectorType *vtype1 = dynamic_cast<const VectorType *>(type1);
         if (vtype0 && vtype1) {
@@ -2032,17 +2290,24 @@ BinaryExpr::TypeCheck() {
                       "different sizes (%d vs. %d).", lOpString(op), sz0, sz1);
                 return NULL;
             }
-            destType = new VectorType(boolType, sz0);
+            destType0 = new VectorType(boolType0, sz0);
+            destType1 = new VectorType(boolType1, sz1);
+        }
+        else if (vtype0 != NULL) {
+            destType0 = new VectorType(boolType0, vtype0->GetElementCount());
+            destType1 = new VectorType(boolType1, vtype0->GetElementCount());
+        }
+        else if (vtype1 != NULL) {
+            destType0 = new VectorType(boolType0, vtype1->GetElementCount());
+            destType1 = new VectorType(boolType1, vtype1->GetElementCount());
+        }
+        else {
+            destType0 = boolType0;
+            destType1 = boolType1;
         }
-        else if (vtype0)
-            destType = new VectorType(boolType, vtype0->GetElementCount());
-        else if (vtype1)
-            destType = new VectorType(boolType, vtype1->GetElementCount());
-        else
-            destType = boolType;
 
-        arg0 = TypeConvertExpr(arg0, destType, lOpString(op));
-        arg1 = TypeConvertExpr(arg1, destType, lOpString(op));
+        arg0 = TypeConvertExpr(arg0, destType0, lOpString(op));
+        arg1 = TypeConvertExpr(arg1, destType1, lOpString(op));
         if (arg0 == NULL || arg1 == NULL)
             return NULL;
         return this;
diff --git a/stmt.cpp b/stmt.cpp
index 5e5fe27d..617abfdf 100644
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -473,112 +473,6 @@ IfStmt::emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask,
 }
 
 
-/** Given an AST node, check to see if it's safe if we happen to run the
-    code for that node with the execution mask all off.
- */
-static bool
-lCheckAllOffSafety(ASTNode *node, void *data) {
-    bool *okPtr = (bool *)data;
-
-    if (dynamic_cast<FunctionCallExpr *>(node) != NULL) {
-        // FIXME: If we could somehow determine that the function being
-        // called was safe (and all of the args Exprs were safe, then it'd
-        // be nice to be able to return true here.  (Consider a call to
-        // e.g. floatbits() in the stdlib.)  Unfortunately for now we just
-        // have to be conservative.
-        *okPtr = false;
-        return false;
-    }
-
-    if (dynamic_cast<AssertStmt *>(node) != NULL) {
-        // While it's fine to run the assert for varying tests, it's not
-        // desirable to check an assert on a uniform variable if all of the
-        // lanes are off.
-        *okPtr = false;
-        return false;
-    }
-
-    if (dynamic_cast<NewExpr *>(node) != NULL ||
-        dynamic_cast<DeleteStmt *>(node) != NULL) {
-        // We definitely don't want to run the uniform variants of these if
-        // the mask is all off.  It's also worth skipping the overhead of
-        // executing the varying versions of them in the all-off mask case.
-        *okPtr = false;
-        return false;
-    }
-
-    if (g->target.allOffMaskIsSafe == true)
-        // Don't worry about memory accesses if we have a target that can
-        // safely run them with the mask all off
-        return true;
-
-    IndexExpr *ie;
-    if ((ie = dynamic_cast<IndexExpr *>(node)) != NULL && ie->baseExpr != NULL) {
-        const Type *type = ie->baseExpr->GetType();
-        if (type == NULL)
-            return true;
-        if (dynamic_cast<const ReferenceType *>(type) != NULL)
-            type = type->GetReferenceTarget();
-
-        ConstExpr *ce = dynamic_cast<ConstExpr *>(ie->index);
-        if (ce == NULL) {
-            // indexing with a variable... -> not safe
-            *okPtr = false;
-            return false;
-        }
-
-        const PointerType *pointerType = 
-            dynamic_cast<const PointerType *>(type);
-        if (pointerType != NULL) {
-            // pointer[index] -> can't be sure -> not safe
-            *okPtr = false;
-            return false;
-        }
-
-        const SequentialType *seqType = 
-            dynamic_cast<const SequentialType *>(type);
-        Assert(seqType != NULL);
-        int nElements = seqType->GetElementCount();
-        if (nElements == 0) {
-            // Unsized array, so we can't be sure -> not safe
-            *okPtr = false;
-            return false;
-        }
-
-        int32_t indices[ISPC_MAX_NVEC];
-        int count = ce->AsInt32(indices);
-        for (int i = 0; i < count; ++i) {
-            if (indices[i] < 0 || indices[i] >= nElements) {
-                // Index is out of bounds -> not safe
-                *okPtr = false;
-                return false;
-            }
-        }
-
-        // All indices are in-bounds
-        return true;
-    }
-
-    MemberExpr *me;
-    if ((me = dynamic_cast<MemberExpr *>(node)) != NULL &&
-        me->dereferenceExpr) {
-        *okPtr = false;
-        return false;
-    }
-
-    DereferenceExpr *de;
-    if ((de = dynamic_cast<DereferenceExpr *>(node)) != NULL) {
-        const Type *exprType = de->expr->GetType();
-        if (dynamic_cast<const PointerType *>(exprType) != NULL) {
-            *okPtr = false;
-            return false;
-        }
-    }
-
-    return true;
-}
-
-
 /** Emit code for an if test that checks the mask and the test values and
     tries to be smart about jumping over code that doesn't need to be run.
  */
@@ -632,7 +526,7 @@ IfStmt::emitVaryingIf(FunctionEmitContext *ctx, llvm::Value *ltest) const {
         //
         // Where the overhead of checking if any of the program instances wants
         // to run one side or the other is more than the actual computation.
-        // The lSafeToRunWithAllLanesOff() checks to make sure that we don't do this
+        // SafeToRunWithMaskAllOff() checks to make sure that we don't do this
         // for potentially dangerous code like:
         //
         // if (index < count) array[index] = 0;
@@ -644,9 +538,8 @@ IfStmt::emitVaryingIf(FunctionEmitContext *ctx, llvm::Value *ltest) const {
         bool costIsAcceptable = (trueFalseCost <
                                  PREDICATE_SAFE_IF_STATEMENT_COST);
 
-        bool safeToRunWithAllLanesOff = true;
-        WalkAST(trueStmts, lCheckAllOffSafety, NULL, &safeToRunWithAllLanesOff);
-        WalkAST(falseStmts, lCheckAllOffSafety, NULL, &safeToRunWithAllLanesOff);
+        bool safeToRunWithAllLanesOff = (SafeToRunWithMaskAllOff(trueStmts) &&
+                                         SafeToRunWithMaskAllOff(falseStmts));
 
         if (safeToRunWithAllLanesOff &&
             (costIsAcceptable || g->opt.disableCoherentControlFlow)) {
@@ -1984,9 +1877,7 @@ lCheckMask(Stmt *stmts) {
         return false;
 
     int cost = EstimateCost(stmts);
-
-    bool safeToRunWithAllLanesOff = true;
-    WalkAST(stmts, lCheckAllOffSafety, NULL, &safeToRunWithAllLanesOff);
+    bool safeToRunWithAllLanesOff = SafeToRunWithMaskAllOff(stmts);
 
     // The mask should be checked if the code following the
     // 'case'/'default' is relatively complex, or if it would be unsafe to
diff --git a/tests/short-circuit-1.ispc b/tests/short-circuit-1.ispc
new file mode 100644
index 00000000..b2b0faaa
--- /dev/null
+++ b/tests/short-circuit-1.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+uniform int * uniform ptr;
+
+uniform bool crash() {
+    return *ptr > 0;
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    uniform float a0 = aFOO[0], a1 = aFOO[1];
+    if (a0 < a1 || crash())
+        RET[programIndex] = 1;
+    else
+        RET[programIndex] = 0;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1;
+}
diff --git a/tests/short-circuit-10.ispc b/tests/short-circuit-10.ispc
new file mode 100644
index 00000000..05c93793
--- /dev/null
+++ b/tests/short-circuit-10.ispc
@@ -0,0 +1,24 @@
+
+export uniform int width() { return programCount; }
+
+uniform int * uniform ptr;
+
+bool crashEven() {
+    if (programIndex & 1) 
+        return true;
+    else
+        return (*ptr > 0);
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    float a0 = aFOO[0], a1 = aFOO[1];
+    if (((programIndex & 1) == 0) || crashEven())
+        RET[programIndex] = 1;
+    else
+        RET[programIndex] = 0;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1;
+}
diff --git a/tests/short-circuit-11.ispc b/tests/short-circuit-11.ispc
new file mode 100644
index 00000000..6ba1de89
--- /dev/null
+++ b/tests/short-circuit-11.ispc
@@ -0,0 +1,25 @@
+
+export uniform int width() { return programCount; }
+
+uniform int * uniform ptr;
+
+bool crashEven() {
+//CO    return (programIndex & 1) ? true : (*ptr > 0);
+    if (programIndex & 1) 
+        return true;
+    else
+        return (*ptr > 0);
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    float a0 = aFOO[0], a1 = aFOO[1];
+    if (((programIndex & 1) == 1) && crashEven())
+        RET[programIndex] = 1;
+    else
+        RET[programIndex] = 2;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = (programIndex & 1) ? 1 : 2;
+}
diff --git a/tests/short-circuit-2.ispc b/tests/short-circuit-2.ispc
new file mode 100644
index 00000000..36689829
--- /dev/null
+++ b/tests/short-circuit-2.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+uniform int * uniform ptr;
+
+uniform bool crash() {
+    return *ptr > 0;
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    uniform float a0 = aFOO[0], a1 = aFOO[1];
+    if (a0 > a1 && crash())
+        RET[programIndex] = 0;
+    else
+        RET[programIndex] = 1;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1;
+}
diff --git a/tests/short-circuit-3.ispc b/tests/short-circuit-3.ispc
new file mode 100644
index 00000000..b5dba1e3
--- /dev/null
+++ b/tests/short-circuit-3.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+uniform int * uniform ptr;
+
+bool crash() {
+    return *ptr > 0;
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    uniform float a0 = aFOO[0], a1 = aFOO[1];
+    if (a0 < a1 || crash())
+        RET[programIndex] = 1;
+    else
+        RET[programIndex] = 0;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1;
+}
diff --git a/tests/short-circuit-4.ispc b/tests/short-circuit-4.ispc
new file mode 100644
index 00000000..2b3a1411
--- /dev/null
+++ b/tests/short-circuit-4.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+uniform int * uniform ptr;
+
+bool crash() {
+    return *ptr > 0;
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    uniform float a0 = aFOO[0], a1 = aFOO[1];
+    if (a0 > a1 && crash())
+        RET[programIndex] = 0;
+    else
+        RET[programIndex] = 1;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1;
+}
diff --git a/tests/short-circuit-5.ispc b/tests/short-circuit-5.ispc
new file mode 100644
index 00000000..d7bf21dc
--- /dev/null
+++ b/tests/short-circuit-5.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+uniform int * uniform ptr;
+
+bool crash() {
+    return *ptr > 0;
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    float a0 = aFOO[0], a1 = aFOO[1];
+    if (a0 < a1 || crash())
+        RET[programIndex] = 1;
+    else
+        RET[programIndex] = 0;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1;
+}
diff --git a/tests/short-circuit-6.ispc b/tests/short-circuit-6.ispc
new file mode 100644
index 00000000..9a2adb55
--- /dev/null
+++ b/tests/short-circuit-6.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+uniform int * uniform ptr;
+
+bool crash() {
+    return *ptr > 0;
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    float a0 = aFOO[0], a1 = aFOO[1];
+    if (a0 > a1 && crash())
+        RET[programIndex] = 0;
+    else
+        RET[programIndex] = 1;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1;
+}
diff --git a/tests/short-circuit-7.ispc b/tests/short-circuit-7.ispc
new file mode 100644
index 00000000..840da795
--- /dev/null
+++ b/tests/short-circuit-7.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+uniform int * uniform ptr;
+
+uniform bool crash() {
+    return *ptr > 0;
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    float a0 = aFOO[0], a1 = aFOO[1];
+    if (a0 < a1 || crash())
+        RET[programIndex] = 1;
+    else
+        RET[programIndex] = 0;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1;
+}
diff --git a/tests/short-circuit-8.ispc b/tests/short-circuit-8.ispc
new file mode 100644
index 00000000..46810906
--- /dev/null
+++ b/tests/short-circuit-8.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+uniform int * uniform ptr;
+
+uniform bool crash() {
+    return *ptr > 0;
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    float a0 = aFOO[0], a1 = aFOO[1];
+    if (a0 > a1 && crash())
+        RET[programIndex] = 0;
+    else
+        RET[programIndex] = 1;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1;
+}
diff --git a/tests/short-circuit-9.ispc b/tests/short-circuit-9.ispc
new file mode 100644
index 00000000..9a2adb55
--- /dev/null
+++ b/tests/short-circuit-9.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+uniform int * uniform ptr;
+
+bool crash() {
+    return *ptr > 0;
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    float a0 = aFOO[0], a1 = aFOO[1];
+    if (a0 > a1 && crash())
+        RET[programIndex] = 0;
+    else
+        RET[programIndex] = 1;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1;
+}

From 950f86200bfef6e9cde59a9b5e690533c02e78a1 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Mon, 30 Jan 2012 15:03:54 -0800
Subject: [PATCH 25/62] Fix examples/tasksys.cpp to compile with 32-bit
 targets.

(Change a cmpxchgd to cmpxchgl.)  Note that a number of the examples
still don't work with 32-bit compilation; the reason is still TBD.
---
 examples/tasksys.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/tasksys.cpp b/examples/tasksys.cpp
index 92dc50f0..4ce5d354 100644
--- a/examples/tasksys.cpp
+++ b/examples/tasksys.cpp
@@ -273,7 +273,7 @@ lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
 #else
     void *result;
 #if (ISPC_POINTER_BYTES == 4)
-    __asm__ __volatile__("lock\ncmpxchgd %2,%1"
+    __asm__ __volatile__("lock\ncmpxchgl %2,%1"
                           : "=a"(result), "=m"(*v)
                           : "q"(newValue), "0"(oldValue)
                           : "memory");

From 1eec27f890366e7640e547b913fdde7349ff4107 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Tue, 31 Jan 2012 08:52:11 -0800
Subject: [PATCH 26/62] Scalar target fixes.

Don't issue warnings about all instances writing to the same location if
there is only one program instance in the gang.

Be sure to report that all values are equal in one-element vectors in
LLVMVectorValuesAllEqual().

Issue #166.
---
 llvmutil.cpp |  3 +++
 opt.cpp      | 18 +++++++++++-------
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/llvmutil.cpp b/llvmutil.cpp
index 4ae07b96..92abd392 100644
--- a/llvmutil.cpp
+++ b/llvmutil.cpp
@@ -597,6 +597,9 @@ LLVMFlattenInsertChain(llvm::InsertElementInst *ie, int vectorWidth,
 bool
 LLVMVectorValuesAllEqual(llvm::Value *v, int vectorLength,
                          std::vector<llvm::PHINode *> &seenPhis) {
+    if (vectorLength == 1)
+        return true;
+
     if (llvm::isa<llvm::ConstantAggregateZero>(v))
         return true;
 
diff --git a/opt.cpp b/opt.cpp
index b4570235..c564aa21 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -2515,10 +2515,12 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
             }
             else {
                 // A scatter with everyone going to the same location is
-                // undefined.  Issue a warning and arbitrarily let the
+                // undefined (if there's more than one program instance in
+                // the gang).  Issue a warning and arbitrarily let the
                 // first guy win.
-                Warning(pos, "Undefined behavior: all program instances are "
-                        "writing to the same location!");
+                if (g->target.vectorWidth > 1)
+                    Warning(pos, "Undefined behavior: all program instances are "
+                            "writing to the same location!");
 
                 llvm::Value *first = 
                     llvm::ExtractElementInst::Create(storeValue, LLVMInt32(0), "rvalue_first",
@@ -2694,10 +2696,12 @@ PseudoGSToGSPass::runOnBasicBlock(llvm::BasicBlock &bb) {
         Assert(ok);     
 
         callInst->setCalledFunction(info->actualFunc);
-        if (info->isGather)
-            PerformanceWarning(pos, "Gather required to compute value in expression.");
-        else
-            PerformanceWarning(pos, "Scatter required for storing value.");
+        if (g->target.vectorWidth > 1) {
+            if (info->isGather)
+                PerformanceWarning(pos, "Gather required to compute value in expression.");
+            else
+                PerformanceWarning(pos, "Scatter required for storing value.");
+        }
         modifiedAny = true;
         goto restart;
     }

From 25665f0841fcee0f0a9a26338ef8555cd73f0570 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Tue, 31 Jan 2012 09:37:39 -0800
Subject: [PATCH 27/62] Implement NullPointerExpr::GetConstant()

Also reworked TypeCastExpr::GetConstant() to just forward the request along
and moved the code that was previously there to handle uniform->varying
smears of function pointers to FunctionSymbolExpr::GetConstant().

Fixes issue #168.
---
 expr.cpp | 72 ++++++++++++++++++++++++++++++++++++--------------------
 expr.h   |  1 +
 2 files changed, 47 insertions(+), 26 deletions(-)

diff --git a/expr.cpp b/expr.cpp
index 24df7fb3..65907d94 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -6037,28 +6037,15 @@ llvm::Constant *
 TypeCastExpr::GetConstant(const Type *constType) const {
     // We don't need to worry about most the basic cases where the type
     // cast can resolve to a constant here, since the
-    // TypeCastExpr::Optimize() method ends up doing the type conversion
-    // and returning a ConstExpr, which in turn will have its GetConstant()
-    // method called.  Thus, the only case we do need to worry about here
-    // is converting a uniform function pointer to a varying function
-    // pointer of the same type.
+    // TypeCastExpr::Optimize() method generally ends up doing the type
+    // conversion and returning a ConstExpr, which in turn will have its
+    // GetConstant() method called.  However, because ConstExpr currently
+    // can't represent pointer values, we have to handle two cases here:
+    // 1. Null pointers (NULL, 0) valued initializers, and
+    // 2. Converting a uniform function pointer to a varying function
+    //    pointer of the same type.
     Assert(Type::Equal(constType, type));
-    const FunctionType *ft = NULL;
-    if (dynamic_cast<const PointerType *>(type) == NULL ||
-        (ft = dynamic_cast<const FunctionType *>(type->GetBaseType())) == NULL)
-        return NULL;
-
-    llvm::Constant *ec = expr->GetConstant(expr->GetType());
-    if (ec == NULL)
-        return NULL;
-
-    ec = llvm::ConstantExpr::getPtrToInt(ec, LLVMTypes::PointerIntType);
-
-    Assert(type->IsVaryingType());
-    std::vector<llvm::Constant *> smear;
-    for (int i = 0; i < g->target.vectorWidth; ++i)
-        smear.push_back(ec);
-    return llvm::ConstantVector::get(smear);
+    return expr->GetConstant(constType);
 }
 
 
@@ -6553,13 +6540,30 @@ FunctionSymbolExpr::Print() const {
 
 llvm::Constant *
 FunctionSymbolExpr::GetConstant(const Type *type) const {
-    Assert(type->IsUniformType());
-    Assert(GetType()->IsUniformType());
-
-    if (Type::EqualIgnoringConst(type, GetType()) == false)
+    if (matchingFunc == NULL || matchingFunc->function == NULL)
         return NULL;
 
-    return matchingFunc ? matchingFunc->function : NULL;
+    const FunctionType *ft;
+    if (dynamic_cast<const PointerType *>(type) == NULL ||
+        (ft = dynamic_cast<const FunctionType *>(type->GetBaseType())) == NULL)
+        return NULL;
+
+    LLVM_TYPE_CONST llvm::Type *llvmUnifType = 
+        type->GetAsUniformType()->LLVMType(g->ctx);
+    if (llvmUnifType != matchingFunc->function->getType())
+        return NULL;
+
+    if (type->IsUniformType())
+        return matchingFunc->function;
+    else {
+        llvm::Constant *intPtr = 
+            llvm::ConstantExpr::getPtrToInt(matchingFunc->function, 
+                                            LLVMTypes::PointerIntType);
+        std::vector<llvm::Constant *> smear;
+        for (int i = 0; i < g->target.vectorWidth; ++i)
+            smear.push_back(intPtr);
+        return llvm::ConstantVector::get(smear);
+    }
 }
 
 
@@ -6985,6 +6989,22 @@ NullPointerExpr::Optimize() {
 }
 
 
+llvm::Constant *
+NullPointerExpr::GetConstant(const Type *type) const {
+    const PointerType *pt = dynamic_cast<const PointerType *>(type);
+    if (pt == NULL)
+        return NULL;
+
+    LLVM_TYPE_CONST llvm::Type *llvmType = type->LLVMType(g->ctx);
+    if (llvmType == NULL) {
+        Assert(m->errorCount > 0);
+        return NULL;
+    }
+
+    return llvm::Constant::getNullValue(llvmType);
+}
+
+
 void
 NullPointerExpr::Print() const {
     printf("NULL");
diff --git a/expr.h b/expr.h
index fad1d0bc..70224a7f 100644
--- a/expr.h
+++ b/expr.h
@@ -684,6 +684,7 @@ public:
     const Type *GetType() const;
     Expr *TypeCheck();
     Expr *Optimize();
+    llvm::Constant *GetConstant(const Type *type) const;
     void Print() const;
     int EstimateCost() const;
 };

From d71c49494f571f89b287594d001b0a0563b5f60c Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Tue, 31 Jan 2012 11:02:23 -0800
Subject: [PATCH 28/62] Missed pass that should be skipped when pseudo memory
 ops are supposed to be left unchanged.

---
 opt.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/opt.cpp b/opt.cpp
index c564aa21..77663544 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -368,8 +368,10 @@ Optimize(llvm::Module *module, int optLevel) {
             optPM.add(CreateMaskedStoreOptPass());
             optPM.add(CreateMaskedLoadOptPass());
         }
-        optPM.add(CreatePseudoMaskedStorePass());
-        if (!g->opt.disableGatherScatterOptimizations)
+        if (g->opt.disableHandlePseudoMemoryOps == false)
+            optPM.add(CreatePseudoMaskedStorePass());
+        if (g->opt.disableGatherScatterOptimizations == false &&
+            g->opt.disableHandlePseudoMemoryOps == false)
             optPM.add(CreateGSToLoadStorePass());
         if (g->opt.disableHandlePseudoMemoryOps == false) {
             optPM.add(CreatePseudoMaskedStorePass());
@@ -1865,6 +1867,7 @@ MaskedStoreOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
             goto restart;
         }
     }
+
     return modifiedAny;
 }
 
@@ -2102,6 +2105,7 @@ PseudoMaskedStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
         modifiedAny = true;
         goto restart;
     }
+
     return modifiedAny;
 }
 

From f73abb05a7c7f43f7ed5a4c2beb64763bfbad8f3 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Tue, 31 Jan 2012 11:06:14 -0800
Subject: [PATCH 29/62] Fix bug in handling scatters where all instances go to
 the same location.

Previously, we'd pick one lane and generate a regular store for its value.
This was the wrong thing to do, since we also should have been checking
that the mask was on (for the lane that was chosen).  This bug didn't
become evident until the scalar target was added, since many stores fall
into this case with that target.

Now, we just leave those as regular scatters.

Fixes most of the failing tests for the scalar target listed in issue #167.
---
 opt.cpp | 94 ++++++++++++++++++++++++++-------------------------------
 1 file changed, 43 insertions(+), 51 deletions(-)

diff --git a/opt.cpp b/opt.cpp
index 77663544..5e74bd5a 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -2494,7 +2494,6 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
                                          constOffsets, "varying+const_offsets",
                                          callInst);
 
-        {
         std::vector<llvm::PHINode *> seenPhis;
         if (LLVMVectorValuesAllEqual(fullOffsets, g->target.vectorWidth, seenPhis)) {
             // If all the offsets are equal, then compute the single
@@ -2516,68 +2515,61 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
                               "load_braodcast");
                 lCopyMetadata(newCall, callInst);
                 llvm::ReplaceInstWithInst(callInst, newCall);
+
+                modifiedAny = true;
+                goto restart;
             }
             else {
                 // A scatter with everyone going to the same location is
                 // undefined (if there's more than one program instance in
-                // the gang).  Issue a warning and arbitrarily let the
-                // first guy win.
+                // the gang).  Issue a warning.
                 if (g->target.vectorWidth > 1)
                     Warning(pos, "Undefined behavior: all program instances are "
                             "writing to the same location!");
 
-                llvm::Value *first = 
-                    llvm::ExtractElementInst::Create(storeValue, LLVMInt32(0), "rvalue_first",
-                                                     callInst);
-                lCopyMetadata(first, callInst);
+                // We could do something similar to the gather case, where
+                // we arbitrarily write one of the values, but we need to
+                // a) check to be sure the mask isn't all off and b) pick
+                // the value from an executing program instance in that
+                // case.  We'll just let a bunch of the program instances
+                // do redundant writes, since this isn't important to make
+                // fast anyway...
+            }
+        }
+        else {
+            int step = gatherInfo ? gatherInfo->align : scatterInfo->align;
 
-                ptr = new llvm::BitCastInst(ptr, llvm::PointerType::get(first->getType(), 0),
-                                            "ptr2rvalue_type", callInst);
+            std::vector<llvm::PHINode *> seenPhis;
+            if (step > 0 && lVectorIsLinear(fullOffsets, g->target.vectorWidth, 
+                                            step, seenPhis)) {
+                // We have a linear sequence of memory locations being accessed
+                // starting with the location given by the offset from
+                // offsetElements[0], with stride of 4 or 8 bytes (for 32 bit
+                // and 64 bit gather/scatters, respectively.)
+                llvm::Value *ptr = lComputeCommonPointer(base, fullOffsets, callInst);
                 lCopyMetadata(ptr, callInst);
 
-                llvm::Instruction *sinst = new llvm::StoreInst(first, ptr, false, 
-                                                               scatterInfo->align);
-                lCopyMetadata(sinst, callInst);
-                llvm::ReplaceInstWithInst(callInst, sinst);
+                if (gatherInfo != NULL) {
+                    Debug(pos, "Transformed gather to unaligned vector load!");
+                    llvm::Instruction *newCall = 
+                        lCallInst(gatherInfo->loadMaskedFunc, ptr, mask, "masked_load");
+                    lCopyMetadata(newCall, callInst);
+                    llvm::ReplaceInstWithInst(callInst, newCall);
+                }
+                else {
+                    Debug(pos, "Transformed scatter to unaligned vector store!");
+                    ptr = new llvm::BitCastInst(ptr, scatterInfo->vecPtrType, "ptrcast", 
+                                                callInst);
+                    llvm::Instruction *newCall =
+                        lCallInst(scatterInfo->maskedStoreFunc, ptr, storeValue, 
+                                  mask, "");
+                    lCopyMetadata(newCall, callInst);
+                    llvm::ReplaceInstWithInst(callInst, newCall);
+                }
+
+                modifiedAny = true;
+                goto restart;
             }
-
-            modifiedAny = true;
-            goto restart;
-        }
-        }
-
-        int step = gatherInfo ? gatherInfo->align : scatterInfo->align;
-
-        std::vector<llvm::PHINode *> seenPhis;
-        if (step > 0 && lVectorIsLinear(fullOffsets, g->target.vectorWidth, 
-                                        step, seenPhis)) {
-            // We have a linear sequence of memory locations being accessed
-            // starting with the location given by the offset from
-            // offsetElements[0], with stride of 4 or 8 bytes (for 32 bit
-            // and 64 bit gather/scatters, respectively.)
-            llvm::Value *ptr = lComputeCommonPointer(base, fullOffsets, callInst);
-            lCopyMetadata(ptr, callInst);
-
-            if (gatherInfo != NULL) {
-                Debug(pos, "Transformed gather to unaligned vector load!");
-                llvm::Instruction *newCall = 
-                    lCallInst(gatherInfo->loadMaskedFunc, ptr, mask, "masked_load");
-                lCopyMetadata(newCall, callInst);
-                llvm::ReplaceInstWithInst(callInst, newCall);
-            }
-            else {
-                Debug(pos, "Transformed scatter to unaligned vector store!");
-                ptr = new llvm::BitCastInst(ptr, scatterInfo->vecPtrType, "ptrcast", 
-                                            callInst);
-                llvm::Instruction *newCall =
-                    lCallInst(scatterInfo->maskedStoreFunc, ptr, storeValue, 
-                              mask, "");
-                lCopyMetadata(newCall, callInst);
-                llvm::ReplaceInstWithInst(callInst, newCall);
-            }
-
-            modifiedAny = true;
-            goto restart;
         }
     }
 

From ea027a95a8a0419cc0c2a1114c71df170d28d816 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Tue, 31 Jan 2012 11:46:33 -0800
Subject: [PATCH 30/62] Fix various places in deferred shading example that
 assumed programCount >= 4.

This gets the deferred shading example closer to working with the scalar
target, but there are still some issues.  (Partially in gamma correction /
final clamping, it seems.)

This fix causes a ~0.5% performance degradation with e.g. the AVX target,
though it's not clear that it's worth having a separate code path in order to
not lose this small amount of perf.

(Partially addresses issue #167)
---
 examples/deferred/kernels.ispc | 80 ++++++++++++----------------------
 1 file changed, 27 insertions(+), 53 deletions(-)

diff --git a/examples/deferred/kernels.ispc b/examples/deferred/kernels.ispc
index 8117e8a9..ae0542b2 100644
--- a/examples/deferred/kernels.ispc
+++ b/examples/deferred/kernels.ispc
@@ -158,38 +158,22 @@ IntersectLightsWithTileMinMax(
     uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
     uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
         
-    // Parallize across frustum planes.
-    // We really only have four side planes here, but write the code to
-    // handle programCount > 4 robustly
-    uniform float frustumPlanes_xy[programCount];
-    uniform float frustumPlanes_z[programCount];
+    uniform float frustumPlanes_xy[4] = {
+        -(cameraProj_11 * gBufferScale_x),
+         (cameraProj_11 * gBufferScale_x),
+         (cameraProj_22 * gBufferScale_y),
+        -(cameraProj_22 * gBufferScale_y) };
+    uniform float frustumPlanes_z[4] = {
+         tileEndX - gBufferScale_x,
+        -tileStartX + gBufferScale_x,
+         tileEndY - gBufferScale_y,
+        -tileStartY + gBufferScale_y };
 
-    // TODO: If programIndex < 4 here? Don't care about masking off the
-    // rest but if interleaving ("x2" modes) the other lanes should ideally
-    // not be emitted...
-    {
-        // This one is totally constant over the whole screen... worth pulling it up at all?
-        float frustumPlanes_xy_v;
-        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
-        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1,  (cameraProj_11 * gBufferScale_x));
-        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 2,  (cameraProj_22 * gBufferScale_y));
-        frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 3, -(cameraProj_22 * gBufferScale_y));
-    
-        float frustumPlanes_z_v;
-        frustumPlanes_z_v = insert(frustumPlanes_z_v, 0,  tileEndX - gBufferScale_x);
-        frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, -tileStartX + gBufferScale_x);
-        frustumPlanes_z_v = insert(frustumPlanes_z_v, 2,  tileEndY - gBufferScale_y);
-        frustumPlanes_z_v = insert(frustumPlanes_z_v, 3, -tileStartY + gBufferScale_y);
-
-        // Normalize
-        float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v + 
-                           frustumPlanes_z_v * frustumPlanes_z_v);
-            frustumPlanes_xy_v *= norm;
-            frustumPlanes_z_v *= norm;
-
-        // Save out for uniform use later
-        frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
-        frustumPlanes_z[programIndex] = frustumPlanes_z_v;
+    for (uniform int i = 0; i < 4; ++i) {
+        uniform float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] + 
+                                   frustumPlanes_z[i] * frustumPlanes_z[i]);
+        frustumPlanes_xy[i] *= norm;
+        frustumPlanes_z[i] *= norm;
     }
 
     uniform int32 tileNumLights = 0;
@@ -601,30 +585,20 @@ SplitTileMinMax(
     uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
     uniform float gBufferScale_y = 0.5f * (float)gBufferHeight;
         
-    // Parallize across frustum planes
-    // Only have 2 frustum split planes here so may not be worth it, but
-    // we'll do it for now for consistency
-    uniform float frustumPlanes_xy[programCount];
-    uniform float frustumPlanes_z[programCount];
-
-    // This one is totally constant over the whole screen... worth pulling it up at all?
-    float frustumPlanes_xy_v;
-    frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x));
-    frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1,  (cameraProj_22 * gBufferScale_y));
-    
-    float frustumPlanes_z_v;
-    frustumPlanes_z_v = insert(frustumPlanes_z_v, 0, tileMidX - gBufferScale_x);
-    frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, tileMidY - gBufferScale_y);
+    uniform float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x),
+                                           (cameraProj_22 * gBufferScale_y) };
+    uniform float frustumPlanes_z[2] = { tileMidX - gBufferScale_x,
+                                         tileMidY - gBufferScale_y };
 
     // Normalize
-    float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v + 
-                       frustumPlanes_z_v * frustumPlanes_z_v);
-    frustumPlanes_xy_v *= norm;
-    frustumPlanes_z_v *= norm;
-
-    // Save out for uniform use later
-    frustumPlanes_xy[programIndex] = frustumPlanes_xy_v;
-    frustumPlanes_z[programIndex] = frustumPlanes_z_v;
+    uniform float norm[2] = { rsqrt(frustumPlanes_xy[0] * frustumPlanes_xy[0] + 
+                                    frustumPlanes_z[0] * frustumPlanes_z[0]),
+                              rsqrt(frustumPlanes_xy[1] * frustumPlanes_xy[1] + 
+                                    frustumPlanes_z[1] * frustumPlanes_z[1]) };
+    frustumPlanes_xy[0] *= norm[0];
+    frustumPlanes_xy[1] *= norm[1];
+    frustumPlanes_z[0] *= norm[0];
+    frustumPlanes_z[1] *= norm[1];
 
     // Initialize
     uniform int32 subtileLightOffset[4];

From dac091552de03b7d786793486644b3d5ba126dae Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Tue, 31 Jan 2012 11:57:12 -0800
Subject: [PATCH 31/62] Fix errors in tests for scalar target.

Issue #167.
---
 tests/array-pointer-duality-1.ispc | 2 +-
 tests/atomics-3.ispc               | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/array-pointer-duality-1.ispc b/tests/array-pointer-duality-1.ispc
index b8ac18b3..1550c294 100644
--- a/tests/array-pointer-duality-1.ispc
+++ b/tests/array-pointer-duality-1.ispc
@@ -11,5 +11,5 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = (programCount == 1) ? 1 : 3;
+    RET[programIndex] = (programCount == 1) ? 2 : 3;
 }
diff --git a/tests/atomics-3.ispc b/tests/atomics-3.ispc
index ef3085ae..97c8c622 100644
--- a/tests/atomics-3.ispc
+++ b/tests/atomics-3.ispc
@@ -5,11 +5,11 @@ uniform int32 s = 0xff;
 
 export void f_f(uniform float RET[], uniform float aFOO[]) {
     float a = aFOO[programIndex]; 
-    int32 bits = 0xfffffff0;
+    int32 bits = 0xfff0;
     float b = atomic_xor_global(&s, bits);
     RET[programIndex] = s;
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 0xff;
+    RET[programIndex] = (programCount & 1) ? 0xff0f : 0xff;
 }

From 8d1631b714dc3debc38bdb40f4daaf82635932a1 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Tue, 31 Jan 2012 12:22:11 -0800
Subject: [PATCH 32/62] Constant fold in SelectExpr::Optimize().

Resolves issue #170.
---
 expr.cpp                       | 22 +++++++++++++++++++++-
 tests/const-fold-select-1.ispc | 10 ++++++++++
 tests/const-fold-select-2.ispc | 10 ++++++++++
 3 files changed, 41 insertions(+), 1 deletion(-)
 create mode 100644 tests/const-fold-select-1.ispc
 create mode 100644 tests/const-fold-select-2.ispc

diff --git a/expr.cpp b/expr.cpp
index 65907d94..1f810998 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -2811,7 +2811,27 @@ Expr *
 SelectExpr::Optimize() {
     if (test == NULL || expr1 == NULL || expr2 == NULL)
         return NULL;
-    return this;
+
+    ConstExpr *constTest = dynamic_cast<ConstExpr *>(test);
+    if (constTest == NULL)
+        return this;
+
+    // The test is a constant; see if we can resolve to one of the
+    // expressions..
+    bool bv[ISPC_MAX_NVEC];
+    int count = constTest->AsBool(bv);
+    if (count == 1)
+        // Uniform test value; return the corresponding expression
+        return (bv[0] == true) ? expr1 : expr2;
+    else {
+        // Varying test: see if all of the values are the same; if so, then
+        // return the corresponding expression
+        bool first = bv[0];
+        for (int i = 0; i < count; ++i)
+            if (bv[i] != first)
+                return this;
+        return (bv[0] == true) ? expr1 : expr2;
+    }
 }
 
 
diff --git a/tests/const-fold-select-1.ispc b/tests/const-fold-select-1.ispc
new file mode 100644
index 00000000..08dcb8a5
--- /dev/null
+++ b/tests/const-fold-select-1.ispc
@@ -0,0 +1,10 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    RET[programIndex] = (programIndex >= 0) ? 1 : 0;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1;
+}
diff --git a/tests/const-fold-select-2.ispc b/tests/const-fold-select-2.ispc
new file mode 100644
index 00000000..2db0472a
--- /dev/null
+++ b/tests/const-fold-select-2.ispc
@@ -0,0 +1,10 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    RET[programIndex] = (programCount < 10000) ? 1 : 0;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1;
+}

From 0432f9755593e4df5b973d128855b5e0cef2cdd1 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Tue, 31 Jan 2012 12:55:22 -0800
Subject: [PATCH 33/62] Fix build with LLVM 3.1 TOT

---
 ctx.cpp      |  8 ++++++++
 llvmutil.cpp |  6 ++++++
 opt.cpp      | 48 +++++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/ctx.cpp b/ctx.cpp
index 8ac64fe5..e9fd7203 100644
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -1279,7 +1279,11 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
 
 llvm::Value *
 FunctionEmitContext::GetStringPtr(const std::string &str) {
+#ifdef LLVM_3_1svn
+    llvm::Constant *lstr = llvm::ConstantDataArray::getString(*g->ctx, str);
+#else
     llvm::Constant *lstr = llvm::ConstantArray::get(*g->ctx, str);
+#endif
     llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::InternalLinkage;
     llvm::Value *lstrPtr = new llvm::GlobalVariable(*m->module, lstr->getType(),
                                                     true /*isConst*/, 
@@ -1329,7 +1333,11 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) {
 
 static llvm::Value *
 lGetStringAsValue(llvm::BasicBlock *bblock, const char *s) {
+#ifdef LLVM_3_1svn
+    llvm::Constant *sConstant = llvm::ConstantDataArray::getString(*g->ctx, s);
+#else
     llvm::Constant *sConstant = llvm::ConstantArray::get(*g->ctx, s);
+#endif
     llvm::Value *sPtr = new llvm::GlobalVariable(*m->module, sConstant->getType(), 
                                                  true /* const */,
                                                  llvm::GlobalValue::InternalLinkage,
diff --git a/llvmutil.cpp b/llvmutil.cpp
index 92abd392..e5c4785e 100644
--- a/llvmutil.cpp
+++ b/llvmutil.cpp
@@ -607,6 +607,12 @@ LLVMVectorValuesAllEqual(llvm::Value *v, int vectorLength,
     if (cv != NULL)
         return (cv->getSplatValue() != NULL);
 
+#ifdef LLVM_3_1svn
+    llvm::ConstantDataVector *cdv = llvm::dyn_cast<llvm::ConstantDataVector>(v);
+    if (cdv != NULL)
+        return (cdv->getSplatValue() != NULL);
+#endif
+
     llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(v);
     if (bop != NULL)
         return (LLVMVectorValuesAllEqual(bop->getOperand(0), vectorLength, 
diff --git a/opt.cpp b/opt.cpp
index 5e74bd5a..57443040 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -630,13 +630,17 @@ lGetMask(llvm::Value *factor) {
        "known and all bits on". */
     Assert(g->target.vectorWidth < 32);
 
+#ifdef LLVM_3_1svn
+    llvm::ConstantDataVector *cv = llvm::dyn_cast<llvm::ConstantDataVector>(factor);
+#else
     llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(factor);
+#endif
     if (cv) {
         int mask = 0;
         llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
 #ifdef LLVM_3_1svn
-        for (int i = 0; i < (int)cv->getNumOperands(); ++i)
-            elements.push_back(cv->getOperand(i));
+        for (int i = 0; i < (int)cv->getNumElements(); ++i)
+            elements.push_back(cv->getElementAsConstant(i));
 #else
         cv->getVectorElements(elements);
 #endif
@@ -1133,8 +1137,13 @@ lGetBasePtrAndOffsets(llvm::Value *ptrs, llvm::Value **offsets,
         // ConstantVectors..
         llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
 #ifdef LLVM_3_1svn
-        for (int i = 0; i < (int)cv->getNumOperands(); ++i)
-            elements.push_back(cv->getOperand(i));
+        for (int i = 0; i < (int)cv->getNumOperands(); ++i) {
+            llvm::Constant *c = 
+                llvm::dyn_cast<llvm::Constant>(cv->getOperand(i));
+            if (c == NULL)
+                return NULL;
+            elements.push_back(c);
+        }
 #else
         cv->getVectorElements(elements);
 #endif
@@ -1247,6 +1256,9 @@ lExtractConstantOffset(llvm::Value *vec, llvm::Value **constOffset,
                        llvm::Value **variableOffset, 
                        llvm::Instruction *insertBefore) {
     if (llvm::isa<llvm::ConstantVector>(vec) ||
+#ifdef LLVM_3_1svn
+        llvm::isa<llvm::ConstantDataVector>(vec) ||
+#endif
         llvm::isa<llvm::ConstantAggregateZero>(vec)) {
         *constOffset = vec;
         *variableOffset = NULL;
@@ -1365,7 +1377,12 @@ lExtractConstantOffset(llvm::Value *vec, llvm::Value **constOffset,
    in *splat, if so). */
 static bool
 lIs248Splat(llvm::Value *v, int *splat) {
+#ifdef LLVM_3_1svn
+    llvm::ConstantDataVector *cvec = 
+        llvm::dyn_cast<llvm::ConstantDataVector>(v);
+#else
     llvm::ConstantVector *cvec = llvm::dyn_cast<llvm::ConstantVector>(v);
+#endif
     if (cvec == NULL)
         return false;
 
@@ -1472,6 +1489,9 @@ lExtractUniforms(llvm::Value **vec, llvm::Instruction *insertBefore) {
     fprintf(stderr, "\n");
 
     if (llvm::isa<llvm::ConstantVector>(*vec) ||
+#ifdef LLVM_3_1svn
+        llvm::isa<llvm::ConstantDataVector>(*vec) ||
+#endif
         llvm::isa<llvm::ConstantAggregateZero>(*vec))
         return NULL;
 
@@ -2153,13 +2173,19 @@ char GSToLoadStorePass::ID = 0;
     elements.
  */
 static bool
-lVectorIsLinearConstantInts(llvm::ConstantVector *cv, int vectorLength, 
+lVectorIsLinearConstantInts(
+#ifdef LLVM_3_1svn
+                            llvm::ConstantDataVector *cv, 
+#else
+                            llvm::ConstantVector *cv, 
+#endif
+                            int vectorLength, 
                             int stride) {
     // Flatten the vector out into the elements array
     llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
 #ifdef LLVM_3_1svn
-    for (int i = 0; i < (int)cv->getNumOperands(); ++i)
-        elements.push_back(cv->getOperand(i));
+    for (int i = 0; i < (int)cv->getNumElements(); ++i)
+        elements.push_back(cv->getElementAsConstant(i));
 #else
     cv->getVectorElements(elements);
 #endif
@@ -2201,7 +2227,11 @@ lCheckMulForLinear(llvm::Value *op0, llvm::Value *op1, int vectorLength,
                    int stride, std::vector<llvm::PHINode *> &seenPhis) {
     // Is the first operand a constant integer value splatted across all of
     // the lanes?
+#ifdef LLVM_3_1svn
+    llvm::ConstantDataVector *cv = llvm::dyn_cast<llvm::ConstantDataVector>(op0);
+#else
     llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(op0);
+#endif
     if (cv == NULL)
         return false;
 
@@ -2237,7 +2267,11 @@ lVectorIsLinear(llvm::Value *v, int vectorLength, int stride,
                 std::vector<llvm::PHINode *> &seenPhis) {
     // First try the easy case: if the values are all just constant
     // integers and have the expected stride between them, then we're done.
+#ifdef LLVM_3_1svn
+    llvm::ConstantDataVector *cv = llvm::dyn_cast<llvm::ConstantDataVector>(v);
+#else
     llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(v);
+#endif
     if (cv != NULL)
         return lVectorIsLinearConstantInts(cv, vectorLength, stride);
 

From fdb4eaf437872566371b70cc74a2334ad5620736 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Wed, 1 Feb 2012 08:17:25 -0800
Subject: [PATCH 34/62] Fix bug in &&/|| short-circuiting.

Use the full mask, not the internal mask, when checking "any lanes running"
before evaluating expressions.

Added some more tests to try to cover this case.
---
 expr.cpp                    | 17 +++++++++--------
 tests/short-circuit-12.ispc | 25 +++++++++++++++++++++++++
 tests/short-circuit-13.ispc | 25 +++++++++++++++++++++++++
 tests/short-circuit-14.ispc | 29 +++++++++++++++++++++++++++++
 tests/short-circuit-15.ispc | 29 +++++++++++++++++++++++++++++
 5 files changed, 117 insertions(+), 8 deletions(-)
 create mode 100644 tests/short-circuit-12.ispc
 create mode 100644 tests/short-circuit-13.ispc
 create mode 100644 tests/short-circuit-14.ispc
 create mode 100644 tests/short-circuit-15.ispc

diff --git a/expr.cpp b/expr.cpp
index 1f810998..14415beb 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -1537,6 +1537,7 @@ lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1,
         // Otherwise, the first operand is varying...  Save the current
         // value of the mask so that we can restore it at the end.
         llvm::Value *oldMask = ctx->GetInternalMask();
+        llvm::Value *oldFullMask = ctx->GetFullMask();
 
         // Convert the second operand to be varying as well, so that we can
         // perform logical vector ops with its value.
@@ -1551,11 +1552,11 @@ lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1,
             // lanes--i.e. if (value0 & mask) == mask.  If so, we don't
             // need to evaluate the second operand of the expression.
             llvm::Value *value0AndMask = 
-                ctx->BinaryOperator(llvm::Instruction::And, value0, oldMask, 
-                                    "op&mask");
+                ctx->BinaryOperator(llvm::Instruction::And, value0, 
+                                    oldFullMask, "op&mask");
             llvm::Value *equalsMask =
                 ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
-                             value0AndMask, oldMask, "value0&mask==mask");
+                             value0AndMask, oldFullMask, "value0&mask==mask");
             equalsMask = ctx->I1VecToBoolVec(equalsMask);
             llvm::Value *allMatch = ctx->All(equalsMask);
             ctx->BranchInst(bbSkipEvalValue1, bbEvalValue1, allMatch);
@@ -1602,11 +1603,11 @@ lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1,
             // if (mask & ~value0) == mask.
             llvm::Value *notValue0 = ctx->NotOperator(value0, "not_value0");
             llvm::Value *notValue0AndMask = 
-                ctx->BinaryOperator(llvm::Instruction::And, notValue0, oldMask, 
-                                    "not_value0&mask");
+                ctx->BinaryOperator(llvm::Instruction::And, notValue0, 
+                                    oldFullMask, "not_value0&mask");
             llvm::Value *equalsMask =
                 ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
-                             notValue0AndMask, oldMask, "not_value0&mask==mask");
+                             notValue0AndMask, oldFullMask, "not_value0&mask==mask");
             equalsMask = ctx->I1VecToBoolVec(equalsMask);
             llvm::Value *allMatch = ctx->All(equalsMask);
             ctx->BranchInst(bbSkipEvalValue1, bbEvalValue1, allMatch);
@@ -1634,8 +1635,8 @@ lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1,
             // masking off the valid lanes before we AND them together:
             // result = (value0 & old_mask) & (value1 & current_mask)
             llvm::Value *value0AndMask = 
-                ctx->BinaryOperator(llvm::Instruction::And, value0, oldMask,
-                                    "op&mask");
+                ctx->BinaryOperator(llvm::Instruction::And, value0, 
+                                    oldFullMask, "op&mask");
             llvm::Value *value1AndMask =
                 ctx->BinaryOperator(llvm::Instruction::And, value1,
                                     ctx->GetInternalMask(), "value1&mask");
diff --git a/tests/short-circuit-12.ispc b/tests/short-circuit-12.ispc
new file mode 100644
index 00000000..fb0a94a2
--- /dev/null
+++ b/tests/short-circuit-12.ispc
@@ -0,0 +1,25 @@
+
+export uniform int width() { return programCount; }
+
+uniform int * uniform ptr;
+
+float foo(uniform float a[]) {
+    int index = (programIndex & 1) * 10000;
+    if (a[programIndex] < 10000 && a[index] == 1)
+        return 1;
+    else
+        return 1234;
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    float a0 = aFOO[0], a1 = aFOO[1];
+    if ((programIndex & 1) == 0)
+        RET[programIndex] = foo(aFOO);
+    else
+        RET[programIndex] = 2;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = (programIndex & 1) ? 2 : 1;
+}
diff --git a/tests/short-circuit-13.ispc b/tests/short-circuit-13.ispc
new file mode 100644
index 00000000..fb0a94a2
--- /dev/null
+++ b/tests/short-circuit-13.ispc
@@ -0,0 +1,25 @@
+
+export uniform int width() { return programCount; }
+
+uniform int * uniform ptr;
+
+float foo(uniform float a[]) {
+    int index = (programIndex & 1) * 10000;
+    if (a[programIndex] < 10000 && a[index] == 1)
+        return 1;
+    else
+        return 1234;
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    float a0 = aFOO[0], a1 = aFOO[1];
+    if ((programIndex & 1) == 0)
+        RET[programIndex] = foo(aFOO);
+    else
+        RET[programIndex] = 2;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = (programIndex & 1) ? 2 : 1;
+}
diff --git a/tests/short-circuit-14.ispc b/tests/short-circuit-14.ispc
new file mode 100644
index 00000000..846a8ec3
--- /dev/null
+++ b/tests/short-circuit-14.ispc
@@ -0,0 +1,29 @@
+
+export uniform int width() { return programCount; }
+
+ int * uniform ptr;
+
+int crash() {
+    return *ptr;
+}
+
+float foo(uniform float a[]) {
+    int index = (programIndex & 1);
+    if (a[index] == 2 && crash())
+        return 1234;
+    else
+        return 1;
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    float a0 = aFOO[0], a1 = aFOO[1];
+    if ((programIndex & 1) == 0)
+        RET[programIndex] = foo(aFOO);
+    else
+        RET[programIndex] = 2;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = (programIndex & 1) ? 2 : 1;
+}
diff --git a/tests/short-circuit-15.ispc b/tests/short-circuit-15.ispc
new file mode 100644
index 00000000..4e6f7e32
--- /dev/null
+++ b/tests/short-circuit-15.ispc
@@ -0,0 +1,29 @@
+
+export uniform int width() { return programCount; }
+
+uniform int * uniform ptr;
+
+uniform int crash() {
+    return *ptr;
+}
+
+float foo(uniform float a[]) {
+    int index = (programIndex & 1);
+    if (a[index] == 2 && crash())
+        return 1234;
+    else
+        return 1;
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    float a0 = aFOO[0], a1 = aFOO[1];
+    if ((programIndex & 1) == 0)
+        RET[programIndex] = foo(aFOO);
+    else
+        RET[programIndex] = 2;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = (programIndex & 1) ? 2 : 1;
+}

From 89cb809922f5e8598c7f88d09871e7e97fad8dbe Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Wed, 1 Feb 2012 11:03:58 -0800
Subject: [PATCH 35/62] Short-circuit evaluation of ? : operator for varying
 tests.

? : now short-circuits evaluation of the expressions following
the boolean test for varying test types.  (It already did this
for uniform tests.)

Issue #169.
---
 docs/ispc.rst                     | 12 +++--
 expr.cpp                          | 84 ++++++++++++++++++++++++++++---
 tests/foreach-double-1.ispc       | 30 +++++++++++
 tests/short-circuit-select-1.ispc | 21 ++++++++
 tests/short-circuit-select-2.ispc | 21 ++++++++
 tests/short-circuit-select-3.ispc | 20 ++++++++
 6 files changed, 175 insertions(+), 13 deletions(-)
 create mode 100644 tests/foreach-double-1.ispc
 create mode 100644 tests/short-circuit-select-1.ispc
 create mode 100644 tests/short-circuit-select-2.ispc
 create mode 100644 tests/short-circuit-select-3.ispc

diff --git a/docs/ispc.rst b/docs/ispc.rst
index ee838ee0..345d6119 100644
--- a/docs/ispc.rst
+++ b/docs/ispc.rst
@@ -1151,6 +1151,7 @@ in C:
 * Structs and arrays
 * Support for recursive function calls
 * Support for separate compilation of source files
+* "Short-circuit" evaluation of ``||``, ``&&`` and ``? :`` operators
 * The preprocessor
 
 ``ispc`` adds a number of features from C++ and C99 to this base:
@@ -1968,11 +1969,12 @@ operator also work as expected.
     (*fp).a = 0;
     fp->b = 1;
   
-As in C and C++, evaluation of the ``||`` and ``&&`` logical operators is
-"short-circuited"; the right hand side won't be evaluated if the value from
-the left-hand side determines the logical operator's value.  For example,
-in the following code, ``array[index]`` won't be evaluated for values of
-``index`` that are greater than or equal to ``NUM_ITEMS``.
+As in C and C++, evaluation of the ``||`` and ``&&`` logical operators as
+well as the selection operator ``? :`` is "short-circuited"; the right hand
+side won't be evaluated if the value from the left-hand side determines the
+logical operator's value.  For example, in the following code,
+``array[index]`` won't be evaluated for values of ``index`` that are
+greater than or equal to ``NUM_ITEMS``.
 
 ::
 
diff --git a/expr.cpp b/expr.cpp
index 14415beb..b1194603 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -2686,6 +2686,34 @@ lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test,
 }
 
 
+static void
+lEmitSelectExprCode(FunctionEmitContext *ctx, llvm::Value *testVal, 
+                    llvm::Value *oldMask, llvm::Value *fullMask,
+                    Expr *expr, llvm::Value *exprPtr) {
+    llvm::BasicBlock *bbEval = ctx->CreateBasicBlock("select_eval_expr");
+    llvm::BasicBlock *bbDone = ctx->CreateBasicBlock("select_done");
+
+    // Check to see if the test was true for any of the currently executing
+    // program instances.
+    llvm::Value *testAndFullMask = 
+        ctx->BinaryOperator(llvm::Instruction::And, testVal, fullMask, 
+                            "test&mask");
+    llvm::Value *anyOn = ctx->Any(testAndFullMask);
+    ctx->BranchInst(bbEval, bbDone, anyOn);
+
+    ctx->SetCurrentBasicBlock(bbEval);
+    llvm::Value *testAndMask = 
+        ctx->BinaryOperator(llvm::Instruction::And, testVal, oldMask, 
+                            "test&mask");
+    ctx->SetInternalMask(testAndMask);
+    llvm::Value *exprVal = expr->GetValue(ctx);
+    ctx->StoreInst(exprVal, exprPtr);
+    ctx->BranchInst(bbDone);
+
+    ctx->SetCurrentBasicBlock(bbDone);
+}
+
+
 llvm::Value *
 SelectExpr::GetValue(FunctionEmitContext *ctx) const {
     if (!expr1 || !expr2 || !test)
@@ -2733,18 +2761,58 @@ SelectExpr::GetValue(FunctionEmitContext *ctx) const {
         return ret;
     }
     else if (dynamic_cast<const VectorType *>(testType) == NULL) {
-        // if the test is a varying bool type, then evaluate both of the
-        // value expressions with the mask set appropriately and then do an
-        // element-wise select to get the result
+        // the test is a varying bool type
         llvm::Value *testVal = test->GetValue(ctx);
         Assert(testVal->getType() == LLVMTypes::MaskType);
         llvm::Value *oldMask = ctx->GetInternalMask();
-        ctx->SetInternalMaskAnd(oldMask, testVal);
-        llvm::Value *expr1Val = expr1->GetValue(ctx);
-        ctx->SetInternalMaskAndNot(oldMask, testVal);
-        llvm::Value *expr2Val = expr2->GetValue(ctx);
-        ctx->SetInternalMask(oldMask);
+        llvm::Value *fullMask = ctx->GetFullMask();
 
+        // We don't want to incur the overhead for short-circuit evaluation
+        // for expressions that are both computationally simple and safe to
+        // run with an "all off" mask.
+        bool shortCircuit1 =
+            (::EstimateCost(expr1) > PREDICATE_SAFE_IF_STATEMENT_COST ||
+             SafeToRunWithMaskAllOff(expr1) == false);
+        bool shortCircuit2 =
+            (::EstimateCost(expr2) > PREDICATE_SAFE_IF_STATEMENT_COST ||
+             SafeToRunWithMaskAllOff(expr2) == false);
+
+        Debug(expr1->pos, "%sshort circuiting evaluation for select expr",
+              shortCircuit1 ? "" : "Not ");
+        Debug(expr2->pos, "%sshort circuiting evaluation for select expr",
+              shortCircuit2 ? "" : "Not ");
+
+        // Temporary storage to store the values computed for each
+        // expression, if any.  (These stay as uninitialized memory if we
+        // short circuit around the corresponding expression.)
+        LLVM_TYPE_CONST llvm::Type *exprType = 
+            expr1->GetType()->LLVMType(g->ctx);
+        llvm::Value *expr1Ptr = ctx->AllocaInst(exprType);
+        llvm::Value *expr2Ptr = ctx->AllocaInst(exprType);
+
+        if (shortCircuit1)
+            lEmitSelectExprCode(ctx, testVal, oldMask, fullMask, expr1, 
+                                expr1Ptr);
+        else {
+            ctx->SetInternalMaskAnd(oldMask, testVal);
+            llvm::Value *expr1Val = expr1->GetValue(ctx);
+            ctx->StoreInst(expr1Val, expr1Ptr);
+        }
+
+        if (shortCircuit2) {
+            llvm::Value *notTest = ctx->NotOperator(testVal);
+            lEmitSelectExprCode(ctx, notTest, oldMask, fullMask, expr2, 
+                                expr2Ptr);
+        }
+        else {
+            ctx->SetInternalMaskAndNot(oldMask, testVal);
+            llvm::Value *expr2Val = expr2->GetValue(ctx);
+            ctx->StoreInst(expr2Val, expr2Ptr);
+        }
+
+        ctx->SetInternalMask(oldMask);
+        llvm::Value *expr1Val = ctx->LoadInst(expr1Ptr);
+        llvm::Value *expr2Val = ctx->LoadInst(expr2Ptr);
         return lEmitVaryingSelect(ctx, testVal, expr1Val, expr2Val, type);
     }
     else {
diff --git a/tests/foreach-double-1.ispc b/tests/foreach-double-1.ispc
new file mode 100644
index 00000000..16d7cfd0
--- /dev/null
+++ b/tests/foreach-double-1.ispc
@@ -0,0 +1,30 @@
+
+export uniform int width() { return programCount; }
+
+uniform double one = 1;
+
+void copy(uniform double dst[], uniform double src[], uniform int count) {
+    foreach (i = 0 ... count)
+        dst[i] = one * src[i];
+}
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int count = 200 + aFOO[1];
+    uniform double * uniform src = uniform new uniform double[count];
+    for (uniform int i = 0; i < count; ++i)
+        src[i] = i;
+
+    uniform double * uniform dst = uniform new uniform double[count];
+    copy(dst, src, count);
+
+    uniform int errors = 0;
+    for (uniform int i = 0; i < count; ++i)
+        if (dst[i] != src[i])
+            ++errors;
+
+    RET[programIndex] = errors; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}
diff --git a/tests/short-circuit-select-1.ispc b/tests/short-circuit-select-1.ispc
new file mode 100644
index 00000000..2189710a
--- /dev/null
+++ b/tests/short-circuit-select-1.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+uniform int * uniform ptr;
+
+bool crashEven() {
+    return (programIndex & 1) ? true : (*ptr > 0);
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    float a0 = aFOO[0], a1 = aFOO[1];
+    if (((programIndex & 1) == 0) || crashEven())
+        RET[programIndex] = 1;
+    else
+        RET[programIndex] = 0;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1;
+}
diff --git a/tests/short-circuit-select-2.ispc b/tests/short-circuit-select-2.ispc
new file mode 100644
index 00000000..c152af71
--- /dev/null
+++ b/tests/short-circuit-select-2.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+uniform int * uniform ptr;
+
+bool crashEven() {
+    return (programIndex & 1) ? true : (*ptr > 0);
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    float a0 = aFOO[0], a1 = aFOO[1];
+    if (((programIndex & 1) == 1) && crashEven())
+        RET[programIndex] = 1;
+    else
+        RET[programIndex] = 2;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = (programIndex & 1) ? 1 : 2;
+}
diff --git a/tests/short-circuit-select-3.ispc b/tests/short-circuit-select-3.ispc
new file mode 100644
index 00000000..4b503fc8
--- /dev/null
+++ b/tests/short-circuit-select-3.ispc
@@ -0,0 +1,20 @@
+
+export uniform int width() { return programCount; }
+
+float crashEven(uniform float a[]) {
+    int offset = 0;
+    return (programIndex & 1) ? a[offset] : a[offset+1000000];
+}
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    float a0 = aFOO[0], a1 = aFOO[1];
+    if (((programIndex & 1) == 1) && (crashEven(aFOO) == 1))
+        RET[programIndex] = 1;
+    else
+        RET[programIndex] = 2;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = (programIndex & 1) ? 1 : 2;
+}

From 83c8650b369d203f77373cf72c1681fd48d2a0e4 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Fri, 3 Feb 2012 13:15:21 -0800
Subject: [PATCH 36/62] Add support for "local" atomics.

Also updated aobench example to use them, which in turn allows using
foreach() and thence a much cleaner implementation.

Issue #58.
---
 docs/ispc.rst                         | 188 +++++---
 examples/aobench/ao.ispc              | 120 ++---
 stdlib.ispc                           | 605 +++++++++++++++++---------
 tests/local-atomics-1.ispc            |  15 +
 tests/local-atomics-10.ispc           |  17 +
 tests/local-atomics-11.ispc           |  20 +
 tests/local-atomics-12.ispc           |  20 +
 tests/local-atomics-13.ispc           |  16 +
 tests/local-atomics-14.ispc           |  20 +
 tests/local-atomics-2.ispc            |  15 +
 tests/local-atomics-3.ispc            |  15 +
 tests/local-atomics-4.ispc            |  14 +
 tests/local-atomics-5.ispc            |  14 +
 tests/local-atomics-6.ispc            |  14 +
 tests/local-atomics-7.ispc            |  14 +
 tests/local-atomics-8.ispc            |  16 +
 tests/local-atomics-9.ispc            |  17 +
 tests/local-atomics-swap.ispc         |  17 +
 tests/local-atomics-uniform-1.ispc    |  14 +
 tests/local-atomics-uniform-2.ispc    |  14 +
 tests/local-atomics-uniform-3.ispc    |  14 +
 tests/local-atomics-uniform-4.ispc    |  14 +
 tests/local-atomics-uniform-5.ispc    |  14 +
 tests/local-atomics-uniform-6.ispc    |  14 +
 tests/local-atomics-uniform-7.ispc    |  14 +
 tests/local-atomics-uniform-8.ispc    |  14 +
 tests/local-atomics-uniform-9.ispc    |  14 +
 tests/local-atomics-varyingptr-1.ispc |  18 +
 tests/local-atomics-varyingptr-2.ispc |  16 +
 tests/local-atomics-varyingptr-3.ispc |  18 +
 tests/local-atomics-varyingptr-4.ispc |  15 +
 31 files changed, 983 insertions(+), 367 deletions(-)
 create mode 100644 tests/local-atomics-1.ispc
 create mode 100644 tests/local-atomics-10.ispc
 create mode 100644 tests/local-atomics-11.ispc
 create mode 100644 tests/local-atomics-12.ispc
 create mode 100644 tests/local-atomics-13.ispc
 create mode 100644 tests/local-atomics-14.ispc
 create mode 100644 tests/local-atomics-2.ispc
 create mode 100644 tests/local-atomics-3.ispc
 create mode 100644 tests/local-atomics-4.ispc
 create mode 100644 tests/local-atomics-5.ispc
 create mode 100644 tests/local-atomics-6.ispc
 create mode 100644 tests/local-atomics-7.ispc
 create mode 100644 tests/local-atomics-8.ispc
 create mode 100644 tests/local-atomics-9.ispc
 create mode 100644 tests/local-atomics-swap.ispc
 create mode 100644 tests/local-atomics-uniform-1.ispc
 create mode 100644 tests/local-atomics-uniform-2.ispc
 create mode 100644 tests/local-atomics-uniform-3.ispc
 create mode 100644 tests/local-atomics-uniform-4.ispc
 create mode 100644 tests/local-atomics-uniform-5.ispc
 create mode 100644 tests/local-atomics-uniform-6.ispc
 create mode 100644 tests/local-atomics-uniform-7.ispc
 create mode 100644 tests/local-atomics-uniform-8.ispc
 create mode 100644 tests/local-atomics-uniform-9.ispc
 create mode 100644 tests/local-atomics-varyingptr-1.ispc
 create mode 100644 tests/local-atomics-varyingptr-2.ispc
 create mode 100644 tests/local-atomics-varyingptr-3.ispc
 create mode 100644 tests/local-atomics-varyingptr-4.ispc

diff --git a/docs/ispc.rst b/docs/ispc.rst
index 345d6119..aa15158d 100644
--- a/docs/ispc.rst
+++ b/docs/ispc.rst
@@ -3389,24 +3389,53 @@ Systems Programming Support
 Atomic Operations and Memory Fences
 -----------------------------------
 
-The usual range of atomic memory operations are provided in ``ispc``,
-including variants to handle both uniform and varying types.  As a first
-example, consider on variant of the 32-bit integer atomic add routine:
+The standard range of atomic memory operations is provided by the ``ispc``
+standard library, including variants to handle both uniform and varying
+types as well as "local" and "global" atomics.
+
+Local atomics provide atomic behavior across the program instances in a
+gang, but not across multiple gangs or memory operations in different
+hardware threads.  To see why they are needed, consider a histogram
+calculation where each program instance in the gang computes which bucket a
+value lies in and then increments a corresponding counter.  If the code is
+written like this:
 
 ::
 
-  int32 atomic_add_global(uniform int32 * uniform ptr, int32 delta)
+    uniform int count[N_BUCKETS] = ...;
+    float value = ...;
+    int bucket = clamp(value / N_BUCKETS, 0, N_BUCKETS - 1);
+    ++count[bucket];  // ERROR: undefined behavior if collisions
 
-The semantics are the expected ones for an atomic add function: the pointer
-points to a single location in memory (the same one for all program
-instances), and for each executing program instance, the value stored in
-the location that ``ptr`` points to has that program instance's value
-"delta" added to it atomically, and the old value at that location is
-returned from the function.  (Thus, if multiple processors simultaneously
-issue atomic adds to the same memory location, the adds will be serialized
-by the hardware so that the correct result is computed in the end.
-Furthermore, the atomic adds are serialized across the running program
-instances.)
+then the program's behavior is undefined: whenever multiple program
+instances have values that map to the same value of ``bucket``, then the
+effect of the increment is undefined.  (See the discussion in the `Data
+Races Within a Gang`_ section; in the case here, there isn't a sequence
+point between one program instance updating ``count[bucket]`` and the other
+program instance reading its value.)
+
+The ``atomic_add_local()`` function can be used in this case; as a local
+atomic it is atomic across the gang of program instances, such that the
+expected result is computed.
+
+::
+
+    ...
+    int bucket = clamp(value / N_BUCKETS, 0, N_BUCKETS - 1);
+    atomic_add_local(&count[bucket], 1);
+
+It uses this variant of the 32-bit integer atomic add routine:
+
+::
+
+  int32 atomic_add_local(uniform int32 * uniform ptr, int32 delta)
+
+The semantics of this routine are typical for an atomic add function: the
+pointer here points to a single location in memory (the same one for all
+program instances), and for each executing program instance, the value
+stored in the location that ``ptr`` points to has that program instance's
+value "delta" added to it atomically, and the old value at that location is
+returned from the function.
 
 One thing to note is that that the type of the value being added to a
 ``uniform`` integer, while the increment amount and the return value are
@@ -3417,45 +3446,76 @@ atomics for the running program instances may be issued in arbitrary order;
 it's not guaranteed that they will be issued in ``programIndex`` order, for
 example.
 
-Here are the declarations of the ``int32`` variants of these functions.
-There are also ``int64`` equivalents as well as variants that take
-``unsigned`` ``int32`` and ``int64`` values.  (The ``atomic_swap_global()``
-function can be used with ``float`` and ``double`` types as well.)
+Global atomics are more powerful than local atomics; they are atomic across
+the program instances in the gang as well as across different gangs and
+different hardware threads.  For example, for the global variant of the
+atomic used above,
 
 ::
 
-  int32 atomic_add_global(uniform int32 * uniform ptr, int32 value)
-  int32 atomic_subtract_global(uniform int32 * uniform ptr, int32 value)
-  int32 atomic_min_global(uniform int32 * uniform ptr, int32 value)
-  int32 atomic_max_global(uniform int32 * uniform ptr, int32 value)
-  int32 atomic_and_global(uniform int32 * uniform ptr, int32 value)
-  int32 atomic_or_global(uniform int32 * uniform ptr, int32 value)
-  int32 atomic_xor_global(uniform int32 * uniform ptr, int32 value)
-  int32 atomic_swap_global(uniform int32 * uniform ptr, int32 value)
+  int32 atomic_add_global(uniform int32 * uniform ptr, int32 delta)
 
-There are also variants of these functions that take ``uniform`` values for
-the operand and return a ``uniform`` result.  These correspond to a single
+if multiple processors simultaneously issue atomic adds to the same memory
+location, the adds will be serialized by the hardware so that the correct
+result is computed in the end.
+
+Here are the declarations of the ``int32`` variants of these functions.
+There are also ``int64`` equivalents as well as variants that take
+``unsigned`` ``int32`` and ``int64`` values.
+
+::
+
+  int32 atomic_add_{local,global}(uniform int32 * uniform ptr, int32 value)
+  int32 atomic_subtract_{local,global}(uniform int32 * uniform ptr, int32 value)
+  int32 atomic_min_{local,global}(uniform int32 * uniform ptr, int32 value)
+  int32 atomic_max_{local,global}(uniform int32 * uniform ptr, int32 value)
+  int32 atomic_and_{local,global}(uniform int32 * uniform ptr, int32 value)
+  int32 atomic_or_{local,global}(uniform int32 * uniform ptr, int32 value)
+  int32 atomic_xor_{local,global}(uniform int32 * uniform ptr, int32 value)
+  int32 atomic_swap_{local,global}(uniform int32 * uniform ptr, int32 value)
+
+Support for ``float`` and ``double`` types is also available.  For local
+atomics, all but the logical operations are available.  (There are
+corresponding ``double`` variants of these, not listed here.)
+
+::
+
+  float atomic_add_local(uniform float * uniform ptr, float value)
+  float atomic_subtract_local(uniform float * uniform ptr, float value)
+  float atomic_min_local(uniform float * uniform ptr, float value)
+  float atomic_max_local(uniform float * uniform ptr, float value)
+  float atomic_swap_local(uniform float * uniform ptr, float value)
+
+For global atomics, only atomic swap is available for these types:
+
+::
+
+  float atomic_swap_global(uniform float * uniform ptr, float value)
+  double atomic_swap_global(uniform double * uniform ptr, double value)
+
+There are also variants of the atomic functions that take ``uniform`` values
+for the operand and return a ``uniform`` result.  These correspond to a single
 atomic operation being performed for the entire gang of program instances,
 rather than one per program instance.
 
 ::
 
-  uniform int32 atomic_add_global(uniform int32 * uniform ptr,
-                                  uniform int32 value)
-  uniform int32 atomic_subtract_global(uniform int32 * uniform ptr,
-                                       uniform int32 value)
-  uniform int32 atomic_min_global(uniform int32 * uniform ptr,
-                                  uniform int32 value)
-  uniform int32 atomic_max_global(uniform int32 * uniform ptr,
-                                  uniform int32 value)
-  uniform int32 atomic_and_global(uniform int32 * uniform ptr,
-                                  uniform int32 value)
-  uniform int32 atomic_or_global(uniform int32 * uniform ptr,
-                                  uniform int32 value)
-  uniform int32 atomic_xor_global(uniform int32 * uniform ptr,
-                                  uniform int32 value)
-  uniform int32 atomic_swap_global(uniform int32 * uniform ptr,
-                                   uniform int32 newval)
+  uniform int32 atomic_add_{local,global}(uniform int32 * uniform ptr,
+                                          uniform int32 value)
+  uniform int32 atomic_subtract_{local,global}(uniform int32 * uniform ptr,
+                                               uniform int32 value)
+  uniform int32 atomic_min_{local,global}(uniform int32 * uniform ptr,
+                                          uniform int32 value)
+  uniform int32 atomic_max_{local,global}(uniform int32 * uniform ptr,
+                                          uniform int32 value)
+  uniform int32 atomic_and_{local,global}(uniform int32 * uniform ptr,
+                                          uniform int32 value)
+  uniform int32 atomic_or_{local,global}(uniform int32 * uniform ptr,
+                                          uniform int32 value)
+  uniform int32 atomic_xor_{local,global}(uniform int32 * uniform ptr,
+                                          uniform int32 value)
+  uniform int32 atomic_swap_{local,global}(uniform int32 * uniform ptr,
+                                           uniform int32 newval)
 
 Be careful that you use the atomic function that you mean to; consider the
 following code:
@@ -3479,8 +3539,7 @@ will cause the desired atomic add function to be called.
 ::
 
     extern uniform int32 counter;
-    int32 one = 1;
-    int32 myCounter = atomic_add_global(&counter, one);
+    int32 myCounter = atomic_add_global(&counter, (varying int32)1);
 
 There is a third variant of each of these atomic functions that takes a
 ``varying`` pointer; this allows each program instance to issue an atomic
@@ -3490,30 +3549,27 @@ the same location in memory!)
 
 ::
 
-  int32 atomic_add_global(uniform int32 * varying ptr, int32 value)
-  int32 atomic_subtract_global(uniform int32 * varying ptr, int32 value)
-  int32 atomic_min_global(uniform int32 * varying ptr, int32 value)
-  int32 atomic_max_global(uniform int32 * varying ptr, int32 value)
-  int32 atomic_and_global(uniform int32 * varying ptr, int32 value)
-  int32 atomic_or_global(uniform int32 * varying ptr, int32 value)
-  int32 atomic_xor_global(uniform int32 * varying ptr, int32 value)
-  int32 atomic_swap_global(uniform int32 * varying ptr, int32 value)
+  int32 atomic_add_{local,global}(uniform int32 * varying ptr, int32 value)
+  int32 atomic_subtract_{local,global}(uniform int32 * varying ptr, int32 value)
+  int32 atomic_min_{local,global}(uniform int32 * varying ptr, int32 value)
+  int32 atomic_max_{local,global}(uniform int32 * varying ptr, int32 value)
+  int32 atomic_and_{local,global}(uniform int32 * varying ptr, int32 value)
+  int32 atomic_or_{local,global}(uniform int32 * varying ptr, int32 value)
+  int32 atomic_xor_{local,global}(uniform int32 * varying ptr, int32 value)
+  int32 atomic_swap_{local,global}(uniform int32 * varying ptr, int32 value)
 
-There are also atomic swap and "compare and exchange" functions.
-Compare and exchange atomically compares the value in "val" to
-"compare"--if they match, it assigns "newval" to "val".  In either case,
-the old value of "val" is returned.  (As with the other atomic operations,
-there are also ``unsigned`` and 64-bit variants of this function.
-Furthermore, there are ``float`` and ``double`` variants as well.)
+There are also atomic "compare and exchange" functions.  Compare and
+exchange atomically compares the value that ``ptr`` points to with
+``compare``; if they match, it stores ``newval`` to that location.  In
+either case, the old value at that location is returned.  (As with the
+other atomic operations, there are also ``unsigned`` and 64-bit variants of
+this function.  Furthermore, there are ``float`` and ``double`` variants.)
 
 ::
 
-  int32 atomic_swap_global(uniform int32 * uniform ptr, int32 newvalue)
-  uniform int32 atomic_swap_global(uniform int32 * uniform ptr,
-                                   uniform int32 newvalue)
-  int32 atomic_compare_exchange_global(uniform int32 * uniform ptr,
-                                       int32 compare, int32 newval)
-  uniform int32 atomic_compare_exchange_global(uniform int32 * uniform ptr,
+  int32 atomic_compare_exchange_{local,global}(uniform int32 * uniform ptr,
+                                               int32 compare, int32 newval)
+  uniform int32 atomic_compare_exchange_{local,global}(uniform int32 * uniform ptr,
                                   uniform int32 compare, uniform int32 newval)
 
 ``ispc`` also has a standard library routine that inserts a memory barrier
diff --git a/examples/aobench/ao.ispc b/examples/aobench/ao.ispc
index ffd85d29..61c2dc7d 100644
--- a/examples/aobench/ao.ispc
+++ b/examples/aobench/ao.ispc
@@ -212,104 +212,44 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
     RNGState rngstate;
 
     seed_rng(&rngstate, y0);
+    float invSamples = 1.f / nsubsamples;
 
-    // Compute the mapping between the 'programCount'-wide program
-    // instances running in parallel and samples in the image.  
-    //
-    // For now, we'll always take four samples per pixel, so start by
-    // initializing du and dv with offsets into subpixel samples.  We'll
-    // take care of further updating du and dv for the case where we're
-    // doing more than 4 program instances in parallel shortly.
-    uniform float uSteps[4] = { 0, 1, 0, 1 };
-    uniform float vSteps[4] = { 0, 0, 1, 1 };
-    float du = uSteps[programIndex % 4] / nsubsamples;
-    float dv = vSteps[programIndex % 4] / nsubsamples;
+    foreach_tiled(y = y0 ... y1, x = 0 ... w, 
+                  u = 0 ... nsubsamples, v = 0 ... nsubsamples) {
+        float du = (float)u * invSamples, dv = (float)v * invSamples;
 
-    // Now handle the case where we are able to do more than one pixel's
-    // worth of work at once.  nx records the number of pixels in the x
-    // direction we do per iteration and ny the number in y.
-    uniform int nx = 1, ny = 1;
+        // Figure out x,y pixel in NDC
+        float px =  (x + du - (w / 2.0f)) / (w / 2.0f);
+        float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
+        float ret = 0.f;
+        Ray ray;
+        Isect isect;
 
-    // FIXME: We actually need ny to be 1 regardless of the decomposition,
-    // since the task decomposition is one scanline high.
+        ray.org = 0.f;
 
-    if (programCount == 8) {
-        // Do two pixels at once in the x direction
-        nx = 2;
-        if (programIndex >= 4) 
-            // And shift the offsets for the second pixel's worth of work
-            ++du;
-    }
-    else if (programCount == 16) {
-        nx = 4;
-        ny = 1;
-        if (programIndex >= 4 && programIndex < 8)
-            ++du;
-        if (programIndex >= 8 && programIndex < 12)
-            du += 2;
-        if (programIndex >= 12)
-            du += 3;
-    }
+        // Poor man's perspective projection
+        ray.dir.x = px;
+        ray.dir.y = py;
+        ray.dir.z = -1.0;
+        vnormalize(ray.dir);
 
-    // Now loop over all of the pixels, stepping in x and y as calculated
-    // above.  (Assumes that ny divides y and nx divides x...)
-    for (uniform int y = y0; y < y1; y += ny) {
-        for (uniform int x = 0; x < w; x += nx)  {
-            // Figure out x,y pixel in NDC
-            float px =  (x + du - (w / 2.0f)) / (w / 2.0f);
-            float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
-            float ret = 0.f;
-            Ray ray;
-            Isect isect;
+        isect.t   = 1.0e+17;
+        isect.hit = 0;
 
-            ray.org = 0.f;
+        for (uniform int snum = 0; snum < 3; ++snum)
+            ray_sphere_intersect(isect, ray, spheres[snum]);
+        ray_plane_intersect(isect, ray, plane);
 
-            // Poor man's perspective projection
-            ray.dir.x = px;
-            ray.dir.y = py;
-            ray.dir.z = -1.0;
-            vnormalize(ray.dir);
+        // Note use of 'coherent' if statement; the set of rays we
+        // trace will often all hit or all miss the scene
+        cif (isect.hit) {
+            ret = ambient_occlusion(isect, plane, spheres, rngstate);
+            ret *= invSamples * invSamples;
 
-            isect.t   = 1.0e+17;
-            isect.hit = 0;
-
-            for (uniform int snum = 0; snum < 3; ++snum)
-                ray_sphere_intersect(isect, ray, spheres[snum]);
-            ray_plane_intersect(isect, ray, plane);
-
-            // Note use of 'coherent' if statement; the set of rays we
-            // trace will often all hit or all miss the scene
-            cif (isect.hit)
-                ret = ambient_occlusion(isect, plane, spheres, rngstate);
-
-            // This is a little grungy; we have results for
-            // programCount-worth of values.  Because we're doing 2x2
-            // subsamples, we need to peel them off in groups of four,
-            // average the four values for each pixel, and update the
-            // output image.
-            //
-            // Store the varying value to a uniform array of the same size.
-            // See the discussion about communication among program
-            // instances in the ispc user's manual for more discussion on
-            // this idiom.
-            uniform float retArray[programCount];
-            retArray[programIndex] = ret;
-
-            // offset to the first pixel in the image
-            uniform int offset = 3 * (y * w + x);
-            for (uniform int p = 0; p < programCount; p += 4, offset += 3) {
-                // Get the four sample values for this pixel
-                uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
-                    retArray[p+3];
-
-                // Normalize by number of samples taken
-                sumret /= nsubsamples * nsubsamples; 
-                
-                // Store result in the image
-                image[offset+0] = sumret;
-                image[offset+1] = sumret;
-                image[offset+2] = sumret;
-            }
+            int offset = 3 * (y * w + x);
+            atomic_add_local(&image[offset], ret);
+            atomic_add_local(&image[offset+1], ret);
+            atomic_add_local(&image[offset+2], ret);
         }
     }
 }
diff --git a/stdlib.ispc b/stdlib.ispc
index 0fe5e8ea..5bc931ec 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -795,217 +795,6 @@ static inline uniform int64 clock() {
     return __clock();
 }
 
-///////////////////////////////////////////////////////////////////////////
-// Atomics and memory barriers
-
-static inline void memory_barrier() {
-    __memory_barrier();
-}
-
-#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE)                        \
-static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
-    memory_barrier();                                                   \
-    TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \
-    memory_barrier();                                                   \
-    return ret;                                                         \
-}                                                                       \
-static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
-                                               uniform TA value) {      \
-    memory_barrier();                                                   \
-    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
-    memory_barrier();                                                   \
-    return ret;                                                         \
-}                                                                       \
-static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
-    uniform TA * uniform ptrArray[programCount];                        \
-    ptrArray[programIndex] = ptr;                                       \
-    memory_barrier();                                                   \
-    TA ret;                                                             \
-    uniform int mask = lanemask();                                      \
-    for (uniform int i = 0; i < programCount; ++i) {                    \
-        if ((mask & (1 << i)) == 0)                                     \
-            continue;                                                   \
-        uniform TA * uniform p = ptrArray[i];                           \
-        uniform TA v = extract(value, i);                               \
-        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v);    \
-        ret = insert(ret, i, r);                                        \
-    }                                                                   \
-    memory_barrier();                                                   \
-    return ret;                                                         \
-}                                                                       \
-
-#define DEFINE_ATOMIC_SWAP(TA,TB)                                       \
-static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
-    memory_barrier();                                                   \
-    uniform int i = 0;                                                  \
-    TA ret[programCount];                                               \
-    TA memVal;                                                          \
-    uniform int lastSwap;                                               \
-    uniform int mask = lanemask();                                      \
-    /* First, have the first running program instance (if any) perform  \
-       the swap with memory with its value of "value"; record the       \
-       value returned. */                                               \
-    for (; i < programCount; ++i) {                                     \
-        if ((mask & (1 << i)) == 0)                                     \
-            continue;                                                   \
-        memVal = __atomic_swap_uniform_##TB##_global(ptr, extract(value, i)); \
-        lastSwap = i;                                                   \
-        break;                                                          \
-    }                                                                   \
-    /* Now, for all of the remaining running program instances, set the \
-       return value of the last instance that did a swap with this      \
-       instance's value of "value"; this gives the same effect as if the \
-       current instance had executed a hardware atomic swap right before \
-       the last one that did a swap. */                                 \
-    for (; i < programCount; ++i) {                                     \
-        if ((mask & (1 << i)) == 0)                                     \
-            continue;                                                   \
-        ret[lastSwap] = extract(value, i);                              \
-        lastSwap = i;                                                   \
-    }                                                                   \
-    /* And the last instance that wanted to swap gets the value we      \
-       originally got back from memory... */                            \
-    ret[lastSwap] = memVal;                                             \
-    memory_barrier();                                                   \
-    return ret[programIndex];                                           \
-}                                                                       \
-static inline uniform TA atomic_swap_global(uniform TA * uniform ptr,   \
-                                            uniform TA value) {         \
-    memory_barrier();                                                   \
-    uniform TA ret = __atomic_swap_uniform_##TB##_global(ptr, value);   \
-    memory_barrier();                                                   \
-    return ret;                                                         \
-}                                                                       \
-static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
-    uniform TA * uniform ptrArray[programCount];                        \
-    ptrArray[programIndex] = ptr;                                       \
-    memory_barrier();                                                   \
-    TA ret;                                                             \
-    uniform int mask = lanemask();                                      \
-    for (uniform int i = 0; i < programCount; ++i) {                    \
-        if ((mask & (1 << i)) == 0)                                     \
-            continue;                                                   \
-        uniform TA * uniform p = ptrArray[i];                           \
-        uniform TA v = extract(value, i);                               \
-        uniform TA r = __atomic_swap_uniform_##TB##_global(p, v);       \
-        ret = insert(ret, i, r);                                        \
-    }                                                                   \
-    memory_barrier();                                                   \
-    return ret;                                                         \
-}                                                                       \
-
-#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB)                          \
-static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
-    uniform TA oneval = reduce_##OPA(value);                            \
-    TA ret;                                                             \
-    if (lanemask() != 0) {                                              \
-        memory_barrier();                                               \
-        ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval);      \
-        memory_barrier();                                               \
-    }                                                                   \
-    return ret;                                                         \
-}                                                                       \
-static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
-                                               uniform TA value) {      \
-    memory_barrier();                                                   \
-    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
-    memory_barrier();                                                   \
-    return ret;                                                         \
-}                                                                       \
-static inline TA atomic_##OPA##_global(uniform TA * varying ptr,        \
-                                       TA value) {                      \
-    uniform TA * uniform ptrArray[programCount];                        \
-    ptrArray[programIndex] = ptr;                                       \
-    memory_barrier();                                                   \
-    TA ret;                                                             \
-    uniform int mask = lanemask();                                      \
-    for (uniform int i = 0; i < programCount; ++i) {                    \
-        if ((mask & (1 << i)) == 0)                                     \
-            continue;                                                   \
-        uniform TA * uniform p = ptrArray[i];                           \
-        uniform TA v = extract(value, i);                               \
-        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v);    \
-        ret = insert(ret, i, r);                                        \
-    }                                                                   \
-    memory_barrier();                                                   \
-    return ret;                                                         \
-}
-
-DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType)
-DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min)
-DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max)
-DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType)
-DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType)
-DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType)
-DEFINE_ATOMIC_SWAP(int32,int32)
-
-// For everything but atomic min and max, we can use the same
-// implementations for unsigned as for signed.
-DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax)
-DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType)
-DEFINE_ATOMIC_SWAP(unsigned int32,int32)
-
-DEFINE_ATOMIC_SWAP(float,float)
-
-DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType)
-DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min)
-DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max)
-DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType)
-DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType)
-DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType)
-DEFINE_ATOMIC_SWAP(int64,int64)
-
-// For everything but atomic min and max, we can use the same
-// implementations for unsigned as for signed.
-DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin)
-DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax)
-DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType)
-DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType)
-DEFINE_ATOMIC_SWAP(unsigned int64,int64)
-
-DEFINE_ATOMIC_SWAP(double,double)
-
-#undef DEFINE_ATOMIC_OP
-#undef DEFINE_ATOMIC_MINMAX_OP
-#undef DEFINE_ATOMIC_SWAP
-
-#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE)                           \
-static inline TA atomic_compare_exchange_global(                           \
-         uniform TA * uniform ptr, TA oldval, TA newval) {                 \
-    memory_barrier();                                                      \
-    TA ret = __atomic_compare_exchange_##TB##_global(ptr, oldval, newval,  \
-                                                     (MASKTYPE)__mask);    \
-    memory_barrier();                                                      \
-    return ret;                                                            \
-} \
-static inline uniform TA atomic_compare_exchange_global(               \
-         uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
-    memory_barrier();                                                      \
-    uniform TA ret =                                                    \
-        __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval); \
-    memory_barrier();                                                   \
-    return ret;                                                         \
-}
-
-ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType)
-ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType)
-ATOMIC_DECL_CMPXCHG(float, float, IntMaskType)
-ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType)
-ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType)
-ATOMIC_DECL_CMPXCHG(double, double, IntMaskType)
-
-#undef ATOMIC_DECL_CMPXCHG
-
 ///////////////////////////////////////////////////////////////////////////
 // Floating-Point Math
 
@@ -1389,6 +1178,400 @@ static inline uniform int64 clamp(uniform int64 v, uniform int64 low,
     return min(max(v, low), high);
 }
 
+///////////////////////////////////////////////////////////////////////////
+// Global atomics and memory barriers
+
+static inline void memory_barrier() {
+    __memory_barrier();
+}
+
+#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE)                        \
+static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
+    memory_barrier();                                                   \
+    TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \
+    memory_barrier();                                                   \
+    return ret;                                                         \
+}                                                                       \
+static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
+                                               uniform TA value) {      \
+    memory_barrier();                                                   \
+    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
+    memory_barrier();                                                   \
+    return ret;                                                         \
+}                                                                       \
+static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
+    uniform TA * uniform ptrArray[programCount];                        \
+    ptrArray[programIndex] = ptr;                                       \
+    memory_barrier();                                                   \
+    TA ret;                                                             \
+    uniform int mask = lanemask();                                      \
+    for (uniform int i = 0; i < programCount; ++i) {                    \
+        if ((mask & (1 << i)) == 0)                                     \
+            continue;                                                   \
+        uniform TA * uniform p = ptrArray[i];                           \
+        uniform TA v = extract(value, i);                               \
+        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v);    \
+        ret = insert(ret, i, r);                                        \
+    }                                                                   \
+    memory_barrier();                                                   \
+    return ret;                                                         \
+}                                                                       \
+
+#define DEFINE_ATOMIC_SWAP(TA,TB)                                       \
+static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
+    memory_barrier();                                                   \
+    uniform int i = 0;                                                  \
+    TA ret[programCount];                                               \
+    TA memVal;                                                          \
+    uniform int lastSwap;                                               \
+    uniform int mask = lanemask();                                      \
+    /* First, have the first running program instance (if any) perform  \
+       the swap with memory with its value of "value"; record the       \
+       value returned. */                                               \
+    for (; i < programCount; ++i) {                                     \
+        if ((mask & (1 << i)) == 0)                                     \
+            continue;                                                   \
+        memVal = __atomic_swap_uniform_##TB##_global(ptr, extract(value, i)); \
+        lastSwap = i;                                                   \
+        break;                                                          \
+    }                                                                   \
+    /* Now, for all of the remaining running program instances, set the \
+       return value of the last instance that did a swap with this      \
+       instance's value of "value"; this gives the same effect as if the \
+       current instance had executed a hardware atomic swap right before \
+       the last one that did a swap. */                                 \
+    for (; i < programCount; ++i) {                                     \
+        if ((mask & (1 << i)) == 0)                                     \
+            continue;                                                   \
+        ret[lastSwap] = extract(value, i);                              \
+        lastSwap = i;                                                   \
+    }                                                                   \
+    /* And the last instance that wanted to swap gets the value we      \
+       originally got back from memory... */                            \
+    ret[lastSwap] = memVal;                                             \
+    memory_barrier();                                                   \
+    return ret[programIndex];                                           \
+}                                                                       \
+static inline uniform TA atomic_swap_global(uniform TA * uniform ptr,   \
+                                            uniform TA value) {         \
+    memory_barrier();                                                   \
+    uniform TA ret = __atomic_swap_uniform_##TB##_global(ptr, value);   \
+    memory_barrier();                                                   \
+    return ret;                                                         \
+}                                                                       \
+static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
+    uniform TA * uniform ptrArray[programCount];                        \
+    ptrArray[programIndex] = ptr;                                       \
+    memory_barrier();                                                   \
+    TA ret;                                                             \
+    uniform int mask = lanemask();                                      \
+    for (uniform int i = 0; i < programCount; ++i) {                    \
+        if ((mask & (1 << i)) == 0)                                     \
+            continue;                                                   \
+        uniform TA * uniform p = ptrArray[i];                           \
+        uniform TA v = extract(value, i);                               \
+        uniform TA r = __atomic_swap_uniform_##TB##_global(p, v);       \
+        ret = insert(ret, i, r);                                        \
+    }                                                                   \
+    memory_barrier();                                                   \
+    return ret;                                                         \
+}                                                                       \
+
+#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB)                          \
+static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
+    uniform TA oneval = reduce_##OPA(value);                            \
+    TA ret;                                                             \
+    if (lanemask() != 0) {                                              \
+        memory_barrier();                                               \
+        ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval);      \
+        memory_barrier();                                               \
+    }                                                                   \
+    return ret;                                                         \
+}                                                                       \
+static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
+                                               uniform TA value) {      \
+    memory_barrier();                                                   \
+    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value); \
+    memory_barrier();                                                   \
+    return ret;                                                         \
+}                                                                       \
+static inline TA atomic_##OPA##_global(uniform TA * varying ptr,        \
+                                       TA value) {                      \
+    uniform TA * uniform ptrArray[programCount];                        \
+    ptrArray[programIndex] = ptr;                                       \
+    memory_barrier();                                                   \
+    TA ret;                                                             \
+    uniform int mask = lanemask();                                      \
+    for (uniform int i = 0; i < programCount; ++i) {                    \
+        if ((mask & (1 << i)) == 0)                                     \
+            continue;                                                   \
+        uniform TA * uniform p = ptrArray[i];                           \
+        uniform TA v = extract(value, i);                               \
+        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v);    \
+        ret = insert(ret, i, r);                                        \
+    }                                                                   \
+    memory_barrier();                                                   \
+    return ret;                                                         \
+}
+
+DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType)
+DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType)
+DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min)
+DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max)
+DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType)
+DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType)
+DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType)
+DEFINE_ATOMIC_SWAP(int32,int32)
+
+// For everything but atomic min and max, we can use the same
+// implementations for unsigned as for signed.
+DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax)
+DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType)
+DEFINE_ATOMIC_SWAP(unsigned int32,int32)
+
+DEFINE_ATOMIC_SWAP(float,float)
+
+DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType)
+DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType)
+DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min)
+DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max)
+DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType)
+DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType)
+DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType)
+DEFINE_ATOMIC_SWAP(int64,int64)
+
+// For everything but atomic min and max, we can use the same
+// implementations for unsigned as for signed.
+DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin)
+DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax)
+DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType)
+DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType)
+DEFINE_ATOMIC_SWAP(unsigned int64,int64)
+
+DEFINE_ATOMIC_SWAP(double,double)
+
+#undef DEFINE_ATOMIC_OP
+#undef DEFINE_ATOMIC_MINMAX_OP
+#undef DEFINE_ATOMIC_SWAP
+
+#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE)                           \
+static inline TA atomic_compare_exchange_global(                           \
+         uniform TA * uniform ptr, TA oldval, TA newval) {                 \
+    memory_barrier();                                                      \
+    TA ret = __atomic_compare_exchange_##TB##_global(ptr, oldval, newval,  \
+                                                     (MASKTYPE)__mask);    \
+    memory_barrier();                                                      \
+    return ret;                                                            \
+} \
+static inline uniform TA atomic_compare_exchange_global(               \
+         uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
+    memory_barrier();                                                      \
+    uniform TA ret =                                                    \
+        __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval); \
+    memory_barrier();                                                   \
+    return ret;                                                         \
+}
+
+ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType)
+ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType)
+ATOMIC_DECL_CMPXCHG(float, float, IntMaskType)
+ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType)
+ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType)
+ATOMIC_DECL_CMPXCHG(double, double, IntMaskType)
+
+#undef ATOMIC_DECL_CMPXCHG
+
+///////////////////////////////////////////////////////////////////////////
+// local atomics
+
+#define LOCAL_ATOMIC(TYPE,NAME,OPFUNC)                                  \
+static inline uniform TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, \
+                                                 uniform TYPE value) {  \
+    uniform TYPE ret = *ptr;                                           \
+    *ptr = OPFUNC(*ptr, value);                                        \
+     return ret;                                                       \
+}                                                                      \
+static inline TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, TYPE value) { \
+    TYPE ret;                                                          \
+    uniform int mask = lanemask();                                     \
+    for (uniform int i = 0; i < programCount; ++i) {                   \
+        if ((mask & (1 << i)) == 0)                                    \
+            continue;                                                  \
+        ret = insert(ret, i, *ptr);                                    \
+        *ptr = OPFUNC(*ptr, extract(value, i));                        \
+    }                                                                  \
+    return ret;                                                        \
+}                                                                      \
+static inline TYPE atomic_##NAME##_local(uniform TYPE * p, TYPE value) {    \
+    TYPE ret;                                                          \
+    uniform TYPE * uniform ptrs[programCount];                         \
+    ptrs[programIndex] = p;                                            \
+    uniform int mask = lanemask();                                     \
+    for (uniform int i = 0; i < programCount; ++i) {                   \
+        if ((mask & (1 << i)) == 0)                                    \
+            continue;                                                  \
+        ret = insert(ret, i, *ptrs[i]);                                \
+        *ptrs[i] = OPFUNC(*ptrs[i], extract(value, i));                \
+    }                                                                  \
+    return ret;                                                        \
+}
+
+static inline uniform int32 __add(uniform int32 a, uniform int32 b) { return a+b; }
+static inline uniform int32 __sub(uniform int32 a, uniform int32 b) { return a-b; }
+static inline uniform int32 __and(uniform int32 a, uniform int32 b) { return a & b; }
+static inline uniform int32 __or(uniform int32 a, uniform int32 b) { return a | b; }
+static inline uniform int32 __xor(uniform int32 a, uniform int32 b) { return a ^ b; }
+static inline uniform int32 __swap(uniform int32 a, uniform int32 b) { return b; }
+
+static inline uniform unsigned int32 __add(uniform unsigned int32 a, 
+                                           uniform unsigned int32 b) { return a+b; }
+static inline uniform unsigned int32 __sub(uniform unsigned int32 a, 
+                                           uniform unsigned int32 b) { return a-b; }
+static inline uniform unsigned int32 __and(uniform unsigned int32 a, 
+                                           uniform unsigned int32 b) { return a & b; }
+static inline uniform unsigned int32 __or(uniform unsigned int32 a, 
+                                          uniform unsigned int32 b) { return a | b; }
+static inline uniform unsigned int32 __xor(uniform unsigned int32 a, 
+                                           uniform unsigned int32 b) { return a ^ b; }
+static inline uniform unsigned int32 __swap(uniform unsigned int32 a, 
+                                            uniform unsigned int32 b) { return b; }
+
+
+static inline uniform float __add(uniform float a, uniform float b) { return a+b; }
+static inline uniform float __sub(uniform float a, uniform float b) { return a-b; }
+static inline uniform float __swap(uniform float a, uniform float b) { return b; }
+
+static inline uniform int64 __add(uniform int64 a, uniform int64 b) { return a+b; }
+static inline uniform int64 __sub(uniform int64 a, uniform int64 b) { return a-b; }
+static inline uniform int64 __and(uniform int64 a, uniform int64 b) { return a & b; }
+static inline uniform int64 __or(uniform int64 a, uniform int64 b) { return a | b; }
+static inline uniform int64 __xor(uniform int64 a, uniform int64 b) { return a ^ b; }
+static inline uniform int64 __swap(uniform int64 a, uniform int64 b) { return b; }
+
+static inline uniform unsigned int64 __add(uniform unsigned int64 a, 
+                                           uniform unsigned int64 b) { return a+b; }
+static inline uniform unsigned int64 __sub(uniform unsigned int64 a, 
+                                           uniform unsigned int64 b) { return a-b; }
+static inline uniform unsigned int64 __and(uniform unsigned int64 a, 
+                                           uniform unsigned int64 b) { return a & b; }
+static inline uniform unsigned int64 __or(uniform unsigned int64 a, 
+                                          uniform unsigned int64 b) { return a | b; }
+static inline uniform unsigned int64 __xor(uniform unsigned int64 a, 
+                                           uniform unsigned int64 b) { return a ^ b; }
+static inline uniform unsigned int64 __swap(uniform unsigned int64 a, 
+                                            uniform unsigned int64 b) { return b; }
+
+static inline uniform double __add(uniform double a, uniform double b) { return a+b; }
+static inline uniform double __sub(uniform double a, uniform double b) { return a-b; }
+static inline uniform double __swap(uniform double a, uniform double b) { return b; } // swap stores the new value
+
+LOCAL_ATOMIC(int32, add, __add)
+LOCAL_ATOMIC(int32, subtract, __sub)
+LOCAL_ATOMIC(int32, and, __and)
+LOCAL_ATOMIC(int32, or, __or)
+LOCAL_ATOMIC(int32, xor, __xor)
+LOCAL_ATOMIC(int32, min, min)
+LOCAL_ATOMIC(int32, max, max)
+LOCAL_ATOMIC(int32, swap, __swap)
+
+LOCAL_ATOMIC(unsigned int32, add, __add)
+LOCAL_ATOMIC(unsigned int32, subtract, __sub)
+LOCAL_ATOMIC(unsigned int32, and, __and)
+LOCAL_ATOMIC(unsigned int32, or, __or)
+LOCAL_ATOMIC(unsigned int32, xor, __xor)
+LOCAL_ATOMIC(unsigned int32, min, min)
+LOCAL_ATOMIC(unsigned int32, max, max)
+LOCAL_ATOMIC(unsigned int32, swap, __swap)
+
+LOCAL_ATOMIC(float, add, __add)
+LOCAL_ATOMIC(float, subtract, __sub)
+LOCAL_ATOMIC(float, min, min)
+LOCAL_ATOMIC(float, max, max)
+LOCAL_ATOMIC(float, swap, __swap)
+
+LOCAL_ATOMIC(int64, add, __add)
+LOCAL_ATOMIC(int64, subtract, __sub)
+LOCAL_ATOMIC(int64, and, __and)
+LOCAL_ATOMIC(int64, or, __or)
+LOCAL_ATOMIC(int64, xor, __xor)
+LOCAL_ATOMIC(int64, min, min)
+LOCAL_ATOMIC(int64, max, max)
+LOCAL_ATOMIC(int64, swap, __swap)
+
+LOCAL_ATOMIC(unsigned int64, add, __add)
+LOCAL_ATOMIC(unsigned int64, subtract, __sub)
+LOCAL_ATOMIC(unsigned int64, and, __and)
+LOCAL_ATOMIC(unsigned int64, or, __or)
+LOCAL_ATOMIC(unsigned int64, xor, __xor)
+LOCAL_ATOMIC(unsigned int64, min, min)
+LOCAL_ATOMIC(unsigned int64, max, max)
+LOCAL_ATOMIC(unsigned int64, swap, __swap)
+
+LOCAL_ATOMIC(double, add, __add)
+LOCAL_ATOMIC(double, subtract, __sub)
+LOCAL_ATOMIC(double, min, min)
+LOCAL_ATOMIC(double, max, max)
+LOCAL_ATOMIC(double, swap, __swap)
+
+// compare exchange
+#define LOCAL_CMPXCHG(TYPE)                                             \
+static inline uniform TYPE atomic_compare_exchange_local(uniform TYPE * uniform ptr, \
+                                                         uniform TYPE cmp, \
+                                                         uniform TYPE update) { \
+    uniform TYPE old = *ptr;                                               \
+    if (old == cmp)                                                     \
+        *ptr = update;                                                  \
+    return old;                                                         \
+}                                                                       \
+static inline TYPE atomic_compare_exchange_local(uniform TYPE * uniform ptr, \
+                                                 TYPE cmp, TYPE update) { \
+    TYPE ret;                                                          \
+    uniform int mask = lanemask();                                     \
+    for (uniform int i = 0; i < programCount; ++i) {                   \
+        if ((mask & (1 << i)) == 0)                                    \
+            continue;                                                  \
+        uniform TYPE old = *ptr;                                       \
+        if (old == extract(cmp, i))                                    \
+            *ptr = extract(update, i);                                 \
+        ret = insert(ret, i, old);                                     \
+    }                                                                  \
+    return ret;                                                        \
+}                                                                       \
+static inline TYPE atomic_compare_exchange_local(uniform TYPE * varying p, \
+                                                 TYPE cmp, TYPE update) { \
+    uniform TYPE * uniform ptrs[programCount];                          \
+    ptrs[programIndex] = p;                                            \
+    TYPE ret;                                                          \
+    uniform int mask = lanemask();                                     \
+    for (uniform int i = 0; i < programCount; ++i) {                   \
+        if ((mask & (1 << i)) == 0)                                    \
+            continue;                                                  \
+        uniform TYPE old = *ptrs[i];                                   \
+        if (old == extract(cmp, i))                                    \
+            *ptrs[i] = extract(update, i);                             \
+        ret = insert(ret, i, old);                                     \
+    }                                                                  \
+    return ret;                                                        \
+}
+
+LOCAL_CMPXCHG(int32)
+LOCAL_CMPXCHG(unsigned int32)
+LOCAL_CMPXCHG(float)
+LOCAL_CMPXCHG(int64)
+LOCAL_CMPXCHG(unsigned int64)
+LOCAL_CMPXCHG(double)
+
+#undef LOCAL_ATOMIC
+#undef LOCAL_CMPXCHG
+
 ///////////////////////////////////////////////////////////////////////////
 // Transcendentals (float precision)
 
diff --git a/tests/local-atomics-1.ispc b/tests/local-atomics-1.ispc
new file mode 100644
index 00000000..1b3b337a
--- /dev/null
+++ b/tests/local-atomics-1.ispc
@@ -0,0 +1,15 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    float delta = 1;
+    float b = atomic_add_local(&s, delta);
+    RET[programIndex] = reduce_add(b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = reduce_add(programIndex);
+}
diff --git a/tests/local-atomics-10.ispc b/tests/local-atomics-10.ispc
new file mode 100644
index 00000000..77eb1387
--- /dev/null
+++ b/tests/local-atomics-10.ispc
@@ -0,0 +1,17 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    float b = 0;
+    float delta = 1;
+    if (programIndex < 2)
+        b = atomic_add_local(&s, delta);
+    RET[programIndex] = s;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = programCount == 1 ? 1 : 2;
+}
diff --git a/tests/local-atomics-11.ispc b/tests/local-atomics-11.ispc
new file mode 100644
index 00000000..ee17ef30
--- /dev/null
+++ b/tests/local-atomics-11.ispc
@@ -0,0 +1,20 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    float b = 0;
+    if (programIndex & 1)
+        b = atomic_add_local(&s, programIndex);
+    RET[programIndex] = s;
+}
+
+export void result(uniform float RET[]) {
+    uniform int sum = 0;
+    for (uniform int i = 0; i < programCount; ++i)
+        if (i & 1)
+            sum += i;
+    RET[programIndex] = sum;
+}
diff --git a/tests/local-atomics-12.ispc b/tests/local-atomics-12.ispc
new file mode 100644
index 00000000..fc7938ce
--- /dev/null
+++ b/tests/local-atomics-12.ispc
@@ -0,0 +1,20 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    float b = 0;
+    if (programIndex & 1)
+        b = atomic_or_local(&s, (1 << programIndex));
+    RET[programIndex] = s;
+}
+
+export void result(uniform float RET[]) {
+    uniform int sum = 0;
+    for (uniform int i = 0; i < programCount; ++i)
+        if (i & 1)
+            sum += (1 << i);
+    RET[programIndex] = sum;
+}
diff --git a/tests/local-atomics-13.ispc b/tests/local-atomics-13.ispc
new file mode 100644
index 00000000..632e34ea
--- /dev/null
+++ b/tests/local-atomics-13.ispc
@@ -0,0 +1,16 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    float b = 0;
+    if (programIndex & 1)
+        b = atomic_or_local(&s, (1 << programIndex));
+    RET[programIndex] = popcnt(reduce_max((int32)b));
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = programCount == 1 ? 0 : ((programCount/2) - 1);
+}
diff --git a/tests/local-atomics-14.ispc b/tests/local-atomics-14.ispc
new file mode 100644
index 00000000..a5f7e63f
--- /dev/null
+++ b/tests/local-atomics-14.ispc
@@ -0,0 +1,20 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int64 s = 0xffffffffff000000;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    float b = 0;
+    if (programIndex & 1)
+        b = atomic_or_local(&s, (1 << programIndex));
+    RET[programIndex] = (s>>20);
+}
+
+export void result(uniform float RET[]) {
+    uniform int sum = 0;
+    for (uniform int i = 0; i < programCount; ++i)
+        if (i & 1)
+            sum += (1 << i);
+    RET[programIndex] = ((unsigned int64)(0xffffffffff000000 | sum)) >> 20;
+}
diff --git a/tests/local-atomics-2.ispc b/tests/local-atomics-2.ispc
new file mode 100644
index 00000000..82964afd
--- /dev/null
+++ b/tests/local-atomics-2.ispc
@@ -0,0 +1,15 @@
+
+export uniform int width() { return programCount; }
+
+uniform int64 s = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    float delta = 1;
+    float b = atomic_add_local(&s, delta);
+    RET[programIndex] = reduce_add(b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = reduce_add(programIndex);
+}
diff --git a/tests/local-atomics-3.ispc b/tests/local-atomics-3.ispc
new file mode 100644
index 00000000..558335e4
--- /dev/null
+++ b/tests/local-atomics-3.ispc
@@ -0,0 +1,15 @@
+
+export uniform int width() { return programCount; }
+
+uniform int32 s = 0xff;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    int32 bits = 0xfff0;
+    float b = atomic_xor_local(&s, bits);
+    RET[programIndex] = s;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = (programCount & 1) ? 0xff0f : 0xff;
+}
diff --git a/tests/local-atomics-4.ispc b/tests/local-atomics-4.ispc
new file mode 100644
index 00000000..651cf4c6
--- /dev/null
+++ b/tests/local-atomics-4.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+uniform int32 s = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    float b = atomic_or_local(&s, (1<<programIndex));
+    RET[programIndex] = s;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = (1<<programCount)-1;
+}
diff --git a/tests/local-atomics-5.ispc b/tests/local-atomics-5.ispc
new file mode 100644
index 00000000..a24fc7fd
--- /dev/null
+++ b/tests/local-atomics-5.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+uniform int32 s = 0xbeef;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    float b = atomic_swap_local(&s, programIndex);
+    RET[programIndex] = reduce_max(b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0xbeef;
+}
diff --git a/tests/local-atomics-6.ispc b/tests/local-atomics-6.ispc
new file mode 100644
index 00000000..24b56f42
--- /dev/null
+++ b/tests/local-atomics-6.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+uniform int32 s = 2;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    float b = atomic_compare_exchange_local(&s, programIndex, a*1000);
+    RET[programIndex] = s;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = (programCount == 1) ? 2 : 3000;
+}
diff --git a/tests/local-atomics-7.ispc b/tests/local-atomics-7.ispc
new file mode 100644
index 00000000..0d1b541d
--- /dev/null
+++ b/tests/local-atomics-7.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+uniform int32 s = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    int32 a = aFOO[programIndex]; 
+    float b = atomic_min_local(&s, a);
+    RET[programIndex] = reduce_min(b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = reduce_min(programIndex);
+}
diff --git a/tests/local-atomics-8.ispc b/tests/local-atomics-8.ispc
new file mode 100644
index 00000000..288120f6
--- /dev/null
+++ b/tests/local-atomics-8.ispc
@@ -0,0 +1,16 @@
+
+export uniform int width() { return programCount; }
+
+uniform int32 s = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    int32 a = aFOO[programIndex]; 
+    int32 b = 0;
+    if (programIndex & 1)
+        b = atomic_max_local(&s, a);
+    RET[programIndex] = s;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = (programCount == 1) ? 0 : programCount;
+}
diff --git a/tests/local-atomics-9.ispc b/tests/local-atomics-9.ispc
new file mode 100644
index 00000000..c6ab12e5
--- /dev/null
+++ b/tests/local-atomics-9.ispc
@@ -0,0 +1,17 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    float b = 0;
+    int32 delta = 1;
+    if (programIndex < 2)
+        b = atomic_add_local(&s, delta);
+    RET[programIndex] = reduce_add(b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = (programCount == 1) ? 0 : 1;
+}
diff --git a/tests/local-atomics-swap.ispc b/tests/local-atomics-swap.ispc
new file mode 100644
index 00000000..64ae712a
--- /dev/null
+++ b/tests/local-atomics-swap.ispc
@@ -0,0 +1,17 @@
+
+export uniform int width() { return programCount; }
+
+uniform int32 s = 1234;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    float b = 0;
+    if (programIndex & 1) {
+        b = atomic_swap_local(&s, programIndex);
+    }
+    RET[programIndex] = reduce_add(b) + s;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1234 + reduce_add(programIndex & 1 ? programIndex : 0);
+}
diff --git a/tests/local-atomics-uniform-1.ispc b/tests/local-atomics-uniform-1.ispc
new file mode 100644
index 00000000..937ef55e
--- /dev/null
+++ b/tests/local-atomics-uniform-1.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s = 10;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    uniform unsigned int32 b = atomic_add_local(&s, 1);
+    RET[programIndex] = s;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 11;
+}
diff --git a/tests/local-atomics-uniform-2.ispc b/tests/local-atomics-uniform-2.ispc
new file mode 100644
index 00000000..44fd56a9
--- /dev/null
+++ b/tests/local-atomics-uniform-2.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s = 0b1010;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    uniform unsigned int32 b = atomic_or_local(&s, 1);
+    RET[programIndex] = s;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0b1011;
+}
diff --git a/tests/local-atomics-uniform-3.ispc b/tests/local-atomics-uniform-3.ispc
new file mode 100644
index 00000000..6ad13140
--- /dev/null
+++ b/tests/local-atomics-uniform-3.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s = 0b1010;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    uniform unsigned int32 b = atomic_or_local(&s, 1);
+    RET[programIndex] = b;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0b1010;
+}
diff --git a/tests/local-atomics-uniform-4.ispc b/tests/local-atomics-uniform-4.ispc
new file mode 100644
index 00000000..bbe938a3
--- /dev/null
+++ b/tests/local-atomics-uniform-4.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s = 0xffff;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    uniform unsigned int32 b = atomic_min_local(&s, 1);
+    RET[programIndex] = b;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0xffff;
+}
diff --git a/tests/local-atomics-uniform-5.ispc b/tests/local-atomics-uniform-5.ispc
new file mode 100644
index 00000000..468cfe26
--- /dev/null
+++ b/tests/local-atomics-uniform-5.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s = 0xffff;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    uniform unsigned int32 b = atomic_min_local(&s, 1);
+    RET[programIndex] = s;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1;
+}
diff --git a/tests/local-atomics-uniform-6.ispc b/tests/local-atomics-uniform-6.ispc
new file mode 100644
index 00000000..7e838dd0
--- /dev/null
+++ b/tests/local-atomics-uniform-6.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+uniform float s = 100.;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    uniform float b = atomic_swap_local(&s, 1.);
+    RET[programIndex] = s;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1.;
+}
diff --git a/tests/local-atomics-uniform-7.ispc b/tests/local-atomics-uniform-7.ispc
new file mode 100644
index 00000000..3a9255e4
--- /dev/null
+++ b/tests/local-atomics-uniform-7.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+uniform float s = 100.;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    uniform float b = atomic_swap_local(&s, 1.);
+    RET[programIndex] = b;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 100.;
+}
diff --git a/tests/local-atomics-uniform-8.ispc b/tests/local-atomics-uniform-8.ispc
new file mode 100644
index 00000000..5f202383
--- /dev/null
+++ b/tests/local-atomics-uniform-8.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+uniform float s = 100.;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    uniform float b = atomic_compare_exchange_local(&s, 1., -100.);
+    RET[programIndex] = b;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 100.;
+}
diff --git a/tests/local-atomics-uniform-9.ispc b/tests/local-atomics-uniform-9.ispc
new file mode 100644
index 00000000..4d89977d
--- /dev/null
+++ b/tests/local-atomics-uniform-9.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+uniform int64 s = 100.;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    uniform int64 b = atomic_compare_exchange_local(&s, 100, -100);
+    RET[programIndex] = s;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = -100.;
+}
diff --git a/tests/local-atomics-varyingptr-1.ispc b/tests/local-atomics-varyingptr-1.ispc
new file mode 100644
index 00000000..385ab5d6
--- /dev/null
+++ b/tests/local-atomics-varyingptr-1.ispc
@@ -0,0 +1,18 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s[programCount];
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    float b = 0;
+    float delta = 1;
+    if (programIndex < 2)
+        atomic_add_local(&s[programIndex], delta);
+    RET[programIndex] = s[programIndex];
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = RET[1] = 1;
+}
diff --git a/tests/local-atomics-varyingptr-2.ispc b/tests/local-atomics-varyingptr-2.ispc
new file mode 100644
index 00000000..12243adb
--- /dev/null
+++ b/tests/local-atomics-varyingptr-2.ispc
@@ -0,0 +1,16 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s[programCount];
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    float b = 0;
+    float delta = 1;
+    atomic_add_local(&s[programCount-1-programIndex], programIndex);
+    RET[programIndex] = s[programIndex];
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = programCount-1-programIndex;
+}
diff --git a/tests/local-atomics-varyingptr-3.ispc b/tests/local-atomics-varyingptr-3.ispc
new file mode 100644
index 00000000..bb34b292
--- /dev/null
+++ b/tests/local-atomics-varyingptr-3.ispc
@@ -0,0 +1,18 @@
+
+export uniform int width() { return programCount; }
+
+uniform unsigned int32 s[programCount];
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    for (uniform int i = 0; i < programCount; ++i)
+        s[i] = 1234;
+    float a = aFOO[programIndex]; 
+    float b = 0;
+    float delta = 1;
+    a = atomic_max_local(&s[programIndex], programIndex);
+    RET[programIndex] = a;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1234;
+}
diff --git a/tests/local-atomics-varyingptr-4.ispc b/tests/local-atomics-varyingptr-4.ispc
new file mode 100644
index 00000000..749f72ce
--- /dev/null
+++ b/tests/local-atomics-varyingptr-4.ispc
@@ -0,0 +1,15 @@
+
+export uniform int width() { return programCount; }
+
+uniform int32 s[programCount];
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    for (uniform int i = 0; i < programCount; ++i)
+        s[i] = -1234;
+    atomic_max_local(&s[programIndex], programIndex);
+    RET[programIndex] = s[programIndex];
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = programIndex;
+}

From c2ecc15b935460e1cab576f7269a1dc1c7eb9084 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Fri, 3 Feb 2012 13:19:15 -0800
Subject: [PATCH 37/62] Add missing "varying/varying"
 atomic_compare_exchange_global() functions.

---
 stdlib.ispc | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/stdlib.ispc b/stdlib.ispc
index 5bc931ec..e8af3790 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -1363,6 +1363,14 @@ DEFINE_ATOMIC_SWAP(double,double)
 #undef DEFINE_ATOMIC_SWAP
 
 #define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE)                           \
+static inline uniform TA atomic_compare_exchange_global(               \
+         uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
+    memory_barrier();                                                      \
+    uniform TA ret =                                                    \
+        __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval); \
+    memory_barrier();                                                   \
+    return ret;                                                         \
+}                                                                       \
 static inline TA atomic_compare_exchange_global(                           \
          uniform TA * uniform ptr, TA oldval, TA newval) {                 \
     memory_barrier();                                                      \
@@ -1371,11 +1379,22 @@ static inline TA atomic_compare_exchange_global(                           \
     memory_barrier();                                                      \
     return ret;                                                            \
 } \
-static inline uniform TA atomic_compare_exchange_global(               \
-         uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
-    memory_barrier();                                                      \
-    uniform TA ret =                                                    \
-        __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval); \
+static inline TA atomic_compare_exchange_global(               \
+         uniform TA * varying ptr, TA oldval, TA newval) { \
+    uniform TA * uniform ptrArray[programCount];                        \
+    ptrArray[programIndex] = ptr;                                       \
+    memory_barrier();                                                   \
+    TA ret;                                                             \
+    uniform int mask = lanemask();                                      \
+    for (uniform int i = 0; i < programCount; ++i) {                    \
+        if ((mask & (1 << i)) == 0)                                     \
+            continue;                                                   \
+        uniform TA r =                                                  \
+            __atomic_compare_exchange_uniform_##TB##_global(ptrArray[i], \
+                                                            extract(oldval, i), \
+                                                            extract(newval, i)); \
+        ret = insert(ret, i, r);                                        \
+    }                                                                   \
     memory_barrier();                                                   \
     return ret;                                                         \
 }

From a9ec7452751df55a42ec5885f6b78a3c199a9438 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Sat, 4 Feb 2012 15:38:17 -0800
Subject: [PATCH 38/62] Release notes, bump doxygen release number for 1.1.4

---
 docs/ReleaseNotes.txt | 39 +++++++++++++++++++++++++++++++++++++++
 doxygen.cfg           |  2 +-
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/docs/ReleaseNotes.txt b/docs/ReleaseNotes.txt
index 14b42c25..5e67e901 100644
--- a/docs/ReleaseNotes.txt
+++ b/docs/ReleaseNotes.txt
@@ -1,3 +1,42 @@
+=== v1.1.4 === (4 February 2012)
+
+There are two major bugfixes for Windows in this release.  First, a number
+of failures in AVX code generation on Windows have been fixed; AVX on
+Windows now has no known issues.  Second, a longstanding bug in parsing 64-bit
+integer constants on Windows has been fixed.
+
+This release features a new experimental scalar target, contributed by Gabe
+Weisz <gweisz@cs.cmu.edu>.  This target ("--target=generic-1") compiles
+gangs of single program instances (i.e. programCount == 1); it can be
+useful for debugging ispc programs.
+
+The compiler now supports dynamic memory allocation in ispc programs (with
+"new" and "delete" operators based on C++).  See
+http://ispc.github.com/ispc.html#dynamic-memory-allocation in the
+documentation for more information.
+
+ispc now performs "short circuit" evaluation of the || and && logical
+operators and the ? : selection operator.  (This represents the correction
+of a major incompatibility with C.)  Code like "(index < arraySize &&
+array[index] == 1)" thus now executes as in C, where "array[index]" won't
+be evaluated unless "index" is less than "arraySize".
+
+The standard library now provides "local" atomic operations, which are
+atomic across the gang of program instances (but not across other gangs or
+other hardware threads).  See the updated documentation on atomics for more
+information:
+http://ispc.github.com/ispc.html#atomic-operations-and-memory-fences.
+
+The standard library now offers a clock() function, which returns a uniform
+int64 value that counts processor cycles; it can be used for
+fine-resolution timing measurements.
+
+Finally (of limited interest now): ispc now supports the forthcoming AVX2
+instruction set, due with Haswell-generation CPUs.  All tests and examples
+compile and execute correctly with AVX2.  (Thanks specifically to Craig
+Topper and Nadav Rotem for work on AVX2 support in LLVM, which made this
+possible.)
+ 
 === v1.1.3 === (20 January 2012)
 
 With this release, the language now supports "switch" statements, with the
diff --git a/doxygen.cfg b/doxygen.cfg
index 75d925df..30c097de 100644
--- a/doxygen.cfg
+++ b/doxygen.cfg
@@ -31,7 +31,7 @@ PROJECT_NAME           = "Intel SPMD Program Compiler"
 # This could be handy for archiving the generated documentation or
 # if some version control system is used.
 
-PROJECT_NUMBER         = 1.1.3
+PROJECT_NUMBER         = 1.1.4
 
 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
 # base path where the generated documentation will be put.

From 724a843bbd56d4f77b427162271f3cee2568f3c3 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Mon, 6 Feb 2012 06:24:10 -0800
Subject: [PATCH 39/62] Add --quiet option to suppress all diagnostic output

---
 ispc.cpp |  1 +
 ispc.h   |  3 +++
 main.cpp |  3 +++
 parse.yy |  7 ++++++-
 util.cpp | 17 +++++++++++------
 5 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/ispc.cpp b/ispc.cpp
index a817d17e..4293b21b 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -510,6 +510,7 @@ Globals::Globals() {
     debugPrint = false;
     disableWarnings = false;
     warningsAsErrors = false;
+    quiet = false;
     disableLineWrap = false;
     emitPerfWarnings = true;
     emitInstrumentation = false;
diff --git a/ispc.h b/ispc.h
index 9ebfef53..856b988e 100644
--- a/ispc.h
+++ b/ispc.h
@@ -388,6 +388,9 @@ struct Globals {
         possible performance pitfalls. */
     bool emitPerfWarnings;
 
+    /** Indicates whether all printed output should be suppressed. */
+    bool quiet;
+
     /** Indicates whether calls should be emitted in the program to an
         externally-defined program instrumentation function. (See the
         "Instrumenting your ispc programs" section in the user's
diff --git a/main.cpp b/main.cpp
index 7874aa81..4c05d044 100644
--- a/main.cpp
+++ b/main.cpp
@@ -129,6 +129,7 @@ usage(int ret) {
 #ifndef ISPC_IS_WINDOWS
     printf("    [--pic]\t\t\t\tGenerate position-independent code\n");
 #endif // !ISPC_IS_WINDOWS
+    printf("    [--quiet]\t\t\t\tSuppress all output\n");
     printf("    [--target=<isa>]\t\t\tSelect target ISA. <isa>={%s}\n", Target::SupportedTargetISAs());
     printf("    [--version]\t\t\t\tPrint ispc version\n");
     printf("    [--werror]\t\t\t\tTreat warnings as errors\n");
@@ -383,6 +384,8 @@ int main(int Argc, char *Argv[]) {
         else if (!strcmp(argv[i], "--pic"))
             generatePIC = true;
 #endif // !ISPC_IS_WINDOWS
+        else if (!strcmp(argv[i], "--quiet"))
+            g->quiet = true;
         else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version")) {
             lPrintVersion();
             return 0;
diff --git a/parse.yy b/parse.yy
index 52dd6809..95b22e00 100644
--- a/parse.yy
+++ b/parse.yy
@@ -91,7 +91,12 @@ extern int yylex(YYSTYPE *, SourcePos *);
 
 extern char *yytext;
 
-void yyerror(const char *s) { fprintf(stderr, "Parse error: %s\n", s); }
+void yyerror(const char *s) {
+    // Always count the parse error; --quiet only silences the message.
+    if (m != NULL) ++m->errorCount;
+    if (!g->quiet)
+        fprintf(stderr, "Parse error: %s\n", s);
+}
 
 static void lAddDeclaration(DeclSpecs *ds, Declarator *decl);
 static void lAddFunctionParams(Declarator *decl);
diff --git a/util.cpp b/util.cpp
index 9c802f19..b7976e64 100644
--- a/util.cpp
+++ b/util.cpp
@@ -265,17 +265,20 @@ lPrint(const char *type, SourcePos p, const char *fmt, va_list args) {
 
 void
 Error(SourcePos p, const char *fmt, ...) {
+    if (m != NULL) ++m->errorCount;
+    if (g->quiet)
+        return;
+
     va_list args;
     va_start(args, fmt);
     lPrint("Error", p, fmt, args);
-    if (m != NULL) ++m->errorCount;
     va_end(args);
 }
 
 
 void
 Debug(SourcePos p, const char *fmt, ...) {
-    if (!g->debugPrint)
+    if (!g->debugPrint || g->quiet)
         return;
 
     va_list args;
@@ -287,21 +290,23 @@ Debug(SourcePos p, const char *fmt, ...) {
 
 void
 Warning(SourcePos p, const char *fmt, ...) {
-    if (g->disableWarnings)
+    if (g->warningsAsErrors && m != NULL)
+        ++m->errorCount;
+
+    if (g->disableWarnings || g->quiet)
         return;
 
     va_list args;
     va_start(args, fmt);
     lPrint(g->warningsAsErrors ? "Error" : "Warning", p, fmt, args);
-    if (g->warningsAsErrors && m != NULL)
-        ++m->errorCount;
     va_end(args);
 }
 
 
 void
 PerformanceWarning(SourcePos p, const char *fmt, ...) {
-    if (!g->emitPerfWarnings || strcmp(p.name, "stdlib.ispc") == 0)
+    if (!g->emitPerfWarnings || strcmp(p.name, "stdlib.ispc") == 0 ||
+        g->quiet)
         return;
 
     va_list args;

From fa7a7fe23e7b03740149ab66b12000e3a8801b4a Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Mon, 6 Feb 2012 06:29:31 -0800
Subject: [PATCH 40/62] Fix error handling in type code.

---
 type.cpp | 221 +++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 190 insertions(+), 31 deletions(-)

diff --git a/type.cpp b/type.cpp
index b5f7ad5a..db68b47a 100644
--- a/type.cpp
+++ b/type.cpp
@@ -982,6 +982,10 @@ PointerType::GetAsUnboundVariabilityType() const {
 
 const PointerType *
 PointerType::ResolveUnboundVariability(Variability v) const {
+    if (baseType == NULL) {
+        Assert(m->errorCount > 0);
+        return NULL;
+    }
     return new PointerType(baseType->ResolveUnboundVariability(v),
                            (variability == Unbound) ? v : variability,
                            isConst);
@@ -1015,8 +1019,10 @@ PointerType::GetAsNonConstType() const {
 
 std::string 
 PointerType::GetString() const {
-    if (baseType == NULL)
+    if (baseType == NULL) {
+        Assert(m->errorCount > 0);
         return "";
+    }
 
     std::string ret = baseType->GetString();
 
@@ -1035,8 +1041,10 @@ PointerType::GetString() const {
 std::string
 PointerType::Mangle() const {
     Assert(variability != Unbound);
-    if (baseType == NULL)
+    if (baseType == NULL) {
+        Assert(m->errorCount > 0);
         return "";
+    }
 
     return ((variability == Uniform) ? std::string("uptr<") : std::string("vptr<")) + 
         baseType->Mangle() + std::string(">");
@@ -1050,8 +1058,10 @@ PointerType::GetCDeclaration(const std::string &name) const {
         return "";
     }
 
-    if (baseType == NULL)
+    if (baseType == NULL) {
+        Assert(m->errorCount > 0);
         return "";
+    }
 
     std::string ret = baseType->GetCDeclaration("");
     ret += std::string(" *");
@@ -1065,8 +1075,10 @@ PointerType::GetCDeclaration(const std::string &name) const {
 LLVM_TYPE_CONST llvm::Type *
 PointerType::LLVMType(llvm::LLVMContext *ctx) const {
     Assert(variability != Unbound);
-    if (baseType == NULL)
+    if (baseType == NULL) {
+        Assert(m->errorCount > 0);
         return NULL;
+    }
 
     if (variability == Varying)
         // always the same, since we currently use int vectors for varying
@@ -1120,8 +1132,10 @@ lCreateDIArray(llvm::DIType eltType, int count) {
 llvm::DIType
 PointerType::GetDIType(llvm::DIDescriptor scope) const {
     Assert(variability != Unbound);
-    if (baseType == NULL)
+    if (baseType == NULL) {
+        Assert(m->errorCount > 0);
         return llvm::DIType();
+    }
 
     llvm::DIType diTargetType = baseType->GetDIType(scope);
     int bitsSize = g->target.is32Bit ? 32 : 64;
@@ -1156,12 +1170,16 @@ ArrayType::ArrayType(const Type *c, int a)
 
 LLVM_TYPE_CONST llvm::ArrayType *
 ArrayType::LLVMType(llvm::LLVMContext *ctx) const {
-    if (!child)
+    if (child == NULL) {
+        Assert(m->errorCount > 0);
         return NULL;
+    }
 
     LLVM_TYPE_CONST llvm::Type *ct = child->LLVMType(ctx);
-    if (!ct)
+    if (ct == NULL) {
+        Assert(m->errorCount > 0);
         return NULL;
+    }
     return llvm::ArrayType::get(ct, numElements);
 }
 
@@ -1217,62 +1235,80 @@ ArrayType::GetBaseType() const {
 
 const ArrayType *
 ArrayType::GetAsVaryingType() const {
-    if (child == NULL)
+    if (child == NULL) {
+        Assert(m->errorCount > 0);
         return NULL;
+    }
     return new ArrayType(child->GetAsVaryingType(), numElements);
 }
 
 
 const ArrayType *
 ArrayType::GetAsUniformType() const {
-    if (child == NULL)
+    if (child == NULL) {
+        Assert(m->errorCount > 0);
         return NULL;
+    }
     return new ArrayType(child->GetAsUniformType(), numElements);
 }
 
 
 const ArrayType *
 ArrayType::GetAsUnboundVariabilityType() const {
-    if (child == NULL)
+    if (child == NULL) {
+        Assert(m->errorCount > 0);
         return NULL;
+    }
     return new ArrayType(child->GetAsUnboundVariabilityType(), numElements);
 }
 
 
 const ArrayType *
 ArrayType::ResolveUnboundVariability(Variability v) const {
+    if (child == NULL) {
+        Assert(m->errorCount > 0);
+        return NULL;
+    }
     return new ArrayType(child->ResolveUnboundVariability(v), numElements);
 }
 
 
 const ArrayType *
 ArrayType::GetAsUnsignedType() const {
-    if (child == NULL)
+    if (child == NULL) {
+        Assert(m->errorCount > 0);
         return NULL;
+    }
     return new ArrayType(child->GetAsUnsignedType(), numElements);
 }
 
 
 const Type *
 ArrayType::GetSOAType(int width) const {
-    if (child == NULL)
+    if (child == NULL) {
+        Assert(m->errorCount > 0);
         return NULL;
+    }
     return new ArrayType(child->GetSOAType(width), numElements);
 }
 
 
 const ArrayType *
 ArrayType::GetAsConstType() const {
-    if (child == NULL)
+    if (child == NULL) {
+        Assert(m->errorCount > 0);
         return NULL;
+    }
     return new ArrayType(child->GetAsConstType(), numElements);
 }
 
 
 const ArrayType *
 ArrayType::GetAsNonConstType() const {
-    if (child == NULL)
+    if (child == NULL) {
+        Assert(m->errorCount > 0);
         return NULL;
+    }
     return new ArrayType(child->GetAsNonConstType(), numElements);
 }
 
@@ -1291,7 +1327,12 @@ ArrayType::GetElementType() const {
 
 std::string
 ArrayType::GetString() const {
-    std::string s = GetBaseType()->GetString();
+    const Type *base = GetBaseType();
+    if (base == NULL) {
+        Assert(m->errorCount > 0);
+        return "";
+    }
+    std::string s = base->GetString();
 
     const ArrayType *at = this;
     // Walk through this and any children arrays and print all of their
@@ -1311,8 +1352,10 @@ ArrayType::GetString() const {
 
 std::string
 ArrayType::Mangle() const {
-    if (child == NULL)
+    if (child == NULL) {
+        Assert(m->errorCount > 0);
         return "(error)";
+    }
     std::string s = child->Mangle();
     char buf[16];
     if (numElements > 0)
@@ -1325,7 +1368,12 @@ ArrayType::Mangle() const {
 
 std::string
 ArrayType::GetCDeclaration(const std::string &name) const {
-    std::string s = GetBaseType()->GetCDeclaration(name);
+    const Type *base = GetBaseType();
+    if (base == NULL) {
+        Assert(m->errorCount > 0);
+        return "";
+    }
+    std::string s = base->GetCDeclaration(name);
 
     const ArrayType *at = this;
     while (at) {
@@ -1344,7 +1392,7 @@ ArrayType::GetCDeclaration(const std::string &name) const {
 int
 ArrayType::TotalElementCount() const {
     const ArrayType *ct = dynamic_cast<const ArrayType *>(child);
-    if (ct)
+    if (ct != NULL)
         return numElements * ct->TotalElementCount();
     else
         return numElements;
@@ -1353,8 +1401,10 @@ ArrayType::TotalElementCount() const {
 
 llvm::DIType
 ArrayType::GetDIType(llvm::DIDescriptor scope) const {
-    if (!child)
+    if (child == NULL) {
+        Assert(m->errorCount > 0);
         return llvm::DIType();
+    }
 
     llvm::DIType eltType = child->GetDIType(scope);
     return lCreateDIArray(eltType, numElements);
@@ -2083,36 +2133,60 @@ ReferenceType::ReferenceType(const Type *t)
 
 Type::Variability
 ReferenceType::GetVariability() const {
+    if (targetType == NULL) {
+        Assert(m->errorCount > 0);
+        return Type::Unbound;
+    }
     return targetType->GetVariability(); 
 }
 
 
 bool
 ReferenceType::IsBoolType() const {
+    if (targetType == NULL) {
+        Assert(m->errorCount > 0);
+        return false;
+    }
     return targetType->IsBoolType(); 
 }
 
 
 bool
 ReferenceType::IsFloatType() const {
+    if (targetType == NULL) {
+        Assert(m->errorCount > 0);
+        return false;
+    }
     return targetType->IsFloatType(); 
 }
 
 
 bool
 ReferenceType::IsIntType() const {
+    if (targetType == NULL) {
+        Assert(m->errorCount > 0);
+        return false;
+    }
     return targetType->IsIntType(); 
 }
 
 
 bool
 ReferenceType::IsUnsignedType() const {
+    if (targetType == NULL) {
+        Assert(m->errorCount > 0);
+        return false;
+    }
     return targetType->IsUnsignedType(); 
 }
 
 
 bool
 ReferenceType::IsConstType() const {
+    if (targetType == NULL) {
+        Assert(m->errorCount > 0);
+        return false;
+    }
     return targetType->IsConstType();
 }
 
@@ -2125,12 +2199,20 @@ ReferenceType::GetReferenceTarget() const {
 
 const Type *
 ReferenceType::GetBaseType() const {
+    if (targetType == NULL) {
+        Assert(m->errorCount > 0);
+        return NULL;
+    }
     return targetType->GetBaseType();
 }
 
 
 const ReferenceType *
 ReferenceType::GetAsVaryingType() const {
+    if (targetType == NULL) {
+        Assert(m->errorCount > 0);
+        return NULL;
+    }
     if (IsVaryingType()) 
         return this;
     return new ReferenceType(targetType->GetAsVaryingType());
@@ -2139,6 +2221,10 @@ ReferenceType::GetAsVaryingType() const {
 
 const ReferenceType *
 ReferenceType::GetAsUniformType() const {
+    if (targetType == NULL) {
+        Assert(m->errorCount > 0);
+        return NULL;
+    }
     if (IsUniformType()) 
         return this;
     return new ReferenceType(targetType->GetAsUniformType());
@@ -2147,6 +2233,10 @@ ReferenceType::GetAsUniformType() const {
 
 const ReferenceType *
 ReferenceType::GetAsUnboundVariabilityType() const {
+    if (targetType == NULL) {
+        Assert(m->errorCount > 0);
+        return NULL;
+    }
     if (HasUnboundVariability()) 
         return this;
     return new ReferenceType(targetType->GetAsUnboundVariabilityType());
@@ -2155,18 +2245,30 @@ ReferenceType::GetAsUnboundVariabilityType() const {
 
 const ReferenceType *
 ReferenceType::ResolveUnboundVariability(Variability v) const {
+    if (targetType == NULL) {
+        Assert(m->errorCount > 0);
+        return NULL;
+    }
     return new ReferenceType(targetType->ResolveUnboundVariability(v));
 }
     
 
 const Type *
 ReferenceType::GetSOAType(int width) const {
+    if (targetType == NULL) {
+        Assert(m->errorCount > 0);
+        return NULL;
+    }
     return new ReferenceType(targetType->GetSOAType(width));
 }
 
 
 const ReferenceType *
 ReferenceType::GetAsConstType() const {
+    if (targetType == NULL) {
+        Assert(m->errorCount > 0);
+        return NULL;
+    }
     if (IsConstType())
         return this;
     return new ReferenceType(targetType->GetAsConstType());
@@ -2175,6 +2277,10 @@ ReferenceType::GetAsConstType() const {
 
 const ReferenceType *
 ReferenceType::GetAsNonConstType() const {
+    if (targetType == NULL) {
+        Assert(m->errorCount > 0);
+        return NULL;
+    }
     if (!IsConstType())
         return this;
     return new ReferenceType(targetType->GetAsNonConstType());
@@ -2183,8 +2289,10 @@ ReferenceType::GetAsNonConstType() const {
 
 std::string
 ReferenceType::GetString() const {
-    if (targetType == NULL)
+    if (targetType == NULL) {
+        Assert(m->errorCount > 0);
         return "";
+    }
 
     std::string ret = targetType->GetString();
 
@@ -2195,6 +2303,10 @@ ReferenceType::GetString() const {
 
 std::string
 ReferenceType::Mangle() const {
+    if (targetType == NULL) {
+        Assert(m->errorCount > 0);
+        return "";
+    }
     std::string ret;
     ret += std::string("REF") + targetType->Mangle();
     return ret;
@@ -2203,6 +2315,11 @@ ReferenceType::Mangle() const {
 
 std::string
 ReferenceType::GetCDeclaration(const std::string &name) const {
+    if (targetType == NULL) {
+        Assert(m->errorCount > 0);
+        return "";
+    }
+
     const ArrayType *at = dynamic_cast<const ArrayType *>(targetType);
     if (at != NULL) {
         if (at->GetElementCount() == 0) {
@@ -2231,17 +2348,28 @@ ReferenceType::GetCDeclaration(const std::string &name) const {
 
 LLVM_TYPE_CONST llvm::Type *
 ReferenceType::LLVMType(llvm::LLVMContext *ctx) const {
-    if (!targetType)
+    if (targetType == NULL) {
+        Assert(m->errorCount > 0);
         return NULL;
+    }
+
     LLVM_TYPE_CONST llvm::Type *t = targetType->LLVMType(ctx);
-    if (!t)
+    if (t == NULL) {
+        Assert(m->errorCount > 0);
         return NULL;
+    }
+
     return llvm::PointerType::get(t, 0);
 }
 
 
 llvm::DIType
 ReferenceType::GetDIType(llvm::DIDescriptor scope) const {
+    if (targetType == NULL) {
+        Assert(m->errorCount > 0);
+        return llvm::DIType();
+    }
+
     llvm::DIType diTargetType = targetType->GetDIType(scope);
     return m->diBuilder->createReferenceType(diTargetType);
 }
@@ -2340,11 +2468,21 @@ FunctionType::GetAsUnboundVariabilityType() const {
 
 const FunctionType *
 FunctionType::ResolveUnboundVariability(Variability v) const {
+    if (returnType == NULL) {
+        Assert(m->errorCount > 0);
+        return NULL;
+    }
     const Type *rt = returnType->ResolveUnboundVariability(v);
+
     std::vector<const Type *> pt;
-    for (unsigned int i = 0; i < paramTypes.size(); ++i)
-        pt.push_back((paramTypes[i] == NULL) ? NULL :
-                     paramTypes[i]->ResolveUnboundVariability(v));
+    for (unsigned int i = 0; i < paramTypes.size(); ++i) {
+        if (paramTypes[i] == NULL) {
+            Assert(m->errorCount > 0);
+            return NULL;
+        }
+        pt.push_back(paramTypes[i]->ResolveUnboundVariability(v));
+    }
+
     return new FunctionType(rt, pt, paramNames, paramDefaults,
                             paramPositions, isTask, isExported, isExternC);
 }
@@ -2375,10 +2513,17 @@ std::string
 FunctionType::GetString() const {
     std::string ret;
     if (isTask) ret += "task ";
-    ret += returnType->GetString();
+    if (returnType != NULL)
+        ret += returnType->GetString();
+    else
+        ret += "/* ERROR */";
     ret += "(";
     for (unsigned int i = 0; i < paramTypes.size(); ++i) {
-        ret += paramTypes[i]->GetString();
+        if (paramTypes[i] == NULL)
+            ret += "/* ERROR */";
+        else
+            ret += paramTypes[i]->GetString();
+
         if (i != paramTypes.size() - 1)
             ret += ", ";
     }
@@ -2391,7 +2536,11 @@ std::string
 FunctionType::Mangle() const {
     std::string ret = "___";
     for (unsigned int i = 0; i < paramTypes.size(); ++i)
-        ret += paramTypes[i]->Mangle();
+        if (paramTypes[i] == NULL)
+            Assert(m->errorCount > 0);
+        else
+            ret += paramTypes[i]->Mangle();
+
     return ret;
 }
 
@@ -2444,18 +2593,23 @@ FunctionType::GetDIType(llvm::DIDescriptor scope) const {
 
 LLVM_TYPE_CONST llvm::FunctionType *
 FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool includeMask) const {
-    if (isTask == true) Assert(includeMask == true);
+    if (isTask == true) 
+        Assert(includeMask == true);
 
     // Get the LLVM Type *s for the function arguments
     std::vector<LLVM_TYPE_CONST llvm::Type *> llvmArgTypes;
     for (unsigned int i = 0; i < paramTypes.size(); ++i) {
-        if (!paramTypes[i])
+        if (paramTypes[i] == NULL) {
+            Assert(m->errorCount > 0);
             return NULL;
+        }
         Assert(paramTypes[i] != AtomicType::Void);
 
         LLVM_TYPE_CONST llvm::Type *t = paramTypes[i]->LLVMType(ctx);
-        if (t == NULL)
+        if (t == NULL) {
+            Assert(m->errorCount > 0);
             return NULL;
+        }
         llvmArgTypes.push_back(t);
     }
 
@@ -2481,6 +2635,11 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool includeMask) const {
         // Otherwise we already have the types of the arguments 
         callTypes = llvmArgTypes;
 
+    if (returnType == NULL) {
+        Assert(m->errorCount > 0);
+        return NULL;
+    }
+
     return llvm::FunctionType::get(returnType->LLVMType(g->ctx), callTypes, false);
 }
 

From 977b983771ef8caad2e1067075df3c84c5e116ea Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Mon, 6 Feb 2012 07:44:45 -0800
Subject: [PATCH 41/62] Issue error on "void" typed variable, function
 parameter, or struct member.

---
 decl.cpp                       | 17 ++++++++++++++++-
 module.cpp                     |  5 +++++
 tests_errors/void-1.ispc       |  5 +++++
 tests_errors/void-2.ispc       |  5 +++++
 tests_errors/void-3.ispc       |  5 +++++
 tests_errors/void-4.ispc       |  3 +++
 tests_errors/void-array-1.ispc |  5 +++++
 tests_errors/void-array-2.ispc |  4 ++++
 tests_errors/void-array-3.ispc |  7 +++++++
 type.cpp                       |  1 +
 10 files changed, 56 insertions(+), 1 deletion(-)
 create mode 100644 tests_errors/void-1.ispc
 create mode 100644 tests_errors/void-2.ispc
 create mode 100644 tests_errors/void-3.ispc
 create mode 100644 tests_errors/void-4.ispc
 create mode 100644 tests_errors/void-array-1.ispc
 create mode 100644 tests_errors/void-array-2.ispc
 create mode 100644 tests_errors/void-array-3.ispc

diff --git a/decl.cpp b/decl.cpp
index c62f0b6f..5661c4a3 100644
--- a/decl.cpp
+++ b/decl.cpp
@@ -332,6 +332,11 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
         break;
 
     case DK_ARRAY:
+        if (type == AtomicType::Void) {
+            Error(pos, "Arrays of \"void\" type are illegal.");
+            return NULL;
+        }
+
         type = new ArrayType(type, arraySize);
         if (child)
             return child->GetType(type, ds);
@@ -358,6 +363,11 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
                       "function parameter declaration for parameter \"%s\".", 
                       lGetStorageClassName(d->declSpecs->storageClass),
                       sym->name.c_str());
+            if (sym->type == AtomicType::Void) {
+                Error(sym->pos, "Parameter with type \"void\" illegal in function "
+                      "parameter list.");
+                sym->type = NULL;
+            }
 
             const ArrayType *at = dynamic_cast<const ArrayType *>(sym->type);
             if (at != NULL) {
@@ -544,7 +554,9 @@ Declaration::GetVariableDeclarations() const {
         Symbol *sym = decl->GetSymbol();
         sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
 
-        if (dynamic_cast<const FunctionType *>(sym->type) == NULL) {
+        if (sym->type == AtomicType::Void)
+            Error(sym->pos, "\"void\" type variable illegal in declaration.");
+        else if (dynamic_cast<const FunctionType *>(sym->type) == NULL) {
             m->symbolTable->AddVariable(sym);
             vars.push_back(VariableDeclaration(sym, decl->initExpr));
         }
@@ -611,6 +623,9 @@ GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
 
             Symbol *sym = d->GetSymbol();
 
+            if (sym->type == AtomicType::Void)
+                Error(d->pos, "\"void\" type illegal for struct member.");
+
             const ArrayType *arrayType = 
                 dynamic_cast<const ArrayType *>(sym->type);
             if (arrayType != NULL && arrayType->GetElementCount() == 0) {
diff --git a/module.cpp b/module.cpp
index df09955a..3881ae2b 100644
--- a/module.cpp
+++ b/module.cpp
@@ -237,6 +237,11 @@ Module::AddGlobalVariable(Symbol *sym, Expr *initExpr, bool isConst) {
         return;
     }
 
+    if (sym->type == AtomicType::Void) {
+        Error(sym->pos, "\"void\" type global variable is illegal.");
+        return;
+    }
+
     sym->type = ArrayType::SizeUnsizedArrays(sym->type, initExpr);
     if (sym->type == NULL)
         return;
diff --git a/tests_errors/void-1.ispc b/tests_errors/void-1.ispc
new file mode 100644
index 00000000..44f19555
--- /dev/null
+++ b/tests_errors/void-1.ispc
@@ -0,0 +1,5 @@
+// "void" type variable illegal in declaration
+
+int func() {
+    void x;
+}
diff --git a/tests_errors/void-2.ispc b/tests_errors/void-2.ispc
new file mode 100644
index 00000000..3cd50dcf
--- /dev/null
+++ b/tests_errors/void-2.ispc
@@ -0,0 +1,5 @@
+// Parameter with type "void" illegal in function parameter list
+
+void func(void x, void y) {
+    return x+y;
+}
diff --git a/tests_errors/void-3.ispc b/tests_errors/void-3.ispc
new file mode 100644
index 00000000..7910de54
--- /dev/null
+++ b/tests_errors/void-3.ispc
@@ -0,0 +1,5 @@
+// "void" type illegal for struct member
+
+struct Foo {
+    void x;
+};
diff --git a/tests_errors/void-4.ispc b/tests_errors/void-4.ispc
new file mode 100644
index 00000000..d2f8f48f
--- /dev/null
+++ b/tests_errors/void-4.ispc
@@ -0,0 +1,3 @@
+// "void" type global variable is illegal
+
+void x;
diff --git a/tests_errors/void-array-1.ispc b/tests_errors/void-array-1.ispc
new file mode 100644
index 00000000..1e9aaa5c
--- /dev/null
+++ b/tests_errors/void-array-1.ispc
@@ -0,0 +1,5 @@
+// Arrays of "void" type are illegal
+
+float f_fu(uniform void aFOO[]) {
+    return 0;
+}
diff --git a/tests_errors/void-array-2.ispc b/tests_errors/void-array-2.ispc
new file mode 100644
index 00000000..79d646e6
--- /dev/null
+++ b/tests_errors/void-array-2.ispc
@@ -0,0 +1,4 @@
+// Arrays of "void" type are illegal
+
+uniform void aFOO[] = { NULL };
+
diff --git a/tests_errors/void-array-3.ispc b/tests_errors/void-array-3.ispc
new file mode 100644
index 00000000..2cca0f37
--- /dev/null
+++ b/tests_errors/void-array-3.ispc
@@ -0,0 +1,7 @@
+// Arrays of "void" type are illegal
+
+struct Foo {
+    void aFOO[];
+};
+
+
diff --git a/type.cpp b/type.cpp
index db68b47a..f771dc73 100644
--- a/type.cpp
+++ b/type.cpp
@@ -1165,6 +1165,7 @@ ArrayType::ArrayType(const Type *c, int a)
     : child(c), numElements(a) {
     // 0 -> unsized array.
     Assert(numElements >= 0);
+    Assert(c != AtomicType::Void);
 }
 
 

From 4e018d0a208894938f1e5fc6e070b2c85b48152e Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Mon, 6 Feb 2012 09:40:52 -0800
Subject: [PATCH 42/62] Improve tracking of source position in the presence of
 /* */ comments.

Don't let the preprocessor remove comments anymore, so that the rules
in lex.ll can handle them.  Fix lCComment() to update the source
position as it eats characters in comments.
---
 lex.ll     | 4 +++-
 module.cpp | 4 ++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/lex.ll b/lex.ll
index 9797e4e5..e5e395c3 100644
--- a/lex.ll
+++ b/lex.ll
@@ -309,8 +309,10 @@ lParseBinary(const char *ptr, SourcePos pos, char **endPtr) {
 static void
 lCComment(SourcePos *pos) {
     char c, prev = 0;
-  
+
     while ((c = yyinput()) != 0) {
+        ++pos->last_column;
+
         if (c == '\n') {
             pos->last_line++;
             pos->last_column = 1;
diff --git a/module.cpp b/module.cpp
index 3881ae2b..6bd79332 100644
--- a/module.cpp
+++ b/module.cpp
@@ -1170,6 +1170,10 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre
     inst.createSourceManager(inst.getFileManager());
     inst.InitializeSourceManager(infilename);
 
+    // Don't remove comments in the preprocessor, so that we can accurately
+    // track the source file position by handling them ourselves.
+    inst.getPreprocessorOutputOpts().ShowComments = 1;
+
     clang::PreprocessorOptions &opts = inst.getPreprocessorOpts();
 
     // Add defs for ISPC and PI

From 2236d53def261b31f9fc82867455afe0bef90f1a Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Mon, 6 Feb 2012 09:56:21 -0800
Subject: [PATCH 43/62] Issue error if &=, |=, ^=, <<=, or >>= used with
 floats.

---
 expr.cpp                          | 8 ++++++++
 tests_errors/float-logical-1.ispc | 5 +++++
 tests_errors/float-logical.ispc   | 5 +++++
 3 files changed, 18 insertions(+)
 create mode 100644 tests_errors/float-logical-1.ispc
 create mode 100644 tests_errors/float-logical.ispc

diff --git a/expr.cpp b/expr.cpp
index b1194603..7a36a4bb 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -2623,6 +2623,14 @@ AssignExpr::TypeCheck() {
     if (rvalue == NULL)
         return NULL;
 
+    if (lhsType->IsFloatType() == true &&
+        (op == ShlAssign || op == ShrAssign || op == AndAssign || 
+         op == XorAssign || op == OrAssign)) {
+            Error(pos, "Illegal to use %s operator with floating-point "
+                  "operands.", lOpString(op));
+            return NULL;
+    }
+
     // Make sure we're not assigning to a struct that has a constant member
     const StructType *st = dynamic_cast<const StructType *>(lhsType);
     if (st != NULL && lCheckForConstStructMember(pos, st, st))
diff --git a/tests_errors/float-logical-1.ispc b/tests_errors/float-logical-1.ispc
new file mode 100644
index 00000000..9aad44c5
--- /dev/null
+++ b/tests_errors/float-logical-1.ispc
@@ -0,0 +1,5 @@
+// First operand to binary operator "&" must be an integer or bool
+
+float foo(float a, float b) {
+    return a & b;
+}
diff --git a/tests_errors/float-logical.ispc b/tests_errors/float-logical.ispc
new file mode 100644
index 00000000..27ab4c8c
--- /dev/null
+++ b/tests_errors/float-logical.ispc
@@ -0,0 +1,5 @@
+// Illegal to use ^= operator with floating-point
+
+float foo(float a, float b) {
+    return a ^= b;
+}

From fddc5e022e745946bc8762672c99568aafdc417e Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Mon, 6 Feb 2012 10:01:50 -0800
Subject: [PATCH 44/62] Fix typo in IfStmt::EstimateCost()

---
 stmt.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stmt.cpp b/stmt.cpp
index 617abfdf..8e90eb3e 100644
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -427,7 +427,7 @@ IfStmt::TypeCheck() {
 int
 IfStmt::EstimateCost() const {
     const Type *type;
-    if (test == NULL || (type = test->GetType()) != NULL)
+    if (test == NULL || (type = test->GetType()) == NULL)
         return 0;
 
     return type->IsUniformType() ? COST_UNIFORM_IF : COST_VARYING_IF;

From 96a429694f04900744351e11b6ed45bf9fd10aec Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Mon, 6 Feb 2012 12:39:46 -0800
Subject: [PATCH 45/62] 80 column fixes

---
 module.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/module.cpp b/module.cpp
index 6bd79332..75a16046 100644
--- a/module.cpp
+++ b/module.cpp
@@ -227,13 +227,14 @@ Module::AddGlobalVariable(Symbol *sym, Expr *initExpr, bool isConst) {
     }
 
     if (symbolTable->LookupFunction(sym->name.c_str())) {
-        Error(sym->pos, "Global variable \"%s\" shadows previously-declared function.",
-              sym->name.c_str());
+        Error(sym->pos, "Global variable \"%s\" shadows previously-declared "
+              "function.", sym->name.c_str());
         return;
     }
 
     if (sym->storageClass == SC_EXTERN_C) {
-        Error(sym->pos, "extern \"C\" qualifier can only be used for functions.");
+        Error(sym->pos, "extern \"C\" qualifier can only be used for "
+              "functions.");
         return;
     }
 

From a9ed71f553ed0776beb723b59fcb2fb22f29d414 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Mon, 6 Feb 2012 12:41:55 -0800
Subject: [PATCH 46/62] Bug fixes to avoid NULL pointer derefs with malformed
 programs.

---
 decl.cpp | 41 ++++++++++++++++++++++++++++++++++++-----
 expr.cpp |  5 +++++
 parse.yy | 31 ++++++++++++++++++++++---------
 3 files changed, 63 insertions(+), 14 deletions(-)

diff --git a/decl.cpp b/decl.cpp
index 5661c4a3..fd96d3c0 100644
--- a/decl.cpp
+++ b/decl.cpp
@@ -171,6 +171,11 @@ Declarator::Declarator(DeclaratorKind dk, SourcePos p)
 void
 Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
     const Type *t = GetType(ds);
+    if (t == NULL) {
+        Assert(m->errorCount > 0);
+        return;
+    }
+
     Symbol *sym = GetSymbol();
     if (sym != NULL) {
         sym->type = t;
@@ -248,8 +253,10 @@ Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
     // already have been added to the symbol table by AddGlobal() by the
     // time we get here.)
     Symbol *funSym = m->symbolTable->LookupFunction(declSym->name.c_str(), type);
-    if (funSym != NULL)
+    if (funSym == NULL)
         // May be NULL due to error earlier in compilation
+        Assert(m->errorCount > 0);
+    else
         funSym->pos = pos;
 
     // Walk down to the declarator for the function.  (We have to get past
@@ -262,7 +269,13 @@ Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
 
     for (unsigned int i = 0; i < d->functionParams.size(); ++i) {
         Symbol *sym = d->GetSymbolForFunctionParameter(i);
-        sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
+        if (sym->type == NULL) {
+            Assert(m->errorCount > 0);
+            continue;
+        }
+        else
+            sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
+
         funArgs->push_back(sym);
     }
 
@@ -379,8 +392,12 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
                 // report this differently than it was originally declared
                 // in the function, but it's not clear that this is a
                 // significant problem.)
-                sym->type = PointerType::GetUniform(at->GetElementType());
+                if (at->GetElementType() == NULL) {
+                    Assert(m->errorCount > 0);
+                    return NULL;
+                }
 
+                sym->type = PointerType::GetUniform(at->GetElementType());
                 // Make sure there are no unsized arrays (other than the
                 // first dimension) in function parameter lists.
                 at = dynamic_cast<const ArrayType *>(at->GetElementType());
@@ -547,11 +564,18 @@ Declaration::GetVariableDeclarations() const {
 
     for (unsigned int i = 0; i < declarators.size(); ++i) {
         Declarator *decl = declarators[i];
-        if (decl == NULL)
+        if (decl == NULL) {
             // Ignore earlier errors
+            Assert(m->errorCount > 0);
             continue;
+        }
 
         Symbol *sym = decl->GetSymbol();
+        if (sym == NULL || sym->type == NULL) {
+            // Ignore errors
+            Assert(m->errorCount > 0);
+            continue;
+        }
         sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
 
         if (sym->type == AtomicType::Void)
@@ -571,11 +595,18 @@ Declaration::DeclareFunctions() {
 
     for (unsigned int i = 0; i < declarators.size(); ++i) {
         Declarator *decl = declarators[i];
-        if (decl == NULL)
+        if (decl == NULL) {
             // Ignore earlier errors
+            Assert(m->errorCount > 0);
             continue;
+        }
 
         Symbol *sym = decl->GetSymbol();
+        if (sym == NULL || sym->type == NULL) {
+            // Ignore errors
+            Assert(m->errorCount > 0);
+            continue;
+        }
         sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
 
         if (dynamic_cast<const FunctionType *>(sym->type) == NULL)
diff --git a/expr.cpp b/expr.cpp
index 7a36a4bb..eb0567f5 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -2584,6 +2584,11 @@ AssignExpr::TypeCheck() {
     }
 
     const Type *lhsType = lvalue->GetType();
+    if (lhsType == NULL) {
+        Assert(m->errorCount > 0);
+        return NULL;
+    }
+
     if (lhsType->IsConstType()) {
         Error(lvalue->pos, "Can't assign to type \"%s\" on left-hand side of "
               "expression.", lhsType->GetString().c_str());
diff --git a/parse.yy b/parse.yy
index 95b22e00..2688557e 100644
--- a/parse.yy
+++ b/parse.yy
@@ -521,9 +521,16 @@ constant_expression
 declaration_statement
     : declaration     
     {
-        if ($1->declSpecs->storageClass == SC_TYPEDEF) {
+        if ($1 == NULL) {
+            Assert(m->errorCount > 0);
+            $$ = NULL;
+        }
+        else if ($1->declSpecs->storageClass == SC_TYPEDEF) {
             for (unsigned int i = 0; i < $1->declarators.size(); ++i) {
-                m->AddTypeDef($1->declarators[i]->GetSymbol());
+                if ($1->declarators[i] == NULL)
+                    Assert(m->errorCount > 0);
+                else
+                    m->AddTypeDef($1->declarators[i]->GetSymbol());
             }
             $$ = NULL;
         }
@@ -658,7 +665,6 @@ type_specifier
     : atomic_var_type_specifier { $$ = $1; }
     | TOKEN_TYPE_NAME
       { const Type *t = m->symbolTable->LookupType(yytext); 
-        Assert(t != NULL);
         $$ = t;
       }
     | struct_or_union_specifier { $$ = $1; }
@@ -1618,7 +1624,10 @@ lAddDeclaration(DeclSpecs *ds, Declarator *decl) {
             m->AddFunctionDeclaration(sym, isInline);
         }
         else {
-            sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
+            if (sym->type == NULL)
+                Assert(m->errorCount > 0);
+            else
+                sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
             bool isConst = (ds->typeQualifiers & TYPEQUAL_CONST) != 0;
             m->AddGlobalVariable(sym, decl->initExpr, isConst);
         }
@@ -1647,14 +1656,18 @@ lAddFunctionParams(Declarator *decl) {
             continue;
         Assert(pdecl->declarators.size() == 1);
         Symbol *sym = pdecl->declarators[0]->GetSymbol();
-        sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
-#ifndef NDEBUG
-        bool ok = m->symbolTable->AddVariable(sym);
-        if (ok == false)
+        if (sym == NULL || sym->type == NULL)
             Assert(m->errorCount > 0);
+        else {
+            sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
+#ifndef NDEBUG
+            bool ok = m->symbolTable->AddVariable(sym);
+            if (ok == false)
+                Assert(m->errorCount > 0);
 #else
-        m->symbolTable->AddVariable(sym);
+            m->symbolTable->AddVariable(sym);
 #endif
+        }
     }
 
     // The corresponding pop scope happens in function_definition rules

From f939015b978b689fb86dde69fcfeea6b864eed3e Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Mon, 6 Feb 2012 12:43:03 -0800
Subject: [PATCH 47/62] Default to int32 for declarations without specified
 types.

(e.g. "uniform foo" == "uniform int32 foo")
---
 decl.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/decl.cpp b/decl.cpp
index fd96d3c0..ec8022b7 100644
--- a/decl.cpp
+++ b/decl.cpp
@@ -113,6 +113,12 @@ DeclSpecs::DeclSpecs(const Type *t, StorageClass sc, int tq) {
 const Type *
 DeclSpecs::GetBaseType(SourcePos pos) const {
     const Type *bt = baseType;
+
+    if (bt == NULL) {
+        Warning(pos, "No type specified in declaration.  Assuming int32.");
+        bt = AtomicType::UnboundInt32;
+    }
+
     if (vectorSize > 0) {
         const AtomicType *atomicType = dynamic_cast<const AtomicType *>(bt);
         if (atomicType == NULL) {

From b7c5af7e64b3e4fa631ef14fcf5165891c8e1351 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Mon, 6 Feb 2012 13:59:02 -0800
Subject: [PATCH 48/62] Prohibit returning functions from functions.

(Fix malformed program crasher)
---
 decl.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/decl.cpp b/decl.cpp
index ec8022b7..beff32d8 100644
--- a/decl.cpp
+++ b/decl.cpp
@@ -447,6 +447,10 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
             Error(pos, "No return type provided in function declaration.");
             return NULL;
         }
+        if (dynamic_cast<const FunctionType *>(returnType) != NULL) {
+            Error(pos, "Illegal to return function type from function.");
+            return NULL;
+        }
         
         bool isExported = ds && (ds->storageClass == SC_EXPORT);
         bool isExternC =  ds && (ds->storageClass == SC_EXTERN_C);

From 3efbc71a01279a3c40f32d3c2431af3ef8791560 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Mon, 6 Feb 2012 13:59:14 -0800
Subject: [PATCH 49/62] Add fuzz testing of input programs.

When the --fuzz-test command-line option is given, the input program
will be randomly perturbed by the lexer in an effort to trigger
assertions or crashes in the compiler (neither of which should ever
happen, even for malformed programs.)
---
 ispc.cpp |   4 +-
 ispc.h   |   8 ++
 lex.ll   | 382 ++++++++++++++++++++++++++++++++++++++++---------------
 main.cpp |  31 ++++-
 sym.cpp  |  39 ++++++
 sym.h    |   7 +
 6 files changed, 366 insertions(+), 105 deletions(-)

diff --git a/ispc.cpp b/ispc.cpp
index 4293b21b..6729da92 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -515,8 +515,10 @@ Globals::Globals() {
     emitPerfWarnings = true;
     emitInstrumentation = false;
     generateDebuggingSymbols = false;
+    enableFuzzTest = false;
+    fuzzTestSeed = -1;
     mangleFunctionsWithTarget = false;
-
+    
     ctx = new llvm::LLVMContext;
 
 #ifdef ISPC_IS_WINDOWS
diff --git a/ispc.h b/ispc.h
index 856b988e..5f25ebe4 100644
--- a/ispc.h
+++ b/ispc.h
@@ -405,6 +405,14 @@ struct Globals {
         vector width to them. */
     bool mangleFunctionsWithTarget;
 
+    /** If enabled, the lexer will randomly replace some tokens returned
+        with other tokens, in order to test error condition handling in the
+        compiler. */
+    bool enableFuzzTest;
+
+    /** Seed for random number generator used for fuzz testing. */
+    int fuzzTestSeed;
+
     /** Global LLVMContext object */
     llvm::LLVMContext *ctx;
 
diff --git a/lex.ll b/lex.ll
index e5e395c3..9ba21f2a 100644
--- a/lex.ll
+++ b/lex.ll
@@ -58,6 +58,178 @@ static double lParseHexFloat(const char *ptr);
 inline int isatty(int) { return 0; }
 #endif // ISPC_IS_WINDOWS
 
+static int allTokens[] = { // pool of token codes the RT fuzz macro picks a random replacement from
+  TOKEN_ASSERT, TOKEN_BOOL, TOKEN_BREAK, TOKEN_CASE, TOKEN_CBREAK,
+  TOKEN_CCONTINUE, TOKEN_CDO, TOKEN_CFOR, TOKEN_CIF, TOKEN_CWHILE,
+  TOKEN_CONST, TOKEN_CONTINUE, TOKEN_CRETURN, TOKEN_DEFAULT, TOKEN_DO,
+  TOKEN_DELETE, TOKEN_DELETE, TOKEN_DOUBLE, TOKEN_ELSE, TOKEN_ENUM, // TOKEN_DELETE twice, like the "delete" and "delete[]" rules
+  TOKEN_EXPORT, TOKEN_EXTERN, TOKEN_FALSE, TOKEN_FLOAT, TOKEN_FOR,
+  TOKEN_FOREACH, TOKEN_FOREACH_TILED, TOKEN_GOTO, TOKEN_IF, TOKEN_INLINE,
+  TOKEN_INT, TOKEN_INT8, TOKEN_INT16, TOKEN_INT, TOKEN_INT64, TOKEN_LAUNCH, // TOKEN_INT twice, like the "int" and "int32" rules
+  TOKEN_NEW, TOKEN_NULL, TOKEN_PRINT, TOKEN_RETURN, TOKEN_SOA, TOKEN_SIGNED,
+  TOKEN_SIZEOF, TOKEN_STATIC, TOKEN_STRUCT, TOKEN_SWITCH, TOKEN_SYNC,
+  TOKEN_TASK, TOKEN_TRUE, TOKEN_TYPEDEF, TOKEN_UNIFORM, TOKEN_UNSIGNED,
+  TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE, TOKEN_STRING_C_LITERAL,
+  TOKEN_DOTDOTDOT, 
+  TOKEN_FLOAT_CONSTANT,
+  TOKEN_INT32_CONSTANT, TOKEN_UINT32_CONSTANT, 
+  TOKEN_INT64_CONSTANT, TOKEN_UINT64_CONSTANT, 
+  TOKEN_INC_OP, TOKEN_DEC_OP, TOKEN_LEFT_OP, TOKEN_RIGHT_OP, TOKEN_LE_OP,
+  TOKEN_GE_OP, TOKEN_EQ_OP, TOKEN_NE_OP, TOKEN_AND_OP, TOKEN_OR_OP,
+  TOKEN_MUL_ASSIGN, TOKEN_DIV_ASSIGN, TOKEN_MOD_ASSIGN, TOKEN_ADD_ASSIGN,
+  TOKEN_SUB_ASSIGN, TOKEN_LEFT_ASSIGN, TOKEN_RIGHT_ASSIGN, TOKEN_AND_ASSIGN,
+  TOKEN_XOR_ASSIGN, TOKEN_OR_ASSIGN, TOKEN_PTR_OP,
+  ';', '{', '}', ',', ':', '=', '(', ')', '[', ']', '.', '&', '!', '~', '-',
+  '+', '*', '/', '%', '<', '>', '^', '|', '?',
+};
+
+static std::map<int, std::string> tokenToName; // token code -> spelling used in fuzz-test warnings
+
+static void lInitTokenToName() { // fills tokenToName in the same order as allTokens; RT calls it while the map is still empty
+    tokenToName[TOKEN_ASSERT] = "assert";
+    tokenToName[TOKEN_BOOL] = "bool";
+    tokenToName[TOKEN_BREAK] = "break";
+    tokenToName[TOKEN_CASE] = "case";
+    tokenToName[TOKEN_CBREAK] = "cbreak";
+    tokenToName[TOKEN_CCONTINUE] = "ccontinue";
+    tokenToName[TOKEN_CDO] = "cdo";
+    tokenToName[TOKEN_CFOR] = "cfor";
+    tokenToName[TOKEN_CIF] = "cif";
+    tokenToName[TOKEN_CWHILE] = "cwhile";
+    tokenToName[TOKEN_CONST] = "const";
+    tokenToName[TOKEN_CONTINUE] = "continue";
+    tokenToName[TOKEN_CRETURN] = "creturn";
+    tokenToName[TOKEN_DEFAULT] = "default";
+    tokenToName[TOKEN_DO] = "do";
+    tokenToName[TOKEN_DELETE] = "delete";
+    tokenToName[TOKEN_DELETE] = "delete"; // repeat of the line above (same key and value)
+    tokenToName[TOKEN_DOUBLE] = "double";
+    tokenToName[TOKEN_ELSE] = "else";
+    tokenToName[TOKEN_ENUM] = "enum";
+    tokenToName[TOKEN_EXPORT] = "export";
+    tokenToName[TOKEN_EXTERN] = "extern";
+    tokenToName[TOKEN_FALSE] = "false";
+    tokenToName[TOKEN_FLOAT] = "float";
+    tokenToName[TOKEN_FOR] = "for";
+    tokenToName[TOKEN_FOREACH] = "foreach";
+    tokenToName[TOKEN_FOREACH_TILED] = "foreach_tiled";
+    tokenToName[TOKEN_GOTO] = "goto";
+    tokenToName[TOKEN_IF] = "if";
+    tokenToName[TOKEN_INLINE] = "inline";
+    tokenToName[TOKEN_INT] = "int";
+    tokenToName[TOKEN_INT8] = "int8";
+    tokenToName[TOKEN_INT16] = "int16";
+    tokenToName[TOKEN_INT] = "int"; // repeat: TOKEN_INT was already set above (same value)
+    tokenToName[TOKEN_INT64] = "int64";
+    tokenToName[TOKEN_LAUNCH] = "launch";
+    tokenToName[TOKEN_NEW] = "new";
+    tokenToName[TOKEN_NULL] = "NULL";
+    tokenToName[TOKEN_PRINT] = "print";
+    tokenToName[TOKEN_RETURN] = "return";
+    tokenToName[TOKEN_SOA] = "soa";
+    tokenToName[TOKEN_SIGNED] = "signed";
+    tokenToName[TOKEN_SIZEOF] = "sizeof";
+    tokenToName[TOKEN_STATIC] = "static";
+    tokenToName[TOKEN_STRUCT] = "struct";
+    tokenToName[TOKEN_SWITCH] = "switch";
+    tokenToName[TOKEN_SYNC] = "sync";
+    tokenToName[TOKEN_TASK] = "task";
+    tokenToName[TOKEN_TRUE] = "true";
+    tokenToName[TOKEN_TYPEDEF] = "typedef";
+    tokenToName[TOKEN_UNIFORM] = "uniform";
+    tokenToName[TOKEN_UNSIGNED] = "unsigned";
+    tokenToName[TOKEN_VARYING] = "varying";
+    tokenToName[TOKEN_VOID] = "void";
+    tokenToName[TOKEN_WHILE] = "while";
+    tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\"";
+    tokenToName[TOKEN_DOTDOTDOT] = "...";
+    tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT";
+    tokenToName[TOKEN_INT32_CONSTANT] = "TOKEN_INT32_CONSTANT";
+    tokenToName[TOKEN_UINT32_CONSTANT] = "TOKEN_UINT32_CONSTANT";
+    tokenToName[TOKEN_INT64_CONSTANT] = "TOKEN_INT64_CONSTANT";
+    tokenToName[TOKEN_UINT64_CONSTANT] = "TOKEN_UINT64_CONSTANT";
+    tokenToName[TOKEN_INC_OP] = "++";
+    tokenToName[TOKEN_DEC_OP] = "--";
+    tokenToName[TOKEN_LEFT_OP] = "<<";
+    tokenToName[TOKEN_RIGHT_OP] = ">>";
+    tokenToName[TOKEN_LE_OP] = "<=";
+    tokenToName[TOKEN_GE_OP] = ">=";
+    tokenToName[TOKEN_EQ_OP] = "==";
+    tokenToName[TOKEN_NE_OP] = "!=";
+    tokenToName[TOKEN_AND_OP] = "&&";
+    tokenToName[TOKEN_OR_OP] = "||";
+    tokenToName[TOKEN_MUL_ASSIGN] = "*=";
+    tokenToName[TOKEN_DIV_ASSIGN] = "/=";
+    tokenToName[TOKEN_MOD_ASSIGN] = "%=";
+    tokenToName[TOKEN_ADD_ASSIGN] = "+=";
+    tokenToName[TOKEN_SUB_ASSIGN] = "-=";
+    tokenToName[TOKEN_LEFT_ASSIGN] = "<<=";
+    tokenToName[TOKEN_RIGHT_ASSIGN] = ">>=";
+    tokenToName[TOKEN_AND_ASSIGN] = "&=";
+    tokenToName[TOKEN_XOR_ASSIGN] = "^=";
+    tokenToName[TOKEN_OR_ASSIGN] = "|=";
+    tokenToName[TOKEN_PTR_OP] = "->";
+    tokenToName[';'] = ";";
+    tokenToName['{'] = "{";
+    tokenToName['}'] = "}";
+    tokenToName[','] = ",";
+    tokenToName[':'] = ":";
+    tokenToName['='] = "=";
+    tokenToName['('] = "(";
+    tokenToName[')'] = ")";
+    tokenToName['['] = "[";
+    tokenToName[']'] = "]";
+    tokenToName['.'] = ".";
+    tokenToName['&'] = "&";
+    tokenToName['!'] = "!";
+    tokenToName['~'] = "~";
+    tokenToName['-'] = "-";
+    tokenToName['+'] = "+";
+    tokenToName['*'] = "*";
+    tokenToName['/'] = "/";
+    tokenToName['%'] = "%";
+    tokenToName['<'] = "<";
+    tokenToName['>'] = ">";
+    tokenToName['^'] = "^";
+    tokenToName['|'] = "|";
+    tokenToName['?'] = "?";
+    tokenToName[';'] = ";"; // repeat: ';' was already set at the start of the punctuation entries
+}
+
+
+inline int ispcRand() { /* random int for fuzz testing; main() seeds the matching generator */
+#ifndef ISPC_IS_WINDOWS
+    return lrand48();
+#else
+    return rand();
+#endif
+}
+
+#define RT /* fuzz-test hook; use as "RT;" at the top level of a rule action */ \
+    if (g->enableFuzzTest) { \
+        int r = ispcRand() % 40; /* r in 0..2 (3 of 40 draws) perturbs the token */ \
+        if (r == 0) { \
+            Warning(*yylloc, "Dropping token"); \
+            break; /* exit flex's action case so the rule's own return is skipped */ \
+        } \
+        else if (r == 1) { \
+            if (tokenToName.size() == 0) lInitTokenToName(); \
+            int nt = sizeof(allTokens) / sizeof(allTokens[0]); \
+            int tn = ispcRand() % nt; \
+            yylval->stringVal = new std::string(yytext); /* just in case */\
+            Warning(*yylloc, "Replaced with \"%s\"", tokenToName[allTokens[tn]].c_str()); \
+            return allTokens[tn]; \
+        } \
+        else if (r == 2) { \
+            Symbol *sym = m->symbolTable->RandomSymbol(); \
+            if (sym != NULL) { \
+                yylval->stringVal = new std::string(sym->name); \
+                Warning(*yylloc, "Replaced with identifier \"%s\".", sym->name.c_str()); \
+                return TOKEN_IDENTIFIER; \
+            } /* no symbol available: fall through and return the real token */ \
+        } /*  TOKEN_TYPE_NAME */ \
+     } else /* swallow semicolon */
+
 %}
 
 %option nounput
@@ -78,70 +250,71 @@ ZO_SWIZZLE ([01]+[w-z]+)+|([01]+[rgba]+)+|([01]+[uv]+)+
 "/*"            { lCComment(yylloc); }
 "//"            { lCppComment(yylloc); }
 
-__assert { return TOKEN_ASSERT; }
-bool { return TOKEN_BOOL; }
-break { return TOKEN_BREAK; }
-case { return TOKEN_CASE; }
-cbreak { return TOKEN_CBREAK; }
-ccontinue { return TOKEN_CCONTINUE; }
-cdo { return TOKEN_CDO; }
-cfor { return TOKEN_CFOR; }
-cif { return TOKEN_CIF; }
-cwhile { return TOKEN_CWHILE; }
-const { return TOKEN_CONST; }
-continue { return TOKEN_CONTINUE; }
-creturn { return TOKEN_CRETURN; }
-default { return TOKEN_DEFAULT; }
-do { return TOKEN_DO; }
-delete { return TOKEN_DELETE; }
-delete\[\] { return TOKEN_DELETE; }
-double { return TOKEN_DOUBLE; }
-else { return TOKEN_ELSE; }
-enum { return TOKEN_ENUM; }
-export { return TOKEN_EXPORT; }
-extern { return TOKEN_EXTERN; }
-false { return TOKEN_FALSE; }
-float { return TOKEN_FLOAT; }
-for { return TOKEN_FOR; }
-foreach { return TOKEN_FOREACH; }
-foreach_tiled { return TOKEN_FOREACH_TILED; }
-goto { return TOKEN_GOTO; }
-if { return TOKEN_IF; }
-inline { return TOKEN_INLINE; }
-int { return TOKEN_INT; }
-int8 { return TOKEN_INT8; }
-int16 { return TOKEN_INT16; }
-int32 { return TOKEN_INT; }
-int64 { return TOKEN_INT64; }
-launch { return TOKEN_LAUNCH; }
-new { return TOKEN_NEW; }
-NULL { return TOKEN_NULL; }
-print { return TOKEN_PRINT; }
+__assert { RT; return TOKEN_ASSERT; }
+bool { RT; return TOKEN_BOOL; }
+break { RT; return TOKEN_BREAK; }
+case { RT; return TOKEN_CASE; }
+cbreak { RT; return TOKEN_CBREAK; }
+ccontinue { RT; return TOKEN_CCONTINUE; }
+cdo { RT; return TOKEN_CDO; }
+cfor { RT; return TOKEN_CFOR; }
+cif { RT; return TOKEN_CIF; }
+cwhile { RT; return TOKEN_CWHILE; }
+const { RT; return TOKEN_CONST; }
+continue { RT; return TOKEN_CONTINUE; }
+creturn { RT; return TOKEN_CRETURN; }
+default { RT; return TOKEN_DEFAULT; }
+do { RT; return TOKEN_DO; }
+delete { RT; return TOKEN_DELETE; }
+delete\[\] { RT; return TOKEN_DELETE; }
+double { RT; return TOKEN_DOUBLE; }
+else { RT; return TOKEN_ELSE; }
+enum { RT; return TOKEN_ENUM; }
+export { RT; return TOKEN_EXPORT; }
+extern { RT; return TOKEN_EXTERN; }
+false { RT; return TOKEN_FALSE; }
+float { RT; return TOKEN_FLOAT; }
+for { RT; return TOKEN_FOR; }
+foreach { RT; return TOKEN_FOREACH; }
+foreach_tiled { RT; return TOKEN_FOREACH_TILED; }
+goto { RT; return TOKEN_GOTO; }
+if { RT; return TOKEN_IF; }
+inline { RT; return TOKEN_INLINE; }
+int { RT; return TOKEN_INT; }
+int8 { RT; return TOKEN_INT8; }
+int16 { RT; return TOKEN_INT16; }
+int32 { RT; return TOKEN_INT; }
+int64 { RT; return TOKEN_INT64; }
+launch { RT; return TOKEN_LAUNCH; }
+new { RT; return TOKEN_NEW; }
+NULL { RT; return TOKEN_NULL; }
+print { RT; return TOKEN_PRINT; }
 reference { Error(*yylloc, "\"reference\" qualifier is no longer supported; "
                            "please use C++-style '&' syntax for references "
                            "instead."); }
-return { return TOKEN_RETURN; }
-soa { return TOKEN_SOA; }
-signed { return TOKEN_SIGNED; }
-sizeof { return TOKEN_SIZEOF; }
-static { return TOKEN_STATIC; }
-struct { return TOKEN_STRUCT; }
-switch { return TOKEN_SWITCH; }
-sync { return TOKEN_SYNC; }
-task { return TOKEN_TASK; }
-true { return TOKEN_TRUE; }
-typedef { return TOKEN_TYPEDEF; }
-uniform { return TOKEN_UNIFORM; }
-unsigned { return TOKEN_UNSIGNED; }
-varying { return TOKEN_VARYING; }
-void { return TOKEN_VOID; }
-while { return TOKEN_WHILE; }
-\"C\" { return TOKEN_STRING_C_LITERAL; }
-\.\.\. { return TOKEN_DOTDOTDOT; }
+return { RT; return TOKEN_RETURN; }
+soa { RT; return TOKEN_SOA; }
+signed { RT; return TOKEN_SIGNED; }
+sizeof { RT; return TOKEN_SIZEOF; }
+static { RT; return TOKEN_STATIC; }
+struct { RT; return TOKEN_STRUCT; }
+switch { RT; return TOKEN_SWITCH; }
+sync { RT; return TOKEN_SYNC; }
+task { RT; return TOKEN_TASK; }
+true { RT; return TOKEN_TRUE; }
+typedef { RT; return TOKEN_TYPEDEF; }
+uniform { RT; return TOKEN_UNIFORM; }
+unsigned { RT; return TOKEN_UNSIGNED; }
+varying { RT; return TOKEN_VARYING; }
+void { RT; return TOKEN_VOID; }
+while { RT; return TOKEN_WHILE; }
+\"C\" { RT; return TOKEN_STRING_C_LITERAL; }
+\.\.\. { RT; return TOKEN_DOTDOTDOT; }
 
 L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL; }
 
 {IDENT} { 
+    RT;
     /* We have an identifier--is it a type name or an identifier?
        The symbol table will straighten us out... */
     yylval->stringVal = new std::string(yytext);
@@ -152,6 +325,7 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
 }
 
 {INT_NUMBER}+(u|U|l|L)*? { 
+    RT;
     int ls = 0, us = 0;
 
     char *endPtr = NULL;
@@ -201,60 +375,62 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
 
 
 {FLOAT_NUMBER} { 
-    yylval->floatVal = (float)atof(yytext); 
+    RT;
+    yylval->floatVal = (float)atof(yytext);
     return TOKEN_FLOAT_CONSTANT; 
 }
 
 {HEX_FLOAT_NUMBER} {
+    RT;
     yylval->floatVal = (float)lParseHexFloat(yytext); 
     return TOKEN_FLOAT_CONSTANT; 
 }
 
-"++" { return TOKEN_INC_OP; }
-"--" { return TOKEN_DEC_OP; }
-"<<" { return TOKEN_LEFT_OP; }
-">>" { return TOKEN_RIGHT_OP; }
-"<=" { return TOKEN_LE_OP; }
-">=" { return TOKEN_GE_OP; }
-"==" { return TOKEN_EQ_OP; }
-"!=" { return TOKEN_NE_OP; }
-"&&" { return TOKEN_AND_OP; }
-"||" { return TOKEN_OR_OP; }
-"*=" { return TOKEN_MUL_ASSIGN; }
-"/=" { return TOKEN_DIV_ASSIGN; }
-"%=" { return TOKEN_MOD_ASSIGN; }
-"+=" { return TOKEN_ADD_ASSIGN; }
-"-=" { return TOKEN_SUB_ASSIGN; }
-"<<=" { return TOKEN_LEFT_ASSIGN; }
-">>=" { return TOKEN_RIGHT_ASSIGN; }
-"&=" { return TOKEN_AND_ASSIGN; }
-"^=" { return TOKEN_XOR_ASSIGN; }
-"|=" { return TOKEN_OR_ASSIGN; }
-"->" { return TOKEN_PTR_OP; }
-";"             { return ';'; }
-("{"|"<%")      { return '{'; }
-("}"|"%>")      { return '}'; }
-","             { return ','; }
-":"             { return ':'; }
-"="             { return '='; }
-"("             { return '('; }
-")"             { return ')'; }
-("["|"<:")      { return '['; }
-("]"|":>")      { return ']'; }
-"."             { return '.'; }
-"&"             { return '&'; }
-"!"             { return '!'; }
-"~"             { return '~'; }
-"-"             { return '-'; }
-"+"             { return '+'; }
-"*"             { return '*'; }
-"/"             { return '/'; }
-"%"             { return '%'; }
-"<"             { return '<'; }
-">"             { return '>'; }
-"^"             { return '^'; }
-"|"             { return '|'; }
-"?"             { return '?'; }
+"++" { RT; return TOKEN_INC_OP; }
+"--" { RT; return TOKEN_DEC_OP; }
+"<<" { RT; return TOKEN_LEFT_OP; }
+">>" { RT; return TOKEN_RIGHT_OP; }
+"<=" { RT; return TOKEN_LE_OP; }
+">=" { RT; return TOKEN_GE_OP; }
+"==" { RT; return TOKEN_EQ_OP; }
+"!=" { RT; return TOKEN_NE_OP; }
+"&&" { RT; return TOKEN_AND_OP; }
+"||" { RT; return TOKEN_OR_OP; }
+"*=" { RT; return TOKEN_MUL_ASSIGN; }
+"/=" { RT; return TOKEN_DIV_ASSIGN; }
+"%=" { RT; return TOKEN_MOD_ASSIGN; }
+"+=" { RT; return TOKEN_ADD_ASSIGN; }
+"-=" { RT; return TOKEN_SUB_ASSIGN; }
+"<<=" { RT; return TOKEN_LEFT_ASSIGN; }
+">>=" { RT; return TOKEN_RIGHT_ASSIGN; }
+"&=" { RT; return TOKEN_AND_ASSIGN; }
+"^=" { RT; return TOKEN_XOR_ASSIGN; }
+"|=" { RT; return TOKEN_OR_ASSIGN; }
+"->" { RT; return TOKEN_PTR_OP; }
+";"             { RT; return ';'; }
+("{"|"<%")      { RT; return '{'; }
+("}"|"%>")      { RT; return '}'; }
+","             { RT; return ','; }
+":"             { RT; return ':'; }
+"="             { RT; return '='; }
+"("             { RT; return '('; }
+")"             { RT; return ')'; }
+("["|"<:")      { RT; return '['; }
+("]"|":>")      { RT; return ']'; }
+"."             { RT; return '.'; }
+"&"             { RT; return '&'; }
+"!"             { RT; return '!'; }
+"~"             { RT; return '~'; }
+"-"             { RT; return '-'; }
+"+"             { RT; return '+'; }
+"*"             { RT; return '*'; }
+"/"             { RT; return '/'; }
+"%"             { RT; return '%'; }
+"<"             { RT; return '<'; }
+">"             { RT; return '>'; }
+"^"             { RT; return '^'; }
+"|"             { RT; return '|'; }
+"?"             { RT; return '?'; }
 
 {WHITESPACE} { }
 
diff --git a/main.cpp b/main.cpp
index 4c05d044..45f76658 100644
--- a/main.cpp
+++ b/main.cpp
@@ -41,6 +41,9 @@
 #include "type.h"
 #include <stdio.h>
 #include <stdlib.h>
+#ifdef ISPC_IS_WINDOWS
+  #include <time.h>
+#endif // ISPC_IS_WINDOWS
 #include <llvm/Support/PrettyStackTrace.h>
 #include <llvm/Support/Signals.h>
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
@@ -97,6 +100,10 @@ usage(int ret) {
 #endif // !LLVM_2_9
     printf("    [--emit-llvm]\t\t\tEmit LLVM bitode file as output\n");
     printf("    [--emit-obj]\t\t\tGenerate object file file as output (default)\n");
+#if 0
+    printf("    [--fuzz-test]\t\t\tRandomly perturb program input to test error conditions\n");
+    printf("    [--fuzz-seed=<value>]\t\tSeed value for RNG for fuzz testing\n");
+#endif
     printf("    [-g]\t\t\t\tGenerate debugging information\n");
     printf("    [--help]\t\t\t\tPrint help\n");
     printf("    [-h <name>/--header-outfile=<name>]\tOutput filename for header\n");
@@ -144,7 +151,7 @@ usage(int ret) {
 /** We take arguments from both the command line as well as from the
     ISPC_ARGS environment variable.  This function returns a new set of
     arguments representing the ones from those two sources merged together.
- */ 
+*/ 
 static void lGetAllArgs(int Argc, char *Argv[], int &argc, char *argv[128]) {
     // Copy over the command line arguments (passed in)
     for (int i = 0; i < Argc; ++i)
@@ -272,6 +279,10 @@ int main(int Argc, char *Argv[]) {
             ot = Module::Bitcode;
         else if (!strcmp(argv[i], "--emit-obj"))
             ot = Module::Object;
+        else if (!strcmp(argv[i], "--fuzz-test"))
+            g->enableFuzzTest = true;
+        else if (!strncmp(argv[i], "--fuzz-seed=", 12))
+            g->fuzzTestSeed = atoi(argv[i] + 12);
         else if (!strcmp(argv[i], "--target")) {
             // FIXME: should remove this way of specifying the target...
             if (++i == argc) {
@@ -411,6 +422,24 @@ int main(int Argc, char *Argv[]) {
     if (debugSet && !optSet)
         g->opt.level = 0;
 
+    if (g->enableFuzzTest) {
+        if (g->fuzzTestSeed == -1) {
+#ifdef ISPC_IS_WINDOWS
+            int seed = (unsigned)time(NULL);
+#else
+            int seed = getpid();
+#endif
+            g->fuzzTestSeed = seed;
+            Warning(SourcePos(), "Using seed %d for fuzz testing", 
+                    g->fuzzTestSeed);
+        }
+#ifdef ISPC_IS_WINDOWS
+        srand(g->fuzzTestSeed);
+#else
+        srand48(g->fuzzTestSeed);
+#endif
+    }
+
     if (outFileName == NULL && headerFileName == NULL)
         Warning(SourcePos(), "No output file or header file name specified. "
                 "Program will be compiled and warnings/errors will "
diff --git a/sym.cpp b/sym.cpp
index 0647a5b4..f60dc1aa 100644
--- a/sym.cpp
+++ b/sym.cpp
@@ -354,3 +354,42 @@ SymbolTable::Print() {
         depth += 4;
     }
 }
+
+
+inline int ispcRand() { /* same platform-selected random source as the lexer's copy in lex.ll */
+#ifndef ISPC_IS_WINDOWS
+    return lrand48();
+#else
+    return rand();
+#endif
+}
+
+
+Symbol *
+SymbolTable::RandomSymbol() {
+    // NULL if there are no scopes ("% 0" is undefined) or the drawn scope is empty.
+    if (variables.size() == 0)
+        return NULL;
+    int v = ispcRand() % variables.size();
+    if (variables[v]->size() == 0)
+        return NULL;
+    SymbolMapType::iterator iter = variables[v]->begin();
+    for (int skip = ispcRand() % variables[v]->size(); skip > 0; --skip)
+        ++iter; // skip < size(), so iter never reaches end()
+    return iter->second;
+}
+
+
+const Type *
+SymbolTable::RandomType() {
+    // NULL if there are no scopes ("% 0" is undefined) or the drawn scope is empty.
+    if (types.size() == 0)
+        return NULL;
+    int v = ispcRand() % types.size();
+    if (types[v]->size() == 0)
+        return NULL;
+    TypeMapType::iterator iter = types[v]->begin();
+    for (int skip = ispcRand() % types[v]->size(); skip > 0; --skip)
+        ++iter; // skip < size(), so iter never reaches end()
+    return iter->second;
+}
diff --git a/sym.h b/sym.h
index aff0553c..fa452326 100644
--- a/sym.h
+++ b/sym.h
@@ -244,6 +244,13 @@ public:
         (Debugging method). */
     void Print();
 
+    /** Returns a random symbol from the symbol table. (It is not
+        guaranteed that it is equally likely to return all symbols). */
+    Symbol *RandomSymbol();
+
+    /** Returns a random type from the symbol table. */
+    const Type *RandomType();
+
 private:
     std::vector<std::string> closestTypeMatch(const char *str, 
                                               bool structsVsEnums) const;

From a2b5ce01726514f6e57232f2789d8c10a5eec3bb Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Mon, 6 Feb 2012 14:03:26 -0800
Subject: [PATCH 50/62] Add --help-dev option, only print developer options
 when it is used.

---
 main.cpp | 39 +++++++++++++++++++++++----------------
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/main.cpp b/main.cpp
index 45f76658..c2ffe49d 100644
--- a/main.cpp
+++ b/main.cpp
@@ -93,19 +93,15 @@ usage(int ret) {
     printf("    [--cpu=<cpu>]\t\t\tSelect target CPU type\n");
     printf("         <cpu>={%s}\n", Target::SupportedTargetCPUs());
     printf("    [-D<foo>]\t\t\t\t#define given value when running preprocessor\n");
-    printf("    [--debug]\t\t\t\tPrint information useful for debugging ispc\n");
     printf("    [--emit-asm]\t\t\tGenerate assembly language file as output\n");
 #ifndef LLVM_2_9
     printf("    [--emit-c++]\t\t\tEmit a C++ source file as output\n");
 #endif // !LLVM_2_9
     printf("    [--emit-llvm]\t\t\tEmit LLVM bitode file as output\n");
     printf("    [--emit-obj]\t\t\tGenerate object file file as output (default)\n");
-#if 0
-    printf("    [--fuzz-test]\t\t\tRandomly perturb program input to test error conditions\n");
-    printf("    [--fuzz-seed=<value>]\t\tSeed value for RNG for fuzz testing\n");
-#endif
     printf("    [-g]\t\t\t\tGenerate debugging information\n");
     printf("    [--help]\t\t\t\tPrint help\n");
+    printf("    [--help-dev]\t\t\tPrint help for developer options\n");
     printf("    [-h <name>/--header-outfile=<name>]\tOutput filename for header\n");
     printf("    [--instrument]\t\t\tEmit instrumentation to gather performance data\n");
     printf("    [--math-lib=<option>]\t\tSelect math library\n");
@@ -122,17 +118,6 @@ usage(int ret) {
     printf("        disable-loop-unroll\t\tDisable loop unrolling.\n");
     printf("        fast-masked-vload\t\tFaster masked vector loads on SSE (may go past end of array)\n");
     printf("        fast-math\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n");
-#if 0
-    printf("        disable-all-on-optimizations\n");
-    printf("        disable-blended-masked-stores\t\tScalarize masked stores on SSE (vs. using vblendps)\n");
-    printf("        disable-blending-removal\t\tDisable eliminating blend at same scope\n");
-    printf("        disable-coherent-control-flow\t\tDisable coherent control flow optimizations\n");
-    printf("        disable-gather-scatter-flattening\tDisable flattening when all lanes are on\n");
-    printf("        disable-gather-scatter-optimizations\tDisable improvements to gather/scatter\n");
-    printf("        disable-handle-pseudo-memory-ops\n");
-    printf("        disable-uniform-control-flow\t\tDisable uniform control flow optimizations\n");
-    printf("        disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n");
-#endif
 #ifndef ISPC_IS_WINDOWS
     printf("    [--pic]\t\t\t\tGenerate position-independent code\n");
 #endif // !ISPC_IS_WINDOWS
@@ -147,6 +132,26 @@ usage(int ret) {
 }
 
 
+static void
+devUsage(int ret) { // prints the version and the developer-only options, then calls exit(ret)
+    lPrintVersion();
+    printf("\nusage (developer options): ispc\n");
+    printf("    [--debug]\t\t\t\tPrint information useful for debugging ispc\n");
+    printf("    [--fuzz-test]\t\t\tRandomly perturb program input to test error conditions\n");
+    printf("    [--fuzz-seed=<value>]\t\tSeed value for RNG for fuzz testing\n");
+    printf("    [--opt=<option>]\t\t\tSet optimization option\n");
+    printf("        disable-all-on-optimizations\n");
+    printf("        disable-blended-masked-stores\t\tScalarize masked stores on SSE (vs. using vblendps)\n");
+    printf("        disable-blending-removal\t\tDisable eliminating blend at same scope\n");
+    printf("        disable-coherent-control-flow\t\tDisable coherent control flow optimizations\n");
+    printf("        disable-gather-scatter-flattening\tDisable flattening when all lanes are on\n");
+    printf("        disable-gather-scatter-optimizations\tDisable improvements to gather/scatter\n");
+    printf("        disable-handle-pseudo-memory-ops\n");
+    printf("        disable-uniform-control-flow\t\tDisable uniform control flow optimizations\n");
+    printf("        disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n");
+    exit(ret);
+}
+
 
 /** We take arguments from both the command line as well as from the
     ISPC_ARGS environment variable.  This function returns a new set of
@@ -235,6 +240,8 @@ int main(int Argc, char *Argv[]) {
     for (int i = 1; i < argc; ++i) {
         if (!strcmp(argv[i], "--help"))
             usage(0);
+        if (!strcmp(argv[i], "--help-dev"))
+            devUsage(0);
         else if (!strncmp(argv[i], "-D", 2))
             g->cppArgs.push_back(argv[i]);
         else if (!strncmp(argv[i], "--addressing=", 13)) {

From ee91fa122883b2b3e3d50c7fb6badb07056c9674 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Mon, 6 Feb 2012 14:19:27 -0800
Subject: [PATCH 51/62] Make sure the program doesn't have a dereference of a
 non-pointer type.

---
 expr.cpp                  | 11 ++++++++++-
 tests_errors/deref-4.ispc |  6 ++++++
 2 files changed, 16 insertions(+), 1 deletion(-)
 create mode 100644 tests_errors/deref-4.ispc

diff --git a/expr.cpp b/expr.cpp
index eb0567f5..4ee0b52e 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -6305,8 +6305,17 @@ DereferenceExpr::GetType() const {
 
 Expr *
 DereferenceExpr::TypeCheck() {
-    if (expr == NULL)
+    if (expr == NULL) {
+        Assert(m->errorCount > 0);
         return NULL;
+    }
+        
+    if (dynamic_cast<const PointerType *>(expr->GetType()) == NULL) {
+        Error(pos, "Illegal to dereference non-pointer type \"%s\".",
+              expr->GetType()->GetString().c_str());
+        return NULL;
+    }
+
     return this;
 }
 
diff --git a/tests_errors/deref-4.ispc b/tests_errors/deref-4.ispc
new file mode 100644
index 00000000..0c45083b
--- /dev/null
+++ b/tests_errors/deref-4.ispc
@@ -0,0 +1,6 @@
+// Illegal to dereference non-pointer type "float"
+
+float func(float a) {
+    *a = 0;
+    return 0;
+}

From a59fd7eeb33596cf964a62ba5357099681114ccd Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Mon, 6 Feb 2012 14:19:37 -0800
Subject: [PATCH 52/62] Fix a missing return value in the parser.

---
 parse.yy | 1 +
 1 file changed, 1 insertion(+)

diff --git a/parse.yy b/parse.yy
index 2688557e..455d2b67 100644
--- a/parse.yy
+++ b/parse.yy
@@ -1230,6 +1230,7 @@ direct_abstract_declarator
       {
           Declarator *d = new Declarator(DK_FUNCTION, Union(@1, @3));
           if ($2 != NULL) d->functionParams = *$2;
+          $$ = d;
       }
     | direct_abstract_declarator '(' ')'
       {

From 420d373d89c438154d447a9bc1b87e5e2d256129 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Mon, 6 Feb 2012 14:28:07 -0800
Subject: [PATCH 53/62] Move assert so that an error is issued for "break"
 outside of loops.

---
 ctx.cpp                 | 3 ++-
 tests_errors/break.ispc | 5 +++++
 2 files changed, 7 insertions(+), 1 deletion(-)
 create mode 100644 tests_errors/break.ispc

diff --git a/ctx.cpp b/ctx.cpp
index e9fd7203..41178a5b 100644
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -642,12 +642,12 @@ FunctionEmitContext::inSwitchStatement() const {
 
 void
 FunctionEmitContext::Break(bool doCoherenceCheck) {
-    Assert(controlFlowInfo.size() > 0);
     if (breakTarget == NULL) {
         Error(currentPos, "\"break\" statement is illegal outside of "
               "for/while/do loops and \"switch\" statements.");
         return;
     }
+    Assert(controlFlowInfo.size() > 0);
 
     if (bblock == NULL)
         return;
@@ -721,6 +721,7 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {
               "for/while/do/foreach loops.");
         return;
     }
+    Assert(controlFlowInfo.size() > 0);
 
     if (ifsInCFAllUniform(CFInfo::Loop) || GetInternalMask() == LLVMMaskAllOn) {
         // Similarly to 'break' statements, we can immediately jump to the
diff --git a/tests_errors/break.ispc b/tests_errors/break.ispc
new file mode 100644
index 00000000..35441569
--- /dev/null
+++ b/tests_errors/break.ispc
@@ -0,0 +1,5 @@
+// "break" statement is illegal outside of
+
+void foo() {
+    break;
+}

From 8e2b0632e89343b90e67d6613615e8e43b0f9e93 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Mon, 6 Feb 2012 14:28:38 -0800
Subject: [PATCH 54/62] Issue an error if an array of references is declared.

(More malformed program fixes.)
---
 decl.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/decl.cpp b/decl.cpp
index beff32d8..f5c0eb88 100644
--- a/decl.cpp
+++ b/decl.cpp
@@ -355,6 +355,11 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
             Error(pos, "Arrays of \"void\" type are illegal.");
             return NULL;
         }
+        if (dynamic_cast<const ReferenceType *>(type)) {
+            Error(pos, "Arrays of references (type \"%s\") are illegal.",
+                  type->GetString().c_str());
+            return NULL;
+        }
 
         type = new ArrayType(type, arraySize);
         if (child)
@@ -472,6 +477,11 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
             return NULL;
         }
 
+        if (child == NULL) {
+            Assert(m->errorCount > 0);
+            return NULL;
+        }
+
         const Type *functionType = 
             new FunctionType(returnType, args, argNames, argDefaults,
                              argPos, isTask, isExported, isExternC);

From 098ceb55671ac47a600de9face7f7ad20b96fb88 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Mon, 6 Feb 2012 15:17:25 -0800
Subject: [PATCH 55/62] Issue error on attempted type convert from/to function
 type.

---
 expr.cpp | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/expr.cpp b/expr.cpp
index 4ee0b52e..ac569693 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -211,6 +211,21 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr,
         return false;
     }
 
+    if (dynamic_cast<const FunctionType *>(fromType)) {
+        if (!failureOk)
+            Error(pos, "Can't convert function type \"%s\" to \"%s\" for %s.",
+                  fromType->GetString().c_str(),
+                  toType->GetString().c_str(), errorMsgBase);
+        return false;
+    }
+    if (dynamic_cast<const FunctionType *>(toType)) {
+        if (!failureOk)
+            Error(pos, "Can't convert from type \"%s\" to function type \"%s\" "
+                  "for %s.", fromType->GetString().c_str(),
+                  toType->GetString().c_str(), errorMsgBase);
+        return false;
+    }
+
     const ArrayType *toArrayType = dynamic_cast<const ArrayType *>(toType);
     const ArrayType *fromArrayType = dynamic_cast<const ArrayType *>(fromType);
     const VectorType *toVectorType = dynamic_cast<const VectorType *>(toType);

From 6b3e14b0a49b60f710b28660ef98ee7a3659a231 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Mon, 6 Feb 2012 15:33:57 -0800
Subject: [PATCH 56/62] Add command-line option to enable debugging output from
 parser.

---
 main.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/main.cpp b/main.cpp
index c2ffe49d..6168f1c2 100644
--- a/main.cpp
+++ b/main.cpp
@@ -149,6 +149,7 @@ devUsage(int ret) {
     printf("        disable-handle-pseudo-memory-ops\n");
     printf("        disable-uniform-control-flow\t\tDisable uniform control flow optimizations\n");
     printf("        disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n");
+    printf("    [--yydebug]\t\t\tPrint debugging information during parsing\n");
     exit(ret);
 }
 
@@ -404,6 +405,10 @@ int main(int Argc, char *Argv[]) {
 #endif // !ISPC_IS_WINDOWS
         else if (!strcmp(argv[i], "--quiet"))
             g->quiet = true;
+        else if (!strcmp(argv[i], "--yydebug")) {
+            extern int yydebug;
+            yydebug = 1;
+        }
         else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version")) {
             lPrintVersion();
             return 0;

From 33ea934c8fe501928572ed752ef520822060fa52 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Tue, 7 Feb 2012 08:18:33 -0800
Subject: [PATCH 57/62] Fix over-aggressive check in
 DereferenceExpr::TypeCheck()

(Reference types are allowed as well.)
---
 expr.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/expr.cpp b/expr.cpp
index ac569693..ec242fea 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -6325,9 +6325,10 @@ DereferenceExpr::TypeCheck() {
         return NULL;
     }
         
-    if (dynamic_cast<const PointerType *>(expr->GetType()) == NULL) {
-        Error(pos, "Illegal to dereference non-pointer type \"%s\".",
-              expr->GetType()->GetString().c_str());
+    if (dynamic_cast<const PointerType *>(expr->GetType()) == NULL &&
+        dynamic_cast<const ReferenceType *>(expr->GetType()) == NULL) {
+        Error(pos, "Illegal to dereference non-pointer or reference "
+              "type \"%s\".", expr->GetType()->GetString().c_str());
         return NULL;
     }
 

From 5b9de8cc07a6b7517a4ecfa38b1ffd44353212fb Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Tue, 7 Feb 2012 08:36:56 -0800
Subject: [PATCH 58/62] Fix test to account for updated error message.

---
 tests_errors/deref-4.ispc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests_errors/deref-4.ispc b/tests_errors/deref-4.ispc
index 0c45083b..33877f72 100644
--- a/tests_errors/deref-4.ispc
+++ b/tests_errors/deref-4.ispc
@@ -1,4 +1,4 @@
-// Illegal to dereference non-pointer type "float"
+// Illegal to dereference non-pointer or reference type "float"
 
 float func(float a) {
     *a = 0;

From 5b4673e8eb6c3ec45b116f92959d0ed6a83d8532 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Tue, 7 Feb 2012 08:37:13 -0800
Subject: [PATCH 59/62] Fix build with LLVM 2.9.

---
 builtins/util.m4 |  2 +-
 opt.cpp          | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/builtins/util.m4 b/builtins/util.m4
index 563ee3e9..7c022e94 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -1867,7 +1867,7 @@ define void @__delete_varying(<WIDTH x i64> %ptr, <WIDTH x MASK> %mask) {
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; read hw clock
 
-define i64 @__clock() nounwind uwtable ssp {
+define i64 @__clock() nounwind {
 entry:
   tail call void asm sideeffect "xorl %eax,%eax \0A    cpuid", "~{rax},~{rbx},~{rcx},~{rdx},~{dirflag},~{fpsr},~{flags}"() nounwind
   %0 = tail call { i32, i32 } asm sideeffect "rdtsc", "={ax},={dx},~{dirflag},~{fpsr},~{flags}"() nounwind
diff --git a/opt.cpp b/opt.cpp
index 57443040..0cfb360c 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -184,7 +184,7 @@ lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
     llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[2]);
     return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
 #else
-    return llvm::CallInst::Create(func, &newArgs[0], &newArgs[2],
+    return llvm::CallInst::Create(func, &args[0], &args[2],
                                   name, insertBefore);
 #endif
 }
@@ -199,7 +199,7 @@ lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
     llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[3]);
     return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
 #else
-    return llvm::CallInst::Create(func, &newArgs[0], &newArgs[3],
+    return llvm::CallInst::Create(func, &args[0], &args[3],
                                   name, insertBefore);
 #endif
 }
@@ -215,7 +215,7 @@ lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
     llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[4]);
     return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
 #else
-    return llvm::CallInst::Create(func, &newArgs[0], &newArgs[4],
+    return llvm::CallInst::Create(func, &args[0], &args[4],
                                   name, insertBefore);
 #endif
 }
@@ -230,7 +230,7 @@ lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
     llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[5]);
     return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
 #else
-    return llvm::CallInst::Create(func, &newArgs[0], &newArgs[5],
+    return llvm::CallInst::Create(func, &args[0], &args[5],
                                   name, insertBefore);
 #endif
 }
@@ -245,7 +245,7 @@ lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1,
     llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[6]);
     return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
 #else
-    return llvm::CallInst::Create(func, &newArgs[0], &newArgs[6],
+    return llvm::CallInst::Create(func, &args[0], &args[6],
                                   name, insertBefore);
 #endif
 }

From bb8e13e3c9aaf3257d324f8c108e6f49301681e4 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Tue, 7 Feb 2012 08:39:01 -0800
Subject: [PATCH 60/62] Add support for -I command-line argument to specify
 #include search directories.

---
 ispc.h     |  4 ++++
 main.cpp   | 10 ++++++++++
 module.cpp | 14 ++++++++++++++
 3 files changed, 28 insertions(+)

diff --git a/ispc.h b/ispc.h
index 5f25ebe4..59c9140f 100644
--- a/ispc.h
+++ b/ispc.h
@@ -423,6 +423,10 @@ struct Globals {
     /** Arguments to pass along to the C pre-processor, if it is run on the
         program before compilation. */
     std::vector<std::string> cppArgs;
+
+    /** Additional user-provided directories to search when processing
+        #include directives in the preprocessor. */
+    std::vector<std::string> includePath;
 };
 
 enum {
diff --git a/main.cpp b/main.cpp
index 6168f1c2..7b8c66d5 100644
--- a/main.cpp
+++ b/main.cpp
@@ -103,6 +103,7 @@ usage(int ret) {
     printf("    [--help]\t\t\t\tPrint help\n");
     printf("    [--help-dev]\t\t\tPrint help for developer options\n");
     printf("    [-h <name>/--header-outfile=<name>]\tOutput filename for header\n");
+    printf("    [-I <path>]\t\t\t\tAdd <path> to #include file search path\n");
     printf("    [--instrument]\t\t\tEmit instrumentation to gather performance data\n");
     printf("    [--math-lib=<option>]\t\tSelect math library\n");
     printf("        default\t\t\t\tUse ispc's built-in math functions\n");
@@ -287,6 +288,15 @@ int main(int Argc, char *Argv[]) {
             ot = Module::Bitcode;
         else if (!strcmp(argv[i], "--emit-obj"))
             ot = Module::Object;
+        else if (!strcmp(argv[i], "-I")) {
+            if (++i == argc) {
+                fprintf(stderr, "No path specified after -I option.\n");
+                usage(1);
+            }
+            g->includePath.push_back(argv[i]);
+        }
+        else if (!strncmp(argv[i], "-I", 2))
+            g->includePath.push_back(argv[i]+2);
         else if (!strcmp(argv[i], "--fuzz-test"))
             g->enableFuzzTest = true;
         else if (!strncmp(argv[i], "--fuzz-seed=", 12))
diff --git a/module.cpp b/module.cpp
index 75a16046..5d559d40 100644
--- a/module.cpp
+++ b/module.cpp
@@ -1175,6 +1175,20 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre
     // track the source file position by handling them ourselves.
     inst.getPreprocessorOutputOpts().ShowComments = 1;
 
+    clang::HeaderSearchOptions &headerOpts = inst.getHeaderSearchOpts();
+    headerOpts.UseBuiltinIncludes = 0;
+#ifndef LLVM_2_9
+    headerOpts.UseStandardSystemIncludes = 0;
+#endif // !LLVM_2_9
+    headerOpts.UseStandardCXXIncludes = 0;
+    if (g->debugPrint)
+        headerOpts.Verbose = 1;
+    for (int i = 0; i < (int)g->includePath.size(); ++i)
+        headerOpts.AddPath(g->includePath[i], clang::frontend::Angled,
+                           true /* is user supplied */,
+                           false /* not a framework */,
+                           true /* ignore sys root */);
+
     clang::PreprocessorOptions &opts = inst.getPreprocessorOpts();
 
     // Add defs for ISPC and PI

From 157e7c97aec5a21e5dbf6baf0adb1f2c7c2a296a Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Tue, 7 Feb 2012 11:08:00 -0800
Subject: [PATCH 61/62] Fix a variety of cases in the parser that could crash
 with malformed programs.

---
 parse.yy | 345 ++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 224 insertions(+), 121 deletions(-)

diff --git a/parse.yy b/parse.yy
index 455d2b67..188e425c 100644
--- a/parse.yy
+++ b/parse.yy
@@ -333,7 +333,10 @@ argument_expression_list
     | argument_expression_list ',' assignment_expression
       {
           ExprList *argList = dynamic_cast<ExprList *>($1);
-          Assert(argList != NULL);
+          if (argList == NULL) {
+              Assert(m->errorCount > 0);
+              argList = new ExprList(@3);
+          }
           argList->exprs.push_back($3);
           argList->pos = Union(argList->pos, @3);
           $$ = argList;
@@ -462,8 +465,8 @@ rate_qualified_new
 
 rate_qualified_new_type
     : type_specifier { $$ = $1; }
-    | TOKEN_UNIFORM type_specifier { $$ = $2->GetAsUniformType(); }
-    | TOKEN_VARYING type_specifier { $$ = $2->GetAsVaryingType(); }
+    | TOKEN_UNIFORM type_specifier { $$ = $2 ? $2->GetAsUniformType() : NULL; }
+    | TOKEN_VARYING type_specifier { $$ = $2 ? $2->GetAsVaryingType() : NULL; }
     ;
 
 new_expression
@@ -631,15 +634,20 @@ init_declarator_list
     : init_declarator
       {
           std::vector<Declarator *> *dl = new std::vector<Declarator *>;
-          dl->push_back($1);
+          if ($1 != NULL)
+              dl->push_back($1);
           $$ = dl;
       }
     | init_declarator_list ',' init_declarator
       {
           std::vector<Declarator *> *dl = (std::vector<Declarator *> *)$1;
-          if (dl != NULL && $3 != NULL)
+          if (dl == NULL) {
+              Assert(m->errorCount > 0);
+              dl = new std::vector<Declarator *>;
+          }
+          if ($3 != NULL)
               dl->push_back($3);
-          $$ = $1;
+          $$ = dl;
       }
     ;
 
@@ -664,9 +672,10 @@ storage_class_specifier
 type_specifier
     : atomic_var_type_specifier { $$ = $1; }
     | TOKEN_TYPE_NAME
-      { const Type *t = m->symbolTable->LookupType(yytext); 
+    {
+        const Type *t = m->symbolTable->LookupType(yytext); 
         $$ = t;
-      }
+    }
     | struct_or_union_specifier { $$ = $1; }
     | enum_specifier { $$ = $1; }
     ;
@@ -684,41 +693,47 @@ atomic_var_type_specifier
 
 short_vec_specifier
     : atomic_var_type_specifier '<' int_constant '>'
-      {
-        Type* vt = 
-          new VectorType($1, (int32_t)$3);
-        $$ = vt;
-      }
+    {
+        $$ = $1 ? new VectorType($1, (int32_t)$3) : NULL;
+    }
     ;
 
 struct_or_union_name
     : TOKEN_IDENTIFIER { $$ = strdup(yytext); }
-    | TOKEN_TYPE_NAME { $$ = strdup(yytext); }
+    | TOKEN_TYPE_NAME  { $$ = strdup(yytext); }
     ;
 
 struct_or_union_specifier
     : struct_or_union struct_or_union_name '{' struct_declaration_list '}' 
-      { 
-          std::vector<const Type *> elementTypes;
-          std::vector<std::string> elementNames;
-          std::vector<SourcePos> elementPositions;
-          GetStructTypesNamesPositions(*$4, &elementTypes, &elementNames,
-                                       &elementPositions);
-          StructType *st = new StructType($2, elementTypes, elementNames,
-                                          elementPositions, false, Type::Unbound, @2);
-          m->symbolTable->AddType($2, st, @2);
-          $$ = st;
+      {
+          if ($4 != NULL) {
+              std::vector<const Type *> elementTypes;
+              std::vector<std::string> elementNames;
+              std::vector<SourcePos> elementPositions;
+              GetStructTypesNamesPositions(*$4, &elementTypes, &elementNames,
+                                           &elementPositions);
+              StructType *st = new StructType($2, elementTypes, elementNames,
+                                              elementPositions, false, Type::Unbound, @2);
+              m->symbolTable->AddType($2, st, @2);
+              $$ = st;
+          }
+          else
+              $$ = NULL;
       }
     | struct_or_union '{' struct_declaration_list '}' 
       {
-          std::vector<const Type *> elementTypes;
-          std::vector<std::string> elementNames;
-          std::vector<SourcePos> elementPositions;
-          GetStructTypesNamesPositions(*$3, &elementTypes, &elementNames,
-                                       &elementPositions);
-          // FIXME: should be unbound
-          $$ = new StructType("", elementTypes, elementNames, elementPositions,
-                              false, Type::Unbound, @1);
+          if ($3 != NULL) {
+              std::vector<const Type *> elementTypes;
+              std::vector<std::string> elementNames;
+              std::vector<SourcePos> elementPositions;
+              GetStructTypesNamesPositions(*$3, &elementTypes, &elementNames,
+                                           &elementPositions);
+              // FIXME: should be unbound
+              $$ = new StructType("", elementTypes, elementNames, elementPositions,
+                                  false, Type::Unbound, @1);
+          }
+          else
+              $$ = NULL;
       }
     | struct_or_union '{' '}' 
       {
@@ -729,16 +744,17 @@ struct_or_union_specifier
           Error(@1, "Empty struct definitions not allowed."); 
       }
     | struct_or_union struct_or_union_name
-      { const Type *st = m->symbolTable->LookupType($2); 
-        if (!st) {
-            std::vector<std::string> alternates = m->symbolTable->ClosestTypeMatch($2);
-            std::string alts = lGetAlternates(alternates);
-            Error(@2, "Struct type \"%s\" unknown.%s", $2, alts.c_str());
-        }
-        else if (dynamic_cast<const StructType *>(st) == NULL)
-            Error(@2, "Type \"%s\" is not a struct type! (%s)", $2,
-                  st->GetString().c_str());
-        $$ = st;
+      { 
+          const Type *st = m->symbolTable->LookupType($2); 
+          if (!st) {
+              std::vector<std::string> alternates = m->symbolTable->ClosestTypeMatch($2);
+              std::string alts = lGetAlternates(alternates);
+              Error(@2, "Struct type \"%s\" unknown.%s", $2, alts.c_str());
+          }
+          else if (dynamic_cast<const StructType *>(st) == NULL)
+              Error(@2, "Type \"%s\" is not a struct type! (%s)", $2,
+                    st->GetString().c_str());
+          $$ = st;
       }
     ;
 
@@ -750,22 +766,26 @@ struct_declaration_list
     : struct_declaration 
       { 
           std::vector<StructDeclaration *> *sdl = new std::vector<StructDeclaration *>;
-          if (sdl != NULL && $1 != NULL)
+          if ($1 != NULL)
               sdl->push_back($1);
           $$ = sdl;
       }
     | struct_declaration_list struct_declaration 
       {
           std::vector<StructDeclaration *> *sdl = (std::vector<StructDeclaration *> *)$1;
-          if (sdl != NULL && $2 != NULL)
+          if (sdl == NULL) {
+              Assert(m->errorCount > 0);
+              sdl = new std::vector<StructDeclaration *>;
+          }
+          if ($2 != NULL)
               sdl->push_back($2);
-          $$ = $1;
+          $$ = sdl;
       }
     ;
 
 struct_declaration
     : specifier_qualifier_list struct_declarator_list ';' 
-      { $$ = new StructDeclaration($1, $2); }
+      { $$ = ($1 != NULL && $2 != NULL) ? new StructDeclaration($1, $2) : NULL; }
     ;
 
 specifier_qualifier_list
@@ -831,9 +851,13 @@ struct_declarator_list
     | struct_declarator_list ',' struct_declarator 
       {
           std::vector<Declarator *> *sdl = (std::vector<Declarator *> *)$1;
-          if (sdl != NULL && $3 != NULL)
+          if (sdl == NULL) {
+              Assert(m->errorCount > 0);
+              sdl = new std::vector<Declarator *>;
+          }
+          if ($3 != NULL)
               sdl->push_back($3);
-          $$ = $1;
+          $$ = sdl;
       }
     ;
 
@@ -900,9 +924,14 @@ enumerator_list
       }
     | enumerator_list ',' enumerator
       {
-          if ($1 != NULL && $3 != NULL)
-              $1->push_back($3);
-          $$ = $1;
+          std::vector<Symbol *> *symList = $1;
+          if (symList == NULL) {
+              Assert(m->errorCount > 0);
+              symList = new std::vector<Symbol *>;
+          }
+          if ($3 != NULL)
+              symList->push_back($3);
+          $$ = symList;
       }
     ;
 
@@ -950,19 +979,27 @@ type_qualifier_list
 declarator
     : pointer direct_declarator
     {
-        Declarator *tail = $1;
-        while (tail->child != NULL)
-           tail = tail->child;
-        tail->child = $2;
-        $$ = $1;
+        if ($1 != NULL) {
+            Declarator *tail = $1;
+            while (tail->child != NULL)
+               tail = tail->child;
+            tail->child = $2;
+            $$ = $1;
+        }
+        else
+            $$ = NULL;
     }
     | reference direct_declarator
     {
-        Declarator *tail = $1;
-        while (tail->child != NULL)
-           tail = tail->child;
-        tail->child = $2;
-        $$ = $1;
+        if ($1 != NULL) {
+            Declarator *tail = $1;
+            while (tail->child != NULL)
+               tail = tail->child;
+            tail->child = $2;
+            $$ = $1;
+        }
+        else
+            $$ = NULL;
     }
     | direct_declarator
     ;
@@ -1016,7 +1053,8 @@ direct_declarator
           if ($1 != NULL) {
               Declarator *d = new Declarator(DK_FUNCTION, Union(@1, @4));
               d->child = $1;
-              if ($3 != NULL) d->functionParams = *$3;
+              if ($3 != NULL)
+                  d->functionParams = *$3;
               $$ = d;
           }
           else
@@ -1086,7 +1124,6 @@ parameter_list
     {
         std::vector<Declaration *> *dl = (std::vector<Declaration *> *)$1;
         if (dl == NULL)
-            // dl may be NULL due to an earlier parse error...
             dl = new std::vector<Declaration *>;
         if ($3 != NULL)
             dl->push_back($3);
@@ -1118,18 +1155,26 @@ parameter_declaration
     }
     | declaration_specifiers declarator '=' initializer
     { 
-        if ($2 != NULL)
+        if ($1 != NULL && $2 != NULL) {
             $2->initExpr = $4;
-        $$ = new Declaration($1, $2); 
-
+            $$ = new Declaration($1, $2);
+        }
+        else
+            $$ = NULL;
     }
     | declaration_specifiers abstract_declarator
     {
-        $$ = new Declaration($1, $2);
+        if ($1 != NULL && $2 != NULL)
+            $$ = new Declaration($1, $2);
+        else
+            $$ = NULL;
     }
     | declaration_specifiers
     {
-        $$ = new Declaration($1); 
+        if ($1 == NULL)
+            $$ = NULL;
+        else
+            $$ = new Declaration($1); 
     }
     ;
 
@@ -1144,7 +1189,10 @@ type_name
     : specifier_qualifier_list
     | specifier_qualifier_list abstract_declarator
     {
-        $$ = $2->GetType($1, NULL);
+        if ($1 == NULL || $2 == NULL)
+            $$ = NULL;
+        else
+            $$ = $2->GetType($1, NULL);
     }
     ;
 
@@ -1156,20 +1204,27 @@ abstract_declarator
     | direct_abstract_declarator
     | pointer direct_abstract_declarator
       {
-          Declarator *d = new Declarator(DK_POINTER, Union(@1, @2));
-          d->child = $2;
-          $$ = d;
+          if ($2 == NULL)
+              $$ = NULL;
+          else {
+              Declarator *d = new Declarator(DK_POINTER, Union(@1, @2));
+              d->child = $2;
+              $$ = d;
+          }
       }
     | reference
       {
-          Declarator *d = new Declarator(DK_REFERENCE, @1);
-          $$ = d;
+          $$ = new Declarator(DK_REFERENCE, @1);
       }
     | reference direct_abstract_declarator
       {
-          Declarator *d = new Declarator(DK_REFERENCE, Union(@1, @2));
-          d->child = $2;
-          $$ = d;
+          if ($2 == NULL)
+              $$ = NULL;
+          else {
+              Declarator *d = new Declarator(DK_REFERENCE, Union(@1, @2));
+              d->child = $2;
+              $$ = d;
+          }
       }
     ;
 
@@ -1201,15 +1256,19 @@ direct_abstract_declarator
       }
     | direct_abstract_declarator '[' ']'
       {
-          Declarator *d = new Declarator(DK_ARRAY, Union(@1, @3));
-          d->arraySize = 0;
-          d->child = $1;
-          $$ = d;
+          if ($1 == NULL)
+              $$ = NULL;
+          else {
+              Declarator *d = new Declarator(DK_ARRAY, Union(@1, @3));
+              d->arraySize = 0;
+              d->child = $1;
+              $$ = d;
+          }
       }
     | direct_abstract_declarator '[' constant_expression ']'
       {
           int size;
-          if ($3 != NULL && lGetConstantInt($3, &size, @3, "Array dimension")) {
+          if ($1 != NULL && $3 != NULL && lGetConstantInt($3, &size, @3, "Array dimension")) {
               if (size < 0) {
                   Error(@3, "Array dimension must be non-negative.");
                   $$ = NULL;
@@ -1234,16 +1293,24 @@ direct_abstract_declarator
       }
     | direct_abstract_declarator '(' ')'
       {
-          Declarator *d = new Declarator(DK_FUNCTION, Union(@1, @3));
-          d->child = $1;
-          $$ = d;
+          if ($1 == NULL)
+              $$ = NULL;
+          else {
+              Declarator *d = new Declarator(DK_FUNCTION, Union(@1, @3));
+              d->child = $1;
+              $$ = d;
+          }
       }
     | direct_abstract_declarator '(' parameter_type_list ')'
       {
-          Declarator *d = new Declarator(DK_FUNCTION, Union(@1, @4));
-          d->child = $1;
-          if ($3 != NULL) d->functionParams = *$3;
-          $$ = d;
+          if ($1 == NULL)
+              $$ = NULL;
+          else {
+              Declarator *d = new Declarator(DK_FUNCTION, Union(@1, @4));
+              d->child = $1;
+              if ($3 != NULL) d->functionParams = *$3;
+              $$ = d;
+          }
       }
     ;
 
@@ -1258,15 +1325,14 @@ initializer_list
       { $$ = new ExprList($1, @1); }
     | initializer_list ',' initializer
       {
-          if ($1 == NULL)
-              $$ = NULL;
-          else {
-              ExprList *exprList = dynamic_cast<ExprList *>($1);
-              Assert(exprList);
-              exprList->exprs.push_back($3);
-              exprList->pos = Union(exprList->pos, @3);
-              $$ = exprList;
+          ExprList *exprList = $1;
+          if (exprList == NULL) {
+              Assert(m->errorCount > 0);
+              exprList = new ExprList(@3);
           }
+          exprList->exprs.push_back($3);
+          exprList->pos = Union(exprList->pos, @3);
+          $$ = exprList;
       }
     ;
 
@@ -1342,15 +1408,19 @@ statement_list
       }
     | statement_list statement
       {
-          if ($1 != NULL)
-              ((StmtList *)$1)->Add($2);
-          $$ = $1;
+          StmtList *sl = (StmtList *)$1;
+          if (sl == NULL) {
+              Assert(m->errorCount > 0);
+              sl = new StmtList(@2);
+          }
+          sl->Add($2);
+          $$ = sl;
       }
     ;
 
 expression_statement
     : ';' { $$ = NULL; }
-    | expression ';' { $$ = new ExprStmt($1, @1); }
+    | expression ';' { $$ = $1 ? new ExprStmt($1, @1) : NULL; }
     ;
 
 selection_statement
@@ -1416,7 +1486,14 @@ foreach_dimension_list
     }
     | foreach_dimension_list ',' foreach_dimension_specifier
     {
-        $$->push_back($3);
+        std::vector<ForeachDimension *> *dv = $1;
+        if (dv == NULL) {
+            Assert(m->errorCount > 0);
+            dv = new std::vector<ForeachDimension *>;
+        }
+        if ($3 != NULL)
+            dv->push_back($3);
+        $$ = dv;
     }
     ;
 
@@ -1447,38 +1524,57 @@ iteration_statement
       }
     | foreach_scope '(' foreach_dimension_list ')'
      {
-         std::vector<ForeachDimension *> &dims = *$3;
-         for (unsigned int i = 0; i < dims.size(); ++i)
-             m->symbolTable->AddVariable(dims[i]->sym);
+         std::vector<ForeachDimension *> *dims = $3;
+         if (dims == NULL) {
+             Assert(m->errorCount > 0);
+             dims = new std::vector<ForeachDimension *>;
+         }
+         for (unsigned int i = 0; i < dims->size(); ++i)
+             m->symbolTable->AddVariable((*dims)[i]->sym);
      }
      statement
      {
-         std::vector<ForeachDimension *> &dims = *$3;
+         std::vector<ForeachDimension *> *dims = $3;
+         if (dims == NULL) {
+             Assert(m->errorCount > 0);
+             dims = new std::vector<ForeachDimension *>;
+         }
+
          std::vector<Symbol *> syms;
          std::vector<Expr *> begins, ends;
-         for (unsigned int i = 0; i < dims.size(); ++i) {
-             syms.push_back(dims[i]->sym);
-             begins.push_back(dims[i]->beginExpr);
-             ends.push_back(dims[i]->endExpr);
+         for (unsigned int i = 0; i < dims->size(); ++i) {
+             syms.push_back((*dims)[i]->sym);
+             begins.push_back((*dims)[i]->beginExpr);
+             ends.push_back((*dims)[i]->endExpr);
          }
          $$ = new ForeachStmt(syms, begins, ends, $6, false, @1);
          m->symbolTable->PopScope();
      }
     | foreach_tiled_scope '(' foreach_dimension_list ')'
      {
-         std::vector<ForeachDimension *> &dims = *$3;
-         for (unsigned int i = 0; i < dims.size(); ++i)
-             m->symbolTable->AddVariable(dims[i]->sym);
+         std::vector<ForeachDimension *> *dims = $3;
+         if (dims == NULL) {
+             Assert(m->errorCount > 0);
+             dims = new std::vector<ForeachDimension *>;
+         }
+
+         for (unsigned int i = 0; i < dims->size(); ++i)
+             m->symbolTable->AddVariable((*dims)[i]->sym);
      }
      statement
      {
-         std::vector<ForeachDimension *> &dims = *$3;
+         std::vector<ForeachDimension *> *dims = $3;
+         if (dims == NULL) {
+             Assert(m->errorCount > 0);
+             dims = new std::vector<ForeachDimension *>;
+         }
+
          std::vector<Symbol *> syms;
          std::vector<Expr *> begins, ends;
-         for (unsigned int i = 0; i < dims.size(); ++i) {
-             syms.push_back(dims[i]->sym);
-             begins.push_back(dims[i]->beginExpr);
-             ends.push_back(dims[i]->endExpr);
+         for (unsigned int i = 0; i < dims->size(); ++i) {
+             syms.push_back((*dims)[i]->sym);
+             begins.push_back((*dims)[i]->beginExpr);
+             ends.push_back((*dims)[i]->endExpr);
          }
          $$ = new ForeachStmt(syms, begins, ends, $6, true, @1);
          m->symbolTable->PopScope();
@@ -1584,9 +1680,11 @@ function_definition
     compound_statement
     {
         std::vector<Symbol *> args;
-        Symbol *sym = $2->GetFunctionInfo($1, &args);
-        if (sym != NULL)
-            m->AddFunctionDefinition(sym, args, $4);
+        if ($2 != NULL) {
+            Symbol *sym = $2->GetFunctionInfo($1, &args);
+            if (sym != NULL)
+                m->AddFunctionDefinition(sym, args, $4);
+        }
         m->symbolTable->PopScope(); // push in lAddFunctionParams();
     }
 /* function with no declared return type??
@@ -1643,6 +1741,11 @@ static void
 lAddFunctionParams(Declarator *decl) {
     m->symbolTable->PushScope();
 
+    if (decl == NULL) {
+        Assert(m->errorCount > 0);
+        return;
+    }
+
     // walk down to the declarator for the function itself 
     while (decl->kind != DK_FUNCTION && decl->child != NULL)
         decl = decl->child;

From f3089df0866ce8e699a6436e51f3c5534be01d7d Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Tue, 7 Feb 2012 11:11:40 -0800
Subject: [PATCH 62/62] Improve error handling and reporting in the parser.

Add a number of additional error cases in the grammar.

Enable bison's extended error reporting, to get better messages about the
context of errors and the expected (but not found) tokens at errors.

Improve the printing of these by providing an implementation of yytnamerr
that rewrites things like "TOKEN_MUL_ASSIGN" to "*=" in error messages.

Print the source location (using Error()) when yyerror() is called; wiring
this up seems to require no longer building a 'pure parser' but having
yylloc as a global, which in turn led to having to update all of the uses of
it (which previously accessed it as a pointer).

Updated a number of tests_errors for resulting changes in error text.
---
 lex.ll                                      | 153 ++++++++++++++----
 module.cpp                                  |   3 +
 parse.yy                                    | 168 ++++++++++++++------
 tests_errors/new-delete-2.ispc              |   2 +-
 tests_errors/new-delete-3.ispc              |   2 +-
 tests_errors/ref-1.ispc                     |   2 +-
 tests_errors/ref-3.ispc                     |   2 +-
 tests_errors/vec-size-compile-constant.ispc |   2 +-
 8 files changed, 242 insertions(+), 92 deletions(-)

diff --git a/lex.ll b/lex.ll
index 9ba21f2a..678e9179 100644
--- a/lex.ll
+++ b/lex.ll
@@ -50,9 +50,9 @@ static void lStringConst(YYSTYPE *, SourcePos *);
 static double lParseHexFloat(const char *ptr);
 
 #define YY_USER_ACTION \
-    yylloc->first_line = yylloc->last_line; \
-    yylloc->first_column = yylloc->last_column; \
-    yylloc->last_column += yyleng;
+    yylloc.first_line = yylloc.last_line; \
+    yylloc.first_column = yylloc.last_column; \
+    yylloc.last_column += yyleng;
 
 #ifdef ISPC_IS_WINDOWS
 inline int isatty(int) { return 0; }
@@ -62,7 +62,7 @@ static int allTokens[] = {
   TOKEN_ASSERT, TOKEN_BOOL, TOKEN_BREAK, TOKEN_CASE, TOKEN_CBREAK,
   TOKEN_CCONTINUE, TOKEN_CDO, TOKEN_CFOR, TOKEN_CIF, TOKEN_CWHILE,
   TOKEN_CONST, TOKEN_CONTINUE, TOKEN_CRETURN, TOKEN_DEFAULT, TOKEN_DO,
-  TOKEN_DELETE, TOKEN_DELETE, TOKEN_DOUBLE, TOKEN_ELSE, TOKEN_ENUM,
+  TOKEN_DELETE, TOKEN_DOUBLE, TOKEN_ELSE, TOKEN_ENUM,
   TOKEN_EXPORT, TOKEN_EXTERN, TOKEN_FALSE, TOKEN_FLOAT, TOKEN_FOR,
   TOKEN_FOREACH, TOKEN_FOREACH_TILED, TOKEN_GOTO, TOKEN_IF, TOKEN_INLINE,
   TOKEN_INT, TOKEN_INT8, TOKEN_INT16, TOKEN_INT, TOKEN_INT64, TOKEN_LAUNCH,
@@ -83,9 +83,10 @@ static int allTokens[] = {
   '+', '*', '/', '%', '<', '>', '^', '|', '?',
 };
 
-static std::map<int, std::string> tokenToName;
+std::map<int, std::string> tokenToName;
+std::map<std::string, std::string> tokenNameRemap;
 
-static void lInitTokenToName() {
+void ParserInit() {
     tokenToName[TOKEN_ASSERT] = "assert";
     tokenToName[TOKEN_BOOL] = "bool";
     tokenToName[TOKEN_BREAK] = "break";
@@ -102,7 +103,6 @@ static void lInitTokenToName() {
     tokenToName[TOKEN_DEFAULT] = "default";
     tokenToName[TOKEN_DO] = "do";
     tokenToName[TOKEN_DELETE] = "delete";
-    tokenToName[TOKEN_DELETE] = "delete";
     tokenToName[TOKEN_DOUBLE] = "double";
     tokenToName[TOKEN_ELSE] = "else";
     tokenToName[TOKEN_ENUM] = "enum";
@@ -194,6 +194,91 @@ static void lInitTokenToName() {
     tokenToName['|'] = "|";
     tokenToName['?'] = "?";
     tokenToName[';'] = ";";
+
+    tokenNameRemap["TOKEN_ASSERT"] = "\'assert\'";
+    tokenNameRemap["TOKEN_BOOL"] = "\'bool\'";
+    tokenNameRemap["TOKEN_BREAK"] = "\'break\'";
+    tokenNameRemap["TOKEN_CASE"] = "\'case\'";
+    tokenNameRemap["TOKEN_CBREAK"] = "\'cbreak\'";
+    tokenNameRemap["TOKEN_CCONTINUE"] = "\'ccontinue\'";
+    tokenNameRemap["TOKEN_CDO"] = "\'cdo\'";
+    tokenNameRemap["TOKEN_CFOR"] = "\'cfor\'";
+    tokenNameRemap["TOKEN_CIF"] = "\'cif\'";
+    tokenNameRemap["TOKEN_CWHILE"] = "\'cwhile\'";
+    tokenNameRemap["TOKEN_CONST"] = "\'const\'";
+    tokenNameRemap["TOKEN_CONTINUE"] = "\'continue\'";
+    tokenNameRemap["TOKEN_CRETURN"] = "\'creturn\'";
+    tokenNameRemap["TOKEN_DEFAULT"] = "\'default\'";
+    tokenNameRemap["TOKEN_DO"] = "\'do\'";
+    tokenNameRemap["TOKEN_DELETE"] = "\'delete\'";
+    tokenNameRemap["TOKEN_DOUBLE"] = "\'double\'";
+    tokenNameRemap["TOKEN_ELSE"] = "\'else\'";
+    tokenNameRemap["TOKEN_ENUM"] = "\'enum\'";
+    tokenNameRemap["TOKEN_EXPORT"] = "\'export\'";
+    tokenNameRemap["TOKEN_EXTERN"] = "\'extern\'";
+    tokenNameRemap["TOKEN_FALSE"] = "\'false\'";
+    tokenNameRemap["TOKEN_FLOAT"] = "\'float\'";
+    tokenNameRemap["TOKEN_FOR"] = "\'for\'";
+    tokenNameRemap["TOKEN_FOREACH"] = "\'foreach\'";
+    tokenNameRemap["TOKEN_FOREACH_TILED"] = "\'foreach_tiled\'";
+    tokenNameRemap["TOKEN_GOTO"] = "\'goto\'";
+    tokenNameRemap["TOKEN_IDENTIFIER"] = "identifier";
+    tokenNameRemap["TOKEN_IF"] = "\'if\'";
+    tokenNameRemap["TOKEN_INLINE"] = "\'inline\'";
+    tokenNameRemap["TOKEN_INT"] = "\'int\'";
+    tokenNameRemap["TOKEN_INT8"] = "\'int8\'";
+    tokenNameRemap["TOKEN_INT16"] = "\'int16\'";
+    tokenNameRemap["TOKEN_INT"] = "\'int\'";
+    tokenNameRemap["TOKEN_INT64"] = "\'int64\'";
+    tokenNameRemap["TOKEN_LAUNCH"] = "\'launch\'";
+    tokenNameRemap["TOKEN_NEW"] = "\'new\'";
+    tokenNameRemap["TOKEN_NULL"] = "\'NULL\'";
+    tokenNameRemap["TOKEN_PRINT"] = "\'print\'";
+    tokenNameRemap["TOKEN_RETURN"] = "\'return\'";
+    tokenNameRemap["TOKEN_SOA"] = "\'soa\'";
+    tokenNameRemap["TOKEN_SIGNED"] = "\'signed\'";
+    tokenNameRemap["TOKEN_SIZEOF"] = "\'sizeof\'";
+    tokenNameRemap["TOKEN_STATIC"] = "\'static\'";
+    tokenNameRemap["TOKEN_STRUCT"] = "\'struct\'";
+    tokenNameRemap["TOKEN_SWITCH"] = "\'switch\'";
+    tokenNameRemap["TOKEN_SYNC"] = "\'sync\'";
+    tokenNameRemap["TOKEN_TASK"] = "\'task\'";
+    tokenNameRemap["TOKEN_TRUE"] = "\'true\'";
+    tokenNameRemap["TOKEN_TYPEDEF"] = "\'typedef\'";
+    tokenNameRemap["TOKEN_UNIFORM"] = "\'uniform\'";
+    tokenNameRemap["TOKEN_UNSIGNED"] = "\'unsigned\'";
+    tokenNameRemap["TOKEN_VARYING"] = "\'varying\'";
+    tokenNameRemap["TOKEN_VOID"] = "\'void\'";
+    tokenNameRemap["TOKEN_WHILE"] = "\'while\'";
+    tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\"";
+    tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'";
+    tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant";
+    tokenNameRemap["TOKEN_INT32_CONSTANT"] = "int32 constant";
+    tokenNameRemap["TOKEN_UINT32_CONSTANT"] = "unsigned int32 constant";
+    tokenNameRemap["TOKEN_INT64_CONSTANT"] = "int64 constant";
+    tokenNameRemap["TOKEN_UINT64_CONSTANT"] = "unsigned int64 constant";
+    tokenNameRemap["TOKEN_INC_OP"] = "\'++\'";
+    tokenNameRemap["TOKEN_DEC_OP"] = "\'--\'";
+    tokenNameRemap["TOKEN_LEFT_OP"] = "\'<<\'";
+    tokenNameRemap["TOKEN_RIGHT_OP"] = "\'>>\'";
+    tokenNameRemap["TOKEN_LE_OP"] = "\'<=\'";
+    tokenNameRemap["TOKEN_GE_OP"] = "\'>=\'";
+    tokenNameRemap["TOKEN_EQ_OP"] = "\'==\'";
+    tokenNameRemap["TOKEN_NE_OP"] = "\'!=\'";
+    tokenNameRemap["TOKEN_AND_OP"] = "\'&&\'";
+    tokenNameRemap["TOKEN_OR_OP"] = "\'||\'";
+    tokenNameRemap["TOKEN_MUL_ASSIGN"] = "\'*=\'";
+    tokenNameRemap["TOKEN_DIV_ASSIGN"] = "\'/=\'";
+    tokenNameRemap["TOKEN_MOD_ASSIGN"] = "\'%=\'";
+    tokenNameRemap["TOKEN_ADD_ASSIGN"] = "\'+=\'";
+    tokenNameRemap["TOKEN_SUB_ASSIGN"] = "\'-=\'";
+    tokenNameRemap["TOKEN_LEFT_ASSIGN"] = "\'<<=\'";
+    tokenNameRemap["TOKEN_RIGHT_ASSIGN"] = "\'>>=\'";
+    tokenNameRemap["TOKEN_AND_ASSIGN"] = "\'&=\'";
+    tokenNameRemap["TOKEN_XOR_ASSIGN"] = "\'^=\'";
+    tokenNameRemap["TOKEN_OR_ASSIGN"] = "\'|=\'";
+    tokenNameRemap["TOKEN_PTR_OP"] = "\'->\'";
+    tokenNameRemap["$end"] = "end of file";
 }
 
 
@@ -209,21 +294,21 @@ inline int ispcRand() {
     if (g->enableFuzzTest) { \
         int r = ispcRand() % 40; \
         if (r == 0) { \
-            Warning(*yylloc, "Dropping token"); \
+            Warning(yylloc, "Fuzz test dropping token"); \
         } \
         else if (r == 1) { \
-            if (tokenToName.size() == 0) lInitTokenToName(); \
+            Assert (tokenToName.size() > 0); \
             int nt = sizeof(allTokens) / sizeof(allTokens[0]); \
             int tn = ispcRand() % nt; \
-            yylval->stringVal = new std::string(yytext); /* just in case */\
-            Warning(*yylloc, "Replaced with \"%s\"", tokenToName[allTokens[tn]].c_str()); \
+            yylval.stringVal = new std::string(yytext); /* just in case */\
+            Warning(yylloc, "Fuzz test replaced token with \"%s\"", tokenToName[allTokens[tn]].c_str()); \
             return allTokens[tn]; \
         } \
         else if (r == 2) { \
             Symbol *sym = m->symbolTable->RandomSymbol(); \
             if (sym != NULL) { \
-                yylval->stringVal = new std::string(sym->name); \
-                Warning(*yylloc, "Replaced with identifier \"%s\".", sym->name.c_str()); \
+                yylval.stringVal = new std::string(sym->name); \
+                Warning(yylloc, "Fuzz test replaced with identifier \"%s\".", sym->name.c_str()); \
                 return TOKEN_IDENTIFIER; \
             } \
         } \
@@ -234,8 +319,6 @@ inline int ispcRand() {
 
 %option nounput
 %option noyywrap
-%option bison-bridge
-%option bison-locations
 %option nounistd
 
 WHITESPACE [ \t\r]+
@@ -247,8 +330,8 @@ IDENT [a-zA-Z_][a-zA-Z_0-9]*
 ZO_SWIZZLE ([01]+[w-z]+)+|([01]+[rgba]+)+|([01]+[uv]+)+
 
 %%
-"/*"            { lCComment(yylloc); }
-"//"            { lCppComment(yylloc); }
+"/*"            { lCComment(&yylloc); }
+"//"            { lCppComment(&yylloc); }
 
 __assert { RT; return TOKEN_ASSERT; }
 bool { RT; return TOKEN_BOOL; }
@@ -289,9 +372,9 @@ launch { RT; return TOKEN_LAUNCH; }
 new { RT; return TOKEN_NEW; }
 NULL { RT; return TOKEN_NULL; }
 print { RT; return TOKEN_PRINT; }
-reference { Error(*yylloc, "\"reference\" qualifier is no longer supported; "
-                           "please use C++-style '&' syntax for references "
-                           "instead."); }
+reference { Error(yylloc, "\"reference\" qualifier is no longer supported; "
+                          "please use C++-style '&' syntax for references "
+                          "instead."); }
 return { RT; return TOKEN_RETURN; }
 soa { RT; return TOKEN_SOA; }
 signed { RT; return TOKEN_SIGNED; }
@@ -311,13 +394,13 @@ while { RT; return TOKEN_WHILE; }
 \"C\" { RT; return TOKEN_STRING_C_LITERAL; }
 \.\.\. { RT; return TOKEN_DOTDOTDOT; }
 
-L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL; }
+L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERAL; }
 
 {IDENT} { 
     RT;
     /* We have an identifier--is it a type name or an identifier?
        The symbol table will straighten us out... */
-    yylval->stringVal = new std::string(yytext);
+    yylval.stringVal = new std::string(yytext);
     if (m->symbolTable->LookupType(yytext) != NULL)
         return TOKEN_TYPE_NAME;
     else
@@ -330,14 +413,14 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
 
     char *endPtr = NULL;
     if (yytext[0] == '0' && yytext[1] == 'b')
-        yylval->intVal = lParseBinary(yytext+2, *yylloc, &endPtr);
+        yylval.intVal = lParseBinary(yytext+2, yylloc, &endPtr);
     else {
 #if defined(ISPC_IS_WINDOWS) && !defined(__MINGW32__)
-        yylval->intVal = _strtoui64(yytext, &endPtr, 0);
+        yylval.intVal = _strtoui64(yytext, &endPtr, 0);
 #else
         // FIXME: should use strtouq and then issue an error if we can't
         // fit into 64 bits...
-        yylval->intVal = strtoull(yytext, &endPtr, 0);
+        yylval.intVal = strtoull(yytext, &endPtr, 0);
 #endif
     }
 
@@ -355,11 +438,11 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
             us++;
     }
     if (kilo)
-        yylval->intVal *= 1024;
+        yylval.intVal *= 1024;
     if (mega)
-        yylval->intVal *= 1024*1024;
+        yylval.intVal *= 1024*1024;
     if (giga)
-        yylval->intVal *= 1024*1024*1024;
+        yylval.intVal *= 1024*1024*1024;
 
     if (ls >= 2)
         return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
@@ -367,7 +450,7 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
         return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
 
     // See if we can fit this into a 32-bit integer...
-    if ((yylval->intVal & 0xffffffff) == yylval->intVal)
+    if ((yylval.intVal & 0xffffffff) == yylval.intVal)
         return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT;
     else
         return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT;
@@ -376,13 +459,13 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
 
 {FLOAT_NUMBER} { 
     RT;
-    yylval->floatVal = (float)atof(yytext);
+    yylval.floatVal = (float)atof(yytext);
     return TOKEN_FLOAT_CONSTANT; 
 }
 
 {HEX_FLOAT_NUMBER} {
     RT;
-    yylval->floatVal = (float)lParseHexFloat(yytext); 
+    yylval.floatVal = (float)lParseHexFloat(yytext); 
     return TOKEN_FLOAT_CONSTANT; 
 }
 
@@ -435,16 +518,16 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
 {WHITESPACE} { }
 
 \n {
-    yylloc->last_line++; 
-    yylloc->last_column = 1; 
+    yylloc.last_line++; 
+    yylloc.last_column = 1; 
 }
 
 #(line)?[ ][0-9]+[ ]\"(\\.|[^\\"])*\"[^\n]* { 
-    lHandleCppHash(yylloc); 
+    lHandleCppHash(&yylloc); 
 }
 
 . {
-    Error(*yylloc, "Illegal character: %c (0x%x)", yytext[0], int(yytext[0]));
+    Error(yylloc, "Illegal character: %c (0x%x)", yytext[0], int(yytext[0]));
     YY_USER_ACTION 
 }
 
diff --git a/module.cpp b/module.cpp
index 5d559d40..acf57a90 100644
--- a/module.cpp
+++ b/module.cpp
@@ -161,6 +161,9 @@ Module::CompileFile() {
 
     bool runPreprocessor = g->runCPP;
 
+    extern void ParserInit();
+    ParserInit();
+
     if (runPreprocessor) {
         if (filename != NULL) {
             // Try to open the file first, since otherwise we crash in the
diff --git a/parse.yy b/parse.yy
index 188e425c..5847d3e7 100644
--- a/parse.yy
+++ b/parse.yy
@@ -37,10 +37,13 @@
 /* one for 'if', one for 'cif' */
 %expect 2
 
-%pure-parser
+%error-verbose
 
 %code requires {
 
+#define yytnamerr lYYTNameErr
+
+
 #define YYLTYPE SourcePos
 
 # define YYLLOC_DEFAULT(Current, Rhs, N)                               \
@@ -87,16 +90,16 @@ struct ForeachDimension;
         __FILE__, __LINE__);
 
 union YYSTYPE;
-extern int yylex(YYSTYPE *, SourcePos *);
+extern int yylex();
 
 extern char *yytext;
 
-void yyerror(const char *s) { 
-    if (!g->quiet) {
-        ++m->errorCount;
-        fprintf(stderr, "Parse error: %s\n", s); 
-    }
-}
+void yyerror(const char *s);
+
+static int lYYTNameErr(char *yyres, const char *yystr);
+
+static void lSuggestBuiltinAlternates();
+static void lSuggestParamListAlternates();
 
 static void lAddDeclaration(DeclSpecs *ds, Declarator *decl);
 static void lAddFunctionParams(Declarator *decl);
@@ -290,6 +293,7 @@ primary_expression
 /*    | TOKEN_STRING_LITERAL
        { UNIMPLEMENTED }*/
     | '(' expression ')' { $$ = $2; }
+    | '(' error ')' { $$ = NULL; }
     ;
 
 launch_expression
@@ -313,10 +317,14 @@ postfix_expression
     : primary_expression
     | postfix_expression '[' expression ']'
       { $$ = new IndexExpr($1, $3, Union(@1,@4)); }
+    | postfix_expression '[' error ']'
+      { $$ = NULL; }
     | postfix_expression '(' ')'
       { $$ = new FunctionCallExpr($1, new ExprList(Union(@1,@2)), Union(@1,@3)); }
     | postfix_expression '(' argument_expression_list ')'
       { $$ = new FunctionCallExpr($1, $3, Union(@1,@4)); }
+    | postfix_expression '(' error ')'
+      { $$ = NULL; }
     | launch_expression
     | postfix_expression '.' TOKEN_IDENTIFIER
       { $$ = MemberExpr::create($1, yytext, Union(@1,@3), @3, false); }
@@ -1048,6 +1056,10 @@ direct_declarator
         else
             $$ = NULL;
     }
+    | direct_declarator '[' error ']'
+    {
+         $$ = NULL;
+    }
     | direct_declarator '(' parameter_type_list ')'
       {
           if ($1 != NULL) {
@@ -1070,6 +1082,10 @@ direct_declarator
           else
               $$ = NULL;
       }
+    | direct_declarator '(' error ')'
+    {
+        $$ = NULL;
+    }
     ;
 
 
@@ -1129,21 +1145,9 @@ parameter_list
             dl->push_back($3);
         $$ = dl;
     }
-    | error
+    | error ','
     {
-        std::vector<std::string> builtinTokens;
-        const char **token = lParamListTokens;
-        while (*token) {
-            builtinTokens.push_back(*token);
-            ++token;
-        }
-        if (strlen(yytext) == 0)
-            Error(@1, "Syntax error--premature end of file.");
-        else {
-            std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
-            std::string alts = lGetAlternates(alternates);
-            Error(@1, "Syntax error--token \"%s\" unexpected.%s", yytext, alts.c_str());
-        }
+        lSuggestParamListAlternates();
         $$ = NULL;
     }
     ;
@@ -1348,21 +1352,9 @@ statement
     | assert_statement
     | sync_statement
     | delete_statement
-    | error
+    | error ';'
     {
-        std::vector<std::string> builtinTokens;
-        const char **token = lBuiltinTokens;
-        while (*token) {
-            builtinTokens.push_back(*token);
-            ++token;
-        }
-        if (strlen(yytext) == 0)
-            Error(@1, "Syntax error--premature end of file.");
-        else {
-            std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
-            std::string alts = lGetAlternates(alternates);
-            Error(@1, "Syntax error--token \"%s\" unexpected.%s", yytext, alts.c_str());
-        }
+        lSuggestBuiltinAlternates();
         $$ = NULL;
     }
     ;
@@ -1639,22 +1631,7 @@ assert_statement
 translation_unit
     : external_declaration
     | translation_unit external_declaration
-    | error
-    {
-        std::vector<std::string> builtinTokens;
-        const char **token = lBuiltinTokens;
-        while (*token) {
-            builtinTokens.push_back(*token);
-            ++token;
-        }
-        if (strlen(yytext) == 0)
-            Error(@1, "Syntax error--premature end of file.");
-        else {
-            std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
-            std::string alts = lGetAlternates(alternates);
-            Error(@1, "Syntax error--token \"%s\" unexpected.%s", yytext, alts.c_str());
-        }
-    }
+    | error ';'
     ;
 
 external_declaration
@@ -1700,6 +1677,93 @@ func(...)
 %%
 
 
+void yyerror(const char *s) {
+    if (strlen(yytext) == 0)
+        Error(yylloc, "Premature end of file: %s.", s);
+    else
+        Error(yylloc, "%s.", s);
+}
+
+
+static int
+lYYTNameErr (char *yyres, const char *yystr)
+{
+  extern std::map<std::string, std::string> tokenNameRemap;
+  Assert(tokenNameRemap.size() > 0);
+  if (tokenNameRemap.find(yystr) != tokenNameRemap.end()) {
+      std::string n = tokenNameRemap[yystr];
+      if (yyres == NULL)
+          return n.size();
+      else
+          return yystpcpy(yyres, n.c_str()) - yyres;
+  }
+  
+  if (*yystr == '"')
+    {
+      YYSIZE_T yyn = 0;
+      char const *yyp = yystr;
+
+      for (;;)
+	switch (*++yyp)
+	  {
+	  case '\'':
+	  case ',':
+	    goto do_not_strip_quotes;
+
+	  case '\\':
+	    if (*++yyp != '\\')
+	      goto do_not_strip_quotes;
+	    /* Fall through.  */
+	  default:
+	    if (yyres)
+	      yyres[yyn] = *yyp;
+	    yyn++;
+	    break;
+
+	  case '"':
+	    if (yyres)
+	      yyres[yyn] = '\0';
+	    return yyn;
+	  }
+    do_not_strip_quotes: ;
+    }
+
+  if (! yyres)
+    return yystrlen (yystr);
+
+  return yystpcpy (yyres, yystr) - yyres;
+}
+
+static void
+lSuggestBuiltinAlternates() {
+    std::vector<std::string> builtinTokens;
+    const char **token = lBuiltinTokens;
+    while (*token) {
+        builtinTokens.push_back(*token);
+        ++token;
+    }
+    std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
+    std::string alts = lGetAlternates(alternates);
+    if (alts.size() > 0)
+         Error(yylloc, "%s", alts.c_str());
+}
+
+
+static void
+lSuggestParamListAlternates() {
+    std::vector<std::string> builtinTokens;
+    const char **token = lParamListTokens;
+    while (*token) {
+        builtinTokens.push_back(*token);
+        ++token;
+    }
+    std::vector<std::string> alternates = MatchStrings(yytext, builtinTokens);
+    std::string alts = lGetAlternates(alternates);
+    if (alts.size() > 0)
+        Error(yylloc, "%s", alts.c_str());
+}
+
+
 static void
 lAddDeclaration(DeclSpecs *ds, Declarator *decl) {
     if (ds == NULL || decl == NULL)
diff --git a/tests_errors/new-delete-2.ispc b/tests_errors/new-delete-2.ispc
index fbbb1cd6..4f78fb6b 100644
--- a/tests_errors/new-delete-2.ispc
+++ b/tests_errors/new-delete-2.ispc
@@ -1,4 +1,4 @@
-// Syntax error
+// syntax error, unexpected 'const'.
 
 int * func(int a) {
     return const new int[a];
diff --git a/tests_errors/new-delete-3.ispc b/tests_errors/new-delete-3.ispc
index e34ea98d..bb22aa56 100644
--- a/tests_errors/new-delete-3.ispc
+++ b/tests_errors/new-delete-3.ispc
@@ -1,4 +1,4 @@
-// Syntax error
+// syntax error, unexpected '('
 
 int * func(int a) {
     return new int[a](10);
diff --git a/tests_errors/ref-1.ispc b/tests_errors/ref-1.ispc
index 7605fc90..c5e60487 100644
--- a/tests_errors/ref-1.ispc
+++ b/tests_errors/ref-1.ispc
@@ -1,4 +1,4 @@
-// Syntax error--token "&" unexpected
+// syntax error, unexpected '&'
 
 int foo(int & & bar) {
     bar = 0;
diff --git a/tests_errors/ref-3.ispc b/tests_errors/ref-3.ispc
index 2a2a169b..85a8dd35 100644
--- a/tests_errors/ref-3.ispc
+++ b/tests_errors/ref-3.ispc
@@ -1,4 +1,4 @@
-// Syntax error--token "*" unexpected
+// syntax error, unexpected '*',
 
 void foo(int & * x) {
     *x = NULL;
diff --git a/tests_errors/vec-size-compile-constant.ispc b/tests_errors/vec-size-compile-constant.ispc
index 45b9b162..b9e61721 100644
--- a/tests_errors/vec-size-compile-constant.ispc
+++ b/tests_errors/vec-size-compile-constant.ispc
@@ -1,4 +1,4 @@
-// Syntax error--token "i" unexpected
+// syntax error, unexpected identifier, expecting int32 constant
 
 void foo(uniform int i) {
     float<i> a;