From 899f85ce9c75f7d545da1233b091dfbe8ff304bf Mon Sep 17 00:00:00 2001
From: "james.brodman" <james.brodman@intel.com>
Date: Tue, 22 Oct 2013 18:06:54 -0400
Subject: [PATCH 01/24] Initial Support for new stdlib shift operator

---
 builtins.cpp                      |   6 ++
 builtins/target-generic-common.ll |   7 ++
 builtins/util.m4                  |  37 ++++++++
 opt.cpp                           | 137 ++++++++++++++++++++++++++++++
 stdlib.ispc                       |  30 +++++++
 5 files changed, 217 insertions(+)
diff --git a/builtins.cpp b/builtins.cpp
index 43f68833..fa2e7328 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -536,6 +536,12 @@ lSetInternalFunctions(llvm::Module *module) {
         "__set_system_isa",
         "__sext_uniform_bool",
         "__sext_varying_bool",
+        "__shift_double",
+        "__shift_float",
+        "__shift_i16",
+        "__shift_i32",
+        "__shift_i64",
+        "__shift_i8",
         "__shuffle2_double",
         "__shuffle2_float",
         "__shuffle2_i16",
diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll
index 2a5d1b32..92b7a18e 100644
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -80,6 +80,13 @@ declare <WIDTH x i32> @__rotate_i32(<WIDTH x i32>, i32) nounwind readnone
 declare <WIDTH x double> @__rotate_double(<WIDTH x double>, i32) nounwind readnone
 declare <WIDTH x i64> @__rotate_i64(<WIDTH x i64>, i32) nounwind readnone
 
+declare <WIDTH x i8> @__shift_i8(<WIDTH x i8>, i32) nounwind readnone
+declare <WIDTH x i16> @__shift_i16(<WIDTH x i16>, i32) nounwind readnone
+declare <WIDTH x float> @__shift_float(<WIDTH x float>, i32) nounwind readnone
+declare <WIDTH x i32> @__shift_i32(<WIDTH x i32>, i32) nounwind readnone
+declare <WIDTH x double> @__shift_double(<WIDTH x double>, i32) nounwind readnone
+declare <WIDTH x i64> @__shift_i64(<WIDTH x i64>, i32) nounwind readnone
+
 declare <WIDTH x i8> @__shuffle_i8(<WIDTH x i8>, <WIDTH x i32>) nounwind readnone
 declare <WIDTH x i8> @__shuffle2_i8(<WIDTH x i8>, <WIDTH x i8>,
                                     <WIDTH x i32>) nounwind readnone
diff --git a/builtins/util.m4 b/builtins/util.m4
index 68fa818b..4cb46310 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -797,6 +797,43 @@ not_const:
   ret <WIDTH x $1> %result
 }
 
+define <WIDTH x $1> @__shift_$1(<WIDTH x $1>, i32) nounwind readnone alwaysinline {
+  %isc = call i1 @__is_compile_time_constant_uniform_int32(i32 %1)
+  %zeropaddedvec = shufflevector <WIDTH x $1> %0, <WIDTH x $1> zeroinitializer,
+                     <eval(2*WIDTH) x i32> < forloop(i, 0, eval(2*WIDTH-2), `i32 i, ')i32 eval(2*WIDTH-1) >
+  br i1 %isc, label %is_const, label %not_const
+
+is_const:
+  ; though verbose, this turms into tight code if %1 is a constant
+forloop(i, 0, eval(WIDTH-1), `  
+  %delta_`'i = add i32 %1, i
+  %delta_clamped_`'i = and i32 %delta_`'i, eval(2*WIDTH-1)
+  %v_`'i = extractelement <eval(2*WIDTH) x $1> %zeropaddedvec, i32 %delta_clamped_`'i')
+  %ret_0 = insertelement <WIDTH x $1> zeroinitializer, $1 %v_0, i32 0
+forloop(i, 1, eval(WIDTH-1), `  %ret_`'i = insertelement <WIDTH x $1> %ret_`'eval(i-1), $1 %v_`'i, i32 i
+')
+  ret <WIDTH x $1> %ret_`'eval(WIDTH-1)
+
+not_const:
+  ; store two instances of the vector into memory
+  %ptr = alloca <WIDTH x $1>, i32 3
+  %ptr0 = getelementptr <WIDTH x $1> * %ptr, i32 0
+  store <WIDTH x $1> zeroinitializer, <WIDTH x $1> * %ptr0
+  %ptr1 = getelementptr <WIDTH x $1> * %ptr, i32 1
+  store <WIDTH x $1> %0, <WIDTH x $1> * %ptr1
+  %ptr2 = getelementptr <WIDTH x $1> * %ptr, i32 2
+  store <WIDTH x $1> zeroinitializer, <WIDTH x $1> * %ptr2
+
+  ; compute offset in [0,vectorwidth-1], then index into the doubled-up vector
+  %offset = add i32 %1, 16
+  %ptr_as_elt_array = bitcast <WIDTH x $1> * %ptr to [eval(3*WIDTH) x $1] *
+  %load_ptr = getelementptr [eval(3*WIDTH) x $1] * %ptr_as_elt_array, i32 0, i32 %offset
+  %load_ptr_vec = bitcast $1 * %load_ptr to <WIDTH x $1> *
+  %result = load <WIDTH x $1> * %load_ptr_vec, align $2
+  ret <WIDTH x $1> %result
+}
+
+
 define <WIDTH x $1> @__shuffle_$1(<WIDTH x $1>, <WIDTH x i32>) nounwind readnone alwaysinline {
 forloop(i, 0, eval(WIDTH-1), `  
   %index_`'i = extractelement <WIDTH x i32> %1, i32 i')
diff --git a/opt.cpp b/opt.cpp
index 75eae20c..0146e7cf 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -72,6 +72,7 @@
 #include <llvm/Analysis/ConstantFolding.h>
 #include <llvm/Target/TargetLibraryInfo.h>
 #include <llvm/ADT/Triple.h>
+#include <llvm/ADT/SmallSet.h>
 #include <llvm/Transforms/Scalar.h>
 #include <llvm/Transforms/IPO.h>
 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
@@ -124,6 +125,8 @@ static llvm::Pass *CreateMakeInternalFuncsStaticPass();
 
 static llvm::Pass *CreateDebugPass(char * output);
 
+static llvm::Pass *CreateReplaceExtractInsertChainsPass();
+
 #define DEBUG_START_PASS(NAME)                                 \
     if (g->debugPrint &&                                       \
         (getenv("FUNC") == NULL ||                             \
@@ -635,6 +638,7 @@ Optimize(llvm::Module *module, int optLevel) {
         optPM.add(CreateIsCompileTimeConstantPass(true));
         optPM.add(CreateIntrinsicsOptPass());
         optPM.add(CreateInstructionSimplifyPass());
+        optPM.add(CreateReplaceExtractInsertChainsPass());
 
         optPM.add(llvm::createMemCpyOptPass());
         optPM.add(llvm::createSCCPPass());
@@ -4923,3 +4927,136 @@ static llvm::Pass *
 CreatePeepholePass() {
   return new PeepholePass;
 }
+
+///////////////////////////////////////////////////////////////////////////
+// ReplaceExtractInsertChainsPass
+
+/** 
+    We occassionally get chains of ExtractElementInsts followed by 
+    InsertElementInsts.  Unfortunately, all of these can't be replaced by 
+    ShuffleVectorInsts as we don't know that things are constant at the time.
+
+    This Pass will detect such chains, and replace them with ShuffleVectorInsts
+    if all the appropriate values are constant.
+ */
+
+class ReplaceExtractInsertChainsPass : public llvm::BasicBlockPass {
+public:
+    static char ID;
+    ReplaceExtractInsertChainsPass() : BasicBlockPass(ID) {
+    }
+
+    const char *getPassName() const { return "Resolve \"replace extract insert chains\""; }
+    bool runOnBasicBlock(llvm::BasicBlock &BB);
+
+};
+
+char ReplaceExtractInsertChainsPass::ID = 0;
+
+#include <iostream>
+
+/** Given an llvm::Value known to be an integer, return its value as
+    an int64_t.
+*/
+static int64_t
+lGetIntValue(llvm::Value *offset) {
+  llvm::ConstantInt *intOffset = llvm::dyn_cast<llvm::ConstantInt>(offset);
+  Assert(intOffset && (intOffset->getBitWidth() == 32 ||
+                       intOffset->getBitWidth() == 64));
+  return intOffset->getSExtValue();
+}
+
+bool
+ReplaceExtractInsertChainsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
+    DEBUG_START_PASS("ReplaceExtractInsertChainsPass");
+    bool modifiedAny = false;
+
+    // Initialize our mapping to the first spot in the zero vector
+    int vectorWidth = g->target->getVectorWidth();
+    int shuffleMap[vectorWidth];
+    for (int i = 0; i < vectorWidth; i++) {
+      shuffleMap[i] = vectorWidth;
+    }
+
+    // Hack-y.  16 is likely the upper limit for now.
+    llvm::SmallSet<llvm::Value *, 16> inserts;
+
+    // save the last Insert in the chain
+    llvm::Value * lastInsert = NULL;
+
+    for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) {
+      // Iterate through the instructions looking for InsertElementInsts
+      llvm::InsertElementInst *ieInst = llvm::dyn_cast<llvm::InsertElementInst>(&*i);
+      if (ieInst == NULL) {
+        // These aren't the instructions you're looking for.
+        continue;
+      }
+      
+      llvm::Value * base = ieInst->getOperand(0);
+      if ( (llvm::isa<llvm::UndefValue>(base))
+           || (llvm::isa<llvm::ConstantAggregateZero>(base))
+           || (base == lastInsert)) {
+        // if source for insert scalar is 0 or an EEInst, add insert
+        llvm::Value *scalar = ieInst->getOperand(1);
+        if (llvm::ExtractElementInst *eeInst = llvm::dyn_cast<llvm::ExtractElementInst>(scalar)) {
+          // We're only going to deal with Inserts into a Constant vector lane
+          if (llvm::isa<llvm::Constant>(eeInst->getOperand(1))) {
+            inserts.insert(ieInst);
+            lastInsert = ieInst;
+          }
+        }
+        else if (llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(scalar)) {
+          if (ci->isZero()) {
+            inserts.insert(ieInst);
+            lastInsert = ieInst;
+          }
+        }
+        else {
+          lastInsert = NULL;
+        }
+      }
+    }
+    
+    // Look for chains, not insert/shuffle sequences
+    if (inserts.size() > 1) {
+      // The vector from which we're extracting elements
+      llvm::Value * baseVec = NULL;
+      llvm::Value *ee = llvm::cast<llvm::InsertElementInst>((*inserts.begin()))->getOperand(1);
+      if (llvm::ExtractElementInst *eeInst = llvm::dyn_cast<llvm::ExtractElementInst>(ee)) {
+        baseVec = eeInst->getOperand(0);
+      }
+
+      bool sameBase = true;
+      for (llvm::SmallSet<llvm::Value *,16>::iterator i = inserts.begin(); i != inserts.end(); i++) {
+        llvm::InsertElementInst *ie = llvm::cast<llvm::InsertElementInst>(*i);
+        if (llvm::ExtractElementInst *ee = llvm::dyn_cast<llvm::ExtractElementInst>(ie->getOperand(1))) {
+          if (ee->getOperand(0) != baseVec) {
+            sameBase = false;
+            break;
+          }
+          int64_t from = lGetIntValue(ee->getIndexOperand());
+          int64_t to = lGetIntValue(ie->getOperand(2)); 
+          shuffleMap[to] = from;
+        }
+      }
+      if (sameBase) {
+        llvm::Value *shuffleIdxs = LLVMInt32Vector(shuffleMap);
+        llvm::Value *zeroVec = llvm::ConstantAggregateZero::get(shuffleIdxs->getType());
+        llvm::Value *shuffle = new llvm::ShuffleVectorInst(baseVec, zeroVec, shuffleIdxs, "shiftInZero", llvm::cast<llvm::Instruction>(lastInsert));
+        // For now, be lazy and let DCE clean up the Extracts/Inserts.
+        lastInsert->replaceAllUsesWith(shuffle);
+
+        modifiedAny = true;
+      }
+    }    
+    
+    DEBUG_END_PASS("ReplaceExtractInsertChainsPass");
+
+    return modifiedAny;
+}
+
+
+static llvm::Pass *
+CreateReplaceExtractInsertChainsPass() {
+    return new ReplaceExtractInsertChainsPass();
+}
diff --git a/stdlib.ispc b/stdlib.ispc
index 9b02d0ba..248f664a 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -170,6 +170,36 @@ static inline int64 rotate(int64 v, uniform int i) {
     return __rotate_i64(v, i);
 }
 
+__declspec(safe) 
+static inline float shift(float v, uniform int i) {
+    return __shift_float(v, i);
+}
+
+__declspec(safe) 
+static inline int8 shift(int8 v, uniform int i) {
+    return __shift_i8(v, i);
+}
+
+__declspec(safe) 
+static inline int16 shift(int16 v, uniform int i) {
+    return __shift_i16(v, i);
+}
+
+__declspec(safe) 
+static inline int32 shift(int32 v, uniform int i) {
+    return __shift_i32(v, i);
+}
+
+__declspec(safe) 
+static inline double shift(double v, uniform int i) {
+    return __shift_double(v, i);
+}
+
+__declspec(safe) 
+static inline int64 shift(int64 v, uniform int i) {
+    return __shift_i64(v, i);
+}
+
 __declspec(safe) 
 static inline float shuffle(float v, int i) {
     return __shuffle_float(v, i);

From f97a2d68c8e0ae0e10d11b3f08a415685a899f6f Mon Sep 17 00:00:00 2001
From: "james.brodman" <james.brodman@intel.com>
Date: Tue, 22 Oct 2013 18:29:20 -0400
Subject: [PATCH 02/24] Bugfix for non-const shift amt and unit tests.

---
 builtins/util.m4   |  4 +---
 tests/shift-1.ispc | 14 ++++++++++++++
 tests/shift-2.ispc | 15 +++++++++++++++
 tests/shift-3.ispc | 14 ++++++++++++++
 4 files changed, 44 insertions(+), 3 deletions(-)
 create mode 100644 tests/shift-1.ispc
 create mode 100644 tests/shift-2.ispc
 create mode 100644 tests/shift-3.ispc

diff --git a/builtins/util.m4 b/builtins/util.m4
index 4cb46310..c1582e51 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -815,7 +815,6 @@ forloop(i, 1, eval(WIDTH-1), `  %ret_`'i = insertelement <WIDTH x $1> %ret_`'eva
   ret <WIDTH x $1> %ret_`'eval(WIDTH-1)
 
 not_const:
-  ; store two instances of the vector into memory
   %ptr = alloca <WIDTH x $1>, i32 3
   %ptr0 = getelementptr <WIDTH x $1> * %ptr, i32 0
   store <WIDTH x $1> zeroinitializer, <WIDTH x $1> * %ptr0
@@ -824,8 +823,7 @@ not_const:
   %ptr2 = getelementptr <WIDTH x $1> * %ptr, i32 2
   store <WIDTH x $1> zeroinitializer, <WIDTH x $1> * %ptr2
 
-  ; compute offset in [0,vectorwidth-1], then index into the doubled-up vector
-  %offset = add i32 %1, 16
+  %offset = add i32 %1, WIDTH
   %ptr_as_elt_array = bitcast <WIDTH x $1> * %ptr to [eval(3*WIDTH) x $1] *
   %load_ptr = getelementptr [eval(3*WIDTH) x $1] * %ptr_as_elt_array, i32 0, i32 %offset
   %load_ptr_vec = bitcast $1 * %load_ptr to <WIDTH x $1> *
diff --git a/tests/shift-1.ispc b/tests/shift-1.ispc
new file mode 100644
index 00000000..2062e36b
--- /dev/null
+++ b/tests/shift-1.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+// Shifts by the compile-time constant -1.
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int source = aFOO[programIndex];
+    RET[programIndex] = shift(source, -1);
+}
+
+export void result(uniform float RET[]) {
+    varying int expected = programIndex;
+    if (expected < 0) expected = 0;
+    RET[programIndex] = expected;
+}
diff --git a/tests/shift-2.ispc b/tests/shift-2.ispc
new file mode 100644
index 00000000..6cb88e8a
--- /dev/null
+++ b/tests/shift-2.ispc
@@ -0,0 +1,15 @@
+
+export uniform int width() { return programCount; }
+
+// Shifts by a uniform amount that is computed from the argument b.
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int source = aFOO[programIndex];
+    uniform int amount = b - 6; // -1 when b == 5
+    RET[programIndex] = shift(source, amount);
+}
+
+export void result(uniform float RET[]) {
+    varying int expected = programIndex;
+    if (expected < 0) expected = 0;
+    RET[programIndex] = expected;
+}
diff --git a/tests/shift-3.ispc b/tests/shift-3.ispc
new file mode 100644
index 00000000..827d076f
--- /dev/null
+++ b/tests/shift-3.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+// Shifts by the compile-time constant +1.
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int source = aFOO[programIndex];
+    RET[programIndex] = shift(source, 1);
+}
+
+export void result(uniform float RET[]) {
+    varying int expected = 2 + programIndex;
+    if (expected > programCount) expected = 0;
+    RET[programIndex] = expected;
+}

From 4d289b16c283ace36aa193817bf1ac16a1fcc364 Mon Sep 17 00:00:00 2001
From: "james.brodman" <james.brodman@intel.com>
Date: Wed, 23 Oct 2013 14:25:43 -0400
Subject: [PATCH 03/24] Redesign after being hit with the KISS bat.

---
 builtins/util.m4 |  17 -----
 opt.cpp          | 168 +++++++++++++++++------------------------------
 stdlib.ispc      |  36 ++++++++--
 3 files changed, 92 insertions(+), 129 deletions(-)

diff --git a/builtins/util.m4 b/builtins/util.m4
index c1582e51..0e017322 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -798,23 +798,6 @@ not_const:
 }
 
 define <WIDTH x $1> @__shift_$1(<WIDTH x $1>, i32) nounwind readnone alwaysinline {
-  %isc = call i1 @__is_compile_time_constant_uniform_int32(i32 %1)
-  %zeropaddedvec = shufflevector <WIDTH x $1> %0, <WIDTH x $1> zeroinitializer,
-                     <eval(2*WIDTH) x i32> < forloop(i, 0, eval(2*WIDTH-2), `i32 i, ')i32 eval(2*WIDTH-1) >
-  br i1 %isc, label %is_const, label %not_const
-
-is_const:
-  ; though verbose, this turms into tight code if %1 is a constant
-forloop(i, 0, eval(WIDTH-1), `  
-  %delta_`'i = add i32 %1, i
-  %delta_clamped_`'i = and i32 %delta_`'i, eval(2*WIDTH-1)
-  %v_`'i = extractelement <eval(2*WIDTH) x $1> %zeropaddedvec, i32 %delta_clamped_`'i')
-  %ret_0 = insertelement <WIDTH x $1> zeroinitializer, $1 %v_0, i32 0
-forloop(i, 1, eval(WIDTH-1), `  %ret_`'i = insertelement <WIDTH x $1> %ret_`'eval(i-1), $1 %v_`'i, i32 i
-')
-  ret <WIDTH x $1> %ret_`'eval(WIDTH-1)
-
-not_const:
   %ptr = alloca <WIDTH x $1>, i32 3
   %ptr0 = getelementptr <WIDTH x $1> * %ptr, i32 0
   store <WIDTH x $1> zeroinitializer, <WIDTH x $1> * %ptr0
diff --git a/opt.cpp b/opt.cpp
index 0146e7cf..b1a22a1c 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -125,7 +125,7 @@ static llvm::Pass *CreateMakeInternalFuncsStaticPass();
 
 static llvm::Pass *CreateDebugPass(char * output);
 
-static llvm::Pass *CreateReplaceExtractInsertChainsPass();
+static llvm::Pass *CreateReplaceStdlibShiftPass();
 
 #define DEBUG_START_PASS(NAME)                                 \
     if (g->debugPrint &&                                       \
@@ -524,6 +524,7 @@ Optimize(llvm::Module *module, int optLevel) {
         optPM.add(llvm::createPromoteMemoryToRegisterPass());
         optPM.add(llvm::createAggressiveDCEPass());
 
+
         if (g->opt.disableGatherScatterOptimizations == false &&
             g->target->getVectorWidth() > 1) {
             optPM.add(llvm::createInstructionCombiningPass(), 210);
@@ -535,6 +536,9 @@ Optimize(llvm::Module *module, int optLevel) {
         }
         optPM.add(llvm::createDeadInstEliminationPass(), 220);
 
+        optPM.add(llvm::createIPConstantPropagationPass());
+        optPM.add(CreateReplaceStdlibShiftPass());
+
         // Max struct size threshold for scalar replacement is
         //    1) 4 fields (r,g,b,w)
         //    2) field size: vectorWidth * sizeof(float)
@@ -638,7 +642,6 @@ Optimize(llvm::Module *module, int optLevel) {
         optPM.add(CreateIsCompileTimeConstantPass(true));
         optPM.add(CreateIntrinsicsOptPass());
         optPM.add(CreateInstructionSimplifyPass());
-        optPM.add(CreateReplaceExtractInsertChainsPass());
 
         optPM.add(llvm::createMemCpyOptPass());
         optPM.add(llvm::createSCCPPass());
@@ -4883,6 +4886,7 @@ lMatchAvgDownInt16(llvm::Value *inst) {
 }
 #endif // !LLVM_3_1 && !LLVM_3_2
 
+
 bool
 PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) {
     DEBUG_START_PASS("PeepholePass");
@@ -4928,31 +4932,6 @@ CreatePeepholePass() {
   return new PeepholePass;
 }
 
-///////////////////////////////////////////////////////////////////////////
-// ReplaceExtractInsertChainsPass
-
-/** 
-    We occassionally get chains of ExtractElementInsts followed by 
-    InsertElementInsts.  Unfortunately, all of these can't be replaced by 
-    ShuffleVectorInsts as we don't know that things are constant at the time.
-
-    This Pass will detect such chains, and replace them with ShuffleVectorInsts
-    if all the appropriate values are constant.
- */
-
-class ReplaceExtractInsertChainsPass : public llvm::BasicBlockPass {
-public:
-    static char ID;
-    ReplaceExtractInsertChainsPass() : BasicBlockPass(ID) {
-    }
-
-    const char *getPassName() const { return "Resolve \"replace extract insert chains\""; }
-    bool runOnBasicBlock(llvm::BasicBlock &BB);
-
-};
-
-char ReplaceExtractInsertChainsPass::ID = 0;
-
 #include <iostream>
 
 /** Given an llvm::Value known to be an integer, return its value as
@@ -4966,97 +4945,74 @@ lGetIntValue(llvm::Value *offset) {
   return intOffset->getSExtValue();
 }
 
+///////////////////////////////////////////////////////////////////////////
+// ReplaceStdlibShiftPass
+
+/** Replaces calls to the __shift_* builtins that have a constant shift
+    amount with a single shufflevector that shifts in zeros. */
+class ReplaceStdlibShiftPass : public llvm::BasicBlockPass {
+public:
+    static char ID;
+    ReplaceStdlibShiftPass() : BasicBlockPass(ID) {}
+
+    const char *getPassName() const { return "Replace stdlib shift calls with shuffles"; }
+    bool runOnBasicBlock(llvm::BasicBlock &BB);
+};
+
+char ReplaceStdlibShiftPass::ID = 0;
+
 bool
-ReplaceExtractInsertChainsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
-    DEBUG_START_PASS("ReplaceExtractInsertChainsPass");
+ReplaceStdlibShiftPass::runOnBasicBlock(llvm::BasicBlock &bb) {
+    DEBUG_START_PASS("ReplaceStdlibShiftPass");
     bool modifiedAny = false;
+    
+    llvm::Function *shifts[6];
+    shifts[0] = m->module->getFunction("__shift_i8");
+    shifts[1] = m->module->getFunction("__shift_i16");
+    shifts[2] = m->module->getFunction("__shift_i32");
+    shifts[3] = m->module->getFunction("__shift_i64");
+    shifts[4] = m->module->getFunction("__shift_float");
+    shifts[5] = m->module->getFunction("__shift_double");
 
-    // Initialize our mapping to the first spot in the zero vector
-    int vectorWidth = g->target->getVectorWidth();
-    int shuffleMap[vectorWidth];
-    for (int i = 0; i < vectorWidth; i++) {
-      shuffleMap[i] = vectorWidth;
-    }
+    for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
+        llvm::Instruction *inst = &*iter;
 
-    // Hack-y.  16 is likely the upper limit for now.
-    llvm::SmallSet<llvm::Value *, 16> inserts;
-
-    // save the last Insert in the chain
-    llvm::Value * lastInsert = NULL;
-
-    for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) {
-      // Iterate through the instructions looking for InsertElementInsts
-      llvm::InsertElementInst *ieInst = llvm::dyn_cast<llvm::InsertElementInst>(&*i);
-      if (ieInst == NULL) {
-        // These aren't the instructions you're looking for.
-        continue;
-      }
-      
-      llvm::Value * base = ieInst->getOperand(0);
-      if ( (llvm::isa<llvm::UndefValue>(base))
-           || (llvm::isa<llvm::ConstantAggregateZero>(base))
-           || (base == lastInsert)) {
-        // if source for insert scalar is 0 or an EEInst, add insert
-        llvm::Value *scalar = ieInst->getOperand(1);
-        if (llvm::ExtractElementInst *eeInst = llvm::dyn_cast<llvm::ExtractElementInst>(scalar)) {
-          // We're only going to deal with Inserts into a Constant vector lane
-          if (llvm::isa<llvm::Constant>(eeInst->getOperand(1))) {
-            inserts.insert(ieInst);
-            lastInsert = ieInst;
+        if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst)) {
+          llvm::Function *func = ci->getCalledFunction();
+          for (int i = 0; i < 6; i++) {
+            if (shifts[i] == func) {
+              // we matched a call
+              llvm::Value *shiftedVec = ci->getArgOperand(0);
+              llvm::Value *shiftAmt = ci->getArgOperand(1);
+              if (llvm::isa<llvm::Constant>(shiftAmt)) {
+                int vectorWidth = g->target->getVectorWidth();
+                int shuffleVals[vectorWidth];
+                int shiftInt = lGetIntValue(shiftAmt);
+                for (int i = 0; i < vectorWidth; i++) {
+                  int s = i + shiftInt;
+                  s = (s < 0) ? vectorWidth : s;
+                  s = (s >= vectorWidth) ? vectorWidth : s;
+                  shuffleVals[i] = s;
+                }
+                llvm::Value *shuffleIdxs = LLVMInt32Vector(shuffleVals);
+                llvm::Value *zeroVec = llvm::ConstantAggregateZero::get(shiftedVec->getType());
+                llvm::Value *shuffle = new llvm::ShuffleVectorInst(shiftedVec, zeroVec, 
+                                                                   shuffleIdxs, "vecShift", ci);
+                ci->replaceAllUsesWith(shuffle);
+                modifiedAny = true;
+              }
+            }
           }
         }
-        else if (llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(scalar)) {
-          if (ci->isZero()) {
-            inserts.insert(ieInst);
-            lastInsert = ieInst;
-          }
-        }
-        else {
-          lastInsert = NULL;
-        }
-      }
     }
     
-    // Look for chains, not insert/shuffle sequences
-    if (inserts.size() > 1) {
-      // The vector from which we're extracting elements
-      llvm::Value * baseVec = NULL;
-      llvm::Value *ee = llvm::cast<llvm::InsertElementInst>((*inserts.begin()))->getOperand(1);
-      if (llvm::ExtractElementInst *eeInst = llvm::dyn_cast<llvm::ExtractElementInst>(ee)) {
-        baseVec = eeInst->getOperand(0);
-      }
-
-      bool sameBase = true;
-      for (llvm::SmallSet<llvm::Value *,16>::iterator i = inserts.begin(); i != inserts.end(); i++) {
-        llvm::InsertElementInst *ie = llvm::cast<llvm::InsertElementInst>(*i);
-        if (llvm::ExtractElementInst *ee = llvm::dyn_cast<llvm::ExtractElementInst>(ie->getOperand(1))) {
-          if (ee->getOperand(0) != baseVec) {
-            sameBase = false;
-            break;
-          }
-          int64_t from = lGetIntValue(ee->getIndexOperand());
-          int64_t to = lGetIntValue(ie->getOperand(2)); 
-          shuffleMap[to] = from;
-        }
-      }
-      if (sameBase) {
-        llvm::Value *shuffleIdxs = LLVMInt32Vector(shuffleMap);
-        llvm::Value *zeroVec = llvm::ConstantAggregateZero::get(shuffleIdxs->getType());
-        llvm::Value *shuffle = new llvm::ShuffleVectorInst(baseVec, zeroVec, shuffleIdxs, "shiftInZero", llvm::cast<llvm::Instruction>(lastInsert));
-        // For now, be lazy and let DCE clean up the Extracts/Inserts.
-        lastInsert->replaceAllUsesWith(shuffle);
-
-        modifiedAny = true;
-      }
-    }    
-    
-    DEBUG_END_PASS("ReplaceExtractInsertChainsPass");
+    DEBUG_END_PASS("ReplaceStdlibShiftPass");
 
     return modifiedAny;
 }
 
 
 static llvm::Pass *
-CreateReplaceExtractInsertChainsPass() {
-    return new ReplaceExtractInsertChainsPass();
+CreateReplaceStdlibShiftPass() {
+    return new ReplaceStdlibShiftPass();
 }
diff --git a/stdlib.ispc b/stdlib.ispc
index 248f664a..6768594b 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -172,32 +172,56 @@ static inline int64 rotate(int64 v, uniform int i) {
 
 __declspec(safe) 
 static inline float shift(float v, uniform int i) {
-    return __shift_float(v, i);
+  varying float result;
+  unmasked {
+    result = __shift_float(v, i);
+  }
+  return result;
 }
 
 __declspec(safe) 
 static inline int8 shift(int8 v, uniform int i) {
-    return __shift_i8(v, i);
+  varying int8 result;
+  unmasked {
+    result = __shift_i8(v, i);
+  }
+  return result;
 }
 
 __declspec(safe) 
 static inline int16 shift(int16 v, uniform int i) {
-    return __shift_i16(v, i);
+  varying int16 result;
+  unmasked {
+    result = __shift_i16(v, i);
+  }
+  return result;
 }
 
 __declspec(safe) 
 static inline int32 shift(int32 v, uniform int i) {
-    return __shift_i32(v, i);
+  varying int32 result;
+  unmasked {
+    result = __shift_i32(v, i);
+  }
+  return result;
 }
 
 __declspec(safe) 
 static inline double shift(double v, uniform int i) {
-    return __shift_double(v, i);
+  varying double result;
+  unmasked {
+    result = __shift_double(v, i);
+  }
+  return result;
 }
 
 __declspec(safe) 
 static inline int64 shift(int64 v, uniform int i) {
-    return __shift_i64(v, i);
+  varying int64 result;
+  unmasked {
+    result = __shift_i64(v, i);
+  }
+  return result;
 }
 
 __declspec(safe) 

From c4ad8f6ed4d5f72e43f63b805f557d506f3a54a0 Mon Sep 17 00:00:00 2001
From: "james.brodman" <james.brodman@intel.com>
Date: Wed, 23 Oct 2013 15:51:59 -0400
Subject: [PATCH 04/24] Add docs/generic impls

---
 docs/ispc.rst              | 18 +++++++-
 examples/intrinsics/sse4.h | 84 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+), 1 deletion(-)

diff --git a/docs/ispc.rst b/docs/ispc.rst
index eac9b24e..a99a3990 100644
--- a/docs/ispc.rst
+++ b/docs/ispc.rst
@@ -3719,6 +3719,22 @@ the size of the gang (it is masked to ensure valid offsets).
     double rotate(double value, uniform int offset)
 
 
+The ``shift()`` function lets each program instance read the given value
+from the neighbor ``offset`` steps away.  It is similar to ``rotate()``,
+except that values do not wrap around the gang: a program instance whose
+neighbor would lie outside the gang receives zero instead.
+
+
+::
+
+    int8 shift(int8 value, uniform int offset)
+    int16 shift(int16 value, uniform int offset)
+    int32 shift(int32 value, uniform int offset)
+    int64 shift(int64 value, uniform int offset)
+    float shift(float value, uniform int offset)
+    double shift(double value, uniform int offset)
+
+
 Finally, the ``shuffle()`` functions allow two variants of fully general
 shuffling of values among the program instances.  For the first version,
 each program instance's value of permutation gives the program instance
@@ -3751,7 +3767,7 @@ the last element of ``value1``, etc.)
     double shuffle(double value0, double value1, int permutation)
 
 Finally, there are primitive operations that extract and set values in the
-SIMD lanes.  You can implement all of the broadcast, rotate, and shuffle
+SIMD lanes.  You can implement all of the broadcast, rotate, shift, and shuffle
 operations described above in this section from these routines, though in
 general, not as efficiently.  These routines are useful for implementing
 other reductions and cross-lane communication that isn't included in the
diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h
index ff00d920..d1178751 100644
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -598,6 +598,20 @@ static FORCEINLINE __vec4_i8 __rotate_i8(__vec4_i8 v, int delta) {
                      __extract_element(v, (delta+3) & 0x3));
 }
 
+// Lane shift without wrap-around: output lane i takes input lane (i + delta)
+// when that index lies in [0, 3]; every other output lane is zero.
+static FORCEINLINE __vec4_i8 __shift_i8(__vec4_i8 v, int delta) {
+  int8_t lanes[4];
+  for (int lane = 0; lane < 4; ++lane) {
+    // Index of the input lane feeding this output lane.
+    const int src = delta + lane;
+    const bool inRange = (src >= 0) && (src < 4);
+    // Out-of-range sources contribute zero rather than wrapping.
+    lanes[lane] = inRange ? __extract_element(v, src) : 0;
+  }
+  return __vec4_i8(lanes[0], lanes[1], lanes[2], lanes[3]);
+}
+
 static FORCEINLINE __vec4_i8 __shuffle_i8(__vec4_i8 v, __vec4_i32 index) {
     return __vec4_i8(__extract_element(v, __extract_element(index, 0) & 0x3),
                      __extract_element(v, __extract_element(index, 1) & 0x3),
@@ -870,6 +884,20 @@ static FORCEINLINE __vec4_i16 __rotate_i16(__vec4_i16 v, int delta) {
                       __extract_element(v, (delta+3) & 0x3));
 }
 
+// Lane shift without wrap-around: output lane i takes input lane (i + delta)
+// when that index lies in [0, 3]; every other output lane is zero.
+static FORCEINLINE __vec4_i16 __shift_i16(__vec4_i16 v, int delta) {
+  int16_t lanes[4];
+  for (int lane = 0; lane < 4; ++lane) {
+    // Index of the input lane feeding this output lane.
+    const int src = delta + lane;
+    const bool inRange = (src >= 0) && (src < 4);
+    // Out-of-range sources contribute zero rather than wrapping.
+    lanes[lane] = inRange ? __extract_element(v, src) : 0;
+  }
+  return __vec4_i16(lanes[0], lanes[1], lanes[2], lanes[3]);
+}
+
 static FORCEINLINE __vec4_i16 __shuffle_i16(__vec4_i16 v, __vec4_i32 index) {
     return __vec4_i16(__extract_element(v, __extract_element(index, 0) & 0x3),
                       __extract_element(v, __extract_element(index, 1) & 0x3),
@@ -1128,6 +1156,20 @@ static FORCEINLINE __vec4_i32 __rotate_i32(__vec4_i32 v, int delta) {
                       __extract_element(v, (delta+3) & 0x3));
 }
 
+// Lane shift without wrap-around: output lane i takes input lane (i + delta)
+// when that index lies in [0, 3]; every other output lane is zero.
+static FORCEINLINE __vec4_i32 __shift_i32(__vec4_i32 v, int delta) {
+  int32_t lanes[4];
+  for (int lane = 0; lane < 4; ++lane) {
+    // Index of the input lane feeding this output lane.
+    const int src = delta + lane;
+    const bool inRange = (src >= 0) && (src < 4);
+    // Out-of-range sources contribute zero rather than wrapping.
+    lanes[lane] = inRange ? __extract_element(v, src) : 0;
+  }
+  return __vec4_i32(lanes[0], lanes[1], lanes[2], lanes[3]);
+}
+
 static FORCEINLINE __vec4_i32 __shuffle_i32(__vec4_i32 v, __vec4_i32 index) {
     return __vec4_i32(__extract_element(v, __extract_element(index, 0) & 0x3),
                       __extract_element(v, __extract_element(index, 1) & 0x3),
@@ -1403,6 +1445,20 @@ static FORCEINLINE __vec4_i64 __rotate_i64(__vec4_i64 v, int delta) {
                       __extract_element(v, (delta+3) & 0x3));
 }
 
+static FORCEINLINE __vec4_i64 __shift_i64(__vec4_i64 v, int delta) {
+  int64_t v1, v2, v3, v4;
+  int d1, d2, d3, d4;
+  d1 = delta+0;
+  d2 = delta+1;
+  d3 = delta+2;
+  d4 = delta+3;
+  v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0;
+  v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0;
+  v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0;
+  v4 = ((d4 >= 0) && (d4 < 4)) ? __extract_element(v, d4) : 0;
+  return __vec4_i64(v1, v2, v3, v4);
+}
+
 static FORCEINLINE __vec4_i64 __shuffle_i64(__vec4_i64 v, __vec4_i32 index) {
     return __vec4_i64(__extract_element(v, __extract_element(index, 0) & 0x3),
                       __extract_element(v, __extract_element(index, 1) & 0x3),
@@ -1523,6 +1579,20 @@ static FORCEINLINE __vec4_f __rotate_float(__vec4_f v, int delta) {
                     __extract_element(v, (delta+3) & 0x3));
 }
 
+static FORCEINLINE __vec4_f __shift_float(__vec4_f v, int delta) {
+  float v1, v2, v3, v4;
+  int d1, d2, d3, d4;
+  d1 = delta+0;
+  d2 = delta+1;
+  d3 = delta+2;
+  d4 = delta+3;
+  v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0.f;
+  v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0.f;
+  v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0.f;
+  v4 = ((d4 >= 0) && (d4 < 4)) ? __extract_element(v, d4) : 0.f;
+  return __vec4_f(v1, v2, v3, v4);
+}
+
 static FORCEINLINE __vec4_f __shuffle_float(__vec4_f v, __vec4_i32 index) {
     return __vec4_f(__extract_element(v, __extract_element(index, 0) & 0x3),
                     __extract_element(v, __extract_element(index, 1) & 0x3),
@@ -1676,6 +1746,20 @@ static FORCEINLINE __vec4_d __rotate_double(__vec4_d v, int delta) {
                     __extract_element(v, (delta+3) & 0x3));
 }
 
+static FORCEINLINE __vec4_d __shift_double(__vec4_d v, int delta) {
+  double v1, v2, v3, v4;
+  int d1, d2, d3, d4;
+  d1 = delta+0;
+  d2 = delta+1;
+  d3 = delta+2;
+  d4 = delta+3;
+  v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0;
+  v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0;
+  v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0;
+  v4 = ((d4 >= 0) && (d4 < 4)) ? __extract_element(v, d4) : 0;
+  return __vec4_d(v1, v2, v3, v4);
+}
+
 static FORCEINLINE __vec4_d __shuffle_double(__vec4_d v, __vec4_i32 index) {
     return __vec4_d(__extract_element(v, __extract_element(index, 0) & 0x3),
                     __extract_element(v, __extract_element(index, 1) & 0x3),

From d2b89e0e3741a85b49f51ea7a7bbdb04ce61eb4e Mon Sep 17 00:00:00 2001
From: "james.brodman" <james.brodman@intel.com>
Date: Wed, 23 Oct 2013 18:01:01 -0400
Subject: [PATCH 05/24] Tweak generic target.

---
 examples/intrinsics/sse4.h | 127 ++++++++++++++++++++-----------------
 opt.cpp                    |   7 +-
 2 files changed, 73 insertions(+), 61 deletions(-)

diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h
index d1178751..67c46848 100644
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -108,22 +108,21 @@ struct __vec4_i64 {
 };
 
 struct __vec4_i32 {
-    __vec4_i32() { }
+    FORCEINLINE __vec4_i32() : v(_mm_setzero_si128()) { }
     FORCEINLINE __vec4_i32(__m128i vv) : v(vv) {  }
-    FORCEINLINE __vec4_i32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+    FORCEINLINE __vec4_i32(int32_t a, int32_t b, int32_t c, int32_t d) {
         v = _mm_set_epi32(d, c, b, a);
     }
-    FORCEINLINE __vec4_i32(uint32_t *p) {
+    FORCEINLINE __vec4_i32(int32_t *p) {
         v = _mm_loadu_si128((__m128i *)p);
     }
-
+    FORCEINLINE __vec4_i32(const __vec4_i32 &other) : v(other.v) {}
+    FORCEINLINE __vec4_i32& operator =(const __vec4_i32 &o) { v=o.v; return *this; }
     FORCEINLINE operator __m128() const { return _mm_castsi128_ps(v); }
-
+    
     __m128i v;
 };
 
-static inline int32_t __extract_element(__vec4_i32 v, int index);
-
 struct __vec4_i16 {
     __vec4_i16() { }
     FORCEINLINE __vec4_i16(__m128i vv) : v(vv) {  }
@@ -215,6 +214,64 @@ INSERT_EXTRACT(__vec1_i64, int64_t)
 INSERT_EXTRACT(__vec1_f, float)
 INSERT_EXTRACT(__vec1_d, double)
 
+static FORCEINLINE bool __extract_element(const __vec4_i1 &v, int index) {
+    return ((int32_t *)&v)[index] ? true : false;
+}
+
+static FORCEINLINE void __insert_element(__vec4_i1 *v, int index, bool val) {
+    ((int32_t *)v)[index] = val ? -1 : 0;
+}
+
+static FORCEINLINE int8_t __extract_element(const __vec4_i8 &v, int index) {
+    return ((int8_t *)&v)[index];
+}
+
+static FORCEINLINE void __insert_element(__vec4_i8 *v, int index, int8_t val) {
+    ((int8_t *)v)[index] = val;
+}
+
+static FORCEINLINE int16_t __extract_element(const __vec4_i16 &v, int index) {
+    return ((int16_t *)&v)[index];
+}
+
+static FORCEINLINE void __insert_element(__vec4_i16 *v, int index, int16_t val) {
+    ((int16_t *)v)[index] = val;
+}
+
+static FORCEINLINE int32_t __extract_element(const __vec4_i32 &v, int index) {
+    return ((int32_t *)&v)[index];
+}
+
+static FORCEINLINE void __insert_element(__vec4_i32 *v, int index, int32_t val) {
+    ((int32_t *)v)[index] = val;
+}
+
+static FORCEINLINE int64_t __extract_element(const __vec4_i64 &v, int index) {
+    return ((int64_t *)&v)[index];
+}
+
+static FORCEINLINE void __insert_element(__vec4_i64 *v, int index, int64_t val) {
+    ((int64_t *)v)[index] = val;
+}
+
+static FORCEINLINE float __extract_element(const __vec4_f &v, int index) {
+    return ((float *)&v)[index];
+}
+
+static FORCEINLINE void __insert_element(__vec4_f *v, int index, float val) {
+    ((float *)v)[index] = val;
+}
+
+static FORCEINLINE double __extract_element(const __vec4_d &v, int index) {
+    return ((double *)&v)[index];
+}
+
+static FORCEINLINE void __insert_element(__vec4_d *v, int index, double val) {
+    ((double *)v)[index] = val;
+}
+
+
+
 #define CAST_BITS_SCALAR(TO, FROM)                  \
 static FORCEINLINE TO __cast_bits(TO, FROM v) {     \
     union {                                         \
@@ -313,13 +370,6 @@ static FORCEINLINE __vec4_i1 __select(__vec4_i1 mask, __vec4_i1 a, __vec4_i1 b)
     return _mm_blendv_ps(b.v, a.v, mask.v);
 }
 
-static FORCEINLINE bool __extract_element(__vec4_i1 v, int index) {
-    return ((int32_t *)&v)[index] ? true : false;
-}
-
-static FORCEINLINE void __insert_element(__vec4_i1 *v, int index, bool val) {
-    ((int32_t *)v)[index] = val ? -1 : 0;
-}
 
 template <int ALIGN> static FORCEINLINE __vec4_i1 __load(const __vec4_i1 *v) {
     // FIXME: handle align of 16...
@@ -564,13 +614,6 @@ static FORCEINLINE __vec4_i8 __select(__vec4_i1 mask, __vec4_i8 a, __vec4_i8 b)
                                                         _mm_extract_epi8(b.v, 3));
 }
 
-static FORCEINLINE int8_t __extract_element(__vec4_i8 v, int index) {
-    return ((int8_t *)&v)[index];
-}
-
-static FORCEINLINE void __insert_element(__vec4_i8 *v, int index, int8_t val) {
-    ((int8_t *)v)[index] = val;
-}
 
 template <class RetVecType> __vec4_i8 __smear_i8(int8_t v);
 template <> FORCEINLINE __vec4_i8 __smear_i8<__vec4_i8>(int8_t v) {
@@ -850,13 +893,6 @@ static FORCEINLINE __vec4_i16 __select(__vec4_i1 mask, __vec4_i16 a, __vec4_i16
                                                          _mm_extract_epi16(b.v, 3));
 }
 
-static FORCEINLINE int16_t __extract_element(__vec4_i16 v, int index) {
-    return ((int16_t *)&v)[index];
-}
-
-static FORCEINLINE void __insert_element(__vec4_i16 *v, int index, int16_t val) {
-    ((int16_t *)v)[index] = val;
-}
 
 template <class RetVecType> __vec4_i16 __smear_i16(int16_t v);
 template <> FORCEINLINE __vec4_i16 __smear_i16<__vec4_i16>(int16_t v) {
@@ -1137,13 +1173,6 @@ template <> FORCEINLINE __vec4_i32 __undef_i32<__vec4_i32>() {
     return __vec4_i32();
 }
 
-static FORCEINLINE int32_t __extract_element(__vec4_i32 v, int index) {
-    return ((int32_t *)&v)[index];
-}
-
-static FORCEINLINE void __insert_element(__vec4_i32 *v, int index, int32_t val) {
-    ((int32_t *)v)[index] = val;
-}
 
 static FORCEINLINE __vec4_i32 __broadcast_i32(__vec4_i32 v, int index) {
     return _mm_set1_epi32(__extract_element(v, index));
@@ -1156,9 +1185,10 @@ static FORCEINLINE __vec4_i32 __rotate_i32(__vec4_i32 v, int delta) {
                       __extract_element(v, (delta+3) & 0x3));
 }
 
-static FORCEINLINE __vec4_i32 __shift_i32(__vec4_i32 v, int delta) {
+#include <iostream>
+static FORCEINLINE __vec4_i32 __shift_i32(const __vec4_i32 &v, int delta) {
   int32_t v1, v2, v3, v4;
-  int d1, d2, d3, d4;
+  int32_t d1, d2, d3, d4;
   d1 = delta+0;
   d2 = delta+1;
   d3 = delta+2;
@@ -1425,13 +1455,6 @@ template <> FORCEINLINE __vec4_i64 __undef_i64<__vec4_i64>() {
     return __vec4_i64();
 }
 
-static FORCEINLINE int64_t __extract_element(__vec4_i64 v, int index) {
-    return ((int64_t *)&v)[index];
-}
-
-static FORCEINLINE void __insert_element(__vec4_i64 *v, int index, int64_t val) {
-    ((int64_t *)v)[index] = val;
-}
 
 static FORCEINLINE __vec4_i64 __broadcast_i64(__vec4_i64 v, int index) {
     uint64_t val = __extract_element(v, index);
@@ -1560,13 +1583,6 @@ template <> FORCEINLINE __vec4_f __undef_float<__vec4_f>() {
     return __vec4_f();
 }
 
-static FORCEINLINE float __extract_element(__vec4_f v, int index) {
-    return ((float *)&v)[index];
-}
-
-static FORCEINLINE void __insert_element(__vec4_f *v, int index, float val) {
-    ((float *)v)[index] = val;
-}
 
 static FORCEINLINE __vec4_f __broadcast_float(__vec4_f v, int index) {
     return _mm_set1_ps(__extract_element(v, index));
@@ -1726,13 +1742,6 @@ template <> FORCEINLINE __vec4_d __undef_double<__vec4_d>() {
     return __vec4_d();
 }
 
-static FORCEINLINE double __extract_element(__vec4_d v, int index) {
-    return ((double *)&v)[index];
-}
-
-static FORCEINLINE void __insert_element(__vec4_d *v, int index, double val) {
-    ((double *)v)[index] = val;
-}
 
 static FORCEINLINE __vec4_d __broadcast_double(__vec4_d v, int index) {
     return __vec4_d(_mm_set1_pd(__extract_element(v, index)),
@@ -1973,7 +1982,7 @@ static FORCEINLINE __vec4_f __cast_sitofp(__vec4_f, __vec4_i16 val) {
                     (float)((int16_t)_mm_extract_epi16(val.v, 3)));
 }
 
-static FORCEINLINE __vec4_f __cast_sitofp(__vec4_f, __vec4_i32 val) {
+static FORCEINLINE __vec4_f __cast_sitofp(__vec4_f, const __vec4_i32 &val) {
     return _mm_cvtepi32_ps(val.v);
 }
 
diff --git a/opt.cpp b/opt.cpp
index b1a22a1c..b018d35d 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -536,8 +536,11 @@ Optimize(llvm::Module *module, int optLevel) {
         }
         optPM.add(llvm::createDeadInstEliminationPass(), 220);
 
-        optPM.add(llvm::createIPConstantPropagationPass());
-        optPM.add(CreateReplaceStdlibShiftPass());
+        if (g->target->getISA() != Target::GENERIC) {
+          // Just use the builtins for generic targets.
+          optPM.add(llvm::createIPConstantPropagationPass());
+          optPM.add(CreateReplaceStdlibShiftPass());
+        }
 
         // Max struct size threshold for scalar replacement is
         //    1) 4 fields (r,g,b,w)

From 814ee67519771dc1f8b4002affc2426eb0e7e427 Mon Sep 17 00:00:00 2001
From: Ilia Filippov <ili.filippov@gmail.com>
Date: Thu, 24 Oct 2013 11:51:33 +0400
Subject: [PATCH 06/24] patch and regression test for problem with vzeroupper

---
 llvm_patches/3_3_0001-Fix-PR16807.patch       |  0
 .../3_3_r193261_bug17631_win_vzeroupper.patch | 69 +++++++++++++++++++
 tests/chkstk.ispc                             | 49 +++++++++++++
 3 files changed, 118 insertions(+)
 mode change 100755 => 100644 llvm_patches/3_3_0001-Fix-PR16807.patch
 create mode 100644 llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch
 create mode 100644 tests/chkstk.ispc

diff --git a/llvm_patches/3_3_0001-Fix-PR16807.patch b/llvm_patches/3_3_0001-Fix-PR16807.patch
old mode 100755
new mode 100644
diff --git a/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch b/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch
new file mode 100644
index 00000000..b6abb1d3
--- /dev/null
+++ b/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch
@@ -0,0 +1,69 @@
+From b9b016cda57d8afc26a150de7ee329b54a994c85 Mon Sep 17 00:00:00 2001
+From: Michael Liao <michael.hliao@gmail.com>
+Date: Mon, 21 Oct 2013 17:47:58 -0700
+Subject: [PATCH] Fix PR17631
+
+- Skip instructions added in prolog. For specific targets, prolog may
+  insert helper function calls (e.g. _chkstk will be called when
+  there're more than 4K bytes allocated on stack). However, these
+  helpers don't use/def YMM/XMM registers.
+---
+ lib/Target/X86/X86VZeroUpper.cpp | 11 ++++++++++-
+ test/CodeGen/X86/pr17631.ll      | 22 ++++++++++++++++++++++
+ 2 files changed, 32 insertions(+), 1 deletion(-)
+ create mode 100644 test/CodeGen/X86/pr17631.ll
+
+diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
+index 477f75a..0d37a7d 100644
+--- lib/Target/X86/X86VZeroUpper.cpp
++++ lib/Target/X86/X86VZeroUpper.cpp
+@@ -231,8 +231,17 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
+   bool BBHasCall = false;
+ 
+   for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
+-    MachineInstr *MI = I;
+     DebugLoc dl = I->getDebugLoc();
++    MachineInstr *MI = I;
++
++    // Don't need to check instructions added in prolog.
++    // In prolog, special function calls may be added for specific targets
++    // (e.g. on Windows, a prolog helper '_chkstk' is called when the local
++    // variables exceed 4K bytes on stack.) These helpers won't use/def YMM/XMM
++    // registers.
++    if (MI->getFlag(MachineInstr::FrameSetup))
++      continue;
++
+     bool isControlFlow = MI->isCall() || MI->isReturn();
+ 
+     // Shortcut: don't need to check regular instructions in dirty state.
+diff --git a/test/CodeGen/X86/pr17631.ll b/test/CodeGen/X86/pr17631.ll
+new file mode 100644
+index 0000000..a572ff2
+--- /dev/null
++++ test/CodeGen/X86/pr17631.ll
+@@ -0,0 +1,22 @@
++; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-pc-win32 | FileCheck %s
++ 
++%struct_type = type { [64 x <8 x float>], <8 x float> }
++ 
++; Function Attrs: nounwind readnone
++declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>)
++ 
++; Function Attrs: nounwind
++define i32 @equal(<8 x i32> %A) {
++allocas:
++  %first_alloc  = alloca [64 x <8 x i32>]
++  %second_alloc = alloca %struct_type
++ 
++  %A1 = bitcast <8 x i32> %A to <8 x float>
++  %A2 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %A1)
++  ret i32 %A2
++}
++
++; CHECK: equal
++; CHECK-NOT: vzeroupper
++; CHECK: _chkstk
++; CHECK: ret
+-- 
+1.8.1.2
+
diff --git a/tests/chkstk.ispc b/tests/chkstk.ispc
new file mode 100644
index 00000000..bd0a8299
--- /dev/null
+++ b/tests/chkstk.ispc
@@ -0,0 +1,49 @@
+//test for 17631 bug in LLVM.
+
+export uniform int width() { return programCount; }
+ 
+struct s_temp
+{
+    float temp[64];
+};
+ 
+int CompressBlockBC7(int A, uniform float b)
+{
+    // This declaration caused a problem because LLVM inserted
+    // _chkstk after the declaration and vzeroupper before its call.
+    // A will be in ymm at avx, so we lose a half of it.
+    s_temp _state;
+    // These two loops are here to prevent elimination of declaration
+    for (int i=0; i<64; i++) {
+        float ii = i;
+        _state.temp[i] = b + sin(ii);
+    }
+    float r = 0;
+    for (int j=0; j<64; j+=9) {
+        r += _state.temp[j] + j;
+    }
+
+    // Here upper bits of A in ymm can be zeros. This will crash the test.
+    int B;
+    if (A!=0) {
+        B = 20;
+    }
+    else {
+        B = 30;
+    }
+    if(A == 1) {
+        B = r;
+    }
+    return B;
+}
+ 
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int A = programIndex;
+    RET[programIndex] = CompressBlockBC7(A, b);
+}
+ 
+export void result(uniform float RET[]) {
+    RET[programIndex] = 20;
+    RET[0] = 30;
+    RET[1] = 292;
+}

From 9b5ee1b31bea40c3d94097fd352a7c50a91b9487 Mon Sep 17 00:00:00 2001
From: Dmitry Babokin <babokin@gmail.com>
Date: Wed, 23 Oct 2013 18:42:49 +0400
Subject: [PATCH 07/24] fail_db update on Linux

---
 fail_db.txt | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fail_db.txt b/fail_db.txt
index 9c43c7f0..367cdf18 100644
--- a/fail_db.txt
+++ b/fail_db.txt
@@ -906,13 +906,8 @@
 ./tests/test-141.ispc runfail  x86-64     generic-16   Linux LLVM 3.3 clang++3.3 -O2 *
 ./tests/test-143.ispc runfail  x86-64     generic-16   Linux LLVM 3.3 clang++3.3 -O2 *
 ./tests/ptr-assign-lhs-math-1.ispc compfail  x86-64     generic-16   Linux LLVM 3.3 clang++3.3 -O2 *
-./tests/avg-down-int8.ispc compfail     x86  avx1.1-i32x16   Linux LLVM 3.3 clang++3.3 -O2 *
-./tests/avg-up-int8.ispc compfail     x86  avx1.1-i32x16   Linux LLVM 3.3 clang++3.3 -O2 *
-./tests/avg-down-int8.ispc compfail  x86-64  avx1.1-i32x16   Linux LLVM 3.3 clang++3.3 -O2 *
-./tests/avg-up-int8.ispc compfail  x86-64  avx1.1-i32x16   Linux LLVM 3.3 clang++3.3 -O2 *
 ./tests/test-141.ispc runfail     x86    avx2-i32x16   Linux LLVM 3.3 clang++3.3 -O2 *
 ./tests/test-141.ispc runfail  x86-64    avx2-i32x16   Linux LLVM 3.3 clang++3.3 -O2 *
-./tests/reduce-equal-4.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.3 clang++3.3 -O2 *
 ./tests/funcptr-null-4.ispc runfail     x86     sse4-i8x16   Linux LLVM 3.4 clang++3.3 -O2 *
 ./tests/funcptr-null-5.ispc runfail     x86     sse4-i8x16   Linux LLVM 3.4 clang++3.3 -O2 *
 ./tests/funcptr-null-6.ispc runfail     x86     sse4-i8x16   Linux LLVM 3.4 clang++3.3 -O2 *

From 58aea1b61c27b1305318b55ca899895a8da699a8 Mon Sep 17 00:00:00 2001
From: Dmitry Babokin <babokin@gmail.com>
Date: Fri, 25 Oct 2013 21:42:57 +0400
Subject: [PATCH 08/24] Fail_db update with Linux passed with LLVM 3.4

---
 fail_db.txt | 34 ----------------------------------
 1 file changed, 34 deletions(-)

diff --git a/fail_db.txt b/fail_db.txt
index 367cdf18..b3163869 100644
--- a/fail_db.txt
+++ b/fail_db.txt
@@ -914,45 +914,11 @@
 ./tests/funcptr-null-4.ispc runfail  x86-64     sse4-i8x16   Linux LLVM 3.4 clang++3.3 -O2 *
 ./tests/funcptr-null-5.ispc runfail  x86-64     sse4-i8x16   Linux LLVM 3.4 clang++3.3 -O2 *
 ./tests/funcptr-null-6.ispc runfail  x86-64     sse4-i8x16   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/avg-down-int8.ispc compfail     x86    avx1-i32x16   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/avg-up-int8.ispc compfail     x86    avx1-i32x16   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/avg-down-int8.ispc compfail  x86-64    avx1-i32x16   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/avg-up-int8.ispc compfail  x86-64    avx1-i32x16   Linux LLVM 3.4 clang++3.3 -O2 *
 ./tests/ptr-assign-lhs-math-1.ispc compfail  x86-64      generic-4   Linux LLVM 3.4 clang++3.3 -O2 *
 ./tests/short-vec-8.ispc compfail  x86-64      generic-4   Linux LLVM 3.4 clang++3.3 -O2 *
 ./tests/test-141.ispc runfail  x86-64     generic-16   Linux LLVM 3.4 clang++3.3 -O2 *
 ./tests/test-143.ispc runfail  x86-64     generic-16   Linux LLVM 3.4 clang++3.3 -O2 *
 ./tests/ptr-assign-lhs-math-1.ispc compfail  x86-64     generic-16   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/avg-down-int8.ispc compfail     x86  avx1.1-i32x16   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/avg-up-int8.ispc compfail     x86  avx1.1-i32x16   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/avg-down-int8.ispc compfail  x86-64  avx1.1-i32x16   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/avg-up-int8.ispc compfail  x86-64  avx1.1-i32x16   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/atomics-varyingptr-2.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/atomics-varyingptr-3.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/atomics-varyingptr-4.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/local-atomics-11.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/local-atomics-12.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/local-atomics-13.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/local-atomics-4.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/local-atomics-5.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/local-atomics-6.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/local-atomics-7.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/local-atomics-8.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/local-atomics-swap.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/local-atomics-varyingptr-2.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/local-atomics-varyingptr-3.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/local-atomics-varyingptr-4.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/memset-varying.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/reduce-equal-1.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/reduce-equal-12.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/reduce-equal-13.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/reduce-equal-2.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/reduce-equal-3.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/reduce-equal-4.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/reduce-equal-5.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/reduce-equal-6.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/reduce-equal-7.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4 clang++3.3 -O2 *
-./tests/reduce-equal.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4 clang++3.3 -O2 *
 ./tests/test-141.ispc runfail     x86    avx2-i32x16   Linux LLVM 3.4 clang++3.3 -O2 *
 ./tests/test-141.ispc runfail  x86-64    avx2-i32x16   Linux LLVM 3.4 clang++3.3 -O2 *
 ./tests/atomics-13.ispc compfail     x86     sse4-i16x8     Mac LLVM 3.3 clang++3.3 -O2 *

From 621679245a4567a385fdeeed9af944ed45d691c0 Mon Sep 17 00:00:00 2001
From: Ilia Filippov <ili.filippov@gmail.com>
Date: Fri, 25 Oct 2013 12:49:06 +0400
Subject: [PATCH 09/24] fixing problem 644

---
 expr.cpp | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/expr.cpp b/expr.cpp
index c92503e0..222c89a1 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -7065,9 +7065,22 @@ TypeCastExpr::GetLValue(FunctionEmitContext *ctx) const {
 
 const Type *
 TypeCastExpr::GetType() const {
-    // We have to switch off this assert after supporting of operators.
-    //AssertPos(pos, type->HasUnboundVariability() == false);
-    return type;
+    // Here we try to resolve the situation where (base_type) can be treated as
+    // (uniform base_type) or (varying base_type). This is a part of function
+    // TypeCastExpr::TypeCheck. After the implementation of operators we
+    // have to have this functionality here.
+    const Type *toType = type, *fromType = expr->GetType();
+    if (toType == NULL || fromType == NULL)
+        return NULL;
+    if (toType->HasUnboundVariability()) {
+        if (fromType->IsUniformType()) {
+            toType = type->ResolveUnboundVariability(Variability::Uniform);
+        } else {
+            toType = type->ResolveUnboundVariability(Variability::Varying);
+        }
+    }
+    AssertPos(pos, toType->HasUnboundVariability() == false);
+    return toType;
 }
 
 

From a508bd4290a5dc8073602bda88a7953ab6ef456b Mon Sep 17 00:00:00 2001
From: Dmitry Babokin <babokin@gmail.com>
Date: Sat, 26 Oct 2013 14:50:45 +0400
Subject: [PATCH 10/24] MacOS fails update

---
 fail_db.txt | 35 -----------------------------------
 1 file changed, 35 deletions(-)

diff --git a/fail_db.txt b/fail_db.txt
index b3163869..bfa14dad 100644
--- a/fail_db.txt
+++ b/fail_db.txt
@@ -944,46 +944,11 @@
 ./tests/funcptr-null-4.ispc runfail  x86-64     sse4-i8x16     Mac LLVM 3.4 clang++3.3 -O2 *
 ./tests/funcptr-null-5.ispc runfail  x86-64     sse4-i8x16     Mac LLVM 3.4 clang++3.3 -O2 *
 ./tests/funcptr-null-6.ispc runfail  x86-64     sse4-i8x16     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/avg-down-int8.ispc compfail     x86    avx1-i32x16     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/avg-up-int8.ispc compfail     x86    avx1-i32x16     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/avg-down-int8.ispc compfail  x86-64    avx1-i32x16     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/avg-up-int8.ispc compfail  x86-64    avx1-i32x16     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/avg-down-int8.ispc compfail     x86  avx1.1-i32x16     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/avg-up-int8.ispc compfail     x86  avx1.1-i32x16     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/avg-down-int8.ispc compfail  x86-64  avx1.1-i32x16     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/avg-up-int8.ispc compfail  x86-64  avx1.1-i32x16     Mac LLVM 3.4 clang++3.3 -O2 *
 ./tests/ptr-assign-lhs-math-1.ispc compfail  x86-64      generic-4     Mac LLVM 3.4 clang++3.3 -O2 *
 ./tests/short-vec-8.ispc compfail  x86-64      generic-4     Mac LLVM 3.4 clang++3.3 -O2 *
 ./tests/test-141.ispc runfail  x86-64     generic-16     Mac LLVM 3.4 clang++3.3 -O2 *
 ./tests/test-143.ispc runfail  x86-64     generic-16     Mac LLVM 3.4 clang++3.3 -O2 *
 ./tests/ptr-assign-lhs-math-1.ispc compfail  x86-64     generic-16     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/double-3.ispc runfail     x86     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/atomics-varyingptr-2.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/atomics-varyingptr-3.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/atomics-varyingptr-4.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/local-atomics-11.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/local-atomics-12.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/local-atomics-13.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/local-atomics-4.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/local-atomics-5.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/local-atomics-6.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/local-atomics-7.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/local-atomics-8.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/local-atomics-swap.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/local-atomics-varyingptr-2.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/local-atomics-varyingptr-3.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/local-atomics-varyingptr-4.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/memset-varying.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/reduce-equal-1.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/reduce-equal-12.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/reduce-equal-13.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/reduce-equal-2.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/reduce-equal-3.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/reduce-equal-4.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/reduce-equal-5.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/reduce-equal-6.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/reduce-equal-7.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
-./tests/reduce-equal.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4 clang++3.3 -O2 *
 ./tests/test-141.ispc runfail     x86    avx2-i32x16     Mac LLVM 3.4 clang++3.3 -O2 *
 ./tests/test-141.ispc runfail  x86-64    avx2-i32x16     Mac LLVM 3.4 clang++3.3 -O2 *
 .\tests\exclusive-scan-add-10.ispc runfail     x86   avx1.1-i64x4 Windows LLVM 3.3         cl -O2 *

From 103ef25f12bfd736a1ca84f71358059991354c6d Mon Sep 17 00:00:00 2001
From: Dmitry Babokin <babokin@gmail.com>
Date: Sun, 27 Oct 2013 23:01:20 +0400
Subject: [PATCH 11/24] Docs fix in memory management section

---
 docs/ispc.rst | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/docs/ispc.rst b/docs/ispc.rst
index eac9b24e..84063694 100644
--- a/docs/ispc.rst
+++ b/docs/ispc.rst
@@ -2344,8 +2344,11 @@ based on C++'s ``new`` and ``delete`` operators:
 In the above code, each program instance allocates its own ``count`` sized
 array of ``uniform int`` values, uses that memory, and then deallocates
 that memory.  Uses of ``new`` and ``delete`` in ``ispc`` programs are
-serviced by corresponding calls the system C library's ``malloc()`` and
-``free()`` functions.
+implemented as calls to the C library's aligned memory allocation routines,
+which are platform dependent (``posix_memalign()`` and ``free()`` on Linux
+and Mac and ``_aligned_malloc()`` and ``_aligned_free()`` on Windows). So it's
+advised to pair ISPC's ``new`` and ``delete`` with each other, but not with
+C/C++ memory management functions.
 
 Note that the rules for ``uniform`` and ``varying`` for ``new`` are
 analogous to the corresponding rules for pointers (as described in

From 43829028940316cf9869492d7d0f712aec878338 Mon Sep 17 00:00:00 2001
From: Dmitry Babokin <babokin@gmail.com>
Date: Mon, 28 Oct 2013 12:31:24 +0400
Subject: [PATCH 12/24] Fail_db update on Windows: 3.3 update and adding 3.4

---
 fail_db.txt | 120 ++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 92 insertions(+), 28 deletions(-)

diff --git a/fail_db.txt b/fail_db.txt
index bfa14dad..2e08a6ae 100644
--- a/fail_db.txt
+++ b/fail_db.txt
@@ -644,11 +644,6 @@
 .\tests\reduce-add-uint64.ispc runfail     x86     sse4-i16x8 Windows LLVM 3.3         cl -O2 *
 .\tests\reduce-max-uint.ispc runfail     x86     sse4-i16x8 Windows LLVM 3.3         cl -O2 *
 .\tests\atomics-13.ispc compfail     x86     sse4-i16x8 Windows LLVM 3.3         cl -O2 *
-.\tests\reduce-equal-10.ispc compfail     x86     sse4-i16x8 Windows LLVM 3.3         cl -O2 *
-.\tests\reduce-equal-11.ispc compfail     x86     sse4-i16x8 Windows LLVM 3.3         cl -O2 *
-.\tests\reduce-equal-13.ispc compfail     x86     sse4-i16x8 Windows LLVM 3.3         cl -O2 *
-.\tests\reduce-equal-5.ispc compfail     x86     sse4-i16x8 Windows LLVM 3.3         cl -O2 *
-.\tests\reduce-equal-6.ispc compfail     x86     sse4-i16x8 Windows LLVM 3.3         cl -O2 *
 .\tests\exclusive-scan-add-10.ispc runfail     x86     sse4-i8x16 Windows LLVM 3.3         cl -O2 *
 .\tests\exclusive-scan-add-9.ispc runfail     x86     sse4-i8x16 Windows LLVM 3.3         cl -O2 *
 .\tests\funcptr-null-4.ispc runfail     x86     sse4-i8x16 Windows LLVM 3.3         cl -O2 *
@@ -667,11 +662,6 @@
 .\tests\reduce-max-uint64.ispc runfail     x86     sse4-i8x16 Windows LLVM 3.3         cl -O2 *
 .\tests\reduce-min-uint64.ispc runfail     x86     sse4-i8x16 Windows LLVM 3.3         cl -O2 *
 .\tests\atomics-13.ispc compfail     x86     sse4-i8x16 Windows LLVM 3.3         cl -O2 *
-.\tests\reduce-equal-10.ispc compfail     x86     sse4-i8x16 Windows LLVM 3.3         cl -O2 *
-.\tests\reduce-equal-11.ispc compfail     x86     sse4-i8x16 Windows LLVM 3.3         cl -O2 *
-.\tests\reduce-equal-13.ispc compfail     x86     sse4-i8x16 Windows LLVM 3.3         cl -O2 *
-.\tests\reduce-equal-5.ispc compfail     x86     sse4-i8x16 Windows LLVM 3.3         cl -O2 *
-.\tests\reduce-equal-6.ispc compfail     x86     sse4-i8x16 Windows LLVM 3.3         cl -O2 *
 .\tests\exclusive-scan-add-10.ispc runfail     x86     avx1-i32x8 Windows LLVM 3.3         cl -O2 *
 .\tests\max-uint-1.ispc runfail     x86     avx1-i32x8 Windows LLVM 3.3         cl -O2 *
 .\tests\max-uint.ispc runfail     x86     avx1-i32x8 Windows LLVM 3.3         cl -O2 *
@@ -715,8 +705,6 @@
 .\tests\uint64-max.ispc runfail     x86    avx1-i32x16 Windows LLVM 3.3         cl -O2 *
 .\tests\uint64-min-1.ispc runfail     x86    avx1-i32x16 Windows LLVM 3.3         cl -O2 *
 .\tests\uint64-min.ispc runfail     x86    avx1-i32x16 Windows LLVM 3.3         cl -O2 *
-.\tests\avg-down-int8.ispc compfail     x86    avx1-i32x16 Windows LLVM 3.3         cl -O2 *
-.\tests\avg-up-int8.ispc compfail     x86    avx1-i32x16 Windows LLVM 3.3         cl -O2 *
 .\tests\switch-10.ispc compfail     x86    avx1-i32x16 Windows LLVM 3.3         cl -O2 *
 .\tests\switch-11.ispc compfail     x86    avx1-i32x16 Windows LLVM 3.3         cl -O2 *
 .\tests\switch-12.ispc compfail     x86    avx1-i32x16 Windows LLVM 3.3         cl -O2 *
@@ -765,35 +753,21 @@
 .\tests\uint64-max.ispc runfail     x86  avx1.1-i32x16 Windows LLVM 3.3         cl -O2 *
 .\tests\uint64-min-1.ispc runfail     x86  avx1.1-i32x16 Windows LLVM 3.3         cl -O2 *
 .\tests\uint64-min.ispc runfail     x86  avx1.1-i32x16 Windows LLVM 3.3         cl -O2 *
-.\tests\avg-down-int8.ispc compfail     x86  avx1.1-i32x16 Windows LLVM 3.3         cl -O2 *
-.\tests\avg-up-int8.ispc compfail     x86  avx1.1-i32x16 Windows LLVM 3.3         cl -O2 *
 .\tests\switch-10.ispc compfail     x86  avx1.1-i32x16 Windows LLVM 3.3         cl -O2 *
 .\tests\switch-11.ispc compfail     x86  avx1.1-i32x16 Windows LLVM 3.3         cl -O2 *
 .\tests\switch-12.ispc compfail     x86  avx1.1-i32x16 Windows LLVM 3.3         cl -O2 *
 .\tests\switch-8.ispc compfail     x86  avx1.1-i32x16 Windows LLVM 3.3         cl -O2 *
 .\tests\switch-9.ispc compfail     x86  avx1.1-i32x16 Windows LLVM 3.3         cl -O2 *
 .\tests\atomics-13.ispc compfail  x86-64     sse4-i16x8 Windows LLVM 3.3         cl -O2 *
-.\tests\reduce-equal-10.ispc compfail  x86-64     sse4-i16x8 Windows LLVM 3.3         cl -O2 *
-.\tests\reduce-equal-11.ispc compfail  x86-64     sse4-i16x8 Windows LLVM 3.3         cl -O2 *
-.\tests\reduce-equal-13.ispc compfail  x86-64     sse4-i16x8 Windows LLVM 3.3         cl -O2 *
-.\tests\reduce-equal-5.ispc compfail  x86-64     sse4-i16x8 Windows LLVM 3.3         cl -O2 *
-.\tests\reduce-equal-6.ispc compfail  x86-64     sse4-i16x8 Windows LLVM 3.3         cl -O2 *
 .\tests\funcptr-null-4.ispc runfail  x86-64     sse4-i8x16 Windows LLVM 3.3         cl -O2 *
 .\tests\funcptr-null-5.ispc runfail  x86-64     sse4-i8x16 Windows LLVM 3.3         cl -O2 *
 .\tests\funcptr-null-6.ispc runfail  x86-64     sse4-i8x16 Windows LLVM 3.3         cl -O2 *
 .\tests\atomics-13.ispc compfail  x86-64     sse4-i8x16 Windows LLVM 3.3         cl -O2 *
-.\tests\reduce-equal-10.ispc compfail  x86-64     sse4-i8x16 Windows LLVM 3.3         cl -O2 *
-.\tests\reduce-equal-11.ispc compfail  x86-64     sse4-i8x16 Windows LLVM 3.3         cl -O2 *
-.\tests\reduce-equal-13.ispc compfail  x86-64     sse4-i8x16 Windows LLVM 3.3         cl -O2 *
-.\tests\reduce-equal-5.ispc compfail  x86-64     sse4-i8x16 Windows LLVM 3.3         cl -O2 *
-.\tests\reduce-equal-6.ispc compfail  x86-64     sse4-i8x16 Windows LLVM 3.3         cl -O2 *
 .\tests\switch-10.ispc compfail  x86-64     avx1-i32x8 Windows LLVM 3.3         cl -O2 *
 .\tests\switch-11.ispc compfail  x86-64     avx1-i32x8 Windows LLVM 3.3         cl -O2 *
 .\tests\switch-12.ispc compfail  x86-64     avx1-i32x8 Windows LLVM 3.3         cl -O2 *
 .\tests\switch-8.ispc compfail  x86-64     avx1-i32x8 Windows LLVM 3.3         cl -O2 *
 .\tests\switch-9.ispc compfail  x86-64     avx1-i32x8 Windows LLVM 3.3         cl -O2 *
-.\tests\avg-down-int8.ispc compfail  x86-64    avx1-i32x16 Windows LLVM 3.3         cl -O2 *
-.\tests\avg-up-int8.ispc compfail  x86-64    avx1-i32x16 Windows LLVM 3.3         cl -O2 *
 .\tests\switch-10.ispc compfail  x86-64    avx1-i32x16 Windows LLVM 3.3         cl -O2 *
 .\tests\switch-11.ispc compfail  x86-64    avx1-i32x16 Windows LLVM 3.3         cl -O2 *
 .\tests\switch-12.ispc compfail  x86-64    avx1-i32x16 Windows LLVM 3.3         cl -O2 *
@@ -804,8 +778,6 @@
 .\tests\switch-12.ispc compfail  x86-64   avx1.1-i32x8 Windows LLVM 3.3         cl -O2 *
 .\tests\switch-8.ispc compfail  x86-64   avx1.1-i32x8 Windows LLVM 3.3         cl -O2 *
 .\tests\switch-9.ispc compfail  x86-64   avx1.1-i32x8 Windows LLVM 3.3         cl -O2 *
-.\tests\avg-down-int8.ispc compfail  x86-64  avx1.1-i32x16 Windows LLVM 3.3         cl -O2 *
-.\tests\avg-up-int8.ispc compfail  x86-64  avx1.1-i32x16 Windows LLVM 3.3         cl -O2 *
 .\tests\switch-10.ispc compfail  x86-64  avx1.1-i32x16 Windows LLVM 3.3         cl -O2 *
 .\tests\switch-11.ispc compfail  x86-64  avx1.1-i32x16 Windows LLVM 3.3         cl -O2 *
 .\tests\switch-12.ispc compfail  x86-64  avx1.1-i32x16 Windows LLVM 3.3         cl -O2 *
@@ -986,3 +958,95 @@
 .\tests\reduce-min-uint64.ispc runfail     x86   avx1.1-i64x4 Windows LLVM 3.4         cl -O2 *
 .\tests\reduce-min-uint.ispc runfail     x86     avx2-i64x4 Windows LLVM 3.4         cl -O2 *
 .\tests\reduce-min-uint64.ispc runfail     x86     avx2-i64x4 Windows LLVM 3.4         cl -O2 *
+.\tests\funcptr-null-4.ispc runfail     x86     sse4-i8x16 Windows LLVM 3.4         cl -O2 *
+.\tests\funcptr-null-5.ispc runfail     x86     sse4-i8x16 Windows LLVM 3.4         cl -O2 *
+.\tests\funcptr-null-6.ispc runfail     x86     sse4-i8x16 Windows LLVM 3.4         cl -O2 *
+.\tests\reduce-equal-10.ispc runfail     x86     avx1-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-10.ispc compfail     x86     avx1-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-11.ispc compfail     x86     avx1-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-12.ispc compfail     x86     avx1-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-8.ispc compfail     x86     avx1-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-9.ispc compfail     x86     avx1-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\reduce-add-uint-1.ispc runfail     x86    avx1-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\reduce-min-uint64.ispc runfail     x86    avx1-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-10.ispc compfail     x86    avx1-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-11.ispc compfail     x86    avx1-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-12.ispc compfail     x86    avx1-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-8.ispc compfail     x86    avx1-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-9.ispc compfail     x86    avx1-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\reduce-min-uint.ispc runfail     x86     avx1-i64x4 Windows LLVM 3.4         cl -O2 *
+.\tests\reduce-equal-10.ispc runfail     x86   avx1.1-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-10.ispc compfail     x86   avx1.1-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-11.ispc compfail     x86   avx1.1-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-12.ispc compfail     x86   avx1.1-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-8.ispc compfail     x86   avx1.1-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-9.ispc compfail     x86   avx1.1-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\reduce-add-uint-1.ispc runfail     x86  avx1.1-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\reduce-min-uint64.ispc runfail     x86  avx1.1-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-10.ispc compfail     x86  avx1.1-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-11.ispc compfail     x86  avx1.1-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-12.ispc compfail     x86  avx1.1-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-8.ispc compfail     x86  avx1.1-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-9.ispc compfail     x86  avx1.1-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\reduce-min-uint.ispc runfail     x86   avx1.1-i64x4 Windows LLVM 3.4         cl -O2 *
+.\tests\exclusive-scan-add-9.ispc runfail     x86     avx2-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\reduce-equal-10.ispc runfail     x86     avx2-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\uint64-max.ispc runfail     x86     avx2-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\uint64-min-1.ispc runfail     x86     avx2-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\uint64-min.ispc runfail     x86     avx2-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-10.ispc compfail     x86     avx2-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-11.ispc compfail     x86     avx2-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-12.ispc compfail     x86     avx2-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-8.ispc compfail     x86     avx2-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-9.ispc compfail     x86     avx2-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\exclusive-scan-add-9.ispc runfail     x86    avx2-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\max-uint-1.ispc runfail     x86    avx2-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\min-uint-1.ispc runfail     x86    avx2-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\min-uint-2.ispc runfail     x86    avx2-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\packed-load-1.ispc runfail     x86    avx2-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\packed-store.ispc runfail     x86    avx2-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\reduce-add-uint-1.ispc runfail     x86    avx2-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\test-141.ispc runfail     x86    avx2-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\uint64-max.ispc runfail     x86    avx2-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\uint64-min-1.ispc runfail     x86    avx2-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\uint64-min.ispc runfail     x86    avx2-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-10.ispc compfail     x86    avx2-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-11.ispc compfail     x86    avx2-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-12.ispc compfail     x86    avx2-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-8.ispc compfail     x86    avx2-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-9.ispc compfail     x86    avx2-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\funcptr-null-4.ispc runfail  x86-64     sse4-i8x16 Windows LLVM 3.4         cl -O2 *
+.\tests\funcptr-null-5.ispc runfail  x86-64     sse4-i8x16 Windows LLVM 3.4         cl -O2 *
+.\tests\funcptr-null-6.ispc runfail  x86-64     sse4-i8x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-10.ispc compfail  x86-64     avx1-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-11.ispc compfail  x86-64     avx1-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-12.ispc compfail  x86-64     avx1-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-8.ispc compfail  x86-64     avx1-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-9.ispc compfail  x86-64     avx1-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-10.ispc compfail  x86-64    avx1-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-11.ispc compfail  x86-64    avx1-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-12.ispc compfail  x86-64    avx1-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-8.ispc compfail  x86-64    avx1-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-9.ispc compfail  x86-64    avx1-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-10.ispc compfail  x86-64   avx1.1-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-11.ispc compfail  x86-64   avx1.1-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-12.ispc compfail  x86-64   avx1.1-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-8.ispc compfail  x86-64   avx1.1-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-9.ispc compfail  x86-64   avx1.1-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-10.ispc compfail  x86-64  avx1.1-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-11.ispc compfail  x86-64  avx1.1-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-12.ispc compfail  x86-64  avx1.1-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-8.ispc compfail  x86-64  avx1.1-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-9.ispc compfail  x86-64  avx1.1-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-10.ispc compfail  x86-64     avx2-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-11.ispc compfail  x86-64     avx2-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-12.ispc compfail  x86-64     avx2-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-8.ispc compfail  x86-64     avx2-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-9.ispc compfail  x86-64     avx2-i32x8 Windows LLVM 3.4         cl -O2 *
+.\tests\test-141.ispc runfail  x86-64    avx2-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-10.ispc compfail  x86-64    avx2-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-11.ispc compfail  x86-64    avx2-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-12.ispc compfail  x86-64    avx2-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-8.ispc compfail  x86-64    avx2-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\switch-9.ispc compfail  x86-64    avx2-i32x16 Windows LLVM 3.4         cl -O2 *
+.\tests\reduce-equal-10.ispc runfail     x86     sse4-i8x16 Windows LLVM 3.3         cl -O2 *

From 63a3214cc6c7fe7fc051254b7882f12f2231f314 Mon Sep 17 00:00:00 2001
From: Dmitry Babokin <babokin@gmail.com>
Date: Mon, 28 Oct 2013 12:45:39 +0400
Subject: [PATCH 13/24] Removing fails with g++4.4/g++4.7, as we are using
 clang by default now

---
 fail_db.txt | 598 +---------------------------------------------------
 1 file changed, 5 insertions(+), 593 deletions(-)

diff --git a/fail_db.txt b/fail_db.txt
index 2e08a6ae..da77cac3 100644
--- a/fail_db.txt
+++ b/fail_db.txt
@@ -1,600 +1,12 @@
 % List of known fails.
 % The list is unordered and contains information about commonly used platforms / configurations.
 % Our goas is to maintain this list for Linux, MacOS and Windows with reasonably new compilers.
-% Note, that it's important which C++ compiler was used. For example, gcc 4.4 is know to produce
-% considerably more fails with generic targets, than gcc 4.7 or later.
-% Using old compilers (gcc 4.4 is considered to be relatively old) may cause LLVM bugs.
-% To avoid them you can use LLVM selfbuild.
+% Note, that it's important which C++ compiler was used. The currently supported C++ compilers are
+% clang 3.3 on Linux and MacOS and cl (VS2010) on Windows.
+% Please also note that it's very important to have correctly built LLVM. There are a number of
+% LLVM bugs in released versions, that we have to workaround by applying patches (see llvm_patches
+% folder). The recommended way to build LLVM on Unix is to use "alloy.py".
 % 
-./tests/masked-scatter-vector.ispc runfail  x86-64     sse2-i32x4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/atomics-13.ispc compfail     x86     sse4-i16x8   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/atomics-13.ispc compfail  x86-64     sse4-i16x8   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/funcptr-null-4.ispc runfail     x86     sse4-i8x16   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/funcptr-null-5.ispc runfail     x86     sse4-i8x16   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/funcptr-null-6.ispc runfail     x86     sse4-i8x16   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/atomics-13.ispc compfail     x86     sse4-i8x16   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/funcptr-null-4.ispc runfail  x86-64     sse4-i8x16   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/funcptr-null-5.ispc runfail  x86-64     sse4-i8x16   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/funcptr-null-6.ispc runfail  x86-64     sse4-i8x16   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/atomics-13.ispc compfail  x86-64     sse4-i8x16   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/atomics-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/atomics-6.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/atomics-varyingptr-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/atomics-varyingptr-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/avg-down-uint16.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/avg-down-uint8.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/avg-up-uint16.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/avg-up-uint8.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/broadcast-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/broadcast-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/count-leading-trailing-zeros-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/count-leading-trailing-zeros-5.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/exclusive-scan-add-10.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/exclusive-scan-add-8.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/exclusive-scan-add-9.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/exclusive-scan-and-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/exclusive-scan-or-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/funcptr-null-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/funcptr-null-3.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/funcptr-null-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/funcptr-null-5.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/funcptr-null-6.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/funcptr-uniform-7.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/funcptr-uniform-8.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/funcptr-uniform-9.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/funcptr-varying-5.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/funcptr-varying-7.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/funcptr-varying-8.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/funcptr-varying-9.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/half-3.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/idiv.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/int64-max-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/int64-max.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/int64-min-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/int64-min.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/local-atomics-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/local-atomics-11.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/local-atomics-12.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/local-atomics-13.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/local-atomics-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/local-atomics-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/local-atomics-5.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/local-atomics-9.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/local-atomics-swap.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/local-atomics-varyingptr-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/local-atomics-varyingptr-3.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/local-atomics-varyingptr-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/new-delete-6.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/phi-opts-3.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/phi-opts-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/popcnt-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/popcnt-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/popcnt-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/ptr-15.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/reduce-add-int16-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/reduce-add-int16.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/reduce-equal-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/reduce-equal-10.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/reduce-equal-12.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/reduce-equal-13.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/reduce-equal-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/reduce-equal-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/reduce-equal-5.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/reduce-equal-7.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/reduce-equal-8.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/rotate-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/rotate-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/rotate-3.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/rotate-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/rotate-6.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/rotate.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/short-vec-14.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/shuffle-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/shuffle-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/shuffle-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/shuffle-flatten.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/shuffle.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/shuffle2-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/shuffle2-11.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/shuffle2-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/shuffle2-3.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/shuffle2-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/shuffle2-5.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/shuffle2-6.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/shuffle2-7.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/shuffle2-8.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/shuffle2-9.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/shuffle2.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/soa-27.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/soa-28.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/test-128.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/test-129.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/test-130.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/test-57.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/uint64-max-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/uint64-max.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/uint64-min-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/uint64-min.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/ptr-assign-lhs-math-1.ispc compfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/short-vec-8.ispc compfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/ptr-15.ispc runfail  x86-64     generic-16   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/test-141.ispc runfail  x86-64     generic-16   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/test-143.ispc runfail  x86-64     generic-16   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/ptr-assign-lhs-math-1.ispc compfail  x86-64     generic-16   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/avg-down-int8.ispc compfail     x86  avx1.1-i32x16   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/avg-up-int8.ispc compfail     x86  avx1.1-i32x16   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/avg-down-int8.ispc compfail  x86-64  avx1.1-i32x16   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/avg-up-int8.ispc compfail  x86-64  avx1.1-i32x16   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/test-141.ispc runfail     x86    avx2-i32x16   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/test-141.ispc runfail  x86-64    avx2-i32x16   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/funcptr-null-4.ispc runfail     x86     sse4-i8x16   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/funcptr-null-5.ispc runfail     x86     sse4-i8x16   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/funcptr-null-6.ispc runfail     x86     sse4-i8x16   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/funcptr-null-4.ispc runfail  x86-64     sse4-i8x16   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/funcptr-null-5.ispc runfail  x86-64     sse4-i8x16   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/funcptr-null-6.ispc runfail  x86-64     sse4-i8x16   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/array-gather-ifs.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/array-gather-multi-unif.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/array-gather-unif.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/array-mixed-unif-vary-indexing-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/array-mixed-unif-vary-indexing-3.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/array-mixed-unif-vary-indexing.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/array-multidim-gather.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/array-scatter-unif-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/array-scatter-vary.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/array-struct-gather.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/atomics-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/atomics-6.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/atomics-varyingptr-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/atomics-varyingptr-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/avg-down-uint16.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/avg-down-uint8.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/avg-up-uint16.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/avg-up-uint8.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/broadcast-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/broadcast-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/cfor-array-gather-ifs.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/cfor-array-gather-unif.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/cfor-array-multidim-gather.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/cfor-array-struct-gather.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/cfor-struct-test-114.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/cfor-unif-struct-test-114.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/count-leading-trailing-zeros-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/count-leading-trailing-zeros-5.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/exclusive-scan-add-10.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/exclusive-scan-add-8.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/exclusive-scan-add-9.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/exclusive-scan-and-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/exclusive-scan-or-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/funcptr-null-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/funcptr-null-3.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/funcptr-null-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/funcptr-null-5.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/funcptr-null-6.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/funcptr-uniform-7.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/funcptr-uniform-8.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/funcptr-uniform-9.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/funcptr-varying-5.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/funcptr-varying-7.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/funcptr-varying-8.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/funcptr-varying-9.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/gather-int16.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/gather-to-vload-neg-offset.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/global-array-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/half-3.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/idiv.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/int64-max-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/int64-max.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/int64-min-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/int64-min.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/local-atomics-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/local-atomics-11.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/local-atomics-12.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/local-atomics-13.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/local-atomics-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/local-atomics-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/local-atomics-5.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/local-atomics-9.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/local-atomics-swap.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/local-atomics-varyingptr-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/local-atomics-varyingptr-3.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/local-atomics-varyingptr-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/masked-scatter-struct.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/masked-scatter-vector.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/nested-structs-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/new-delete-6.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/pass-varying-lvalue-to-ref.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/phi-opts-3.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/phi-opts-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/popcnt-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/popcnt-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/popcnt-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/ptr-15.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/reduce-add-int16-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/reduce-add-int16.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/reduce-equal-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/reduce-equal-10.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/reduce-equal-12.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/reduce-equal-13.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/reduce-equal-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/reduce-equal-5.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/reduce-equal-7.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/reduce-equal-8.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/rotate-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/rotate-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/rotate-3.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/rotate-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/rotate-6.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/rotate.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/scatter-int16-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/scatter-int16.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/scatter-mask-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/scatter-mask-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/short-vec-12.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/short-vec-14.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/shuffle-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/shuffle-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/shuffle-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/shuffle-flatten.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/shuffle.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/shuffle2-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/shuffle2-11.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/shuffle2-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/shuffle2-3.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/shuffle2-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/shuffle2-5.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/shuffle2-6.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/shuffle2-7.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/shuffle2-8.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/shuffle2-9.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/shuffle2.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/soa-28.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/struct-test-114.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/test-128.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/test-129.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/test-130.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/test-57.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/uint64-max-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/uint64-max.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/uint64-min-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/uint64-min.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/unif-struct-test-114.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/varying-struct-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/varying-struct-3.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/varying-struct-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/write-same-loc.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/ptr-assign-lhs-math-1.ispc compfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/short-vec-8.ispc compfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/ptr-15.ispc runfail  x86-64     generic-16   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/test-141.ispc runfail  x86-64     generic-16   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/test-143.ispc runfail  x86-64     generic-16   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/ptr-assign-lhs-math-1.ispc compfail  x86-64     generic-16   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/avg-down-int8.ispc compfail     x86  avx1.1-i32x16   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/avg-up-int8.ispc compfail     x86  avx1.1-i32x16   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/avg-down-int8.ispc compfail  x86-64  avx1.1-i32x16   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/avg-up-int8.ispc compfail  x86-64  avx1.1-i32x16   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/atomics-varyingptr-2.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/atomics-varyingptr-3.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/atomics-varyingptr-4.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/local-atomics-11.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/local-atomics-12.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/local-atomics-13.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/local-atomics-4.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/local-atomics-5.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/local-atomics-6.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/local-atomics-7.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/local-atomics-8.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/local-atomics-swap.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/local-atomics-varyingptr-2.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/local-atomics-varyingptr-3.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/local-atomics-varyingptr-4.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/memset-varying.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/reduce-equal-1.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/reduce-equal-12.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/reduce-equal-13.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/reduce-equal-2.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/reduce-equal-3.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/reduce-equal-4.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/reduce-equal-5.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/reduce-equal-6.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/reduce-equal-7.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/reduce-equal.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/test-141.ispc runfail     x86    avx2-i32x16   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/test-141.ispc runfail  x86-64    avx2-i32x16   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/masked-scatter-struct.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/atomics-13.ispc compfail     x86     sse4-i16x8   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/atomics-13.ispc compfail  x86-64     sse4-i16x8   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/funcptr-null-4.ispc runfail     x86     sse4-i8x16   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/funcptr-null-5.ispc runfail     x86     sse4-i8x16   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/funcptr-null-6.ispc runfail     x86     sse4-i8x16   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/atomics-13.ispc compfail     x86     sse4-i8x16   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/funcptr-null-4.ispc runfail  x86-64     sse4-i8x16   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/funcptr-null-5.ispc runfail  x86-64     sse4-i8x16   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/funcptr-null-6.ispc runfail  x86-64     sse4-i8x16   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/atomics-13.ispc compfail  x86-64     sse4-i8x16   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/avg-down-int8.ispc compfail     x86  avx1.1-i32x16   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/avg-up-int8.ispc compfail     x86  avx1.1-i32x16   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/avg-down-int8.ispc compfail  x86-64  avx1.1-i32x16   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/avg-up-int8.ispc compfail  x86-64  avx1.1-i32x16   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/atomics-varyingptr-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/atomics-varyingptr-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/broadcast-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/half-3.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/local-atomics-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/local-atomics-11.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/local-atomics-12.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/local-atomics-13.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/local-atomics-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/local-atomics-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/local-atomics-5.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/local-atomics-9.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/local-atomics-swap.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/local-atomics-varyingptr-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/local-atomics-varyingptr-3.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/local-atomics-varyingptr-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/memset-varying.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/ptr-15.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/reduce-equal-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/rotate-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/rotate-3.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle-flatten.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle2-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle2-10.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle2-11.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle2-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle2-3.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle2-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle2-5.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle2-6.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle2-7.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle2-8.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle2-9.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle2.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/test-129.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/test-130.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/ptr-assign-lhs-math-1.ispc compfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/short-vec-8.ispc compfail  x86-64      generic-4   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/ptr-15.ispc runfail  x86-64     generic-16   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/test-141.ispc runfail  x86-64     generic-16   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/test-143.ispc runfail  x86-64     generic-16   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/ptr-assign-lhs-math-1.ispc compfail  x86-64     generic-16   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/rotate.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/shift1.ispc runfail     x86    avx2-i32x16   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/test-141.ispc runfail     x86    avx2-i32x16   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/shift1.ispc runfail  x86-64    avx2-i32x16   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/test-141.ispc runfail  x86-64    avx2-i32x16   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/funcptr-null-4.ispc runfail     x86     sse4-i8x16   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/funcptr-null-5.ispc runfail     x86     sse4-i8x16   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/funcptr-null-6.ispc runfail     x86     sse4-i8x16   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/funcptr-null-4.ispc runfail  x86-64     sse4-i8x16   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/funcptr-null-5.ispc runfail  x86-64     sse4-i8x16   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/funcptr-null-6.ispc runfail  x86-64     sse4-i8x16   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/avg-down-int8.ispc compfail     x86  avx1.1-i32x16   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/avg-up-int8.ispc compfail     x86  avx1.1-i32x16   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/avg-down-int8.ispc compfail  x86-64  avx1.1-i32x16   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/avg-up-int8.ispc compfail  x86-64  avx1.1-i32x16   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/atomics-varyingptr-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/atomics-varyingptr-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/broadcast-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/half-3.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-11.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-12.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-13.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-5.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-9.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-swap.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-varyingptr-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-varyingptr-3.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-varyingptr-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/memset-varying.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/ptr-15.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/reduce-equal-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/rotate-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/rotate-3.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle-flatten.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle2-1.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle2-10.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle2-11.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle2-2.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle2-3.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle2-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle2-5.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle2-6.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle2-7.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle2-8.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle2-9.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle2.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/test-129.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/test-130.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/ptr-assign-lhs-math-1.ispc compfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/short-vec-8.ispc compfail  x86-64      generic-4   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/ptr-15.ispc runfail  x86-64     generic-16   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/test-141.ispc runfail  x86-64     generic-16   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/test-143.ispc runfail  x86-64     generic-16   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/ptr-assign-lhs-math-1.ispc compfail  x86-64     generic-16   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/atomics-varyingptr-2.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/atomics-varyingptr-3.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/atomics-varyingptr-4.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-11.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-12.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-13.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-4.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-5.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-6.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-7.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-8.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-swap.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-varyingptr-2.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-varyingptr-3.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-varyingptr-4.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/memset-varying.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/reduce-equal-1.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/reduce-equal-12.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/reduce-equal-13.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/reduce-equal-2.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/reduce-equal-3.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/reduce-equal-4.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/reduce-equal-5.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/reduce-equal-6.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/reduce-equal-7.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/reduce-equal.ispc compfail  x86-64     avx2-i32x8   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/shift1.ispc runfail     x86    avx2-i32x16   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/test-141.ispc runfail     x86    avx2-i32x16   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/shift1.ispc runfail  x86-64    avx2-i32x16   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/test-141.ispc runfail  x86-64    avx2-i32x16   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/atomics-13.ispc compfail     x86     sse4-i16x8     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/reduce-equal-10.ispc compfail     x86     sse4-i16x8     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/reduce-equal-11.ispc compfail     x86     sse4-i16x8     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/reduce-equal-13.ispc compfail     x86     sse4-i16x8     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/reduce-equal-5.ispc compfail     x86     sse4-i16x8     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/reduce-equal-6.ispc compfail     x86     sse4-i16x8     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/atomics-13.ispc compfail  x86-64     sse4-i16x8     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/reduce-equal-10.ispc compfail  x86-64     sse4-i16x8     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/reduce-equal-11.ispc compfail  x86-64     sse4-i16x8     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/reduce-equal-13.ispc compfail  x86-64     sse4-i16x8     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/reduce-equal-5.ispc compfail  x86-64     sse4-i16x8     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/reduce-equal-6.ispc compfail  x86-64     sse4-i16x8     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/funcptr-null-4.ispc runfail     x86     sse4-i8x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/funcptr-null-5.ispc runfail     x86     sse4-i8x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/funcptr-null-6.ispc runfail     x86     sse4-i8x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/atomics-13.ispc compfail     x86     sse4-i8x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/reduce-equal-10.ispc compfail     x86     sse4-i8x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/reduce-equal-11.ispc compfail     x86     sse4-i8x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/reduce-equal-13.ispc compfail     x86     sse4-i8x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/reduce-equal-5.ispc compfail     x86     sse4-i8x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/reduce-equal-6.ispc compfail     x86     sse4-i8x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/funcptr-null-4.ispc runfail  x86-64     sse4-i8x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/funcptr-null-5.ispc runfail  x86-64     sse4-i8x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/funcptr-null-6.ispc runfail  x86-64     sse4-i8x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/atomics-13.ispc compfail  x86-64     sse4-i8x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/reduce-equal-10.ispc compfail  x86-64     sse4-i8x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/reduce-equal-11.ispc compfail  x86-64     sse4-i8x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/reduce-equal-13.ispc compfail  x86-64     sse4-i8x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/reduce-equal-5.ispc compfail  x86-64     sse4-i8x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/reduce-equal-6.ispc compfail  x86-64     sse4-i8x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/avg-down-int8.ispc compfail     x86    avx1-i32x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/avg-up-int8.ispc compfail     x86    avx1-i32x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/avg-down-int8.ispc compfail  x86-64    avx1-i32x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/avg-up-int8.ispc compfail  x86-64    avx1-i32x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/avg-down-int8.ispc compfail     x86  avx1.1-i32x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/avg-up-int8.ispc compfail     x86  avx1.1-i32x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/avg-down-int8.ispc compfail  x86-64  avx1.1-i32x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/avg-up-int8.ispc compfail  x86-64  avx1.1-i32x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/test-141.ispc runfail     x86    avx2-i32x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/test-141.ispc runfail  x86-64    avx2-i32x16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/broadcast-1.ispc runfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/half-3.ispc runfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/local-atomics-1.ispc runfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/local-atomics-13.ispc runfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/local-atomics-5.ispc runfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/local-atomics-9.ispc runfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/local-atomics-swap.ispc runfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/local-atomics-varyingptr-3.ispc runfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/memset-varying.ispc runfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/ptr-15.ispc runfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/rotate-2.ispc runfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle-4.ispc runfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle2-1.ispc runfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle2-10.ispc runfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle2-11.ispc runfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle2-2.ispc runfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle2-3.ispc runfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle2-4.ispc runfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle2-5.ispc runfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle2-6.ispc runfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle2-7.ispc runfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle2-8.ispc runfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle2-9.ispc runfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/shuffle2.ispc runfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/test-129.ispc runfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/test-130.ispc runfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/ptr-assign-lhs-math-1.ispc compfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/short-vec-8.ispc compfail  x86-64      generic-4     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/ptr-15.ispc runfail  x86-64     generic-16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/test-141.ispc runfail  x86-64     generic-16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/test-143.ispc runfail  x86-64     generic-16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/ptr-assign-lhs-math-1.ispc compfail  x86-64     generic-16     Mac LLVM 3.3     g++4.7 -O2 *
-./tests/funcptr-null-4.ispc runfail     x86     sse4-i8x16     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/funcptr-null-5.ispc runfail     x86     sse4-i8x16     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/funcptr-null-6.ispc runfail     x86     sse4-i8x16     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/funcptr-null-4.ispc runfail  x86-64     sse4-i8x16     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/funcptr-null-5.ispc runfail  x86-64     sse4-i8x16     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/funcptr-null-6.ispc runfail  x86-64     sse4-i8x16     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/avg-down-int8.ispc compfail     x86    avx1-i32x16     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/avg-up-int8.ispc compfail     x86    avx1-i32x16     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/avg-down-int8.ispc compfail  x86-64    avx1-i32x16     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/avg-up-int8.ispc compfail  x86-64    avx1-i32x16     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/avg-down-int8.ispc compfail     x86  avx1.1-i32x16     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/avg-up-int8.ispc compfail     x86  avx1.1-i32x16     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/avg-down-int8.ispc compfail  x86-64  avx1.1-i32x16     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/avg-up-int8.ispc compfail  x86-64  avx1.1-i32x16     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/atomics-varyingptr-2.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/atomics-varyingptr-3.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/atomics-varyingptr-4.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-11.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-12.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-13.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-4.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-5.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-6.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-7.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-8.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-swap.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-varyingptr-2.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-varyingptr-3.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-varyingptr-4.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/memset-varying.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/reduce-equal-1.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/reduce-equal-12.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/reduce-equal-13.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/reduce-equal-2.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/reduce-equal-3.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/reduce-equal-4.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/reduce-equal-5.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/reduce-equal-6.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/reduce-equal-7.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/reduce-equal.ispc compfail  x86-64     avx2-i32x8     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/test-141.ispc runfail     x86    avx2-i32x16     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/test-141.ispc runfail  x86-64    avx2-i32x16     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/broadcast-1.ispc runfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/half-3.ispc runfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-1.ispc runfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-13.ispc runfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-5.ispc runfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-9.ispc runfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-swap.ispc runfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/local-atomics-varyingptr-3.ispc runfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/memset-varying.ispc runfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/ptr-15.ispc runfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/rotate-2.ispc runfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle-4.ispc runfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle2-1.ispc runfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle2-10.ispc runfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle2-11.ispc runfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle2-2.ispc runfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle2-3.ispc runfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle2-4.ispc runfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle2-5.ispc runfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle2-6.ispc runfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle2-7.ispc runfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle2-8.ispc runfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle2-9.ispc runfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/shuffle2.ispc runfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/test-129.ispc runfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/test-130.ispc runfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/ptr-assign-lhs-math-1.ispc compfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/short-vec-8.ispc compfail  x86-64      generic-4     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/ptr-15.ispc runfail  x86-64     generic-16     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/test-141.ispc runfail  x86-64     generic-16     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/test-143.ispc runfail  x86-64     generic-16     Mac LLVM 3.4     g++4.7 -O2 *
-./tests/ptr-assign-lhs-math-1.ispc compfail  x86-64     generic-16     Mac LLVM 3.4     g++4.7 -O2 *
 .\tests\exclusive-scan-add-9.ispc runfail     x86     sse2-i32x4 Windows LLVM 3.3         cl -O2 *
 .\tests\reduce-equal-10.ispc runfail     x86     sse2-i32x4 Windows LLVM 3.3         cl -O2 *
 .\tests\reduce-max-uint64.ispc runfail     x86     sse2-i32x4 Windows LLVM 3.3         cl -O2 *

From 1e80b3b0d7d5a1e66cd6a28d3ad379b0624a009d Mon Sep 17 00:00:00 2001
From: "james.brodman" <james.brodman@intel.com>
Date: Mon, 28 Oct 2013 12:20:32 -0400
Subject: [PATCH 14/24] Add shift support for generic-16 target.

---
 examples/intrinsics/generic-16.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h
index d81101f7..fa794276 100644
--- a/examples/intrinsics/generic-16.h
+++ b/examples/intrinsics/generic-16.h
@@ -311,6 +311,17 @@ static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) {   \
     return ret;                                       \
 }                                                     \
 
+#define SHIFT(VTYPE, NAME, STYPE)                    \
+static FORCEINLINE VTYPE __shift_##NAME(VTYPE v, int index) {   \
+    /* Lane i takes lane i+index of v; lanes read from outside 0..15 are 0. */ \
+    VTYPE shifted;                                    \
+    for (int lane = 0; lane < 16; ++lane) {           \
+      const int src = lane + index;                   \
+      const STYPE picked = ((src < 0) || (src >= 16)) ? 0 : v.v[src]; \
+      shifted.v[lane] = picked;                       \
+    } return shifted;                                 \
+}                                                     \
+
 #define SHUFFLES(VTYPE, NAME, STYPE)                 \
 static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec16_i32 index) {   \
     VTYPE ret;                                        \
@@ -492,6 +503,7 @@ SETZERO(__vec16_i8, i8)
 UNDEF(__vec16_i8, i8)
 BROADCAST(__vec16_i8, i8, int8_t)
 ROTATE(__vec16_i8, i8, int8_t)
+SHIFT(__vec16_i8, i8, int8_t)
 SHUFFLES(__vec16_i8, i8, int8_t)
 LOAD_STORE(__vec16_i8, int8_t)
 
@@ -537,6 +549,7 @@ SETZERO(__vec16_i16, i16)
 UNDEF(__vec16_i16, i16)
 BROADCAST(__vec16_i16, i16, int16_t)
 ROTATE(__vec16_i16, i16, int16_t)
+SHIFT(__vec16_i16, i16, int16_t)
 SHUFFLES(__vec16_i16, i16, int16_t)
 LOAD_STORE(__vec16_i16, int16_t)
 
@@ -582,6 +595,7 @@ SETZERO(__vec16_i32, i32)
 UNDEF(__vec16_i32, i32)
 BROADCAST(__vec16_i32, i32, int32_t)
 ROTATE(__vec16_i32, i32, int32_t)
+SHIFT(__vec16_i32, i32, int32_t)
 SHUFFLES(__vec16_i32, i32, int32_t)
 LOAD_STORE(__vec16_i32, int32_t)
 
@@ -627,6 +641,7 @@ SETZERO(__vec16_i64, i64)
 UNDEF(__vec16_i64, i64)
 BROADCAST(__vec16_i64, i64, int64_t)
 ROTATE(__vec16_i64, i64, int64_t)
+SHIFT(__vec16_i64, i64, int64_t)
 SHUFFLES(__vec16_i64, i64, int64_t)
 LOAD_STORE(__vec16_i64, int64_t)
 
@@ -672,6 +687,7 @@ SETZERO(__vec16_f, float)
 UNDEF(__vec16_f, float)
 BROADCAST(__vec16_f, float, float)
 ROTATE(__vec16_f, float, float)
+SHIFT(__vec16_f, float, float)
 SHUFFLES(__vec16_f, float, float)
 LOAD_STORE(__vec16_f, float)
 
@@ -832,6 +848,7 @@ SETZERO(__vec16_d, double)
 UNDEF(__vec16_d, double)
 BROADCAST(__vec16_d, double, double)
 ROTATE(__vec16_d, double, double)
+SHIFT(__vec16_d, double, double)
 SHUFFLES(__vec16_d, double, double)
 LOAD_STORE(__vec16_d, double)
 

From 641d882ea6ed42ab19c5afaeb0ca9fcc97d616ed Mon Sep 17 00:00:00 2001
From: "james.brodman" <james.brodman@intel.com>
Date: Mon, 28 Oct 2013 12:43:42 -0400
Subject: [PATCH 15/24] Add shift support for knc targets.  This is not
 optimized.

---
 examples/intrinsics/knc-i1x16.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h
index 78d35ddc..0ede6006 100644
--- a/examples/intrinsics/knc-i1x16.h
+++ b/examples/intrinsics/knc-i1x16.h
@@ -451,6 +451,17 @@ static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) {   \
     return ret;                                       \
 }                                                     \
 
+#define SHIFT(VTYPE, NAME, STYPE)                    \
+static FORCEINLINE VTYPE __shift_##NAME(VTYPE v, int index) {   \
+    VTYPE ret;                                        \
+    for (int i = 0; i < 16; ++i) {                    \
+      int modIndex = i+index;                         \
+      STYPE val = ((modIndex >= 0) && (modIndex < 16)) ? v.v[modIndex] : 0; \
+      ret.v[i] = val;                                 \
+    }                                                 \
+    return ret;                                       \
+}                                                     \
+
 /* knc::macro::used */
 #define SHUFFLES(VTYPE, NAME, STYPE)                 \
 static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec16_i32 index) {   \
@@ -566,6 +577,7 @@ SETZERO(__vec16_i8, i8)
 UNDEF(__vec16_i8, i8)
 BROADCAST(__vec16_i8, i8, int8_t)
 ROTATE(__vec16_i8, i8, int8_t)
+SHIFT(__vec16_i8, i8, int8_t)
 SHUFFLES(__vec16_i8, i8, int8_t)
 LOAD_STORE(__vec16_i8, int8_t)
 
@@ -612,6 +624,7 @@ SETZERO(__vec16_i16, i16)
 UNDEF(__vec16_i16, i16)
 BROADCAST(__vec16_i16, i16, int16_t)
 ROTATE(__vec16_i16, i16, int16_t)
+SHIFT(__vec16_i16, i16, int16_t)
 SHUFFLES(__vec16_i16, i16, int16_t)
 LOAD_STORE(__vec16_i16, int16_t)
 
@@ -688,6 +701,8 @@ static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index)
   return _mm512_mask_permutevar_epi32(v, 0xFFFF, shuffle, v);
 }
 
+SHIFT(__vec16_i32, i32, int32_t)
+
 static FORCEINLINE __vec16_i32 __shuffle_i32 (__vec16_i32 v, __vec16_i32 index) 
 { 
   return _mm512_mask_permutevar_epi32(v, 0xFFFF, __and(index, __smear_i32<__vec16_i32>(0xF)), v); 
@@ -942,6 +957,8 @@ static FORCEINLINE __vec16_i64 __rotate_i64(const __vec16_i64 _v, const int inde
   const __vec16_i32 ret_lo = __rotate_i32(v_lo, index);
   return CASTI2L(ret_hi, ret_lo);
 }
+SHIFT(__vec16_i64, i64, int64_t)
+
 static FORCEINLINE __vec16_i64 __shuffle_double(__vec16_i64 _v, const __vec16_i32 index) 
 {
   CASTL2I(_v, v_hi, v_lo);
@@ -1063,6 +1080,7 @@ static FORCEINLINE __vec16_f __rotate_float(__vec16_f _v, int index)
   const __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx),  __smear_i32<__vec16_i32>(0xF));
   return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(v, 0xFFFF, shuffle, v));
 }
+SHIFT(__vec16_f, float, float)
 static FORCEINLINE __vec16_f __shuffle_float(__vec16_f v, __vec16_i32 index) 
 {
   return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v)));
@@ -1333,6 +1351,7 @@ static FORCEINLINE __vec16_d __rotate_double(const __vec16_d _v, const int index
   const __vec16_f ret_lo = __rotate_float(v_lo, index);
   return CASTF2D(ret_hi, ret_lo);
 }
+SHIFT(__vec16_d, double, double)
 static FORCEINLINE __vec16_d __shuffle_double(__vec16_d _v, const __vec16_i32 index) 
 {
   CASTD2F(_v, v_hi, v_lo);

From 02681d531eb4871db2b732079651aed6360d85c6 Mon Sep 17 00:00:00 2001
From: "james.brodman" <james.brodman@intel.com>
Date: Mon, 28 Oct 2013 12:56:43 -0400
Subject: [PATCH 16/24] Minor tweak for interface.

---
 examples/intrinsics/knc-i1x16.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h
index 0ede6006..376e66bc 100644
--- a/examples/intrinsics/knc-i1x16.h
+++ b/examples/intrinsics/knc-i1x16.h
@@ -456,8 +456,8 @@ static FORCEINLINE VTYPE __shift_##NAME(VTYPE v, int index) {   \
     VTYPE ret;                                        \
     for (int i = 0; i < 16; ++i) {                    \
       int modIndex = i+index;                         \
-      STYPE val = ((modIndex >= 0) && (modIndex < 16)) ? v.v[modIndex] : 0; \
-      ret.v[i] = val;                                 \
+      STYPE val = ((modIndex >= 0) && (modIndex < 16)) ? v[modIndex] : 0; \
+      ret[i] = val;                                 \
     }                                                 \
     return ret;                                       \
 }                                                     \

From a166eb7ea12ed9a6ea4a13a7354d2906fb5de6e3 Mon Sep 17 00:00:00 2001
From: Dmitry Babokin <babokin@gmail.com>
Date: Mon, 28 Oct 2013 22:19:09 +0400
Subject: [PATCH 17/24] Check AVX OS support in host cpu check code

---
 ispc.cpp | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/ispc.cpp b/ispc.cpp
index db4c161a..419c64ab 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -102,6 +102,22 @@ static void __cpuidex(int info[4], int level, int count) {
 }
 #endif // !ISPC_IS_WINDOWS && !__ARM__
 
+#if !defined(__arm__)
+static bool __os_has_avx_support() {  // true when bits 1 and 2 of the xgetbv result are both set
+#if defined(ISPC_IS_WINDOWS)
+    // Check if the OS will save the YMM registers
+    unsigned long long xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
+    return (xcrFeatureMask & 6) == 6;
+#else // defined(ISPC_IS_WINDOWS)
+    // Check xgetbv; this uses a .byte sequence instead of the instruction
+    // directly because older assemblers do not include support for xgetbv and
+    // there is no easy way to conditionally compile based on the assembler used.
+    int rEAX, rEDX;  // rEDX is written by the asm but not read afterwards
+    __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0));
+    return (rEAX & 6) == 6;  // same mask test as the Windows branch above
+#endif // !defined(ISPC_IS_WINDOWS)
+}
+#endif // !__arm__
 
 static const char *
 lGetSystemISA() {
@@ -111,7 +127,8 @@ lGetSystemISA() {
     int info[4];
     __cpuid(info, 1);
 
-    if ((info[2] & (1 << 28)) != 0) {  // AVX
+    if ((info[2] & (1 << 27)) != 0 &&  // OSXSAVE: xgetbv raises #UD if this is clear
+        (info[2] & (1 << 28)) != 0 && __os_has_avx_support()) {  // AVX
         // AVX1 for sure....
         // Ivy Bridge?
         if ((info[2] & (1 << 29)) != 0 &&  // F16C

From 1f0f852fda1209aabf8f773c0ec6d8266f236296 Mon Sep 17 00:00:00 2001
From: Dmitry Babokin <babokin@gmail.com>
Date: Mon, 28 Oct 2013 22:54:14 +0400
Subject: [PATCH 18/24] Standalone checker for detecting the best ISA supported
 on the host

---
 check_isa.cpp | 129 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 129 insertions(+)
 create mode 100644 check_isa.cpp

diff --git a/check_isa.cpp b/check_isa.cpp
new file mode 100644
index 00000000..3f8b487d
--- /dev/null
+++ b/check_isa.cpp
@@ -0,0 +1,129 @@
+/*
+  Copyright (c) 2013, Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// This file is a standalone program, which detects the best supported ISA.  //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+
+
+#include <stdio.h>
+
+#if defined(_WIN32) || defined(_WIN64)
+#define ISPC_IS_WINDOWS
+#include <intrin.h>
+#endif
+
+#if !defined (__arm__)
+#if !defined(ISPC_IS_WINDOWS)
+static void __cpuid(int info[4], int infoType) {  // runs cpuid; info[0..3] receive the a/b/c/d outputs
+    __asm__ __volatile__ ("cpuid"
+                          : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
+                          : "0" (infoType));  // "0": infoType is passed in the same register as output 0
+}
+
+/* Save %ebx in case it's the PIC register: it is exchanged with operand 1 before and after cpuid */
+static void __cpuidex(int info[4], int level, int count) {
+  __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t"
+                        "cpuid\n\t"
+                        "xchg{l}\t{%%}ebx, %1\n\t"
+                        : "=a" (info[0]), "=r" (info[1]), "=c" (info[2]), "=d" (info[3])
+                        : "0" (level), "2" (count));  // level shares output 0's register, count shares output 2's
+}
+#endif // !ISPC_IS_WINDOWS
+
+static bool __os_has_avx_support() {  // true when bits 1 and 2 of the xgetbv result are both set
+#if defined(ISPC_IS_WINDOWS)
+    // Check if the OS will save the YMM registers
+    unsigned long long xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
+    return (xcrFeatureMask & 6) == 6;
+#else // defined(ISPC_IS_WINDOWS)
+    // Check xgetbv; this uses a .byte sequence instead of the instruction
+    // directly because older assemblers do not include support for xgetbv and
+    // there is no easy way to conditionally compile based on the assembler used.
+    int rEAX, rEDX;  // rEDX is written by the asm but not read afterwards
+    __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0));
+    return (rEAX & 6) == 6;  // same mask test as the Windows branch above
+#endif // !defined(ISPC_IS_WINDOWS)
+}
+#endif // !__arm__
+// Returns a printable name for the best instruction set usable on this host:
+// one of the AVX/SSE strings below, "ARM NEON" on ARM, or "Error" if none match.
+static const char *
+lGetSystemISA() {
+#ifdef __arm__
+    return "ARM NEON";
+#else
+    int info[4];
+    __cpuid(info, 1);
+
+    // xgetbv (issued by __os_has_avx_support) raises #UD unless the OS has
+    // enabled XSAVE, so OSXSAVE must be tested before it is ever executed.
+    if ((info[2] & (1 << 27)) != 0 &&  // OSXSAVE
+        (info[2] & (1 << 28)) != 0 &&  // AVX
+        __os_has_avx_support()) {
+        // AVX1 for sure....
+        // Ivy Bridge?
+        if ((info[2] & (1 << 29)) != 0 &&  // F16C
+            (info[2] & (1 << 30)) != 0) {  // RDRAND
+            // So far, so good.  AVX2?
+            // Call cpuid with eax=7, ecx=0
+            int info2[4];
+            __cpuidex(info2, 7, 0);
+            if ((info2[1] & (1 << 5)) != 0)
+                return "AVX2 (codename Haswell)";
+            return "AVX1.1 (codename Ivy Bridge)";
+        }
+        // Regular AVX
+        return "AVX (codename Sandy Bridge)";
+    }
+    if ((info[2] & (1 << 19)) != 0) {
+        return "SSE4";
+    }
+    if ((info[3] & (1 << 26)) != 0) {
+        return "SSE2";
+    }
+    return "Error";
+#endif
+}
+
+int main () {
+    // Print the detected ISA name on one line, then exit successfully.
+    printf("ISA: %s\n", lGetSystemISA());
+
+    return 0;
+}

From 9ba7b96825ec29e18372df72dadcc988ea5c0a52 Mon Sep 17 00:00:00 2001
From: "james.brodman" <james.brodman@intel.com>
Date: Mon, 28 Oct 2013 16:14:31 -0400
Subject: [PATCH 19/24] Make the new optimization play nicely with the others.

---
 opt.cpp | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/opt.cpp b/opt.cpp
index b018d35d..59f00538 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -536,12 +536,6 @@ Optimize(llvm::Module *module, int optLevel) {
         }
         optPM.add(llvm::createDeadInstEliminationPass(), 220);
 
-        if (g->target->getISA() != Target::GENERIC) {
-          // Just use the builtins for generic targets.
-          optPM.add(llvm::createIPConstantPropagationPass());
-          optPM.add(CreateReplaceStdlibShiftPass());
-        }
-
         // Max struct size threshold for scalar replacement is
         //    1) 4 fields (r,g,b,w)
         //    2) field size: vectorWidth * sizeof(float)
@@ -556,7 +550,12 @@ Optimize(llvm::Module *module, int optLevel) {
         optPM.add(llvm::createGlobalOptimizerPass());
         optPM.add(llvm::createReassociatePass());
         optPM.add(llvm::createIPConstantPropagationPass());
-        optPM.add(llvm::createDeadArgEliminationPass());
+        if (g->target->getISA() != Target::GENERIC) {
+          // Just use the builtins for generic targets.
+
+          optPM.add(CreateReplaceStdlibShiftPass(),229);
+        }
+        optPM.add(llvm::createDeadArgEliminationPass(),230);
         optPM.add(llvm::createInstructionCombiningPass());
         optPM.add(llvm::createCFGSimplificationPass());
         optPM.add(llvm::createPruneEHPass());

From 1b8e745ffe68d4d9107d9b910f1f2c5892362b4c Mon Sep 17 00:00:00 2001
From: "james.brodman" <james.brodman@intel.com>
Date: Mon, 28 Oct 2013 16:36:59 -0400
Subject: [PATCH 20/24] remove condition.  Don't use gcc 4.7 for tests.

---
 opt.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/opt.cpp b/opt.cpp
index 59f00538..e585d6c1 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -550,11 +550,11 @@ Optimize(llvm::Module *module, int optLevel) {
         optPM.add(llvm::createGlobalOptimizerPass());
         optPM.add(llvm::createReassociatePass());
         optPM.add(llvm::createIPConstantPropagationPass());
-        if (g->target->getISA() != Target::GENERIC) {
+        //        if (g->target->getISA() != Target::GENERIC) {
           // Just use the builtins for generic targets.
 
           optPM.add(CreateReplaceStdlibShiftPass(),229);
-        }
+          //        }
         optPM.add(llvm::createDeadArgEliminationPass(),230);
         optPM.add(llvm::createInstructionCombiningPass());
         optPM.add(llvm::createCFGSimplificationPass());

From 09a6e371541d6578a59c00c2aeea66443bb1c0b7 Mon Sep 17 00:00:00 2001
From: "james.brodman" <james.brodman@intel.com>
Date: Mon, 28 Oct 2013 16:37:33 -0400
Subject: [PATCH 21/24] Source cleanup.

---
 opt.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/opt.cpp b/opt.cpp
index e585d6c1..77fb9f21 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -550,11 +550,7 @@ Optimize(llvm::Module *module, int optLevel) {
         optPM.add(llvm::createGlobalOptimizerPass());
         optPM.add(llvm::createReassociatePass());
         optPM.add(llvm::createIPConstantPropagationPass());
-        //        if (g->target->getISA() != Target::GENERIC) {
-          // Just use the builtins for generic targets.
-
-          optPM.add(CreateReplaceStdlibShiftPass(),229);
-          //        }
+        optPM.add(CreateReplaceStdlibShiftPass(),229);
         optPM.add(llvm::createDeadArgEliminationPass(),230);
         optPM.add(llvm::createInstructionCombiningPass());
         optPM.add(llvm::createCFGSimplificationPass());

From 8ee317816607b6abef892fbc6a04ccf886518b99 Mon Sep 17 00:00:00 2001
From: "james.brodman" <james.brodman@intel.com>
Date: Mon, 28 Oct 2013 16:51:02 -0400
Subject: [PATCH 22/24] Add Performance Warning

---
 opt.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/opt.cpp b/opt.cpp
index 77fb9f21..bb788a8e 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -4998,6 +4998,8 @@ ReplaceStdlibShiftPass::runOnBasicBlock(llvm::BasicBlock &bb) {
                                                                    shuffleIdxs, "vecShift", ci);
                 ci->replaceAllUsesWith(shuffle);
                 modifiedAny = true;
+              } else {
+                PerformanceWarning(SourcePos(), "Stdlib shift() called without constant shift amount."); 
               }
             }
           }

From e682b19edacb2713453d229961f9e33ed6b82bd1 Mon Sep 17 00:00:00 2001
From: "james.brodman" <james.brodman@intel.com>
Date: Mon, 28 Oct 2013 17:13:07 -0400
Subject: [PATCH 23/24] Remove zero initialization for __vec4_i32

---
 examples/intrinsics/sse4.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h
index 67c46848..919716be 100644
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -108,7 +108,7 @@ struct __vec4_i64 {
 };
 
 struct __vec4_i32 {
-    FORCEINLINE __vec4_i32() : v(_mm_setzero_si128()) { }
+    FORCEINLINE __vec4_i32() { }
     FORCEINLINE __vec4_i32(__m128i vv) : v(vv) {  }
     FORCEINLINE __vec4_i32(int32_t a, int32_t b, int32_t c, int32_t d) {
         v = _mm_set_epi32(d, c, b, a);

From 362ee06b9f14c4ebb8672ba5d11dc3d5f7c6fac5 Mon Sep 17 00:00:00 2001
From: Dmitry Babokin <babokin@gmail.com>
Date: Tue, 29 Oct 2013 01:35:26 +0400
Subject: [PATCH 24/24] Typo fix

---
 check_isa.cpp | 2 +-
 ispc.cpp      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/check_isa.cpp b/check_isa.cpp
index 3f8b487d..a4d10606 100644
--- a/check_isa.cpp
+++ b/check_isa.cpp
@@ -69,7 +69,7 @@ static bool __os_has_avx_support() {
     // Check if the OS will save the YMM registers
     unsigned long long xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
     return (xcrFeatureMask & 6) == 6;
-#else // defined(ISPC_IS_WINDOWS)
+#else // !defined(ISPC_IS_WINDOWS)
     // Check xgetbv; this uses a .byte sequence instead of the instruction
     // directly because older assemblers do not include support for xgetbv and
     // there is no easy way to conditionally compile based on the assembler used.
diff --git a/ispc.cpp b/ispc.cpp
index 419c64ab..859865a5 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -108,7 +108,7 @@ static bool __os_has_avx_support() {
     // Check if the OS will save the YMM registers
     unsigned long long xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
     return (xcrFeatureMask & 6) == 6;
-#else // defined(ISPC_IS_WINDOWS)
+#else // !defined(ISPC_IS_WINDOWS)
     // Check xgetbv; this uses a .byte sequence instead of the instruction
     // directly because older assemblers do not include support for xgetbv and
     // there is no easy way to conditionally compile based on the assembler used.