From 0277ba1aaa8a3c2b9441b149942bbc9c0ed3be5d Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Tue, 23 Jul 2013 16:49:07 -0700
Subject: [PATCH 001/124] Improve warnings for right shift by varying amounts.

Fixes:
- Don't issue a warning when the shift is by the same amount in all vector
  lanes.
- Do issue a warning when it's a compile-time constant but the values are
  different in different lanes.

Previously, we warned iff the shift amount wasn't a compile-time constant.
---
 expr.cpp | 39 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

diff --git a/expr.cpp b/expr.cpp
index fc3d295a..894942d2 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -1911,6 +1911,40 @@ lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1,
 }
 
 
+/* Returns true if shifting right by the given amount will lead to
+   inefficient code.  (Assumes x86 target.  May also warn inaccurately if
+   later optimizations simplify the shift amount more than we are able to
+   see at this point.) */
+static bool
+lIsDifficultShiftAmount(Expr *expr) {
+    // Uniform shifts (of uniform values) are no problem.
+    if (expr->GetType()->IsVaryingType() == false)
+        return false;
+
+    ConstExpr *ce = dynamic_cast<ConstExpr *>(expr);
+    if (ce) {
+        // If the shift is by a constant amount, *and* it's the same amount
+        // in all vector lanes, we're in good shape.
+        uint32_t amount[ISPC_MAX_NVEC];
+        int count = ce->GetValues(amount);
+        for (int i = 1; i < count; ++i)
+            if (amount[i] != amount[0])
+                return true;
+        return false;
+    }
+
+    TypeCastExpr *tce = dynamic_cast<TypeCastExpr *>(expr);
+    if (tce && tce->expr) {
+        // Finally, if the shift amount is given by a uniform value that's
+        // been smeared out into a varying, we have the same shift for all
+        // lanes and are also in good shape.
+        return (tce->expr->GetType()->IsUniformType() == false);
+    }
+
+    return true;
+}
+
+
 llvm::Value *
 BinaryExpr::GetValue(FunctionEmitContext *ctx) const {
     if (!arg0 || !arg1) {
@@ -1951,9 +1985,8 @@ BinaryExpr::GetValue(FunctionEmitContext *ctx) const {
     case BitAnd:
     case BitXor:
     case BitOr: {
-        if (op == Shr && arg1->GetType()->IsVaryingType() &&
-            dynamic_cast<ConstExpr *>(arg1) == NULL)
-            PerformanceWarning(pos, "Shift right is extremely inefficient for "
+        if (op == Shr && lIsDifficultShiftAmount(arg1))
+            PerformanceWarning(pos, "Shift right is inefficient for "
                                "varying shift amounts.");
         return lEmitBinaryBitOp(op, value0, value1,
                                 arg0->GetType()->IsUnsignedType(), ctx);

From 83e1630fbcfde4aa67b50245cd96e36cbe033660 Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Tue, 23 Jul 2013 16:49:56 -0700
Subject: [PATCH 002/124] Add support for fast division of varying int values
 by small constants.

For varying int8/16/32 types, divides by small constants can be
implemented efficiently through multiplies and shifts with integer types
of twice the bit-width; this commit adds this optimization.
(Implementation is based on Halide.)
---
 expr.cpp        |  69 +++++
 stdlib.ispc     | 675 ++++++++++++++++++++++++++++++++++++++++++++++++
 tests/idiv.ispc |  75 ++++++
 3 files changed, 819 insertions(+)
 create mode 100644 tests/idiv.ispc

diff --git a/expr.cpp b/expr.cpp
index 894942d2..3baaabaf 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -2240,6 +2240,49 @@ lConstFoldBinaryIntOp(ConstExpr *constArg0, ConstExpr *constArg1,
 }
 
 
+/* Returns true if the given arguments (which are assumed to be the
+   operands of a divide) represent a divide that can be performed by one of
+   the __fast_idiv functions.
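+   (This only checks that the value being divided is a varying int8/16/32
+   and that the divisor is the same compile-time constant in every lane,
+   within the range the lookup tables cover; the multiply-and-shift
+   arithmetic itself is done by the __fast_idiv() overloads defined in
+   stdlib.ispc.)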
+ */ +static bool +lCanImproveVectorDivide(Expr *arg0, Expr *arg1, int *divisor) { + const Type *type = arg0->GetType(); + if (!type) + return false; + + // The value being divided must be an int8/16/32. + if (!(Type::EqualIgnoringConst(type, AtomicType::VaryingInt8) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt8) || + Type::EqualIgnoringConst(type, AtomicType::VaryingInt16) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt16) || + Type::EqualIgnoringConst(type, AtomicType::VaryingInt32) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt32))) + return false; + + // The divisor must be the same compile-time constant value for all of + // the vector lanes. + ConstExpr *ce = dynamic_cast(arg1); + if (!ce) + return false; + int64_t div[ISPC_MAX_NVEC]; + int count = ce->GetValues(div); + for (int i = 1; i < count; ++i) + if (div[i] != div[0]) + return false; + *divisor = div[0]; + + // And finally, the divisor must be >= 2 and <128 (for 8-bit divides), + // and <256 otherwise. + if (*divisor < 2) + return false; + if (Type::EqualIgnoringConst(type, AtomicType::VaryingInt8) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt8)) + return *divisor < 128; + else + return *divisor < 256; +} + + Expr * BinaryExpr::Optimize() { if (arg0 == NULL || arg1 == NULL) @@ -2302,6 +2345,32 @@ BinaryExpr::Optimize() { } } + int divisor; + if (op == Div && lCanImproveVectorDivide(arg0, arg1, &divisor)) { + Debug(pos, "Improving vector divide by constant %d", divisor); + + std::vector idivFuns; + m->symbolTable->LookupFunction("__fast_idiv", &idivFuns); + if (idivFuns.size() == 0) { + Warning(pos, "Couldn't find __fast_idiv to optimize integer divide. " + "Are you compiling with --nostdlib?"); + return this; + } + + Expr *idivSymExpr = new FunctionSymbolExpr("__fast_idiv", idivFuns, pos); + ExprList *args = new ExprList(arg0, pos); + args->exprs.push_back(new ConstExpr(AtomicType::UniformInt32, divisor, arg1->pos)); + Expr *idivCall = new FunctionCallExpr(idivSymExpr, args, pos); + + idivCall = ::TypeCheck(idivCall); + if (idivCall == NULL) + return NULL; + + Assert(Type::EqualIgnoringConst(GetType(), idivCall->GetType())); + idivCall = new TypeCastExpr(GetType(), idivCall, pos); + return ::Optimize(idivCall); + } + // From here on out, we're just doing constant folding, so if both args // aren't constants then we're done... if (constArg0 == NULL || constArg1 == NULL) diff --git a/stdlib.ispc b/stdlib.ispc index 4e06f5da..b8ed2057 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -4264,3 +4264,678 @@ static inline bool rdrand(int64 * ptr) { return success; } } + +/////////////////////////////////////////////////////////////////////////// +// Fast vector integer division + +/* These tables and the algorithms in the __fast_idiv() functions below are + from Halide; the idea is based on the paper "Division by Invariant + Integers using Multiplication" by Granlund and Montgomery. 
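+
+   Each __idiv_table_* entry below is indexed by (divisor - 2) and holds a
+   {method, multiplier, shift} triple that the __fast_idiv() overloads
+   after the tables interpret roughly as follows: method 0 means the
+   divisor is a power of two, so a plain shift (numerator >> shift)
+   suffices; method 1 multiplies the numerator by the magic constant in
+   double-width arithmetic and shifts right by (bit width + shift); and
+   method 2 adds a correction term after the high multiply before the
+   final shift.  For example, the unsigned 32-bit entry for dividing by 3
+   is {1, 2863311531, 1}, i.e. n / 3 == (n * 2863311531) >> 33.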
+ + Copyright (c) 2012 MIT CSAIL + + Developed by: + + The Halide team + MIT CSAIL + http://halide-lang.org + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +static const uniform int64 __idiv_table_u8[][3] = { + {0, 0LL, 1}, {1, 171LL, 1}, {0, 0LL, 2}, + {1, 205LL, 2}, {1, 171LL, 2}, {2, 37LL, 2}, + {0, 0LL, 3}, {1, 57LL, 1}, {1, 205LL, 3}, + {2, 117LL, 3}, {1, 171LL, 3}, {1, 79LL, 2}, + {2, 37LL, 3}, {1, 137LL, 3}, {0, 0LL, 4}, + {1, 241LL, 4}, {1, 57LL, 2}, {1, 27LL, 1}, + {1, 205LL, 4}, {2, 135LL, 4}, {2, 117LL, 4}, + {2, 101LL, 4}, {1, 171LL, 4}, {1, 41LL, 2}, + {1, 79LL, 3}, {1, 19LL, 1}, {2, 37LL, 4}, + {2, 27LL, 4}, {1, 137LL, 4}, {2, 9LL, 4}, + {0, 0LL, 5}, {1, 249LL, 5}, {1, 241LL, 5}, + {1, 235LL, 5}, {1, 57LL, 3}, {1, 111LL, 4}, + {1, 27LL, 2}, {2, 165LL, 5}, {1, 205LL, 5}, + {1, 25LL, 2}, {2, 135LL, 5}, {1, 191LL, 5}, + {1, 187LL, 5}, {2, 109LL, 5}, {2, 101LL, 5}, + {1, 175LL, 5}, {1, 171LL, 5}, {2, 79LL, 5}, + {1, 41LL, 3}, {1, 161LL, 5}, {1, 79LL, 4}, + {1, 155LL, 5}, {1, 19LL, 2}, {1, 149LL, 5}, + {2, 37LL, 5}, {1, 9LL, 1}, {2, 27LL, 5}, + {1, 139LL, 5}, {1, 137LL, 5}, {2, 13LL, 5}, + {2, 9LL, 5}, {2, 5LL, 5}, {0, 0LL, 6}, + {1, 253LL, 6}, {1, 249LL, 6}, {1, 245LL, 6}, + {1, 121LL, 5}, {1, 119LL, 5}, {1, 235LL, 6}, + {1, 231LL, 6}, {1, 57LL, 4}, {1, 225LL, 6}, + {1, 111LL, 5}, {1, 219LL, 6}, {1, 27LL, 3}, + {1, 213LL, 6}, {2, 165LL, 6}, {1, 13LL, 2}, + {1, 205LL, 6}, {1, 203LL, 6}, {1, 25LL, 3}, + {1, 99LL, 5}, {2, 135LL, 6}, {1, 193LL, 6}, + {1, 191LL, 6}, {1, 189LL, 6}, {1, 187LL, 6}, + {1, 185LL, 6}, {1, 183LL, 6}, {1, 181LL, 6}, + {1, 179LL, 6}, {1, 177LL, 6}, {1, 175LL, 6}, + {1, 173LL, 6}, {1, 171LL, 6}, {1, 169LL, 6}, + {1, 21LL, 3}, {1, 83LL, 5}, {1, 41LL, 4}, + {1, 163LL, 6}, {1, 161LL, 6}, {2, 63LL, 6}, + {1, 79LL, 5}, {2, 57LL, 6}, {1, 155LL, 6}, + {2, 51LL, 6}, {1, 19LL, 3}, {1, 151LL, 6}, + {1, 149LL, 6}, {1, 37LL, 4}, {2, 37LL, 6}, + {1, 145LL, 6}, {1, 9LL, 2}, {1, 143LL, 6}, + {2, 27LL, 6}, {2, 25LL, 6}, {1, 139LL, 6}, + {1, 69LL, 5}, {1, 137LL, 6}, {2, 15LL, 6}, + {2, 13LL, 6}, {2, 11LL, 6}, {2, 9LL, 6}, + {2, 7LL, 6}, {2, 5LL, 6}, {2, 3LL, 6}, + {0, 0LL, 7}, {1, 255LL, 7}, {1, 127LL, 6}, + {1, 63LL, 5}, {1, 125LL, 6}, {1, 31LL, 4}, + {1, 123LL, 6}, {1, 61LL, 5}, {1, 121LL, 6}, + {1, 15LL, 3}, {1, 119LL, 6}, {1, 59LL, 5}, + {1, 235LL, 7}, {1, 117LL, 6}, {1, 29LL, 4}, + {1, 115LL, 6}, {1, 57LL, 5}, {1, 113LL, 6}, + {1, 225LL, 7}, {1, 7LL, 2}, {1, 111LL, 6}, + {1, 55LL, 5}, {1, 219LL, 7}, {1, 109LL, 6}, + {1, 27LL, 4}, {1, 215LL, 7}, {1, 107LL, 6}, + {1, 53LL, 5}, 
{1, 211LL, 7}, {1, 105LL, 6}, + {1, 13LL, 3}, {1, 207LL, 7}, {1, 103LL, 6}, + {1, 51LL, 5}, {1, 203LL, 7}, {1, 101LL, 6}, + {1, 25LL, 4}, {1, 199LL, 7}, {1, 99LL, 6}, + {1, 197LL, 7}, {1, 49LL, 5}, {1, 97LL, 6}, + {1, 193LL, 7}, {1, 3LL, 1}, {1, 191LL, 7}, + {1, 95LL, 6}, {1, 189LL, 7}, {1, 47LL, 5}, + {1, 187LL, 7}, {1, 93LL, 6}, {1, 185LL, 7}, + {1, 23LL, 4}, {1, 183LL, 7}, {1, 91LL, 6}, + {1, 181LL, 7}, {1, 45LL, 5}, {1, 179LL, 7}, + {1, 89LL, 6}, {1, 177LL, 7}, {1, 11LL, 3}, + {1, 175LL, 7}, {1, 87LL, 6}, {1, 173LL, 7}, + {1, 43LL, 5}, {1, 171LL, 7}, {1, 85LL, 6}, + {1, 169LL, 7}, {2, 81LL, 7}, {1, 21LL, 4}, + {1, 167LL, 7}, {1, 83LL, 6}, {1, 165LL, 7}, + {1, 41LL, 5}, {2, 71LL, 7}, {1, 163LL, 7}, + {1, 81LL, 6}, {1, 161LL, 7}, {1, 5LL, 2}, + {2, 63LL, 7}, {1, 159LL, 7}, {1, 79LL, 6}, + {1, 157LL, 7}, {2, 57LL, 7}, {1, 39LL, 5}, + {1, 155LL, 7}, {1, 77LL, 6}, {2, 51LL, 7}, + {1, 153LL, 7}, {1, 19LL, 4}, {2, 47LL, 7}, + {1, 151LL, 7}, {1, 75LL, 6}, {1, 149LL, 7}, + {2, 41LL, 7}, {1, 37LL, 5}, {1, 147LL, 7}, + {2, 37LL, 7}, {1, 73LL, 6}, {1, 145LL, 7}, + {2, 33LL, 7}, {1, 9LL, 3}, {2, 31LL, 7}, + {1, 143LL, 7}, {1, 71LL, 6}, {2, 27LL, 7}, + {1, 141LL, 7}, {2, 25LL, 7}, {1, 35LL, 5}, + {1, 139LL, 7}, {2, 21LL, 7}, {1, 69LL, 6}, + {2, 19LL, 7}, {1, 137LL, 7}, {1, 17LL, 4}, + {2, 15LL, 7}, {1, 135LL, 7}, {2, 13LL, 7}, + {1, 67LL, 6}, {2, 11LL, 7}, {1, 133LL, 7}, + {2, 9LL, 7}, {1, 33LL, 5}, {2, 7LL, 7}, + {1, 131LL, 7}, {2, 5LL, 7}, {1, 65LL, 6}, + {2, 3LL, 7}, {1, 129LL, 7}, {0, 0LL, 8}, +}; +static const uniform int64 __idiv_table_s8[][3] = { + {0, 0LL, 1}, {1, 86LL, 0}, {0, 0LL, 2}, + {1, 103LL, 1}, {1, 43LL, 0}, {1, 147LL, 2}, + {0, 0LL, 3}, {1, 57LL, 1}, {1, 103LL, 2}, + {1, 187LL, 3}, {1, 43LL, 1}, {1, 79LL, 2}, + {1, 147LL, 3}, {1, 137LL, 3}, {0, 0LL, 4}, + {1, 121LL, 3}, {1, 57LL, 2}, {1, 27LL, 1}, + {1, 103LL, 3}, {1, 49LL, 2}, {1, 187LL, 4}, + {1, 179LL, 4}, {1, 43LL, 2}, {1, 41LL, 2}, + {1, 79LL, 3}, {1, 19LL, 1}, {1, 147LL, 4}, + {1, 71LL, 3}, {1, 137LL, 4}, {1, 133LL, 4}, + {0, 0LL, 5}, {1, 125LL, 4}, {1, 121LL, 4}, + {1, 59LL, 3}, {1, 57LL, 3}, {1, 111LL, 4}, + {1, 27LL, 2}, {1, 211LL, 5}, {1, 103LL, 4}, + {1, 25LL, 2}, {1, 49LL, 3}, {1, 6LL, 0}, + {1, 47LL, 3}, {1, 23LL, 2}, {1, 45LL, 3}, + {1, 11LL, 1}, {1, 43LL, 3}, {1, 21LL, 2}, + {1, 41LL, 3}, {1, 81LL, 4}, {1, 79LL, 4}, + {1, 39LL, 3}, {1, 19LL, 2}, {1, 75LL, 4}, + {1, 147LL, 5}, {1, 9LL, 1}, {1, 71LL, 4}, + {1, 35LL, 3}, {1, 137LL, 5}, {1, 135LL, 5}, + {1, 133LL, 5}, {1, 131LL, 5}, {0, 0LL, 6}, + {1, 127LL, 5}, {1, 63LL, 4}, {1, 31LL, 3}, + {1, 61LL, 4}, {1, 15LL, 2}, {1, 59LL, 4}, + {1, 29LL, 3}, {1, 57LL, 4}, {1, 113LL, 5}, + {1, 7LL, 1}, {1, 55LL, 4}, {1, 27LL, 3}, + {1, 107LL, 5}, {1, 53LL, 4}, {1, 13LL, 2}, + {1, 103LL, 5}, {1, 51LL, 4}, {1, 25LL, 3}, + {1, 99LL, 5}, {1, 49LL, 4}, {1, 97LL, 5}, + {1, 3LL, 0}, {1, 95LL, 5}, {1, 47LL, 4}, + {1, 93LL, 5}, {1, 23LL, 3}, {1, 91LL, 5}, + {1, 45LL, 4}, {1, 89LL, 5}, {1, 11LL, 2}, + {1, 87LL, 5}, {1, 43LL, 4}, {1, 85LL, 5}, + {1, 21LL, 3}, {1, 83LL, 5}, {1, 41LL, 4}, + {1, 163LL, 6}, {1, 81LL, 5}, {1, 5LL, 1}, + {1, 79LL, 5}, {1, 157LL, 6}, {1, 39LL, 4}, + {1, 77LL, 5}, {1, 19LL, 3}, {1, 151LL, 6}, + {1, 75LL, 5}, {1, 37LL, 4}, {1, 147LL, 6}, + {1, 73LL, 5}, {1, 9LL, 2}, {1, 143LL, 6}, + {1, 71LL, 5}, {1, 141LL, 6}, {1, 35LL, 4}, + {1, 69LL, 5}, {1, 137LL, 6}, {1, 17LL, 3}, + {1, 135LL, 6}, {1, 67LL, 5}, {1, 133LL, 6}, + {1, 33LL, 4}, {1, 131LL, 6}, {1, 65LL, 5}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, 
{0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, +}; +static const uniform int64 __idiv_table_u16[][3] = { + {0, 0LL, 1}, {1, 43691LL, 1}, {0, 0LL, 2}, + {1, 52429LL, 2}, {1, 43691LL, 2}, {2, 9363LL, 2}, + {0, 0LL, 3}, {1, 58255LL, 3}, {1, 52429LL, 3}, + {1, 47663LL, 3}, {1, 43691LL, 3}, {1, 20165LL, 2}, + {2, 9363LL, 3}, {1, 34953LL, 3}, {0, 0LL, 4}, + {1, 61681LL, 4}, {1, 58255LL, 4}, {1, 55189LL, 4}, + {1, 52429LL, 4}, {2, 34329LL, 4}, {1, 47663LL, 4}, + {2, 25645LL, 4}, {1, 43691LL, 4}, {2, 18351LL, 4}, + {1, 20165LL, 3}, {2, 12137LL, 4}, {2, 9363LL, 4}, + {1, 18079LL, 3}, {1, 34953LL, 4}, {2, 2115LL, 4}, + {0, 0LL, 5}, {1, 63551LL, 5}, {1, 61681LL, 5}, + {1, 59919LL, 5}, {1, 58255LL, 5}, {1, 7085LL, 2}, + {1, 55189LL, 5}, {2, 42011LL, 5}, {1, 52429LL, 5}, + {2, 36765LL, 5}, {2, 34329LL, 5}, {1, 48771LL, 5}, + {1, 47663LL, 5}, {1, 11651LL, 3}, {2, 25645LL, 5}, + {2, 23705LL, 5}, {1, 43691LL, 5}, {2, 20063LL, 5}, + {2, 18351LL, 5}, {1, 41121LL, 5}, {1, 20165LL, 4}, + {1, 39569LL, 5}, {2, 12137LL, 5}, {2, 10725LL, 5}, + {2, 9363LL, 5}, {2, 8049LL, 5}, {1, 18079LL, 4}, + {1, 35545LL, 5}, {1, 34953LL, 5}, {1, 8595LL, 3}, + {2, 2115LL, 5}, {2, 1041LL, 5}, {0, 0LL, 6}, + {1, 4033LL, 2}, {1, 63551LL, 6}, {1, 31301LL, 5}, + {1, 61681LL, 6}, {2, 56039LL, 6}, {1, 59919LL, 6}, + {1, 59075LL, 6}, {1, 58255LL, 6}, {1, 57457LL, 6}, + {1, 7085LL, 3}, {2, 46313LL, 6}, {1, 55189LL, 6}, + {1, 6809LL, 3}, {2, 42011LL, 6}, {1, 53093LL, 6}, + {1, 52429LL, 6}, {1, 25891LL, 5}, {2, 36765LL, 6}, + {1, 25267LL, 5}, {2, 34329LL, 6}, {1, 49345LL, 6}, + {1, 48771LL, 6}, {1, 48211LL, 6}, {1, 47663LL, 6}, + {2, 28719LL, 6}, {1, 11651LL, 4}, {2, 26647LL, 6}, + {2, 25645LL, 6}, {2, 24665LL, 6}, {2, 23705LL, 6}, + {1, 44151LL, 6}, {1, 43691LL, 6}, {2, 20945LL, 6}, + {2, 20063LL, 6}, {1, 42367LL, 6}, {2, 18351LL, 6}, + {1, 5191LL, 3}, {1, 41121LL, 6}, {1, 20361LL, 5}, + {1, 20165LL, 5}, {1, 19973LL, 5}, {1, 39569LL, 6}, + {2, 12863LL, 6}, {2, 
12137LL, 6}, {1, 2405LL, 2}, + {2, 10725LL, 6}, {1, 37787LL, 6}, {2, 9363LL, 6}, + {1, 18559LL, 5}, {2, 8049LL, 6}, {2, 7409LL, 6}, + {1, 18079LL, 5}, {1, 35849LL, 6}, {1, 35545LL, 6}, + {2, 4957LL, 6}, {1, 34953LL, 6}, {1, 4333LL, 3}, + {1, 8595LL, 4}, {2, 2665LL, 6}, {2, 2115LL, 6}, + {2, 1573LL, 6}, {2, 1041LL, 6}, {2, 517LL, 6}, + {0, 0LL, 7}, {1, 16257LL, 5}, {1, 4033LL, 3}, + {1, 16009LL, 5}, {1, 63551LL, 7}, {1, 63073LL, 7}, + {1, 31301LL, 6}, {1, 31069LL, 6}, {1, 61681LL, 7}, + {1, 61231LL, 7}, {2, 56039LL, 7}, {1, 30175LL, 6}, + {1, 59919LL, 7}, {1, 29747LL, 6}, {1, 59075LL, 7}, + {1, 29331LL, 6}, {1, 58255LL, 7}, {1, 57853LL, 7}, + {1, 57457LL, 7}, {1, 28533LL, 6}, {1, 7085LL, 4}, + {1, 14075LL, 5}, {2, 46313LL, 7}, {1, 27777LL, 6}, + {1, 55189LL, 7}, {1, 13707LL, 5}, {1, 6809LL, 4}, + {2, 42705LL, 7}, {2, 42011LL, 7}, {1, 53431LL, 7}, + {1, 53093LL, 7}, {1, 52759LL, 7}, {1, 52429LL, 7}, + {2, 38671LL, 7}, {1, 25891LL, 6}, {1, 6433LL, 4}, + {2, 36765LL, 7}, {2, 36145LL, 7}, {1, 25267LL, 6}, + {2, 34927LL, 7}, {2, 34329LL, 7}, {1, 49637LL, 7}, + {1, 49345LL, 7}, {2, 32577LL, 7}, {1, 48771LL, 7}, + {2, 31443LL, 7}, {1, 48211LL, 7}, {1, 47935LL, 7}, + {1, 47663LL, 7}, {2, 29251LL, 7}, {2, 28719LL, 7}, + {1, 2929LL, 3}, {1, 11651LL, 5}, {1, 23173LL, 6}, + {2, 26647LL, 7}, {1, 2865LL, 3}, {2, 25645LL, 7}, + {1, 1417LL, 2}, {2, 24665LL, 7}, {1, 44859LL, 7}, + {2, 23705LL, 7}, {2, 23233LL, 7}, {1, 44151LL, 7}, + {1, 2745LL, 3}, {1, 43691LL, 7}, {2, 21393LL, 7}, + {2, 20945LL, 7}, {1, 43019LL, 7}, {2, 20063LL, 7}, + {1, 21291LL, 6}, {1, 42367LL, 7}, {1, 21077LL, 6}, + {2, 18351LL, 7}, {1, 41735LL, 7}, {1, 5191LL, 4}, + {2, 17111LL, 7}, {1, 41121LL, 7}, {2, 16305LL, 7}, + {1, 20361LL, 6}, {1, 40525LL, 7}, {1, 20165LL, 6}, + {1, 40137LL, 7}, {1, 19973LL, 6}, {1, 39757LL, 7}, + {1, 39569LL, 7}, {2, 13231LL, 7}, {2, 12863LL, 7}, + {1, 39017LL, 7}, {2, 12137LL, 7}, {2, 11779LL, 7}, + {1, 2405LL, 3}, {2, 11073LL, 7}, {2, 10725LL, 7}, + {1, 18979LL, 6}, {1, 37787LL, 7}, {2, 9699LL, 7}, + {2, 9363LL, 7}, {1, 37283LL, 7}, {1, 18559LL, 6}, + {2, 8373LL, 7}, {2, 8049LL, 7}, {1, 4579LL, 4}, + {2, 7409LL, 7}, {2, 7093LL, 7}, {1, 18079LL, 6}, + {1, 36003LL, 7}, {1, 35849LL, 7}, {2, 5857LL, 7}, + {1, 35545LL, 7}, {1, 35395LL, 7}, {2, 4957LL, 7}, + {1, 35099LL, 7}, {1, 34953LL, 7}, {1, 4351LL, 4}, + {1, 4333LL, 4}, {2, 3507LL, 7}, {1, 8595LL, 5}, + {2, 2943LL, 7}, {2, 2665LL, 7}, {1, 16981LL, 6}, + {2, 2115LL, 7}, {2, 1843LL, 7}, {2, 1573LL, 7}, + {1, 33421LL, 7}, {2, 1041LL, 7}, {1, 33157LL, 7}, + {2, 517LL, 7}, {1, 32897LL, 7}, {0, 0LL, 8}, +}; +static const uniform int64 __idiv_table_s16[][3] = { + {0, 0LL, 1}, {1, 21846LL, 0}, {0, 0LL, 2}, + {1, 26215LL, 1}, {1, 10923LL, 0}, {1, 18725LL, 1}, + {0, 0LL, 3}, {1, 7282LL, 0}, {1, 26215LL, 2}, + {1, 5958LL, 0}, {1, 10923LL, 1}, {1, 20165LL, 2}, + {1, 18725LL, 2}, {1, 34953LL, 3}, {0, 0LL, 4}, + {1, 30841LL, 3}, {1, 3641LL, 0}, {1, 55189LL, 4}, + {1, 26215LL, 3}, {1, 49933LL, 4}, {1, 2979LL, 0}, + {1, 45591LL, 4}, {1, 10923LL, 2}, {1, 5243LL, 1}, + {1, 20165LL, 3}, {1, 38837LL, 4}, {1, 18725LL, 3}, + {1, 18079LL, 3}, {1, 34953LL, 4}, {1, 16913LL, 3}, + {0, 0LL, 5}, {1, 1986LL, 0}, {1, 30841LL, 4}, + {1, 3745LL, 1}, {1, 3641LL, 1}, {1, 7085LL, 2}, + {1, 55189LL, 5}, {1, 26887LL, 4}, {1, 26215LL, 4}, + {1, 51151LL, 5}, {1, 49933LL, 5}, {1, 12193LL, 3}, + {1, 2979LL, 1}, {1, 11651LL, 3}, {1, 45591LL, 5}, + {1, 44621LL, 5}, {1, 10923LL, 3}, {1, 2675LL, 1}, + {1, 5243LL, 2}, {1, 41121LL, 5}, {1, 20165LL, 4}, + {1, 19785LL, 4}, {1, 38837LL, 5}, {1, 38131LL, 5}, 
+ {1, 18725LL, 4}, {1, 36793LL, 5}, {1, 18079LL, 4}, + {1, 17773LL, 4}, {1, 34953LL, 5}, {1, 8595LL, 3}, + {1, 16913LL, 4}, {1, 33289LL, 5}, {0, 0LL, 6}, + {1, 4033LL, 2}, {1, 993LL, 0}, {1, 31301LL, 5}, + {1, 30841LL, 5}, {1, 15197LL, 4}, {1, 3745LL, 2}, + {1, 14769LL, 4}, {1, 3641LL, 2}, {1, 57457LL, 6}, + {1, 7085LL, 3}, {1, 55925LL, 6}, {1, 55189LL, 6}, + {1, 6809LL, 3}, {1, 26887LL, 5}, {1, 26547LL, 5}, + {1, 26215LL, 5}, {1, 25891LL, 5}, {1, 51151LL, 6}, + {1, 25267LL, 5}, {1, 49933LL, 6}, {1, 24673LL, 5}, + {1, 12193LL, 4}, {1, 48211LL, 6}, {1, 2979LL, 2}, + {1, 5891LL, 3}, {1, 11651LL, 4}, {1, 11523LL, 4}, + {1, 45591LL, 6}, {1, 45101LL, 6}, {1, 44621LL, 6}, + {1, 44151LL, 6}, {1, 10923LL, 4}, {1, 43241LL, 6}, + {1, 2675LL, 2}, {1, 662LL, 0}, {1, 5243LL, 3}, + {1, 5191LL, 3}, {1, 41121LL, 6}, {1, 20361LL, 5}, + {1, 20165LL, 5}, {1, 19973LL, 5}, {1, 19785LL, 5}, + {1, 1225LL, 1}, {1, 38837LL, 6}, {1, 2405LL, 2}, + {1, 38131LL, 6}, {1, 37787LL, 6}, {1, 18725LL, 5}, + {1, 18559LL, 5}, {1, 36793LL, 6}, {1, 36473LL, 6}, + {1, 18079LL, 5}, {1, 35849LL, 6}, {1, 17773LL, 5}, + {1, 35247LL, 6}, {1, 34953LL, 6}, {1, 4333LL, 3}, + {1, 8595LL, 4}, {1, 34101LL, 6}, {1, 16913LL, 5}, + {1, 33555LL, 6}, {1, 33289LL, 6}, {1, 33027LL, 6}, + {0, 0LL, 7}, {1, 16257LL, 5}, {1, 4033LL, 3}, + {1, 16009LL, 5}, {1, 993LL, 1}, {1, 31537LL, 6}, + {1, 31301LL, 6}, {1, 31069LL, 6}, {1, 30841LL, 6}, + {1, 3827LL, 3}, {1, 15197LL, 5}, {1, 30175LL, 6}, + {1, 3745LL, 3}, {1, 29747LL, 6}, {1, 14769LL, 5}, + {1, 29331LL, 6}, {1, 3641LL, 3}, {1, 28927LL, 6}, + {1, 57457LL, 7}, {1, 28533LL, 6}, {1, 7085LL, 4}, + {1, 14075LL, 5}, {1, 55925LL, 7}, {1, 27777LL, 6}, + {1, 55189LL, 7}, {1, 13707LL, 5}, {1, 6809LL, 4}, + {1, 54121LL, 7}, {1, 26887LL, 6}, {1, 6679LL, 4}, + {1, 26547LL, 6}, {1, 6595LL, 4}, {1, 26215LL, 6}, + {1, 6513LL, 4}, {1, 25891LL, 6}, {1, 6433LL, 4}, + {1, 51151LL, 7}, {1, 50841LL, 7}, {1, 25267LL, 6}, + {1, 6279LL, 4}, {1, 49933LL, 7}, {1, 24819LL, 6}, + {1, 24673LL, 6}, {1, 49057LL, 7}, {1, 12193LL, 5}, + {1, 24245LL, 6}, {1, 48211LL, 7}, {1, 749LL, 1}, + {1, 2979LL, 3}, {1, 23697LL, 6}, {1, 5891LL, 4}, + {1, 2929LL, 3}, {1, 11651LL, 5}, {1, 23173LL, 6}, + {1, 11523LL, 5}, {1, 2865LL, 3}, {1, 45591LL, 7}, + {1, 1417LL, 2}, {1, 45101LL, 7}, {1, 11215LL, 5}, + {1, 44621LL, 7}, {1, 44385LL, 7}, {1, 44151LL, 7}, + {1, 2745LL, 3}, {1, 10923LL, 5}, {1, 43465LL, 7}, + {1, 43241LL, 7}, {1, 43019LL, 7}, {1, 2675LL, 3}, + {1, 21291LL, 6}, {1, 331LL, 0}, {1, 21077LL, 6}, + {1, 5243LL, 4}, {1, 41735LL, 7}, {1, 5191LL, 4}, + {1, 10331LL, 5}, {1, 41121LL, 7}, {1, 40921LL, 7}, + {1, 20361LL, 6}, {1, 40525LL, 7}, {1, 20165LL, 6}, + {1, 20069LL, 6}, {1, 19973LL, 6}, {1, 39757LL, 7}, + {1, 19785LL, 6}, {1, 4923LL, 4}, {1, 1225LL, 2}, + {1, 39017LL, 7}, {1, 38837LL, 7}, {1, 19329LL, 6}, + {1, 2405LL, 3}, {1, 38305LL, 7}, {1, 38131LL, 7}, + {1, 18979LL, 6}, {1, 37787LL, 7}, {1, 18809LL, 6}, + {1, 18725LL, 6}, {1, 37283LL, 7}, {1, 18559LL, 6}, + {1, 36955LL, 7}, {1, 36793LL, 7}, {1, 4579LL, 4}, + {1, 36473LL, 7}, {1, 36315LL, 7}, {1, 18079LL, 6}, + {1, 36003LL, 7}, {1, 35849LL, 7}, {1, 35697LL, 7}, + {1, 17773LL, 6}, {1, 8849LL, 5}, {1, 35247LL, 7}, + {1, 35099LL, 7}, {1, 34953LL, 7}, {1, 4351LL, 4}, + {1, 4333LL, 4}, {1, 17261LL, 6}, {1, 8595LL, 5}, + {1, 535LL, 1}, {1, 34101LL, 7}, {1, 16981LL, 6}, + {1, 16913LL, 6}, {1, 16845LL, 6}, {1, 33555LL, 7}, + {1, 33421LL, 7}, {1, 33289LL, 7}, {1, 33157LL, 7}, + {1, 33027LL, 7}, {1, 32897LL, 7}, {1, 32769LL, 7}, +}; +static const uniform int64 __idiv_table_u32[][3] = { + {0, 
0LL, 1}, {1, 2863311531LL, 1}, {0, 0LL, 2}, + {1, 3435973837LL, 2}, {1, 2863311531LL, 2}, {2, 613566757LL, 2}, + {0, 0LL, 3}, {1, 954437177LL, 1}, {1, 3435973837LL, 3}, + {1, 3123612579LL, 3}, {1, 2863311531LL, 3}, {1, 1321528399LL, 2}, + {2, 613566757LL, 3}, {1, 2290649225LL, 3}, {0, 0LL, 4}, + {1, 4042322161LL, 4}, {1, 954437177LL, 2}, {2, 2938661835LL, 4}, + {1, 3435973837LL, 4}, {2, 2249744775LL, 4}, {1, 3123612579LL, 4}, + {1, 2987803337LL, 4}, {1, 2863311531LL, 4}, {1, 1374389535LL, 3}, + {1, 1321528399LL, 3}, {2, 795364315LL, 4}, {2, 613566757LL, 4}, + {1, 2369637129LL, 4}, {1, 2290649225LL, 4}, {2, 138547333LL, 4}, + {0, 0LL, 5}, {1, 1041204193LL, 3}, {1, 4042322161LL, 5}, + {2, 3558687189LL, 5}, {1, 954437177LL, 3}, {2, 3134165325LL, 5}, + {2, 2938661835LL, 5}, {2, 2753184165LL, 5}, {1, 3435973837LL, 5}, + {1, 3352169597LL, 5}, {2, 2249744775LL, 5}, {1, 799063683LL, 3}, + {1, 3123612579LL, 5}, {2, 1813430637LL, 5}, {1, 2987803337LL, 5}, + {1, 2924233053LL, 5}, {1, 2863311531LL, 5}, {1, 1402438301LL, 4}, + {1, 1374389535LL, 4}, {1, 2694881441LL, 5}, {1, 1321528399LL, 4}, + {2, 891408307LL, 5}, {2, 795364315LL, 5}, {2, 702812831LL, 5}, + {2, 613566757LL, 5}, {2, 527452125LL, 5}, {1, 2369637129LL, 5}, + {1, 582368447LL, 3}, {1, 2290649225LL, 5}, {1, 1126548799LL, 4}, + {2, 138547333LL, 5}, {2, 68174085LL, 5}, {0, 0LL, 6}, + {1, 4228890877LL, 6}, {1, 1041204193LL, 4}, {1, 128207979LL, 1}, + {1, 4042322161LL, 6}, {1, 1991868891LL, 5}, {2, 3558687189LL, 6}, + {1, 3871519817LL, 6}, {1, 954437177LL, 4}, {2, 3235934265LL, 6}, + {2, 3134165325LL, 6}, {1, 458129845LL, 3}, {2, 2938661835LL, 6}, + {1, 892460737LL, 4}, {2, 2753184165LL, 6}, {1, 3479467177LL, 6}, + {1, 3435973837LL, 6}, {1, 3393554407LL, 6}, {1, 3352169597LL, 6}, + {1, 827945503LL, 4}, {2, 2249744775LL, 6}, {1, 3233857729LL, 6}, + {1, 799063683LL, 4}, {1, 789879043LL, 4}, {1, 3123612579LL, 6}, + {1, 3088515809LL, 6}, {2, 1813430637LL, 6}, {2, 1746305385LL, 6}, + {1, 2987803337LL, 6}, {1, 2955676419LL, 6}, {1, 2924233053LL, 6}, + {2, 1491936009LL, 6}, {1, 2863311531LL, 6}, {2, 1372618415LL, 6}, + {1, 1402438301LL, 5}, {1, 2776544515LL, 6}, {1, 1374389535LL, 5}, + {2, 1148159575LL, 6}, {1, 2694881441LL, 6}, {2, 1042467791LL, 6}, + {1, 1321528399LL, 5}, {2, 940802361LL, 6}, {2, 891408307LL, 6}, + {2, 842937507LL, 6}, {2, 795364315LL, 6}, {2, 748664025LL, 6}, + {2, 702812831LL, 6}, {2, 657787785LL, 6}, {2, 613566757LL, 6}, + {2, 570128403LL, 6}, {2, 527452125LL, 6}, {2, 485518043LL, 6}, + {1, 2369637129LL, 6}, {2, 403800345LL, 6}, {1, 582368447LL, 4}, + {1, 1154949189LL, 5}, {1, 2290649225LL, 6}, {2, 248469183LL, 6}, + {1, 1126548799LL, 5}, {2, 174592167LL, 6}, {2, 138547333LL, 6}, + {1, 274877907LL, 3}, {2, 68174085LL, 6}, {2, 33818641LL, 6}, + {0, 0LL, 7}, {1, 266354561LL, 3}, {1, 4228890877LL, 7}, + {1, 4196609267LL, 7}, {1, 1041204193LL, 5}, {1, 4133502361LL, 7}, + {1, 128207979LL, 2}, {1, 4072265289LL, 7}, {1, 4042322161LL, 7}, + {1, 125400505LL, 2}, {1, 1991868891LL, 6}, {1, 1977538899LL, 6}, + {2, 3558687189LL, 7}, {1, 974744351LL, 5}, {1, 3871519817LL, 7}, + {1, 3844446251LL, 7}, {1, 954437177LL, 5}, {1, 3791419407LL, 7}, + {2, 3235934265LL, 7}, {1, 3739835469LL, 7}, {2, 3134165325LL, 7}, + {1, 3689636335LL, 7}, {1, 458129845LL, 4}, {1, 910191745LL, 5}, + {2, 2938661835LL, 7}, {1, 3593175255LL, 7}, {1, 892460737LL, 5}, + {1, 3546811703LL, 7}, {2, 2753184165LL, 7}, {1, 875407347LL, 5}, + {1, 3479467177LL, 7}, {2, 2620200175LL, 7}, {1, 3435973837LL, 7}, + {1, 3414632385LL, 7}, {1, 3393554407LL, 7}, {1, 3372735055LL, 7}, + {1, 
3352169597LL, 7}, {1, 1665926709LL, 6}, {1, 827945503LL, 5}, + {1, 1645975491LL, 6}, {2, 2249744775LL, 7}, {1, 1626496491LL, 6}, + {1, 3233857729LL, 7}, {2, 2134925265LL, 7}, {1, 799063683LL, 5}, + {2, 2060591247LL, 7}, {1, 789879043LL, 5}, {1, 1570730897LL, 6}, + {1, 3123612579LL, 7}, {2, 1916962805LL, 7}, {1, 3088515809LL, 7}, + {2, 1847555765LL, 7}, {2, 1813430637LL, 7}, {1, 3037324939LL, 7}, + {2, 1746305385LL, 7}, {1, 3004130131LL, 7}, {1, 2987803337LL, 7}, + {2, 1648338801LL, 7}, {1, 2955676419LL, 7}, {1, 2939870663LL, 7}, + {1, 2924233053LL, 7}, {2, 1522554545LL, 7}, {2, 1491936009LL, 7}, + {1, 2878302691LL, 7}, {1, 2863311531LL, 7}, {1, 356059465LL, 4}, + {2, 1372618415LL, 7}, {2, 1343553873LL, 7}, {1, 1402438301LL, 6}, + {2, 1286310003LL, 7}, {1, 2776544515LL, 7}, {1, 1381296015LL, 6}, + {1, 1374389535LL, 6}, {1, 42735993LL, 1}, {2, 1148159575LL, 7}, + {1, 2708156719LL, 7}, {1, 2694881441LL, 7}, {1, 1340867839LL, 6}, + {2, 1042467791LL, 7}, {1, 663956297LL, 5}, {1, 1321528399LL, 6}, + {1, 2630410593LL, 7}, {2, 940802361LL, 7}, {1, 2605477791LL, 7}, + {2, 891408307LL, 7}, {1, 2581013211LL, 7}, {2, 842937507LL, 7}, + {1, 1278501893LL, 6}, {2, 795364315LL, 7}, {2, 771906565LL, 7}, + {2, 748664025LL, 7}, {2, 725633745LL, 7}, {2, 702812831LL, 7}, + {2, 680198441LL, 7}, {2, 657787785LL, 7}, {2, 635578121LL, 7}, + {2, 613566757LL, 7}, {1, 2443359173LL, 7}, {2, 570128403LL, 7}, + {2, 548696263LL, 7}, {2, 527452125LL, 7}, {1, 1200340205LL, 6}, + {2, 485518043LL, 7}, {2, 464823301LL, 7}, {1, 2369637129LL, 7}, + {2, 423966729LL, 7}, {2, 403800345LL, 7}, {2, 383805589LL, 7}, + {1, 582368447LL, 5}, {2, 344322273LL, 7}, {1, 1154949189LL, 6}, + {1, 2300233531LL, 7}, {1, 2290649225LL, 7}, {1, 285143057LL, 4}, + {2, 248469183LL, 7}, {1, 2262369605LL, 7}, {1, 1126548799LL, 6}, + {2, 192835267LL, 7}, {2, 174592167LL, 7}, {2, 156496785LL, 7}, + {2, 138547333LL, 7}, {2, 120742053LL, 7}, {1, 274877907LL, 4}, + {1, 2190262207LL, 7}, {2, 68174085LL, 7}, {1, 2172947881LL, 7}, + {2, 33818641LL, 7}, {1, 2155905153LL, 7}, {0, 0LL, 8}, +}; +static const uniform int64 __idiv_table_s32[][3] = { + {0, 0LL, 1}, {1, 1431655766LL, 0}, {0, 0LL, 2}, + {1, 1717986919LL, 1}, {1, 715827883LL, 0}, {1, 2454267027LL, 2}, + {0, 0LL, 3}, {1, 954437177LL, 1}, {1, 1717986919LL, 2}, + {1, 780903145LL, 1}, {1, 715827883LL, 1}, {1, 1321528399LL, 2}, + {1, 2454267027LL, 3}, {1, 2290649225LL, 3}, {0, 0LL, 4}, + {1, 2021161081LL, 3}, {1, 954437177LL, 2}, {1, 1808407283LL, 3}, + {1, 1717986919LL, 3}, {1, 818089009LL, 2}, {1, 780903145LL, 2}, + {1, 2987803337LL, 4}, {1, 715827883LL, 2}, {1, 1374389535LL, 3}, + {1, 1321528399LL, 3}, {1, 1272582903LL, 3}, {1, 2454267027LL, 4}, + {1, 2369637129LL, 4}, {1, 2290649225LL, 4}, {1, 2216757315LL, 4}, + {0, 0LL, 5}, {1, 1041204193LL, 3}, {1, 2021161081LL, 4}, + {1, 3926827243LL, 5}, {1, 954437177LL, 3}, {1, 3714566311LL, 5}, + {1, 1808407283LL, 4}, {1, 3524075731LL, 5}, {1, 1717986919LL, 4}, + {1, 1676084799LL, 4}, {1, 818089009LL, 3}, {1, 799063683LL, 3}, + {1, 780903145LL, 3}, {1, 3054198967LL, 5}, {1, 2987803337LL, 5}, + {1, 2924233053LL, 5}, {1, 715827883LL, 3}, {1, 1402438301LL, 4}, + {1, 1374389535LL, 4}, {1, 2694881441LL, 5}, {1, 1321528399LL, 4}, + {1, 1296593901LL, 4}, {1, 1272582903LL, 4}, {1, 156180629LL, 1}, + {1, 2454267027LL, 5}, {1, 2411209711LL, 5}, {1, 2369637129LL, 5}, + {1, 582368447LL, 3}, {1, 2290649225LL, 5}, {1, 1126548799LL, 4}, + {1, 2216757315LL, 5}, {1, 2181570691LL, 5}, {0, 0LL, 6}, + {1, 2114445439LL, 5}, {1, 1041204193LL, 4}, {1, 128207979LL, 1}, + {1, 
2021161081LL, 5}, {1, 1991868891LL, 5}, {1, 3926827243LL, 6}, + {1, 3871519817LL, 6}, {1, 954437177LL, 4}, {1, 3765450781LL, 6}, + {1, 3714566311LL, 6}, {1, 458129845LL, 3}, {1, 1808407283LL, 5}, + {1, 892460737LL, 4}, {1, 3524075731LL, 6}, {1, 1739733589LL, 5}, + {1, 1717986919LL, 5}, {1, 424194301LL, 3}, {1, 1676084799LL, 5}, + {1, 827945503LL, 4}, {1, 818089009LL, 4}, {1, 1616928865LL, 5}, + {1, 799063683LL, 4}, {1, 789879043LL, 4}, {1, 780903145LL, 4}, + {1, 3088515809LL, 6}, {1, 3054198967LL, 6}, {1, 3020636341LL, 6}, + {1, 2987803337LL, 6}, {1, 738919105LL, 4}, {1, 2924233053LL, 6}, + {1, 2893451653LL, 6}, {1, 715827883LL, 4}, {1, 354224107LL, 3}, + {1, 1402438301LL, 5}, {1, 2776544515LL, 6}, {1, 1374389535LL, 5}, + {1, 680390859LL, 4}, {1, 2694881441LL, 6}, {1, 333589693LL, 3}, + {1, 1321528399LL, 5}, {1, 2617884829LL, 6}, {1, 1296593901LL, 5}, + {1, 1284476201LL, 5}, {1, 1272582903LL, 5}, {1, 2521815661LL, 6}, + {1, 156180629LL, 2}, {1, 2476377541LL, 6}, {1, 2454267027LL, 6}, + {1, 1216273925LL, 5}, {1, 2411209711LL, 6}, {1, 1195121335LL, 5}, + {1, 2369637129LL, 6}, {1, 2349383821LL, 6}, {1, 582368447LL, 4}, + {1, 1154949189LL, 5}, {1, 2290649225LL, 6}, {1, 70991195LL, 1}, + {1, 1126548799LL, 5}, {1, 558694933LL, 4}, {1, 2216757315LL, 6}, + {1, 274877907LL, 3}, {1, 2181570691LL, 6}, {1, 2164392969LL, 6}, + {0, 0LL, 7}, {1, 266354561LL, 3}, {1, 2114445439LL, 6}, + {1, 1049152317LL, 5}, {1, 1041204193LL, 5}, {1, 4133502361LL, 7}, + {1, 128207979LL, 2}, {1, 4072265289LL, 7}, {1, 2021161081LL, 6}, + {1, 125400505LL, 2}, {1, 1991868891LL, 6}, {1, 1977538899LL, 6}, + {1, 3926827243LL, 7}, {1, 974744351LL, 5}, {1, 3871519817LL, 7}, + {1, 961111563LL, 5}, {1, 954437177LL, 5}, {1, 3791419407LL, 7}, + {1, 3765450781LL, 7}, {1, 1869917735LL, 6}, {1, 3714566311LL, 7}, + {1, 230602271LL, 3}, {1, 458129845LL, 4}, {1, 910191745LL, 5}, + {1, 1808407283LL, 6}, {1, 3593175255LL, 7}, {1, 892460737LL, 5}, + {1, 443351463LL, 4}, {1, 3524075731LL, 7}, {1, 875407347LL, 5}, + {1, 1739733589LL, 6}, {1, 432197967LL, 4}, {1, 1717986919LL, 6}, + {1, 3414632385LL, 7}, {1, 424194301LL, 4}, {1, 210795941LL, 3}, + {1, 1676084799LL, 6}, {1, 1665926709LL, 6}, {1, 827945503LL, 5}, + {1, 1645975491LL, 6}, {1, 818089009LL, 5}, {1, 1626496491LL, 6}, + {1, 1616928865LL, 6}, {1, 3214946281LL, 7}, {1, 799063683LL, 5}, + {1, 397222409LL, 4}, {1, 789879043LL, 5}, {1, 1570730897LL, 6}, + {1, 780903145LL, 5}, {1, 3105965051LL, 7}, {1, 3088515809LL, 7}, + {1, 3071261531LL, 7}, {1, 3054198967LL, 7}, {1, 759331235LL, 5}, + {1, 3020636341LL, 7}, {1, 3004130131LL, 7}, {1, 2987803337LL, 7}, + {1, 2971653049LL, 7}, {1, 738919105LL, 5}, {1, 2939870663LL, 7}, + {1, 2924233053LL, 7}, {1, 2908760921LL, 7}, {1, 2893451653LL, 7}, + {1, 2878302691LL, 7}, {1, 715827883LL, 5}, {1, 356059465LL, 4}, + {1, 354224107LL, 4}, {1, 2819260585LL, 7}, {1, 1402438301LL, 6}, + {1, 1395319325LL, 6}, {1, 2776544515LL, 7}, {1, 1381296015LL, 6}, + {1, 1374389535LL, 6}, {1, 42735993LL, 1}, {1, 680390859LL, 5}, + {1, 2708156719LL, 7}, {1, 2694881441LL, 7}, {1, 1340867839LL, 6}, + {1, 333589693LL, 4}, {1, 663956297LL, 5}, {1, 1321528399LL, 6}, + {1, 2630410593LL, 7}, {1, 2617884829LL, 7}, {1, 81421181LL, 2}, + {1, 1296593901LL, 6}, {1, 2581013211LL, 7}, {1, 1284476201LL, 6}, + {1, 1278501893LL, 6}, {1, 1272582903LL, 6}, {1, 2533436931LL, 7}, + {1, 2521815661LL, 7}, {1, 2510300521LL, 7}, {1, 156180629LL, 3}, + {1, 2487582869LL, 7}, {1, 2476377541LL, 7}, {1, 2465272709LL, 7}, + {1, 2454267027LL, 7}, {1, 2443359173LL, 7}, {1, 1216273925LL, 6}, + {1, 605457945LL, 
5}, {1, 2411209711LL, 7}, {1, 1200340205LL, 6}, + {1, 1195121335LL, 6}, {1, 2379895299LL, 7}, {1, 2369637129LL, 7}, + {1, 2359467013LL, 7}, {1, 2349383821LL, 7}, {1, 2339386443LL, 7}, + {1, 582368447LL, 5}, {1, 2319644785LL, 7}, {1, 1154949189LL, 6}, + {1, 2300233531LL, 7}, {1, 2290649225LL, 7}, {1, 285143057LL, 4}, + {1, 70991195LL, 2}, {1, 2262369605LL, 7}, {1, 1126548799LL, 6}, + {1, 1121950641LL, 6}, {1, 558694933LL, 5}, {1, 2225732041LL, 7}, + {1, 2216757315LL, 7}, {1, 2207854675LL, 7}, {1, 274877907LL, 4}, + {1, 2190262207LL, 7}, {1, 2181570691LL, 7}, {1, 2172947881LL, 7}, + {1, 2164392969LL, 7}, {1, 2155905153LL, 7}, {1, 2147483649LL, 7}, +}; + +__declspec(safe) +static unmasked unsigned int8 __fast_idiv(unsigned int8 numerator, + uniform unsigned int8 divisor) { + uniform int64 method = __idiv_table_u8[divisor-2][0]; + uniform int64 multiplier = __idiv_table_u8[divisor-2][1]; + uniform int64 shift = __idiv_table_u8[divisor-2][2]; + + unsigned int16 mult = multiplier; + unsigned int16 val = numerator; + if (method == 0) + return numerator >> shift; + else if (method == 1) + return (val * mult) >> (8 + shift); + else { + val *= mult; + val >>= 8; + val += (numerator-val)>>1; + return (val >> shift); + } +} + +__declspec(safe) +static unmasked int8 __fast_idiv(int8 numerator, uniform int8 divisor) { + uniform int8 method = __idiv_table_s8[divisor-2][0]; + uniform int16 multiplier = __idiv_table_s8[divisor-2][1]; + uniform int8 shift = __idiv_table_s8[divisor-2][2]; + + if (method == 0) + return numerator >> shift; + else { + unsigned int8 sign = numerator >> 7; + numerator ^= sign; + int16 mul = (int16)numerator * (int16)multiplier; + mul >>= 8 + shift; + return (int8)mul ^ sign; + } +} + +__declspec(safe) +static unmasked unsigned int16 __fast_idiv(unsigned int16 numerator, + uniform unsigned int16 divisor) { + uniform int64 method = __idiv_table_u16[divisor-2][0]; + uniform int64 multiplier = __idiv_table_u16[divisor-2][1]; + uniform int64 shift = __idiv_table_u16[divisor-2][2]; + + unsigned int32 mult = multiplier; + unsigned int32 val = numerator; + if (method == 0) + return numerator >> shift; + else if (method == 1) + return (val * mult) >> (16 + shift); + else { + val *= mult; + val >>= 16; + val += (numerator-val)>>1; + return val >> shift; + } +} + +__declspec(safe) +static unmasked int16 __fast_idiv(int16 numerator, uniform int16 divisor) { + uniform int64 method = __idiv_table_s16[divisor-2][0]; + uniform int64 multiplier = __idiv_table_s16[divisor-2][1]; + uniform int64 shift = __idiv_table_s16[divisor-2][2]; + + if (method == 0) + return numerator >> shift; + else { + unsigned int16 sign = numerator >> 15; + numerator ^= sign; + int32 mul = (int32)numerator * (int32)multiplier; + mul >>= 16 + shift; + int16 result = mul; + return result ^ sign; + } +} + +__declspec(safe) +static unmasked inline unsigned int32 __fast_idiv(unsigned int32 numerator, + uniform unsigned int32 divisor) { + uniform int64 method = __idiv_table_u32[divisor-2][0]; + uniform int64 multiplier = __idiv_table_u32[divisor-2][1]; + uniform int64 shift = __idiv_table_u32[divisor-2][2]; + + unsigned int64 mult = multiplier; + unsigned int64 val = numerator; + if (method == 0) + return numerator >> shift; + else if (method == 1) + return (val * mult) >> (32 + shift); + else { + val *= mult; + val >>= 32; + val += (numerator-val)>>1; + return val >> shift; + } +} + +__declspec(safe) +static unmasked int32 __fast_idiv(int32 numerator, uniform int32 divisor) { + uniform int64 method = 
__idiv_table_s32[divisor-2][0]; + uniform int64 multiplier = __idiv_table_s32[divisor-2][1]; + uniform int64 shift = __idiv_table_s32[divisor-2][2]; + + if (method == 0) + return numerator >> shift; + else { + unsigned int32 sign = numerator >> 31; + numerator ^= sign; + int64 mul = (int64)numerator * (int64)multiplier; + mul >>= 32 + shift; + int32 result = mul; + return result ^ sign; + } +} + diff --git a/tests/idiv.ispc b/tests/idiv.ispc new file mode 100644 index 00000000..b7bd78dc --- /dev/null +++ b/tests/idiv.ispc @@ -0,0 +1,75 @@ + +export uniform int width() { return programCount; } + + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int errorCount = 0; + + for (unsigned int8 num = 0; num < 255; ++num) { + for (uniform unsigned int8 div = 2; div < 255; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + } + } + } + + for (int8 num = 0; num < 127; ++num) { + for (uniform int8 div = 2; div < 127; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + } + } + } + + for (int16 num = 0; num < 32767; ++num) { + for (uniform int16 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + } + } + } + + for (unsigned int16 num = 0; num < 0xffff; ++num) { + for (uniform unsigned int16 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + } + } + } + + // randomly sample int32s... + uniform RNGState state; + seed_rng(&state, 1234); + for (uniform int i = 0; i < 1M; ++i) { + unsigned int32 num = random(&state); + for (uniform unsigned int32 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("ui32 error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + } + } + } + + for (uniform int64 i = 0; i < 1M; ++i) { + int32 num = random(&state); + if (num < 0) + continue; + for (uniform int32 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("si32 error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + } + } + } + + RET[programIndex] = errorCount; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} + From e7abf3f2eacd50b0b8cb194fc87e878bdc25ddec Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 23 Jul 2013 16:38:10 -0700 Subject: [PATCH 003/124] Add support for mask vectors of 8 and 16-bit element types. There were a number of places throughout the system that assumed that the execution mask would only have either 32-bit or 1-bit elements. This commit makes it possible to have a target with an 8- or 16-bit mask. 
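
As a rough sketch of what this enables (assuming an 8-wide target built
with an 8-bit mask): the execution mask becomes an <8 x i8> vector whose
lanes are 0 (off) or -1 (on), and code that needs the mask at a different
element width converts it with a sign extension or truncation, e.g.

    %wide   = sext <8 x i8> %mask8 to <8 x i32>    ; widen an i8 mask
    %narrow = trunc <8 x i32> %mask32 to <8 x i8>  ; narrow an i32 mask

as the convertmask_* helpers added to builtins/util.m4 below do.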
--- Makefile | 29 ++++++--- builtins.cpp | 35 +++++++---- builtins/util.m4 | 161 ++++++++++++++++++++++++++++++++--------------- ctx.cpp | 26 +++----- expr.cpp | 36 +++++------ llvmutil.cpp | 73 +++++++++++++++++---- parse.yy | 20 +++++- stdlib.ispc | 37 ++++++----- 8 files changed, 284 insertions(+), 133 deletions(-) diff --git a/Makefile b/Makefile index 835f8e15..043ab4cf 100644 --- a/Makefile +++ b/Makefile @@ -137,7 +137,7 @@ BISON_SRC=parse.yy FLEX_SRC=lex.ll OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \ - stdlib_generic_ispc.o stdlib_x86_ispc.o \ + stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o \ $(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o)) default: ispc @@ -243,12 +243,23 @@ objs/builtins-c-64.cpp: builtins/builtins.c @echo Creating C++ source from builtins definition file $< @$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c 64 > $@ -objs/stdlib_generic_ispc.cpp: stdlib.ispc - @echo Creating C++ source from $< for generic - @$(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \ - python stdlib2cpp.py generic > $@ +objs/stdlib_mask1_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask1 + @$(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + python stdlib2cpp.py mask1 > $@ + +objs/stdlib_mask8_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask8 + @$(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + python stdlib2cpp.py mask8 > $@ + +objs/stdlib_mask16_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask16 + @$(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + python stdlib2cpp.py mask16 > $@ + +objs/stdlib_mask32_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask32 + @$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + python stdlib2cpp.py mask32 > $@ -objs/stdlib_x86_ispc.cpp: stdlib.ispc - @echo Creating C++ source from $< for x86 - @$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \ - python stdlib2cpp.py x86 > $@ diff --git a/builtins.cpp b/builtins.cpp index 3e03de10..d3bbaa6a 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -112,10 +112,7 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) { return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64; // varying - if (LLVMTypes::MaskType != LLVMTypes::Int32VectorType && - t == LLVMTypes::MaskType) - return AtomicType::VaryingBool; - else if (t == LLVMTypes::Int8VectorType) + if (t == LLVMTypes::Int8VectorType) return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8; else if (t == LLVMTypes::Int16VectorType) return intAsUnsigned ? AtomicType::VaryingUInt16 : AtomicType::VaryingInt16; @@ -127,6 +124,8 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) { return AtomicType::VaryingDouble; else if (t == LLVMTypes::Int64VectorType) return intAsUnsigned ? AtomicType::VaryingUInt64 : AtomicType::VaryingInt64; + else if (t == LLVMTypes::MaskType) + return AtomicType::VaryingBool; // pointers to uniform else if (t == LLVMTypes::Int8PointerType) @@ -1038,16 +1037,30 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // If the user wants the standard library to be included, parse the // serialized version of the stdlib.ispc file to get its // definitions added. 
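+    // Pick the serialized stdlib variant whose mask element width matches
+    // the target; the Makefile preprocesses stdlib.ispc once for each
+    // supported ISPC_MASK_BITS value (1, 8, 16, and 32).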
+ extern char stdlib_mask1_code[], stdlib_mask8_code[]; + extern char stdlib_mask16_code[], stdlib_mask32_code[]; if (g->target->getISA() == Target::GENERIC && - g->target->getVectorWidth() != 1) { // 1 wide uses x86 stdlib - extern char stdlib_generic_code[]; - yy_scan_string(stdlib_generic_code); - yyparse(); + g->target->getVectorWidth() == 1) { // 1 wide uses 32 stdlib + yy_scan_string(stdlib_mask32_code); } else { - extern char stdlib_x86_code[]; - yy_scan_string(stdlib_x86_code); - yyparse(); + switch (g->target->getMaskBitCount()) { + case 1: + yy_scan_string(stdlib_mask1_code); + break; + case 8: + yy_scan_string(stdlib_mask8_code); + break; + case 16: + yy_scan_string(stdlib_mask16_code); + break; + case 32: + yy_scan_string(stdlib_mask32_code); + break; + default: + FATAL("Unhandled mask bit size for stdlib.ispc"); + } } + yyparse(); } } diff --git a/builtins/util.m4 b/builtins/util.m4 index c19d4930..d6f3e5c3 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -690,6 +690,75 @@ shuffles(i64, 8) ;; $4: return type of the LLVM atomic type, in ispc naming paralance (e.g. int32) ;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...) +define(`mask_converts', ` +define internal <$1 x i8> @convertmask_i1_i8_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i1_i16_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i1_i32_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i1_i64_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i8_i8_$1(<$1 x i8>) { + ret <$1 x i8> %0 +} +define internal <$1 x i16> @convertmask_i8_i86_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i8_i32_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i8_i64_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i16_i8_$1(<$1 x i16>) { + %r = trunc <$1 x i16> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i16_i16_$1(<$1 x i16>) { + ret <$1 x i16> %0 +} +define internal <$1 x i32> @convertmask_i16_i32_$1(<$1 x i16>) { + %r = sext <$1 x i16> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i16_i64_$1(<$1 x i16>) { + %r = sext <$1 x i16> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i32_i8_$1(<$1 x i32>) { + %r = trunc <$1 x i32> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i32_i16_$1(<$1 x i32>) { + %r = trunc <$1 x i32> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i32_i32_$1(<$1 x i32>) { + ret <$1 x i32> %0 +} +define internal <$1 x i64> @convertmask_i32_i64_$1(<$1 x i32>) { + %r = sext <$1 x i32> %0 to <$1 x i64> + ret <$1 x i64> %r +} +') + +mask_converts(WIDTH) + define(`global_atomic_associative', ` define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val, @@ -697,17 +766,10 @@ define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val, ; first, for any lanes where the mask is off, compute a vector where those lanes ; hold the identity value.. 
- ; for the bit tricks below, we need the mask to be sign extended to be - ; the size of the element type. - ifelse( - MASK,i1,`%mask = sext <$1 x MASK> %m to <$1 x $3>', - $3,i64, `%mask = sext <$1 x MASK> %m to <$1 x i64>', - $3,i32, ` - ; silly workaround to do %mask = %m, which is not possible directly.. - %maskmem = alloca <$1 x i32> - store <$1 x i32> %m, <$1 x i32> * %maskmem - %mask = load <$1 x i32> * %maskmem' - ) + ; for the bit tricks below, we need the mask to have the + ; the same element size as the element type. + %mask = call <$1 x $3> @convertmask_`'MASK`'_$3_$1(<$1 x MASK> %m) + ; zero out any lanes that are off %valoff = and <$1 x $3> %val, %mask @@ -2440,13 +2502,12 @@ define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline { } define @__sext_varying_bool() nounwind readnone alwaysinline { - ifelse(MASK,i1, ` - %se = sext %0 to - ret %se - ', ` - ret %0') + ifelse(MASK,i32, `ret %0', + `%se = sext %0 to + ret %se') } + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; memcpy/memmove/memset @@ -3201,8 +3262,8 @@ return: ;; $1: llvm type of elements (and suffix for function name) define(`gen_masked_store', ` -define void @__masked_store_$1(* nocapture, , ) nounwind alwaysinline { - per_lane(WIDTH, %2, ` +define void @__masked_store_$1(* nocapture, , ) nounwind alwaysinline { + per_lane(WIDTH, %2, ` %ptr_LANE_ID = getelementptr * %0, i32 0, i32 LANE %storeval_LANE_ID = extractelement %1, i32 LANE store $1 %storeval_LANE_ID, $1 * %ptr_LANE_ID') @@ -3378,10 +3439,10 @@ define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, define(`packed_load_and_store', ` define i32 @__packed_load_active(i32 * %startptr, * %val_ptr, - %full_mask) nounwind alwaysinline { + %full_mask) nounwind alwaysinline { entry: - %mask = call i64 @__movmsk( %full_mask) - %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) + %mask = call i64 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: @@ -3432,10 +3493,10 @@ done: } define i32 @__packed_store_active(i32 * %startptr, %vals, - %full_mask) nounwind alwaysinline { + %full_mask) nounwind alwaysinline { entry: - %mask = call i64 @__movmsk( %full_mask) - %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) + %mask = call i64 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: @@ -3544,10 +3605,10 @@ check_neighbors: %castvr = call <$1 x $4> @__rotate_i$6(<$1 x $4> %castvec, i32 1) %vr = bitcast <$1 x $4> %castvr to <$1 x $2> %eq = $5 $7 <$1 x $2> %vec, %vr - ifelse(MASK,i32, ` - %eq32 = sext <$1 x i1> %eq to <$1 x i32> - %eqmm = call i64 @__movmsk(<$1 x i32> %eq32)', ` - %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)') + ifelse(MASK,i1, ` + %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)', + `%eqm = sext <$1 x i1> %eq to <$1 x MASK> + %eqmm = call i64 @__movmsk(<$1 x MASK> %eqm)') %alleq = icmp eq i64 %eqmm, ALL_ON_MASK br i1 %alleq, label %all_equal, label %not_all_equal ', ` @@ -3722,9 +3783,9 @@ pl_done: define(`gen_gather_general', ` ; fully general 32-bit gather, takes array of pointers encoded as vector of i32s define @__gather32_$1( %ptrs, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %ret_ptr = alloca - per_lane(WIDTH, %vecmask, ` + per_lane(WIDTH, %vecmask, ` %iptr_LANE_ID = extractelement %ptrs, i32 
LANE %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * %val_LANE_ID = load $1 * %ptr_LANE_ID @@ -3738,9 +3799,9 @@ define @__gather32_$1( %ptrs, ; fully general 64-bit gather, takes array of pointers encoded as vector of i32s define @__gather64_$1( %ptrs, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %ret_ptr = alloca - per_lane(WIDTH, %vecmask, ` + per_lane(WIDTH, %vecmask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * %val_LANE_ID = load $1 * %ptr_LANE_ID @@ -3804,7 +3865,7 @@ define @__gather_elt64_$1(i8 * %ptr, %offsets, i32 %o define @__gather_factored_base_offsets32_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing ; to require that the 0th element of the array being gathered from is always ; legal to read from (and we do indeed require that, given the benefits!) @@ -3813,13 +3874,13 @@ define @__gather_factored_base_offsets32_$1(i8 * %ptr, store zeroinitializer, * %offsetsPtr call void @__masked_store_blend_i32( * %offsetsPtr, %offsets, - %vecmask) + %vecmask) %newOffsets = load * %offsetsPtr %deltaPtr = alloca store zeroinitializer, * %deltaPtr call void @__masked_store_blend_i32( * %deltaPtr, %offset_delta, - %vecmask) + %vecmask) %newDelta = load * %deltaPtr %ret0 = call @__gather_elt32_$1(i8 * %ptr, %newOffsets, @@ -3835,7 +3896,7 @@ define @__gather_factored_base_offsets32_$1(i8 * %ptr, @__gather_factored_base_offsets64_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing ; to require that the 0th element of the array being gathered from is always ; legal to read from (and we do indeed require that, given the benefits!) 
@@ -3844,13 +3905,13 @@ define @__gather_factored_base_offsets64_$1(i8 * %ptr, store zeroinitializer, * %offsetsPtr call void @__masked_store_blend_i64( * %offsetsPtr, %offsets, - %vecmask) + %vecmask) %newOffsets = load * %offsetsPtr %deltaPtr = alloca store zeroinitializer, * %deltaPtr call void @__masked_store_blend_i64( * %deltaPtr, %offset_delta, - %vecmask) + %vecmask) %newDelta = load * %deltaPtr %ret0 = call @__gather_elt64_$1(i8 * %ptr, %newOffsets, @@ -3876,27 +3937,27 @@ gen_gather_factored($1) define @__gather_base_offsets32_$1(i8 * %ptr, i32 %offset_scale, %offsets, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %scale_vec = bitcast i32 %offset_scale to <1 x i32> %smear_scale = shufflevector <1 x i32> %scale_vec, <1 x i32> undef, < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > %scaled_offsets = mul %smear_scale, %offsets %v = call @__gather_factored_base_offsets32_$1(i8 * %ptr, %scaled_offsets, i32 1, - zeroinitializer, %vecmask) + zeroinitializer, %vecmask) ret %v } define @__gather_base_offsets64_$1(i8 * %ptr, i32 %offset_scale, %offsets, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %scale64 = zext i32 %offset_scale to i64 %scale_vec = bitcast i64 %scale64 to <1 x i64> %smear_scale = shufflevector <1 x i64> %scale_vec, <1 x i64> undef, < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > %scaled_offsets = mul %smear_scale, %offsets %v = call @__gather_factored_base_offsets64_$1(i8 * %ptr, %scaled_offsets, - i32 1, zeroinitializer, %vecmask) + i32 1, zeroinitializer, %vecmask) ret %v } @@ -3955,9 +4016,9 @@ define void @__scatter_elt64_$1(i8 * %ptr, %offsets, i32 %offset_s define void @__scatter_factored_base_offsets32_$1(i8* %base, %offsets, i32 %offset_scale, %offset_delta, %values, - %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... - per_lane(WIDTH, %mask, ` + per_lane(WIDTH, %mask, ` call void @__scatter_elt32_$1(i8 * %base, %offsets, i32 %offset_scale, %offset_delta, %values, i32 LANE)') ret void @@ -3965,9 +4026,9 @@ define void @__scatter_factored_base_offsets32_$1(i8* %base, %offs define void @__scatter_factored_base_offsets64_$1(i8* %base, %offsets, i32 %offset_scale, %offset_delta, %values, - %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... 
- per_lane(WIDTH, %mask, ` + per_lane(WIDTH, %mask, ` call void @__scatter_elt64_$1(i8 * %base, %offsets, i32 %offset_scale, %offset_delta, %values, i32 LANE)') ret void @@ -3975,8 +4036,8 @@ define void @__scatter_factored_base_offsets64_$1(i8* %base, %offs ; fully general 32-bit scatter, takes array of pointers encoded as vector of i32s define void @__scatter32_$1( %ptrs, %values, - %mask) nounwind alwaysinline { - per_lane(WIDTH, %mask, ` + %mask) nounwind alwaysinline { + per_lane(WIDTH, %mask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * %val_LANE_ID = extractelement %values, i32 LANE @@ -3987,8 +4048,8 @@ define void @__scatter32_$1( %ptrs, %values, ; fully general 64-bit scatter, takes array of pointers encoded as vector of i64s define void @__scatter64_$1( %ptrs, %values, - %mask) nounwind alwaysinline { - per_lane(WIDTH, %mask, ` + %mask) nounwind alwaysinline { + per_lane(WIDTH, %mask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * %val_LANE_ID = extractelement %values, i32 LANE diff --git a/ctx.cpp b/ctx.cpp index 1e79c97b..c50d22f9 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -1456,13 +1456,13 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) { for (unsigned int i = 0; i < at->getNumElements(); ++i) { llvm::Value *elt = ExtractInst(b, i); llvm::Value *sext = SExtInst(elt, LLVMTypes::BoolVectorType, - LLVMGetName(elt, "_to_boolvec32")); + LLVMGetName(elt, "_to_boolvec")); ret = InsertInst(ret, sext, i); } return ret; } else - return SExtInst(b, LLVMTypes::BoolVectorType, LLVMGetName(b, "_to_i32")); + return SExtInst(b, LLVMTypes::BoolVectorType, LLVMGetName(b, "_to_boolvec")); } @@ -2781,6 +2781,7 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, // Figure out if we need a 8, 16, 32 or 64-bit masked store. 
llvm::Function *maskedStoreFunc = NULL; + llvm::Type *llvmValueType = value->getType(); const PointerType *pt = CastType(valueType); if (pt != NULL) { @@ -2809,8 +2810,7 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, else maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64"); } - else if (Type::Equal(valueType, AtomicType::VaryingBool) && - g->target->getMaskBitCount() == 1) { + else if (llvmValueType == LLVMTypes::Int1VectorType) { llvm::Value *notMask = BinaryOperator(llvm::Instruction::Xor, mask, LLVMMaskAllOn, "~mask"); llvm::Value *old = LoadInst(ptr); @@ -2823,28 +2823,22 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, StoreInst(final, ptr); return; } - else if (Type::Equal(valueType, AtomicType::VaryingDouble)) { + else if (llvmValueType == LLVMTypes::DoubleVectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_double"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt64) || - Type::Equal(valueType, AtomicType::VaryingUInt64)) { + else if (llvmValueType == LLVMTypes::Int64VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64"); } - else if (Type::Equal(valueType, AtomicType::VaryingFloat)) { + else if (llvmValueType == LLVMTypes::FloatVectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_float"); } - else if (Type::Equal(valueType, AtomicType::VaryingBool) || - Type::Equal(valueType, AtomicType::VaryingInt32) || - Type::Equal(valueType, AtomicType::VaryingUInt32) || - CastType(valueType) != NULL) { + else if (llvmValueType == LLVMTypes::Int32VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i32"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt16) || - Type::Equal(valueType, AtomicType::VaryingUInt16)) { + else if (llvmValueType == LLVMTypes::Int16VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i16"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt8) || - Type::Equal(valueType, AtomicType::VaryingUInt8)) { + else if (llvmValueType == LLVMTypes::Int8VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i8"); } AssertPos(currentPos, maskedStoreFunc != NULL); diff --git a/expr.cpp b/expr.cpp index 3baaabaf..6bde2acb 100644 --- a/expr.cpp +++ b/expr.cpp @@ -6161,9 +6161,9 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) - // If we have a bool vector of i32 elements, first truncate - // down to a single bit + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) + // If we have a bool vector of non-i1 elements, first + // truncate down to a single bit. 
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); // And then do an unisgned int->float cast cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int @@ -6205,8 +6205,8 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) - // truncate i32 bool vector values to i1s + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) + // truncate bool vector values to i1s exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int to double exprVal, targetType, cOpName); @@ -6243,7 +6243,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6279,7 +6279,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6321,7 +6321,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6361,7 +6361,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6407,7 +6407,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6447,7 +6447,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6493,7 +6493,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != 
LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6531,7 +6531,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6625,12 +6625,12 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, if (fromType->IsUniformType()) { if (toType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) { - // extend out to i32 bool values from i1 here. then we'll - // turn into a vector below, the way it does for everyone - // else... + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) { + // extend out to an bool as an i8/i16/i32 from the i1 here. + // Then we'll turn that into a vector below, the way it + // does for everyone else... cast = ctx->SExtInst(cast, LLVMTypes::BoolVectorType->getElementType(), - LLVMGetName(cast, "to_i32bool")); + LLVMGetName(cast, "to_i_bool")); } } else diff --git a/llvmutil.cpp b/llvmutil.cpp index 26c18bf5..180c8676 100644 --- a/llvmutil.cpp +++ b/llvmutil.cpp @@ -115,13 +115,25 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0); LLVMTypes::DoublePointerType = llvm::PointerType::get(LLVMTypes::DoubleType, 0); - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: LLVMTypes::MaskType = LLVMTypes::BoolVectorType = llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.getVectorWidth()); - else { - Assert(target.getMaskBitCount() == 32); + break; + case 8: + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt8Ty(*ctx), target.getVectorWidth()); + break; + case 16: + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt16Ty(*ctx), target.getVectorWidth()); + break; + case 32: LLVMTypes::MaskType = LLVMTypes::BoolVectorType = llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.getVectorWidth()); + break; + default: + FATAL("Unhandled mask width for initializing MaskType"); } LLVMTypes::Int1VectorType = @@ -154,12 +166,26 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { std::vector maskOnes; llvm::Constant *onMask = NULL; - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: onMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 1, false /*unsigned*/); // 0x1 - else + break; + case 8: + onMask = llvm::ConstantInt::get(llvm::Type::getInt8Ty(*ctx), -1, + true /*signed*/); // 0xff + break; + case 16: + onMask = llvm::ConstantInt::get(llvm::Type::getInt16Ty(*ctx), -1, + true /*signed*/); // 0xffff + break; + case 32: onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1, true /*signed*/); // 0xffffffff + break; + default: + FATAL("Unhandled mask width for onMask"); + } for (int i = 0; i < target.getVectorWidth(); ++i) maskOnes.push_back(onMask); @@ -167,13 +193,26 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { std::vector maskZeros; llvm::Constant *offMask = NULL; - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: offMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 0, 
true /*signed*/); - else + break; + case 8: + offMask = llvm::ConstantInt::get(llvm::Type::getInt8Ty(*ctx), 0, + true /*signed*/); + break; + case 16: + offMask = llvm::ConstantInt::get(llvm::Type::getInt16Ty(*ctx), 0, + true /*signed*/); + break; + case 32: offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0, true /*signed*/); - + break; + default: + FATAL("Unhandled mask width for offMask"); + } for (int i = 0; i < target.getVectorWidth(); ++i) maskZeros.push_back(offMask); LLVMMaskAllOff = llvm::ConstantVector::get(maskZeros); @@ -444,9 +483,14 @@ LLVMBoolVector(bool b) { if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) v = llvm::ConstantInt::get(LLVMTypes::Int32Type, b ? 0xffffffff : 0, false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int16Type, b ? 0xffff : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int8VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int8Type, b ? 0xff : 0, + false /*unsigned*/); else { - Assert(LLVMTypes::BoolVectorType->getElementType() == - llvm::Type::getInt1Ty(*g->ctx)); + Assert(LLVMTypes::BoolVectorType == LLVMTypes::Int1VectorType); v = b ? LLVMTrue : LLVMFalse; } @@ -465,9 +509,14 @@ LLVMBoolVector(const bool *bvec) { if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) v = llvm::ConstantInt::get(LLVMTypes::Int32Type, bvec[i] ? 0xffffffff : 0, false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int16Type, bvec[i] ? 0xffff : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int8VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int8Type, bvec[i] ? 0xff : 0, + false /*unsigned*/); else { - Assert(LLVMTypes::BoolVectorType->getElementType() == - llvm::Type::getInt1Ty(*g->ctx)); + Assert(LLVMTypes::BoolVectorType == LLVMTypes::Int1VectorType); v = bvec[i] ? LLVMTrue : LLVMFalse; } diff --git a/parse.yy b/parse.yy index 3ad815cf..488c864a 100644 --- a/parse.yy +++ b/parse.yy @@ -2148,8 +2148,24 @@ lAddFunctionParams(Declarator *decl) { /** Add a symbol for the built-in mask variable to the symbol table */ static void lAddMaskToSymbolTable(SourcePos pos) { - const Type *t = g->target->getMaskBitCount() == 1 ? 
- AtomicType::VaryingBool : AtomicType::VaryingUInt32; + const Type *t; + switch (g->target->getMaskBitCount()) { + case 1: + t = AtomicType::VaryingBool; + break; + case 8: + t = AtomicType::VaryingUInt8; + break; + case 16: + t = AtomicType::VaryingUInt16; + break; + case 32: + t = AtomicType::VaryingUInt32; + break; + default: + FATAL("Unhandled mask bitsize in lAddMaskToSymbolTable"); + } + t = t->GetAsConstType(); Symbol *maskSymbol = new Symbol("__mask", pos, t); m->symbolTable->AddVariable(maskSymbol); diff --git a/stdlib.ispc b/stdlib.ispc index b8ed2057..8ad5aa49 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -38,12 +38,20 @@ ispc code */ -#ifdef ISPC_TARGET_GENERIC -#define IntMaskType bool -#define UIntMaskType bool +#if (ISPC_MASK_BITS == 1) + #define IntMaskType bool + #define UIntMaskType bool +#elif (ISPC_MASK_BITS == 8) + #define IntMaskType int8 + #define UIntMaskType unsigned int8 +#elif (ISPC_MASK_BITS == 16) + #define IntMaskType int16 + #define UIntMaskType unsigned int16 +#elif (ISPC_MASK_BITS == 32) + #define IntMaskType int32 + #define UIntMaskType unsigned int32 #else -#define IntMaskType int32 -#define UIntMaskType unsigned int32 + #error Unknown value of ISPC_MASK_BITS #endif /////////////////////////////////////////////////////////////////////////// @@ -335,14 +343,15 @@ static inline int32 sign_extend(bool v) { return __sext_varying_bool(v); } + __declspec(safe) static inline uniform bool any(bool v) { // We only care about whether "any" is true for the active program instances, // so we have to make v with the current program mask. -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __any(v & __mask); #else - return __any(__sext_varying_bool(v) & __mask); + return __any((UIntMaskType)__sext_varying_bool(v) & __mask); #endif } @@ -350,11 +359,10 @@ __declspec(safe) static inline uniform bool all(bool v) { // As with any(), we need to explicitly mask v with the current program mask // so we're only looking at the current lanes - -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __all(v | !__mask); #else - return __all(__sext_varying_bool(v) | !__mask); + return __all((UIntMaskType)__sext_varying_bool(v) | !__mask); #endif } @@ -362,11 +370,10 @@ __declspec(safe) static inline uniform bool none(bool v) { // As with any(), we need to explicitly mask v with the current program mask // so we're only looking at the current lanes - -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __none(v & __mask); #else - return __none(__sext_varying_bool(v) & __mask); + return __none((UIntMaskType)__sext_varying_bool(v) & __mask); #endif } @@ -399,10 +406,10 @@ static inline int popcnt(int64 v) { __declspec(safe) static inline uniform int popcnt(bool v) { // As with any() and all(), only count across the active lanes -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __popcnt_int64(__movmsk(v & __mask)); #else - return __popcnt_int64(__movmsk(__sext_varying_bool(v) & __mask)); + return __popcnt_int64(__movmsk((UIntMaskType)__sext_varying_bool(v) & __mask)); #endif } From 9ba49eabb21c7971f529fda25bad5fc1e84a6e3e Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 23 Jul 2013 16:52:43 -0700 Subject: [PATCH 004/124] Reduce estimated costs for 8 and 16-bit min() and max() in stdlib. These actually compile to a single instruction. 
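For context, the cost1/cost2 declspec numbers are annotations that feed ispc's internal cost estimates for stdlib routines. A minimal ispc sketch (not part of this patch; the helper name is made up) of the kind of 8-bit code whose estimate this change lowers:

    // Illustration only: each 8-bit min()/max() is expected to map to a
    // single SIMD min/max instruction on SSE4-class targets, which is what
    // the cost1 annotation reflects.
    static inline int8 clamp8(int8 v, uniform int8 lo, uniform int8 hi) {
        return min(max(v, lo), hi);
    }
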
--- stdlib.ispc | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/stdlib.ispc b/stdlib.ispc index 8ad5aa49..9a2b191f 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1332,88 +1332,88 @@ static inline uniform double max(uniform double a, uniform double b) { // int8 -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int8 min(uniform unsigned int8 a, uniform unsigned int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int8 max(uniform unsigned int8 a, uniform unsigned int8 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int8 min(uniform int8 a, uniform int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int8 max(uniform int8 a, uniform int8 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int8 min(unsigned int8 a, unsigned int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int8 max(unsigned int8 a, unsigned int8 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int8 min(int8 a, int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int8 max(int8 a, int8 b) { return (a > b) ? a : b; } // int16 -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int16 min(uniform unsigned int16 a, uniform unsigned int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int16 max(uniform unsigned int16 a, uniform unsigned int16 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int16 min(uniform int16 a, uniform int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int16 max(uniform int16 a, uniform int16 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int16 min(unsigned int16 a, unsigned int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int16 max(unsigned int16 a, unsigned int16 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int16 min(int16 a, int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int16 max(int16 a, int16 b) { return (a > b) ? a : b; } From f7f281a256c38c1986860baec81736fcb4f5b6d1 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 23 Jul 2013 17:01:03 -0700 Subject: [PATCH 005/124] Choose type for integer literals to match the target mask size (if possible). On a target with a 16-bit mask (for example), we would choose the type of an integer literal "1024" to be an int16. Previously, we used an int32, which is a worse fit and leads to less efficient code than an int16 on a 16-bit mask target. (However, we'd still give an integer literal 1000000 the type int32, even in a 16-bit target.) Updated the tests to still pass with 8 and 16-bit targets, given this change. 
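As an illustration (not from the patch; the kernel below is hypothetical), on a target whose mask elements are 16 bits wide, the unsuffixed literals here are now typed int16, so arithmetic on varying int16 values stays in 16-bit lanes instead of widening to int32:

    export void threshold(uniform int16 vals[], uniform int count) {
        foreach (i = 0 ... count) {
            int16 v = vals[i];
            if (v < 1024)          // 1024 fits in 16 bits, so it is an int16 constant here
                vals[i] = v + 100; // 100 is typed int16 as well; no widening to int32
            // A literal like 1000000 would still be typed int32, as before.
        }
    }
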
--- lex.ll | 27 +++++++- parse.yy | 23 ++++++- run_tests.py | 4 +- stdlib.ispc | 74 ++++++++++----------- tests/aossoa-1.ispc | 4 +- tests/aossoa-2.ispc | 4 +- tests/aossoa-5.ispc | 4 +- tests/aossoa-6.ispc | 4 +- tests/atomics-12.ispc | 4 +- tests/atomics-13.ispc | 2 +- tests/atomics-4.ispc | 4 +- tests/coalesce-1.ispc | 4 +- tests/coalesce-2.ispc | 4 +- tests/coalesce-3.ispc | 4 +- tests/coalesce-4.ispc | 4 +- tests/coalesce-5.ispc | 4 +- tests/coalesce-6.ispc | 4 +- tests/coalesce-7.ispc | 4 +- tests/coalesce-8.ispc | 4 +- tests/count-leading-trailing-zeros-1.ispc | 2 +- tests/count-leading-trailing-zeros-4.ispc | 2 +- tests/exclusive-scan-and-2.ispc | 4 +- tests/exclusive-scan-or-1.ispc | 4 +- tests/frexp-double-1.ispc | 2 +- tests/frexp-double.ispc | 2 +- tests/frexp-float-1.ispc | 2 +- tests/frexp-float.ispc | 2 +- tests/kilo-mega-giga-2.ispc | 2 +- tests/ldexp-double.ispc | 4 +- tests/ldexp-float.ispc | 4 +- tests/local-atomics-12.ispc | 4 +- tests/local-atomics-13.ispc | 2 +- tests/local-atomics-14.ispc | 4 +- tests/local-atomics-4.ispc | 4 +- tests/rand-distrib-1.ispc | 2 +- tests/sizeof-9.ispc | 2 +- tests/test-83.ispc | 2 +- tests/test-84.ispc | 2 +- tests/test-85.ispc | 2 +- tests_errors/array-plus-equals.ispc | 2 +- tests_errors/array-pointer-assign.ispc | 2 +- tests_errors/float-logical.ispc | 2 +- tests_errors/fptr-typecheck-2.ispc | 2 +- tests_errors/fptr-typecheck-3.ispc | 2 +- tests_errors/initexpr-2.ispc | 2 +- tests_errors/int-ptr-fail.ispc | 4 +- tests_errors/lvalue-2.ispc | 2 +- tests_errors/lvalue-3.ispc | 2 +- tests_errors/new-delete-3.ispc | 2 +- tests_errors/new-delete-6.ispc | 2 +- tests_errors/ptr-1.ispc | 2 +- tests_errors/ptr-const-1.ispc | 2 +- tests_errors/ptrcast-lose-info.ispc | 2 +- tests_errors/ref-3.ispc | 2 +- tests_errors/soa-11.ispc | 2 +- tests_errors/soa-12.ispc | 2 +- tests_errors/soa-3.ispc | 2 +- tests_errors/soa-4.ispc | 2 +- tests_errors/soa-9.ispc | 2 +- tests_errors/struct_arith.ispc | 2 +- tests_errors/vec-size-compile-constant.ispc | 2 +- 61 files changed, 166 insertions(+), 120 deletions(-) diff --git a/lex.ll b/lex.ll index f6633fce..8baa627a 100644 --- a/lex.ll +++ b/lex.ll @@ -77,6 +77,8 @@ static int allTokens[] = { TOKEN_UNSIGNED, TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE, TOKEN_STRING_C_LITERAL, TOKEN_DOTDOTDOT, TOKEN_FLOAT_CONSTANT, + TOKEN_INT8_CONSTANT, TOKEN_UINT8_CONSTANT, + TOKEN_INT16_CONSTANT, TOKEN_UINT16_CONSTANT, TOKEN_INT32_CONSTANT, TOKEN_UINT32_CONSTANT, TOKEN_INT64_CONSTANT, TOKEN_UINT64_CONSTANT, TOKEN_INC_OP, TOKEN_DEC_OP, TOKEN_LEFT_OP, TOKEN_RIGHT_OP, TOKEN_LE_OP, @@ -150,6 +152,10 @@ void ParserInit() { tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\""; tokenToName[TOKEN_DOTDOTDOT] = "..."; tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT"; + tokenToName[TOKEN_INT8_CONSTANT] = "TOKEN_INT8_CONSTANT"; + tokenToName[TOKEN_UINT8_CONSTANT] = "TOKEN_UINT8_CONSTANT"; + tokenToName[TOKEN_INT16_CONSTANT] = "TOKEN_INT16_CONSTANT"; + tokenToName[TOKEN_UINT16_CONSTANT] = "TOKEN_UINT16_CONSTANT"; tokenToName[TOKEN_INT32_CONSTANT] = "TOKEN_INT32_CONSTANT"; tokenToName[TOKEN_UINT32_CONSTANT] = "TOKEN_UINT32_CONSTANT"; tokenToName[TOKEN_INT64_CONSTANT] = "TOKEN_INT64_CONSTANT"; @@ -260,6 +266,10 @@ void ParserInit() { tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\""; tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'"; tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant"; + tokenNameRemap["TOKEN_INT8_CONSTANT"] = "int8 constant"; + tokenNameRemap["TOKEN_UINT8_CONSTANT"] = "unsigned int8 constant"; + 
tokenNameRemap["TOKEN_INT16_CONSTANT"] = "int16 constant"; + tokenNameRemap["TOKEN_UINT16_CONSTANT"] = "unsigned int16 constant"; tokenNameRemap["TOKEN_INT32_CONSTANT"] = "int32 constant"; tokenNameRemap["TOKEN_UINT32_CONSTANT"] = "unsigned int32 constant"; tokenNameRemap["TOKEN_INT64_CONSTANT"] = "int64 constant"; @@ -599,7 +609,22 @@ lParseInteger(bool dotdotdot) { } else { // No u or l suffix - // First, see if we can fit this into a 32-bit integer... + // If we're compiling to an 8-bit mask target and the constant + // fits into 8 bits, return an 8-bit int. + if (g->target->getMaskBitCount() == 8) { + if (yylval.intVal <= 0x7fULL) + return TOKEN_INT8_CONSTANT; + else if (yylval.intVal <= 0xffULL) + return TOKEN_UINT8_CONSTANT; + } + // And similarly for 16-bit masks and constants + if (g->target->getMaskBitCount() == 16) { + if (yylval.intVal <= 0x7fffULL) + return TOKEN_INT16_CONSTANT; + else if (yylval.intVal <= 0xffffULL) + return TOKEN_UINT16_CONSTANT; + } + // Otherwise, see if we can fit this into a 32-bit integer... if (yylval.intVal <= 0x7fffffffULL) return TOKEN_INT32_CONSTANT; else if (yylval.intVal <= 0xffffffffULL) diff --git a/parse.yy b/parse.yy index 488c864a..6ed2a43d 100644 --- a/parse.yy +++ b/parse.yy @@ -179,6 +179,8 @@ struct ForeachDimension { } +%token TOKEN_INT8_CONSTANT TOKEN_UINT8_CONSTANT +%token TOKEN_INT16_CONSTANT TOKEN_UINT16_CONSTANT %token TOKEN_INT32_CONSTANT TOKEN_UINT32_CONSTANT %token TOKEN_INT64_CONSTANT TOKEN_UINT64_CONSTANT %token TOKEN_INT32DOTDOTDOT_CONSTANT TOKEN_UINT32DOTDOTDOT_CONSTANT @@ -291,6 +293,22 @@ primary_expression Error(@1, "Undeclared symbol \"%s\".%s", name, alts.c_str()); } } + | TOKEN_INT8_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformInt8->GetAsConstType(), + (int8_t)yylval.intVal, @1); + } + | TOKEN_UINT8_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformUInt8->GetAsConstType(), + (uint8_t)yylval.intVal, @1); + } + | TOKEN_INT16_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformInt16->GetAsConstType(), + (int16_t)yylval.intVal, @1); + } + | TOKEN_UINT16_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformUInt16->GetAsConstType(), + (uint16_t)yylval.intVal, @1); + } | TOKEN_INT32_CONSTANT { $$ = new ConstExpr(AtomicType::UniformInt32->GetAsConstType(), (int32_t)yylval.intVal, @1); @@ -1233,7 +1251,10 @@ declarator ; int_constant - : TOKEN_INT32_CONSTANT { $$ = yylval.intVal; } + : TOKEN_INT8_CONSTANT { $$ = yylval.intVal; } + | TOKEN_INT16_CONSTANT { $$ = yylval.intVal; } + | TOKEN_INT32_CONSTANT { $$ = yylval.intVal; } + | TOKEN_INT64_CONSTANT { $$ = yylval.intVal; } ; direct_declarator diff --git a/run_tests.py b/run_tests.py index 7c6b1eb8..296db867 100755 --- a/run_tests.py +++ b/run_tests.py @@ -37,7 +37,7 @@ parser.add_option("-g", "--generics-include", dest="include_file", help="Filenam parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", default="") parser.add_option('-t', '--target', dest='target', - help='Set compilation target (neon, sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', + help='Set compilation target (neon, sse2, sse2-x2, sse4, sse4-x2, sse4-8, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', default="sse4") parser.add_option('-a', '--arch', dest='arch', help='Set architecture (arm, x86, x86-64)', @@ -294,7 +294,7 @@ def run_test(testname): firstline = firstline.rstrip() file.close() - if (output.find(firstline) == -1): + if re.search(firstline, output) == None: sys.stderr.write("Didn't see 
expected error message %s from test %s.\nActual output:\n%s\n" % \ (firstline, testname, output)) return (1, 0) diff --git a/stdlib.ispc b/stdlib.ispc index 9a2b191f..7e848481 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -3126,7 +3126,7 @@ static inline void __range_reduce_log(float input, varying float * uniform reduc static const int nonexponent_mask = 0x807FFFFF; // We want the reduced version to have an exponent of -1 which is -1 + 127 after biasing or 126 - static const int exponent_neg1 = (126 << 23); + static const int exponent_neg1 = (126l << 23); // NOTE(boulos): We don't need to mask anything out since we know // the sign bit has to be 0. If it's 1, we need to return infinity/nan // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN). @@ -3149,7 +3149,7 @@ static inline void __range_reduce_log(uniform float input, uniform float * unifo uniform int int_version = intbits(input); static const uniform int nonexponent_mask = 0x807FFFFF; - static const uniform int exponent_neg1 = (126 << 23); + static const uniform int exponent_neg1 = (126ul << 23); uniform int biased_exponent = int_version >> 23; uniform int offset_exponent = biased_exponent + 1; *exponent = offset_exponent - 127; // get the real value @@ -3647,18 +3647,18 @@ static inline uniform float half_to_float(uniform unsigned int16 h) { else { // https://gist.github.com/2144712 // Fabian "ryg" Giesen. - static const uniform unsigned int32 shifted_exp = 0x7c00 << 13; // exponent mask after shift + static const uniform unsigned int32 shifted_exp = 0x7c00ul << 13; // exponent mask after shift uniform int32 o = ((int32)(h & 0x7fff)) << 13; // exponent/mantissa bits uniform unsigned int32 exp = shifted_exp & o; // just the exponent - o += (127 - 15) << 23; // exponent adjust + o += (uniform int32)(127 - 15) << 23; // exponent adjust // handle exponent special cases if (exp == shifted_exp) // Inf/NaN? - o += (128 - 16) << 23; // extra exp adjust + o += (uniform unsigned int32)(128 - 16) << 23; // extra exp adjust else if (exp == 0) { // Zero/Denormal? - o += 1 << 23; // extra exp adjust - o = intbits(floatbits(o) - floatbits(113 << 23)); // renormalize + o += 1ul << 23; // extra exp adjust + o = intbits(floatbits(o) - floatbits(113ul << 23)); // renormalize } o |= ((int32)(h & 0x8000)) << 16; // sign bit @@ -3675,17 +3675,17 @@ static inline float half_to_float(unsigned int16 h) { // https://gist.github.com/2144712 // Fabian "ryg" Giesen. - const unsigned int32 shifted_exp = 0x7c00 << 13; // exponent mask after shift + const unsigned int32 shifted_exp = 0x7c00ul << 13; // exponent mask after shift - int32 o = ((int32)(h & 0x7fff)) << 13; // exponent/mantissa bits + int32 o = ((int32)(h & 0x7ffful)) << 13; // exponent/mantissa bits unsigned int32 exp = shifted_exp & o; // just the exponent - o += (127 - 15) << 23; // exponent adjust + o += (int32)(127 - 15) << 23; // exponent adjust - int32 infnan_val = o + ((128 - 16) << 23); - int32 zerodenorm_val = intbits(floatbits(o + (1<<23)) - floatbits(113 << 23)); + int32 infnan_val = o + ((int32)(128 - 16) << 23); + int32 zerodenorm_val = intbits(floatbits(o + (1ul<<23)) - floatbits(113ul << 23)); int32 reg_val = (exp == 0) ? zerodenorm_val : o; - int32 sign_bit = ((int32)(h & 0x8000)) << 16; + int32 sign_bit = ((int32)(h & 0x8000ul)) << 16; return floatbits(((exp == shifted_exp) ? 
infnan_val : reg_val) | sign_bit); } } @@ -3715,16 +3715,16 @@ static inline uniform int16 float_to_half(uniform float f) { // NaN->qNaN and Inf->Inf // unconditional assignment here, will override with right value for // the regular case below. - uniform int32 f32infty = 255 << 23; - o = (fint > f32infty) ? 0x7e00 : 0x7c00; + uniform int32 f32infty = 255ul << 23; + o = (fint > f32infty) ? 0x7e00u : 0x7c00u; // (De)normalized number or zero // update fint unconditionally to save the blending; we don't need it // anymore for the Inf/NaN case anyway. - const uniform unsigned int32 round_mask = ~0xfffu; - const uniform int32 magic = 15 << 23; - const uniform int32 f16infty = 31 << 23; + const uniform unsigned int32 round_mask = ~0xffful; + const uniform int32 magic = 15ul << 23; + const uniform int32 f16infty = 31ul << 23; uniform int32 fint2 = intbits(floatbits(fint & round_mask) * floatbits(magic)) - round_mask; fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed @@ -3761,16 +3761,16 @@ static inline int16 float_to_half(float f) { // NaN->qNaN and Inf->Inf // unconditional assignment here, will override with right value for // the regular case below. - int32 f32infty = 255 << 23; - o = (fint > f32infty) ? 0x7e00 : 0x7c00; + int32 f32infty = 255ul << 23; + o = (fint > f32infty) ? 0x7e00u : 0x7c00u; // (De)normalized number or zero // update fint unconditionally to save the blending; we don't need it // anymore for the Inf/NaN case anyway. - const unsigned int32 round_mask = ~0xfffu; - const int32 magic = 15 << 23; - const int32 f16infty = 31 << 23; + const unsigned int32 round_mask = ~0xffful; + const int32 magic = 15ul << 23; + const int32 f16infty = 31ul << 23; // Shift exponent down, denormalize if necessary. // NOTE This represents half-float denormals using single precision denormals. @@ -3789,7 +3789,7 @@ static inline int16 float_to_half(float f) { // FP16 denormals are rare in practice, I don't know. Whatever slow path your HW // may or may not have for denormals, this may well hit it. 
float fscale = floatbits(fint & round_mask) * floatbits(magic); - fscale = min(fscale, floatbits((31 << 23) - 0x1000)); + fscale = min(fscale, floatbits((31ul << 23) - 0x1000ul)); int32 fint2 = intbits(fscale) - round_mask; if (fint < f32infty) @@ -3956,7 +3956,7 @@ float_to_srgb8(float inval) // Do the table lookup and unpack bias, scale unsigned int tab = table[(intbits(inval) - 0x39000000u) >> 20]; unsigned int bias = (tab >> 16) << 9; - unsigned int scale = tab & 0xffff; + unsigned int scale = tab & 0xfffful; // Grab next-highest mantissa bits and perform linear interpolation unsigned int t = (intbits(inval) >> 12) & 0xff; @@ -4006,7 +4006,7 @@ float_to_srgb8(uniform float inval) // Do the table lookup and unpack bias, scale uniform unsigned int tab = table[(intbits(inval) - 0x39000000u) >> 20]; uniform unsigned int bias = (tab >> 16) << 9; - uniform unsigned int scale = tab & 0xffff; + uniform unsigned int scale = tab & 0xfffful; // Grab next-highest mantissa bits and perform linear interpolation uniform unsigned int t = (intbits(inval) >> 12) & 0xff; @@ -4053,14 +4053,14 @@ static inline uniform unsigned int random(uniform RNGState * uniform state) static inline float frandom(varying RNGState * uniform state) { unsigned int irand = random(state); - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; return floatbits(0x3F800000 | irand)-1.0f; } static inline uniform float frandom(uniform RNGState * uniform state) { uniform unsigned int irand = random(state); - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; return floatbits(0x3F800000 | irand)-1.0f; } @@ -4068,18 +4068,18 @@ static inline void seed_rng(varying RNGState * uniform state, unsigned int seed) { state->z1 = seed; state->z2 = seed ^ 0xbeeff00d; - state->z3 = ((seed & 0xffff) << 16) | (seed >> 16); - state->z4 = (((seed & 0xff) << 24) | ((seed & 0xff00) << 8) | - ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24); + state->z3 = ((seed & 0xfffful) << 16) | (seed >> 16); + state->z4 = (((seed & 0xfful) << 24) | ((seed & 0xff00ul) << 8) | + ((seed & 0xff0000ul) >> 8) | (seed & 0xff000000ul) >> 24); } static inline void seed_rng(uniform RNGState * uniform state, uniform unsigned int seed) { state->z1 = seed; state->z2 = seed ^ 0xbeeff00d; - state->z3 = ((seed & 0xffff) << 16) | (seed >> 16); - state->z4 = (((seed & 0xff) << 24) | ((seed & 0xff00) << 8) | - ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24); + state->z3 = ((seed & 0xfffful) << 16) | (seed >> 16); + state->z4 = (((seed & 0xfful) << 24) | ((seed & 0xff00ul) << 8) | + ((seed & 0xff0000ul) >> 8) | (seed & 0xff000000ul) >> 24); } @@ -4097,7 +4097,7 @@ static inline uniform bool rdrand(float * uniform ptr) { uniform int32 irand; uniform bool success = __rdrand_i32(&irand); if (success) { - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; *ptr = floatbits(0x3F800000 | irand)-1.0f; } return success; @@ -4117,7 +4117,7 @@ static inline bool rdrand(varying float * uniform ptr) { // in vector form. However, we need to be careful to not // clobber any existing already-set values in *ptr with // inactive lanes here... 
- irand &= (1<<23)-1; + irand &= (1ul<<23)-1; *ptr = floatbits(0x3F800000 | irand)-1.0f; success = true; } @@ -4137,7 +4137,7 @@ static inline bool rdrand(float * ptr) { foreach_active (index) { uniform int32 irand; if (__rdrand_i32(&irand)) { - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; *ptrs[index] = floatbits(0x3F800000 | irand)-1.0f; success = true; } diff --git a/tests/aossoa-1.ispc b/tests/aossoa-1.ispc index 59964d6d..32d3bcba 100644 --- a/tests/aossoa-1.ispc +++ b/tests/aossoa-1.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 3 -#define maxProgramCount 64 +#define width 3ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 3; diff --git a/tests/aossoa-2.ispc b/tests/aossoa-2.ispc index 9ff82226..df8eae5c 100644 --- a/tests/aossoa-2.ispc +++ b/tests/aossoa-2.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 4 -#define maxProgramCount 64 +#define width 4ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 4; diff --git a/tests/aossoa-5.ispc b/tests/aossoa-5.ispc index eb4fed3a..d6346455 100644 --- a/tests/aossoa-5.ispc +++ b/tests/aossoa-5.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 3 -#define maxProgramCount 64 +#define width 3ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 3; diff --git a/tests/aossoa-6.ispc b/tests/aossoa-6.ispc index b64cd10b..7c177fde 100644 --- a/tests/aossoa-6.ispc +++ b/tests/aossoa-6.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 4 -#define maxProgramCount 64 +#define width 4ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 4; diff --git a/tests/atomics-12.ispc b/tests/atomics-12.ispc index c27ad99c..d6359555 100644 --- a/tests/atomics-12.ispc +++ b/tests/atomics-12.ispc @@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; float b = 0; if (programIndex < 30 && programIndex & 1) - b = atomic_or_global(&s, (1 << programIndex)); + b = atomic_or_global(&s, (1ul << programIndex)); RET[programIndex] = s; } @@ -15,6 +15,6 @@ export void result(uniform float RET[]) { uniform int sum = 0; for (uniform int i = 0; i < min(30, programCount); ++i) if (i & 1) - sum += (1 << i); + sum += (1ul << i); RET[programIndex] = sum; } diff --git a/tests/atomics-13.ispc b/tests/atomics-13.ispc index 86faaddb..dea3bfc3 100644 --- a/tests/atomics-13.ispc +++ b/tests/atomics-13.ispc @@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; int32 b = 0; if (programIndex < 32 && programIndex & 1) - b = atomic_or_global(&s, (1 << programIndex)); + b = atomic_or_global(&s, (1ul << programIndex)); RET[programIndex] = popcnt(reduce_max((int32)b)); } diff --git a/tests/atomics-4.ispc b/tests/atomics-4.ispc index 30b343d1..ac746ad2 100644 --- a/tests/atomics-4.ispc +++ b/tests/atomics-4.ispc @@ -5,10 +5,10 @@ uniform int32 s = 0; export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; - float b = atomic_or_global(&s, (1<> 2) * 16 + (programIndex & 3)]; diff --git a/tests/coalesce-4.ispc b/tests/coalesce-4.ispc index 
1ddd4b89..182a4d4f 100644 --- a/tests/coalesce-4.ispc +++ b/tests/coalesce-4.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[2*programIndex]; diff --git a/tests/coalesce-5.ispc b/tests/coalesce-5.ispc index 2dd8d44e..385e8526 100644 --- a/tests/coalesce-5.ispc +++ b/tests/coalesce-5.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[4*programIndex]; diff --git a/tests/coalesce-6.ispc b/tests/coalesce-6.ispc index 2a54a2db..8c630a45 100644 --- a/tests/coalesce-6.ispc +++ b/tests/coalesce-6.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[4*programIndex]; diff --git a/tests/coalesce-7.ispc b/tests/coalesce-7.ispc index 8ed628bd..29b56b8d 100644 --- a/tests/coalesce-7.ispc +++ b/tests/coalesce-7.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[4*programIndex]; diff --git a/tests/coalesce-8.ispc b/tests/coalesce-8.ispc index dfefaa19..f01ca9c3 100644 --- a/tests/coalesce-8.ispc +++ b/tests/coalesce-8.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; int index = (programIndex < 4) ? 
(programIndex & 1) : diff --git a/tests/count-leading-trailing-zeros-1.ispc b/tests/count-leading-trailing-zeros-1.ispc index 221d066d..3f12c07d 100644 --- a/tests/count-leading-trailing-zeros-1.ispc +++ b/tests/count-leading-trailing-zeros-1.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - RET[programIndex] = count_trailing_zeros(0xf0); + RET[programIndex] = count_trailing_zeros(0xf0ul); } export void result(uniform float RET[]) { diff --git a/tests/count-leading-trailing-zeros-4.ispc b/tests/count-leading-trailing-zeros-4.ispc index 475c18ca..4b849018 100644 --- a/tests/count-leading-trailing-zeros-4.ispc +++ b/tests/count-leading-trailing-zeros-4.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - int32 i = (1 << (programIndex % 28)); + int32 i = (1ul << (programIndex % 28)); RET[programIndex] = count_leading_zeros(i); } diff --git a/tests/exclusive-scan-and-2.ispc b/tests/exclusive-scan-and-2.ispc index 5d2bcd1f..b742a91e 100644 --- a/tests/exclusive-scan-and-2.ispc +++ b/tests/exclusive-scan-and-2.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { RET[programIndex] = -1; - int32 a = ~(1 << programIndex); + int32 a = ~(1ul << programIndex); if ((programIndex < 32) && (programIndex & 1) == 0) { RET[programIndex] = exclusive_scan_and(a); } @@ -15,7 +15,7 @@ export void result(uniform float RET[]) { if ((programIndex & 1) == 0 && programIndex > 0 && programIndex < 32) { int val = 0xffffffff; for (int i = 0; i < programIndex-1; i += 2) - val &= ~(1<>20); } @@ -15,6 +15,6 @@ export void result(uniform float RET[]) { uniform int sum = 0; for (uniform int i = 0; i < min(32, programCount); ++i) if (i & 1) - sum += (1 << i); + sum += (1ul << i); RET[programIndex] = ((unsigned int64)(0xffffffffff000000 | sum)) >> 20; } diff --git a/tests/local-atomics-4.ispc b/tests/local-atomics-4.ispc index f7f6a04a..b3648ab5 100644 --- a/tests/local-atomics-4.ispc +++ b/tests/local-atomics-4.ispc @@ -7,10 +7,10 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; float b = 0; if (programIndex < 29) - atomic_or_local(&s, (1< struct Foo" for assignment operator is not possible +// Type conversion from "const uniform int[0-9]*" to "soa<4> struct Foo" for assignment operator is not possible struct Pt { float x, y, z; }; diff --git a/tests_errors/soa-12.ispc b/tests_errors/soa-12.ispc index e2cd3242..c0420614 100644 --- a/tests_errors/soa-12.ispc +++ b/tests_errors/soa-12.ispc @@ -1,4 +1,4 @@ -// Can't convert between types "const uniform int32" and "soa<4> float" with different SOA widths +// Can't convert between types "const uniform int[0-9]*" and "soa<4> float" with different SOA widths struct Pt { float x, y, z; }; diff --git a/tests_errors/soa-3.ispc b/tests_errors/soa-3.ispc index b2be1b59..04dc84bc 100644 --- a/tests_errors/soa-3.ispc +++ b/tests_errors/soa-3.ispc @@ -1,4 +1,4 @@ -// syntax error, unexpected '-', expecting int32 constant +// syntax error, unexpected '-', expecting int struct F { float a, b, c; }; diff --git a/tests_errors/soa-4.ispc b/tests_errors/soa-4.ispc index b2be1b59..04dc84bc 100644 --- a/tests_errors/soa-4.ispc +++ b/tests_errors/soa-4.ispc @@ -1,4 +1,4 @@ -// syntax error, unexpected '-', expecting int32 constant +// syntax error, unexpected '-', expecting int struct F { float a, b, c; }; 
diff --git a/tests_errors/soa-9.ispc b/tests_errors/soa-9.ispc index 7c6a1df9..e9e7509a 100644 --- a/tests_errors/soa-9.ispc +++ b/tests_errors/soa-9.ispc @@ -1,4 +1,4 @@ -// Can't convert from pointer to SOA type "soa<8> struct A * uniform" to pointer to non-SOA type "void * varying" +// Can't convert from pointer to SOA type "soa<8> struct A \* uniform" to pointer to non-SOA type "void \* varying" struct A { float a, b; }; diff --git a/tests_errors/struct_arith.ispc b/tests_errors/struct_arith.ispc index 9d942880..df729d02 100644 --- a/tests_errors/struct_arith.ispc +++ b/tests_errors/struct_arith.ispc @@ -1,4 +1,4 @@ -// Assignment operator "+=" is illegal with struct type +// Assignment operator "\+=" is illegal with struct type struct Point { float x, y, z; }; diff --git a/tests_errors/vec-size-compile-constant.ispc b/tests_errors/vec-size-compile-constant.ispc index b9e61721..0eb6f90e 100644 --- a/tests_errors/vec-size-compile-constant.ispc +++ b/tests_errors/vec-size-compile-constant.ispc @@ -1,4 +1,4 @@ -// syntax error, unexpected identifier, expecting int32 constant +// syntax error, unexpected identifier, expecting int void foo(uniform int i) { float a; From c14659c6754f4d91a3bec3cbb48c4e67b7421d13 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 23 Jul 2013 17:02:49 -0700 Subject: [PATCH 006/124] Fix bug in lGetConstantInt() in parse.yy. Previously, we weren't handling signed/unsigned constant types correctly. --- parse.yy | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/parse.yy b/parse.yy index 6ed2a43d..4b315776 100644 --- a/parse.yy +++ b/parse.yy @@ -2278,7 +2278,11 @@ lGetConstantInt(Expr *expr, int *value, SourcePos pos, const char *usage) { Error(pos, "%s must be representable with a 32-bit integer.", usage); return false; } - *value = (int)ci->getZExtValue(); + const Type *type = expr->GetType(); + if (type->IsUnsignedType()) + *value = (int)ci->getZExtValue(); + else + *value = (int)ci->getSExtValue(); return true; } } From 15a3ef370a433eedcf6e6650f07ec81775d0322d Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 23 Jul 2013 17:11:01 -0700 Subject: [PATCH 007/124] Use @llvm.readcyclecounter to implement stdlib clock() function. Also added a test for the clock builtin. 
--- builtins/util.m4 | 14 ++++---------- tests/clock.ispc | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 10 deletions(-) create mode 100644 tests/clock.ispc diff --git a/builtins/util.m4 b/builtins/util.m4 index d6f3e5c3..8c379781 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -2891,17 +2891,11 @@ m4exit(`1') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; read hw clock +declare i64 @llvm.readcyclecounter() + define i64 @__clock() nounwind { -entry: - tail call void asm sideeffect "xorl %eax,%eax \0A cpuid", "~{rax},~{rbx},~{rcx},~{rdx},~{dirflag},~{fpsr},~{flags}"() nounwind - %0 = tail call { i32, i32 } asm sideeffect "rdtsc", "={ax},={dx},~{dirflag},~{fpsr},~{flags}"() nounwind - %asmresult = extractvalue { i32, i32 } %0, 0 - %asmresult1 = extractvalue { i32, i32 } %0, 1 - %conv = zext i32 %asmresult1 to i64 - %shl = shl nuw i64 %conv, 32 - %conv2 = zext i32 %asmresult to i64 - %or = or i64 %shl, %conv2 - ret i64 %or + %r = call i64 @llvm.readcyclecounter() + ret i64 %r } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/tests/clock.ispc b/tests/clock.ispc new file mode 100644 index 00000000..0e95379b --- /dev/null +++ b/tests/clock.ispc @@ -0,0 +1,14 @@ + +export uniform int width() { return programCount; } + + +export void f_f(uniform float RET[], uniform float aFOO[]) { + unsigned uniform int64 a = clock(); + float x = pow(sqrt(aFOO[programIndex]), 5.5); + unsigned uniform int64 b = clock(); + RET[programIndex] = (b - a) > 0 ? 1 : 0; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1; +} From 53414f12e6ce7d1615cd650cc7b2152063da6556 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 23 Jul 2013 17:30:32 -0700 Subject: [PATCH 008/124] Add SSE4 target optimized for computation with 8-bit datatypes. This change adds a new 'sse4-8' target, where programCount is 16 and the mask element size is 8-bits. (i.e. the most appropriate sizing of the mask for SIMD computation with 8-bit datatypes.) --- Makefile | 2 +- builtins.cpp | 9 + builtins/target-sse4-8.ll | 444 ++++++++++++++++++++++++++++++++++++++ builtins/util.m4 | 104 ++++++++- expr.cpp | 5 + ispc.cpp | 8 + opt.cpp | 13 +- 7 files changed, 578 insertions(+), 7 deletions(-) create mode 100644 builtins/target-sse4-8.ll diff --git a/Makefile b/Makefile index 043ab4cf..054a3da1 100644 --- a/Makefile +++ b/Makefile @@ -123,7 +123,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h TARGETS=neon avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \ - generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 + sse4-8 generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 # These files need to be compiled in two versions - 32 and 64 bits. BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) # These are files to be compiled in single version. 
diff --git a/builtins.cpp b/builtins.cpp index d3bbaa6a..6c586595 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -868,6 +868,15 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod EXPORT_MODULE(builtins_bitcode_sse4_x2_64bit); } break; + case 16: + Assert(g->target->getMaskBitCount() == 8); + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_sse4_8_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_sse4_8_64bit); + } + break; default: FATAL("logic error in DefineStdlib"); } diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll new file mode 100644 index 00000000..c85209ba --- /dev/null +++ b/builtins/target-sse4-8.ll @@ -0,0 +1,444 @@ +;; Copyright (c) 2013, Google, Inc. +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Google, Inc. nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Define common 4-wide stuff +define(`WIDTH',`16') +define(`MASK',`i8') +include(`util.m4') + +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse4-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare float @__half_to_float_uniform(i16 %v) nounwind readnone +declare @__half_to_float_varying( %v) nounwind readnone +declare i16 @__float_to_half_uniform(float %v) nounwind readnone +declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + +define @__rcp_varying_float() nounwind readonly alwaysinline { + unary4to16(call, float, @llvm.x86.sse.rcp.ps, %0) + ; do one N-R iteration to improve precision + ; float iv = __rcp_v(v); + ; return iv * (2. 
- v * iv); + %v_iv = fmul <16 x float> %0, %call + %two_minus = fsub <16 x float> , %v_iv + %iv_mul = fmul <16 x float> %call, %two_minus + ret <16 x float> %iv_mul +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; rsqrt + +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + unary4to16(is, float, @llvm.x86.sse.rsqrt.ps, %v) + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. - (v * is) * is); + %v_is = fmul <16 x float> %v, %is + %v_is_is = fmul <16 x float> %v_is, %is + %three_sub = fsub <16 x float> , %v_is_is + %is_mul = fmul <16 x float> %is, %three_sub + %half_scale = fmul <16 x float> , %is_mul + ret <16 x float> %half_scale +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; sqrt + +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + +define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline { + unary4to16(call, float, @llvm.x86.sse.sqrt.ps, %0) + ret <16 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone + +define <16 x double> @__sqrt_varying_double(<16 x double>) nounwind +alwaysinline { + unary2to16(ret, double, @llvm.x86.sse2.sqrt.pd, %0) + ret <16 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding floats + +declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone + +define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 + round4to16(%0, 8) +} + +define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + round4to16(%0, 9) +} + +define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + round4to16(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding doubles + +declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone + +define <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline { +; XXXround2to4double(%0, 8) + ; FIXME: need round2to16double in util.m4... 
+ ret <16 x double> undef +} + +define <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline { + ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 +; XXXround2to4double(%0, 9) + ret <16 x double> undef +} + +define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline { + ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10 +; XXXround2to4double(%0, 10) + ret <16 x double> undef +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float min/max + +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + +define <16 x float> @__max_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline { + binary4to16(call, float, @llvm.x86.sse.max.ps, %0, %1) + ret <16 x float> %call +} + +define <16 x float> @__min_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline { + binary4to16(call, float, @llvm.x86.sse.min.ps, %0, %1) + ret <16 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 min/max + +define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pminsd, %0, %1) + ret <16 x i32> %call +} + +define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1) + ret <16 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unsigned int min/max + +define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pminud, %0, %1) + ret <16 x i32> %call +} + +define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pmaxud, %0, %1) + ret <16 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone +declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone + +define <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone { + binary2to16(ret, double, @llvm.x86.sse2.min.pd, %0, %1) + ret <16 x double> %ret +} + +define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone { + binary2to16(ret, double, @llvm.x86.sse2.max.pd, %0, %1) + ret <16 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +; FIXME + +declare <16 x float> @__svml_sin(<16 x float>) +declare <16 x float> @__svml_cos(<16 x float>) +declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) +declare <16 x float> @__svml_tan(<16 x float>) +declare <16 x float> @__svml_atan(<16 x float>) +declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) +declare <16 x float> @__svml_exp(<16 x float>) +declare <16 x float> @__svml_log(<16 x float>) +declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; horizontal ops / reductions + +declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone + +define i64 @__movmsk(<16 x i8>) nounwind readnone 
alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %m64 = zext i32 %m to i64 + ret i64 %m64 +} + +define i1 @__any(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %mne = icmp ne i32 %m, 0 + ret i1 %mne +} + +define i1 @__all(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %meq = icmp eq i32 %m, ALL_ON_MASK + ret i1 %meq +} + +define i1 @__none(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %meq = icmp eq i32 %m, 0 + ret i1 %meq +} + +define internal <16 x float> @__add_varying_float(<16 x float>, <16 x float>) { + %r = fadd <16 x float> %0, %1 + ret <16 x float> %r +} + +define internal float @__add_uniform_float(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline { + reduce16(float, @__add_varying_float, @__add_uniform_float) +} + +define float @__reduce_min_float(<16 x float>) nounwind readnone { + reduce16(float, @__min_varying_float, @__min_uniform_float) +} + +define float @__reduce_max_float(<16 x float>) nounwind readnone { + reduce16(float, @__max_varying_float, @__max_uniform_float) +} + +define internal <16 x i32> @__add_varying_int32(<16 x i32>, <16 x i32>) { + %r = add <16 x i32> %0, %1 + ret <16 x i32> %r +} + +define internal i32 @__add_uniform_int32(i32, i32) { + %r = add i32 %0, %1 + ret i32 %r +} + +define i32 @__reduce_add_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__add_varying_int32, @__add_uniform_int32) +} + +define i32 @__reduce_min_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__min_varying_int32, @__min_uniform_int32) +} + +define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__max_varying_int32, @__max_uniform_int32) +} + +define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone { + reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32) +} + +define i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone { + reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32) +} + +define internal <16 x double> @__add_varying_double(<16 x double>, <16 x double>) { + %r = fadd <16 x double> %0, %1 + ret <16 x double> %r +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define double @__reduce_add_double(<16 x double>) nounwind readnone { + reduce16(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double(<16 x double>) nounwind readnone { + reduce16(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double(<16 x double>) nounwind readnone { + reduce16(double, @__max_varying_double, @__max_uniform_double) +} + +define internal <16 x i64> @__add_varying_int64(<16 x i64>, <16 x i64>) { + %r = add <16 x i64> %0, %1 + ret <16 x i64> %r +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define i64 @__reduce_add_int64(<16 x i64>) nounwind readnone { + reduce16(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define i64 @__reduce_min_int64(<16 x i64>) nounwind readnone { + reduce16(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone { + reduce16(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone { + reduce16(i64, 
@__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone { + reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +reduce_equal(16) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +define void @__masked_store_blend_i64(<16 x i64>* nocapture, <16 x i64>, + <16 x i8> %mask) nounwind + alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i64>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i64> %1, <16 x i64> %old + store <16 x i64> %blend, <16 x i64>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i32(<16 x i32>* nocapture, <16 x i32>, + <16 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i32>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i32> %1, <16 x i32> %old + store <16 x i32> %blend, <16 x i32>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, + <16 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i16>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i16> %1, <16 x i16> %old + store <16 x i16> %blend, <16 x i16>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>, + <16 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i8>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i8> %1, <16 x i8> %old + store <16 x i8> %blend, <16 x i8>* %0, align 4 + ret void +} + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) + +masked_store_float_double() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather/scatter + +; define these with the macros from stdlib.m4 + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) diff --git a/builtins/util.m4 b/builtins/util.m4 index 8c379781..ee45ebc7 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -411,6 +411,42 @@ define(`unary2to8', ` ' ) +define(`unary2to16', ` + %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1) + %$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2) + %$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3) + %$1_4 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4) + %$1_5 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5) + %$1_6 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6) + %$1_7 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7) + 
%$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> + %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, + <4 x i32> + %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, + <4 x i32> + %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, + <8 x i32> + + %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, + <16 x i32> +' +) + ;; Maps an 2-wide binary function to two 8-wide vector operands ;; $1: name of variable into which the final result should go ;; $2: scalar type of the vector elements @@ -432,12 +468,58 @@ define(`binary2to8', ` %$1_3b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> +' +) + +define(`binary2to16', ` + %$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b) + %$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b) + %$1_2a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_2b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2a, <2 x $2> %$1_2b) + %$1_3a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_3b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) + %$1_4a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_4b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4a, <2 x $2> %$1_4b) + %$1_5a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_5b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5a, <2 x $2> %$1_5b) + %$1_6a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_6b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6a, <2 x $2> %$1_6b) + %$1_7a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_7b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7a, <2 x $2> %$1_7b) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, <4 x i32> %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, <4 x i32> - %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, <8 x i32> + + %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, + <4 x i32> + %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, + <4 x i32> + %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, + <8 x i32> + + %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, + <16 x i32> ' ) @@ -460,6 +542,26 @@ ret <8 x float> %ret ' ) +define(`round4to16', ` +%v0 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v1 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v2 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v3 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%r0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v0, i32 $2) +%r1 = 
call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v1, i32 $2) +%r2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v2, i32 $2) +%r3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v3, i32 $2) +%ret01 = shufflevector <4 x float> %r0, <4 x float> %r1, + <8 x i32> +%ret23 = shufflevector <4 x float> %r2, <4 x float> %r3, + <8 x i32> +%ret = shufflevector <8 x float> %ret01, <8 x float> %ret23, + <16 x i32> +ret <16 x float> %ret +' +) + define(`round8to16', ` %v0 = shufflevector <16 x float> $1, <16 x float> undef, <8 x i32> diff --git a/expr.cpp b/expr.cpp index 6bde2acb..f81037f6 100644 --- a/expr.cpp +++ b/expr.cpp @@ -3123,6 +3123,10 @@ static llvm::Value * lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test, llvm::Value *expr1, llvm::Value *expr2, const Type *type) { +#if !defined(LLVM_3_1) + test = ctx->TruncInst(test, LLVMTypes::Int1VectorType); + return ctx->SelectInst(test, expr1, expr2, "select"); +#else llvm::Value *resultPtr = ctx->AllocaInst(expr1->getType(), "selectexpr_tmp"); // Don't need to worry about masking here ctx->StoreInst(expr2, resultPtr); @@ -3131,6 +3135,7 @@ lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test, PointerType::GetUniform(type)->LLVMType(g->ctx)); ctx->StoreInst(expr1, resultPtr, test, type, PointerType::GetUniform(type)); return ctx->LoadInst(resultPtr, "selectexpr_final"); +#endif // !LLVM_3_1 } diff --git a/ispc.cpp b/ispc.cpp index 887f6ca3..6ac23781 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -310,6 +310,14 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } + else if (!strcasecmp(isa, "sse4-8")) { + this->m_isa = Target::SSE4; + this->m_nativeVectorWidth = 16; + this->m_vectorWidth = 16; + this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + this->m_maskingIsFree = false; + this->m_maskBitCount = 8; + } else if (!strcasecmp(isa, "generic-4")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 4; diff --git a/opt.cpp b/opt.cpp index ba32c639..4701e7df 100644 --- a/opt.cpp +++ b/opt.cpp @@ -670,14 +670,17 @@ IntrinsicsOpt::IntrinsicsOpt() // All of the mask instructions we may encounter. Note that even if // compiling for AVX, we may still encounter the regular 4-wide SSE // MOVMSK instruction. - llvm::Function *sseMovmsk = + llvm::Function *ssei8Movmsk = + llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse2_pmovmskb_128); + maskInstructions.push_back(ssei8Movmsk); + llvm::Function *sseFloatMovmsk = llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse_movmsk_ps); - maskInstructions.push_back(sseMovmsk); + maskInstructions.push_back(sseFloatMovmsk); maskInstructions.push_back(m->module->getFunction("__movmsk")); - llvm::Function *avxMovmsk = + llvm::Function *avxFloatMovmsk = llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_movmsk_ps_256); - Assert(avxMovmsk != NULL); - maskInstructions.push_back(avxMovmsk); + Assert(avxFloatMovmsk != NULL); + maskInstructions.push_back(avxFloatMovmsk); // And all of the blend instructions blendInstructions.push_back(BlendInstruction( From 04d61afa23a64d9fc5f95648509bd5ec002da53e Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 25 Jul 2013 09:40:48 -0700 Subject: [PATCH 009/124] Fix bug in lEmitVaryingSelect() for targets with i1 mask types. 
Commit 53414f12e6c introduced a bug where lEmitVaryingSelect() would try
to truncate a vector of i1s to a vector of i1s, which in turn made LLVM's
IR analyzer unhappy.
---
 expr.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/expr.cpp b/expr.cpp
index f81037f6..856d363c 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -3124,7 +3124,8 @@ lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test,
                     llvm::Value *expr1, llvm::Value *expr2,
                     const Type *type) {
 #if !defined(LLVM_3_1)
-    test = ctx->TruncInst(test, LLVMTypes::Int1VectorType);
+    if (test->getType() != LLVMTypes::Int1VectorType)
+        test = ctx->TruncInst(test, LLVMTypes::Int1VectorType);
     return ctx->SelectInst(test, expr1, expr2, "select");
 #else
     llvm::Value *resultPtr = ctx->AllocaInst(expr1->getType(), "selectexpr_tmp");

From 780b0dfe47a770785c4fe1f224813e3a518cd135 Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Wed, 24 Jul 2013 09:40:50 -0700
Subject: [PATCH 010/124] Add SSE4-16 target.

Along the lines of sse4-8, this is an 8-wide target for SSE4, using
16-bit elements for the mask. It's thus (in principle) the best target
for SIMD computation with 16-bit datatypes.
---
 Makefile                   |   2 +-
 builtins.cpp               |  16 +-
 builtins/target-sse4-16.ll | 436 +++++++++++++++++++++++++++++++++++++
 ispc.cpp                   |  14 +-
 run_tests.py               |   2 +-
 5 files changed, 463 insertions(+), 7 deletions(-)
 create mode 100644 builtins/target-sse4-16.ll

diff --git a/Makefile b/Makefile
index 054a3da1..fc064dbd 100644
--- a/Makefile
+++ b/Makefile
@@ -123,7 +123,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
 TARGETS=neon avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \
-	sse4-8 generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
+	sse4-8 sse4-16 generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
 # These files need to be compiled in two versions - 32 and 64 bits.
 BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS)))
 # These are files to be compiled in single version.
diff --git a/builtins.cpp b/builtins.cpp
index 6c586595..c4a2f3b5 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -862,10 +862,22 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
         break;
     case 8:
         if (runtime32) {
-            EXPORT_MODULE(builtins_bitcode_sse4_x2_32bit);
+            if (g->target->getMaskBitCount() == 16) {
+                EXPORT_MODULE(builtins_bitcode_sse4_16_32bit);
+            }
+            else {
+                Assert(g->target->getMaskBitCount() == 32);
+                EXPORT_MODULE(builtins_bitcode_sse4_x2_32bit);
+            }
         }
         else {
-            EXPORT_MODULE(builtins_bitcode_sse4_x2_64bit);
+            if (g->target->getMaskBitCount() == 16) {
+                EXPORT_MODULE(builtins_bitcode_sse4_16_64bit);
+            }
+            else {
+                Assert(g->target->getMaskBitCount() == 32);
+                EXPORT_MODULE(builtins_bitcode_sse4_x2_64bit);
+            }
         }
         break;
     case 16:
diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll
new file mode 100644
index 00000000..2044fbee
--- /dev/null
+++ b/builtins/target-sse4-16.ll
@@ -0,0 +1,436 @@
+;; Copyright (c) 2013, Google, Inc.
+;; All rights reserved.
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are
+;; met:
+;;
+;; * Redistributions of source code must retain the above copyright
+;; notice, this list of conditions and the following disclaimer.
+;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Google, Inc. nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Define common 4-wide stuff +define(`WIDTH',`8') +define(`MASK',`i16') +include(`util.m4') + +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse4-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare float @__half_to_float_uniform(i16 %v) nounwind readnone +declare @__half_to_float_varying( %v) nounwind readnone +declare i16 @__float_to_half_uniform(float %v) nounwind readnone +declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + +define @__rcp_varying_float() nounwind readonly alwaysinline { + unary4to8(call, float, @llvm.x86.sse.rcp.ps, %0) + ; do one N-R iteration to improve precision + ; float iv = __rcp_v(v); + ; return iv * (2. - v * iv); + %v_iv = fmul <8 x float> %0, %call + %two_minus = fsub <8 x float> , %v_iv + %iv_mul = fmul <8 x float> %call, %two_minus + ret <8 x float> %iv_mul +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; rsqrt + +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v) + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. 
- (v * is) * is); + %v_is = fmul <8 x float> %v, %is + %v_is_is = fmul <8 x float> %v_is, %is + %three_sub = fsub <8 x float> , %v_is_is + %is_mul = fmul <8 x float> %is, %three_sub + %half_scale = fmul <8 x float> , %is_mul + ret <8 x float> %half_scale +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; sqrt + +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + +define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline { + unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0) + ret <8 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone + +define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind +alwaysinline { + unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0) + ret <8 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding floats + +declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone + +define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 + round4to8(%0, 8) +} + +define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + round4to8(%0, 9) +} + +define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + round4to8(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding doubles + +declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone + +define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline { + round2to8double(%0, 8) +} + +define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline { + round2to8double(%0, 9) +} + +define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline { + round2to8double(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float min/max + +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + +define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline { + binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1) + ret <8 x float> %call +} + +define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline { + binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1) + ret <8 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 min/max + +define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pminsd, %0, %1) + ret <8 x i32> %call +} + +define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1) + ret <8 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unsigned int min/max + +define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + 
binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1) + ret <8 x i32> %call +} + +define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1) + ret <8 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone +declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone + +define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone { + binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1) + ret <8 x double> %ret +} + +define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone { + binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1) + ret <8 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +; FIXME + +declare <8 x float> @__svml_sin(<8 x float>) +declare <8 x float> @__svml_cos(<8 x float>) +declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) +declare <8 x float> @__svml_tan(<8 x float>) +declare <8 x float> @__svml_atan(<8 x float>) +declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) +declare <8 x float> @__svml_exp(<8 x float>) +declare <8 x float> @__svml_log(<8 x float>) +declare <8 x float> @__svml_pow(<8 x float>, <8 x float>) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; horizontal ops / reductions + +declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone + +define i64 @__movmsk(<8 x MASK>) nounwind readnone alwaysinline { + %m8 = trunc <8 x MASK> %0 to <8 x i8> + %mask8 = shufflevector <8 x i8> %m8, <8 x i8> zeroinitializer, + <16 x i32> + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %mask8) + %m64 = zext i32 %m to i64 + ret i64 %m64 +} + +define i1 @__any(<8 x MASK>) nounwind readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %mne = icmp ne i64 %m, 0 + ret i1 %mne +} + +define i1 @__all(<8 x MASK>) nounwind readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %meq = icmp eq i64 %m, ALL_ON_MASK + ret i1 %meq +} + +define i1 @__none(<8 x MASK>) nounwind readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %meq = icmp eq i64 %m, 0 + ret i1 %meq +} + +define internal <8 x float> @__add_varying_float(<8 x float>, <8 x float>) { + %r = fadd <8 x float> %0, %1 + ret <8 x float> %r +} + +define internal float @__add_uniform_float(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline { + reduce8(float, @__add_varying_float, @__add_uniform_float) +} + +define float @__reduce_min_float(<8 x float>) nounwind readnone { + reduce8(float, @__min_varying_float, @__min_uniform_float) +} + +define float @__reduce_max_float(<8 x float>) nounwind readnone { + reduce8(float, @__max_varying_float, @__max_uniform_float) +} + +define internal <8 x i32> @__add_varying_int32(<8 x i32>, <8 x i32>) { + %r = add <8 x i32> %0, %1 + ret <8 x i32> %r +} + +define internal i32 @__add_uniform_int32(i32, i32) { + %r = add i32 %0, %1 + ret i32 %r +} + +define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__add_varying_int32, @__add_uniform_int32) +} + +define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__min_varying_int32, @__min_uniform_int32) +} + +define i32 
@__reduce_max_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__max_varying_int32, @__max_uniform_int32) +} + +define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone { + reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32) +} + +define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone { + reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32) +} + +define internal <8 x double> @__add_varying_double(<8 x double>, <8 x double>) { + %r = fadd <8 x double> %0, %1 + ret <8 x double> %r +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define double @__reduce_add_double(<8 x double>) nounwind readnone { + reduce8(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double(<8 x double>) nounwind readnone { + reduce8(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double(<8 x double>) nounwind readnone { + reduce8(double, @__max_varying_double, @__max_uniform_double) +} + +define internal <8 x i64> @__add_varying_int64(<8 x i64>, <8 x i64>) { + %r = add <8 x i64> %0, %1 + ret <8 x i64> %r +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone { + reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone { + reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +reduce_equal(8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +define void @__masked_store_blend_i64(<8 x i64>* nocapture, <8 x i64>, + <8 x MASK> %mask) nounwind + alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i64>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i64> %1, <8 x i64> %old + store <8 x i64> %blend, <8 x i64>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i32>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i32> %1, <8 x i32> %old + store <8 x i32> %blend, <8 x i32>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i16(<8 x i16>* nocapture, <8 x i16>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i16>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i16> %1, <8 x i16> %old + store <8 x i16> %blend, <8 x i16>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i8>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i8> %1, <8 x i8> %old + store <8 x i8> %blend, <8 x i8>* %0, align 4 + ret void +} + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) + +masked_store_float_double() + 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather/scatter + +; define these with the macros from stdlib.m4 + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) diff --git a/ispc.cpp b/ispc.cpp index 6ac23781..a9f5ff5c 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -318,6 +318,14 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 8; } + else if (!strcasecmp(isa, "sse4-16")) { + this->m_isa = Target::SSE4; + this->m_nativeVectorWidth = 8; + this->m_vectorWidth = 8; + this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + this->m_maskingIsFree = false; + this->m_maskBitCount = 16; + } else if (!strcasecmp(isa, "generic-4")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 4; @@ -575,9 +583,9 @@ Target::SupportedTargetArchs() { const char * Target::SupportedTargetISAs() { - return "neon, sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2" - ", avx1.1, avx1.1-x2, avx2, avx2-x2" - ", generic-1, generic-4, generic-8, generic-16, generic-32"; + return "neon, sse2, sse2-x2, sse4, sse4-8, sse4-16, sse4-x2, " + "avx, avx-x2, avx1.1, avx1.1-x2, avx2, avx2-x2," + "generic-1, generic-4, generic-8, generic-16, generic-32"; } diff --git a/run_tests.py b/run_tests.py index 296db867..ea819ea4 100755 --- a/run_tests.py +++ b/run_tests.py @@ -37,7 +37,7 @@ parser.add_option("-g", "--generics-include", dest="include_file", help="Filenam parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", default="") parser.add_option('-t', '--target', dest='target', - help='Set compilation target (neon, sse2, sse2-x2, sse4, sse4-x2, sse4-8, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', + help='Set compilation target (neon, sse2, sse2-x2, sse4, sse4-x2, sse4-8, sse4-16, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', default="sse4") parser.add_option('-a', '--arch', dest='arch', help='Set architecture (arm, x86, x86-64)', From bba84f247c34f67ed28a357d19a4a7414c590c2b Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 24 Jul 2013 15:08:07 -0700 Subject: [PATCH 011/124] Improved optimization of vector select instructions. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Various LLVM optimization passes are turning code like: %cmp = icmp lt <8 x i32> %foo, %bar %cmp32 = sext <8 x i1> %cmp to <8 x i32> . . . %cmp1 = trunc <8 x i32> %cmp32 to <8 x i1> %result = select <8 x i1> %cmp1, . . . Into: %cmp = icmp lt <8 x i32> %foo, %bar %cmp32 = zext <8 x i1> %cmp to <8 x i32> # note: zext . . . %cmp1 = icmp ne <8 x i32> %cmp32, zeroinitializer %result = select <8 x i1> %cmp1, … Which in turn isn't matched well by the LLVM code generators, which in turn leads to fairly inefficient code. (i.e. it doesn't just emit a vector compare and blend instruction.) Also, renamed VSelMovmskOptPass to InstructionSimplifyPass to better describe its functionality. 
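For illustration only (this sketch is not part of the diff below, and the
select operands %a and %b are placeholder names), the pattern the new pass
tries to leave behind is simply the original compare feeding the select:

; after InstructionSimplifyPass: the zext/icmp-ne round-trip is folded away,
; so the select consumes the i1 result of the original vector compare directly
; and the backend can match a single compare + blend. (LLVM IR spells the
; signed comparison "slt"; "lt" above is shorthand.)
%cmp = icmp slt <8 x i32> %foo, %bar
%result = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b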
--- opt.cpp | 175 ++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 126 insertions(+), 49 deletions(-) diff --git a/opt.cpp b/opt.cpp index 4701e7df..8efdbc67 100644 --- a/opt.cpp +++ b/opt.cpp @@ -108,7 +108,7 @@ #endif static llvm::Pass *CreateIntrinsicsOptPass(); -static llvm::Pass *CreateVSelMovmskOptPass(); +static llvm::Pass *CreateInstructionSimplifyPass(); static llvm::Pass *CreateImproveMemoryOpsPass(); static llvm::Pass *CreateGatherCoalescePass(); @@ -476,7 +476,7 @@ Optimize(llvm::Module *module, int optLevel) { } if (!g->opt.disableMaskAllOnOptimizations) { optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); } optPM.add(llvm::createDeadInstEliminationPass()); @@ -519,7 +519,7 @@ Optimize(llvm::Module *module, int optLevel) { if (!g->opt.disableMaskAllOnOptimizations) { optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); } if (g->opt.disableGatherScatterOptimizations == false && @@ -539,7 +539,7 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createFunctionInliningPass()); optPM.add(llvm::createConstantPropagationPass()); optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) { @@ -555,18 +555,20 @@ Optimize(llvm::Module *module, int optLevel) { if (g->opt.disableHandlePseudoMemoryOps == false) optPM.add(CreateReplacePseudoMemoryOpsPass()); optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createFunctionInliningPass()); optPM.add(llvm::createArgumentPromotionPass()); optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold, false)); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createReassociatePass()); optPM.add(llvm::createLoopRotatePass()); optPM.add(llvm::createLICMPass()); optPM.add(llvm::createLoopUnswitchPass(false)); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createIndVarSimplifyPass()); optPM.add(llvm::createLoopIdiomPass()); optPM.add(llvm::createLoopDeletionPass()); @@ -576,17 +578,19 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(CreateIsCompileTimeConstantPass(true)); optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createMemCpyOptPass()); optPM.add(llvm::createSCCPPass()); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createJumpThreadingPass()); optPM.add(llvm::createCorrelatedValuePropagationPass()); optPM.add(llvm::createDeadStoreEliminationPass()); optPM.add(llvm::createAggressiveDCEPass()); optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createStripDeadPrototypesPass()); optPM.add(CreateMakeInternalFuncsStaticPass()); optPM.add(llvm::createGlobalDCEPass()); @@ -927,80 +931,153 @@ CreateIntrinsicsOptPass() { @todo The better thing to do would be to submit a patch to LLVM to get these; they're presumably pretty simple patterns to match. 
*/ -class VSelMovmskOpt : public llvm::BasicBlockPass { +class InstructionSimplifyPass : public llvm::BasicBlockPass { public: - VSelMovmskOpt() + InstructionSimplifyPass() : BasicBlockPass(ID) { } const char *getPassName() const { return "Vector Select Optimization"; } bool runOnBasicBlock(llvm::BasicBlock &BB); static char ID; + +private: + static bool simplifySelect(llvm::SelectInst *selectInst, + llvm::BasicBlock::iterator iter); + static llvm::Value *simplifyBoolVec(llvm::Value *value); + static bool simplifyCall(llvm::CallInst *callInst, + llvm::BasicBlock::iterator iter); }; -char VSelMovmskOpt::ID = 0; +char InstructionSimplifyPass::ID = 0; + + +llvm::Value * +InstructionSimplifyPass::simplifyBoolVec(llvm::Value *value) { + llvm::TruncInst *trunc = llvm::dyn_cast(value); + if (trunc != NULL) { + // Convert trunc({sext,zext}(i1 vector)) -> (i1 vector) + llvm::SExtInst *sext = llvm::dyn_cast(value); + if (sext && + sext->getOperand(0)->getType() == LLVMTypes::Int1VectorType) + return sext->getOperand(0); + + llvm::ZExtInst *zext = llvm::dyn_cast(value); + if (zext && + zext->getOperand(0)->getType() == LLVMTypes::Int1VectorType) + return zext->getOperand(0); + } + + llvm::ICmpInst *icmp = llvm::dyn_cast(value); + if (icmp != NULL) { + // icmp(ne, {sext,zext}(foo), zeroinitializer) -> foo + if (icmp->getSignedPredicate() == llvm::CmpInst::ICMP_NE) { + llvm::Value *op1 = icmp->getOperand(1); + if (llvm::isa(op1)) { + llvm::Value *op0 = icmp->getOperand(0); + llvm::SExtInst *sext = llvm::dyn_cast(op0); + if (sext) + return sext->getOperand(0); + llvm::ZExtInst *zext = llvm::dyn_cast(op0); + if (zext) + return zext->getOperand(0); + } + } + } + return NULL; +} bool -VSelMovmskOpt::runOnBasicBlock(llvm::BasicBlock &bb) { - DEBUG_START_PASS("VSelMovmaskOpt"); +InstructionSimplifyPass::simplifySelect(llvm::SelectInst *selectInst, + llvm::BasicBlock::iterator iter) { + if (selectInst->getType()->isVectorTy() == false) + return false; + + llvm::Value *factor = selectInst->getOperand(0); + + // Simplify all-on or all-off mask values + MaskStatus maskStatus = lGetMaskStatus(factor); + llvm::Value *value = NULL; + if (maskStatus == ALL_ON) + // Mask all on -> replace with the first select value + value = selectInst->getOperand(1); + else if (maskStatus == ALL_OFF) + // Mask all off -> replace with the second select value + value = selectInst->getOperand(2); + if (value != NULL) { + llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), + iter, value); + return true; + } + + // Sometimes earlier LLVM optimization passes generate unnecessarily + // complex expressions for the selection vector, which in turn confuses + // the code generators and leads to sub-optimal code (particularly for + // 8 and 16-bit masks). We'll try to simplify them out here so that + // the code generator patterns match.. + if ((factor = simplifyBoolVec(factor)) != NULL) { + llvm::Instruction *newSelect = + llvm::SelectInst::Create(factor, selectInst->getOperand(1), + selectInst->getOperand(2), + selectInst->getName()); + llvm::ReplaceInstWithInst(selectInst, newSelect); + return true; + } + + return false; +} + + +bool +InstructionSimplifyPass::simplifyCall(llvm::CallInst *callInst, + llvm::BasicBlock::iterator iter) { + llvm::Function *calledFunc = callInst->getCalledFunction(); + + // Turn a __movmsk call with a compile-time constant vector into the + // equivalent scalar value. 
+ if (calledFunc == NULL || calledFunc != m->module->getFunction("__movmsk")) + return false; + + uint64_t mask; + if (lGetMask(callInst->getArgOperand(0), &mask) == true) { + llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), + iter, LLVMInt64(mask)); + return true; + } + return false; +} + + +bool +InstructionSimplifyPass::runOnBasicBlock(llvm::BasicBlock &bb) { + DEBUG_START_PASS("InstructionSimplify"); bool modifiedAny = false; restart: for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) { llvm::SelectInst *selectInst = llvm::dyn_cast(&*iter); - if (selectInst != NULL && selectInst->getType()->isVectorTy()) { - llvm::Value *factor = selectInst->getOperand(0); - - MaskStatus maskStatus = lGetMaskStatus(factor); - llvm::Value *value = NULL; - if (maskStatus == ALL_ON) - // Mask all on -> replace with the first select value - value = selectInst->getOperand(1); - else if (maskStatus == ALL_OFF) - // Mask all off -> replace with the second select value - value = selectInst->getOperand(2); - - if (value != NULL) { - llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), - iter, value); - modifiedAny = true; - goto restart; - } + if (selectInst && simplifySelect(selectInst, iter)) { + modifiedAny = true; + goto restart; } - llvm::CallInst *callInst = llvm::dyn_cast(&*iter); - if (callInst == NULL) - continue; - - llvm::Function *calledFunc = callInst->getCalledFunction(); - if (calledFunc == NULL || calledFunc != m->module->getFunction("__movmsk")) - continue; - - uint64_t mask; - if (lGetMask(callInst->getArgOperand(0), &mask) == true) { -#if 0 - fprintf(stderr, "mask %d\n", mask); - callInst->getArgOperand(0)->dump(); - fprintf(stderr, "-----------\n"); -#endif - llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), - iter, LLVMInt64(mask)); + if (callInst && simplifyCall(callInst, iter)) { modifiedAny = true; goto restart; } } - DEBUG_END_PASS("VSelMovMskOpt"); + DEBUG_END_PASS("InstructionSimplify"); return modifiedAny; } static llvm::Pass * -CreateVSelMovmskOptPass() { - return new VSelMovmskOpt; +CreateInstructionSimplifyPass() { + return new InstructionSimplifyPass; } From 2d063925a1d5ab758bcdd22454c201ac7d617dd3 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 24 Jul 2013 15:10:08 -0700 Subject: [PATCH 012/124] Explicitly call the PBLENDVB intrinsic for i8 blending with sse4-8. This is slightly cleaner than trunc-ing the i8 mask to i1 and using a vector select. (And is probably more safe in terms of good code.) 
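For reference, a hand-written sketch of the two forms (not taken from the
diff below; %old is the previously loaded value and %new stands in for the
value being stored, and this assumes the usual PBLENDVB semantics, where
each result byte is taken from the second source when the high bit of the
corresponding mask byte is set):

; previous emulation: narrow the i8 mask to i1 and use a generic vector select
%m1 = trunc <16 x i8> %mask to <16 x i1>
%blend_select = select <16 x i1> %m1, <16 x i8> %new, <16 x i8> %old

; with this change: hand the all-ones/all-zeros i8 mask straight to the
; intrinsic; pblendvb keys off the high bit of each mask byte, so no trunc
; is needed and the call maps directly to a single PBLENDVB.
%blend_pblendvb = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %old,
                                                          <16 x i8> %new,
                                                          <16 x i8> %mask)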
--- builtins/target-sse4-8.ll | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index c85209ba..cd8fdce2 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -398,11 +398,13 @@ define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, ret void } +declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone + define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>, <16 x MASK> %mask) nounwind alwaysinline { - %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> %old = load <16 x i8>* %0, align 4 - %blend = select <16 x i1> %mask_as_i1, <16 x i8> %1, <16 x i8> %old + %blend = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %old, <16 x i8> %1, + <16 x i8> %mask) store <16 x i8> %blend, <16 x i8>* %0, align 4 ret void } From b6df447b550507ba77dde70758a5bdaf0e079f95 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 25 Jul 2013 09:11:39 -0700 Subject: [PATCH 013/124] Add reduce_add() for int8 and int16 types. This maps to specialized instructions (e.g. PSADBW) when available. --- builtins.cpp | 2 ++ builtins/target-avx-x2.ll | 27 ++++++++++++++++++ builtins/target-avx.ll | 46 ++++++++++++++++++++++++------- builtins/target-generic-1.ll | 9 ++++++ builtins/target-generic-common.ll | 7 +++-- builtins/target-neon.ll | 33 ++++++++++++++++++---- builtins/target-sse2-x2.ll | 30 ++++++++++++++++++++ builtins/target-sse2.ll | 30 ++++++++++++++++++++ builtins/target-sse4-16.ll | 30 ++++++++++++++++++++ builtins/target-sse4-8.ll | 27 ++++++++++++++++++ builtins/target-sse4-x2.ll | 30 ++++++++++++++++++++ builtins/target-sse4.ll | 30 ++++++++++++++++++++ docs/ispc.rst | 39 ++++++++++++++++++-------- examples/intrinsics/generic-16.h | 9 +++--- examples/intrinsics/generic-32.h | 9 +++--- examples/intrinsics/generic-64.h | 9 +++--- examples/intrinsics/knc.h | 16 +++++++++++ examples/intrinsics/knc2x.h | 3 ++ examples/intrinsics/sse4.h | 16 +++++++++++ stdlib.ispc | 25 +++++++++++++++-- tests/reduce-add-int16-1.ispc | 21 ++++++++++++++ tests/reduce-add-int16.ispc | 21 ++++++++++++++ tests/reduce-add-int8-1.ispc | 21 ++++++++++++++ tests/reduce-add-int8.ispc | 18 ++++++++++++ 24 files changed, 464 insertions(+), 44 deletions(-) create mode 100644 tests/reduce-add-int16-1.ispc create mode 100644 tests/reduce-add-int16.ispc create mode 100644 tests/reduce-add-int8-1.ispc create mode 100644 tests/reduce-add-int8.ispc diff --git a/builtins.cpp b/builtins.cpp index c4a2f3b5..08472623 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -501,6 +501,8 @@ lSetInternalFunctions(llvm::Module *module) { "__rdrand_i64", "__reduce_add_double", "__reduce_add_float", + "__reduce_add_int8", + "__reduce_add_int16", "__reduce_add_int32", "__reduce_add_int64", "__reduce_equal_double", diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index 8c6b7753..d9e0322b 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -271,6 +271,33 @@ reduce_equal(16) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal int32 ops +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline { + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 
+} + +define internal <16 x i16> @__add_varying_i16(<16 x i16>, + <16 x i16>) nounwind readnone alwaysinline { + %r = add <16 x i16> %0, %1 + ret <16 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline { + reduce16(i16, @__add_varying_i16, @__add_uniform_i16) +} + define <16 x i32> @__add_varying_int32(<16 x i32>, <16 x i32>) nounwind readnone alwaysinline { %s = add <16 x i32> %0, %1 diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index e6ab3a4b..90e2f3ac 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -217,7 +217,6 @@ define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline { ret float %sum } - define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline { reduce8(float, @__min_varying_float, @__min_uniform_float) } @@ -229,6 +228,42 @@ define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline { reduce_equal(8) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int8 ops + +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int16 ops + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal int32 ops @@ -257,20 +292,14 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__max_varying_int32, @__max_uniform_int32) } - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;; horizontal uint32 ops - define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32) } - define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32) } - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal double ops @@ -329,9 +358,6 @@ define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline { } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;; horizontal uint64 ops - define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline { reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) } diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 9b747e2e..3dec76b0 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -471,6 +471,15 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline { ret i64 %call } +define i8 
@__reduce_add_int8(<1 x i8> %v) nounwind readonly alwaysinline { + %r = extractelement <1 x i8> %v, i32 0 + ret i8 %r +} + +define i16 @__reduce_add_int16(<1 x i16> %v) nounwind readonly alwaysinline { + %r = extractelement <1 x i16> %v, i32 0 + ret i16 %r +} define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline { %r = extractelement <1 x float> %v, i32 0 diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index bbf1b842..76d1faf3 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -226,14 +226,16 @@ declare i1 @__any() nounwind readnone declare i1 @__all() nounwind readnone declare i1 @__none() nounwind readnone +declare i16 @__reduce_add_int8() nounwind readnone +declare i32 @__reduce_add_int16() nounwind readnone + declare float @__reduce_add_float() nounwind readnone declare float @__reduce_min_float() nounwind readnone declare float @__reduce_max_float() nounwind readnone -declare i32 @__reduce_add_int32() nounwind readnone +declare i64 @__reduce_add_int32() nounwind readnone declare i32 @__reduce_min_int32() nounwind readnone declare i32 @__reduce_max_int32() nounwind readnone - declare i32 @__reduce_min_uint32() nounwind readnone declare i32 @__reduce_max_uint32() nounwind readnone @@ -244,7 +246,6 @@ declare double @__reduce_max_double() nounwind readnone declare i64 @__reduce_add_int64() nounwind readnone declare i64 @__reduce_min_int64() nounwind readnone declare i64 @__reduce_max_int64() nounwind readnone - declare i64 @__reduce_min_uint64() nounwind readnone declare i64 @__reduce_max_uint64() nounwind readnone diff --git a/builtins/target-neon.ll b/builtins/target-neon.ll index e70b774b..fbeac352 100644 --- a/builtins/target-neon.ll +++ b/builtins/target-neon.ll @@ -509,15 +509,38 @@ define float @__reduce_max_float(<4 x float>) nounwind readnone { neon_reduce(float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) } -define internal i32 @add_i32(i32, i32) { - %r = add i32 %0, %1 +declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone + +define i16 @__reduce_add_int8() nounwind readnone { + %v8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <8 x i32> + %a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %v8) + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 + %r16 = trunc i32 %r to i16 + ret i16 %r16 +} + +declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone + +define i32 @__reduce_add_int16() nounwind readnone { + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %0) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 ret i32 %r } -declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone -define i32 @__reduce_add_int32() nounwind readnone { - neon_reduce(i32, @llvm.arm.neon.vpadd.v2i32, @add_i32) +define i64 @__reduce_add_int32() nounwind readnone { + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %0) + %a0 = extractelement <2 x i64> %a64, i32 0 + %a1 = extractelement <2 x i64> %a64, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r } declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone diff --git a/builtins/target-sse2-x2.ll 
b/builtins/target-sse2-x2.ll index 73361720..da22a66c 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -367,6 +367,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + define <4 x float> @__vec4_add_float(<4 x float> %v0, <4 x float> %v1) nounwind readnone alwaysinline { %v = fadd <4 x float> %v0, %v1 diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index 2bb06391..a6b206b6 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -267,6 +267,36 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <4 x i16> @__add_varying_i16(<4 x i16>, + <4 x i16>) nounwind readnone alwaysinline { + %r = add <4 x i16> %0, %1 + ret <4 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline { + reduce4(i16, @__add_varying_i16, @__add_uniform_i16) +} + define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline { %v1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index 2044fbee..d1563988 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -253,6 +253,36 @@ define i1 @__none(<8 x MASK>) nounwind readnone alwaysinline { ret i1 %meq } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline 
{ + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + define internal <8 x float> @__add_varying_float(<8 x float>, <8 x float>) { %r = fadd <8 x float> %0, %1 ret <8 x float> %r diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index cd8fdce2..85b7bbe7 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -261,6 +261,33 @@ define i1 @__none(<16 x i8>) nounwind readnone alwaysinline { ret i1 %meq } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline { + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <16 x i16> @__add_varying_i16(<16 x i16>, + <16 x i16>) nounwind readnone alwaysinline { + %r = add <16 x i16> %0, %1 + ret <16 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline { + reduce16(i16, @__add_varying_i16, @__add_uniform_i16) +} + define internal <16 x float> @__add_varying_float(<16 x float>, <16 x float>) { %r = fadd <16 x float> %0, %1 ret <16 x float> %r diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index ccae4d51..e2debbc2 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -309,6 +309,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline { reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float) } diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index f622b839..98a7ef69 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -299,6 +299,36 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 
+  ret i16 %r16
+}
+
+define internal <4 x i16> @__add_varying_i16(<4 x i16>,
+                                             <4 x i16>) nounwind readnone alwaysinline {
+  %r = add <4 x i16> %0, %1
+  ret <4 x i16> %r
+}
+
+define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
+  %r = add i16 %0, %1
+  ret i16 %r
+}
+
+define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
+  reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
+}
+
 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone

 define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
diff --git a/docs/ispc.rst b/docs/ispc.rst
index c6c63172..39d3a5c8 100755
--- a/docs/ispc.rst
+++ b/docs/ispc.rst
@@ -3711,29 +3711,44 @@ instances are added together by the ``reduce_add()`` function.

 ::

-    uniform float reduce_add(float x)
-    uniform int reduce_add(int x)
-    uniform unsigned int reduce_add(unsigned int x)
+    uniform int16 reduce_add(int8 x)
+    uniform unsigned int16 reduce_add(unsigned int8 x)
+    uniform int32 reduce_add(int16 x)
+    uniform unsigned int32 reduce_add(unsigned int16 x)
+    uniform int64 reduce_add(int32 x)
+    uniform unsigned int64 reduce_add(unsigned int32 x)
+    uniform int64 reduce_add(int64 x)
+    uniform unsigned int64 reduce_add(unsigned int64 x)

-You can also use functions to compute the minimum and maximum value of the
-given value across all of the currently-executing program instances.
+    uniform float reduce_add(float x)
+    uniform double reduce_add(double x)
+
+You can also use functions to compute the minimum value of the given value
+across all of the currently-executing program instances.

 ::

-    uniform float reduce_min(float a)
     uniform int32 reduce_min(int32 a)
     uniform unsigned int32 reduce_min(unsigned int32 a)
-    uniform double reduce_min(double a)
     uniform int64 reduce_min(int64 a)
     uniform unsigned int64 reduce_min(unsigned int64 a)
-    uniform float reduce_max(float a)
+    uniform float reduce_min(float a)
+    uniform double reduce_min(double a)
+
+Equivalent functions are available to compute the maximum of the given
+varying variable over the active program instances.
+
+::
+
     uniform int32 reduce_max(int32 a)
     uniform unsigned int32 reduce_max(unsigned int32 a)
-    uniform double reduce_max(double a)
     uniform int64 reduce_max(int64 a)
     uniform unsigned int64 reduce_max(unsigned int64 a)
+    uniform float reduce_max(float a)
+    uniform double reduce_max(double a)
+
 Finally, you can check to see if a particular value has the same value in
 all of the currently-running program instances:

@@ -3741,9 +3756,10 @@ all of the currently-running program instances:

     uniform bool reduce_equal(int32 v)
     uniform bool reduce_equal(unsigned int32 v)
-    uniform bool reduce_equal(float v)
     uniform bool reduce_equal(int64 v)
     uniform bool reduce_equal(unsigned int64 v)
+
+    uniform bool reduce_equal(float v)
     uniform bool reduce_equal(double)

 There are also variants of these functions that return the value as a
@@ -3758,10 +3774,11 @@ performance in the `Performance Guide`_.
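For illustration, here is a small usage sketch of the widened ``reduce_add()`` return types documented above (an assumed example, not text from the patch; the kernel and its name are hypothetical):

::

    // Hypothetical kernel: sum byte-sized samples across the gang.
    // reduce_add(int8) returns a uniform int16 partial sum, so the
    // per-gang reduction cannot overflow the 8-bit element type.
    export uniform int64 sum_bytes(uniform int8 vals[], uniform int count) {
        uniform int64 total = 0;
        foreach (i = 0 ... count) {
            int8 v = vals[i];
            total += reduce_add(v);   // uniform int16, accumulated into int64
        }
        return total;
    }

Lanes that are masked off (for example, in a partial final ``foreach`` iteration) contribute zero to the sum, as the stdlib implementations added in this patch arrange explicitly.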
uniform bool reduce_equal(int32 v, uniform int32 * uniform sameval) uniform bool reduce_equal(unsigned int32 v, uniform unsigned int32 * uniform sameval) - uniform bool reduce_equal(float v, uniform float * uniform sameval) uniform bool reduce_equal(int64 v, uniform int64 * uniform sameval) uniform bool reduce_equal(unsigned int64 v, uniform unsigned int64 * uniform sameval) + + uniform bool reduce_equal(float v, uniform float * uniform sameval) uniform bool reduce_equal(double, uniform double * uniform sameval) If called when none of the program instances are running, diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index 828c1ab4..6d4fe1f4 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1162,19 +1162,20 @@ REDUCE_ADD(double, __vec16_d, __reduce_add_double) REDUCE_MINMAX(double, __vec16_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec16_d, __reduce_max_double, >) -REDUCE_ADD(uint32_t, __vec16_i32, __reduce_add_int32) +REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec16_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec16_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec16_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >) diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 64b82cb1..12c4f84e 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -1231,19 +1231,20 @@ REDUCE_ADD(double, __vec32_d, __reduce_add_double) REDUCE_MINMAX(double, __vec32_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec32_d, __reduce_max_double, >) -REDUCE_ADD(uint32_t, __vec32_i32, __reduce_add_int32) +REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec32_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec32_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec32_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec32_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, __vec32_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec32_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec32_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec32_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec32_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec32_i64, __reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec32_i64, __reduce_max_uint64, >) diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index 7869faa5..a3648f42 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -1364,19 +1364,20 @@ REDUCE_ADD(double, __vec64_d, __reduce_add_double) REDUCE_MINMAX(double, __vec64_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec64_d, __reduce_max_double, >) 
-REDUCE_ADD(uint32_t, __vec64_i32, __reduce_add_int32) +REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec64_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec64_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec64_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec64_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec64_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec64_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, __vec64_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec64_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec64_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec64_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec64_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec64_i64, __reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec64_i64, __reduce_max_uint64, >) diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index bf383c88..41c4cbc0 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1511,6 +1511,22 @@ static FORCEINLINE int64_t __count_trailing_zeros_i64(const __vec1_i64 mask) { // reductions /////////////////////////////////////////////////////////////////////////// +static FORCEINLINE int16_t __reduce_add_i8(__vec16_i8 v) { + // TODO: improve this! + int16_t ret = 0; + for (int i = 0; i < 16; ++i) + ret += v.v[i]; + return ret; +} + +static FORCEINLINE int32_t __reduce_add_i16(__vec16_i16 v) { + // TODO: improve this! + int32_t ret = 0; + for (int i = 0; i < 16; ++i) + ret += v.v[i]; + return ret; +} + static FORCEINLINE uint32_t __reduce_add_i32(__vec16_i32 v) { return _mm512_reduce_add_epi32(v); } diff --git a/examples/intrinsics/knc2x.h b/examples/intrinsics/knc2x.h index 0041a6c9..5b6e5295 100644 --- a/examples/intrinsics/knc2x.h +++ b/examples/intrinsics/knc2x.h @@ -1607,6 +1607,9 @@ static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { /////////////////////////////////////////////////////////////////////////// // reductions +REDUCE_ADD(int16_t, __vec32_i8, __reduce_add_int8) +REDUCE_ADD(int32_t, __vec32_i16, __reduce_add_int16) + static FORCEINLINE float __reduce_add_float(__vec32_f v) { return _mm512_reduce_add_ps(v.v1) + _mm512_reduce_add_ps(v.v2); } diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index d4739d61..30f90b31 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -2528,6 +2528,22 @@ static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { /////////////////////////////////////////////////////////////////////////// // reductions +static FORCEINLINE int16_t __reduce_add_int8(__vec4_i8 v) { + // TODO: improve + int16_t ret = 0; + for (int i = 0; i < 4; ++i) + ret += v.v[i]; + return ret; +} + +static FORCEINLINE int32_t __reduce_add_int16(__vec4_i16 v) { + // TODO: improve + int32_t ret = 0; + for (int i = 0; i < 4; ++i) + ret += v.v[i]; + return ret; +} + static FORCEINLINE float __reduce_add_float(__vec4_f v) { float r = bits_as_float(_mm_extract_ps(v.v, 0)); r += bits_as_float(_mm_extract_ps(v.v, 1)); diff --git a/stdlib.ispc b/stdlib.ispc index 7e848481..c9c66252 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -887,13 +887,32 @@ static inline uniform double select(uniform bool c, uniform double a, /////////////////////////////////////////////////////////////////////////// // Horizontal ops / reductions +__declspec(safe) +static inline uniform int16 reduce_add(int8 x) { + return __reduce_add_int8(__mask ? 
x : (int8)0); +} + +__declspec(safe) +static inline uniform unsigned int16 reduce_add(unsigned int8 x) { + return __reduce_add_int8(__mask ? x : (int8)0); +} + +__declspec(safe) +static inline uniform int32 reduce_add(int16 x) { + return __reduce_add_int16(__mask ? x : (int16)0); +} + +__declspec(safe) +static inline uniform unsigned int32 reduce_add(unsigned int16 x) { + return __reduce_add_int16(__mask ? x : (int16)0); +} + __declspec(safe) static inline uniform float reduce_add(float x) { // zero the lanes where the mask is off return __reduce_add_float(__mask ? x : 0.); } - __declspec(safe) static inline uniform float reduce_min(float v) { // For the lanes where the mask is off, replace the given value with @@ -915,7 +934,7 @@ static inline uniform float reduce_max(float v) { } __declspec(safe) -static inline uniform int reduce_add(int x) { +static inline uniform int64 reduce_add(int32 x) { // Zero out the values for lanes that aren't running return __reduce_add_int32(__mask ? x : 0); } @@ -937,7 +956,7 @@ static inline uniform int reduce_max(int v) { } __declspec(safe) -static inline uniform unsigned int reduce_add(unsigned int x) { +static inline uniform unsigned int64 reduce_add(unsigned int32 x) { // Set values for non-running lanes to zero so they don't affect the // result. return __reduce_add_int32(__mask ? x : 0); diff --git a/tests/reduce-add-int16-1.ispc b/tests/reduce-add-int16-1.ispc new file mode 100644 index 00000000..58529ca1 --- /dev/null +++ b/tests/reduce-add-int16-1.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + float v = aFOO[programIndex]; + uniform float m; + int16 iv = (int)v; + if (iv & 1) + m = reduce_add(iv); + RET[programIndex] = m; +} + +export void result(uniform float RET[]) { + uniform int x = 0; + for (uniform int i = 1; i <= programCount; i += 2) + x += i; + RET[programIndex] = x; +} + diff --git a/tests/reduce-add-int16.ispc b/tests/reduce-add-int16.ispc new file mode 100644 index 00000000..8657b201 --- /dev/null +++ b/tests/reduce-add-int16.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + float v = aFOO[programIndex]; + uniform float m; + int16 iv = (int)v; +/*CO if (iv & 1)*/ + m = reduce_add(iv); + RET[programIndex] = m; +} + +export void result(uniform float RET[]) { + uniform int x = 0; + for (uniform int i = 1; i <= programCount; ++i) + x += i; + RET[programIndex] = x; +} + diff --git a/tests/reduce-add-int8-1.ispc b/tests/reduce-add-int8-1.ispc new file mode 100644 index 00000000..e5310aae --- /dev/null +++ b/tests/reduce-add-int8-1.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + float v = aFOO[programIndex]; + uniform float m; + int8 iv = (int)v; + if (iv & 1) + m = reduce_add(iv); + RET[programIndex] = m; +} + +export void result(uniform float RET[]) { + uniform int x = 0; + for (uniform int i = 1; i <= programCount; i += 2) + x += i; + RET[programIndex] = x; +} + diff --git a/tests/reduce-add-int8.ispc b/tests/reduce-add-int8.ispc new file mode 100644 index 00000000..7e0dd027 --- /dev/null +++ b/tests/reduce-add-int8.ispc @@ -0,0 +1,18 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int8 db = b-4; + int8 
iv = programIndex + db; + int m = reduce_add(iv); + RET[programIndex] = m; +} + +export void result(uniform float RET[]) { + uniform int x = 0; + for (uniform int i = 1; i <= programCount; ++i) + x += i; + RET[programIndex] = x; +} + From fa93cb7d0ba3bcd587ca5dd6bfaa0a6f083cb2b7 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Mon, 29 Jul 2013 22:46:36 -0700 Subject: [PATCH 014/124] InterlockedAdd -> InterlockedExchangeAdd for better portability (InterlockedAdd is not always supported) --- examples/tasksys.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tasksys.cpp b/examples/tasksys.cpp index b4ced5c7..c9c2fa7b 100644 --- a/examples/tasksys.cpp +++ b/examples/tasksys.cpp @@ -365,7 +365,7 @@ lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) static inline int32_t lAtomicAdd(volatile int32_t *v, int32_t delta) { #ifdef ISPC_IS_WINDOWS - return InterlockedAdd((volatile LONG *)v, delta); + return InterlockedExchangeAdd((volatile LONG *)v, delta)+delta; #else return __sync_fetch_and_add(v, delta); #endif From ab3b633733ec05f3778e46f792a98844e9ee5900 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Mon, 29 Jul 2013 16:14:58 -0700 Subject: [PATCH 015/124] Add 8-bit and 16-bit specialized NEON targets. Like SSE4-8 and SSE4-16, these use 8-bit and 16-bit values for mask elements, respectively, and thus should generate the best code when used for computation with datatypes of those sizes. --- Makefile | 6 +- builtins.cpp | 28 +- builtins/target-neon-16.ll | 458 ++++++++++++++++ .../{target-neon.ll => target-neon-32.ll} | 305 +---------- builtins/target-neon-8.ll | 508 ++++++++++++++++++ builtins/target-neon-common.ll | 351 ++++++++++++ builtins/util.m4 | 120 ++++- ispc.cpp | 41 +- ispc.h | 3 +- ispc.vcxproj | 111 ++-- module.cpp | 1 + run_tests.py | 2 +- 12 files changed, 1561 insertions(+), 373 deletions(-) create mode 100644 builtins/target-neon-16.ll rename builtins/{target-neon.ll => target-neon-32.ll} (62%) create mode 100644 builtins/target-neon-8.ll create mode 100644 builtins/target-neon-common.ll diff --git a/Makefile b/Makefile index fc064dbd..98729bfc 100644 --- a/Makefile +++ b/Makefile @@ -122,8 +122,10 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ type.cpp util.cpp HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h -TARGETS=neon avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \ - sse4-8 sse4-16 generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 +TARGETS=neon-32 neon-16 neon-8 \ + avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ + sse2 sse2-x2 sse4 sse4-x2 sse4-8 sse4-16 \ + generic-1 generic-4 generic-8 generic-16 generic-32 generic-64 # These files need to be compiled in two versions - 32 and 64 bits. BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) # These are files to be compiled in single version. diff --git a/builtins.cpp b/builtins.cpp index 08472623..e671a491 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -657,7 +657,9 @@ AddBitcodeToModule(const unsigned char *bitcode, int length, // the values for an ARM target. This maybe won't cause problems // in the generated code, since bulitins.c doesn't do anything too // complex w.r.t. struct layouts, etc. 
- if (g->target->getISA() != Target::NEON) + if (g->target->getISA() != Target::NEON32 && + g->target->getISA() != Target::NEON16 && + g->target->getISA() != Target::NEON8) #endif // !__arm__ { Assert(bcTriple.getArch() == llvm::Triple::UnknownArch || @@ -820,12 +822,30 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // Next, add the target's custom implementations of the various needed // builtin functions (e.g. __masked_store_32(), etc). switch (g->target->getISA()) { - case Target::NEON: { + case Target::NEON8: { if (runtime32) { - EXPORT_MODULE(builtins_bitcode_neon_32bit); + EXPORT_MODULE(builtins_bitcode_neon_8_32bit); } else { - EXPORT_MODULE(builtins_bitcode_neon_64bit); + EXPORT_MODULE(builtins_bitcode_neon_8_64bit); + } + break; + } + case Target::NEON16: { + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_neon_16_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_neon_16_64bit); + } + break; + } + case Target::NEON32: { + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_neon_32_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_neon_32_64bit); } break; } diff --git a/builtins/target-neon-16.ll b/builtins/target-neon-16.ll new file mode 100644 index 00000000..fd15eb0b --- /dev/null +++ b/builtins/target-neon-16.ll @@ -0,0 +1,458 @@ +;; +;; target-neon-16.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
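The defines that follow make this an 8-wide target whose mask elements are 16 bits, matching the data size the target is specialized for. As a rough illustration of the intended use (an assumed example, not part of the patch; the kernel name and the use of the standard library's clamp() are mine), the neon-16 variant is aimed at kernels that work mostly on int16 data:

    // Hypothetical int16 kernel: with WIDTH = 8 and MASK = i16, the per-lane
    // execution mask has the same width as the values being processed.
    export void clamp_add16(uniform int16 a[], uniform int16 b[],
                            uniform int16 out[], uniform int count) {
        foreach (i = 0 ... count) {
            int32 s = (int32)a[i] + (int32)b[i];       // widen to avoid overflow
            out[i] = (int16)clamp(s, -32768, 32767);   // saturate back to int16
        }
    }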
+ +define(`WIDTH',`8') +define(`MASK',`i16') + +include(`util.m4') +include(`target-neon-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone { + unary4to8conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v) + ret <8 x float> %r +} + +define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone { + unary4to8conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v) + ret <8 x i16> %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast <8 x float> %0 to <8 x i32> + %bitop.i.i = and <8 x i32> %float_to_int_bitcast.i.i.i.i, + + %bitop.i = xor <8 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i40.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %int_to_float_bitcast.i.i40.i, + + %binop21.i = fadd <8 x float> %binop.i, + + %float_to_int_bitcast.i.i.i = bitcast <8 x float> %binop21.i to <8 x i32> + %bitop31.i = xor <8 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop31.i to <8 x float> + ret <8 x float> %int_to_float_bitcast.i.i.i +} + +define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind + %bincmp.i = fcmp ogt <8 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32> + %bitop.i = and <8 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <8 x float> %binop.i +} + +define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind + %bincmp.i = fcmp olt <8 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32> + %bitop.i = and <8 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <8 x float> %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare @__round_varying_double() nounwind readnone +declare @__floor_varying_double() nounwind readnone +declare @__ceil_varying_double() nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__max_varying_float(, + ) nounwind readnone { + binary4to8(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1) + ret %r +} + +define @__min_varying_float(, + ) nounwind readnone { + binary4to8(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1) + ret %r +} + +declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 
x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +define @__min_varying_int32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1) + ret %r +} + +define @__max_varying_int32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1) + ret %r +} + +define @__min_varying_uint32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1) + ret %r +} + +define @__max_varying_uint32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1) + ret %r +} + +;; sqrt/rsqrt/rcp + +declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rcp_varying_float( %d) nounwind readnone { + unary4to8(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d) + binary4to8(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0) + %x1 = fmul %x0, %x0_nr + binary4to8(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %d) nounwind readnone { + unary4to8(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d) + %x0_2 = fmul %x0, %x0 + binary4to8(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2) + %x1 = fmul %x0, %x0_nr + %x1_2 = fmul %x1, %x1 + binary4to8(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +define float @__rsqrt_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <8 x i32> + %vr = call <8 x float> @__rsqrt_varying_float(<8 x float> %vs) + %r = extractelement <8 x float> %vr, i32 0 + ret float %r +} + +define float @__rcp_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <8 x i32> + %vr = call <8 x float> @__rcp_varying_float(<8 x float> %vs) + %r = extractelement <8 x float> %vr, i32 0 + ret float %r +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + +define @__sqrt_varying_float() nounwind readnone { + unary4to8(result, float, @llvm.sqrt.v4f32, %0) +;; this returns nan for v=0, which is undesirable.. 
+;; %rsqrt = call @__rsqrt_varying_float( %0) +;; %result = fmul <4 x float> %rsqrt, %0 + ret <8 x float> %result +} + +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) + +define @__sqrt_varying_double() nounwind readnone { + unary4to8(r, double, @llvm.sqrt.v4f64, %0) + ret %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reductions + +define i64 @__movmsk() nounwind readnone { + %and_mask = and %0, + + %v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %and_mask) + %v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4) + %va = extractelement <2 x i64> %v2, i32 0 + %vb = extractelement <2 x i64> %v2, i32 1 + %v = or i64 %va, %vb + ret i64 %v +} + +define i1 @__any() nounwind readnone alwaysinline { + v8tov4(MASK, %0, %v0123, %v4567) + %vor = or <4 x MASK> %v0123, %v4567 + %v0 = extractelement <4 x MASK> %vor, i32 0 + %v1 = extractelement <4 x MASK> %vor, i32 1 + %v2 = extractelement <4 x MASK> %vor, i32 2 + %v3 = extractelement <4 x MASK> %vor, i32 3 + %v01 = or MASK %v0, %v1 + %v23 = or MASK %v2, %v3 + %v = or MASK %v01, %v23 + %cmp = icmp ne MASK %v, 0 + ret i1 %cmp +} + +define i1 @__all() nounwind readnone alwaysinline { + v8tov4(MASK, %0, %v0123, %v4567) + %vand = and <4 x MASK> %v0123, %v4567 + %v0 = extractelement <4 x MASK> %vand, i32 0 + %v1 = extractelement <4 x MASK> %vand, i32 1 + %v2 = extractelement <4 x MASK> %vand, i32 2 + %v3 = extractelement <4 x MASK> %vand, i32 3 + %v01 = and MASK %v0, %v1 + %v23 = and MASK %v2, %v3 + %v = and MASK %v01, %v23 + %cmp = icmp ne MASK %v, 0 + ret i1 %cmp +} + +define i1 @__none() nounwind readnone alwaysinline { + %any = call i1 @__any( %0) + %none = icmp eq i1 %any, 0 + ret i1 %none +} + +;; $1: scalar type +;; $2: vector/vector reduce function (2 x -> ) +;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>) +;; $4: scalar reduce function + +define(`neon_reduce', ` + v8tov4($1, %0, %v0123, %v4567) + %v0123_8 = shufflevector <4 x $1> %v0123, <4 x $1> undef, + <8 x i32> + %v4567_8 = shufflevector <4 x $1> %v4567, <4 x $1> undef, + <8 x i32> + %vfirst = call <8 x $1> $2(<8 x $1> %v0123_8, <8 x $1> %v4567_8) + %vfirst_4 = shufflevector <8 x $1> %vfirst, <8 x $1> undef, + <4 x i32> + v4tov2($1, %vfirst_4, %v0, %v1) + %vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1) + %vh0 = extractelement <2 x $1> %vh, i32 0 + %vh1 = extractelement <2 x $1> %vh, i32 1 + %r = call $1 $4($1 %vh0, $1 %vh1) + ret $1 %r +') + +declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @add_f32(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define internal @__add_varying_float(, ) { + %r = fadd %0, %1 + ret %r +} + +define float @__reduce_add_float() nounwind readnone { + neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @min_f32(float, float) { + %cmp = fcmp olt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_min_float() nounwind readnone { + neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @max_f32(float, float) { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_max_float() nounwind 
readnone { + neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) +} + +declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone + +define i16 @__reduce_add_int8() nounwind readnone { + %a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %0) + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 + %r16 = trunc i32 %r to i16 + ret i16 %r16 +} + +declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16() + +define i64 @__reduce_add_int16() nounwind readnone { + %a1 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16( %0) + %a2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a1) + %aa = extractelement <2 x i64> %a2, i32 0 + %ab = extractelement <2 x i64> %a2, i32 1 + %r = add i64 %aa, %ab + ret i64 %r +} + +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone + +define i64 @__reduce_add_int32() nounwind readnone { + v8tov4(i32, %0, %va, %vb) + %pa = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va) + %pb = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb) + %psum = add <2 x i64> %pa, %pb + %a0 = extractelement <2 x i64> %psum, i32 0 + %a1 = extractelement <2 x i64> %psum, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_si32(i32, i32) { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_int32() nounwind readnone { + neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_si32(i32, i32) { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_int32() nounwind readnone { + neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_ui32(i32, i32) { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_uint32() nounwind readnone { + neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpmins.v2i32, @min_ui32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_ui32(i32, i32) { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_uint32() nounwind readnone { + neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxs.v2i32, @max_ui32) +} + +define double @__reduce_add_double() nounwind readnone { + v8tov2(double, %0, %v0, %v1, %v2, %v3) + %v01 = fadd <2 x double> %v0, %v1 + %v23 = fadd <2 x double> %v2, %v3 + %sum = fadd <2 x double> %v01, %v23 + %e0 = extractelement <2 x double> %sum, i32 0 + %e1 = extractelement <2 x double> %sum, i32 1 + %m = fadd double %e0, %e1 + ret double %m +} + +define double @__reduce_min_double() nounwind readnone { + reduce8(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double() nounwind readnone { + reduce8(double, @__max_varying_double, @__max_uniform_double) +} + +define i64 
@__reduce_add_int64() nounwind readnone { + v8tov2(i64, %0, %v0, %v1, %v2, %v3) + %v01 = add <2 x i64> %v0, %v1 + %v23 = add <2 x i64> %v2, %v3 + %sum = add <2 x i64> %v01, %v23 + %e0 = extractelement <2 x i64> %sum, i32 0 + %e1 = extractelement <2 x i64> %sum, i32 1 + %m = add i64 %e0, %e1 + ret i64 %m +} + +define i64 @__reduce_min_int64() nounwind readnone { + reduce8(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64() nounwind readnone { + reduce8(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64() nounwind readnone { + reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64() nounwind readnone { + reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) +} diff --git a/builtins/target-neon.ll b/builtins/target-neon-32.ll similarity index 62% rename from builtins/target-neon.ll rename to builtins/target-neon-32.ll index fbeac352..1f8003d7 100644 --- a/builtins/target-neon.ll +++ b/builtins/target-neon-32.ll @@ -1,5 +1,5 @@ ;; -;; target-neon.ll +;; target-neon-32.ll ;; ;; Copyright(c) 2012-2013 Matt Pharr ;; Copyright(c) 2013 Google, Inc. @@ -34,52 +34,20 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -target datalayout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-a0:0:64-n32" - define(`WIDTH',`4') - define(`MASK',`i32') include(`util.m4') - -stdlib_core() -scans() -reduce_equal(WIDTH) -rdrand_decls() -define_shuffles() -aossoa() -ctlztz() +include(`target-neon-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines -declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone -declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone - -define float @__half_to_float_uniform(i16 %v) nounwind readnone { - %v1 = bitcast i16 %v to <1 x i16> - %vec = shufflevector <1 x i16> %v1, <1 x i16> undef, - <4 x i32> - %h = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vec) - %r = extractelement <4 x float> %h, i32 0 - ret float %r -} - define <4 x float> @__half_to_float_varying(<4 x i16> %v) nounwind readnone { %r = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %v) ret <4 x float> %r } -define i16 @__float_to_half_uniform(float %v) nounwind readnone { - %v1 = bitcast float %v to <1 x float> - %vec = shufflevector <1 x float> %v1, <1 x float> undef, - <4 x i32> - %h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %vec) - %r = extractelement <4 x i16> %h, i32 0 - ret i16 %r -} - - define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone { %r = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %v) ret <4 x i16> %r @@ -88,48 +56,11 @@ define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; math -define void @__fastmath() nounwind { - ret void -} - ;; round/floor/ceil ;; FIXME: grabbed these from the sse2 target, which does not have native ;; instructions for these. Is there a better approach for NEON? 
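A note on the magic constants used by the round/floor/ceil routines in these NEON targets (both the varying versions that remain in this file and the uniform versions removed just below): since no native rounding instruction is used, the code adds and then subtracts 2^23 (8.388608e+06) so that, under round-to-nearest, the float addition itself discards the fraction bits, while the and/xor with 0x80000000 (-2147483648) strips and later restores the sign bit so the same constant also works for negative inputs. A minimal sketch of the core trick (an assumed illustration, not part of the patch; it ignores the sign handling and requires x in [0, 2^23)):

    // Round to nearest by letting float addition drop the fraction bits.
    // Example: (3.7 + 8388608.0) rounds to 8388612.0, so subtracting
    // 8388608.0 leaves 4.0. Valid only for 0 <= x < 2^23.
    static inline float round_nearest(float x) {
        const uniform float big = 8388608.;   // 2^23
        return (x + big) - big;
    }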
-define float @__round_uniform_float(float) nounwind readonly alwaysinline { - %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32 - %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648 - %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i - %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06 - %binop21.i = fadd float %binop.i, -8.388608e+06 - %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32 - %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float - ret float %int_to_float_bitcast.i.i.i -} - -define float @__floor_uniform_float(float) nounwind readonly alwaysinline { - %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind - %bincmp.i = fcmp ogt float %calltmp.i, %0 - %selectexpr.i = sext i1 %bincmp.i to i32 - %bitop.i = and i32 %selectexpr.i, -1082130432 - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i - ret float %binop.i -} - -define float @__ceil_uniform_float(float) nounwind readonly alwaysinline { - %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind - %bincmp.i = fcmp olt float %calltmp.i, %0 - %selectexpr.i = sext i1 %bincmp.i to i32 - %bitop.i = and i32 %selectexpr.i, 1065353216 - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i - ret float %binop.i -} - define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline { %float_to_int_bitcast.i.i.i.i = bitcast <4 x float> %0 to <4 x i32> %bitop.i.i = and <4 x i32> %float_to_int_bitcast.i.i.i.i, @@ -164,10 +95,6 @@ define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysin } ;; FIXME: rounding doubles and double vectors needs to be implemented -declare double @__round_uniform_double(double) nounwind readnone -declare double @__floor_uniform_double(double) nounwind readnone -declare double @__ceil_uniform_double(double) nounwind readnone - declare @__round_varying_double() nounwind readnone declare @__floor_varying_double() nounwind readnone declare @__ceil_varying_double() nounwind readnone @@ -175,78 +102,6 @@ declare @__ceil_varying_double() nounwind readn ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; min/max -define float @__max_uniform_float(float, float) nounwind readnone { - %cmp = fcmp ugt float %0, %1 - %r = select i1 %cmp, float %0, float %1 - ret float %r -} - -define float @__min_uniform_float(float, float) nounwind readnone { - %cmp = fcmp ult float %0, %1 - %r = select i1 %cmp, float %0, float %1 - ret float %r -} - -define i32 @__min_uniform_int32(i32, i32) nounwind readnone { - %cmp = icmp slt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__max_uniform_int32(i32, i32) nounwind readnone { - %cmp = icmp sgt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__min_uniform_uint32(i32, i32) nounwind readnone { - %cmp = icmp ult i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__max_uniform_uint32(i32, i32) nounwind readnone { - %cmp = icmp ugt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i64 @__min_uniform_int64(i64, i64) nounwind readnone { - %cmp = icmp slt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__max_uniform_int64(i64, 
i64) nounwind readnone { - %cmp = icmp sgt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__min_uniform_uint64(i64, i64) nounwind readnone { - %cmp = icmp ult i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__max_uniform_uint64(i64, i64) nounwind readnone { - %cmp = icmp ugt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define double @__min_uniform_double(double, double) nounwind readnone { - %cmp = fcmp olt double %0, %1 - %r = select i1 %cmp, double %0, double %1 - ret double %r -} - -define double @__max_uniform_double(double, double) nounwind readnone { - %cmp = fcmp ogt double %0, %1 - %r = select i1 %cmp, double %0, double %1 - ret double %r -} - declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone @@ -287,44 +142,6 @@ define @__max_varying_uint32(, ) nounwin ret <4 x i32> %r } -define @__min_varying_int64(, ) nounwind readnone { - %m = icmp slt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_int64(, ) nounwind readnone { - %m = icmp sgt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__min_varying_uint64(, ) nounwind readnone { - %m = icmp ult %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_uint64(, ) nounwind readnone { - %m = icmp ugt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__min_varying_double(, - ) nounwind readnone { - %m = fcmp olt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_double(, - ) nounwind readnone { - %m = fcmp ogt %0, %1 - %r = select %m, %0, %1 - ret %r -} - ;; sqrt/rsqrt/rcp declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone @@ -371,13 +188,6 @@ define float @__rcp_uniform_float(float) nounwind readnone { ret float %r } -declare float @llvm.sqrt.f32(float) - -define float @__sqrt_uniform_float(float) nounwind readnone { - %r = call float @llvm.sqrt.f32(float %0) - ret float %r -} - declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) define @__sqrt_varying_float() nounwind readnone { @@ -388,13 +198,6 @@ define @__sqrt_varying_float() nounwind readnone ret <4 x float> %result } -declare double @llvm.sqrt.f64(double) - -define double @__sqrt_uniform_double(double) nounwind readnone { - %r = call double @llvm.sqrt.f64(double %0) - ret double %r -} - declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) define @__sqrt_varying_double() nounwind readnone { @@ -402,21 +205,6 @@ define @__sqrt_varying_double() nounwind readno ret <4 x double> %r } -;; bit ops - -declare i32 @llvm.ctpop.i32(i32) nounwind readnone -declare i64 @llvm.ctpop.i64(i64) nounwind readnone - -define i32 @__popcnt_int32(i32) nounwind readnone { - %v = call i32 @llvm.ctpop.i32(i32 %0) - ret i32 %v -} - -define i64 @__popcnt_int64(i64) nounwind readnone { - %v = call i64 @llvm.ctpop.i64(i64 %0) - ret i64 %v -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions @@ -638,92 +426,3 @@ define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone { define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone { reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64) } - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; unaligned loads/loads+broadcasts - -masked_load(i8, 1) -masked_load(i16, 2) -masked_load(i32, 4) -masked_load(float, 4) -masked_load(i64, 8) -masked_load(double, 8) - -gen_masked_store(i8) 
-gen_masked_store(i16) -gen_masked_store(i32) -gen_masked_store(i64) -masked_store_float_double() - -define void @__masked_store_blend_i8(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i8> %new, <4 x i8> %old - store %result, * %ptr - ret void -} - -define void @__masked_store_blend_i16(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i16> %new, <4 x i16> %old - store %result, * %ptr - ret void -} - -define void @__masked_store_blend_i32(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i32> %new, <4 x i32> %old - store %result, * %ptr - ret void -} - -define void @__masked_store_blend_i64(* nocapture %ptr, - %new, %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i64> %new, <4 x i64> %old - store %result, * %ptr - ret void -} - -;; yuck. We need declarations of these, even though we shouldnt ever -;; actually generate calls to them for the NEON target... - -declare @__svml_sin() -declare @__svml_cos() -declare void @__svml_sincos(, *, *) -declare @__svml_tan() -declare @__svml_atan() -declare @__svml_atan2(, ) -declare @__svml_exp() -declare @__svml_log() -declare @__svml_pow(, ) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; gather - -gen_gather_factored(i8) -gen_gather_factored(i16) -gen_gather_factored(i32) -gen_gather_factored(float) -gen_gather_factored(i64) -gen_gather_factored(double) - -gen_scatter(i8) -gen_scatter(i16) -gen_scatter(i32) -gen_scatter(float) -gen_scatter(i64) -gen_scatter(double) - -packed_load_and_store(4) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; prefetch - -define_prefetches() diff --git a/builtins/target-neon-8.ll b/builtins/target-neon-8.ll new file mode 100644 index 00000000..eb65f224 --- /dev/null +++ b/builtins/target-neon-8.ll @@ -0,0 +1,508 @@ +;; +;; target-neon-8.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`16') +define(`MASK',`i8') + +include(`util.m4') +include(`target-neon-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone { + unary4to16conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v) + ret <16 x float> %r +} + +define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone { + unary4to16conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v) + ret <16 x i16> %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast <16 x float> %0 to <16 x i32> + %bitop.i.i = and <16 x i32> %float_to_int_bitcast.i.i.i.i, + + %bitop.i = xor <16 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i40.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %int_to_float_bitcast.i.i40.i, + + %binop21.i = fadd <16 x float> %binop.i, + + %float_to_int_bitcast.i.i.i = bitcast <16 x float> %binop21.i to <16 x i32> + %bitop31.i = xor <16 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop31.i to <16 x float> + ret <16 x float> %int_to_float_bitcast.i.i.i +} + +define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind + %bincmp.i = fcmp ogt <16 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32> + %bitop.i = and <16 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <16 x float> %binop.i +} + +define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind + %bincmp.i = fcmp olt <16 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32> + %bitop.i = and <16 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <16 x float> %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare @__round_varying_double() nounwind readnone +declare @__floor_varying_double() nounwind readnone +declare @__ceil_varying_double() nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind 
readnone +declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__max_varying_float(, + ) nounwind readnone { + binary4to16(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1) + ret %r +} + +define @__min_varying_float(, + ) nounwind readnone { + binary4to16(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1) + ret %r +} + +declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +define @__min_varying_int32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1) + ret %r +} + +define @__max_varying_int32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1) + ret %r +} + +define @__min_varying_uint32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1) + ret %r +} + +define @__max_varying_uint32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1) + ret %r +} + +;; sqrt/rsqrt/rcp + +declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rcp_varying_float( %d) nounwind readnone { + unary4to16(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d) + binary4to16(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0) + %x1 = fmul %x0, %x0_nr + binary4to16(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %d) nounwind readnone { + unary4to16(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d) + %x0_2 = fmul %x0, %x0 + binary4to16(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2) + %x1 = fmul %x0, %x0_nr + %x1_2 = fmul %x1, %x1 + binary4to16(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +define float @__rsqrt_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <16 x i32> + %vr = call <16 x float> @__rsqrt_varying_float(<16 x float> %vs) + %r = extractelement <16 x float> %vr, i32 0 + ret float %r +} + +define float @__rcp_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <16 x i32> + %vr = call <16 x float> @__rcp_varying_float(<16 x float> %vs) + %r = extractelement <16 x float> %vr, i32 0 + ret float %r +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + +define @__sqrt_varying_float() nounwind readnone { + unary4to16(result, float, @llvm.sqrt.v4f32, %0) +;; this returns nan for v=0, which is undesirable.. 
+;; %rsqrt = call @__rsqrt_varying_float( %0) +;; %result = fmul <4 x float> %rsqrt, %0 + ret <16 x float> %result +} + +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) + +define @__sqrt_varying_double() nounwind readnone { + unary4to16(r, double, @llvm.sqrt.v4f64, %0) + ret %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reductions + +define i64 @__movmsk() nounwind readnone { + %and_mask = and %0, + + %v8 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %and_mask) + %v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %v8) + %v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4) + %va = extractelement <2 x i64> %v2, i32 0 + %vb = extractelement <2 x i64> %v2, i32 1 + %vbshift = shl i64 %vb, 8 + %v = or i64 %va, %vbshift + ret i64 %v +} + +define i1 @__any() nounwind readnone alwaysinline { + v16tov8(MASK, %0, %v8a, %v8b) + %vor8 = or <8 x MASK> %v8a, %v8b + %v16 = sext <8 x i8> %vor8 to <8 x i16> + v8tov4(i16, %v16, %v16a, %v16b) + %vor16 = or <4 x i16> %v16a, %v16b + %v32 = sext <4 x i16> %vor16 to <4 x i32> + v4tov2(i32, %v32, %v32a, %v32b) + %vor32 = or <2 x i32> %v32a, %v32b + %v0 = extractelement <2 x i32> %vor32, i32 0 + %v1 = extractelement <2 x i32> %vor32, i32 1 + %v = or i32 %v0, %v1 + %cmp = icmp ne i32 %v, 0 + ret i1 %cmp +} + +define i1 @__all() nounwind readnone alwaysinline { + v16tov8(MASK, %0, %v8a, %v8b) + %vand8 = and <8 x MASK> %v8a, %v8b + %v16 = sext <8 x i8> %vand8 to <8 x i16> + v8tov4(i16, %v16, %v16a, %v16b) + %vand16 = and <4 x i16> %v16a, %v16b + %v32 = sext <4 x i16> %vand16 to <4 x i32> + v4tov2(i32, %v32, %v32a, %v32b) + %vand32 = and <2 x i32> %v32a, %v32b + %v0 = extractelement <2 x i32> %vand32, i32 0 + %v1 = extractelement <2 x i32> %vand32, i32 1 + %v = and i32 %v0, %v1 + %cmp = icmp ne i32 %v, 0 + ret i1 %cmp +} + +define i1 @__none() nounwind readnone alwaysinline { + %any = call i1 @__any( %0) + %none = icmp eq i1 %any, 0 + ret i1 %none +} + +;; $1: scalar type +;; $2: vector/vector reduce function (2 x -> ) +;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>) +;; $4: scalar reduce function + +define(`neon_reduce', ` + v16tov8($1, %0, %va, %vb) + %va_16 = shufflevector <8 x $1> %va, <8 x $1> undef, + <16 x i32> + %vb_16 = shufflevector <8 x $1> %vb, <8 x $1> undef, + <16 x i32> + %v8 = call <16 x $1> $2(<16 x $1> %va_16, <16 x $1> %vb_16) + + %v8a = shufflevector <16 x $1> %v8, <16 x $1> undef, + <16 x i32> + %v8b = shufflevector <16 x $1> %v8, <16 x $1> undef, + <16 x i32> + + %v4 = call <16 x $1> $2(<16 x $1> %v8a, <16 x $1> %v8b) + + %vfirst_4 = shufflevector <16 x $1> %v4, <16 x $1> undef, + <4 x i32> + v4tov2($1, %vfirst_4, %v0, %v1) + %vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1) + %vh0 = extractelement <2 x $1> %vh, i32 0 + %vh1 = extractelement <2 x $1> %vh, i32 1 + %r = call $1 $4($1 %vh0, $1 %vh1) + ret $1 %r +') + +declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @add_f32(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define internal @__add_varying_float(, ) { + %r = fadd %0, %1 + ret %r +} + +define float @__reduce_add_float() nounwind readnone { + neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @min_f32(float, float) { + %cmp = fcmp olt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret 
float %r +} + +define float @__reduce_min_float() nounwind readnone { + neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @max_f32(float, float) { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_max_float() nounwind readnone { + neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) +} + +declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone + +define i64 @__reduce_add_int8() nounwind readnone { + %a16 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %0) + %a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a16) + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32) + %a0 = extractelement <2 x i64> %a64, i32 0 + %a1 = extractelement <2 x i64> %a64, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +define i64 @__reduce_add_int16() nounwind readnone { + v16tov8(i16, %0, %va, %vb) + %a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %va) + %b32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %vb) + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32) + %b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %b32) + %sum = add <2 x i64> %a64, %b64 + %a0 = extractelement <2 x i64> %sum, i32 0 + %a1 = extractelement <2 x i64> %sum, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +define i64 @__reduce_add_int32() nounwind readnone { + v16tov4(i32, %0, %va, %vb, %vc, %vd) + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va) + %b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb) + %c64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vc) + %d64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vd) + %ab = add <2 x i64> %a64, %b64 + %cd = add <2 x i64> %c64, %d64 + %sum = add <2 x i64> %ab, %cd + %a0 = extractelement <2 x i64> %sum, i32 0 + %a1 = extractelement <2 x i64> %sum, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_si32(i32, i32) { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_int32() nounwind readnone { + neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_si32(i32, i32) { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_int32() nounwind readnone { + neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_ui32(i32, i32) { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_uint32() nounwind readnone { + neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpmins.v2i32, @min_ui32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 
@max_ui32(i32, i32) { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_uint32() nounwind readnone { + neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxs.v2i32, @max_ui32) +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define internal @__add_varying_double(, ) { + %r = fadd %0, %1 + ret %r +} + +define double @__reduce_add_double() nounwind readnone { + reduce16(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double() nounwind readnone { + reduce16(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double() nounwind readnone { + reduce16(double, @__max_varying_double, @__max_uniform_double) +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define internal @__add_varying_int64(, ) { + %r = add %0, %1 + ret %r +} + +define i64 @__reduce_add_int64() nounwind readnone { + reduce16(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define i64 @__reduce_min_int64() nounwind readnone { + reduce16(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64() nounwind readnone { + reduce16(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64() nounwind readnone { + reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64() nounwind readnone { + reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64) +} diff --git a/builtins/target-neon-common.ll b/builtins/target-neon-common.ll new file mode 100644 index 00000000..696b0748 --- /dev/null +++ b/builtins/target-neon-common.ll @@ -0,0 +1,351 @@ +;; +;; target-neon-common.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
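+;; This file collects definitions shared by the ARM NEON targets
+;; (target-neon-8/16/32): half<->float conversion, scalar math helpers,
+;; uniform min/max, masked memory operations, and gathers/scatters.
+;;
+;; The datalayout string below describes a standard 32-bit ARM layout,
+;; roughly: "e" = little-endian, "p:32:32:32" = 32-bit pointers,
+;; "i64:32:64" = i64 may be aligned to 32 bits, "n32" = 32-bit native
+;; integer width.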
+ +target datalayout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-a0:0:64-n32" + +stdlib_core() +scans() +reduce_equal(WIDTH) +rdrand_decls() +define_shuffles() +aossoa() +ctlztz() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone + +define float @__half_to_float_uniform(i16 %v) nounwind readnone { + %v1 = bitcast i16 %v to <1 x i16> + %vec = shufflevector <1 x i16> %v1, <1 x i16> undef, + <4 x i32> + %h = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vec) + %r = extractelement <4 x float> %h, i32 0 + ret float %r +} + +define i16 @__float_to_half_uniform(float %v) nounwind readnone { + %v1 = bitcast float %v to <1 x float> + %vec = shufflevector <1 x float> %v1, <1 x float> undef, + <4 x i32> + %h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %vec) + %r = extractelement <4 x i16> %h, i32 0 + ret i16 %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +define void @__fastmath() nounwind { + ret void +} + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define float @__round_uniform_float(float) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32 + %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648 + %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i + %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06 + %binop21.i = fadd float %binop.i, -8.388608e+06 + %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32 + %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float + ret float %int_to_float_bitcast.i.i.i +} + +define float @__floor_uniform_float(float) nounwind readonly alwaysinline { + %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind + %bincmp.i = fcmp ogt float %calltmp.i, %0 + %selectexpr.i = sext i1 %bincmp.i to i32 + %bitop.i = and i32 %selectexpr.i, -1082130432 + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i + ret float %binop.i +} + +define float @__ceil_uniform_float(float) nounwind readonly alwaysinline { + %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind + %bincmp.i = fcmp olt float %calltmp.i, %0 + %selectexpr.i = sext i1 %bincmp.i to i32 + %bitop.i = and i32 %selectexpr.i, 1065353216 + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i + ret float %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare double @__round_uniform_double(double) nounwind readnone +declare double @__floor_uniform_double(double) nounwind readnone +declare double @__ceil_uniform_double(double) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +define float @__max_uniform_float(float, float) nounwind readnone { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__min_uniform_float(float, 
float) nounwind readnone { + %cmp = fcmp ult float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define i32 @__min_uniform_int32(i32, i32) nounwind readnone { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__max_uniform_int32(i32, i32) nounwind readnone { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__min_uniform_uint32(i32, i32) nounwind readnone { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__max_uniform_uint32(i32, i32) nounwind readnone { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i64 @__min_uniform_int64(i64, i64) nounwind readnone { + %cmp = icmp slt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__max_uniform_int64(i64, i64) nounwind readnone { + %cmp = icmp sgt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__min_uniform_uint64(i64, i64) nounwind readnone { + %cmp = icmp ult i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__max_uniform_uint64(i64, i64) nounwind readnone { + %cmp = icmp ugt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define double @__min_uniform_double(double, double) nounwind readnone { + %cmp = fcmp olt double %0, %1 + %r = select i1 %cmp, double %0, double %1 + ret double %r +} + +define double @__max_uniform_double(double, double) nounwind readnone { + %cmp = fcmp ogt double %0, %1 + %r = select i1 %cmp, double %0, double %1 + ret double %r +} + +define @__min_varying_int64(, ) nounwind readnone { + %m = icmp slt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_int64(, ) nounwind readnone { + %m = icmp sgt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__min_varying_uint64(, ) nounwind readnone { + %m = icmp ult %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_uint64(, ) nounwind readnone { + %m = icmp ugt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__min_varying_double(, + ) nounwind readnone { + %m = fcmp olt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_double(, + ) nounwind readnone { + %m = fcmp ogt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +;; sqrt/rsqrt/rcp + +declare float @llvm.sqrt.f32(float) + +define float @__sqrt_uniform_float(float) nounwind readnone { + %r = call float @llvm.sqrt.f32(float %0) + ret float %r +} + +declare double @llvm.sqrt.f64(double) + +define double @__sqrt_uniform_double(double) nounwind readnone { + %r = call double @llvm.sqrt.f64(double %0) + ret double %r +} + +;; bit ops + +declare i32 @llvm.ctpop.i32(i32) nounwind readnone +declare i64 @llvm.ctpop.i64(i64) nounwind readnone + +define i32 @__popcnt_int32(i32) nounwind readnone { + %v = call i32 @llvm.ctpop.i32(i32 %0) + ret i32 %v +} + +define i64 @__popcnt_int64(i64) nounwind readnone { + %v = call i64 @llvm.ctpop.i64(i64 %0) + ret i64 %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) +masked_store_float_double() + +define void @__masked_store_blend_i8(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc 
%mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i16(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i32(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i64(* nocapture %ptr, + %new, %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +;; yuck. We need declarations of these, even though we shouldnt ever +;; actually generate calls to them for the NEON target... + +declare @__svml_sin() +declare @__svml_cos() +declare void @__svml_sincos(, *, *) +declare @__svml_tan() +declare @__svml_atan() +declare @__svml_atan2(, ) +declare @__svml_exp() +declare @__svml_log() +declare @__svml_pow(, ) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) + +packed_load_and_store(4) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; prefetch + +define_prefetches() diff --git a/builtins/util.m4 b/builtins/util.m4 index ee45ebc7..1f85e2cc 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -49,6 +49,53 @@ define(`MASK_HIGH_BIT_ON', ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; vector assembly and deconstruction utilities +;; split 8-wide vector into 2 4-wide vectors +;; +;; $1: vector element type +;; $2: 8-wide vector +;; $3: first 4-wide vector +;; $4: second 4-wide vector + +define(`v8tov4', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, + <4 x i32> + $4 = shufflevector <8 x $1> $2, <8 x $1> undef, + <4 x i32> +') + +define(`v16tov8', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> + $4 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> +') + +;; 4-wide into 2 2-wide +;; args as above +;; + +define(`v4tov2', ` + $3 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> + $4 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> +') + +define(`v8tov2', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $4 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $5 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $6 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> +') + +define(`v16tov4', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $4 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $5 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $6 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Helper macro for calling various SSE instructions for scalar values ;; but where the instruction takes a vector parameter. 
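;; (i.e. the scalar is put into lane 0 of a 4-wide vector, the vector
;; instruction is applied, and lane 0 of the result is extracted.)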
;; $1 : name of variable to put the final value in @@ -156,10 +203,7 @@ define(`reduce16', ` ;; the final reduction define(`reduce8by4', ` - %v1 = shufflevector <8 x $1> %0, <8 x $1> undef, - <4 x i32> - %v2 = shufflevector <8 x $1> %0, <8 x $1> undef, - <4 x i32> + v8tov4($1, %0, %v1, %v2) %m1 = call <4 x $1> $2(<4 x $1> %v1, <4 x $1> %v2) %v3 = shufflevector <4 x $1> %m1, <4 x $1> undef, <4 x i32> @@ -266,30 +310,66 @@ define(`binary2to4', ` ;; $4: 8-wide operand value define(`unary4to8', ` - %$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> - %v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0) - %$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> - %v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1) - %$1 = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, + %__$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> + %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0) + %__$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> + %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1) + %$1 = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, + <8 x i32> +' +) + +;; $1: name of variable into which the final result should go +;; $2: scalar type of the input vector elements +;; $3: scalar type of the result vector elements +;; $4: 4-wide unary vector function to apply +;; $5: 8-wide operand value + +define(`unary4to8conv', ` + %$1_0 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> + %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0) + %$1_1 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> + %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1) + %$1 = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, <8 x i32> ' ) define(`unary4to16', ` - %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0) - %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1) - %$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_2 = call <4 x $2> $3(<4 x $2> %$1_2) - %$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_3 = call <4 x $2> $3(<4 x $2> %$1_3) + %__$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0) + %__$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1) + %__$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_2 = call <4 x $2> $3(<4 x $2> %__$1_2) + %__$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_3 = call <4 x $2> $3(<4 x $2> %__$1_3) - %$1a = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, + %__$1a = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, <8 x i32> - %$1b = shufflevector <4 x $2> %v$1_2, <4 x $2> %v$1_3, + %__$1b = shufflevector <4 x $2> %__v$1_2, <4 x $2> %__v$1_3, <8 x i32> - %$1 = shufflevector <8 x $2> %$1a, <8 x $2> %$1b, + %$1 = shufflevector <8 x $2> %__$1a, <8 x $2> %__$1b, + <16 x i32> +' +) + +define(`unary4to16conv', ` + %$1_0 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1) + %$1_2 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_2 = call <4 x $3> $4(<4 x $2> %$1_2) + %$1_3 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_3 = call <4 x $3> $4(<4 x $2> %$1_3) + + %$1a = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, + <8 x i32> + %$1b = shufflevector <4 x $3> %v$1_2, <4 x $3> %v$1_3, + <8 x i32> + %$1 = 
shufflevector <8 x $3> %$1a, <8 x $3> %$1b, <16 x i32> ' diff --git a/ispc.cpp b/ispc.cpp index a9f5ff5c..de8fba4d 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -106,7 +106,7 @@ static void __cpuidex(int info[4], int level, int count) { static const char * lGetSystemISA() { #ifdef __arm__ - return "neon"; + return "neon-32"; #else int info[4]; __cpuid(info, 1); @@ -187,7 +187,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : isa = "avx2"; else if (!strcmp(cpu, "cortex-a9") || !strcmp(cpu, "cortex-a15")) - isa = "neon"; + isa = "neon-32"; else if (!strcmp(cpu, "core-avx-i")) isa = "avx1.1"; else if (!strcmp(cpu, "sandybridge") || @@ -212,7 +212,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } #if !defined(__arm__) - if (cpu == NULL && !strcmp(isa, "neon")) + if (cpu == NULL && !strncmp(isa, "neon", 4)) // If we're compiling NEON on an x86 host and the CPU wasn't // supplied, don't go and set the CPU based on the host... cpu = "cortex-a9"; @@ -246,7 +246,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_cpu = cpu; if (arch == NULL) { - if (!strcmp(isa, "neon")) + if (!strncmp(isa, "neon", 4)) arch = "arm"; else arch = "x86-64"; @@ -461,8 +461,26 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasGather = true; #endif } - else if (!strcasecmp(isa, "neon")) { - this->m_isa = Target::NEON; + else if (!strcasecmp(isa, "neon-8")) { + this->m_isa = Target::NEON8; + this->m_nativeVectorWidth = 16; + this->m_vectorWidth = 16; + this->m_attributes = "+neon,+fp16"; + this->m_hasHalf = true; // ?? + this->m_maskingIsFree = false; + this->m_maskBitCount = 8; + } + else if (!strcasecmp(isa, "neon-16")) { + this->m_isa = Target::NEON16; + this->m_nativeVectorWidth = 8; + this->m_vectorWidth = 8; + this->m_attributes = "+neon,+fp16"; + this->m_hasHalf = true; // ?? + this->m_maskingIsFree = false; + this->m_maskBitCount = 16; + } + else if (!strcasecmp(isa, "neon-32") || !strcasecmp(isa, "neon")) { + this->m_isa = Target::NEON32; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; this->m_attributes = "+neon,+fp16"; @@ -484,7 +502,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : llvm::Reloc::Default; std::string featuresString = m_attributes; llvm::TargetOptions options; - if (m_isa == Target::NEON) + if (m_isa == Target::NEON8 || m_isa == Target::NEON16 || + m_isa == Target::NEON32) options.FloatABIType = llvm::FloatABI::Hard; #if !defined(LLVM_3_1) if (g->opt.disableFMA == false) @@ -618,8 +637,12 @@ Target::GetTripleString() const { const char * Target::ISAToString(ISA isa) { switch (isa) { - case Target::NEON: - return "neon"; + case Target::NEON8: + return "neon-8"; + case Target::NEON16: + return "neon-16"; + case Target::NEON32: + return "neon-32"; case Target::SSE2: return "sse2"; case Target::SSE4: diff --git a/ispc.h b/ispc.h index 7d10b908..bf6d2642 100644 --- a/ispc.h +++ b/ispc.h @@ -175,7 +175,8 @@ public: flexible/performant of them will apear last in the enumerant. Note also that __best_available_isa() needs to be updated if ISAs are added or the enumerant values are reordered. */ - enum ISA { NEON, SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, NUM_ISAS }; + enum ISA { NEON32, NEON16, NEON8, SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, + NUM_ISAS }; /** Initializes the given Target pointer for a target of the given name, if the name is a known target. 
Returns true if the diff --git a/ispc.vcxproj b/ispc.vcxproj index 96682fe3..e9bf9d97 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -45,8 +45,12 @@ - - + + + + + + @@ -187,37 +191,78 @@ Building gen-bitcode-sse2-x2-64bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-neon.ll | python bitcode2cpp.py builtins\target-neon.ll > gen-bitcode-neon.cpp - gen-bitcode-neon.cpp - builtins\util.m4 - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-neon.ll | python bitcode2cpp.py builtins\target-neon.ll > gen-bitcode-neon.cpp - gen-bitcode-neon.cpp - builtins\util.m4 - Building gen-bitcode-neon.cpp - Building gen-bitcode-neon.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp - $(Configuration)/gen-bitcode-avx1-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx1-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > $(Configuration)/gen-bitcode-avx1-64bit.cpp - $(Configuration)/gen-bitcode-avx1-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx1-64bit.cpp - - + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-8.ll | python bitcode2cpp.py builtins\target-neon-8.ll 32bit > $(Configuration)/gen-bitcode-neon-8-32bit.cpp + $(Configuration)/gen-bitcode-neon-8-32bit.cpp + builtins\util.m4;builtins\target-neon-common.ll + Building gen-bitcode-neon-8-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-8.ll | python bitcode2cpp.py builtins\target-neon-8.ll 64bit > $(Configuration)/gen-bitcode-neon-8-64bit.cpp + $(Configuration)/gen-bitcode-neon-8-64bit.cpp + builtins\util.m4;builtins\target-neon-common.ll + Building gen-bitcode-neon-8-64bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-16.ll | python bitcode2cpp.py builtins\target-neon-16.ll 32bit > $(Configuration)/gen-bitcode-neon-16-32bit.cpp + $(Configuration)/gen-bitcode-neon-16-32bit.cpp + builtins\util.m4;builtins\target-neon-common.ll + Building gen-bitcode-neon-16-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-16.ll | python bitcode2cpp.py builtins\target-neon-16.ll 64bit > $(Configuration)/gen-bitcode-neon-16-64bit.cpp + $(Configuration)/gen-bitcode-neon-16-64bit.cpp + builtins\util.m4;builtins\target-neon-common.ll + Building gen-bitcode-neon-16-64bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-32.ll | python bitcode2cpp.py builtins\target-neon-32.ll 32bit > $(Configuration)/gen-bitcode-neon-32-32bit.cpp + $(Configuration)/gen-bitcode-neon-32-32bit.cpp + builtins\util.m4;builtins\target-neon-common.ll + Building gen-bitcode-neon-32-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-32.ll | python bitcode2cpp.py builtins\target-neon-32.ll 64bit > $(Configuration)/gen-bitcode-neon-32-64bit.cpp + $(Configuration)/gen-bitcode-neon-32-64bit.cpp 
+ builtins\util.m4;builtins\target-neon-common.ll + Building gen-bitcode-neon-32-64bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp + $(Configuration)/gen-bitcode-avx1-32bit.cpp + builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + Building gen-bitcode-avx1-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > $(Configuration)/gen-bitcode-avx1-64bit.cpp + $(Configuration)/gen-bitcode-avx1-64bit.cpp + builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + Building gen-bitcode-avx1-64bit.cpp + + Document diff --git a/module.cpp b/module.cpp index 85bf242c..755a5dc4 100644 --- a/module.cpp +++ b/module.cpp @@ -1877,6 +1877,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre char *p = targetMacro; while (*p) { *p = toupper(*p); + if (*p == '-') *p = '_'; ++p; } opts.addMacroDef(targetMacro); diff --git a/run_tests.py b/run_tests.py index ea819ea4..c9dd8b76 100755 --- a/run_tests.py +++ b/run_tests.py @@ -37,7 +37,7 @@ parser.add_option("-g", "--generics-include", dest="include_file", help="Filenam parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", default="") parser.add_option('-t', '--target', dest='target', - help='Set compilation target (neon, sse2, sse2-x2, sse4, sse4-x2, sse4-8, sse4-16, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', + help='Set compilation target (neon8, neon16, neon32, sse2, sse2-x2, sse4, sse4-x2, sse4-8, sse4-16, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', default="sse4") parser.add_option('-a', '--arch', dest='arch', help='Set architecture (arm, x86, x86-64)', From 48ff03112fd30d12a85eaf7cee3636ee6bfbedb4 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Mon, 29 Jul 2013 16:20:46 -0700 Subject: [PATCH 016/124] Remove __pause from stdlib_core() in utils.m4. It wasn't ever being used, and was breaking compilation on ARM. --- builtins.cpp | 1 - builtins/util.m4 | 5 ----- 2 files changed, 6 deletions(-) diff --git a/builtins.cpp b/builtins.cpp index e671a491..b2896388 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -487,7 +487,6 @@ lSetInternalFunctions(llvm::Module *module) { "__num_cores", "__packed_load_active", "__packed_store_active", - "__pause", "__popcnt_int32", "__popcnt_int64", "__prefetch_read_uniform_1", diff --git a/builtins/util.m4 b/builtins/util.m4 index 1f85e2cc..025030d5 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -1795,11 +1795,6 @@ declare i1 @__is_compile_time_constant_mask( %mask) declare i1 @__is_compile_time_constant_uniform_int32(i32) declare i1 @__is_compile_time_constant_varying_int32() -define void @__pause() nounwind readnone { - call void asm sideeffect "pause", "~{dirflag},~{fpsr},~{flags}"() nounwind - ret void -} - ; This function declares placeholder masked store functions for the ; front-end to use. 
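; (These are declarations only; later optimization passes are expected
; to replace calls to them with the target's actual masked store code.)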
; From 220f0b0b4037f8c9124e6e2f666b053b39d71152 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 30 Jul 2013 19:53:12 -0700 Subject: [PATCH 017/124] Renaming mandelbrot_tasks files to be different from mandelbrot --- examples/mandelbrot_tasks/Makefile | 6 +++--- .../{mandelbrot.cpp => mandelbrot_tasks.cpp} | 2 +- .../{mandelbrot.ispc => mandelbrot_tasks.ispc} | 0 .../mandelbrot_tasks/mandelbrot_tasks.vcxproj | 16 ++++++++-------- ...ot_serial.cpp => mandelbrot_tasks_serial.cpp} | 0 examples/perf.ini | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) rename examples/mandelbrot_tasks/{mandelbrot.cpp => mandelbrot_tasks.cpp} (99%) rename examples/mandelbrot_tasks/{mandelbrot.ispc => mandelbrot_tasks.ispc} (100%) rename examples/mandelbrot_tasks/{mandelbrot_serial.cpp => mandelbrot_tasks_serial.cpp} (100%) diff --git a/examples/mandelbrot_tasks/Makefile b/examples/mandelbrot_tasks/Makefile index 7e83e618..1a565ffd 100644 --- a/examples/mandelbrot_tasks/Makefile +++ b/examples/mandelbrot_tasks/Makefile @@ -1,7 +1,7 @@ -EXAMPLE=mandelbrot -CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp -ISPC_SRC=mandelbrot.ispc +EXAMPLE=mandelbrot_tasks +CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp +ISPC_SRC=mandelbrot_tasks.ispc ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 ISPC_ARM_TARGETS=neon diff --git a/examples/mandelbrot_tasks/mandelbrot.cpp b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp similarity index 99% rename from examples/mandelbrot_tasks/mandelbrot.cpp rename to examples/mandelbrot_tasks/mandelbrot_tasks.cpp index a01cfe43..dae22736 100644 --- a/examples/mandelbrot_tasks/mandelbrot.cpp +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp @@ -42,7 +42,7 @@ #include #include #include "../timing.h" -#include "mandelbrot_ispc.h" +#include "mandelbrot_tasks_ispc.h" using namespace ispc; extern void mandelbrot_serial(float x0, float y0, float x1, float y1, diff --git a/examples/mandelbrot_tasks/mandelbrot.ispc b/examples/mandelbrot_tasks/mandelbrot_tasks.ispc similarity index 100% rename from examples/mandelbrot_tasks/mandelbrot.ispc rename to examples/mandelbrot_tasks/mandelbrot_tasks.ispc diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj index b92de72f..3a8fca79 100644 --- a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj @@ -21,7 +21,7 @@ {E80DA7D4-AB22-4648-A068-327307156BE6} Win32Proj - mandelbrot + mandelbrot_tasks @@ -65,22 +65,22 @@ true $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot + mandelbrot_tasks true $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot + mandelbrot_tasks false $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot + mandelbrot_tasks false $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot + mandelbrot_tasks @@ -153,12 +153,12 @@ - - + + - + Document ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 diff --git a/examples/mandelbrot_tasks/mandelbrot_serial.cpp b/examples/mandelbrot_tasks/mandelbrot_tasks_serial.cpp similarity index 100% rename from examples/mandelbrot_tasks/mandelbrot_serial.cpp rename to examples/mandelbrot_tasks/mandelbrot_tasks_serial.cpp diff --git a/examples/perf.ini b/examples/perf.ini index 3814bf16..d2a5c73e 100755 --- a/examples/perf.ini +++ b/examples/perf.ini @@ -22,7 +22,7 @@ mandelbrot #*** Mandelbrot Set mandelbrot_tasks -mandelbrot +mandelbrot_tasks ^ #*** Perlin Noise Function From d3c567503bf64ec9066c09cb8959c31d4aa1be0e Mon 
Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 31 Jul 2013 06:46:45 -0700 Subject: [PATCH 018/124] Remove support for building with LLVM 3.1 --- builtins.cpp | 2 - builtins/target-avx11-x2.ll | 4 +- builtins/target-avx11.ll | 4 +- builtins/target-avx2-x2.ll | 25 +------- builtins/target-avx2.ll | 25 +------- cbackend.cpp | 115 +++++++++--------------------------- ctx.cpp | 4 +- ctx.h | 11 +--- expr.cpp | 2 +- func.cpp | 10 +--- ispc.cpp | 68 ++++----------------- ispc.h | 18 +----- llvmutil.cpp | 2 +- llvmutil.h | 2 +- main.cpp | 4 +- module.cpp | 47 +++------------ opt.cpp | 22 ++----- stmt.cpp | 2 +- type.cpp | 26 +++----- type.h | 2 +- util.cpp | 9 +-- 21 files changed, 84 insertions(+), 320 deletions(-) diff --git a/builtins.cpp b/builtins.cpp index b2896388..17582d68 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -49,8 +49,6 @@ #include #if defined(LLVM_3_2) #include -#endif -#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #include diff --git a/builtins/target-avx11-x2.ll b/builtins/target-avx11-x2.ll index 1aa6345c..2aee1e1c 100644 --- a/builtins/target-avx11-x2.ll +++ b/builtins/target-avx11-x2.ll @@ -31,9 +31,7 @@ include(`target-avx-x2.ll') -ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', - LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', - `rdrand_definition()') +rdrand_definition() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx11.ll b/builtins/target-avx11.ll index fea0a7c2..44593113 100644 --- a/builtins/target-avx11.ll +++ b/builtins/target-avx11.ll @@ -31,9 +31,7 @@ include(`target-avx.ll') -ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', - LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', - `rdrand_definition()') +rdrand_definition() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx2-x2.ll b/builtins/target-avx2-x2.ll index 053fd078..19f1845d 100644 --- a/builtins/target-avx2-x2.ll +++ b/builtins/target-avx2-x2.ll @@ -29,15 +29,11 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -ifelse(LLVM_VERSION, `LLVM_3_0', `', - LLVM_VERSION, `LLVM_3_1', `', - `define(`HAVE_GATHER', `1')') +define(`HAVE_GATHER', `1') include(`target-avx-x2.ll') -ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', - LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', - `rdrand_definition()') +rdrand_definition() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max @@ -176,21 +172,6 @@ define(`assemble_4s', ` assemble_8s($1, $2, $2_1, $2_2) ') -ifelse(LLVM_VERSION, `LLVM_3_0', ` -gen_gather_factored(i8) -gen_gather_factored(i16) -gen_gather_factored(i32) -gen_gather_factored(float) -gen_gather_factored(i64) -gen_gather_factored(double)', -LLVM_VERSION, `LLVM_3_1', ` -gen_gather_factored(i8) -gen_gather_factored(i16) -gen_gather_factored(i32) -gen_gather_factored(float) -gen_gather_factored(i64) -gen_gather_factored(double)', ` - gen_gather(i8) gen_gather(i16) @@ -557,5 +538,3 @@ define <16 x double> @__gather64_double(<16 x i64> %ptrs, ret <16 x double> %v } - -') diff --git a/builtins/target-avx2.ll b/builtins/target-avx2.ll index f4a0ee07..d3410011 100644 --- a/builtins/target-avx2.ll +++ b/builtins/target-avx2.ll @@ -29,15 +29,11 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-ifelse(LLVM_VERSION, `LLVM_3_0', `', - LLVM_VERSION, `LLVM_3_1', `', - `define(`HAVE_GATHER', `1')') +define(`HAVE_GATHER', `1') include(`target-avx.ll') -ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', - LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', - `rdrand_definition()') +rdrand_definition() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max @@ -123,21 +119,6 @@ define(`extract_4s', ` %$2_2 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> ') -ifelse(LLVM_VERSION, `LLVM_3_0', ` -gen_gather_factored(i8) -gen_gather_factored(i16) -gen_gather_factored(i32) -gen_gather_factored(float) -gen_gather_factored(i64) -gen_gather_factored(double)', -LLVM_VERSION, `LLVM_3_1', ` -gen_gather_factored(i8) -gen_gather_factored(i16) -gen_gather_factored(i32) -gen_gather_factored(float) -gen_gather_factored(i64) -gen_gather_factored(double)', ` - gen_gather(i8) gen_gather(i16) @@ -429,5 +410,3 @@ define <8 x double> @__gather64_double(<8 x i64> %ptrs, ret <8 x double> %v } - -') diff --git a/cbackend.cpp b/cbackend.cpp index d23bcc20..d54f48fb 100644 --- a/cbackend.cpp +++ b/cbackend.cpp @@ -29,7 +29,7 @@ #include "llvmutil.h" -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/CallingConv.h" @@ -38,6 +38,7 @@ #include "llvm/Intrinsics.h" #include "llvm/IntrinsicInst.h" #include "llvm/InlineAsm.h" + #include "llvm/TypeFinder.h" #else #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -47,16 +48,10 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/InlineAsm.h" + #include "llvm/IR/TypeFinder.h" #endif #include "llvm/Pass.h" #include "llvm/PassManager.h" -#if !defined(LLVM_3_1) - #if defined(LLVM_3_2) - #include "llvm/TypeFinder.h" - #else // LLVM_3_3 + - #include "llvm/IR/TypeFinder.h" - #endif -#endif // LLVM_3_2 + #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/STLExtras.h" @@ -76,9 +71,7 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" -#if defined(LLVM_3_1) - #include "llvm/Target/TargetData.h" -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) #include "llvm/DataLayout.h" #else // LLVM 3.3+ #include "llvm/IR/DataLayout.h" @@ -88,7 +81,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/GetElementPtrTypeIterator.h" -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include "llvm/Support/InstVisitor.h" #else // LLVM 3.3+ #include "llvm/InstVisitor.h" @@ -258,14 +251,10 @@ namespace { const llvm::MCRegisterInfo *MRI; const llvm::MCObjectFileInfo *MOFI; llvm::MCContext *TCtx; -#if defined(LLVM_3_1) - const llvm::TargetData* TD; -#else // FIXME: it's ugly to have the name be "TD" here, but it saves us // lots of ifdefs in the below since the new DataLayout and the old // TargetData have generally similar interfaces... 
const llvm::DataLayout* TD; -#endif std::map FPConstantMap; std::map VectorConstantMap; @@ -352,7 +341,7 @@ namespace { bool isSigned = false, const std::string &VariableName = "", bool IgnoreName = false, -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) const llvm::AttrListPtr &PAL = llvm::AttrListPtr() #else const llvm::AttributeSet &PAL = llvm::AttributeSet() @@ -363,7 +352,7 @@ namespace { const std::string &NameSoFar = ""); void printStructReturnPointerFunctionType(llvm::raw_ostream &Out, -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) const llvm::AttrListPtr &PAL, #else const llvm::AttributeSet &PAL, @@ -586,7 +575,7 @@ std::string CWriter::getArrayName(llvm::ArrayType *AT) { /// return type, except, instead of printing the type as void (*)(Struct*, ...) /// print it as "Struct (*)(...)", for struct return functions. void CWriter::printStructReturnPointerFunctionType(llvm::raw_ostream &Out, -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) const llvm::AttrListPtr &PAL, #else const llvm::AttributeSet &PAL, @@ -605,20 +594,16 @@ void CWriter::printStructReturnPointerFunctionType(llvm::raw_ostream &Out, if (PrintedType) FunctionInnards << ", "; llvm::Type *ArgTy = *I; -#if defined(LLVM_3_1) - if (PAL.paramHasAttr(Idx, llvm::Attribute::ByVal)) { -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) { #else - if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { + if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { #endif assert(ArgTy->isPointerTy()); ArgTy = llvm::cast(ArgTy)->getElementType(); } printType(FunctionInnards, ArgTy, -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(Idx, llvm::Attribute::SExt), -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt), @@ -635,9 +620,7 @@ void CWriter::printStructReturnPointerFunctionType(llvm::raw_ostream &Out, } FunctionInnards << ')'; printType(Out, RetTy, -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(0, llvm::Attribute::SExt), -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(0).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(0).hasAttribute(llvm::AttributeSet::ReturnIndex, llvm::Attribute::SExt), @@ -737,7 +720,7 @@ CWriter::printSimpleType(llvm::raw_ostream &Out, llvm::Type *Ty, bool isSigned, llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty, bool isSigned, const std::string &NameSoFar, bool IgnoreName, -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) const llvm::AttrListPtr &PAL #else const llvm::AttributeSet &PAL @@ -759,9 +742,7 @@ llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty, for (llvm::FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); I != E; ++I) { llvm::Type *ArgTy = *I; -#if defined(LLVM_3_1) - if (PAL.paramHasAttr(Idx, llvm::Attribute::ByVal)) { -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) { #else if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { @@ -772,9 +753,7 @@ llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty, if (I != 
FTy->param_begin()) FunctionInnards << ", "; printType(FunctionInnards, ArgTy, -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(Idx, llvm::Attribute::SExt), -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt), @@ -791,9 +770,7 @@ llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty, } FunctionInnards << ')'; printType(Out, FTy->getReturnType(), -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(0, llvm::Attribute::SExt), -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(0).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(0).hasAttribute(llvm::AttributeSet::ReturnIndex, llvm::Attribute::SExt), @@ -1972,11 +1949,7 @@ void CWriter::writeOperandWithCast(llvm::Value* Operand, const llvm::ICmpInst &C // directives to cater to specific compilers as need be. // static void generateCompilerSpecificCode(llvm::formatted_raw_ostream& Out, -#if defined(LLVM_3_1) - const llvm::TargetData *TD) { -#else const llvm::DataLayout *TD) { -#endif // We output GCC specific attributes to preserve 'linkonce'ness on globals. // If we aren't being compiled with GCC, just drop these attributes. Out << "#ifndef __GNUC__ /* Can only support \"linkonce\" vars with GCC */\n" @@ -2169,11 +2142,7 @@ bool CWriter::doInitialization(llvm::Module &M) { // Initialize TheModule = &M; -#if defined(LLVM_3_1) - TD = new llvm::TargetData(&M); -#else TD = new llvm::DataLayout(&M); -#endif IL = new llvm::IntrinsicLowering(*TD); IL->AddPrototypes(M); @@ -2656,15 +2625,11 @@ void CWriter::printModuleTypes() { // Get all of the struct types used in the module. std::vector StructTypes; -#if defined(LLVM_3_1) - TheModule->findUsedStructTypes(StructTypes); -#else llvm::TypeFinder typeFinder; typeFinder.run(*TheModule, false); for (llvm::TypeFinder::iterator iter = typeFinder.begin(); iter != typeFinder.end(); ++iter) StructTypes.push_back(*iter); -#endif // Get all of the array types used in the module std::vector ArrayTypes; @@ -2785,7 +2750,7 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { // Loop over the arguments, printing them... 
llvm::FunctionType *FT = llvm::cast(F->getFunctionType()); -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) const llvm::AttrListPtr &PAL = F->getAttributes(); #else const llvm::AttributeSet &PAL = F->getAttributes(); @@ -2819,20 +2784,16 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { else ArgName = ""; llvm::Type *ArgTy = I->getType(); -#if defined(LLVM_3_1) - if (PAL.paramHasAttr(Idx, llvm::Attribute::ByVal)) { -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) { #else - if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { + if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { #endif ArgTy = llvm::cast(ArgTy)->getElementType(); ByValParams.insert(I); } printType(FunctionInnards, ArgTy, -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(Idx, llvm::Attribute::SExt), -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt), @@ -2858,9 +2819,7 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { for (; I != E; ++I) { if (PrintedArg) FunctionInnards << ", "; llvm::Type *ArgTy = *I; -#if defined(LLVM_3_1) - if (PAL.paramHasAttr(Idx, llvm::Attribute::ByVal)) { -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) { #else if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { @@ -2869,9 +2828,7 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { ArgTy = llvm::cast(ArgTy)->getElementType(); } printType(FunctionInnards, ArgTy, -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(Idx, llvm::Attribute::SExt) -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt) #else PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt) @@ -2908,9 +2865,7 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { // Print out the return type and the signature built above. printType(Out, RetTy, -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(0, llvm::Attribute::SExt), -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(0).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(0).hasAttribute(llvm::AttributeSet::ReturnIndex, llvm::Attribute::SExt), @@ -3712,7 +3667,7 @@ void CWriter::lowerIntrinsics(llvm::Function &F) { const char *BuiltinName = ""; #define GET_GCC_BUILTIN_NAME #define Intrinsic llvm::Intrinsic -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include "llvm/Intrinsics.gen" #else #include "llvm/IR/Intrinsics.gen" @@ -3777,7 +3732,7 @@ void CWriter::visitCallInst(llvm::CallInst &I) { // If this is a call to a struct-return function, assign to the first // parameter instead of passing it to the call. 
-#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) const llvm::AttrListPtr &PAL = I.getAttributes(); #else const llvm::AttributeSet &PAL = I.getAttributes(); @@ -3865,9 +3820,7 @@ void CWriter::visitCallInst(llvm::CallInst &I) { (*AI)->getType() != FTy->getParamType(ArgNo)) { Out << '('; printType(Out, FTy->getParamType(ArgNo), -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(ArgNo+1, llvm::Attribute::SExt) -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(ArgNo+1).hasAttribute(llvm::Attributes::SExt) #else PAL.getParamAttributes(ArgNo+1).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt) @@ -3905,7 +3858,7 @@ bool CWriter::visitBuiltinCall(llvm::CallInst &I, llvm::Intrinsic::ID ID, const char *BuiltinName = ""; #define GET_GCC_BUILTIN_NAME #define Intrinsic llvm::Intrinsic -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include "llvm/Intrinsics.gen" #else #include "llvm/IR/Intrinsics.gen" @@ -4555,13 +4508,8 @@ SmearCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) { smearType, NULL); smearFunc = llvm::dyn_cast(sf); assert(smearFunc != NULL); -#if defined(LLVM_3_1) - smearFunc->setDoesNotThrow(true); - smearFunc->setDoesNotAccessMemory(true); -#else smearFunc->setDoesNotThrow(); smearFunc->setDoesNotAccessMemory(); -#endif } assert(smearFunc != NULL); @@ -4703,13 +4651,8 @@ AndCmpCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) { LLVMTypes::MaskType, NULL); andCmpFunc = llvm::dyn_cast(acf); Assert(andCmpFunc != NULL); -#if defined(LLVM_3_1) - andCmpFunc->setDoesNotThrow(true); - andCmpFunc->setDoesNotAccessMemory(true); -#else andCmpFunc->setDoesNotThrow(); andCmpFunc->setDoesNotAccessMemory(); -#endif } // Set up the function call to the *_and_mask function; the @@ -4914,7 +4857,7 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth, pm.add(new llvm::TargetData(module)); #endif -#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3) +#if defined(LLVM_3_2) || defined(LLVM_3_3) int flags = 0; #else llvm::sys::fs::OpenFlags flags = llvm::sys::fs::F_None; @@ -4939,7 +4882,7 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth, pm.add(llvm::createDeadCodeEliminationPass()); // clean up after smear pass //CO pm.add(llvm::createPrintModulePass(&fos)); pm.add(new CWriter(fos, includeName, vectorWidth)); -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) // This interface is depricated for 3.3+ pm.add(llvm::createGCInfoDeleter()); #endif diff --git a/ctx.cpp b/ctx.cpp index c50d22f9..32ba0ad9 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -46,7 +46,7 @@ #include "sym.h" #include #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include @@ -3312,7 +3312,7 @@ FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType, // alias analysis. // TODO: what other attributes needs to be copied? // TODO: do the same for varing path. 
-#if !defined (LLVM_3_1) && !defined (LLVM_3_2) // LLVM 3.3+ +#if !defined (LLVM_3_2) // LLVM 3.3+ llvm::CallInst *cc = llvm::dyn_cast(ci); if (cc && cc->getCalledFunction() && diff --git a/ctx.h b/ctx.h index 58f9aae3..4b27e6e5 100644 --- a/ctx.h +++ b/ctx.h @@ -40,20 +40,15 @@ #include "ispc.h" #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #else #include #include #endif -#if defined(LLVM_3_1) - #include - #include -#else - #include - #include -#endif +#include +#include struct CFInfo; diff --git a/expr.cpp b/expr.cpp index 856d363c..eb8c0951 100644 --- a/expr.cpp +++ b/expr.cpp @@ -56,7 +56,7 @@ #include #include #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include diff --git a/func.cpp b/func.cpp index b975049b..3097f86d 100644 --- a/func.cpp +++ b/func.cpp @@ -46,7 +46,7 @@ #include "util.h" #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include @@ -310,9 +310,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, // isn't worth the code bloat / overhead. bool checkMask = (type->isTask == true) || ( -#if defined(LLVM_3_1) - (function->hasFnAttr(llvm::Attribute::AlwaysInline) == false) -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) (function->getFnAttributes().hasAttribute(llvm::Attributes::AlwaysInline) == false) #else // LLVM 3.3+ (function->getAttributes().getFnAttributes().hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::AlwaysInline) == false) @@ -453,11 +451,7 @@ Function::GenerateIR() { functionName += std::string("_") + g->target->GetISAString(); llvm::Function *appFunction = llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module); -#if defined(LLVM_3_1) - appFunction->setDoesNotThrow(true); -#else appFunction->setDoesNotThrow(); -#endif g->target->markFuncWithTargetAttr(appFunction); diff --git a/ispc.cpp b/ispc.cpp index de8fba4d..b25527c4 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -48,7 +48,7 @@ #include #include #endif -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include @@ -57,19 +57,12 @@ #include #include #endif -#if defined(LLVM_3_1) - #include - #include -#else - #include - #include -#endif +#include +#include #include #include #include -#if defined(LLVM_3_1) - #include -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #else // LLVM 3.3+ #include @@ -145,27 +138,20 @@ static const char *supportedCPUs[] = { // cortex-a9 and a15. We should be able to handle any of them that also // have NEON support. 
"cortex-a9", "cortex-a15", - "atom", "penryn", "core2", "corei7", "corei7-avx" -#if !defined(LLVM_3_1) - , "core-avx-i", "core-avx2" -#endif // LLVM 3.2+ + "atom", "penryn", "core2", "corei7", "corei7-avx", "core-avx-i", "core-avx2" }; Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : m_target(NULL), m_targetMachine(NULL), -#if defined(LLVM_3_1) - m_targetData(NULL), -#else m_dataLayout(NULL), -#endif m_valid(false), m_isa(SSE2), m_arch(""), m_is32Bit(true), m_cpu(""), m_attributes(""), -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) +#if !defined(LLVM_3_2) m_tf_attributes(NULL), #endif m_nativeVectorWidth(-1), @@ -407,10 +393,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; -#if !defined(LLVM_3_1) - // LLVM 3.2+ only this->m_hasRand = true; -#endif } else if (!strcasecmp(isa, "avx1.1-x2")) { this->m_isa = Target::AVX11; @@ -420,46 +403,29 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; -#if !defined(LLVM_3_1) - // LLVM 3.2+ only this->m_hasRand = true; -#endif } else if (!strcasecmp(isa, "avx2")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; - this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand" -#ifndef LLVM_3_1 - ",+fma" -#endif // !LLVM_3_1 - ; + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand,+fma"; this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; -#if !defined(LLVM_3_1) - // LLVM 3.2+ only this->m_hasRand = true; this->m_hasGather = true; -#endif } else if (!strcasecmp(isa, "avx2-x2")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; - this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand" -#ifndef LLVM_3_1 - ",+fma" -#endif // !LLVM_3_1 - ; + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand,+fma"; this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; -#if !defined(LLVM_3_1) - // LLVM 3.2+ only this->m_hasRand = true; this->m_hasGather = true; -#endif } else if (!strcasecmp(isa, "neon-8")) { this->m_isa = Target::NEON8; @@ -505,10 +471,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : if (m_isa == Target::NEON8 || m_isa == Target::NEON16 || m_isa == Target::NEON32) options.FloatABIType = llvm::FloatABI::Hard; -#if !defined(LLVM_3_1) if (g->opt.disableFMA == false) options.AllowFPOpFusion = llvm::FPOpFusion::Fast; -#endif // !LLVM_3_1 #ifdef ISPC_IS_WINDOWS if (strcmp("x86", arch) == 0) { @@ -526,12 +490,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // Initialize TargetData/DataLayout in 3 steps. // 1. Get default data layout first - std::string dl_string; -#if defined(LLVM_3_1) - dl_string = m_targetMachine->getTargetData()->getStringRepresentation(); -#else - dl_string = m_targetMachine->getDataLayout()->getStringRepresentation(); -#endif + std::string dl_string = + m_targetMachine->getDataLayout()->getStringRepresentation(); // 2. Adjust for generic if (m_isa == Target::GENERIC) { @@ -546,11 +506,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } // 3. 
Finally set member data -#if defined(LLVM_3_1) - m_targetData = new llvm::TargetData(dl_string); -#else m_dataLayout = new llvm::DataLayout(dl_string); -#endif // Set is32Bit // This indicates if we are compiling for 32 bit platform @@ -558,7 +514,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // FIXME: all generic targets are handled as 64 bit, which is incorrect. this->m_is32Bit = (getDataLayout()->getPointerSize() == 4); -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) +#if !defined(LLVM_3_2) // This is LLVM 3.3+ feature. // Initialize target-specific "target-feature" attribute. if (!m_attributes.empty()) { @@ -772,7 +728,7 @@ Target::StructOffset(llvm::Type *type, int element, } void Target::markFuncWithTargetAttr(llvm::Function* func) { -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) +#if !defined(LLVM_3_2) if (m_tf_attributes) { func->addAttributes(llvm::AttributeSet::FunctionIndex, *m_tf_attributes); } diff --git a/ispc.h b/ispc.h index bf6d2642..d68f9034 100644 --- a/ispc.h +++ b/ispc.h @@ -40,8 +40,8 @@ #define ISPC_VERSION "1.4.5dev" -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) -#error "Only LLVM 3.1, 3.2, 3.3 and the 3.4 development branch are supported" +#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) +#error "Only LLVM 3.2, 3.3 and the 3.4 development branch are supported" #endif #if defined(_WIN32) || defined(_WIN64) @@ -72,11 +72,7 @@ namespace llvm { class BasicBlock; class Constant; class ConstantValue; -#if defined(LLVM_3_1) - class TargetData; -#else class DataLayout; -#endif class DIBuilder; class DIDescriptor; class DIFile; @@ -226,11 +222,7 @@ public: // Note the same name of method for 3.1 and 3.2+, this allows // to reduce number ifdefs on client side. -#if defined(LLVM_3_1) - llvm::TargetData *getDataLayout() const {return m_targetData;} -#else llvm::DataLayout *getDataLayout() const {return m_dataLayout;} -#endif /** Reports if Target object has valid state. */ bool isValid() const {return m_valid;} @@ -278,11 +270,7 @@ private: */ llvm::TargetMachine *m_targetMachine; -#if defined(LLVM_3_1) - llvm::TargetData *m_targetData; -#else llvm::DataLayout *m_dataLayout; -#endif /** flag to report invalid state after construction (due to bad parameters passed to constructor). */ @@ -303,7 +291,7 @@ private: /** Target-specific attribute string to pass along to the LLVM backend */ std::string m_attributes; -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) +#if !defined(LLVM_3_2) /** Target-specific LLVM attribute, which has to be attached to every function to ensure that it is generated for correct target architecture. 
This is requirement was introduced in LLVM 3.3 */ diff --git a/llvmutil.cpp b/llvmutil.cpp index 180c8676..2f54a2fe 100644 --- a/llvmutil.cpp +++ b/llvmutil.cpp @@ -38,7 +38,7 @@ #include "llvmutil.h" #include "ispc.h" #include "type.h" -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #else diff --git a/llvmutil.h b/llvmutil.h index d6c5ede0..d1803f32 100644 --- a/llvmutil.h +++ b/llvmutil.h @@ -38,7 +38,7 @@ #ifndef ISPC_LLVMUTIL_H #define ISPC_LLVMUTIL_H 1 -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include diff --git a/main.cpp b/main.cpp index de2bb620..4c4b4575 100644 --- a/main.cpp +++ b/main.cpp @@ -62,9 +62,7 @@ static void lPrintVersion() { printf("Intel(r) SPMD Program Compiler (ispc), %s (build %s @ %s, LLVM %s)\n", ISPC_VERSION, BUILD_VERSION, BUILD_DATE, -#if defined(LLVM_3_1) - "3.1" -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) "3.2" #elif defined(LLVM_3_3) "3.3" diff --git a/module.cpp b/module.cpp index 755a5dc4..eba5eb3b 100644 --- a/module.cpp +++ b/module.cpp @@ -64,7 +64,7 @@ #define strcasecmp stricmp #endif -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include @@ -86,9 +86,7 @@ #include #include #include -#if defined(LLVM_3_1) - #include -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #else // LLVM 3.3+ @@ -202,7 +200,7 @@ lStripUnusedDebugInfo(llvm::Module *module) { // stuff and remove it later on. Removing it is useful, as it // reduces size of the binary significantly (manyfold for small // programs). -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) llvm::MDNode *nodeSPMD = llvm::dyn_cast(cuNode->getOperand(12)); Assert(nodeSPMD != NULL); @@ -797,11 +795,7 @@ Module::AddFunctionDeclaration(const std::string &name, #endif if (functionType->isTask) // This also applies transitively to members I think? -#if defined(LLVM_3_1) - function->setDoesNotAlias(1, true); -#else // LLVM 3.2+ function->setDoesNotAlias(1); -#endif g->target->markFuncWithTargetAttr(function); @@ -850,12 +844,7 @@ Module::AddFunctionDeclaration(const std::string &name, // NOTE: LLVM indexes function parameters starting from 1. // This is unintuitive. -#if defined(LLVM_3_1) - function->setDoesNotAlias(i+1, true); -#else function->setDoesNotAlias(i+1); -#endif - #if 0 int align = 4 * RoundUpPow2(g->target->nativeVectorWidth); function->addAttribute(i+1, llvm::Attribute::constructAlignmentFromInt(align)); @@ -1067,7 +1056,7 @@ Module::writeObjectFileOrAssembly(llvm::TargetMachine *targetMachine, llvm::TargetMachine::CodeGenFileType fileType = (outputType == Object) ? llvm::TargetMachine::CGFT_ObjectFile : llvm::TargetMachine::CGFT_AssemblyFile; bool binary = (fileType == llvm::TargetMachine::CGFT_ObjectFile); -#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3) +#if defined(LLVM_3_2) || defined(LLVM_3_3) unsigned int flags = binary ? llvm::raw_fd_ostream::F_Binary : 0; #else llvm::sys::fs::OpenFlags flags = binary ? 
llvm::sys::fs::F_Binary : @@ -1082,11 +1071,7 @@ Module::writeObjectFileOrAssembly(llvm::TargetMachine *targetMachine, } llvm::PassManager pm; -#if defined(LLVM_3_1) - pm.add(new llvm::TargetData(*g->target->getDataLayout())); -#else pm.add(new llvm::DataLayout(*g->target->getDataLayout())); -#endif llvm::formatted_raw_ostream fos(of->os()); @@ -1800,22 +1785,12 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre llvm::raw_fd_ostream stderrRaw(2, false); -#if defined(LLVM_3_1) - clang::TextDiagnosticPrinter *diagPrinter = - new clang::TextDiagnosticPrinter(stderrRaw, clang::DiagnosticOptions()); -#else clang::DiagnosticOptions *diagOptions = new clang::DiagnosticOptions(); clang::TextDiagnosticPrinter *diagPrinter = new clang::TextDiagnosticPrinter(stderrRaw, diagOptions); -#endif llvm::IntrusiveRefCntPtr diagIDs(new clang::DiagnosticIDs); -#if defined(LLVM_3_1) - clang::DiagnosticsEngine *diagEngine = - new clang::DiagnosticsEngine(diagIDs, diagPrinter); -#else clang::DiagnosticsEngine *diagEngine = new clang::DiagnosticsEngine(diagIDs, diagOptions, diagPrinter); -#endif inst.setDiagnostics(diagEngine); clang::TargetOptions &options = inst.getTargetOpts(); @@ -1825,7 +1800,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre } options.Triple = triple.getTriple(); -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) clang::TargetInfo *target = clang::TargetInfo::CreateTargetInfo(inst.getDiagnostics(), options); #else // LLVM 3.3+ @@ -1835,18 +1810,14 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre inst.setTarget(target); inst.createSourceManager(inst.getFileManager()); -#if defined(LLVM_3_1) - inst.InitializeSourceManager(infilename); -#else clang::FrontendInputFile inputFile(infilename, clang::IK_None); inst.InitializeSourceManager(inputFile); -#endif // Don't remove comments in the preprocessor, so that we can accurately // track the source file position by handling them ourselves. 
inst.getPreprocessorOutputOpts().ShowComments = 1; -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) // LLVM 3.3+ +#if !defined(LLVM_3_2) // LLVM 3.3+ inst.getPreprocessorOutputOpts().ShowCPP = 1; #endif @@ -1858,7 +1829,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre headerOpts.Verbose = 1; for (int i = 0; i < (int)g->includePath.size(); ++i) { headerOpts.AddPath(g->includePath[i], clang::frontend::Angled, -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) true /* is user supplied */, #endif false /* not a framework */, @@ -1913,11 +1884,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre } } -#if defined(LLVM_3_1) - inst.getLangOpts().BCPLComment = 1; -#else inst.getLangOpts().LineComment = 1; -#endif inst.createPreprocessor(); diagPrinter->BeginSourceFile(inst.getLangOpts(), &inst.getPreprocessor()); diff --git a/opt.cpp b/opt.cpp index 8efdbc67..8c86368e 100644 --- a/opt.cpp +++ b/opt.cpp @@ -48,7 +48,7 @@ #include #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include @@ -73,9 +73,7 @@ #include #include #include -#if defined(LLVM_3_1) - #include -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #else // LLVM 3.3+ #include @@ -85,11 +83,7 @@ #include #include #include -#if defined(LLVM_3_1) - #include -#else - #include -#endif +#include #include #ifdef ISPC_IS_LINUX #include @@ -415,18 +409,14 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(targetLibraryInfo); -#if defined(LLVM_3_1) - optPM.add(new llvm::TargetData(*g->target->getDataLayout())); -#else optPM.add(new llvm::DataLayout(*g->target->getDataLayout())); llvm::TargetMachine *targetMachine = g->target->GetTargetMachine(); - #ifdef LLVM_3_2 +#ifdef LLVM_3_2 optPM.add(new llvm::TargetTransformInfo(targetMachine->getScalarTargetTransformInfo(), targetMachine->getVectorTargetTransformInfo())); - #else // LLVM 3.3+ +#else // LLVM 3.3+ targetMachine->addAnalysisPasses(optPM); - #endif #endif optPM.add(llvm::createIndVarSimplifyPass()); @@ -505,7 +495,7 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createArgumentPromotionPass()); -#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3) +#if defined(LLVM_3_2) || defined(LLVM_3_3) // Starting from 3.4 this functionality was moved to // InstructionCombiningPass. See r184459 for details. 
optPM.add(llvm::createSimplifyLibCallsPass()); diff --git a/stmt.cpp b/stmt.cpp index 4ec63d35..412b0dd9 100644 --- a/stmt.cpp +++ b/stmt.cpp @@ -48,7 +48,7 @@ #include #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include diff --git a/type.cpp b/type.cpp index 5fa1845b..11a165f5 100644 --- a/type.cpp +++ b/type.cpp @@ -43,20 +43,15 @@ #include #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #else #include #include #endif -#if defined(LLVM_3_1) - #include - #include -#else - #include - #include -#endif +#include +#include #include @@ -819,11 +814,8 @@ EnumType::GetDIType(llvm::DIDescriptor scope) const { m->diBuilder->createEnumerationType(scope, name, diFile, pos.first_line, 32 /* size in bits */, 32 /* align in bits */, - elementArray -#if !defined(LLVM_3_1) - , llvm::DIType() -#endif - ); + elementArray, + llvm::DIType()); switch (variability.type) { @@ -2139,7 +2131,7 @@ StructType::GetDIType(llvm::DIDescriptor scope) const { currentSize, // Size in bits align, // Alignment in bits 0, // Flags -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) +#if !defined(LLVM_3_2) llvm::DIType(), // DerivedFrom #endif elements); @@ -2382,7 +2374,7 @@ UndefinedStructType::GetDIType(llvm::DIDescriptor scope) const { 0, // Size 0, // Align 0, // Flags -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) +#if !defined(LLVM_3_2) llvm::DIType(), // DerivedFrom #endif elements); @@ -2645,12 +2637,8 @@ ReferenceType::GetDIType(llvm::DIDescriptor scope) const { } llvm::DIType diTargetType = targetType->GetDIType(scope); -#if defined(LLVM_3_1) - return m->diBuilder->createReferenceType(diTargetType); -#else return m->diBuilder->createReferenceType(llvm::dwarf::DW_TAG_reference_type, diTargetType); -#endif } diff --git a/type.h b/type.h index 880f8574..a6a52e10 100644 --- a/type.h +++ b/type.h @@ -40,7 +40,7 @@ #include "ispc.h" #include "util.h" -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #else diff --git a/util.cpp b/util.cpp index dbea9517..4be863bf 100644 --- a/util.cpp +++ b/util.cpp @@ -65,9 +65,7 @@ #include #include -#if defined(LLVM_3_1) - #include -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #else // LLVM 3.3+ #include @@ -616,13 +614,8 @@ VerifyDataLayoutCompatibility(const std::string &module_dl, // which contradic: f80:128:128 followed by f80:32:32. This is a bug, but // correct thing to do is to interpret this exactly how LLVM would treat it, // so we create a DataLayout class and take its string representation. -#if defined(LLVM_3_1) - llvm::TargetData d1(module_dl); - llvm::TargetData d2(lib_dl); -#else // LLVM 3.2+ llvm::DataLayout d1(module_dl); llvm::DataLayout d2(lib_dl); -#endif std::string module_dl_canonic = d1.getStringRepresentation(); std::string lib_dl_canonic = d2.getStringRepresentation(); From d9c38b5c1f6c1ccb4920465789b9e3d451e302a8 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 18 Jul 2013 09:24:23 -0700 Subject: [PATCH 019/124] Remove support for using SVML for math lib routines. This path was poorly maintained and wasn't actually available on most targets. 
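(Aside, not part of any patch in this series: the util.cpp hunk just above keeps comparing datalayout strings only after round-tripping them through llvm::DataLayout, so that redundant or contradictory entries are resolved exactly the way LLVM itself would resolve them. A minimal C++ sketch of that idea follows, assuming the LLVM 3.2/3.3-era API the patched code uses; the helper name is invented for illustration.)

    // Compare two datalayout strings by their canonical llvm::DataLayout form
    // rather than textually.  The header lives at <llvm/DataLayout.h> on
    // LLVM 3.2 and <llvm/IR/DataLayout.h> on LLVM 3.3+.
    #include <string>
    #include <llvm/IR/DataLayout.h>

    static bool lDataLayoutsEquivalent(const std::string &module_dl,
                                       const std::string &lib_dl) {
        llvm::DataLayout d1(module_dl);
        llvm::DataLayout d2(lib_dl);
        // Compare the canonicalized string forms, not the raw inputs.
        return d1.getStringRepresentation() == d2.getStringRepresentation();
    }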
--- builtins.cpp | 11 ---- builtins/target-avx-x2.ll | 17 ------ builtins/target-avx.ll | 17 ------ builtins/target-generic-1.ll | 98 ------------------------------- builtins/target-generic-common.ll | 16 ----- builtins/target-neon-common.ll | 13 ---- builtins/target-sse2-x2.ll | 86 --------------------------- builtins/target-sse2.ll | 60 ------------------- builtins/target-sse4-16.ll | 15 ----- builtins/target-sse4-8.ll | 15 ----- builtins/target-sse4-x2.ll | 86 --------------------------- builtins/target-sse4.ll | 60 ------------------- docs/ispc.rst | 3 - ispc.h | 2 +- main.cpp | 3 - stdlib.ispc | 72 ++++++----------------- 16 files changed, 18 insertions(+), 556 deletions(-) diff --git a/builtins.cpp b/builtins.cpp index 17582d68..d75db43e 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -579,15 +579,6 @@ lSetInternalFunctions(llvm::Module *module) { "__stdlib_sinf", "__stdlib_tan", "__stdlib_tanf", - "__svml_sin", - "__svml_cos", - "__svml_sincos", - "__svml_tan", - "__svml_atan", - "__svml_atan2", - "__svml_exp", - "__svml_log", - "__svml_pow", "__undef_uniform", "__undef_varying", "__vec4_add_float", @@ -1054,8 +1045,6 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod symbolTable); lDefineConstantInt("__math_lib_ispc_fast", (int)Globals::Math_ISPCFast, module, symbolTable); - lDefineConstantInt("__math_lib_svml", (int)Globals::Math_SVML, module, - symbolTable); lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module, symbolTable); lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index d9e0322b..8fb2e427 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -134,23 +134,6 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly always ret <16 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; svml - -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones 4x with our 16-wide -; vectors... - -declare <16 x float> @__svml_sin(<16 x float>) -declare <16 x float> @__svml_cos(<16 x float>) -declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) -declare <16 x float> @__svml_tan(<16 x float>) -declare <16 x float> @__svml_atan(<16 x float>) -declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) -declare <16 x float> @__svml_exp(<16 x float>) -declare <16 x float> @__svml_log(<16 x float>) -declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index 90e2f3ac..adaed9ba 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -134,23 +134,6 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; svml - -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones twice with our 8-wide -; vectors... 
- -declare <8 x float> @__svml_sin(<8 x float>) -declare <8 x float> @__svml_cos(<8 x float>) -declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) -declare <8 x float> @__svml_tan(<8 x float>) -declare <8 x float> @__svml_atan(<8 x float>) -declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) -declare <8 x float> @__svml_exp(<8 x float>) -declare <8 x float> @__svml_log(<8 x float>) -declare <8 x float> @__svml_pow(<8 x float>, <8 x float>) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 3dec76b0..238de444 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -647,104 +647,6 @@ define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alw } - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0) - ;ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm.sin.f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - unary1to1(float,@llvm.sin.f32) - -} - -define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0) - ;ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm.cos.f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - unary1to1(float, @llvm.cos.f32) - -} - -define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline { -; %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0) -; store <1 x float> %s, <1 x float> * %1 -; ret void - %sin = call <1 x float> @__svml_sin (<1 x float> %0) - %cos = call <1 x float> @__svml_cos (<1 x float> %0) - store <1 x float> %sin, <1 x float> * %1 - store <1 x float> %cos, <1 x float> * %2 - ret void -} - -define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0) - ;ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm_tan_f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - ;unasry1to1(float, @llvm.tan.f32) - ; UNSUPPORTED! - ret <1 x float > %0 -} - -define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline { -; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0) -; ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm_atan_f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - ;unsary1to1(float,@llvm.atan.f32) - ;UNSUPPORTED! - ret <1 x float > %0 - -} - -define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1) - ;ret <1 x float> %ret - ;%y = extractelement <1 x float> %0, i32 0 - ;%x = extractelement <1 x float> %1, i32 0 - ;%q = fdiv float %y, %x - ;%a = call float @llvm.atan.f32 (float %q) - ;%rv = insertelement <1 x float> undef, float %a, i32 0 - ;ret <1 x float> %rv - ; UNSUPPORTED! 
- ret <1 x float > %0 -} - -define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_expf4(<1 x float> %0) - ;ret <1 x float> %ret - unary1to1(float, @llvm.exp.f32) -} - -define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_logf4(<1 x float> %0) - ;ret <1 x float> %ret - unary1to1(float, @llvm.log.f32) -} - -define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1) - ;ret <1 x float> %ret - %r = extractelement <1 x float> %0, i32 0 - %e = extractelement <1 x float> %1, i32 0 - %s = call float @llvm.pow.f32(float %r,float %e) - %rv = insertelement <1 x float> undef, float %s, i32 0 - ret <1 x float> %rv - -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 76d1faf3..b581e0a7 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -202,22 +202,6 @@ declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone declare i32 @__count_leading_zeros_i32(i32) nounwind readnone declare i64 @__count_leading_zeros_i64(i64) nounwind readnone -;; svml - -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones twice with our 8-wide -; vectors... - -declare @__svml_sin() -declare @__svml_cos() -declare void @__svml_sincos(, *, *) -declare @__svml_tan() -declare @__svml_atan() -declare @__svml_atan2(, ) -declare @__svml_exp() -declare @__svml_log() -declare @__svml_pow(, ) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions diff --git a/builtins/target-neon-common.ll b/builtins/target-neon-common.ll index 696b0748..f892a0a1 100644 --- a/builtins/target-neon-common.ll +++ b/builtins/target-neon-common.ll @@ -313,19 +313,6 @@ define void @__masked_store_blend_i64(* nocapture %ptr, ret void } -;; yuck. We need declarations of these, even though we shouldnt ever -;; actually generate calls to them for the NEON target... 
- -declare @__svml_sin() -declare @__svml_cos() -declare void @__svml_sincos(, *, *) -declare @__svml_tan() -declare @__svml_atan() -declare @__svml_atan2(, ) -declare @__svml_exp() -declare @__svml_log() -declare @__svml_pow(, ) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index da22a66c..057ea98f 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -102,92 +102,6 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_sinf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_cosf4, %0) - ret <8 x float> %ret -} - -define void @__svml_sincos(<8 x float>, <8 x float> *, - <8 x float> *) nounwind readnone alwaysinline { - ; call svml_sincosf4 two times with the two 4-wide sub-vectors - %a = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - %b = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - - %cospa = alloca <4 x float> - %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) - - %cospb = alloca <4 x float> - %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) - - %sin = shufflevector <4 x float> %sa, <4 x float> %sb, - <8 x i32> - store <8 x float> %sin, <8 x float> * %1 - - %cosa = load <4 x float> * %cospa - %cosb = load <4 x float> * %cospb - %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, - <8 x i32> - store <8 x float> %cos, <8 x float> * %2 - - ret void -} - -define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_tanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_atanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan2(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_atan2f4, %0, %1) - ret <8 x float> %ret -} - -define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_expf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_logf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_pow(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_powf4, %0, %1) - ret <8 x float> %ret -} - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git 
a/builtins/target-sse2.ll b/builtins/target-sse2.ll index a6b206b6..e0a5c3d5 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -493,66 +493,6 @@ define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysin ret <4 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) - ret <4 x float> %ret -} - -define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { - %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) - store <4 x float> %s, <4 x float> * %1 - ret void -} - -define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - -define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_expf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_logf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index d1563988..50f0848d 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -205,21 +205,6 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r ret <8 x double> %ret } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; svml - -; FIXME - -declare <8 x float> @__svml_sin(<8 x float>) -declare <8 x float> @__svml_cos(<8 x float>) -declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) -declare <8 x float> @__svml_tan(<8 x float>) -declare <8 x float> @__svml_atan(<8 x float>) -declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) -declare <8 x float> @__svml_exp(<8 x float>) -declare <8 x float> @__svml_log(<8 x float>) -declare <8 x float> @__svml_pow(<8 x float>, <8 x 
float>) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index 85b7bbe7..7fa9075b 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -217,21 +217,6 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin ret <16 x double> %ret } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; svml - -; FIXME - -declare <16 x float> @__svml_sin(<16 x float>) -declare <16 x float> @__svml_cos(<16 x float>) -declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) -declare <16 x float> @__svml_tan(<16 x float>) -declare <16 x float> @__svml_atan(<16 x float>) -declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) -declare <16 x float> @__svml_exp(<16 x float>) -declare <16 x float> @__svml_log(<16 x float>) -declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index e2debbc2..4a447ba6 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -102,92 +102,6 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_sinf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_cosf4, %0) - ret <8 x float> %ret -} - -define void @__svml_sincos(<8 x float>, <8 x float> *, - <8 x float> *) nounwind readnone alwaysinline { - ; call svml_sincosf4 two times with the two 4-wide sub-vectors - %a = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - %b = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - - %cospa = alloca <4 x float> - %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) - - %cospb = alloca <4 x float> - %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) - - %sin = shufflevector <4 x float> %sa, <4 x float> %sb, - <8 x i32> - store <8 x float> %sin, <8 x float> * %1 - - %cosa = load <4 x float> * %cospa - %cosb = load <4 x float> * %cospb - %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, - <8 x i32> - store <8 x float> %cos, <8 x float> * %2 - - ret void -} - -define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_tanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_atanf4, 
%0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan2(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_atan2f4, %0, %1) - ret <8 x float> %ret -} - -define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_expf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_logf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_pow(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_powf4, %0, %1) - ret <8 x float> %ret -} - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 98a7ef69..7f9a9185 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -206,66 +206,6 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r ret <4 x double> %ret } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) - ret <4 x float> %ret -} - -define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { - %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) - store <4 x float> %s, <4 x float> * %1 - ret void -} - -define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - -define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_expf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_logf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/docs/ispc.rst b/docs/ispc.rst index 
39d3a5c8..af59714a 100755 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -3278,9 +3278,6 @@ for this argument. approximately 1.45e-6 over the range -10pi to 10pi.) * ``fast``: more efficient but lower accuracy versions of the default ``ispc`` implementations. -* ``svml``: use Intel "Short Vector Math Library". Use - ``icc`` to link your final executable so that the appropriate libraries - are linked. * ``system``: use the system's math library. On many systems, these functions are more accurate than both of ``ispc``'s implementations. Using these functions may be quite
diff --git a/ispc.h b/ispc.h index d68f9034..8653553e 100644 --- a/ispc.h +++ b/ispc.h @@ -468,7 +468,7 @@ struct Globals { /** There are a number of math libraries that can be used for transcendentals and the like during program compilation. */ - enum MathLib { Math_ISPC, Math_ISPCFast, Math_SVML, Math_System }; + enum MathLib { Math_ISPC, Math_ISPCFast, Math_System }; MathLib mathLib; /** Records whether the ispc standard library should be made available
diff --git a/main.cpp b/main.cpp index 4c4b4575..c21e7f88 100644 --- a/main.cpp +++ b/main.cpp @@ -107,7 +107,6 @@ usage(int ret) { printf(" [--math-lib= [rest of this hunk garbled in extraction; also garbled here are the header of a following patch and a Visual Studio project hunk, whose recoverable content is a custom build rule running "m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-neon.ll | python bitcode2cpp.py builtins\target-neon.ll > gen-bitcode-neon.cpp" (output gen-bitcode-neon.cpp, additional dependency builtins\util.m4, build message "Building gen-bitcode-neon.cpp")]
diff --git a/main.cpp b/main.cpp index b107075c..c6786c39 100644 --- a/main.cpp +++ b/main.cpp @@ -300,6 +300,8 @@ int main(int Argc, char *Argv[]) { LLVMInitializeX86Disassembler(); LLVMInitializeX86TargetMC(); #endif // !__ARM__ + +#ifdef ISPC_ARM_ENABLED // Generating ARM from x86 is more likely to be useful, though. LLVMInitializeARMTargetInfo(); LLVMInitializeARMTarget(); @@ -307,6 +309,7 @@ LLVMInitializeARMAsmParser(); LLVMInitializeARMDisassembler(); LLVMInitializeARMTargetMC(); +#endif char *file = NULL; const char *headerFileName = NULL;
From 5b20b06bd9c75d84e78749b752716d6f2088b8d1 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Sat, 3 Aug 2013 20:44:25 -0700 Subject: [PATCH 024/124] Add avg_{up,down}_int{8,16} routines to stdlib These compute the average of two given values, rounding up and down, respectively, if the result isn't exact. When possible, these are mapped to target-specific intrinsics (PAVG[BW] on IA and VH[R]ADD[US] on NEON.) (A subsequent commit will add pattern-matching to generate calls to these intrinsics when the corresponding patterns are detected in the IR.)
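(For reference only, not part of the patch: the rounding behavior described in the commit message is easiest to see in scalar form. The C++ sketch below, with invented helper names, mirrors what the unsigned 8-bit variants compute: widen to 16 bits so the intermediate sum cannot overflow, add, shift, and truncate, which is the same shape as the m4-generated fallback IR added to util.m4 further down.)

    #include <cstdint>

    // avg_up: (a + b + 1) / 2 -- halfway cases round up.
    static inline uint8_t refAvgUpUint8(uint8_t a, uint8_t b) {
        return (uint8_t)((uint16_t(a) + uint16_t(b) + 1) >> 1);
    }

    // avg_down: (a + b) / 2 -- halfway cases round down.
    static inline uint8_t refAvgDownUint8(uint8_t a, uint8_t b) {
        return (uint8_t)((uint16_t(a) + uint16_t(b)) >> 1);
    }

    // e.g. refAvgUpUint8(3, 4) == 4 and refAvgDownUint8(3, 4) == 3;
    // when a + b is even, the two agree.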
--- builtins/target-avx-common.ll | 6 ++ builtins/target-generic-1.ll | 6 ++ builtins/target-generic-common.ll | 5 ++ builtins/target-neon-16.ll | 59 ++++++++++++++ builtins/target-neon-32.ll | 59 ++++++++++++++ builtins/target-neon-8.ll | 75 +++++++++++++++++ builtins/target-sse2-common.ll | 4 + builtins/target-sse4-16.ll | 31 ++++++++ builtins/target-sse4-8.ll | 25 ++++++ builtins/target-sse4-x2.ll | 6 ++ builtins/target-sse4.ll | 6 ++ builtins/util.m4 | 128 ++++++++++++++++++++++++++++-- docs/ispc.rst | 25 ++++++ opt.cpp | 8 ++ stdlib.ispc | 60 +++++++++++--- tests/avg-down-int16.ispc | 13 +++ tests/avg-down-int8.ispc | 13 +++ tests/avg-down-uint16.ispc | 13 +++ tests/avg-down-uint8.ispc | 13 +++ tests/avg-up-int16.ispc | 13 +++ tests/avg-up-int8.ispc | 13 +++ tests/avg-up-uint16.ispc | 13 +++ tests/avg-up-uint8.ispc | 13 +++ 23 files changed, 592 insertions(+), 15 deletions(-) create mode 100644 tests/avg-down-int16.ispc create mode 100644 tests/avg-down-int8.ispc create mode 100644 tests/avg-down-uint16.ispc create mode 100644 tests/avg-down-uint8.ispc create mode 100644 tests/avg-up-int16.ispc create mode 100644 tests/avg-up-int8.ispc create mode 100644 tests/avg-up-uint16.ispc create mode 100644 tests/avg-up-uint8.ispc diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll index dcbe0a66..1d317713 100644 --- a/builtins/target-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -277,3 +277,9 @@ define double @__max_uniform_double(double, double) nounwind readnone alwaysinli sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1) ret double %ret } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 238de444..3472c207 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -864,3 +864,9 @@ declare float @__half_to_float_uniform(i16 %v) nounwind readnone declare @__half_to_float_varying( %v) nounwind readnone declare i16 @__float_to_half_uniform(float %v) nounwind readnone declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index b581e0a7..c683ff45 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -364,3 +364,8 @@ declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-neon-16.ll b/builtins/target-neon-16.ll index fd15eb0b..a0575927 100644 --- a/builtins/target-neon-16.ll +++ b/builtins/target-neon-16.ll @@ -456,3 +456,62 @@ define i64 @__reduce_min_uint64() nounwind readnone { define i64 @__reduce_max_uint64() nounwind readnone { reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 + +declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i8> 
@llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_up_int8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_down_uint8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_down_int8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_up_int16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_down_uint16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_down_int16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} diff --git a/builtins/target-neon-32.ll b/builtins/target-neon-32.ll index 1f8003d7..30b062c9 100644 --- a/builtins/target-neon-32.ll +++ b/builtins/target-neon-32.ll @@ -426,3 +426,62 @@ define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone { define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone { reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64) } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 + +declare <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone + +define <4 x i8> @__avg_up_uint8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r +} + +declare <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8>, <4 x i8>) nounwind readnone + +define <4 x i8> @__avg_up_int8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r +} + +declare <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone + +define <4 x i8> @__avg_down_uint8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r +} + +declare <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8>, <4 x i8>) nounwind readnone + +define <4 x i8> @__avg_down_int8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r +} + +declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone + +define <4 x i16> @__avg_up_uint16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> 
%0, <4 x i16> %1) + ret <4 x i16> %r +} + +declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone + +define <4 x i16> @__avg_up_int16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} + +declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone + +define <4 x i16> @__avg_down_uint16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} + +declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone + +define <4 x i16> @__avg_down_int16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} diff --git a/builtins/target-neon-8.ll b/builtins/target-neon-8.ll index eb65f224..2accfe53 100644 --- a/builtins/target-neon-8.ll +++ b/builtins/target-neon-8.ll @@ -506,3 +506,78 @@ define i64 @__reduce_min_uint64() nounwind readnone { define i64 @__reduce_max_uint64() nounwind readnone { reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64) } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_up_int8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_down_uint8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_down_int8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_up_int16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_down_uint16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 
x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_down_int16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index c6a3afe2..ad1d88bc 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -269,4 +269,8 @@ define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline { ret i64 %val } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index 50f0848d..b4772552 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -449,3 +449,34 @@ gen_scatter(i32) gen_scatter(float) gen_scatter(i64) gen_scatter(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret <8 x i8> %r +} + +declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) { + %r = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +define_avg_up_int8() +define_avg_up_int16() +define_down_avgs() diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index 7fa9075b..a75d8e3a 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -456,3 +456,28 @@ gen_scatter(i32) gen_scatter(float) gen_scatter(i64) gen_scatter(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +define_avg_up_int8() +define_avg_up_int16() +define_down_avgs() diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 4a447ba6..897a09eb 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -573,3 +573,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1) ret <8 x double> 
%ret } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 7f9a9185..5429b461 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -473,3 +473,9 @@ gen_scatter(i32) gen_scatter(float) gen_scatter(i64) gen_scatter(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/util.m4 b/builtins/util.m4 index 025030d5..95e3844d 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -49,9 +49,9 @@ define(`MASK_HIGH_BIT_ON', ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; vector assembly and deconstruction utilities +;; vector deconstruction utilities ;; split 8-wide vector into 2 4-wide vectors -;; +;; ;; $1: vector element type ;; $2: 8-wide vector ;; $3: first 4-wide vector @@ -71,10 +71,6 @@ define(`v16tov8', ` <8 x i32> ') -;; 4-wide into 2 2-wide -;; args as above -;; - define(`v4tov2', ` $3 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> $4 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> @@ -96,6 +92,20 @@ define(`v16tov4', ` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; vector assembly: wider vector from two narrower vectors +;; +;; $1: vector element type +;; $2: first n-wide vector +;; $3: second n-wide vector +;; $4: result 2*n-wide vector +define(`v8tov16', ` + $4 = shufflevector <8 x $1> $2, <8 x $1> $3, + <16 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Helper macro for calling various SSE instructions for scalar values ;; but where the instruction takes a vector parameter. 
;; $1 : name of variable to put the final value in @@ -4276,3 +4286,109 @@ define i1 @__rdrand_i64(i64 * %ptr) { ret i1 %good } ') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define(`define_avg_up_uint8', ` +define @__avg_up_uint8(, ) { + %a16 = zext %0 to + %b16 = zext %1 to + %sum1 = add %a16, %b16 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_int8', ` +define @__avg_up_int8(, ) { + %a16 = sext %0 to + %b16 = sext %1 to + %sum1 = add %a16, %b16 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_uint16', ` +define @__avg_up_uint16(, ) { + %a32 = zext %0 to + %b32 = zext %1 to + %sum1 = add %a32, %b32 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_int16', ` +define @__avg_up_int16(, ) { + %a32 = sext %0 to + %b32 = sext %1 to + %sum1 = add %a32, %b32 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_uint8', ` +define @__avg_down_uint8(, ) { + %a16 = zext %0 to + %b16 = zext %1 to + %sum = add %a16, %b16 + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_int8', ` +define @__avg_down_int8(, ) { + %a16 = sext %0 to + %b16 = sext %1 to + %sum = add %a16, %b16 + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_uint16', ` +define @__avg_down_uint16(, ) { + %a32 = zext %0 to + %b32 = zext %1 to + %sum = add %a32, %b32 + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_int16', ` +define @__avg_down_int16(, ) { + %a32 = sext %0 to + %b32 = sext %1 to + %sum = add %a32, %b32 + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_up_avgs', ` +define_avg_up_uint8() +define_avg_up_int8() +define_avg_up_uint16() +define_avg_up_int16() +') + +define(`define_down_avgs', ` +define_avg_down_uint8() +define_avg_down_int8() +define_avg_down_uint16() +define_avg_down_int16() +') + +define(`define_avgs', ` +define_up_avgs() +define_down_avgs() +') diff --git a/docs/ispc.rst b/docs/ispc.rst index 8456f126..eb8333de 100755 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -3399,6 +3399,31 @@ The ``isnan()`` functions test whether the given value is a floating-point uniform bool isnan(uniform double v) +A number of functions are also available for performing operations on 8- and +16-bit quantities; these map to specialized instructions that perform these +operations on targets that support them. ``avg_up()`` computes the average +of the two values, rounding up if their average is halfway between two +integers (i.e., it computes ``(a+b+1)/2``). 
+ +:: + + int8 avg_up(int8 a, int8 b) + unsigned int8 avg_up(unsigned int8 a, unsigned int8 b) + int16 avg_up(int16 a, int16 b) + unsigned int16 avg_up(unsigned int16 a, unsigned int16 b) + + +``avg_down()`` computes the average of the two values, rounding down (i.e., +it computes ``(a+b)/2``). + +:: + + int8 avg_down(int8 a, int8 b) + unsigned int8 avg_down(unsigned int8 a, unsigned int8 b) + int16 avg_down(int16 a, int16 b) + unsigned int16 avg_down(unsigned int16 a, unsigned int16 b) + + Transcendental Functions ------------------------ diff --git a/opt.cpp b/opt.cpp index 8c86368e..b363f0e1 100644 --- a/opt.cpp +++ b/opt.cpp @@ -4343,6 +4343,14 @@ char MakeInternalFuncsStaticPass::ID = 0; bool MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { const char *names[] = { + "__avg_up_uint8", + "__avg_up_int8", + "__avg_up_uint16", + "__avg_up_int16", + "__avg_down_uint8", + "__avg_down_int8", + "__avg_down_uint16", + "__avg_down_int16", "__fast_masked_vload", "__gather_factored_base_offsets32_i8", "__gather_factored_base_offsets32_i16", "__gather_factored_base_offsets32_i32", "__gather_factored_base_offsets32_i64", diff --git a/stdlib.ispc b/stdlib.ispc index affa7fef..dc94d7e3 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -4812,8 +4812,8 @@ static const uniform int64 __idiv_table_s32[][3] = { }; __declspec(safe) -static unmasked unsigned int8 __fast_idiv(unsigned int8 numerator, - uniform unsigned int8 divisor) { +static unmasked inline unsigned int8 +__fast_idiv(unsigned int8 numerator, uniform unsigned int8 divisor) { uniform int64 method = __idiv_table_u8[divisor-2][0]; uniform int64 multiplier = __idiv_table_u8[divisor-2][1]; uniform int64 shift = __idiv_table_u8[divisor-2][2]; @@ -4833,7 +4833,7 @@ static unmasked unsigned int8 __fast_idiv(unsigned int8 numerator, } __declspec(safe) -static unmasked int8 __fast_idiv(int8 numerator, uniform int8 divisor) { +static unmasked inline int8 __fast_idiv(int8 numerator, uniform int8 divisor) { uniform int8 method = __idiv_table_s8[divisor-2][0]; uniform int16 multiplier = __idiv_table_s8[divisor-2][1]; uniform int8 shift = __idiv_table_s8[divisor-2][2]; @@ -4850,8 +4850,8 @@ static unmasked int8 __fast_idiv(int8 numerator, uniform int8 divisor) { } __declspec(safe) -static unmasked unsigned int16 __fast_idiv(unsigned int16 numerator, - uniform unsigned int16 divisor) { +static unmasked inline unsigned int16 __fast_idiv(unsigned int16 numerator, + uniform unsigned int16 divisor) { uniform int64 method = __idiv_table_u16[divisor-2][0]; uniform int64 multiplier = __idiv_table_u16[divisor-2][1]; uniform int64 shift = __idiv_table_u16[divisor-2][2]; @@ -4871,7 +4871,7 @@ static unmasked unsigned int16 __fast_idiv(unsigned int16 numerator, } __declspec(safe) -static unmasked int16 __fast_idiv(int16 numerator, uniform int16 divisor) { +static unmasked inline int16 __fast_idiv(int16 numerator, uniform int16 divisor) { uniform int64 method = __idiv_table_s16[divisor-2][0]; uniform int64 multiplier = __idiv_table_s16[divisor-2][1]; uniform int64 shift = __idiv_table_s16[divisor-2][2]; @@ -4889,8 +4889,8 @@ static unmasked int16 __fast_idiv(int16 numerator, uniform int16 divisor) { } __declspec(safe) -static unmasked inline unsigned int32 __fast_idiv(unsigned int32 numerator, - uniform unsigned int32 divisor) { +static unmasked inline inline unsigned int32 __fast_idiv(unsigned int32 numerator, + uniform unsigned int32 divisor) { uniform int64 method = __idiv_table_u32[divisor-2][0]; uniform int64 multiplier = __idiv_table_u32[divisor-2][1]; 
uniform int64 shift = __idiv_table_u32[divisor-2][2]; @@ -4910,7 +4910,7 @@ static unmasked inline unsigned int32 __fast_idiv(unsigned int32 numerator, } __declspec(safe) -static unmasked int32 __fast_idiv(int32 numerator, uniform int32 divisor) { +static unmasked inline int32 __fast_idiv(int32 numerator, uniform int32 divisor) { uniform int64 method = __idiv_table_s32[divisor-2][0]; uniform int64 multiplier = __idiv_table_s32[divisor-2][1]; uniform int64 shift = __idiv_table_s32[divisor-2][2]; @@ -4927,3 +4927,45 @@ static unmasked int32 __fast_idiv(int32 numerator, uniform int32 divisor) { } } +/////////////////////////////////////////////////////////////////////////// +// Saturating int8/int16 ops + +__declspec(safe) +static unmasked inline unsigned int8 avg_up(unsigned int8 a, unsigned int8 b) { + return __avg_up_uint8(a, b); +} + +__declspec(safe) +static unmasked inline int8 avg_up(int8 a, int8 b) { + return __avg_up_int8(a, b); +} + +__declspec(safe) +static unmasked inline unsigned int16 avg_up(unsigned int16 a, unsigned int16 b) { + return __avg_up_uint16(a, b); +} + +__declspec(safe) +static unmasked inline int16 avg_up(int16 a, int16 b) { + return __avg_up_int16(a, b); +} + +__declspec(safe) +static unmasked inline unsigned int8 avg_down(unsigned int8 a, unsigned int8 b) { + return __avg_down_uint8(a, b); +} + +__declspec(safe) +static unmasked inline int8 avg_down(int8 a, int8 b) { + return __avg_down_int8(a, b); +} + +__declspec(safe) +static unmasked inline unsigned int16 avg_down(unsigned int16 a, unsigned int16 b) { + return __avg_down_uint16(a, b); +} + +__declspec(safe) +static unmasked inline int16 avg_down(int16 a, int16 b) { + return __avg_down_int16(a, b); +} diff --git a/tests/avg-down-int16.ispc b/tests/avg-down-int16.ispc new file mode 100644 index 00000000..10a3c2a2 --- /dev/null +++ b/tests/avg-down-int16.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float bf) { + int16 a = aFOO[programIndex]; + int16 b = bf; + RET[programIndex] = avg_down(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = ((int)programIndex + 1 + 5) / 2; +} diff --git a/tests/avg-down-int8.ispc b/tests/avg-down-int8.ispc new file mode 100644 index 00000000..67638934 --- /dev/null +++ b/tests/avg-down-int8.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float bf) { + int8 a = aFOO[programIndex]; + int8 b = bf; + RET[programIndex] = avg_down(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = ((int)programIndex + 1 + 5) / 2; +} diff --git a/tests/avg-down-uint16.ispc b/tests/avg-down-uint16.ispc new file mode 100644 index 00000000..70f9185e --- /dev/null +++ b/tests/avg-down-uint16.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float bf) { + unsigned int16 a = aFOO[programIndex]; + unsigned int16 b = bf; + RET[programIndex] = avg_down(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = ((int)programIndex + 1 + 5) / 2; +} diff --git a/tests/avg-down-uint8.ispc b/tests/avg-down-uint8.ispc new file mode 100644 index 00000000..75fbf116 --- /dev/null +++ b/tests/avg-down-uint8.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform 
float bf) { + unsigned int8 a = aFOO[programIndex]; + unsigned int8 b = bf; + RET[programIndex] = avg_down(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = ((int)programIndex + 1 + 5) / 2; +} diff --git a/tests/avg-up-int16.ispc b/tests/avg-up-int16.ispc new file mode 100644 index 00000000..8f557a5b --- /dev/null +++ b/tests/avg-up-int16.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float bf) { + int16 a = aFOO[programIndex]; + int16 b = bf; + RET[programIndex] = avg_up(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = ((int)programIndex + 1 + 5 + 1) / 2; +} diff --git a/tests/avg-up-int8.ispc b/tests/avg-up-int8.ispc new file mode 100644 index 00000000..d0a3b444 --- /dev/null +++ b/tests/avg-up-int8.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float bf) { + int8 a = aFOO[programIndex]; + int8 b = bf; + RET[programIndex] = avg_up(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = ((int)programIndex + 1 + 5 + 1) / 2; +} diff --git a/tests/avg-up-uint16.ispc b/tests/avg-up-uint16.ispc new file mode 100644 index 00000000..273f9f3b --- /dev/null +++ b/tests/avg-up-uint16.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float bf) { + unsigned int16 a = aFOO[programIndex]; + unsigned int16 b = bf; + RET[programIndex] = avg_up(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = ((int)programIndex + 1 + 5 + 1) / 2; +} diff --git a/tests/avg-up-uint8.ispc b/tests/avg-up-uint8.ispc new file mode 100644 index 00000000..d5d02491 --- /dev/null +++ b/tests/avg-up-uint8.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float bf) { + unsigned int8 a = aFOO[programIndex]; + unsigned int8 b = bf; + RET[programIndex] = avg_up(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = ((int)programIndex + 1 + 5 + 1) / 2; +} From ccdbddd388bf494bf3cb4aaf6a90cbb684cd18f0 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 6 Aug 2013 08:59:46 -0700 Subject: [PATCH 025/124] Add peephole optimization to match int8/int16 averages. Match the following patterns in IR, turning them into target-specific intrinsics (e.g. PAVGB on x86) when possible. 
(unsigned int8)(((unsigned int16)a + (unsigned int16)b + 1)/2) (unsigned int8)(((unsigned int16)a + (unsigned int16)b)/2) (unsigned int16)(((unsigned int32)a + (unsigned int32)b + 1)/2) (unsigned int16)(((unsigned int32)a + (unsigned int32)b)/2) (int8)(((int16)a + (int16)b + 1)/2) (int8)(((int16)a + (int16)b)/2) (int16)(((int32)a + (int32)b + 1)/2) (int16)(((int32)a + (int32)b)/2) --- opt.cpp | 393 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 393 insertions(+) diff --git a/opt.cpp b/opt.cpp index b363f0e1..8899c64d 100644 --- a/opt.cpp +++ b/opt.cpp @@ -84,6 +84,7 @@ #include #include #include +#include #include #ifdef ISPC_IS_LINUX #include @@ -103,6 +104,7 @@ static llvm::Pass *CreateIntrinsicsOptPass(); static llvm::Pass *CreateInstructionSimplifyPass(); +static llvm::Pass *CreatePeepholePass(); static llvm::Pass *CreateImproveMemoryOpsPass(); static llvm::Pass *CreateGatherCoalescePass(); @@ -459,6 +461,9 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createDeadInstEliminationPass()); optPM.add(llvm::createCFGSimplificationPass()); + optPM.add(llvm::createPromoteMemoryToRegisterPass()); + optPM.add(llvm::createAggressiveDCEPass()); + if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) { optPM.add(llvm::createInstructionCombiningPass()); @@ -500,6 +505,7 @@ Optimize(llvm::Module *module, int optLevel) { // InstructionCombiningPass. See r184459 for details. optPM.add(llvm::createSimplifyLibCallsPass()); #endif + optPM.add(llvm::createAggressiveDCEPass()); optPM.add(llvm::createInstructionCombiningPass()); optPM.add(llvm::createJumpThreadingPass()); optPM.add(llvm::createCFGSimplificationPass()); @@ -539,6 +545,7 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createIPSCCPPass()); optPM.add(llvm::createDeadArgEliminationPass()); + optPM.add(llvm::createAggressiveDCEPass()); optPM.add(llvm::createInstructionCombiningPass()); optPM.add(llvm::createCFGSimplificationPass()); @@ -581,6 +588,9 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createInstructionCombiningPass()); optPM.add(CreateInstructionSimplifyPass()); + optPM.add(CreatePeepholePass()); + optPM.add(llvm::createFunctionInliningPass()); + optPM.add(llvm::createAggressiveDCEPass()); optPM.add(llvm::createStripDeadPrototypesPass()); optPM.add(CreateMakeInternalFuncsStaticPass()); optPM.add(llvm::createGlobalDCEPass()); @@ -4430,3 +4440,386 @@ static llvm::Pass * CreateMakeInternalFuncsStaticPass() { return new MakeInternalFuncsStaticPass; } + + +/////////////////////////////////////////////////////////////////////////// +// PeepholePass + +class PeepholePass : public llvm::BasicBlockPass { +public: + PeepholePass(); + + const char *getPassName() const { return "Peephole Optimizations"; } + bool runOnBasicBlock(llvm::BasicBlock &BB); + + static char ID; +}; + +char PeepholePass::ID = 0; + +PeepholePass::PeepholePass() + : BasicBlockPass(ID) { +} + +using namespace llvm::PatternMatch; + +template +struct CastClassTypes_match { + Op_t Op; + const llvm::Type *fromType, *toType; + + CastClassTypes_match(const Op_t &OpMatch, const llvm::Type *f, + const llvm::Type *t) + : Op(OpMatch), fromType(f), toType(t) {} + + template + bool match(OpTy *V) { + if (llvm::Operator *O = llvm::dyn_cast(V)) + return (O->getOpcode() == Opcode && Op.match(O->getOperand(0)) && + O->getType() == toType && + O->getOperand(0)->getType() == fromType); + return false; + } +}; + +template +inline 
CastClassTypes_match +m_SExt8To16(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int8VectorType, + LLVMTypes::Int16VectorType); +} + +template +inline CastClassTypes_match +m_ZExt8To16(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int8VectorType, + LLVMTypes::Int16VectorType); +} + + +template +inline CastClassTypes_match +m_Trunc16To8(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int16VectorType, + LLVMTypes::Int8VectorType); +} + +template +inline CastClassTypes_match +m_SExt16To32(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int16VectorType, + LLVMTypes::Int32VectorType); +} + +template +inline CastClassTypes_match +m_ZExt16To32(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int16VectorType, + LLVMTypes::Int32VectorType); +} + + +template +inline CastClassTypes_match +m_Trunc32To16(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int32VectorType, + LLVMTypes::Int16VectorType); +} + +template +struct UDiv2_match { + Op_t Op; + + UDiv2_match(const Op_t &OpMatch) + : Op(OpMatch) {} + + template + bool match(OpTy *V) { + llvm::BinaryOperator *bop; + llvm::ConstantDataVector *cdv; + if ((bop = llvm::dyn_cast(V)) && + (cdv = llvm::dyn_cast(bop->getOperand(1))) && + cdv->getSplatValue() != NULL) { + const llvm::APInt &apInt = cdv->getUniqueInteger(); + + switch (bop->getOpcode()) { + case llvm::Instruction::UDiv: + // divide by 2 + return (apInt.isIntN(2) && Op.match(bop->getOperand(0))); + case llvm::Instruction::LShr: + // shift left by 1 + return (apInt.isIntN(1) && Op.match(bop->getOperand(0))); + default: + return false; + } + } + return false; + } +}; + +template +inline UDiv2_match +m_UDiv2(const V &v) { + return UDiv2_match(v); +} + +template +struct SDiv2_match { + Op_t Op; + + SDiv2_match(const Op_t &OpMatch) + : Op(OpMatch) {} + + template + bool match(OpTy *V) { + llvm::BinaryOperator *bop; + llvm::ConstantDataVector *cdv; + if ((bop = llvm::dyn_cast(V)) && + (cdv = llvm::dyn_cast(bop->getOperand(1))) && + cdv->getSplatValue() != NULL) { + const llvm::APInt &apInt = cdv->getUniqueInteger(); + + switch (bop->getOpcode()) { + case llvm::Instruction::SDiv: + // divide by 2 + return (apInt.isIntN(2) && Op.match(bop->getOperand(0))); + case llvm::Instruction::AShr: + // shift left by 1 + return (apInt.isIntN(1) && Op.match(bop->getOperand(0))); + default: + return false; + } + } + return false; + } +}; + +template +inline SDiv2_match +m_SDiv2(const V &v) { + return SDiv2_match(v); +} +// Returns true if the given function has a call to an intrinsic function +// in its definition. +static bool +lHasIntrinsicInDefinition(llvm::Function *func) { + llvm::Function::iterator bbiter = func->begin(); + for (; bbiter != func->end(); ++bbiter) { + for (llvm::BasicBlock::iterator institer = bbiter->begin(); + institer != bbiter->end(); ++institer) { + if (llvm::isa(institer)) + return true; + } + } + return false; +} + +static llvm::Instruction * +lGetBinaryIntrinsic(const char *name, llvm::Value *opa, llvm::Value *opb) { + llvm::Function *func = m->module->getFunction(name); + Assert(func != NULL); + + // Make sure that the definition of the llvm::Function has a call to an + // intrinsic function in its instructions; otherwise we will generate + // infinite loops where we "helpfully" turn the default implementations + // of target builtins like __avg_up_uint8 that are implemented with plain + // arithmetic ops into recursive calls to themselves. 
+ if (lHasIntrinsicInDefinition(func)) + return lCallInst(func, opa, opb, name); + else + return NULL; +} + +////////////////////////////////////////////////// + +static llvm::Instruction * +lMatchAvgUpUInt8(llvm::Value *inst) { + // (unsigned int8)(((unsigned int16)a + (unsigned int16)b + 1)/2) + llvm::Value *opa, *opb; + const llvm::APInt *delta; + if (match(inst, m_Trunc16To8(m_UDiv2(m_CombineOr( + m_CombineOr( + m_Add(m_ZExt8To16(m_Value(opa)), + m_Add(m_ZExt8To16(m_Value(opb)), m_APInt(delta))), + m_Add(m_Add(m_ZExt8To16(m_Value(opa)), m_APInt(delta)), + m_ZExt8To16(m_Value(opb)))), + m_Add(m_Add(m_ZExt8To16(m_Value(opa)), m_ZExt8To16(m_Value(opb))), + m_APInt(delta))))))) { + if (delta->isIntN(1) == false) + return false; + + return lGetBinaryIntrinsic("__avg_up_uint8", opa, opb); + } + return NULL; +} + + +static llvm::Instruction * +lMatchAvgDownUInt8(llvm::Value *inst) { + // (unsigned int8)(((unsigned int16)a + (unsigned int16)b)/2) + llvm::Value *opa, *opb; + if (match(inst, m_Trunc16To8(m_UDiv2( + m_Add(m_ZExt8To16(m_Value(opa)), + m_ZExt8To16(m_Value(opb))))))) { + return lGetBinaryIntrinsic("__avg_down_uint8", opa, opb); + } + return NULL; +} + +static llvm::Instruction * +lMatchAvgUpUInt16(llvm::Value *inst) { + // (unsigned int16)(((unsigned int32)a + (unsigned int32)b + 1)/2) + llvm::Value *opa, *opb; + const llvm::APInt *delta; + if (match(inst, m_Trunc32To16(m_UDiv2(m_CombineOr( + m_CombineOr( + m_Add(m_ZExt16To32(m_Value(opa)), + m_Add(m_ZExt16To32(m_Value(opb)), m_APInt(delta))), + m_Add(m_Add(m_ZExt16To32(m_Value(opa)), m_APInt(delta)), + m_ZExt16To32(m_Value(opb)))), + m_Add(m_Add(m_ZExt16To32(m_Value(opa)), m_ZExt16To32(m_Value(opb))), + m_APInt(delta))))))) { + if (delta->isIntN(1) == false) + return false; + + return lGetBinaryIntrinsic("__avg_up_uint16", opa, opb); + } + return NULL; +} + + +static llvm::Instruction * +lMatchAvgDownUInt16(llvm::Value *inst) { + // (unsigned int16)(((unsigned int32)a + (unsigned int32)b)/2) + llvm::Value *opa, *opb; + if (match(inst, m_Trunc32To16(m_UDiv2( + m_Add(m_ZExt16To32(m_Value(opa)), + m_ZExt16To32(m_Value(opb))))))) { + return lGetBinaryIntrinsic("__avg_down_uint16", opa, opb); + } + return NULL; +} + + +static llvm::Instruction * +lMatchAvgUpInt8(llvm::Value *inst) { + // (int8)(((int16)a + (int16)b + 1)/2) + llvm::Value *opa, *opb; + const llvm::APInt *delta; + if (match(inst, m_Trunc16To8(m_SDiv2(m_CombineOr( + m_CombineOr( + m_Add(m_SExt8To16(m_Value(opa)), + m_Add(m_SExt8To16(m_Value(opb)), m_APInt(delta))), + m_Add(m_Add(m_SExt8To16(m_Value(opa)), m_APInt(delta)), + m_SExt8To16(m_Value(opb)))), + m_Add(m_Add(m_SExt8To16(m_Value(opa)), m_SExt8To16(m_Value(opb))), + m_APInt(delta))))))) { + if (delta->isIntN(1) == false) + return false; + + return lGetBinaryIntrinsic("__avg_up_int8", opa, opb); + } + return NULL; +} + + +static llvm::Instruction * +lMatchAvgDownInt8(llvm::Value *inst) { + // (int8)(((int16)a + (int16)b)/2) + llvm::Value *opa, *opb; + if (match(inst, m_Trunc16To8(m_SDiv2( + m_Add(m_SExt8To16(m_Value(opa)), + m_SExt8To16(m_Value(opb))))))) { + return lGetBinaryIntrinsic("__avg_down_int8", opa, opb); + } + return NULL; +} + +static llvm::Instruction * +lMatchAvgUpInt16(llvm::Value *inst) { + // (int16)(((int32)a + (int32)b + 1)/2) + llvm::Value *opa, *opb; + const llvm::APInt *delta; + if (match(inst, m_Trunc32To16(m_SDiv2(m_CombineOr( + m_CombineOr( + m_Add(m_SExt16To32(m_Value(opa)), + m_Add(m_SExt16To32(m_Value(opb)), m_APInt(delta))), + m_Add(m_Add(m_SExt16To32(m_Value(opa)), m_APInt(delta)), + 
m_SExt16To32(m_Value(opb)))), + m_Add(m_Add(m_SExt16To32(m_Value(opa)), m_SExt16To32(m_Value(opb))), + m_APInt(delta))))))) { + if (delta->isIntN(1) == false) + return false; + + return lGetBinaryIntrinsic("__avg_up_int16", opa, opb); + } + return NULL; +} + +static llvm::Instruction * +lMatchAvgDownInt16(llvm::Value *inst) { + // (int16)(((int32)a + (int32)b)/2) + llvm::Value *opa, *opb; + if (match(inst, m_Trunc32To16(m_SDiv2( + m_Add(m_SExt16To32(m_Value(opa)), + m_SExt16To32(m_Value(opb))))))) { + return lGetBinaryIntrinsic("__avg_down_int16", opa, opb); + } + return NULL; +} + +bool +PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) { + DEBUG_START_PASS("PeepholePass"); + + bool modifiedAny = false; + restart: + for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) { + llvm::Instruction *inst = &*iter; + + llvm::Instruction *builtinCall = NULL; + if (!builtinCall) + builtinCall = lMatchAvgUpUInt8(inst); + if (!builtinCall) + builtinCall = lMatchAvgUpUInt16(inst); + if (!builtinCall) + builtinCall = lMatchAvgDownUInt8(inst); + if (!builtinCall) + builtinCall = lMatchAvgDownUInt16(inst); + if (!builtinCall) + builtinCall = lMatchAvgUpInt8(inst); + if (!builtinCall) + builtinCall = lMatchAvgUpInt16(inst); + if (!builtinCall) + builtinCall = lMatchAvgDownInt8(inst); + if (!builtinCall) + builtinCall = lMatchAvgDownInt16(inst); + + if (builtinCall != NULL) { + llvm::ReplaceInstWithInst(inst, builtinCall); + modifiedAny = true; + goto restart; + } + } + + DEBUG_END_PASS("PeepholePass"); + + return modifiedAny; +} + +static llvm::Pass * +CreatePeepholePass() { + return new PeepholePass; +} From 1276ea98440fc95bdb1388c27217c618cdac3cba Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 6 Aug 2013 17:00:35 -0700 Subject: [PATCH 026/124] Revert "Remove support for building with LLVM 3.1" This reverts commit d3c567503bf64ec9066c09cb8959c31d4aa1be0e. 
Conflicts: opt.cpp --- builtins.cpp | 2 + builtins/target-avx11-x2.ll | 4 +- builtins/target-avx11.ll | 4 +- builtins/target-avx2-x2.ll | 25 +++++++- builtins/target-avx2.ll | 25 +++++++- cbackend.cpp | 115 +++++++++++++++++++++++++++--------- ctx.cpp | 4 +- ctx.h | 11 +++- expr.cpp | 2 +- func.cpp | 10 +++- ispc.cpp | 68 +++++++++++++++++---- ispc.h | 18 +++++- llvmutil.cpp | 2 +- llvmutil.h | 2 +- main.cpp | 4 +- module.cpp | 47 ++++++++++++--- opt.cpp | 22 +++++-- stmt.cpp | 2 +- type.cpp | 26 +++++--- type.h | 2 +- util.cpp | 9 ++- 21 files changed, 320 insertions(+), 84 deletions(-) diff --git a/builtins.cpp b/builtins.cpp index d75db43e..82c45b02 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -49,6 +49,8 @@ #include #if defined(LLVM_3_2) #include +#endif +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #include diff --git a/builtins/target-avx11-x2.ll b/builtins/target-avx11-x2.ll index 2aee1e1c..1aa6345c 100644 --- a/builtins/target-avx11-x2.ll +++ b/builtins/target-avx11-x2.ll @@ -31,7 +31,9 @@ include(`target-avx-x2.ll') -rdrand_definition() +ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', + LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', + `rdrand_definition()') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx11.ll b/builtins/target-avx11.ll index 44593113..fea0a7c2 100644 --- a/builtins/target-avx11.ll +++ b/builtins/target-avx11.ll @@ -31,7 +31,9 @@ include(`target-avx.ll') -rdrand_definition() +ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', + LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', + `rdrand_definition()') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx2-x2.ll b/builtins/target-avx2-x2.ll index 19f1845d..053fd078 100644 --- a/builtins/target-avx2-x2.ll +++ b/builtins/target-avx2-x2.ll @@ -29,11 +29,15 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -define(`HAVE_GATHER', `1') +ifelse(LLVM_VERSION, `LLVM_3_0', `', + LLVM_VERSION, `LLVM_3_1', `', + `define(`HAVE_GATHER', `1')') include(`target-avx-x2.ll') -rdrand_definition() +ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', + LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', + `rdrand_definition()') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max @@ -172,6 +176,21 @@ define(`assemble_4s', ` assemble_8s($1, $2, $2_1, $2_2) ') +ifelse(LLVM_VERSION, `LLVM_3_0', ` +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double)', +LLVM_VERSION, `LLVM_3_1', ` +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double)', ` + gen_gather(i8) gen_gather(i16) @@ -538,3 +557,5 @@ define <16 x double> @__gather64_double(<16 x i64> %ptrs, ret <16 x double> %v } + +') diff --git a/builtins/target-avx2.ll b/builtins/target-avx2.ll index d3410011..f4a0ee07 100644 --- a/builtins/target-avx2.ll +++ b/builtins/target-avx2.ll @@ -29,11 +29,15 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-define(`HAVE_GATHER', `1') +ifelse(LLVM_VERSION, `LLVM_3_0', `', + LLVM_VERSION, `LLVM_3_1', `', + `define(`HAVE_GATHER', `1')') include(`target-avx.ll') -rdrand_definition() +ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', + LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', + `rdrand_definition()') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max @@ -119,6 +123,21 @@ define(`extract_4s', ` %$2_2 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> ') +ifelse(LLVM_VERSION, `LLVM_3_0', ` +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double)', +LLVM_VERSION, `LLVM_3_1', ` +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double)', ` + gen_gather(i8) gen_gather(i16) @@ -410,3 +429,5 @@ define <8 x double> @__gather64_double(<8 x i64> %ptrs, ret <8 x double> %v } + +') diff --git a/cbackend.cpp b/cbackend.cpp index d54f48fb..d23bcc20 100644 --- a/cbackend.cpp +++ b/cbackend.cpp @@ -29,7 +29,7 @@ #include "llvmutil.h" -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/CallingConv.h" @@ -38,7 +38,6 @@ #include "llvm/Intrinsics.h" #include "llvm/IntrinsicInst.h" #include "llvm/InlineAsm.h" - #include "llvm/TypeFinder.h" #else #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -48,10 +47,16 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/InlineAsm.h" - #include "llvm/IR/TypeFinder.h" #endif #include "llvm/Pass.h" #include "llvm/PassManager.h" +#if !defined(LLVM_3_1) + #if defined(LLVM_3_2) + #include "llvm/TypeFinder.h" + #else // LLVM_3_3 + + #include "llvm/IR/TypeFinder.h" + #endif +#endif // LLVM_3_2 + #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/STLExtras.h" @@ -71,7 +76,9 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + #include "llvm/Target/TargetData.h" +#elif defined(LLVM_3_2) #include "llvm/DataLayout.h" #else // LLVM 3.3+ #include "llvm/IR/DataLayout.h" @@ -81,7 +88,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/GetElementPtrTypeIterator.h" -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include "llvm/Support/InstVisitor.h" #else // LLVM 3.3+ #include "llvm/InstVisitor.h" @@ -251,10 +258,14 @@ namespace { const llvm::MCRegisterInfo *MRI; const llvm::MCObjectFileInfo *MOFI; llvm::MCContext *TCtx; +#if defined(LLVM_3_1) + const llvm::TargetData* TD; +#else // FIXME: it's ugly to have the name be "TD" here, but it saves us // lots of ifdefs in the below since the new DataLayout and the old // TargetData have generally similar interfaces... 
const llvm::DataLayout* TD; +#endif std::map FPConstantMap; std::map VectorConstantMap; @@ -341,7 +352,7 @@ namespace { bool isSigned = false, const std::string &VariableName = "", bool IgnoreName = false, -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) const llvm::AttrListPtr &PAL = llvm::AttrListPtr() #else const llvm::AttributeSet &PAL = llvm::AttributeSet() @@ -352,7 +363,7 @@ namespace { const std::string &NameSoFar = ""); void printStructReturnPointerFunctionType(llvm::raw_ostream &Out, -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) const llvm::AttrListPtr &PAL, #else const llvm::AttributeSet &PAL, @@ -575,7 +586,7 @@ std::string CWriter::getArrayName(llvm::ArrayType *AT) { /// return type, except, instead of printing the type as void (*)(Struct*, ...) /// print it as "Struct (*)(...)", for struct return functions. void CWriter::printStructReturnPointerFunctionType(llvm::raw_ostream &Out, -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) const llvm::AttrListPtr &PAL, #else const llvm::AttributeSet &PAL, @@ -594,16 +605,20 @@ void CWriter::printStructReturnPointerFunctionType(llvm::raw_ostream &Out, if (PrintedType) FunctionInnards << ", "; llvm::Type *ArgTy = *I; -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + if (PAL.paramHasAttr(Idx, llvm::Attribute::ByVal)) { +#elif defined(LLVM_3_2) if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) { #else - if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { + if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { #endif assert(ArgTy->isPointerTy()); ArgTy = llvm::cast(ArgTy)->getElementType(); } printType(FunctionInnards, ArgTy, -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + /*isSigned=*/PAL.paramHasAttr(Idx, llvm::Attribute::SExt), +#elif defined(LLVM_3_2) PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt), @@ -620,7 +635,9 @@ void CWriter::printStructReturnPointerFunctionType(llvm::raw_ostream &Out, } FunctionInnards << ')'; printType(Out, RetTy, -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + /*isSigned=*/PAL.paramHasAttr(0, llvm::Attribute::SExt), +#elif defined(LLVM_3_2) PAL.getParamAttributes(0).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(0).hasAttribute(llvm::AttributeSet::ReturnIndex, llvm::Attribute::SExt), @@ -720,7 +737,7 @@ CWriter::printSimpleType(llvm::raw_ostream &Out, llvm::Type *Ty, bool isSigned, llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty, bool isSigned, const std::string &NameSoFar, bool IgnoreName, -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) const llvm::AttrListPtr &PAL #else const llvm::AttributeSet &PAL @@ -742,7 +759,9 @@ llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty, for (llvm::FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); I != E; ++I) { llvm::Type *ArgTy = *I; -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + if (PAL.paramHasAttr(Idx, llvm::Attribute::ByVal)) { +#elif defined(LLVM_3_2) if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) { #else if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { @@ -753,7 +772,9 @@ llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty, if (I != 
FTy->param_begin()) FunctionInnards << ", "; printType(FunctionInnards, ArgTy, -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + /*isSigned=*/PAL.paramHasAttr(Idx, llvm::Attribute::SExt), +#elif defined(LLVM_3_2) PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt), @@ -770,7 +791,9 @@ llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty, } FunctionInnards << ')'; printType(Out, FTy->getReturnType(), -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + /*isSigned=*/PAL.paramHasAttr(0, llvm::Attribute::SExt), +#elif defined(LLVM_3_2) PAL.getParamAttributes(0).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(0).hasAttribute(llvm::AttributeSet::ReturnIndex, llvm::Attribute::SExt), @@ -1949,7 +1972,11 @@ void CWriter::writeOperandWithCast(llvm::Value* Operand, const llvm::ICmpInst &C // directives to cater to specific compilers as need be. // static void generateCompilerSpecificCode(llvm::formatted_raw_ostream& Out, +#if defined(LLVM_3_1) + const llvm::TargetData *TD) { +#else const llvm::DataLayout *TD) { +#endif // We output GCC specific attributes to preserve 'linkonce'ness on globals. // If we aren't being compiled with GCC, just drop these attributes. Out << "#ifndef __GNUC__ /* Can only support \"linkonce\" vars with GCC */\n" @@ -2142,7 +2169,11 @@ bool CWriter::doInitialization(llvm::Module &M) { // Initialize TheModule = &M; +#if defined(LLVM_3_1) + TD = new llvm::TargetData(&M); +#else TD = new llvm::DataLayout(&M); +#endif IL = new llvm::IntrinsicLowering(*TD); IL->AddPrototypes(M); @@ -2625,11 +2656,15 @@ void CWriter::printModuleTypes() { // Get all of the struct types used in the module. std::vector StructTypes; +#if defined(LLVM_3_1) + TheModule->findUsedStructTypes(StructTypes); +#else llvm::TypeFinder typeFinder; typeFinder.run(*TheModule, false); for (llvm::TypeFinder::iterator iter = typeFinder.begin(); iter != typeFinder.end(); ++iter) StructTypes.push_back(*iter); +#endif // Get all of the array types used in the module std::vector ArrayTypes; @@ -2750,7 +2785,7 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { // Loop over the arguments, printing them... 
llvm::FunctionType *FT = llvm::cast(F->getFunctionType()); -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) const llvm::AttrListPtr &PAL = F->getAttributes(); #else const llvm::AttributeSet &PAL = F->getAttributes(); @@ -2784,16 +2819,20 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { else ArgName = ""; llvm::Type *ArgTy = I->getType(); -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + if (PAL.paramHasAttr(Idx, llvm::Attribute::ByVal)) { +#elif defined(LLVM_3_2) if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) { #else - if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { + if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { #endif ArgTy = llvm::cast(ArgTy)->getElementType(); ByValParams.insert(I); } printType(FunctionInnards, ArgTy, -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + /*isSigned=*/PAL.paramHasAttr(Idx, llvm::Attribute::SExt), +#elif defined(LLVM_3_2) PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt), @@ -2819,7 +2858,9 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { for (; I != E; ++I) { if (PrintedArg) FunctionInnards << ", "; llvm::Type *ArgTy = *I; -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + if (PAL.paramHasAttr(Idx, llvm::Attribute::ByVal)) { +#elif defined(LLVM_3_2) if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) { #else if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { @@ -2828,7 +2869,9 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { ArgTy = llvm::cast(ArgTy)->getElementType(); } printType(FunctionInnards, ArgTy, -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + /*isSigned=*/PAL.paramHasAttr(Idx, llvm::Attribute::SExt) +#elif defined(LLVM_3_2) PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt) #else PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt) @@ -2865,7 +2908,9 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { // Print out the return type and the signature built above. printType(Out, RetTy, -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + /*isSigned=*/PAL.paramHasAttr(0, llvm::Attribute::SExt), +#elif defined(LLVM_3_2) PAL.getParamAttributes(0).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(0).hasAttribute(llvm::AttributeSet::ReturnIndex, llvm::Attribute::SExt), @@ -3667,7 +3712,7 @@ void CWriter::lowerIntrinsics(llvm::Function &F) { const char *BuiltinName = ""; #define GET_GCC_BUILTIN_NAME #define Intrinsic llvm::Intrinsic -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include "llvm/Intrinsics.gen" #else #include "llvm/IR/Intrinsics.gen" @@ -3732,7 +3777,7 @@ void CWriter::visitCallInst(llvm::CallInst &I) { // If this is a call to a struct-return function, assign to the first // parameter instead of passing it to the call. 
-#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) const llvm::AttrListPtr &PAL = I.getAttributes(); #else const llvm::AttributeSet &PAL = I.getAttributes(); @@ -3820,7 +3865,9 @@ void CWriter::visitCallInst(llvm::CallInst &I) { (*AI)->getType() != FTy->getParamType(ArgNo)) { Out << '('; printType(Out, FTy->getParamType(ArgNo), -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + /*isSigned=*/PAL.paramHasAttr(ArgNo+1, llvm::Attribute::SExt) +#elif defined(LLVM_3_2) PAL.getParamAttributes(ArgNo+1).hasAttribute(llvm::Attributes::SExt) #else PAL.getParamAttributes(ArgNo+1).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt) @@ -3858,7 +3905,7 @@ bool CWriter::visitBuiltinCall(llvm::CallInst &I, llvm::Intrinsic::ID ID, const char *BuiltinName = ""; #define GET_GCC_BUILTIN_NAME #define Intrinsic llvm::Intrinsic -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include "llvm/Intrinsics.gen" #else #include "llvm/IR/Intrinsics.gen" @@ -4508,8 +4555,13 @@ SmearCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) { smearType, NULL); smearFunc = llvm::dyn_cast(sf); assert(smearFunc != NULL); +#if defined(LLVM_3_1) + smearFunc->setDoesNotThrow(true); + smearFunc->setDoesNotAccessMemory(true); +#else smearFunc->setDoesNotThrow(); smearFunc->setDoesNotAccessMemory(); +#endif } assert(smearFunc != NULL); @@ -4651,8 +4703,13 @@ AndCmpCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) { LLVMTypes::MaskType, NULL); andCmpFunc = llvm::dyn_cast(acf); Assert(andCmpFunc != NULL); +#if defined(LLVM_3_1) + andCmpFunc->setDoesNotThrow(true); + andCmpFunc->setDoesNotAccessMemory(true); +#else andCmpFunc->setDoesNotThrow(); andCmpFunc->setDoesNotAccessMemory(); +#endif } // Set up the function call to the *_and_mask function; the @@ -4857,7 +4914,7 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth, pm.add(new llvm::TargetData(module)); #endif -#if defined(LLVM_3_2) || defined(LLVM_3_3) +#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3) int flags = 0; #else llvm::sys::fs::OpenFlags flags = llvm::sys::fs::F_None; @@ -4882,7 +4939,7 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth, pm.add(llvm::createDeadCodeEliminationPass()); // clean up after smear pass //CO pm.add(llvm::createPrintModulePass(&fos)); pm.add(new CWriter(fos, includeName, vectorWidth)); -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) // This interface is depricated for 3.3+ pm.add(llvm::createGCInfoDeleter()); #endif diff --git a/ctx.cpp b/ctx.cpp index 32ba0ad9..c50d22f9 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -46,7 +46,7 @@ #include "sym.h" #include #include -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #include @@ -3312,7 +3312,7 @@ FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType, // alias analysis. // TODO: what other attributes needs to be copied? // TODO: do the same for varing path. 
-#if !defined (LLVM_3_2) // LLVM 3.3+ +#if !defined (LLVM_3_1) && !defined (LLVM_3_2) // LLVM 3.3+ llvm::CallInst *cc = llvm::dyn_cast(ci); if (cc && cc->getCalledFunction() && diff --git a/ctx.h b/ctx.h index 4b27e6e5..58f9aae3 100644 --- a/ctx.h +++ b/ctx.h @@ -40,15 +40,20 @@ #include "ispc.h" #include -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #else #include #include #endif -#include -#include +#if defined(LLVM_3_1) + #include + #include +#else + #include + #include +#endif struct CFInfo; diff --git a/expr.cpp b/expr.cpp index eb8c0951..856d363c 100644 --- a/expr.cpp +++ b/expr.cpp @@ -56,7 +56,7 @@ #include #include #include -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #include diff --git a/func.cpp b/func.cpp index 3097f86d..b975049b 100644 --- a/func.cpp +++ b/func.cpp @@ -46,7 +46,7 @@ #include "util.h" #include -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #include @@ -310,7 +310,9 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, // isn't worth the code bloat / overhead. bool checkMask = (type->isTask == true) || ( -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + (function->hasFnAttr(llvm::Attribute::AlwaysInline) == false) +#elif defined(LLVM_3_2) (function->getFnAttributes().hasAttribute(llvm::Attributes::AlwaysInline) == false) #else // LLVM 3.3+ (function->getAttributes().getFnAttributes().hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::AlwaysInline) == false) @@ -451,7 +453,11 @@ Function::GenerateIR() { functionName += std::string("_") + g->target->GetISAString(); llvm::Function *appFunction = llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module); +#if defined(LLVM_3_1) + appFunction->setDoesNotThrow(true); +#else appFunction->setDoesNotThrow(); +#endif g->target->markFuncWithTargetAttr(appFunction); diff --git a/ispc.cpp b/ispc.cpp index 03d1aaff..0f07895f 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -48,7 +48,7 @@ #include #include #endif -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #include @@ -57,12 +57,19 @@ #include #include #endif -#include -#include +#if defined(LLVM_3_1) + #include + #include +#else + #include + #include +#endif #include #include #include -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + #include +#elif defined(LLVM_3_2) #include #else // LLVM 3.3+ #include @@ -138,20 +145,27 @@ static const char *supportedCPUs[] = { // cortex-a9 and a15. We should be able to handle any of them that also // have NEON support. 
"cortex-a9", "cortex-a15", - "atom", "penryn", "core2", "corei7", "corei7-avx", "core-avx-i", "core-avx2" + "atom", "penryn", "core2", "corei7", "corei7-avx" +#if !defined(LLVM_3_1) + , "core-avx-i", "core-avx2" +#endif // LLVM 3.2+ }; Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : m_target(NULL), m_targetMachine(NULL), +#if defined(LLVM_3_1) + m_targetData(NULL), +#else m_dataLayout(NULL), +#endif m_valid(false), m_isa(SSE2), m_arch(""), m_is32Bit(true), m_cpu(""), m_attributes(""), -#if !defined(LLVM_3_2) +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) m_tf_attributes(NULL), #endif m_nativeVectorWidth(-1), @@ -393,7 +407,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; +#if !defined(LLVM_3_1) + // LLVM 3.2+ only this->m_hasRand = true; +#endif } else if (!strcasecmp(isa, "avx1.1-x2")) { this->m_isa = Target::AVX11; @@ -403,29 +420,46 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; +#if !defined(LLVM_3_1) + // LLVM 3.2+ only this->m_hasRand = true; +#endif } else if (!strcasecmp(isa, "avx2")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; - this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand,+fma"; + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand" +#ifndef LLVM_3_1 + ",+fma" +#endif // !LLVM_3_1 + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; +#if !defined(LLVM_3_1) + // LLVM 3.2+ only this->m_hasRand = true; this->m_hasGather = true; +#endif } else if (!strcasecmp(isa, "avx2-x2")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; - this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand,+fma"; + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand" +#ifndef LLVM_3_1 + ",+fma" +#endif // !LLVM_3_1 + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; +#if !defined(LLVM_3_1) + // LLVM 3.2+ only this->m_hasRand = true; this->m_hasGather = true; +#endif } else if (!strcasecmp(isa, "neon-8")) { this->m_isa = Target::NEON8; @@ -471,8 +505,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : if (m_isa == Target::NEON8 || m_isa == Target::NEON16 || m_isa == Target::NEON32) options.FloatABIType = llvm::FloatABI::Hard; +#if !defined(LLVM_3_1) if (g->opt.disableFMA == false) options.AllowFPOpFusion = llvm::FPOpFusion::Fast; +#endif // !LLVM_3_1 #ifdef ISPC_IS_WINDOWS if (strcmp("x86", arch) == 0) { @@ -490,8 +526,12 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // Initialize TargetData/DataLayout in 3 steps. // 1. Get default data layout first - std::string dl_string = - m_targetMachine->getDataLayout()->getStringRepresentation(); + std::string dl_string; +#if defined(LLVM_3_1) + dl_string = m_targetMachine->getTargetData()->getStringRepresentation(); +#else + dl_string = m_targetMachine->getDataLayout()->getStringRepresentation(); +#endif // 2. Adjust for generic if (m_isa == Target::GENERIC) { @@ -506,7 +546,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } // 3. 
Finally set member data +#if defined(LLVM_3_1) + m_targetData = new llvm::TargetData(dl_string); +#else m_dataLayout = new llvm::DataLayout(dl_string); +#endif // Set is32Bit // This indicates if we are compiling for 32 bit platform @@ -514,7 +558,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // FIXME: all generic targets are handled as 64 bit, which is incorrect. this->m_is32Bit = (getDataLayout()->getPointerSize() == 4); -#if !defined(LLVM_3_2) +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) // This is LLVM 3.3+ feature. // Initialize target-specific "target-feature" attribute. if (!m_attributes.empty()) { @@ -728,7 +772,7 @@ Target::StructOffset(llvm::Type *type, int element, } void Target::markFuncWithTargetAttr(llvm::Function* func) { -#if !defined(LLVM_3_2) +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) if (m_tf_attributes) { func->addAttributes(llvm::AttributeSet::FunctionIndex, *m_tf_attributes); } diff --git a/ispc.h b/ispc.h index 8653553e..98fcd199 100644 --- a/ispc.h +++ b/ispc.h @@ -40,8 +40,8 @@ #define ISPC_VERSION "1.4.5dev" -#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) -#error "Only LLVM 3.2, 3.3 and the 3.4 development branch are supported" +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) +#error "Only LLVM 3.1, 3.2, 3.3 and the 3.4 development branch are supported" #endif #if defined(_WIN32) || defined(_WIN64) @@ -72,7 +72,11 @@ namespace llvm { class BasicBlock; class Constant; class ConstantValue; +#if defined(LLVM_3_1) + class TargetData; +#else class DataLayout; +#endif class DIBuilder; class DIDescriptor; class DIFile; @@ -222,7 +226,11 @@ public: // Note the same name of method for 3.1 and 3.2+, this allows // to reduce number ifdefs on client side. +#if defined(LLVM_3_1) + llvm::TargetData *getDataLayout() const {return m_targetData;} +#else llvm::DataLayout *getDataLayout() const {return m_dataLayout;} +#endif /** Reports if Target object has valid state. */ bool isValid() const {return m_valid;} @@ -270,7 +278,11 @@ private: */ llvm::TargetMachine *m_targetMachine; +#if defined(LLVM_3_1) + llvm::TargetData *m_targetData; +#else llvm::DataLayout *m_dataLayout; +#endif /** flag to report invalid state after construction (due to bad parameters passed to constructor). */ @@ -291,7 +303,7 @@ private: /** Target-specific attribute string to pass along to the LLVM backend */ std::string m_attributes; -#if !defined(LLVM_3_2) +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) /** Target-specific LLVM attribute, which has to be attached to every function to ensure that it is generated for correct target architecture. 
This is requirement was introduced in LLVM 3.3 */ diff --git a/llvmutil.cpp b/llvmutil.cpp index 2f54a2fe..180c8676 100644 --- a/llvmutil.cpp +++ b/llvmutil.cpp @@ -38,7 +38,7 @@ #include "llvmutil.h" #include "ispc.h" #include "type.h" -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #else diff --git a/llvmutil.h b/llvmutil.h index d1803f32..d6c5ede0 100644 --- a/llvmutil.h +++ b/llvmutil.h @@ -38,7 +38,7 @@ #ifndef ISPC_LLVMUTIL_H #define ISPC_LLVMUTIL_H 1 -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #include diff --git a/main.cpp b/main.cpp index c21e7f88..8d7282f5 100644 --- a/main.cpp +++ b/main.cpp @@ -62,7 +62,9 @@ static void lPrintVersion() { printf("Intel(r) SPMD Program Compiler (ispc), %s (build %s @ %s, LLVM %s)\n", ISPC_VERSION, BUILD_VERSION, BUILD_DATE, -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + "3.1" +#elif defined(LLVM_3_2) "3.2" #elif defined(LLVM_3_3) "3.3" diff --git a/module.cpp b/module.cpp index eba5eb3b..755a5dc4 100644 --- a/module.cpp +++ b/module.cpp @@ -64,7 +64,7 @@ #define strcasecmp stricmp #endif -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #include @@ -86,7 +86,9 @@ #include #include #include -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + #include +#elif defined(LLVM_3_2) #include #include #else // LLVM 3.3+ @@ -200,7 +202,7 @@ lStripUnusedDebugInfo(llvm::Module *module) { // stuff and remove it later on. Removing it is useful, as it // reduces size of the binary significantly (manyfold for small // programs). -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) llvm::MDNode *nodeSPMD = llvm::dyn_cast(cuNode->getOperand(12)); Assert(nodeSPMD != NULL); @@ -795,7 +797,11 @@ Module::AddFunctionDeclaration(const std::string &name, #endif if (functionType->isTask) // This also applies transitively to members I think? +#if defined(LLVM_3_1) + function->setDoesNotAlias(1, true); +#else // LLVM 3.2+ function->setDoesNotAlias(1); +#endif g->target->markFuncWithTargetAttr(function); @@ -844,7 +850,12 @@ Module::AddFunctionDeclaration(const std::string &name, // NOTE: LLVM indexes function parameters starting from 1. // This is unintuitive. +#if defined(LLVM_3_1) + function->setDoesNotAlias(i+1, true); +#else function->setDoesNotAlias(i+1); +#endif + #if 0 int align = 4 * RoundUpPow2(g->target->nativeVectorWidth); function->addAttribute(i+1, llvm::Attribute::constructAlignmentFromInt(align)); @@ -1056,7 +1067,7 @@ Module::writeObjectFileOrAssembly(llvm::TargetMachine *targetMachine, llvm::TargetMachine::CodeGenFileType fileType = (outputType == Object) ? llvm::TargetMachine::CGFT_ObjectFile : llvm::TargetMachine::CGFT_AssemblyFile; bool binary = (fileType == llvm::TargetMachine::CGFT_ObjectFile); -#if defined(LLVM_3_2) || defined(LLVM_3_3) +#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3) unsigned int flags = binary ? llvm::raw_fd_ostream::F_Binary : 0; #else llvm::sys::fs::OpenFlags flags = binary ? 
llvm::sys::fs::F_Binary : @@ -1071,7 +1082,11 @@ Module::writeObjectFileOrAssembly(llvm::TargetMachine *targetMachine, } llvm::PassManager pm; +#if defined(LLVM_3_1) + pm.add(new llvm::TargetData(*g->target->getDataLayout())); +#else pm.add(new llvm::DataLayout(*g->target->getDataLayout())); +#endif llvm::formatted_raw_ostream fos(of->os()); @@ -1785,12 +1800,22 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre llvm::raw_fd_ostream stderrRaw(2, false); +#if defined(LLVM_3_1) + clang::TextDiagnosticPrinter *diagPrinter = + new clang::TextDiagnosticPrinter(stderrRaw, clang::DiagnosticOptions()); +#else clang::DiagnosticOptions *diagOptions = new clang::DiagnosticOptions(); clang::TextDiagnosticPrinter *diagPrinter = new clang::TextDiagnosticPrinter(stderrRaw, diagOptions); +#endif llvm::IntrusiveRefCntPtr diagIDs(new clang::DiagnosticIDs); +#if defined(LLVM_3_1) + clang::DiagnosticsEngine *diagEngine = + new clang::DiagnosticsEngine(diagIDs, diagPrinter); +#else clang::DiagnosticsEngine *diagEngine = new clang::DiagnosticsEngine(diagIDs, diagOptions, diagPrinter); +#endif inst.setDiagnostics(diagEngine); clang::TargetOptions &options = inst.getTargetOpts(); @@ -1800,7 +1825,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre } options.Triple = triple.getTriple(); -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) clang::TargetInfo *target = clang::TargetInfo::CreateTargetInfo(inst.getDiagnostics(), options); #else // LLVM 3.3+ @@ -1810,14 +1835,18 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre inst.setTarget(target); inst.createSourceManager(inst.getFileManager()); +#if defined(LLVM_3_1) + inst.InitializeSourceManager(infilename); +#else clang::FrontendInputFile inputFile(infilename, clang::IK_None); inst.InitializeSourceManager(inputFile); +#endif // Don't remove comments in the preprocessor, so that we can accurately // track the source file position by handling them ourselves. 
inst.getPreprocessorOutputOpts().ShowComments = 1; -#if !defined(LLVM_3_2) // LLVM 3.3+ +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) // LLVM 3.3+ inst.getPreprocessorOutputOpts().ShowCPP = 1; #endif @@ -1829,7 +1858,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre headerOpts.Verbose = 1; for (int i = 0; i < (int)g->includePath.size(); ++i) { headerOpts.AddPath(g->includePath[i], clang::frontend::Angled, -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) true /* is user supplied */, #endif false /* not a framework */, @@ -1884,7 +1913,11 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre } } +#if defined(LLVM_3_1) + inst.getLangOpts().BCPLComment = 1; +#else inst.getLangOpts().LineComment = 1; +#endif inst.createPreprocessor(); diagPrinter->BeginSourceFile(inst.getLangOpts(), &inst.getPreprocessor()); diff --git a/opt.cpp b/opt.cpp index 8899c64d..077320d5 100644 --- a/opt.cpp +++ b/opt.cpp @@ -48,7 +48,7 @@ #include #include -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #include @@ -73,7 +73,9 @@ #include #include #include -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + #include +#elif defined(LLVM_3_2) #include #else // LLVM 3.3+ #include @@ -83,8 +85,12 @@ #include #include #include -#include #include +#if defined(LLVM_3_1) + #include +#else + #include +#endif #include #ifdef ISPC_IS_LINUX #include @@ -411,14 +417,18 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(targetLibraryInfo); +#if defined(LLVM_3_1) + optPM.add(new llvm::TargetData(*g->target->getDataLayout())); +#else optPM.add(new llvm::DataLayout(*g->target->getDataLayout())); llvm::TargetMachine *targetMachine = g->target->GetTargetMachine(); -#ifdef LLVM_3_2 + #ifdef LLVM_3_2 optPM.add(new llvm::TargetTransformInfo(targetMachine->getScalarTargetTransformInfo(), targetMachine->getVectorTargetTransformInfo())); -#else // LLVM 3.3+ + #else // LLVM 3.3+ targetMachine->addAnalysisPasses(optPM); + #endif #endif optPM.add(llvm::createIndVarSimplifyPass()); @@ -500,7 +510,7 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createArgumentPromotionPass()); -#if defined(LLVM_3_2) || defined(LLVM_3_3) +#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3) // Starting from 3.4 this functionality was moved to // InstructionCombiningPass. See r184459 for details. 
optPM.add(llvm::createSimplifyLibCallsPass()); diff --git a/stmt.cpp b/stmt.cpp index 412b0dd9..4ec63d35 100644 --- a/stmt.cpp +++ b/stmt.cpp @@ -48,7 +48,7 @@ #include #include -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #include diff --git a/type.cpp b/type.cpp index 11a165f5..5fa1845b 100644 --- a/type.cpp +++ b/type.cpp @@ -43,15 +43,20 @@ #include #include -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #else #include #include #endif -#include -#include +#if defined(LLVM_3_1) + #include + #include +#else + #include + #include +#endif #include @@ -814,8 +819,11 @@ EnumType::GetDIType(llvm::DIDescriptor scope) const { m->diBuilder->createEnumerationType(scope, name, diFile, pos.first_line, 32 /* size in bits */, 32 /* align in bits */, - elementArray, - llvm::DIType()); + elementArray +#if !defined(LLVM_3_1) + , llvm::DIType() +#endif + ); switch (variability.type) { @@ -2131,7 +2139,7 @@ StructType::GetDIType(llvm::DIDescriptor scope) const { currentSize, // Size in bits align, // Alignment in bits 0, // Flags -#if !defined(LLVM_3_2) +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) llvm::DIType(), // DerivedFrom #endif elements); @@ -2374,7 +2382,7 @@ UndefinedStructType::GetDIType(llvm::DIDescriptor scope) const { 0, // Size 0, // Align 0, // Flags -#if !defined(LLVM_3_2) +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) llvm::DIType(), // DerivedFrom #endif elements); @@ -2637,8 +2645,12 @@ ReferenceType::GetDIType(llvm::DIDescriptor scope) const { } llvm::DIType diTargetType = targetType->GetDIType(scope); +#if defined(LLVM_3_1) + return m->diBuilder->createReferenceType(diTargetType); +#else return m->diBuilder->createReferenceType(llvm::dwarf::DW_TAG_reference_type, diTargetType); +#endif } diff --git a/type.h b/type.h index a6a52e10..880f8574 100644 --- a/type.h +++ b/type.h @@ -40,7 +40,7 @@ #include "ispc.h" #include "util.h" -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #else diff --git a/util.cpp b/util.cpp index 4be863bf..dbea9517 100644 --- a/util.cpp +++ b/util.cpp @@ -65,7 +65,9 @@ #include #include -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + #include +#elif defined(LLVM_3_2) #include #else // LLVM 3.3+ #include @@ -614,8 +616,13 @@ VerifyDataLayoutCompatibility(const std::string &module_dl, // which contradic: f80:128:128 followed by f80:32:32. This is a bug, but // correct thing to do is to interpret this exactly how LLVM would treat it, // so we create a DataLayout class and take its string representation. +#if defined(LLVM_3_1) + llvm::TargetData d1(module_dl); + llvm::TargetData d2(lib_dl); +#else // LLVM 3.2+ llvm::DataLayout d1(module_dl); llvm::DataLayout d2(lib_dl); +#endif std::string module_dl_canonic = d1.getStringRepresentation(); std::string lib_dl_canonic = d2.getStringRepresentation(); From 5e5d42b918852a7aeb12bbc98cf4a5b46e5f9842 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 6 Aug 2013 17:55:37 -0700 Subject: [PATCH 027/124] Fix build with LLVM 3.1 --- opt.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/opt.cpp b/opt.cpp index 3e2efcd8..e1618b7a 100644 --- a/opt.cpp +++ b/opt.cpp @@ -4556,6 +4556,8 @@ PeepholePass::PeepholePass() : BasicBlockPass(ID) { } +#ifndef LLVM_3_1 + using namespace llvm::PatternMatch; template @@ -4706,6 +4708,7 @@ inline SDiv2_match m_SDiv2(const V &v) { return SDiv2_match(v); } + // Returns true if the given function has a call to an intrinsic function // in its definition. 
static bool @@ -4874,6 +4877,7 @@ lMatchAvgDownInt16(llvm::Value *inst) { } return NULL; } +#endif // !LLVM_3_1 bool PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) { @@ -4885,6 +4889,7 @@ PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) { llvm::Instruction *inst = &*iter; llvm::Instruction *builtinCall = NULL; +#ifndef LLVM_3_1 if (!builtinCall) builtinCall = lMatchAvgUpUInt8(inst); if (!builtinCall) @@ -4901,7 +4906,7 @@ PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) { builtinCall = lMatchAvgDownInt8(inst); if (!builtinCall) builtinCall = lMatchAvgDownInt16(inst); - +#endif // LLVM_3_1 if (builtinCall != NULL) { llvm::ReplaceInstWithInst(inst, builtinCall); modifiedAny = true; From 1d76f74b165ee79840a739490fddedbb532a275f Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 7 Aug 2013 12:53:39 -0700 Subject: [PATCH 028/124] Fix compiler warnings --- opt.cpp | 8 ++++---- parse.yy | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/opt.cpp b/opt.cpp index e1618b7a..522e601b 100644 --- a/opt.cpp +++ b/opt.cpp @@ -4756,7 +4756,7 @@ lMatchAvgUpUInt8(llvm::Value *inst) { m_Add(m_Add(m_ZExt8To16(m_Value(opa)), m_ZExt8To16(m_Value(opb))), m_APInt(delta))))))) { if (delta->isIntN(1) == false) - return false; + return NULL; return lGetBinaryIntrinsic("__avg_up_uint8", opa, opb); } @@ -4790,7 +4790,7 @@ lMatchAvgUpUInt16(llvm::Value *inst) { m_Add(m_Add(m_ZExt16To32(m_Value(opa)), m_ZExt16To32(m_Value(opb))), m_APInt(delta))))))) { if (delta->isIntN(1) == false) - return false; + return NULL; return lGetBinaryIntrinsic("__avg_up_uint16", opa, opb); } @@ -4825,7 +4825,7 @@ lMatchAvgUpInt8(llvm::Value *inst) { m_Add(m_Add(m_SExt8To16(m_Value(opa)), m_SExt8To16(m_Value(opb))), m_APInt(delta))))))) { if (delta->isIntN(1) == false) - return false; + return NULL; return lGetBinaryIntrinsic("__avg_up_int8", opa, opb); } @@ -4859,7 +4859,7 @@ lMatchAvgUpInt16(llvm::Value *inst) { m_Add(m_Add(m_SExt16To32(m_Value(opa)), m_SExt16To32(m_Value(opb))), m_APInt(delta))))))) { if (delta->isIntN(1) == false) - return false; + return NULL; return lGetBinaryIntrinsic("__avg_up_int16", opa, opb); } diff --git a/parse.yy b/parse.yy index 4b315776..5fc01cb0 100644 --- a/parse.yy +++ b/parse.yy @@ -2169,7 +2169,7 @@ lAddFunctionParams(Declarator *decl) { /** Add a symbol for the built-in mask variable to the symbol table */ static void lAddMaskToSymbolTable(SourcePos pos) { - const Type *t; + const Type *t = NULL; switch (g->target->getMaskBitCount()) { case 1: t = AtomicType::VaryingBool; From 0c5742b6f88a7b880f27352f652e282d817b92a0 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 8 Aug 2013 19:23:44 -0700 Subject: [PATCH 029/124] Implement new naming scheme for --target. Now targets are named like "-ix", e.g. "sse4-i8x16", or "avx2-i32x16". The old target names are still supported. --- docs/ispc.rst | 94 +++++++++++++++++++++++----------------- ispc.cpp | 116 ++++++++++++++++++++++++++++++-------------------- ispc.h | 12 +++--- main.cpp | 15 +++++-- run_tests.py | 2 +- util.cpp | 10 ++--- util.h | 14 ++++++ 7 files changed, 163 insertions(+), 100 deletions(-) diff --git a/docs/ispc.rst b/docs/ispc.rst index eb8333de..26cf6be3 100755 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -489,54 +489,72 @@ on which you're running ``ispc`` is used to determine the target CPU. ispc foo.ispc -o foo.obj --cpu=corei7-avx -Finally, ``--target`` selects the target instruction set. The following -targets are currently supported: +Finally, ``--target`` selects the target instruction set. 
The target +string is of the form ``[ISA]-i[mask size]x[gang size]``. For example, +``--target=avx2-i32x16`` specifies a target with the AVX2 instruction set, +a mask size of 32 bits, and a gang size of 16. -=========== ========= ======================================= -Target Gang Size Description ------------ --------- --------------------------------------- -avx 8 AVX (2010-2011 era Intel CPUs) -avx-x2 16 "Double-pumped" AVX target, running - twice as many program instances as the - native vector width. -avx1.1 8 AVX 1.1 target (2012 era "Ivybridge" - Intel CPUs). -avx1.1-x2 16 Double-pumped AVX 1.1 target. -avx2 8 AVX 2 target (2013- Intel "Haswell" - CPUs.) -avx2-x2 16 Double-pumped AVX 2 target. -neon-8 16 ARM NEON target, targeting computation - on 8-bit data types. -neon-16 8 ARM NEON target, targeting computation - on 16-bit data types. -neon-32 4 ARM NEON target, targeting computation - on 32-bit data types. -sse2 4 SSE2 (early 2000s era x86 CPUs). -sse2-x2 8 Double-pumped SSE2. -sse4 4 SSE4 (generally 2008-2010 Intel CPUs). -sse4-x2 8 Double-pumped SSE4. -sse4-8 16 SSE4 target targeting computation on - 8-bit data types. -sse4-16 8 SSE4 target targeting computation on - 16-bit data types. -=========== ========= ======================================= +The following target ISAs are supported: + +============ ========================================== +Target Description +------------ ------------------------------------------ +avx, avx1 AVX (2010-2011 era Intel CPUs) +avx1.1 AVX 1.1 (2012 era "Ivybridge" Intel CPUs) +avx2 AVX 2 target (2013- Intel "Haswell" CPUs) +neon ARM NEON +sse2 SSE2 (early 2000s era x86 CPUs) +sse4 SSE4 (generally 2008-2010 Intel CPUs) +============ ========================================== + +Consult your CPU's manual for specifics on which vector instruction set it +supports. + +The mask size may be 8, 16, or 32 bits, though not all combinations of ISAs +and mask sizes are supported. For best performance, the best general +approach is to choose a mask size equal to the size of the most common +datatype in your programs. For example, if most of your computation is on +32-bit floating-point values, an ``i32`` target is appropriate. However, +if you're mostly doing computation on 8-bit images, ``i8`` is a better choice. See `Basic Concepts: Program Instances and Gangs of Program Instances`_ for more discussion of the "gang size" and its implications for program execution. -instruction sets. (As general context, SSE2 was first introduced in -processors that shipped in 2001, SSE4 was introduced in 2007, and -processors with AVX were introduced in 2010, and AVX2 arrived in 2013. -Consult your CPU's -manual for specifics on which vector instruction set it supports.) +Running ``ispc --help`` and looking at the output for the ``--target`` +option gives the most up-to-date documentation about which targets your +compiler binary supports. 
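+For example (an illustrative invocation only; the input and output file
+names are arbitrary), a kernel that mostly processes 8-bit image data
+could be compiled for an SSE4 system with::
+
+   ispc foo.ispc -o foo.obj --target=sse4-i8x16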
+ +The naming scheme for compilation targets changed in August 2013; the +following table shows the relationship between names in the old scheme and +in the new scheme: + +============= =========== +Target Former Name +------------- ----------- +avx1-i32x8 avx, avx1 +avx1-i32x16 avx-x2 +avx1.1-i32x8 avx1.1 +avx1.1-i32x16 avx1.1-x2 +avx2-i32x8 avx2 +avx2-i32x16 avx2-x2 +neon-8 n/a +neon-16 n/a +neon-32 n/a +sse2-i32x4 sse2 +sse2-i32x8 sse2-x2 +sse4-i32x4 sse4 +sse4-i32x8 sse4-x2 +sse4-i8x16 n/a +sse4-i16x8 n/a +============= =========== By default, the target instruction set is chosen based on the most capable one supported by the system on which you're running ``ispc``. You can override this choice with the ``--target`` flag; for example, to select -Intel® SSE2, use ``--target=sse2``. (As with the other options in this -section, see the output of ``ispc --help`` for a full list of supported -targets.) +Intel® SSE2 with a 32-bit mask and 4 program instances in a gang, use +``--target=sse2-i32x4``. (As with the other options in this section, see +the output of ``ispc --help`` for a full list of supported targets.) Generating Generic C++ Output ----------------------------- diff --git a/ispc.cpp b/ispc.cpp index a012b08d..8a0f16c6 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -106,7 +106,7 @@ static void __cpuidex(int info[4], int level, int count) { static const char * lGetSystemISA() { #ifdef __arm__ - return "neon-32"; + return "neon-i32x4"; #else int info[4]; __cpuid(info, 1); @@ -121,19 +121,19 @@ lGetSystemISA() { int info2[4]; __cpuidex(info2, 7, 0); if ((info2[1] & (1 << 5)) != 0) - return "avx2"; + return "avx2-i32x8"; else - return "avx1.1"; + return "avx1.1-i32x8"; } // Regular AVX - return "avx"; + return "avx-i32x8"; } else if ((info[2] & (1 << 19)) != 0) - return "sse4"; + return "sse4-i32x4"; else if ((info[3] & (1 << 26)) != 0) - return "sse2"; + return "sse2-i32x4"; else { - fprintf(stderr, "Unable to detect supported SSE/AVX ISA. Exiting.\n"); + Error(SourcePos(), "Unable to detect supported SSE/AVX ISA. Exiting."); exit(1); } #endif @@ -186,22 +186,22 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // If a CPU was specified explicitly, try to pick the best // possible ISA based on that. if (!strcmp(cpu, "core-avx2")) - isa = "avx2"; + isa = "avx2-i32x8"; #ifdef ISPC_ARM_ENABLED else if (!strcmp(cpu, "cortex-a9") || !strcmp(cpu, "cortex-a15")) - isa = "neon-32"; + isa = "neon-i32x4"; #endif else if (!strcmp(cpu, "core-avx-i")) - isa = "avx1.1"; + isa = "avx1.1-i32x8"; else if (!strcmp(cpu, "sandybridge") || !strcmp(cpu, "corei7-avx")) - isa = "avx"; + isa = "avx-i32x8"; else if (!strcmp(cpu, "corei7") || !strcmp(cpu, "penryn")) - isa = "sse4"; + isa = "sse4-i32x4"; else - isa = "sse2"; + isa = "sse2-i32x4"; Warning(SourcePos(), "No --target specified on command-line. " "Using ISA \"%s\" based on specified CPU \"%s\".", isa, cpu); @@ -211,7 +211,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // supports. isa = lGetSystemISA(); Warning(SourcePos(), "No --target specified on command-line. " - "Using system ISA \"%s\".", isa); + "Using default system target \"%s\".", isa); } } @@ -241,8 +241,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } } if (foundCPU == false) { - fprintf(stderr, "Error: CPU type \"%s\" unknown. Supported CPUs: " - "%s.\n", cpu, SupportedTargetCPUs().c_str()); + Error(SourcePos(), "Error: CPU type \"%s\" unknown. 
Supported CPUs: " + "%s.", cpu, SupportedCPUs().c_str()); return; } } @@ -283,7 +283,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } // Check default LLVM generated targets - if (!strcasecmp(isa, "sse2")) { + if (!strcasecmp(isa, "sse2") || + !strcasecmp(isa, "sse2-i32x4")) { this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; @@ -291,7 +292,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "sse2-x2")) { + else if (!strcasecmp(isa, "sse2-x2") || + !strcasecmp(isa, "sse2-i32x8")) { this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 8; @@ -299,7 +301,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "sse4")) { + else if (!strcasecmp(isa, "sse4") || + !strcasecmp(isa, "sse4-i32x4")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; @@ -308,7 +311,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "sse4x2") || !strcasecmp(isa, "sse4-x2")) { + else if (!strcasecmp(isa, "sse4x2") || + !strcasecmp(isa, "sse4-x2") || + !strcasecmp(isa, "sse4-i32x8")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 8; @@ -316,7 +321,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "sse4-8")) { + else if (!strcasecmp(isa, "sse4-i8x16")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; @@ -324,7 +329,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 8; } - else if (!strcasecmp(isa, "sse4-16")) { + else if (!strcasecmp(isa, "sse4-i16x8")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; @@ -332,7 +337,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 16; } - else if (!strcasecmp(isa, "generic-4")) { + else if (!strcasecmp(isa, "generic-4") || + !strcasecmp(isa, "generic-x4")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; @@ -342,7 +348,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-8")) { + else if (!strcasecmp(isa, "generic-8") || + !strcasecmp(isa, "generic-x8")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; @@ -352,7 +359,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-16")) { + else if (!strcasecmp(isa, "generic-16") || + !strcasecmp(isa, "generic-x16")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; @@ -362,7 +370,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - 
else if (!strcasecmp(isa, "generic-32")) { + else if (!strcasecmp(isa, "generic-32") || + !strcasecmp(isa, "generic-x32")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 32; this->m_vectorWidth = 32; @@ -372,7 +381,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-64")) { + else if (!strcasecmp(isa, "generic-64") || + !strcasecmp(isa, "generic-x64")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 64; this->m_vectorWidth = 64; @@ -382,14 +392,17 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-1")) { + else if (!strcasecmp(isa, "generic-1") || + !strcasecmp(isa, "generic-x1")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 1; this->m_vectorWidth = 1; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "avx") || !strcasecmp(isa, "avx1")) { + else if (!strcasecmp(isa, "avx") || + !strcasecmp(isa, "avx1") || + !strcasecmp(isa, "avx1-i32x8")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; @@ -397,7 +410,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "avx-x2") || !strcasecmp(isa, "avx1-x2")) { + else if (!strcasecmp(isa, "avx-x2") || + !strcasecmp(isa, "avx1-x2") || + !strcasecmp(isa, "avx1-i32x16")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 16; @@ -405,7 +420,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "avx1.1")) { + else if (!strcasecmp(isa, "avx1.1") || + !strcasecmp(isa, "avx1.1-i32x8")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; @@ -418,7 +434,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasRand = true; #endif } - else if (!strcasecmp(isa, "avx1.1-x2")) { + else if (!strcasecmp(isa, "avx1.1-x2") || + !strcasecmp(isa, "avx1.1-i32x16")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 16; @@ -431,7 +448,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasRand = true; #endif } - else if (!strcasecmp(isa, "avx2")) { + else if (!strcasecmp(isa, "avx2") || + !strcasecmp(isa, "avx2-i32x8")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; @@ -449,7 +467,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasGather = true; #endif } - else if (!strcasecmp(isa, "avx2-x2")) { + else if (!strcasecmp(isa, "avx2-x2") || + !strcasecmp(isa, "avx2-i32x16")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; @@ -468,7 +487,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : #endif } #ifdef ISPC_ARM_ENABLED - else if (!strcasecmp(isa, "neon-8")) { + else if (!strcasecmp(isa, "neon-i8x16")) { this->m_isa = Target::NEON8; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; @@ -477,7 +496,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree 
= false; this->m_maskBitCount = 8; } - else if (!strcasecmp(isa, "neon-16")) { + else if (!strcasecmp(isa, "neon-i16x8")) { this->m_isa = Target::NEON16; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; @@ -486,7 +505,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 16; } - else if (!strcasecmp(isa, "neon-32") || !strcasecmp(isa, "neon")) { + else if (!strcasecmp(isa, "neon") || + !strcasecmp(isa, "neon-i32x4")) { this->m_isa = Target::NEON32; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; @@ -497,8 +517,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } #endif else { - fprintf(stderr, "Target ISA \"%s\" is unknown. Choices are: %s\n", - isa, SupportedTargetISAs()); + Error(SourcePos(), "Target \"%s\" is unknown. Choices are: %s.", + isa, SupportedTargets()); error = true; } @@ -592,7 +612,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : std::string -Target::SupportedTargetCPUs() { +Target::SupportedCPUs() { std::string ret; int count = sizeof(supportedCPUs) / sizeof(supportedCPUs[0]); for (int i = 0; i < count; ++i) { @@ -605,7 +625,7 @@ Target::SupportedTargetCPUs() { const char * -Target::SupportedTargetArchs() { +Target::SupportedArchs() { return #ifdef ISPC_ARM_ENABLED "arm, " @@ -615,14 +635,18 @@ Target::SupportedTargetArchs() { const char * -Target::SupportedTargetISAs() { +Target::SupportedTargets() { return #ifdef ISPC_ARM_ENABLED - "neon-8, neon-16, neon-32, " + "neon-i8x16, neon-16x8, neon-32x4, " #endif - "sse2, sse2-x2, sse4, sse4-8, sse4-16, sse4-x2, " - "avx, avx-x2, avx1.1, avx1.1-x2, avx2, avx2-x2, " - "generic-1, generic-4, generic-8, generic-16, generic-32"; + "sse2-i32x4, sse2-i32x8, " + "sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, " + "avx1-i32x8, avx1-i32x16, " + "avx1.1-i32x8, avx1.1-i32x16, " + "avx2-i32x8, avx2-i32x16, " + "generic-x1, generic-x4, generic-x8, generic-x16, " + "generic-x32, generic-x64"; } diff --git a/ispc.h b/ispc.h index 25a03e1d..fc78e415 100644 --- a/ispc.h +++ b/ispc.h @@ -192,16 +192,16 @@ public: Target(const char *arch, const char *cpu, const char *isa, bool pic); /** Returns a comma-delimited string giving the names of the currently - supported target ISAs. */ - static const char *SupportedTargetISAs(); + supported compilation targets. */ + static const char *SupportedTargets(); /** Returns a comma-delimited string giving the names of the currently - supported target CPUs. */ - static std::string SupportedTargetCPUs(); + supported CPUs. */ + static std::string SupportedCPUs(); /** Returns a comma-delimited string giving the names of the currently - supported target architectures. */ - static const char *SupportedTargetArchs(); + supported architectures. */ + static const char *SupportedArchs(); /** Returns a triple string specifying the target architecture, vendor, and environment. 
*/ diff --git a/main.cpp b/main.cpp index 7290d3c8..94edb73f 100644 --- a/main.cpp +++ b/main.cpp @@ -85,13 +85,16 @@ usage(int ret) { printf(" \t\taddressing calculations are done by default, even\n"); printf(" \t\ton 64-bit target architectures.)\n"); printf(" [--arch={%s}]\t\tSelect target architecture\n", - Target::SupportedTargetArchs()); + Target::SupportedArchs()); printf(" [--c++-include-file=]\t\tSpecify name of file to emit in #include statement in generated C++ code.\n"); #ifndef ISPC_IS_WINDOWS printf(" [--colored-output]\t\tAlways use terminal colors in error/warning messages.\n"); #endif - printf(" [--cpu=]\t\t\tSelect target CPU type\n"); - printf(" ={%s}\n", Target::SupportedTargetCPUs().c_str()); + printf(" "); + char cpuHelp[2048]; + sprintf(cpuHelp, "[--cpu=]\t\t\tSelect target CPU type\n={%s}\n", + Target::SupportedCPUs().c_str()); + PrintWithWordBreaks(cpuHelp, 16, TerminalWidth(), stdout); printf(" [-D]\t\t\t\t#define given value when running preprocessor\n"); printf(" [--dev-stub ]\t\tEmit device-side offload stub functions to file\n"); printf(" [--emit-asm]\t\t\tGenerate assembly language file as output\n"); @@ -126,7 +129,11 @@ usage(int ret) { printf(" [--pic]\t\t\t\tGenerate position-independent code\n"); #endif // !ISPC_IS_WINDOWS printf(" [--quiet]\t\t\t\tSuppress all output\n"); - printf(" [--target=]\t\t\tSelect target ISA. ={%s}\n", Target::SupportedTargetISAs()); + printf(" "); + char targetHelp[2048]; + sprintf(targetHelp, "[--target=]\t\t\tSelect target ISA and width.\n" + "={%s}", Target::SupportedTargets()); + PrintWithWordBreaks(targetHelp, 24, TerminalWidth(), stdout); printf(" [--version]\t\t\t\tPrint ispc version\n"); printf(" [--werror]\t\t\t\tTreat warnings as errors\n"); printf(" [--woff]\t\t\t\tDisable warnings\n"); diff --git a/run_tests.py b/run_tests.py index c9dd8b76..3225c7fd 100755 --- a/run_tests.py +++ b/run_tests.py @@ -37,7 +37,7 @@ parser.add_option("-g", "--generics-include", dest="include_file", help="Filenam parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", default="") parser.add_option('-t', '--target', dest='target', - help='Set compilation target (neon8, neon16, neon32, sse2, sse2-x2, sse4, sse4-x2, sse4-8, sse4-16, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', + help='Set compilation target (sse2-i32x4, sse2-i32x8, sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, avx1-i32x8, avx1-i32x16, avx1.1-i32x8, avx1.1-i32x16, avx2-i32x8, avx2-i32x16, generic-x1, generic-x4, generic-x8, generic-x16, generic-x32, generic-x64)', default="sse4") parser.add_option('-a', '--arch', dest='arch', help='Set architecture (arm, x86, x86-64)', diff --git a/util.cpp b/util.cpp index dbea9517..6b121988 100644 --- a/util.cpp +++ b/util.cpp @@ -79,8 +79,8 @@ compiler under a debuffer; in this case, just return a reasonable default. */ -static int -lTerminalWidth() { +int +TerminalWidth() { if (g->disableLineWrap) return 1<<30; @@ -228,8 +228,8 @@ lFindIndent(int numColons, const char *buf) { /** Print the given string to the given FILE, assuming the given output column width. Break words as needed to avoid words spilling past the last column. 
*/ -static void -lPrintWithWordBreaks(const char *buf, int indent, int columnWidth, FILE *out) { +void +PrintWithWordBreaks(const char *buf, int indent, int columnWidth, FILE *out) { #ifdef ISPC_IS_WINDOWS fputs(buf, out); fputs("\n", out); @@ -375,7 +375,7 @@ lPrint(const char *type, bool isError, SourcePos p, const char *fmt, return; printed.insert(formattedBuf); - lPrintWithWordBreaks(formattedBuf, indent, lTerminalWidth(), stderr); + PrintWithWordBreaks(formattedBuf, indent, TerminalWidth(), stderr); lPrintFileLineContext(p); free(errorBuf); diff --git a/util.h b/util.h index b247b8bd..7edf71f7 100644 --- a/util.h +++ b/util.h @@ -156,4 +156,18 @@ void GetDirectoryAndFileName(const std::string ¤tDir, bool VerifyDataLayoutCompatibility(const std::string &module_dl, const std::string &lib_dl); +/** Print the given string to the given FILE, assuming the given output + column width. Break words as needed to avoid words spilling past the + last column. */ +void PrintWithWordBreaks(const char *buf, int indent, int columnWidth, + FILE *out); + +/** Returns the width of the terminal where the compiler is running. + Finding this out may fail in a variety of reasonable situations (piping + compiler output to 'less', redirecting output to a file, running the + compiler under a debuffer; in this case, just return a reasonable + default. + */ +int TerminalWidth(); + #endif // ISPC_UTIL_H From 7ab4c5391cf5c00eae9e557e579402d2a76644fd Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Fri, 9 Aug 2013 19:56:43 -0700 Subject: [PATCH 030/124] Fix build with LLVM 3.2 and generic-4 / examples/sse4.h target. --- examples/intrinsics/sse4.h | 4 ++-- opt.cpp | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index 30f90b31..44dedf33 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -2532,7 +2532,7 @@ static FORCEINLINE int16_t __reduce_add_int8(__vec4_i8 v) { // TODO: improve int16_t ret = 0; for (int i = 0; i < 4; ++i) - ret += v.v[i]; + ret += __extract_element(v, i); return ret; } @@ -2540,7 +2540,7 @@ static FORCEINLINE int32_t __reduce_add_int16(__vec4_i16 v) { // TODO: improve int32_t ret = 0; for (int i = 0; i < 4; ++i) - ret += v.v[i]; + ret += __extract_element(v, i); return ret; } diff --git a/opt.cpp b/opt.cpp index 522e601b..75eae20c 100644 --- a/opt.cpp +++ b/opt.cpp @@ -4556,7 +4556,7 @@ PeepholePass::PeepholePass() : BasicBlockPass(ID) { } -#ifndef LLVM_3_1 +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) using namespace llvm::PatternMatch; @@ -4877,7 +4877,7 @@ lMatchAvgDownInt16(llvm::Value *inst) { } return NULL; } -#endif // !LLVM_3_1 +#endif // !LLVM_3_1 && !LLVM_3_2 bool PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) { @@ -4889,7 +4889,7 @@ PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) { llvm::Instruction *inst = &*iter; llvm::Instruction *builtinCall = NULL; -#ifndef LLVM_3_1 +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) if (!builtinCall) builtinCall = lMatchAvgUpUInt8(inst); if (!builtinCall) @@ -4906,7 +4906,7 @@ PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) { builtinCall = lMatchAvgDownInt8(inst); if (!builtinCall) builtinCall = lMatchAvgDownInt16(inst); -#endif // LLVM_3_1 +#endif // !LLVM_3_1 && !LLVM_3_2 if (builtinCall != NULL) { llvm::ReplaceInstWithInst(inst, builtinCall); modifiedAny = true; From ea8591a85a6ac494ce3395cfbeca17e196a3d463 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Sat, 10 Aug 2013 11:22:43 -0700 Subject: [PATCH 031/124] Fix build with LLVM 
top-of-tree (link libcurses) --- Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile b/Makefile index 5bac4a6e..69e24d41 100644 --- a/Makefile +++ b/Makefile @@ -79,6 +79,10 @@ CLANG_LIBS = -lclangFrontend -lclangDriver \ ISPC_LIBS=$(shell $(LLVM_CONFIG) --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \ -lpthread +ifeq ($(LLVM_VERSION),LLVM_3_4) + ISPC_LIBS += -lcurses +endif + ifeq ($(ARCH_OS),Linux) ISPC_LIBS += -ldl endif From 4766467271a9e6c0702eec04ebd6d8b9725db5f1 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Sat, 10 Aug 2013 11:23:39 -0700 Subject: [PATCH 032/124] Revert ispc.vcxproj to version from top-of-tree. --- ispc.vcxproj | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/ispc.vcxproj b/ispc.vcxproj index 53386c4c..36fbad5d 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -1,4 +1,4 @@ - + @@ -185,25 +185,6 @@ Building gen-bitcode-sse2-x2-64bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp - $(Configuration)/gen-bitcode-avx1-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx1-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > $(Configuration)/gen-bitcode-avx1-64bit.cpp - $(Configuration)/gen-bitcode-avx1-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx1-64bit.cpp - - -======= Document @@ -222,7 +203,6 @@ Building gen-bitcode-avx1-64bit.cpp ->>>>>>> master Document From ed017c42f1933ea1c57242f52cecb45507d9e324 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Sun, 11 Aug 2013 07:47:20 -0700 Subject: [PATCH 033/124] Fix ispc.vcxproj for Windows builds --- ispc.vcxproj | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/ispc.vcxproj b/ispc.vcxproj index 36fbad5d..74186ac0 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -53,8 +53,10 @@ - - + + + + 4146;4800;4996;4355;4624;4005;4003;4018 @@ -97,11 +99,13 @@ Document - %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py x86 > $(Configuration)/gen-stdlib-x86.cpp; -%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DISPC_TARGET_GENERIC=1 -DPI=3.1415926535 | python stdlib2cpp.py generic > $(Configuration)/gen-stdlib-generic.cpp; + %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask1 > $(Configuration)/gen-stdlib-mask1.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask8 > $(Configuration)/gen-stdlib-mask8.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask16 > $(Configuration)/gen-stdlib-mask16.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask32 > $(Configuration)/gen-stdlib-mask32.cpp; - $(Configuration)/gen-stdlib-generic.cpp;$(Configuration)/gen-stdlib-x86.cpp - Building gen-stdlib-{generic,x86}.cpp + 
$(Configuration)/gen-stdlib-mask1.cpp;$(Configuration)/gen-stdlib-mask8.cpp;$(Configuration)/gen-stdlib-mask16.cpp;$(Configuration)/gen-stdlib-mask32.cpp + Building gen-stdlib-{mask1,8,16,32}.cpp From 42f31aed6901f131cf20eb7606db498f43192012 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 14 Aug 2013 11:02:45 -0700 Subject: [PATCH 034/124] Another attempt at fixing the Windows build (added sse4-8/sse4-16 targets). --- ispc.vcxproj | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/ispc.vcxproj b/ispc.vcxproj index 74186ac0..b4a8b764 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -51,6 +51,10 @@ + + + + @@ -135,6 +139,42 @@ Building gen-bitcode-sse4-64bit.cpp + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 32bit > $(Configuration)/gen-bitcode-sse4-8-32bit.cpp + $(Configuration)/gen-bitcode-sse4-8-32bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-8-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 64bit > $(Configuration)/gen-bitcode-sse4-8-64bit.cpp + $(Configuration)/gen-bitcode-sse4-8-64bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-8-64bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 32bit > $(Configuration)/gen-bitcode-sse4-16-32bit.cpp + $(Configuration)/gen-bitcode-sse4-16-32bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-16-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 64bit > $(Configuration)/gen-bitcode-sse4-16-64bit.cpp + $(Configuration)/gen-bitcode-sse4-16-64bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-16-64bit.cpp + + Document From 6be3c24ee5a6accc8157eb20f00d72da060d8644 Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Thu, 15 Aug 2013 15:24:46 -0400 Subject: [PATCH 035/124] Separate -O and -g --- main.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/main.cpp b/main.cpp index c6786c39..b4a02b9f 100644 --- a/main.cpp +++ b/main.cpp @@ -571,12 +571,6 @@ int main(int Argc, char *Argv[]) { } } - // If the user specified -g, then the default optimization level is 0. - // If -g wasn't specified, the default optimization level is 1 (full - // optimization). - if (debugSet && !optSet) - g->opt.level = 0; - if (g->enableFuzzTest) { if (g->fuzzTestSeed == -1) { #ifdef ISPC_IS_WINDOWS From d976da7559089fa9bdc033ad764c73793ad34598 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 20 Aug 2013 08:49:51 -0700 Subject: [PATCH 036/124] Speed up idiv test (dont test int32 as thoroughly) --- tests/idiv.ispc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/idiv.ispc b/tests/idiv.ispc index b7bd78dc..8738740b 100644 --- a/tests/idiv.ispc +++ b/tests/idiv.ispc @@ -44,7 +44,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { // randomly sample int32s... 
uniform RNGState state; seed_rng(&state, 1234); - for (uniform int i = 0; i < 1M; ++i) { + for (uniform int i = 0; i < 64k; ++i) { unsigned int32 num = random(&state); for (uniform unsigned int32 div = 2; div < 256; ++div) { if (__fast_idiv(num, div) != num/div) { @@ -54,7 +54,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { } } - for (uniform int64 i = 0; i < 1M; ++i) { + for (uniform int64 i = 0; i < 64k; ++i) { int32 num = random(&state); if (num < 0) continue; From e7f067d70cf03415fc350272daf0506b7184fa84 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 20 Aug 2013 09:04:52 -0700 Subject: [PATCH 037/124] Fix handling of __clock() builtin for "generic" targets. --- cbackend.cpp | 4 ++++ examples/intrinsics/generic-16.h | 20 ++++++++++++++++++++ examples/intrinsics/generic-32.h | 20 ++++++++++++++++++++ examples/intrinsics/generic-64.h | 20 ++++++++++++++++++++ examples/intrinsics/knc.h | 21 ++++++++++++++++++--- examples/intrinsics/knc2x.h | 19 ++++++++++++++++++- examples/intrinsics/sse4.h | 20 ++++++++++++++++++-- 7 files changed, 118 insertions(+), 6 deletions(-) diff --git a/cbackend.cpp b/cbackend.cpp index d23bcc20..7d4b4cfc 100644 --- a/cbackend.cpp +++ b/cbackend.cpp @@ -3704,6 +3704,7 @@ void CWriter::lowerIntrinsics(llvm::Function &F) { case llvm::Intrinsic::sadd_with_overflow: case llvm::Intrinsic::trap: case llvm::Intrinsic::objectsize: + case llvm::Intrinsic::readcyclecounter: // We directly implement these intrinsics break; default: @@ -4056,6 +4057,9 @@ bool CWriter::visitBuiltinCall(llvm::CallInst &I, llvm::Intrinsic::ID ID, return true; case llvm::Intrinsic::objectsize: return true; + case llvm::Intrinsic::readcyclecounter: + Out << "__clock()"; + return true; } } diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index 6d4fe1f4..d81101f7 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1759,3 +1759,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} + +#endif // !WIN32 + diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 12c4f84e..7e6c69d4 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -1827,3 +1827,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // WIN32 + +#undef FORCEINLINE diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index a3648f42..39124186 100644 --- a/examples/intrinsics/generic-64.h +++ 
b/examples/intrinsics/generic-64.h @@ -1960,3 +1960,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif + +#undef FORCEINLINE diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index 41c4cbc0..8baef8cb 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -2121,9 +2121,24 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, #endif } +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE #undef PRE_ALIGN #undef POST_ALIGN - - - diff --git a/examples/intrinsics/knc2x.h b/examples/intrinsics/knc2x.h index 5b6e5295..a1b1fc9d 100644 --- a/examples/intrinsics/knc2x.h +++ b/examples/intrinsics/knc2x.h @@ -2055,7 +2055,24 @@ static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec32_f *out0, __vec32 } */ +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE #undef PRE_ALIGN #undef POST_ALIGN - diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index 44dedf33..ff00d920 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -4000,6 +4000,22 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, #endif } +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE - - From 2b2905b567fec1725beff5064d6b0ffe21d93c38 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 20 Aug 2013 09:05:50 -0700 Subject: [PATCH 038/124] Fix (preexisting) bugs in generic-32/64.h with type of "__any", etc. This should be a bool, not a one-wide vector of bools. The equivalent fix was previously made in generic-16.h, but not made here. (Note that many tests are still failing with these targets, but at least they compile properly now.) 
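For context, these mask-reduction helpers are part of the interface the generic target headers expose to the C++ code ispc generates. Below is a minimal sketch of the corrected interface plus an illustrative call site; the real generic-32.h defines __vec32_i1 with more machinery, and the usage line is only an example, not taken from the headers:

#include <stdint.h>

// Sketch only: one mask bit per program instance, 32 lanes total.
struct __vec32_i1 { uint32_t v; };

static inline bool __any (__vec32_i1 mask) { return mask.v != 0; }            // some lane active
static inline bool __all (__vec32_i1 mask) { return mask.v == 0xFFFFFFFFul; } // every lane active
static inline bool __none(__vec32_i1 mask) { return mask.v == 0; }            // no lane active

// Returning plain bool (rather than a one-wide vector of bools) lets the
// generated code branch on the result directly, e.g. (illustrative):
//     if (__any(execMask)) { /* masked body */ }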
--- examples/intrinsics/generic-32.h | 12 ++++++------ examples/intrinsics/generic-64.h | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 7e6c69d4..531ed215 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -408,15 +408,15 @@ static FORCEINLINE uint64_t __movmsk(__vec32_i1 mask) { return (uint64_t)mask.v; } -static FORCEINLINE __vec32_i1 __any(__vec32_i1 mask) { +static FORCEINLINE bool __any(__vec32_i1 mask) { return (mask.v!=0); } -static FORCEINLINE __vec32_i1 __all(__vec32_i1 mask) { - return (mask.v==0xFFFFFFFF); +static FORCEINLINE bool __all(__vec32_i1 mask) { + return (mask.v==0xFFFFFFFFul); } -static FORCEINLINE __vec32_i1 __none(__vec32_i1 mask) { +static FORCEINLINE bool __none(__vec32_i1 mask) { return (mask.v==0); } @@ -1231,8 +1231,8 @@ REDUCE_ADD(double, __vec32_d, __reduce_add_double) REDUCE_MINMAX(double, __vec32_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec32_d, __reduce_max_double, >) -REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) -REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) +//REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +//REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) REDUCE_ADD(int64_t, __vec32_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_min_int32, <) diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index 39124186..bbeb007a 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -533,15 +533,15 @@ static FORCEINLINE uint64_t __movmsk(__vec64_i1 mask) { return (uint64_t)mask.v; } -static FORCEINLINE __vec64_i1 __any(__vec64_i1 mask) { +static FORCEINLINE bool __any(__vec64_i1 mask) { return (mask.v!=0); } -static FORCEINLINE __vec64_i1 __all(__vec64_i1 mask) { - return (mask.v==0xFFFFFFFFFFFFFFFF); +static FORCEINLINE bool __all(__vec64_i1 mask) { + return (mask.v==0xFFFFFFFFFFFFFFFFull); } -static FORCEINLINE __vec64_i1 __none(__vec64_i1 mask) { +static FORCEINLINE bool __none(__vec64_i1 mask) { return (mask.v==0); } @@ -1364,8 +1364,8 @@ REDUCE_ADD(double, __vec64_d, __reduce_add_double) REDUCE_MINMAX(double, __vec64_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec64_d, __reduce_max_double, >) -REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) -REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) +//REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +//REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) REDUCE_ADD(int64_t, __vec64_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec64_i32, __reduce_min_int32, <) From 502f8fd76b9cf88cd260106b546494c1facc28b4 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 20 Aug 2013 09:22:09 -0700 Subject: [PATCH 039/124] Reduce debug spew on failing idiv.ispc tests --- tests/idiv.ispc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/idiv.ispc b/tests/idiv.ispc index 8738740b..bd0766da 100644 --- a/tests/idiv.ispc +++ b/tests/idiv.ispc @@ -4,12 +4,13 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { uniform int errorCount = 0; - + for (unsigned int8 num = 0; num < 255; ++num) { for (uniform unsigned int8 div = 2; div < 255; ++div) { if (__fast_idiv(num, div) != num/div) { ++errorCount; print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 32) break; } } } @@ -19,6 +20,7 @@ export void f_f(uniform float RET[], uniform float 
aFOO[]) { if (__fast_idiv(num, div) != num/div) { ++errorCount; print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 64) break; } } } @@ -28,6 +30,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { if (__fast_idiv(num, div) != num/div) { ++errorCount; print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 96) break; } } } @@ -37,6 +40,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { if (__fast_idiv(num, div) != num/div) { ++errorCount; print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 128) break; } } } @@ -50,6 +54,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { if (__fast_idiv(num, div) != num/div) { ++errorCount; print("ui32 error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 160) break; } } } @@ -62,6 +67,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { if (__fast_idiv(num, div) != num/div) { ++errorCount; print("si32 error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 192) break; } } } From 60b413a9cb9b30dc2c6e1f9c345bdf19286f9114 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Wed, 21 Aug 2013 19:25:30 +0400 Subject: [PATCH 040/124] Adding --non-interactive switch to run_tests.py --- run_tests.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/run_tests.py b/run_tests.py index 7c6b1eb8..74407ce4 100755 --- a/run_tests.py +++ b/run_tests.py @@ -55,6 +55,8 @@ parser.add_option('--wrap-exe', dest='wrapexe', default="") parser.add_option('--time', dest='time', help='Enable time output', default=False, action="store_true") +parser.add_option('--non-interactive', dest='non_interactive', help='Disable interactive status updates', + default=False, action="store_true") (options, args) = parser.parse_args() @@ -162,14 +164,15 @@ total_tests = 0 # finished. Should be called with the lock held.. def update_progress(fn, total_tests_arg, counter, max_test_length_arg): counter.value += 1 - progress_str = " Done %d / %d [%s]" % (counter.value, total_tests_arg, fn) - # spaces to clear out detrius from previous printing... - spaces_needed = max_test_length_arg - len(fn) - for x in range(spaces_needed): - progress_str += ' ' - progress_str += '\r' - sys.stdout.write(progress_str) - sys.stdout.flush() + if options.non_interactive == False: + progress_str = " Done %d / %d [%s]" % (counter.value, total_tests_arg, fn) + # spaces to clear out detrius from previous printing... + spaces_needed = max_test_length_arg - len(fn) + for x in range(spaces_needed): + progress_str += ' ' + progress_str += '\r' + sys.stdout.write(progress_str) + sys.stdout.flush() def run_command(cmd): if options.verbose: @@ -489,11 +492,8 @@ if __name__ == '__main__': # (i.e. 
return 0 if all is ok) for t in task_threads: t.join() - sys.stdout.write("\n") - - elapsed_time = time.time() - start_time - if options.time: - sys.stdout.write("Elapsed time: %d s\n" % elapsed_time) + if options.non_interactive == False: + sys.stdout.write("\n") while not qret.empty(): (c, r, s) = qret.get() @@ -501,6 +501,8 @@ if __name__ == '__main__': run_error_files += r skip_files += s + if options.non_interactive: + sys.stdout.write(" Done %d / %d\n" % (finished_tests_counter.value, total_tests)) if len(skip_files) > 0: skip_files.sort() sys.stdout.write("%d / %d tests SKIPPED:\n" % (len(skip_files), total_tests)) @@ -517,4 +519,8 @@ if __name__ == '__main__': for f in run_error_files: sys.stdout.write("\t%s\n" % f) + elapsed_time = time.time() - start_time + if options.time: + sys.stdout.write("Elapsed time: %d s\n" % elapsed_time) + sys.exit(len(compile_error_files) + len(run_error_files)) From 5fb30939be6b4a7949c039c7b1db9b42eb478a22 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Wed, 21 Aug 2013 19:46:18 +0400 Subject: [PATCH 041/124] Fix for #564, using wrong ispc in run_tests.py --- run_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_tests.py b/run_tests.py index 74407ce4..e029b9a6 100755 --- a/run_tests.py +++ b/run_tests.py @@ -234,7 +234,7 @@ def add_prefix(path): else: input_prefix = "" path = input_prefix + path - path = os.path.normpath(path) + path = os.path.abspath(path) return path From f31a31478b7329cbaf6d8b7d50f30c0cc90996dc Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 22 Aug 2013 12:41:57 +0400 Subject: [PATCH 042/124] Moving time calculation earlier --- run_tests.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/run_tests.py b/run_tests.py index e029b9a6..710bd274 100755 --- a/run_tests.py +++ b/run_tests.py @@ -495,6 +495,8 @@ if __name__ == '__main__': if options.non_interactive == False: sys.stdout.write("\n") + elapsed_time = time.time() - start_time + while not qret.empty(): (c, r, s) = qret.get() compile_error_files += c @@ -519,7 +521,6 @@ if __name__ == '__main__': for f in run_error_files: sys.stdout.write("\t%s\n" % f) - elapsed_time = time.time() - start_time if options.time: sys.stdout.write("Elapsed time: %d s\n" % elapsed_time) From 611477e214f19e89657cd85252bb44e801573240 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 22 Aug 2013 07:50:25 -0700 Subject: [PATCH 043/124] Revert change to lEmitVaryingSelect(). Using vector select versus a store and masked load for varying vector selects seems to give worse code. This may be related to http://llvm.org/bugs/show_bug.cgi?id=16941. --- expr.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/expr.cpp b/expr.cpp index 856d363c..614cb5e5 100644 --- a/expr.cpp +++ b/expr.cpp @@ -3123,7 +3123,10 @@ static llvm::Value * lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test, llvm::Value *expr1, llvm::Value *expr2, const Type *type) { -#if !defined(LLVM_3_1) +#if 0 // !defined(LLVM_3_1) + // Though it should be equivalent, this seems to cause non-trivial + // performance regressions versus the below. This may be related to + // http://llvm.org/bugs/show_bug.cgi?id=16941. 
if (test->getType() != LLVMTypes::Int1VectorType) test = ctx->TruncInst(test, LLVMTypes::Int1VectorType); return ctx->SelectInst(test, expr1, expr2, "select"); From f620cdbaa1f6cfdad15218a28d7da025e2493c01 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Mon, 26 Aug 2013 14:04:59 +0400 Subject: [PATCH 044/124] Changes in perf.py functionality, unification of examples, correction build warnings --- Makefile | 6 +- examples/aobench/ao.cpp | 6 +- examples/deferred/main.cpp | 2 +- examples/mandelbrot/mandelbrot.cpp | 2 +- .../mandelbrot_tasks/mandelbrot_tasks.cpp | 2 +- examples/noise/noise.cpp | 2 +- examples/perf.py | 153 +++++++++++++++--- examples/stencil/stencil.cpp | 2 +- examples/volume_rendering/volume.cpp | 2 +- main.cpp | 4 - 10 files changed, 140 insertions(+), 41 deletions(-) diff --git a/Makefile b/Makefile index 69e24d41..8d27cc80 100644 --- a/Makefile +++ b/Makefile @@ -113,8 +113,10 @@ CXX=g++ CPP=cpp OPT=-O2 CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -I$(CLANG_INCLUDE) \ - -Wall $(LLVM_VERSION_DEF) \ - -DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\"" + $(LLVM_VERSION_DEF) \ + -Wall \ + -DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\"" \ + -Werror -Wno-sign-compare ifneq ($(ARM_ENABLED), 0) CXXFLAGS+=-DISPC_ARM_ENABLED endif diff --git a/examples/aobench/ao.cpp b/examples/aobench/ao.cpp index cbe75a0b..2286316d 100644 --- a/examples/aobench/ao.cpp +++ b/examples/aobench/ao.cpp @@ -138,7 +138,7 @@ int main(int argc, char **argv) } // Report results and save image - printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n", + printf("[aobench ispc]:\t\t\t[%.3f] million cycles (%d x %d image)\n", minTimeISPC, width, height); savePPM("ao-ispc.ppm", width, height); @@ -158,7 +158,7 @@ int main(int argc, char **argv) } // Report results and save image - printf("[aobench ispc + tasks]:\t\t[%.3f] M cycles (%d x %d image)\n", + printf("[aobench ispc + tasks]:\t\t[%.3f] million cycles (%d x %d image)\n", minTimeISPCTasks, width, height); savePPM("ao-ispc-tasks.ppm", width, height); @@ -176,7 +176,7 @@ int main(int argc, char **argv) } // Report more results, save another image... 
- printf("[aobench serial]:\t\t[%.3f] M cycles (%d x %d image)\n", minTimeSerial, + printf("[aobench serial]:\t\t[%.3f] million cycles (%d x %d image)\n", minTimeSerial, width, height); printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks); diff --git a/examples/deferred/main.cpp b/examples/deferred/main.cpp index 17bd3f42..4f2be879 100644 --- a/examples/deferred/main.cpp +++ b/examples/deferred/main.cpp @@ -130,7 +130,7 @@ int main(int argc, char** argv) { printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n", serialCycles/ispcCycles, serialCycles/dynamicCilkCycles); #else - printf("\t\t\t\t(%.2fx speedup from ISPC)\n", serialCycles/ispcCycles); + printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", serialCycles/ispcCycles); #endif // __cilk DeleteInputData(input); diff --git a/examples/mandelbrot/mandelbrot.cpp b/examples/mandelbrot/mandelbrot.cpp index 7e73768f..d2bebb96 100644 --- a/examples/mandelbrot/mandelbrot.cpp +++ b/examples/mandelbrot/mandelbrot.cpp @@ -109,7 +109,7 @@ int main() { minSerial = std::min(minSerial, dt); } - printf("[mandelbrot serial]:\t\t[%.3f] millon cycles\n", minSerial); + printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial); writePPM(buf, width, height, "mandelbrot-serial.ppm"); printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC); diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.cpp b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp index dae22736..698daf0f 100644 --- a/examples/mandelbrot_tasks/mandelbrot_tasks.cpp +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp @@ -137,7 +137,7 @@ int main(int argc, char *argv[]) { minSerial = std::min(minSerial, dt); } - printf("[mandelbrot serial]:\t\t[%.3f] millon cycles\n", minSerial); + printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial); writePPM(buf, width, height, "mandelbrot-serial.ppm"); printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC); diff --git a/examples/noise/noise.cpp b/examples/noise/noise.cpp index 58552ce3..123f98c7 100644 --- a/examples/noise/noise.cpp +++ b/examples/noise/noise.cpp @@ -106,7 +106,7 @@ int main() { minSerial = std::min(minSerial, dt); } - printf("[noise serial]:\t\t\t[%.3f] millon cycles\n", minSerial); + printf("[noise serial]:\t\t\t[%.3f] million cycles\n", minSerial); writePPM(buf, width, height, "noise-serial.ppm"); printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC); diff --git a/examples/perf.py b/examples/perf.py index f96ef9ec..4b661b39 100755 --- a/examples/perf.py +++ b/examples/perf.py @@ -10,12 +10,22 @@ import glob import string import platform +def print_debug(line): + if options.silent == False: + sys.stdout.write(line) + +def print_file(line): + if options.output != "": + output = open(options.output, 'w') + output.writelines(line) + output.close() + def build_test(): global build_log global is_windows if is_windows == False: os.system("make clean >> "+build_log) - return os.system("make >> "+build_log+" 2>> "+build_log) + return os.system("make CXX="+ref_compiler+" CC="+refc_compiler+" >> "+build_log+" 2>> "+build_log) else: os.system("msbuild /t:clean >> " + build_log) return os.system("msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /t:rebuild >> " + build_log) @@ -30,7 +40,7 @@ def execute_test(command): return r #gathers all tests results and made an item test from answer structure -def run_test(command, c1, c2, test): +def run_test(command, c1, 
c2, test, b_serial): global perf_temp if build_test() != 0: sys.stdout.write("ERROR: Compilation fails\n") @@ -40,11 +50,13 @@ def run_test(command, c1, c2, test): return tasks = [] #list of results with tasks, it will be test[2] ispc = [] #list of results without tasks, it will be test[1] + absolute_tasks = [] #list of absolute results with tasks, it will be test[4] + absolute_ispc = [] #list of absolute results without tasks, ut will be test[3] + serial = [] #list serial times, it will be test[5] j = 1 for line in open(perf_temp): # we take test output if "speedup" in line: # we are interested only in lines with speedup if j == c1: # we are interested only in lines with c1 numbers - sys.stdout.write(line) line = line.expandtabs(0) line = line.replace("("," ") line = line.split(",") @@ -57,9 +69,42 @@ def run_test(command, c1, c2, test): ispc.append(number) c1 = c1 + c2 j+=1 + if "million cycles" in line: + if j == c1: + line = line.replace("]","[") + line = line.split("[") + number = float(line[3]) + if "tasks" in line[1]: + absolute_tasks.append(number) + else: + if "ispc" in line[1]: + absolute_ispc.append(number) + if "serial" in line[1]: + serial.append(number) + + if len(ispc) != 0: + if len(tasks) != 0: + print_debug("ISPC speedup / ISPC + tasks speedup / ISPC time / ISPC + tasks time / serial time\n") + for i in range(0,len(serial)): + print_debug("%10s /\t%10s\t /%9s / %10s\t /%10s\n" % + (ispc[i], tasks[i], absolute_ispc[i], absolute_tasks[i], serial[i])) + else: + print_debug("ISPC speedup / ISPC time / serial time\n") + for i in range(0,len(serial)): + print_debug("%10s /%9s /%10s\n" % (ispc[i], absolute_ispc[i], serial[i])) + else: + if len(tasks) != 0: + print_debug("ISPC + tasks speedup / ISPC + tasks time / serial time\n") + for i in range(0,len(serial)): + print_debug("%10s\t / %10s\t /%10s\n" % (tasks[i], absolute_tasks[i], serial[i])) + test[1] = test[1] + ispc test[2] = test[2] + tasks - + test[3] = test[3] + absolute_ispc + test[4] = test[4] + absolute_tasks + if b_serial == True: + #if we concatenate outputs we should use only the first serial answer. 
+ test[5] = test[5] + serial def cpu_get(): p = open("/proc/stat", 'r') @@ -113,30 +158,57 @@ def geomean(par): #test[0] - name of test #test[1] - list of results without tasks #test[2] - list of results with tasks -#test[1] or test[2] may be empty +#test[3] - list of absolute results without tasks +#test[4] - list of absolute results with tasks +#test[5] - list of absolute time without ISPC (serial) +#test[1..4] may be empty def print_answer(answer): - sys.stdout.write("Name of test:\t\tISPC:\tISPC + tasks:\n") - max_t = [0,0] - diff_t = [0,0] - geomean_t = [0,0] - list_of_max = [[],[]] + filelist = [] + print_debug("--------------------------------------------------------------------------\n") + print_debug("test name:\t ISPC speedup: ISPC + tasks speedup: | " + + "ISPC time: ISPC + tasks time: serial:\n") + filelist.append("test name,ISPC speedup,diff," + + "ISPC + tasks speedup,diff,ISPC time,diff,ISPC + tasks time,diff,serial,diff\n") + max_t = [0,0,0,0,0] + diff_t = [0,0,0,0,0] + geomean_t = [0,0,0,0,0] + list_of_max = [[],[],[],[],[]] for i in range(len(answer)): - for t in range(1,3): + for t in range(1,6): if len(answer[i][t]) == 0: max_t[t-1] = "n/a" diff_t[t-1] = "n/a" else: - list_of_max[t-1].append(max(answer[i][t])) - max_t[t-1] = str(max(answer[i][t])) - diff_t[t-1] = str(max(answer[i][t]) - min(answer[i][t])) - sys.stdout.write("%s:\n" % answer[i][0]) - sys.stdout.write("\t\tmax:\t%s\t%s\n" % (max_t[0], max_t[1])) - sys.stdout.write("\t\tdiff:\t%s\t%s\n" % (diff_t[0], diff_t[1])) + if t < 3: + mm = max(answer[i][t]) + else: + mm = min(answer[i][t]) + max_t[t-1] = '%.2f' % mm + list_of_max[t-1].append(mm) + diff_t[t-1] = '%.2f' % (max(answer[i][t]) - min(answer[i][t])) + print_debug("%s:\n" % answer[i][0]) + print_debug("\t\tmax:\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % + (max_t[0], max_t[1], max_t[2], max_t[3], max_t[4])) + print_debug("\t\tdiff:\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % + (diff_t[0], diff_t[1], diff_t[2], diff_t[3], diff_t[4])) + for t in range(0,5): + if max_t[t] == "n/a": + max_t[t] = "" + if diff_t[t] == "n/a": + diff_t[t] = "" + filelist.append(answer[i][0] + "," + + max_t[0] + "," + diff_t[0] + "," + max_t[1] + "," + diff_t[1] + "," + + max_t[2] + "," + diff_t[2] + "," + max_t[3] + "," + diff_t[3] + "," + + max_t[4] + "," + diff_t[4] + "\n") + for i in range(0,5): + geomean_t[i] = geomean(list_of_max[i]) + print_debug("---------------------------------------------------------------------------------\n") + print_debug("Geomean:\t\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % + (geomean_t[0], geomean_t[1], geomean_t[2], geomean_t[3], geomean_t[4])) + filelist.append("Geomean," + str(geomean_t[0]) + ",," + str(geomean_t[1]) + + ",," + str(geomean_t[2]) + ",," + str(geomean_t[3]) + ",," + str(geomean_t[4]) + "\n") + print_file(filelist) - geomean_t[0] = geomean(list_of_max[0]) - geomean_t[1] = geomean(list_of_max[1]) - sys.stdout.write("---------------------------------------------\n") - sys.stdout.write("Geomean:\t\t%s\t%s\n" % (geomean_t[0], geomean_t[1])) ###Main### # parsing options @@ -147,6 +219,12 @@ parser.add_option('-c', '--config', dest='config', help='config file of tests', default="./perf.ini") parser.add_option('-p', '--path', dest='path', help='path to examples directory', default="./") +parser.add_option('-s', '--silent', dest='silent', + help='silent mode, only table output', default=False, action="store_true") +parser.add_option('-o', '--output', dest='output', + help='output file for script reading', default="") +parser.add_option('--compiler', 
dest='compiler', + help='reference compiler', default="") (options, args) = parser.parse_args() global is_windows @@ -174,6 +252,14 @@ ref_compiler_exists = False if is_windows == False: compiler = "ispc" ref_compiler = "g++" + refc_compiler = "gcc" + if options.compiler != "": + if options.compiler == "clang" or options.compiler == "clang++": + ref_compiler = "clang++" + refc_compiler = "clang" + if options.compiler == "icc" or options.compiler == "icpc": + ref_compiler = "icpc" + refc_compiler = "icc" else: compiler = "ispc.exe" ref_compiler = "cl.exe" @@ -222,12 +308,27 @@ perf_temp = pwd + "perf_temp" i = 0 answer = [] -sys.stdout.write("Okey go go go!\n\n") +print_debug("Okey go go go!\n\n") +os.system(compiler + " --version >" + build_log) +version = open(build_log) +print_debug("Using test compiler: " + version.readline()) +version.close() + +if is_windows == False: + os.system(ref_compiler + " --version >" + build_log) +else: + os.system(ref_compiler + " 2>" + build_log + " 1>&2") + +version = open(build_log) +print_debug("Using reference compiler: " + version.readline()) +version.close() + + # loop for all tests while i < length-2: # we read name of test - sys.stdout.write("%s" % lines[i]) - test = [lines[i][:-1],[],[]] + print_debug("%s" % lines[i]) + test = [lines[i][:-1],[],[],[],[],[]] # read location of test folder = lines[i+1] folder = folder[:-1] @@ -257,10 +358,10 @@ while i < length-2: c2 = 1 next_line = lines[i+3] if next_line[0] == "^": #we should concatenate result of this test with previous one - run_test(command, c1, c2, answer[len(answer)-1]) + run_test(command, c1, c2, answer[len(answer)-1], False) i = i+1 else: #we run this test and append it's result to answer structure - run_test(command, c1, c2, test) + run_test(command, c1, c2, test, True) answer.append(test) # preparing next loop iteration os.chdir(pwd) diff --git a/examples/stencil/stencil.cpp b/examples/stencil/stencil.cpp index 9d5b3ee6..593d901f 100644 --- a/examples/stencil/stencil.cpp +++ b/examples/stencil/stencil.cpp @@ -130,7 +130,7 @@ int main() { minTimeSerial = std::min(minTimeSerial, dt); } - printf("[stencil serial]:\t\t[%.3f] millon cycles\n", minTimeSerial); + printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial); printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks); diff --git a/examples/volume_rendering/volume.cpp b/examples/volume_rendering/volume.cpp index 7d8b8e99..458cd407 100644 --- a/examples/volume_rendering/volume.cpp +++ b/examples/volume_rendering/volume.cpp @@ -204,7 +204,7 @@ int main(int argc, char *argv[]) { minSerial = std::min(minSerial, dt); } - printf("[volume serial]:\t\t[%.3f] millon cycles\n", minSerial); + printf("[volume serial]:\t\t[%.3f] million cycles\n", minSerial); writePPM(image, width, height, "volume-serial.ppm"); printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", diff --git a/main.cpp b/main.cpp index 58daa2d3..61c62042 100644 --- a/main.cpp +++ b/main.cpp @@ -328,7 +328,6 @@ int main(int Argc, char *Argv[]) { // as we're parsing below g = new Globals; - bool debugSet = false, optSet = false; Module::OutputType ot = Module::Object; bool generatePIC = false; const char *arch = NULL, *cpu = NULL, *target = NULL; @@ -371,7 +370,6 @@ int main(int Argc, char *Argv[]) { g->emitInstrumentation = true; else if (!strcmp(argv[i], "-g")) { g->generateDebuggingSymbols = true; - debugSet = true; } else if (!strcmp(argv[i], "--emit-asm")) ot = 
Module::Asm; @@ -496,12 +494,10 @@ int main(int Argc, char *Argv[]) { } else if (!strcmp(argv[i], "-O0")) { g->opt.level = 0; - optSet = true; } else if (!strcmp(argv[i], "-O") || !strcmp(argv[i], "-O1") || !strcmp(argv[i], "-O2") || !strcmp(argv[i], "-O3")) { g->opt.level = 1; - optSet = true; } else if (!strcmp(argv[i], "-")) ; From 443987f536a15adc384fe98284106208b2049eed Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 27 Aug 2013 15:33:44 +0400 Subject: [PATCH 045/124] fixing ispc.rst file properties (should not be executable) --- docs/ispc.rst | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 docs/ispc.rst diff --git a/docs/ispc.rst b/docs/ispc.rst old mode 100755 new mode 100644 From 5d8ebf3ca17ed18d21b89d4cacf6599220e9c293 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 27 Aug 2013 18:27:06 +0400 Subject: [PATCH 046/124] Fixing r183327-AVX2-GATHER.patch file permissions --- llvm_patches/r183327-AVX2-GATHER.patch | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 llvm_patches/r183327-AVX2-GATHER.patch diff --git a/llvm_patches/r183327-AVX2-GATHER.patch b/llvm_patches/r183327-AVX2-GATHER.patch old mode 100755 new mode 100644 From be3a40e70b84a4615e36d5067e939d33f4da702e Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Tue, 27 Aug 2013 15:15:16 -0400 Subject: [PATCH 047/124] Fix for 3.4 --- ispc.cpp | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/ispc.cpp b/ispc.cpp index 8a0f16c6..2dd1a87d 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -288,7 +288,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; +#if defined(LLVM_3_4) + this->m_attributes = "+sse,+sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-popcnt"; +#else this->m_attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt"; +#endif this->m_maskingIsFree = false; this->m_maskBitCount = 32; } @@ -297,7 +301,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 8; +#if defined(LLVM_3_4) + this->m_attributes = "+sse,+sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-popcnt"; +#else this->m_attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt"; +#endif this->m_maskingIsFree = false; this->m_maskBitCount = 32; } @@ -307,7 +315,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; // TODO: why not sse42 and popcnt? 
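// Editor's note -- illustrative, not part of the patch. The #if defined(LLVM_3_4) blocks added
// here exist because the x86 feature strings ispc hands to LLVM are spelled differently for
// LLVM 3.4 ("+sse4.1", "+sse4.2", "+rdrnd") than for earlier releases ("+sse41", "+sse42",
// "+rdrand"). The follow-up "Fix build against 3.4" commit below tidies this up with adjacent
// string-literal concatenation, keeping only the version-dependent fragment under the #if,
// roughly:
//   this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
//   #if defined(LLVM_3_4)
//       ",+sse4.1,-sse4.2"
//   #else
//       ",+sse41,-sse42"
//   #endif
//       ;   // concatenated into a single literal at compile time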
+#if defined(LLVM_3_4) + this->m_attributes = "+sse,+sse2,+sse3,+sse4.1,-sse4.2,-sse4a,+ssse3,-popcnt,+cmov"; +#else this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; +#endif this->m_maskingIsFree = false; this->m_maskBitCount = 32; } @@ -317,7 +329,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 8; +#if defined(LLVM_3_4) + this->m_attributes = "+sse,+sse2,+sse3,+sse4.1,-sse4.2,-sse4a,+ssse3,-popcnt,+cmov"; +#else this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; +#endif this->m_maskingIsFree = false; this->m_maskBitCount = 32; } @@ -325,7 +341,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; +#if defined(LLVM_3_4) + this->m_attributes = "+sse,+sse2,+sse3,+sse4.1,-sse4.2,-sse4a,+ssse3,-popcnt,+cmov"; +#else this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; +#endif this->m_maskingIsFree = false; this->m_maskBitCount = 8; } @@ -333,7 +353,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; +#if defined(LLVM_3_4) + this->m_attributes = "+sse,+sse2,+sse3,+sse4.1,-sse4.2,-sse4a,+ssse3,-popcnt,+cmov"; +#else this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; +#endif this->m_maskingIsFree = false; this->m_maskBitCount = 16; } @@ -425,7 +449,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; +#if defined(LLVM_3_4) + this->m_attributes = "+avx,+popcnt,+cmov,+f16c,+rdrnd"; +#else this->m_attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand"; +#endif this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; @@ -453,7 +481,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; +#if defined(LLVM_3_4) + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrnd" +#else this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand" +#endif #ifndef LLVM_3_1 ",+fma" #endif // !LLVM_3_1 @@ -472,7 +504,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; +#if defined(LLVM_3_4) + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrnd" +#else this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand" +#endif #ifndef LLVM_3_1 ",+fma" #endif // !LLVM_3_1 From 28080b0c22bd84d4b1d5cf29759c9e1423739f7e Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Tue, 27 Aug 2013 16:56:00 -0400 Subject: [PATCH 048/124] Fix build against 3.4 --- ispc.cpp | 60 ++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 19 deletions(-) diff --git a/ispc.cpp b/ispc.cpp index 2dd1a87d..6d4b063d 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -288,11 +288,13 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; + this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt" #if defined(LLVM_3_4) - this->m_attributes = 
"+sse,+sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-popcnt"; + ",-sse4.1,-sse4.2" #else - this->m_attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt"; + ",-sse41,-sse42" #endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } @@ -301,11 +303,13 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 8; + this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt" #if defined(LLVM_3_4) - this->m_attributes = "+sse,+sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-popcnt"; + ",-sse4.1,-sse4.2" #else - this->m_attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt"; + ",-sse41,-sse42" #endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } @@ -315,11 +319,13 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; // TODO: why not sse42 and popcnt? + this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" #if defined(LLVM_3_4) - this->m_attributes = "+sse,+sse2,+sse3,+sse4.1,-sse4.2,-sse4a,+ssse3,-popcnt,+cmov"; + ",+sse4.1,-sse4.2" #else - this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + ",+sse41,-sse42" #endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } @@ -329,11 +335,13 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 8; + this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" #if defined(LLVM_3_4) - this->m_attributes = "+sse,+sse2,+sse3,+sse4.1,-sse4.2,-sse4a,+ssse3,-popcnt,+cmov"; + ",+sse4.1,-sse4.2" #else - this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + ",+sse41,-sse42" #endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } @@ -341,11 +349,13 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; + this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" #if defined(LLVM_3_4) - this->m_attributes = "+sse,+sse2,+sse3,+sse4.1,-sse4.2,-sse4a,+ssse3,-popcnt,+cmov"; + ",+sse4.1,-sse4.2" #else - this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + ",+sse41,-sse42" #endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 8; } @@ -353,11 +363,13 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; + this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" #if defined(LLVM_3_4) - this->m_attributes = "+sse,+sse2,+sse3,+sse4.1,-sse4.2,-sse4a,+ssse3,-popcnt,+cmov"; + ",+sse4.1,-sse4.2" #else - this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + ",+sse41,-sse42" #endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 16; } @@ -449,11 +461,13 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; + this->m_attributes = "+avx,+popcnt,+cmov,+f16c" #if defined(LLVM_3_4) - this->m_attributes = "+avx,+popcnt,+cmov,+f16c,+rdrnd"; + ",+rdrnd" #else - this->m_attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand"; + ",+rdrand" #endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; 
this->m_hasHalf = true; @@ -467,7 +481,13 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 16; - this->m_attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand"; + this->m_attributes = "+avx,+popcnt,+cmov,+f16c" +#if defined(LLVM_3_4) + ",+rdrnd" +#else + ",+rdrand" +#endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; @@ -481,10 +501,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" #if defined(LLVM_3_4) - this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrnd" + ",+rdrnd" #else - this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand" + ",+rdrand" #endif #ifndef LLVM_3_1 ",+fma" @@ -504,10 +525,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" #if defined(LLVM_3_4) - this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrnd" + ",+rdrnd" #else - this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand" + ",+rdrand" #endif #ifndef LLVM_3_1 ",+fma" From 501a23ad208c027c208c00a44f12c65824d6f7f3 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 29 Aug 2013 14:48:09 +0400 Subject: [PATCH 049/124] Typos fixes in docs --- docs/ispc.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/ispc.rst b/docs/ispc.rst index 26cf6be3..476046e8 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -3659,7 +3659,7 @@ command-line argument. Cross-Program Instance Operations --------------------------------- -``ispc`` programs are often used to expresses independently-executing +``ispc`` programs are often used to express independently-executing programs performing computation on separate data elements. (i.e. pure data-parallelism). However, it's often the case where it's useful for the program instances to be able to cooperate in computing results. The @@ -3690,7 +3690,7 @@ the running program instances. The ``rotate()`` function allows each program instance to find the value of the given value that their neighbor ``offset`` steps away has. For -example, on an 8-wide target, if ``offset`` has the value (1, 2, 3, 4, 5, +example, on an 8-wide target, if ``value`` has the value (1, 2, 3, 4, 5, 6, 7, 8) across the gang of running program instances, then ``rotate(value, -1)`` causes the first program instance to get the value 8, the second program instance to get the value 1, the third 2, and so forth. The @@ -3769,7 +3769,7 @@ where the ``i`` th element of ``x`` has been replaced with the value ``v`` Reductions ---------- -A number routines are available to evaluate conditions across the +A number of routines are available to evaluate conditions across the running program instances. 
For example, ``any()`` returns ``true`` if the given value ``v`` is ``true`` for any of the SPMD program instances currently running, ``all()`` returns ``true`` if it true From e06267ef1bab233a955c5182c4071969520ac7b8 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 29 Aug 2013 16:16:02 +0400 Subject: [PATCH 050/124] Fix for incorrect implementation of reduce_[min|max]_[float|double], it showed up as -O0 --- stdlib.ispc | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/stdlib.ispc b/stdlib.ispc index dc94d7e3..f7d135dd 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -918,9 +918,14 @@ static inline uniform float reduce_min(float v) { // For the lanes where the mask is off, replace the given value with // infinity, so that it doesn't affect the result. int iflt_max = 0x7f800000; // infinity - // Must use __floatbits_varying_int32, not floatbits(), since with the - // latter the current mask enters into the returned result... - return __reduce_min_float(__mask ? v : __floatbits_varying_int32(iflt_max)); + // unmasked block is needed to make sure that argument for unmasked + // function __reduce_min_float() are calculated without a mask. + bool test = __mask; + uniform float result; + unmasked { + result = __reduce_min_float(test ? v : floatbits(iflt_max)); + } + return result; } __declspec(safe) @@ -928,9 +933,14 @@ static inline uniform float reduce_max(float v) { // For the lanes where the mask is off, replace the given value with // negative infinity, so that it doesn't affect the result. const int iflt_neg_max = 0xff800000; // -infinity - // Must use __floatbits_varying_int32, not floatbits(), since with the - // latter the current mask enters into the returned result... - return __reduce_max_float(__mask ? v : __floatbits_varying_int32(iflt_neg_max)); + // unmasked block is needed to make sure that argument for unmasked + // function __reduce_max_float() are calculated without a mask. + bool test = __mask; + uniform float result; + unmasked { + result = __reduce_max_float(test ? v : floatbits(iflt_neg_max)); + } + return result; } __declspec(safe) @@ -986,17 +996,27 @@ static inline uniform double reduce_add(double x) { __declspec(safe) static inline uniform double reduce_min(double v) { int64 iflt_max = 0x7ff0000000000000; // infinity - // Must use __doublebits_varying_int64, not doublebits(), since with the - // latter the current mask enters into the returned result... - return __reduce_min_double(__mask ? v : __doublebits_varying_int64(iflt_max)); + // unmasked block is needed to make sure that argument for unmasked + // function __reduce_min_double() are calculated without a mask. + bool test = __mask; + uniform double result; + unmasked { + result = __reduce_min_double(test ? v : doublebits(iflt_max)); + } + return result; } __declspec(safe) static inline uniform double reduce_max(double v) { const int64 iflt_neg_max = 0xfff0000000000000; // -infinity - // Must use __doublebits_varying_int64, not doublebits(), since with the - // latter the current mask enters into the returned result... - return __reduce_max_double(__mask ? v : __doublebits_varying_int64(iflt_neg_max)); + // unmasked block is needed to make sure that argument for unmasked + // function __reduce_max_double() are calculated without a mask. + bool test = __mask; + uniform double result; + unmasked { + result = __reduce_max_double(test ? 
v : doublebits(iflt_neg_max)); + } + return result; } __declspec(safe) From 320b1700ff2c6f791d8477223f3d799a875089b5 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Fri, 30 Aug 2013 16:01:01 +0400 Subject: [PATCH 051/124] correction of adding -Werror option --- Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 8d27cc80..09ec302d 100644 --- a/Makefile +++ b/Makefile @@ -116,7 +116,10 @@ CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -I$(CLANG_INCLUDE) \ $(LLVM_VERSION_DEF) \ -Wall \ -DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\"" \ - -Werror -Wno-sign-compare + -Wno-sign-compare +ifneq ($(LLVM_VERSION),LLVM_3_1) + CXXFLAGS+=-Werror +endif ifneq ($(ARM_ENABLED), 0) CXXFLAGS+=-DISPC_ARM_ENABLED endif From 97d430d5cd87630a1074888476aceb110ebf4772 Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Fri, 30 Aug 2013 14:13:08 -0400 Subject: [PATCH 052/124] Fix to respect uniform/varying qualifiers inside of typedefs. --- decl.cpp | 25 +++++++++++++++++++++++-- sym.cpp | 11 +++++++++++ sym.h | 6 ++++++ 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/decl.cpp b/decl.cpp index e7b3cdef..8a10543b 100644 --- a/decl.cpp +++ b/decl.cpp @@ -69,8 +69,15 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) { if (type == NULL) return NULL; - if ((typeQualifiers & TYPEQUAL_CONST) != 0) + if ((typeQualifiers & TYPEQUAL_CONST) != 0) { type = type->GetAsConstType(); + } + + if ( ((typeQualifiers & TYPEQUAL_UNIFORM) != 0) + && ((typeQualifiers & TYPEQUAL_VARYING) != 0) ) { + Error(pos, "Type \"%s\" cannot be qualified with both uniform and varying.", + type->GetString().c_str()); + } if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0) { if (Type::Equal(type, AtomicType::Void)) @@ -84,9 +91,10 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) { else type = type->GetAsVaryingType(); } - else + else { if (Type::Equal(type, AtomicType::Void) == false) type = type->GetAsUnboundVariabilityType(); + } if ((typeQualifiers & TYPEQUAL_UNSIGNED) != 0) { if ((typeQualifiers & TYPEQUAL_SIGNED) != 0) @@ -124,6 +132,17 @@ DeclSpecs::DeclSpecs(const Type *t, StorageClass sc, int tq) { typeQualifiers = tq; soaWidth = 0; vectorSize = 0; + if (t != NULL) { + if (m->symbolTable->ContainsType(t)) { + // Typedefs might have uniform/varying qualifiers inside. 
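// Editor's note -- illustrative, not part of the patch. The check below means a declaration
// that picks up its variability through a typedef, e.g. (hypothetical ispc source)
//   typedef varying float vfloat;
//   uniform vfloat x;
// now contributes TYPEQUAL_VARYING from the typedef in addition to TYPEQUAL_UNIFORM from the
// declaration itself, so the "cannot be qualified with both uniform and varying" diagnostic
// added to lApplyTypeQualifiers() above can catch the conflict.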
+ if (t->IsVaryingType()) { + typeQualifiers |= TYPEQUAL_VARYING; + } + else if (t->IsUniformType()) { + typeQualifiers |= TYPEQUAL_UNIFORM; + } + } + } } @@ -229,6 +248,7 @@ Declarator::Declarator(DeclaratorKind dk, SourcePos p) void Declarator::InitFromDeclSpecs(DeclSpecs *ds) { const Type *baseType = ds->GetBaseType(pos); + InitFromType(baseType, ds); if (type == NULL) { @@ -591,6 +611,7 @@ Declaration::Declaration(DeclSpecs *ds, Declarator *d) { } + std::vector Declaration::GetVariableDeclarations() const { Assert(declSpecs->storageClass != SC_TYPEDEF); diff --git a/sym.cpp b/sym.cpp index f16f5e11..05f9996a 100644 --- a/sym.cpp +++ b/sym.cpp @@ -214,6 +214,17 @@ SymbolTable::LookupType(const char *name) const { return NULL; } +bool +SymbolTable::ContainsType(const Type *type) const { + TypeMapType::const_iterator iter = types.begin(); + while (iter != types.end()) { + if (iter->second == type) { + return true; + } + iter++; + } + return false; +} std::vector SymbolTable::ClosestVariableOrFunctionMatch(const char *str) const { diff --git a/sym.h b/sym.h index efb532a3..761c3612 100644 --- a/sym.h +++ b/sym.h @@ -219,6 +219,12 @@ public: @return Pointer to the Type, if found; otherwise NULL is returned. */ const Type *LookupType(const char *name) const; + + /** Look for a type given a pointer. + + @return True if found, False otherwise. + */ + bool ContainsType(const Type * type) const; /** This method returns zero or more strings with the names of symbols in the symbol table that nearly (but not exactly) match the given From 8db378b26565e6263f523faa335f10651078551f Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Wed, 4 Sep 2013 16:01:58 -0400 Subject: [PATCH 053/124] Revert "Remove support for using SVML for math lib routines." This reverts commit d9c38b5c1f6c1ccb4920465789b9e3d451e302a8. 
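Editor's note, not part of the patch: restoring SVML support means ispc's transcendental math
calls can again be routed to Intel's Short Vector Math Library. Going by the docs/ispc.rst and
main.cpp changes later in this commit, usage would look roughly like the hypothetical commands
below -- pick the library when compiling the ispc code, then let icc do the final link so the
SVML runtime is pulled in (file names are placeholders):

    ispc --math-lib=svml kernel.ispc -o kernel.o
    icc main.cpp kernel.o -o prog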
--- builtins.cpp | 11 ++++ builtins/target-avx-x2.ll | 17 ++++++ builtins/target-avx.ll | 17 ++++++ builtins/target-generic-1.ll | 98 +++++++++++++++++++++++++++++++ builtins/target-generic-common.ll | 16 +++++ builtins/target-neon-common.ll | 13 ++++ builtins/target-sse2-x2.ll | 86 +++++++++++++++++++++++++++ builtins/target-sse2.ll | 60 +++++++++++++++++++ builtins/target-sse4-16.ll | 15 +++++ builtins/target-sse4-8.ll | 15 +++++ builtins/target-sse4-x2.ll | 86 +++++++++++++++++++++++++++ builtins/target-sse4.ll | 60 +++++++++++++++++++ docs/ispc.rst | 3 + ispc.h | 2 +- main.cpp | 3 + stdlib.ispc | 72 +++++++++++++++++------ 16 files changed, 556 insertions(+), 18 deletions(-) diff --git a/builtins.cpp b/builtins.cpp index f3a0cf59..886eec15 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -581,6 +581,15 @@ lSetInternalFunctions(llvm::Module *module) { "__stdlib_sinf", "__stdlib_tan", "__stdlib_tanf", + "__svml_sin", + "__svml_cos", + "__svml_sincos", + "__svml_tan", + "__svml_atan", + "__svml_atan2", + "__svml_exp", + "__svml_log", + "__svml_pow", "__undef_uniform", "__undef_varying", "__vec4_add_float", @@ -1050,6 +1059,8 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod symbolTable); lDefineConstantInt("__math_lib_ispc_fast", (int)Globals::Math_ISPCFast, module, symbolTable); + lDefineConstantInt("__math_lib_svml", (int)Globals::Math_SVML, module, + symbolTable); lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module, symbolTable); lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index 8fb2e427..d9e0322b 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -134,6 +134,23 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly always ret <16 x float> %call } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +; FIXME: need either to wire these up to the 8-wide SVML entrypoints, +; or, use the macro to call the 4-wide ones 4x with our 16-wide +; vectors... + +declare <16 x float> @__svml_sin(<16 x float>) +declare <16 x float> @__svml_cos(<16 x float>) +declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) +declare <16 x float> @__svml_tan(<16 x float>) +declare <16 x float> @__svml_atan(<16 x float>) +declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) +declare <16 x float> @__svml_exp(<16 x float>) +declare <16 x float> @__svml_log(<16 x float>) +declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index adaed9ba..90e2f3ac 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -134,6 +134,23 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +; FIXME: need either to wire these up to the 8-wide SVML entrypoints, +; or, use the macro to call the 4-wide ones twice with our 8-wide +; vectors... 
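;; Editor's note (not part of the patch): of the two options in the FIXME above, the second is
;; what the 8-wide SSE targets later in this commit already do -- target-sse2-x2.ll and
;; target-sse4-x2.ll wrap the 4-wide SVML entry points with the unary4to8()/binary4to8()
;; macros, e.g.
;;   unary4to8(ret, float, @__svml_sinf4, %0)
;; which calls @__svml_sinf4 on each 4-wide half and reassembles the 8-wide result; the AVX
;; targets keep bare declarations for now.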
+ +declare <8 x float> @__svml_sin(<8 x float>) +declare <8 x float> @__svml_cos(<8 x float>) +declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) +declare <8 x float> @__svml_tan(<8 x float>) +declare <8 x float> @__svml_atan(<8 x float>) +declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) +declare <8 x float> @__svml_exp(<8 x float>) +declare <8 x float> @__svml_log(<8 x float>) +declare <8 x float> @__svml_pow(<8 x float>, <8 x float>) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 3472c207..31ebcdd5 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -647,6 +647,104 @@ define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alw } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; svml stuff + +define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0) + ;ret <1 x float> %ret + ;%r = extractelement <1 x float> %0, i32 0 + ;%s = call float @llvm.sin.f32(float %r) + ;%rv = insertelement <1 x float> undef, float %r, i32 0 + ;ret <1 x float> %rv + unary1to1(float,@llvm.sin.f32) + +} + +define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0) + ;ret <1 x float> %ret + ;%r = extractelement <1 x float> %0, i32 0 + ;%s = call float @llvm.cos.f32(float %r) + ;%rv = insertelement <1 x float> undef, float %r, i32 0 + ;ret <1 x float> %rv + unary1to1(float, @llvm.cos.f32) + +} + +define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline { +; %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0) +; store <1 x float> %s, <1 x float> * %1 +; ret void + %sin = call <1 x float> @__svml_sin (<1 x float> %0) + %cos = call <1 x float> @__svml_cos (<1 x float> %0) + store <1 x float> %sin, <1 x float> * %1 + store <1 x float> %cos, <1 x float> * %2 + ret void +} + +define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0) + ;ret <1 x float> %ret + ;%r = extractelement <1 x float> %0, i32 0 + ;%s = call float @llvm_tan_f32(float %r) + ;%rv = insertelement <1 x float> undef, float %r, i32 0 + ;ret <1 x float> %rv + ;unasry1to1(float, @llvm.tan.f32) + ; UNSUPPORTED! + ret <1 x float > %0 +} + +define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline { +; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0) +; ret <1 x float> %ret + ;%r = extractelement <1 x float> %0, i32 0 + ;%s = call float @llvm_atan_f32(float %r) + ;%rv = insertelement <1 x float> undef, float %r, i32 0 + ;ret <1 x float> %rv + ;unsary1to1(float,@llvm.atan.f32) + ;UNSUPPORTED! + ret <1 x float > %0 + +} + +define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1) + ;ret <1 x float> %ret + ;%y = extractelement <1 x float> %0, i32 0 + ;%x = extractelement <1 x float> %1, i32 0 + ;%q = fdiv float %y, %x + ;%a = call float @llvm.atan.f32 (float %q) + ;%rv = insertelement <1 x float> undef, float %a, i32 0 + ;ret <1 x float> %rv + ; UNSUPPORTED! 
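;; Editor's note (not part of the patch): __svml_tan, __svml_atan and __svml_atan2 in this
;; generic-1 target are stubs that simply return their first argument -- the commented-out
;; bodies above suggest there is no llvm.tan/llvm.atan intrinsic to lower onto, unlike
;; sin/cos/exp/log, which go through unary1to1() onto the llvm.sin.f32-style intrinsics
;; (and pow, which calls llvm.pow.f32 on the extracted scalar directly).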
+ ret <1 x float > %0 +} + +define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_expf4(<1 x float> %0) + ;ret <1 x float> %ret + unary1to1(float, @llvm.exp.f32) +} + +define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_logf4(<1 x float> %0) + ;ret <1 x float> %ret + unary1to1(float, @llvm.log.f32) +} + +define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1) + ;ret <1 x float> %ret + %r = extractelement <1 x float> %0, i32 0 + %e = extractelement <1 x float> %1, i32 0 + %s = call float @llvm.pow.f32(float %r,float %e) + %rv = insertelement <1 x float> undef, float %s, i32 0 + ret <1 x float> %rv + +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index c683ff45..2896c6b1 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -202,6 +202,22 @@ declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone declare i32 @__count_leading_zeros_i32(i32) nounwind readnone declare i64 @__count_leading_zeros_i64(i64) nounwind readnone +;; svml + +; FIXME: need either to wire these up to the 8-wide SVML entrypoints, +; or, use the macro to call the 4-wide ones twice with our 8-wide +; vectors... + +declare @__svml_sin() +declare @__svml_cos() +declare void @__svml_sincos(, *, *) +declare @__svml_tan() +declare @__svml_atan() +declare @__svml_atan2(, ) +declare @__svml_exp() +declare @__svml_log() +declare @__svml_pow(, ) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions diff --git a/builtins/target-neon-common.ll b/builtins/target-neon-common.ll index f892a0a1..696b0748 100644 --- a/builtins/target-neon-common.ll +++ b/builtins/target-neon-common.ll @@ -313,6 +313,19 @@ define void @__masked_store_blend_i64(* nocapture %ptr, ret void } +;; yuck. We need declarations of these, even though we shouldnt ever +;; actually generate calls to them for the NEON target... 
+ +declare @__svml_sin() +declare @__svml_cos() +declare void @__svml_sincos(, *, *) +declare @__svml_tan() +declare @__svml_atan() +declare @__svml_atan2(, ) +declare @__svml_exp() +declare @__svml_log() +declare @__svml_pow(, ) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 057ea98f..da22a66c 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -102,6 +102,92 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; svml stuff + +declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone +declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone + + +define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_sinf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_cosf4, %0) + ret <8 x float> %ret +} + +define void @__svml_sincos(<8 x float>, <8 x float> *, + <8 x float> *) nounwind readnone alwaysinline { + ; call svml_sincosf4 two times with the two 4-wide sub-vectors + %a = shufflevector <8 x float> %0, <8 x float> undef, + <4 x i32> + %b = shufflevector <8 x float> %0, <8 x float> undef, + <4 x i32> + + %cospa = alloca <4 x float> + %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) + + %cospb = alloca <4 x float> + %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) + + %sin = shufflevector <4 x float> %sa, <4 x float> %sb, + <8 x i32> + store <8 x float> %sin, <8 x float> * %1 + + %cosa = load <4 x float> * %cospa + %cosb = load <4 x float> * %cospb + %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, + <8 x i32> + store <8 x float> %cos, <8 x float> * %2 + + ret void +} + +define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_tanf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_atanf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_atan2(<8 x float>, + <8 x float>) nounwind readnone alwaysinline { + binary4to8(ret, float, @__svml_atan2f4, %0, %1) + ret <8 x float> %ret +} + +define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_expf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_logf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_pow(<8 x float>, + <8 x float>) nounwind readnone alwaysinline { + binary4to8(ret, float, @__svml_powf4, %0, %1) + ret <8 x float> %ret +} + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git 
a/builtins/target-sse2.ll b/builtins/target-sse2.ll index e0a5c3d5..a6b206b6 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -493,6 +493,66 @@ define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysin ret <4 x float> %call } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; svml stuff + +declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone +declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone + + +define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) + ret <4 x float> %ret +} + +define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { + %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) + store <4 x float> %s, <4 x float> * %1 + ret void +} + +define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) + ret <4 x float> %ret +} + +define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_expf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_logf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) + ret <4 x float> %ret +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index b4772552..d7f3833d 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -205,6 +205,21 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r ret <8 x double> %ret } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +; FIXME + +declare <8 x float> @__svml_sin(<8 x float>) +declare <8 x float> @__svml_cos(<8 x float>) +declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) +declare <8 x float> @__svml_tan(<8 x float>) +declare <8 x float> @__svml_atan(<8 x float>) +declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) +declare <8 x float> @__svml_exp(<8 x float>) +declare <8 x float> @__svml_log(<8 x float>) +declare <8 x float> @__svml_pow(<8 x float>, <8 x 
float>) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index a75d8e3a..fd4b74d7 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -217,6 +217,21 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin ret <16 x double> %ret } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +; FIXME + +declare <16 x float> @__svml_sin(<16 x float>) +declare <16 x float> @__svml_cos(<16 x float>) +declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) +declare <16 x float> @__svml_tan(<16 x float>) +declare <16 x float> @__svml_atan(<16 x float>) +declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) +declare <16 x float> @__svml_exp(<16 x float>) +declare <16 x float> @__svml_log(<16 x float>) +declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 897a09eb..a7faddb3 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -102,6 +102,92 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; svml stuff + +declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone +declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone + + +define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_sinf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_cosf4, %0) + ret <8 x float> %ret +} + +define void @__svml_sincos(<8 x float>, <8 x float> *, + <8 x float> *) nounwind readnone alwaysinline { + ; call svml_sincosf4 two times with the two 4-wide sub-vectors + %a = shufflevector <8 x float> %0, <8 x float> undef, + <4 x i32> + %b = shufflevector <8 x float> %0, <8 x float> undef, + <4 x i32> + + %cospa = alloca <4 x float> + %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) + + %cospb = alloca <4 x float> + %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) + + %sin = shufflevector <4 x float> %sa, <4 x float> %sb, + <8 x i32> + store <8 x float> %sin, <8 x float> * %1 + + %cosa = load <4 x float> * %cospa + %cosb = load <4 x float> * %cospb + %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, + <8 x i32> + store <8 x float> %cos, <8 x float> * %2 + + ret void +} + +define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_tanf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_atanf4, 
%0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_atan2(<8 x float>, + <8 x float>) nounwind readnone alwaysinline { + binary4to8(ret, float, @__svml_atan2f4, %0, %1) + ret <8 x float> %ret +} + +define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_expf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_logf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_pow(<8 x float>, + <8 x float>) nounwind readnone alwaysinline { + binary4to8(ret, float, @__svml_powf4, %0, %1) + ret <8 x float> %ret +} + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 5429b461..e05b865f 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -206,6 +206,66 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r ret <4 x double> %ret } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; svml stuff + +declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone +declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone + + +define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) + ret <4 x float> %ret +} + +define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { + %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) + store <4 x float> %s, <4 x float> * %1 + ret void +} + +define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) + ret <4 x float> %ret +} + +define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_expf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_logf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) + ret <4 x float> %ret +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/docs/ispc.rst b/docs/ispc.rst index 
476046e8..ff07f6d8 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -3333,6 +3333,9 @@ for this argument. approximately 1.45e-6 over the range -10pi to 10pi.) * ``fast``: more efficient but lower accuracy versions of the default ``ispc`` implementations. +* ``svml``: use Intel "Short Vector Math Library". Use + ``icc`` to link your final executable so that the appropriate libraries + are linked. * ``system``: use the system's math library. On many systems, these functions are more accurate than both of ``ispc``'s implementations. Using these functions may be quite diff --git a/ispc.h b/ispc.h index fc78e415..4804832f 100644 --- a/ispc.h +++ b/ispc.h @@ -488,7 +488,7 @@ struct Globals { /** There are a number of math libraries that can be used for transcendentals and the like during program compilation. */ - enum MathLib { Math_ISPC, Math_ISPCFast, Math_System }; + enum MathLib { Math_ISPC, Math_ISPCFast, Math_SVML, Math_System }; MathLib mathLib; /** Records whether the ispc standard library should be made available diff --git a/main.cpp b/main.cpp index 61c62042..21a47de8 100644 --- a/main.cpp +++ b/main.cpp @@ -112,6 +112,7 @@ usage(int ret) { printf(" [--math-lib= @@ -126,7 +130,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 32bit > $(Configuration)/gen-bitcode-sse4-32bit.cpp $(Configuration)/gen-bitcode-sse4-32bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-32bit.cpp @@ -135,7 +139,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 64bit > $(Configuration)/gen-bitcode-sse4-64bit.cpp $(Configuration)/gen-bitcode-sse4-64bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-64bit.cpp @@ -144,7 +148,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 32bit > $(Configuration)/gen-bitcode-sse4-8-32bit.cpp $(Configuration)/gen-bitcode-sse4-8-32bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-8-32bit.cpp @@ -153,7 +157,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 64bit > $(Configuration)/gen-bitcode-sse4-8-64bit.cpp $(Configuration)/gen-bitcode-sse4-8-64bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-8-64bit.cpp @@ -162,7 +166,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 32bit > $(Configuration)/gen-bitcode-sse4-16-32bit.cpp $(Configuration)/gen-bitcode-sse4-16-32bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-16-32bit.cpp @@ -171,7 +175,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 64bit > 
$(Configuration)/gen-bitcode-sse4-16-64bit.cpp $(Configuration)/gen-bitcode-sse4-16-64bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-16-64bit.cpp @@ -180,7 +184,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 32bit > $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-x2-32bit.cpp @@ -189,7 +193,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 64bit > $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-x2-64bit.cpp @@ -198,7 +202,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 32bit > $(Configuration)/gen-bitcode-sse2-32bit.cpp $(Configuration)/gen-bitcode-sse2-32bit.cpp - builtins\util.m4;builtins\target-sse2-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2-32bit.cpp @@ -207,7 +211,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 64bit > $(Configuration)/gen-bitcode-sse2-64bit.cpp $(Configuration)/gen-bitcode-sse2-64bit.cpp - builtins\util.m4;builtins\target-sse2-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2-64bit.cpp @@ -216,7 +220,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 32bit > $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp - builtins\util.m4;builtins\target-sse2-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2-x2-32bit.cpp @@ -225,7 +229,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 64bit > $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp - builtins\util.m4;builtins\target-sse2-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2-x2-64bit.cpp @@ -234,7 +238,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp $(Configuration)/gen-bitcode-avx1-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx1-32bit.cpp @@ -243,7 +247,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > 
$(Configuration)/gen-bitcode-avx1-64bit.cpp $(Configuration)/gen-bitcode-avx1-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx1-64bit.cpp @@ -252,7 +256,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 32bit > $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx1-x2-32bit.cpp @@ -261,16 +265,34 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 64bit > $(Configuration)/gen-bitcode-avx1-x2-64bit.cpp $(Configuration)/gen-bitcode-avx1-x2-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx1-x2-64bit.cpp + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp + $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll + Building gen-bitcode-avx1-i64x4-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx1-i64x4-64bit.cpp + $(Configuration)/gen-bitcode-avx1-i64x4-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll + Building gen-bitcode-avx1-i64x4-64bit.cpp + + Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 32bit > $(Configuration)/gen-bitcode-avx11-32bit.cpp $(Configuration)/gen-bitcode-avx11-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx11-32bit.cpp @@ -279,7 +301,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 64bit > $(Configuration)/gen-bitcode-avx11-64bit.cpp $(Configuration)/gen-bitcode-avx11-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx11-64bit.cpp @@ -288,7 +310,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 32bit > $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + 
builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx11-x2-32bit.cpp @@ -297,7 +319,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 64bit > $(Configuration)/gen-bitcode-avx11-x2-64bit.cpp $(Configuration)/gen-bitcode-avx11-x2-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx11-x2-64bit.cpp @@ -306,7 +328,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 32bit > $(Configuration)/gen-bitcode-avx2-32bit.cpp $(Configuration)/gen-bitcode-avx2-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx2-32bit.cpp @@ -315,7 +337,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 64bit > $(Configuration)/gen-bitcode-avx2-64bit.cpp $(Configuration)/gen-bitcode-avx2-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx2-64bit.cpp @@ -324,7 +346,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 32bit > $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx2-x2-32bit.cpp @@ -333,7 +355,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 64bit > $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx2-x2-64bit.cpp @@ -342,7 +364,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 32bit > $(Configuration)/gen-bitcode-generic-1-32bit.cpp $(Configuration)/gen-bitcode-generic-1-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-1-32bit.cpp @@ -351,7 +373,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 64bit > $(Configuration)/gen-bitcode-generic-1-64bit.cpp $(Configuration)/gen-bitcode-generic-1-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-1-64bit.cpp @@ -360,7 +382,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% 
-DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 32bit > $(Configuration)/gen-bitcode-generic-4-32bit.cpp $(Configuration)/gen-bitcode-generic-4-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-4-32bit.cpp @@ -369,7 +391,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 64bit > $(Configuration)/gen-bitcode-generic-4-64bit.cpp $(Configuration)/gen-bitcode-generic-4-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-4-64bit.cpp @@ -378,7 +400,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 32bit > $(Configuration)/gen-bitcode-generic-8-32bit.cpp $(Configuration)/gen-bitcode-generic-8-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-8-32bit.cpp @@ -387,7 +409,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 64bit > $(Configuration)/gen-bitcode-generic-8-64bit.cpp $(Configuration)/gen-bitcode-generic-8-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-8-64bit.cpp @@ -396,7 +418,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 32bit > $(Configuration)/gen-bitcode-generic-16-32bit.cpp $(Configuration)/gen-bitcode-generic-16-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-16-32bit.cpp @@ -405,7 +427,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 64bit > $(Configuration)/gen-bitcode-generic-16-64bit.cpp $(Configuration)/gen-bitcode-generic-16-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-16-64bit.cpp @@ -414,7 +436,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 32bit > $(Configuration)/gen-bitcode-generic-32-32bit.cpp $(Configuration)/gen-bitcode-generic-32-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-32-32bit.cpp @@ -423,7 +445,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 64bit > $(Configuration)/gen-bitcode-generic-32-64bit.cpp $(Configuration)/gen-bitcode-generic-32-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + 
builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-32-64bit.cpp @@ -432,7 +454,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 32bit > $(Configuration)/gen-bitcode-generic-64-32bit.cpp $(Configuration)/gen-bitcode-generic-64-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-64-32bit.cpp @@ -441,7 +463,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 64bit > $(Configuration)/gen-bitcode-generic-64-64bit.cpp $(Configuration)/gen-bitcode-generic-64-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-64-64bit.cpp From 97068765e884599afc4cc4b7187a4de4dd509b46 Mon Sep 17 00:00:00 2001 From: Tomasz Koziara Date: Sat, 14 Sep 2013 18:09:04 +0100 Subject: [PATCH 070/124] Copyright reversed. --- examples/sort/sort.cpp | 4 ++-- examples/sort/sort.ispc | 4 ++-- examples/sort/sort_serial.cpp | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/sort/sort.cpp b/examples/sort/sort.cpp index 1d05b247..4f402c75 100644 --- a/examples/sort/sort.cpp +++ b/examples/sort/sort.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2013, Intel Corporation + Copyright (c) 2013, Durham University All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,7 +13,7 @@ notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its + * Neither the name of Durham University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/examples/sort/sort.ispc b/examples/sort/sort.ispc index 5fc89d91..25ea90f4 100644 --- a/examples/sort/sort.ispc +++ b/examples/sort/sort.ispc @@ -1,5 +1,5 @@ /* - Copyright (c) 2013, Intel Corporation + Copyright (c) 2013, Durham University All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,7 +13,7 @@ notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its + * Neither the name of Durham University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/examples/sort/sort_serial.cpp b/examples/sort/sort_serial.cpp index ba955c77..38bbdda6 100644 --- a/examples/sort/sort_serial.cpp +++ b/examples/sort/sort_serial.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2013, Intel Corporation + Copyright (c) 2013, Durham University All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,7 +13,7 @@ notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- * Neither the name of Intel Corporation nor the names of its + * Neither the name of Durham University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. From e2a91e6de5fdcd370b903b2670e76be14c60dc09 Mon Sep 17 00:00:00 2001 From: egaburov Date: Mon, 16 Sep 2013 15:54:32 +0200 Subject: [PATCH 071/124] added support for "d"-suffix --- lex.ll | 20 +++++++++++++++++++- parse.yy | 11 ++++++++--- stdlib.ispc | 12 ++++++++++++ 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/lex.ll b/lex.ll index 8baa627a..c2990ccc 100644 --- a/lex.ll +++ b/lex.ll @@ -76,7 +76,7 @@ static int allTokens[] = { TOKEN_TASK, TOKEN_TRUE, TOKEN_TYPEDEF, TOKEN_UNIFORM, TOKEN_UNMASKED, TOKEN_UNSIGNED, TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE, TOKEN_STRING_C_LITERAL, TOKEN_DOTDOTDOT, - TOKEN_FLOAT_CONSTANT, + TOKEN_FLOAT_CONSTANT, TOKEN_DOUBLE_CONSTANT, TOKEN_INT8_CONSTANT, TOKEN_UINT8_CONSTANT, TOKEN_INT16_CONSTANT, TOKEN_UINT16_CONSTANT, TOKEN_INT32_CONSTANT, TOKEN_UINT32_CONSTANT, @@ -152,6 +152,7 @@ void ParserInit() { tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\""; tokenToName[TOKEN_DOTDOTDOT] = "..."; tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT"; + tokenToName[TOKEN_DOUBLE_CONSTANT] = "TOKEN_DOUBLE_CONSTANT"; tokenToName[TOKEN_INT8_CONSTANT] = "TOKEN_INT8_CONSTANT"; tokenToName[TOKEN_UINT8_CONSTANT] = "TOKEN_UINT8_CONSTANT"; tokenToName[TOKEN_INT16_CONSTANT] = "TOKEN_INT16_CONSTANT"; @@ -266,6 +267,7 @@ void ParserInit() { tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\""; tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'"; tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant"; + tokenNameRemap["TOKEN_DOUBLE_CONSTANT"] = "double constant"; tokenNameRemap["TOKEN_INT8_CONSTANT"] = "int8 constant"; tokenNameRemap["TOKEN_UINT8_CONSTANT"] = "unsigned int8 constant"; tokenNameRemap["TOKEN_INT16_CONSTANT"] = "int16 constant"; @@ -343,6 +345,8 @@ INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]* INT_NUMBER_DOTDOTDOT (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*\.\.\. FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?) HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?) +DOUBLE_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[dD]?)|(\.[0-9]+)))([dD][-+]?[0-9]+)?[dD]?) +HEX_DOUBLE_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[dD]?) 
IDENT [a-zA-Z_][a-zA-Z_0-9]* ZO_SWIZZLE ([01]+[w-z]+)+|([01]+[rgba]+)+|([01]+[uv]+)+ @@ -438,6 +442,7 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA } + {FLOAT_NUMBER} { RT; yylval.floatVal = (float)atof(yytext); @@ -450,6 +455,19 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA return TOKEN_FLOAT_CONSTANT; } +{DOUBLE_NUMBER} { + RT; + yylval.doubleVal = atof(yytext); + return TOKEN_DOUBLE_CONSTANT; +} + +{HEX_DOUBLE_NUMBER} { + RT; + yylval.doubleVal = lParseHexFloat(yytext); + return TOKEN_DOUBLE_CONSTANT; +} + + "++" { RT; return TOKEN_INC_OP; } "--" { RT; return TOKEN_DEC_OP; } "<<" { RT; return TOKEN_LEFT_OP; } diff --git a/parse.yy b/parse.yy index 9a2b4fc3..933a3455 100644 --- a/parse.yy +++ b/parse.yy @@ -149,7 +149,8 @@ struct ForeachDimension { %union { uint64_t intVal; - float floatVal; + float floatVal; + double doubleVal; std::string *stringVal; const char *constCharPtr; @@ -185,7 +186,7 @@ struct ForeachDimension { %token TOKEN_INT64_CONSTANT TOKEN_UINT64_CONSTANT %token TOKEN_INT32DOTDOTDOT_CONSTANT TOKEN_UINT32DOTDOTDOT_CONSTANT %token TOKEN_INT64DOTDOTDOT_CONSTANT TOKEN_UINT64DOTDOTDOT_CONSTANT -%token TOKEN_FLOAT_CONSTANT TOKEN_STRING_C_LITERAL +%token TOKEN_FLOAT_CONSTANT TOKEN_DOUBLE_CONSTANT TOKEN_STRING_C_LITERAL %token TOKEN_IDENTIFIER TOKEN_STRING_LITERAL TOKEN_TYPE_NAME TOKEN_NULL %token TOKEN_PTR_OP TOKEN_INC_OP TOKEN_DEC_OP TOKEN_LEFT_OP TOKEN_RIGHT_OP %token TOKEN_LE_OP TOKEN_GE_OP TOKEN_EQ_OP TOKEN_NE_OP @@ -327,7 +328,11 @@ primary_expression } | TOKEN_FLOAT_CONSTANT { $$ = new ConstExpr(AtomicType::UniformFloat->GetAsConstType(), - (float)yylval.floatVal, @1); + yylval.floatVal, @1); + } + | TOKEN_DOUBLE_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformDouble->GetAsConstType(), + yylval.doubleVal, @1); } | TOKEN_TRUE { $$ = new ConstExpr(AtomicType::UniformBool->GetAsConstType(), true, @1); diff --git a/stdlib.ispc b/stdlib.ispc index 6d7ee051..0d5c4efd 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1559,6 +1559,18 @@ static inline uniform float clamp(uniform float v, uniform float low, uniform fl return min(max(v, low), high); } +// double + +__declspec(safe,cost2) +static inline double clamp(double v, double low, double high) { + return min(max(v, low), high); +} + +__declspec(safe,cost2) +static inline uniform double clamp(uniform double v, uniform double low, uniform double high) { + return min(max(v, low), high); +} + // int8 __declspec(safe,cost2) From 233249048127b94cdb073e694f18987b643741d2 Mon Sep 17 00:00:00 2001 From: egaburov Date: Mon, 16 Sep 2013 16:31:41 +0200 Subject: [PATCH 072/124] added fortran_double_constant --- lex.ll | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/lex.ll b/lex.ll index c2990ccc..3d88a23a 100644 --- a/lex.ll +++ b/lex.ll @@ -345,8 +345,7 @@ INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]* INT_NUMBER_DOTDOTDOT (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*\.\.\. FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?) HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?) -DOUBLE_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[dD]?)|(\.[0-9]+)))([dD][-+]?[0-9]+)?[dD]?) -HEX_DOUBLE_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[dD]?) 
+FORTRAN_DOUBLE_NUMBER (([0-9]+\.[0-9]*[dD])|([0-9]+\.[0-9]*[dD][-+]?[0-9])|([0-9]+[dD][-+]?[0-9])) IDENT [a-zA-Z_][a-zA-Z_0-9]* ZO_SWIZZLE ([01]+[w-z]+)+|([01]+[rgba]+)+|([01]+[uv]+)+ @@ -455,18 +454,19 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA return TOKEN_FLOAT_CONSTANT; } -{DOUBLE_NUMBER} { +{FORTRAN_DOUBLE_NUMBER} { RT; + { + int i = 0; + while (yytext[i] != 'd') i++; + if ((yytext[i+1] >= '0' && yytext[i+1] <= '9') + || yytext[i+1] == '+' || yytext[i+1] == '-') + yytext[i] = 'E'; + } yylval.doubleVal = atof(yytext); return TOKEN_DOUBLE_CONSTANT; } -{HEX_DOUBLE_NUMBER} { - RT; - yylval.doubleVal = lParseHexFloat(yytext); - return TOKEN_DOUBLE_CONSTANT; -} - "++" { RT; return TOKEN_INC_OP; } "--" { RT; return TOKEN_DEC_OP; } From 6fd21d988d999b62aa0e2832cd93ccdb4ca78f77 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Mon, 16 Sep 2013 17:15:02 +0200 Subject: [PATCH 073/124] fixed lexer to properly read fortran-notation double constants --- lex.ll | 26 +++++++++++++------------- stdlib.ispc | 4 ++-- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/lex.ll b/lex.ll index 3d88a23a..ca318dbb 100644 --- a/lex.ll +++ b/lex.ll @@ -345,7 +345,9 @@ INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]* INT_NUMBER_DOTDOTDOT (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*\.\.\. FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?) HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?) -FORTRAN_DOUBLE_NUMBER (([0-9]+\.[0-9]*[dD])|([0-9]+\.[0-9]*[dD][-+]?[0-9])|([0-9]+[dD][-+]?[0-9])) +FORTRAN_DOUBLE_NUMBER (([0-9]+\.[0-9]*[dD])|([0-9]+\.[0-9]*[dD][-+]?[0-9]+)|([0-9]+[dD][-+]?[0-9]+)) + + IDENT [a-zA-Z_][a-zA-Z_0-9]* ZO_SWIZZLE ([01]+[w-z]+)+|([01]+[rgba]+)+|([01]+[uv]+)+ @@ -440,6 +442,16 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA return lParseInteger(true); } +{FORTRAN_DOUBLE_NUMBER} { + RT; + { + int i = 0; + while (yytext[i] != 'd') i++; + yytext[i] = 'E'; + } + yylval.doubleVal = atof(yytext); + return TOKEN_DOUBLE_CONSTANT; +} {FLOAT_NUMBER} { @@ -454,18 +466,6 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA return TOKEN_FLOAT_CONSTANT; } -{FORTRAN_DOUBLE_NUMBER} { - RT; - { - int i = 0; - while (yytext[i] != 'd') i++; - if ((yytext[i+1] >= '0' && yytext[i+1] <= '9') - || yytext[i+1] == '+' || yytext[i+1] == '-') - yytext[i] = 'E'; - } - yylval.doubleVal = atof(yytext); - return TOKEN_DOUBLE_CONSTANT; -} "++" { RT; return TOKEN_INC_OP; } diff --git a/stdlib.ispc b/stdlib.ispc index 0d5c4efd..9b02d0ba 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -2564,7 +2564,7 @@ static inline float acos(float v) { __declspec(safe) static inline double acos(const double v) { - return 1.57079637050628662109375 - asin(v); + return 1.57079637050628662109375d0 - asin(v); } @@ -2575,7 +2575,7 @@ static inline uniform float acos(uniform float v) { __declspec(safe) static inline uniform double acos(const uniform double v) { - return 1.57079637050628662109375 - asin(v); + return 1.57079637050628662109375d0 - asin(v); } From eef4e11768222914ffb93ccc1ab698e1cfbd7922 Mon Sep 17 00:00:00 2001 From: egaburov Date: Mon, 16 Sep 2013 17:25:13 +0200 Subject: [PATCH 074/124] now it is also case nonsensitive --- lex.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lex.ll b/lex.ll index ca318dbb..f1dcaa6f 100644 --- a/lex.ll +++ b/lex.ll @@ -446,7 +446,7 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return 
TOKEN_STRING_LITERA RT; { int i = 0; - while (yytext[i] != 'd') i++; + while (yytext[i] != 'd' && yytext[i] != 'D') i++; yytext[i] = 'E'; } yylval.doubleVal = atof(yytext); From 6e0b9ddc74a4480e97d9b19c66e4ad8de5d5198a Mon Sep 17 00:00:00 2001 From: Tomasz Koziara Date: Mon, 16 Sep 2013 18:02:07 +0100 Subject: [PATCH 075/124] Sort description. --- examples/README.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/README.txt b/examples/README.txt index 5b47df44..b67529c1 100644 --- a/examples/README.txt +++ b/examples/README.txt @@ -146,6 +146,11 @@ This is a simple "hello world" type program that shows a ~10 line application program calling out to a ~5 line ispc program to do a simple computation. +Sort +==== +This is a bucket sort of 32-bit unsigned integers. +By default, 1000000 random elements are sorted. +Run ./sort N to sort N elements instead. Volume ====== From fa78d548ccc17c4a844762bd5660e49d941f9383 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 17 Sep 2013 23:36:16 +0400 Subject: [PATCH 076/124] Test, documentation and vim support for double precision constants --- contrib/ispc.vim | 5 +++++ docs/ispc.rst | 11 ++++++++++- tests/double-consts.ispc | 23 +++++++++++++++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 tests/double-consts.ispc diff --git a/contrib/ispc.vim b/contrib/ispc.vim index cc8493f0..4d870dcd 100644 --- a/contrib/ispc.vim +++ b/contrib/ispc.vim @@ -19,6 +19,11 @@ syn keyword ispcRepeat cdo cfor cwhile syn keyword ispcBuiltin programCount programIndex syn keyword ispcType export uniform varying int8 int16 int32 int64 +"double precision floating point number, with dot, optional exponent +syn match cFloat display contained "\d\+\.\d*d[-+]\=\d*\>" +"double precision floating point number, without dot, with exponent +syn match cFloat display contained "\d\+d[-+]\=\d\+\>" + " Default highlighting command -nargs=+ HiLink hi def link HiLink ispcStatement Statement diff --git a/docs/ispc.rst b/docs/ispc.rst index ff07f6d8..224faaa9 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -270,6 +270,14 @@ new reserved words: ``unmasked``, ``foreach_unique``, ``foreach_active``, and ``in``. Any program that happens to have a variable or function with one of these names must be modified to rename that symbol. +Updating ISPC Programs For Changes In ISPC 1.4.5 +------------------------------------------------ + +This release adds support for double precision floating point constants. +Double precision floating point constants are floating point numbers with a +``d`` suffix and an optional exponent part. Here are some examples: 3.14d, +31.4d-1, 1.d, 1.0d, 1d-2. Note that a floating point number without a suffix +is treated as a single precision constant. Getting Started with ISPC ========================= @@ -1349,7 +1357,8 @@ but are likely to be supported in future releases: * Bitfield members of ``struct`` types * Variable numbers of arguments to functions * Literal floating-point constants (even without a ``f`` suffix) are - currently treated as being ``float`` type, not ``double`` + currently treated as being ``float`` type, not ``double``. To get a double + precision floating point constant, use the ``d`` suffix. * The ``volatile`` qualifier * The ``register`` storage class for variables. (Will be ignored). 
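As a cross-check of the lexer changes above, the following is a minimal standalone C++ sketch of what the new FORTRAN_DOUBLE_NUMBER action boils down to; it is not part of the patch, and parse_fortran_double() is a hypothetical helper. The first 'd' or 'D' exponent marker is rewritten to 'E' and the text is handed to atof(), so constants such as 31.4d-1 or 1.57079637050628662109375d0 parse as doubles.

#include <cstdio>
#include <cstdlib>
#include <cstring>

// Hypothetical helper mirroring the FORTRAN_DOUBLE_NUMBER lexer action:
// rewrite the first 'd'/'D' to 'E' so that atof() accepts the text.
static double parse_fortran_double(const char *text) {
    char buf[64];
    strncpy(buf, text, sizeof(buf) - 1);
    buf[sizeof(buf) - 1] = '\0';
    for (char *p = buf; *p; ++p) {
        if (*p == 'd' || *p == 'D') {
            *p = 'E';
            break;
        }
    }
    return atof(buf);
}

int main() {
    printf("%.17g\n", parse_fortran_double("31.4d-1"));                     // 3.14
    printf("%.17g\n", parse_fortran_double("1.57079637050628662109375d0")); // ~pi/2
    return 0;
}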
diff --git a/tests/double-consts.ispc b/tests/double-consts.ispc new file mode 100644 index 00000000..3259156a --- /dev/null +++ b/tests/double-consts.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + double a = aFOO[programIndex]; + // Test parsing of double constants. + double d1 = 1.0d40; + double d2 = 1.d40; + double d3 = 1d40; + double d4 = 10000000000000000000000000000000000000000.d; + double d5 = 10000000000000000000000000000000000000000.0d; + + // All the constants should be equal and if it's evaluated as "float", + // then sqrt will evaluate to +inf. + if (d1 == d2 && d1 == d3 && d1 == d4 && d1 == d5 && + ((float)sqrt(d1)) < 2e20) { + RET[programIndex] = a; + } +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} From 922edb11281ae432bc1647445dfa556de8fd663f Mon Sep 17 00:00:00 2001 From: evghenii Date: Wed, 18 Sep 2013 18:14:07 +0300 Subject: [PATCH 077/124] completed knc-i1x16.h and added knc-i1x8.h with knc-i1x8unsafe_fast.h that doesnt pass several tests.. --- examples/intrinsics/knc-i1x16.h | 3092 +++++++++++++++++++++ examples/intrinsics/knc-i1x8.h | 2862 +++++++++++++++++++ examples/intrinsics/knc-i1x8unsafe_fast.h | 2 + run_tests.py | 7 +- 4 files changed, 5961 insertions(+), 2 deletions(-) create mode 100644 examples/intrinsics/knc-i1x16.h create mode 100644 examples/intrinsics/knc-i1x8.h create mode 100644 examples/intrinsics/knc-i1x8unsafe_fast.h diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h new file mode 100644 index 00000000..8b1a2bb9 --- /dev/null +++ b/examples/intrinsics/knc-i1x16.h @@ -0,0 +1,3092 @@ +/** + Copyright (c) 2010-2012, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +#define FORCEINLINE __forceinline +#define PRE_ALIGN(x) /*__declspec(align(x))*/ +#define POST_ALIGN(x) +#define roundf(x) (floorf(x + .5f)) +#define round(x) (floor(x + .5)) +#else +#define FORCEINLINE __attribute__((always_inline)) +#define PRE_ALIGN(x) +#define POST_ALIGN(x) __attribute__ ((aligned(x))) +#endif + +#if 0 +#define KNC 1 +extern "C" +{ + int printf(const unsigned char *, ...); + int puts(unsigned char *); + unsigned int putchar(unsigned int); + int fflush(void *); + uint8_t *memcpy(uint8_t *, uint8_t *, uint64_t); + uint8_t *memset(uint8_t *, uint8_t, uint64_t); + void memset_pattern16(void *, const void *, uint64_t); +} +#endif + +typedef float __vec1_f; +typedef double __vec1_d; +typedef int8_t __vec1_i8; +typedef int16_t __vec1_i16; +typedef int32_t __vec1_i32; +typedef int64_t __vec1_i64; + +struct __vec16_i1 { + __vec16_i1() { } + __vec16_i1(const __mmask16 &vv) : v(vv) { } + __vec16_i1(bool v0, bool v1, bool v2, bool v3, + bool v4, bool v5, bool v6, bool v7, + bool v8, bool v9, bool v10, bool v11, + bool v12, bool v13, bool v14, bool v15) { + v = ((v0 & 1) | + ((v1 & 1) << 1) | + ((v2 & 1) << 2) | + ((v3 & 1) << 3) | + ((v4 & 1) << 4) | + ((v5 & 1) << 5) | + ((v6 & 1) << 6) | + ((v7 & 1) << 7) | + ((v8 & 1) << 8) | + ((v9 & 1) << 9) | + ((v10 & 1) << 10) | + ((v11 & 1) << 11) | + ((v12 & 1) << 12) | + ((v13 & 1) << 13) | + ((v14 & 1) << 14) | + ((v15 & 1) << 15)); + } + + __mmask16 v; + FORCEINLINE operator __mmask16() const { return v; } +}; + + +template +struct vec16 { + vec16() { } + vec16(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) { + data[0] = v0; data[1] = v1; data[2] = v2; data[3] = v3; + data[4] = v4; data[5] = v5; data[6] = v6; data[7] = v7; + data[8] = v8; data[9] = v9; data[10] = v10; data[11] = v11; + data[12] = v12; data[13] = v13; data[14] = v14; data[15] = v15; + } + T data[16]; + FORCEINLINE const T& operator[](const int i) const { return data[i]; } + FORCEINLINE T& operator[](const int i) { return data[i]; } +}; + +#if 0 /* evghenii:i32 */ +struct PRE_ALIGN(64) __vec16_i32 : public vec16 { + __vec16_i32() { } + __vec16_i32(int32_t v0, int32_t v1, int32_t v2, int32_t v3, + int32_t v4, int32_t v5, int32_t v6, int32_t v7, + int32_t v8, int32_t v9, int32_t v10, int32_t v11, + int32_t v12, int32_t v13, int32_t v14, int32_t v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } +} POST_ALIGN(64); +#else /* evghenii:i32 */ +struct PRE_ALIGN(64) __vec16_i32 +{ + __m512i v; + FORCEINLINE operator __m512i() const { return v; } + FORCEINLINE __vec16_i32() : v(_mm512_undefined_epi32()) {} + FORCEINLINE __vec16_i32(const int32_t &in) : v(_mm512_set1_epi32(in)) {} + FORCEINLINE __vec16_i32(const __m512i &in) : v(in) {} + FORCEINLINE __vec16_i32(const __vec16_i32 &o) : v(o.v) {} + FORCEINLINE __vec16_i32& operator =(const __vec16_i32 &o) { v=o.v; return *this; } + FORCEINLINE __vec16_i32(int32_t v00, int32_t v01, int32_t v02, int32_t v03, + int32_t v04, int32_t v05, int32_t v06, int32_t v07, + int32_t v08, int32_t v09, int32_t v10, int32_t v11, + int32_t v12, int32_t v13, int32_t v14, int32_t v15) : + v ( _mm512_set_16to16_pi(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00) ) {} + FORCEINLINE const int32_t& operator[](const int i) const { return ((int32_t*)this)[i]; } + FORCEINLINE int32_t& operator[](const int i) { return ((int32_t*)this)[i]; } +} 
POST_ALIGN(64); +#endif /* evghenii:i32 */ + +#if 0 /* evghenii::f */ +PRE_ALIGN(64) struct __vec16_f : public vec16 { + __vec16_f() { } + __vec16_f(float v0, float v1, float v2, float v3, + float v4, float v5, float v6, float v7, + float v8, float v9, float v10, float v11, + float v12, float v13, float v14, float v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } + +} POST_ALIGN(64); +#else /* evghenii::f */ +PRE_ALIGN(64) struct __vec16_f +{ + __m512 v; + FORCEINLINE operator __m512() const { return v; } + FORCEINLINE __vec16_f() : v(_mm512_undefined_ps()) { } + FORCEINLINE __vec16_f(const __m512 &in) : v(in) {} + FORCEINLINE __vec16_f(const __vec16_f &o) : v(o.v) {} + FORCEINLINE __vec16_f& operator =(const __vec16_f &o) { v=o.v; return *this; } + FORCEINLINE __vec16_f(float v00, float v01, float v02, float v03, + float v04, float v05, float v06, float v07, + float v08, float v09, float v10, float v11, + float v12, float v13, float v14, float v15) : + v ( _mm512_set_16to16_ps(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00) ) {} + FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; } + FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } +} POST_ALIGN(64); +#endif /* evghenii::f */ + +#if 0 /* evghenii::d */ +PRE_ALIGN(128) struct __vec16_d : public vec16 { + __vec16_d() { } + __vec16_d(double v0, double v1, double v2, double v3, + double v4, double v5, double v6, double v7, + double v8, double v9, double v10, double v11, + double v12, double v13, double v14, double v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } + +} POST_ALIGN(128); +#else /* evghenii::d */ +struct PRE_ALIGN(128) __vec16_d +{ + __m512d v1; + __m512d v2; + FORCEINLINE __vec16_d() : v1(_mm512_undefined_pd()), v2(_mm512_undefined_pd()) {} + FORCEINLINE __vec16_d(const __m512d _v1, const __m512d _v2) : v1(_v1), v2(_v2) {} + FORCEINLINE __vec16_d(const __vec16_d &o) : v1(o.v1), v2(o.v2) {} + FORCEINLINE __vec16_d& operator =(const __vec16_d &o) { v1=o.v1; v2=o.v2; return *this; } + FORCEINLINE __vec16_d(double v00, double v01, double v02, double v03, + double v04, double v05, double v06, double v07, + double v08, double v09, double v10, double v11, + double v12, double v13, double v14, double v15) { + v1 = _mm512_set_8to8_pd(v15, v14, v13, v12, v11, v10, v09, v08); + v2 = _mm512_set_8to8_pd(v07, v06, v05, v04, v03, v02, v01, v00); + } + FORCEINLINE const double& operator[](const int i) const { return ((double*)this)[i]; } + FORCEINLINE double& operator[](const int i) { return ((double*)this)[i]; } +} POST_ALIGN(128); +#endif /* evghenii::d */ + +#if 1 /* evghenii::i64 */ +PRE_ALIGN(128) struct __vec16_i64 : public vec16 { + __vec16_i64() { } + __vec16_i64(int64_t v0, int64_t v1, int64_t v2, int64_t v3, + int64_t v4, int64_t v5, int64_t v6, int64_t v7, + int64_t v8, int64_t v9, int64_t v10, int64_t v11, + int64_t v12, int64_t v13, int64_t v14, int64_t v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } +} POST_ALIGN(128); +#else /* evghenii::i64 */ +struct PRE_ALIGN(64) __vec16_i64 { + FORCEINLINE __vec16_i64() : v_lo(_mm512_undefined_epi32()), v_hi(_mm512_undefined_epi32()) {} + FORCEINLINE __vec16_i64(const __vec16_i64 &o) : v_lo(o.v_lo), v_hi(o.v_hi) {} + FORCEINLINE __vec16_i64(__m512i l, __m512i h) : v_lo(l), v_hi(h) {} + FORCEINLINE __vec16_i64& operator =(const __vec16_i64 &o) { v_lo=o.v_lo; v_hi=o.v_hi; return 
*this; } + FORCEINLINE __vec16_i64(int64_t v00, int64_t v01, int64_t v02, int64_t v03, + int64_t v04, int64_t v05, int64_t v06, int64_t v07, + int64_t v08, int64_t v09, int64_t v10, int64_t v11, + int64_t v12, int64_t v13, int64_t v14, int64_t v15) { + __m512i v1 = _mm512_set_8to8_epi64(v15, v14, v13, v12, v11, v10, v09, v08); + __m512i v2 = _mm512_set_8to8_epi64(v07, v06, v05, v04, v03, v02, v01, v00); + v_hi = _mm512_mask_permutevar_epi32(v_hi, 0xFF00, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v1); + v_hi = _mm512_mask_permutevar_epi32(v_hi, 0x00FF, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v2); + v_lo = _mm512_mask_permutevar_epi32(v_lo, 0xFF00, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v1); + v_lo = _mm512_mask_permutevar_epi32(v_lo, 0x00FF, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v2); + } + __m512i v_hi; + __m512i v_lo; +} POST_ALIGN(64); + +#endif /* evghenii::i64 */ + +PRE_ALIGN(16) struct __vec16_i8 : public vec16 { + __vec16_i8() { } + __vec16_i8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, + int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } +} POST_ALIGN(16); + +PRE_ALIGN(32) struct __vec16_i16 : public vec16 { + __vec16_i16() { } + __vec16_i16(int16_t v0, int16_t v1, int16_t v2, int16_t v3, + int16_t v4, int16_t v5, int16_t v6, int16_t v7, + int16_t v8, int16_t v9, int16_t v10, int16_t v11, + int16_t v12, int16_t v13, int16_t v14, int16_t v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } +} POST_ALIGN(32); + +static inline int32_t __extract_element(__vec16_i32, int); + + +/////////////////////////////////////////////////////////////////////////// +// macros... 
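/* Note (illustrative comment, not from the committed header): the macros
   below stamp out straightforward scalar-loop implementations for the
   vector types that have no native KNC intrinsic path in this header
   (int8, int16, int64).  As a rough illustration of how they expand,
   BINARY_OP(__vec16_i8, __add, +) produces approximately:

       static FORCEINLINE __vec16_i8 __add(__vec16_i8 a, __vec16_i8 b) {
           __vec16_i8 ret;
           for (int i = 0; i < 16; ++i)
               ret[i] = a[i] + b[i];   // element-wise op over all 16 lanes
           return ret;
       }

   The __m512-based paths further below replace these loops for the
   int32 and float cases. */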
+ +#define UNARY_OP(TYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE v) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = OP(v[i]); \ + return ret; \ +} + +#define BINARY_OP(TYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = a[i] OP b[i]; \ + return ret; \ +} + +#define BINARY_OP_CAST(TYPE, CAST, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = (CAST)(a[i]) OP (CAST)(b[i]); \ + return ret; \ +} + +#define BINARY_OP_FUNC(TYPE, NAME, FUNC) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = FUNC(a[i], b[i]); \ + return ret; \ +} + +#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP) \ +static FORCEINLINE __vec16_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \ + __vec16_i1 ret; \ + ret.v = 0; \ + for (int i = 0; i < 16; ++i) \ + ret.v |= ((CAST)(a[i]) OP (CAST)(b[i])) << i; \ + return ret; \ +} \ +static FORCEINLINE __vec16_i1 NAME##_##SUFFIX##_and_mask(TYPE a, TYPE b, \ + __vec16_i1 mask) { \ + __vec16_i1 ret; \ + ret.v = 0; \ + for (int i = 0; i < 16; ++i) \ + ret.v |= ((CAST)(a[i]) OP (CAST)(b[i])) << i; \ + ret.v &= mask.v; \ + return ret; \ +} + +#define INSERT_EXTRACT(VTYPE, STYPE) \ +static FORCEINLINE STYPE __extract_element(VTYPE v, int index) { \ + return ((STYPE *)&v)[index]; \ +} \ +static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \ + ((STYPE *)v)[index] = val; \ +} + +#define LOAD_STORE(VTYPE, STYPE) \ +template \ +static FORCEINLINE VTYPE __load(const VTYPE *p) { \ + STYPE *ptr = (STYPE *)p; \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = ptr[i]; \ + return ret; \ +} \ +template \ +static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \ + STYPE *ptr = (STYPE *)p; \ + for (int i = 0; i < 16; ++i) \ + ptr[i] = v[i]; \ +} + +#define LOADS(VTYPE, STYPE) \ +template \ +static FORCEINLINE VTYPE __load(const VTYPE *p) { \ + STYPE *ptr = (STYPE *)p; \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = ptr[i]; \ + return ret; \ +} \ + +#define STORES(VTYPE, STYPE) \ +template \ +static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \ + STYPE *ptr = (STYPE *)p; \ + for (int i = 0; i < 16; ++i) \ + ptr[i] = v[i]; \ +} + +#define REDUCE_ADD(TYPE, VTYPE, NAME) \ +static FORCEINLINE TYPE NAME(VTYPE v) { \ + TYPE ret = v[0]; \ + for (int i = 1; i < 16; ++i) \ + ret = ret + v[i]; \ + return ret; \ +} + +#define REDUCE_MINMAX(TYPE, VTYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(VTYPE v) { \ + TYPE ret = v[0]; \ + for (int i = 1; i < 16; ++i) \ + ret = (ret OP (TYPE)v[i]) ? 
ret : (TYPE)v[i]; \ + return ret; \ +} + +#define SELECT(TYPE) \ +static FORCEINLINE TYPE __select(__vec16_i1 mask, TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = (mask.v & (1< VTYPE __smear_##NAME(STYPE); \ +template <> FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = v; \ + return ret; \ +} + +#define SETZERO(VTYPE, NAME) \ +template VTYPE __setzero_##NAME(); \ +template <> FORCEINLINE VTYPE __setzero_##NAME() { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = 0; \ + return ret; \ +} + +#define UNDEF(VTYPE, NAME) \ +template VTYPE __undef_##NAME(); \ +template <> FORCEINLINE VTYPE __undef_##NAME() { \ + return VTYPE(); \ +} + +#define BROADCAST(VTYPE, NAME, STYPE) \ +static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = v[index & 0xf]; \ + return ret; \ +} \ + +#define ROTATE(VTYPE, NAME, STYPE) \ +static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = v[(i+index) & 0xf]; \ + return ret; \ +} \ + +#define SHUFFLES(VTYPE, NAME, STYPE) \ +static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec16_i32 index) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = v[__extract_element(index, i) & 0xf]; \ + return ret; \ +} \ +static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec16_i32 index) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) { \ + int ii = __extract_element(index, i) & 0x1f; \ + ret[i] = (ii < 16) ? v0[ii] : v1[ii-16]; \ + } \ + return ret; \ +} + +#define SHUFFLE2(VTYPE, NAME, STYPE) \ +static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec16_i32 index) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) { \ + int ii = __extract_element(index, i) & 0x1f; \ + ret[i] = (ii < 16) ? 
v0[ii] : v1[ii-16]; \ + } \ + return ret; \ +} + +/////////////////////////////////////////////////////////////////////////// + +INSERT_EXTRACT(__vec1_i8, int8_t) +INSERT_EXTRACT(__vec1_i16, int16_t) +INSERT_EXTRACT(__vec1_i32, int32_t) +INSERT_EXTRACT(__vec1_i64, int64_t) +INSERT_EXTRACT(__vec1_f, float) +INSERT_EXTRACT(__vec1_d, double) + +/////////////////////////////////////////////////////////////////////////// +// mask ops + +static FORCEINLINE uint64_t __movmsk(__vec16_i1 mask) { + return (uint64_t)mask.v; +} + +static FORCEINLINE bool __any(__vec16_i1 mask) { + return (mask.v!=0); +} + +static FORCEINLINE bool __all(__vec16_i1 mask) { + return (mask.v==0xFFFF); +} + +static FORCEINLINE bool __none(__vec16_i1 mask) { + return (mask.v==0); +} + +static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { + __vec16_i1 r; + r.v = (a.v & b.v) | (~a.v & ~b.v); + return r; +} + +static FORCEINLINE __vec16_i1 __and(__vec16_i1 a, __vec16_i1 b) { + __vec16_i1 r; + r.v = a.v & b.v; + return r; +} + +static FORCEINLINE __vec16_i1 __xor(__vec16_i1 a, __vec16_i1 b) { + __vec16_i1 r; + r.v = a.v ^ b.v; + return r; +} + +static FORCEINLINE __vec16_i1 __or(__vec16_i1 a, __vec16_i1 b) { + __vec16_i1 r; + r.v = a.v | b.v; + return r; +} + +static FORCEINLINE __vec16_i1 __not(__vec16_i1 v) { + __vec16_i1 r; + r.v = ~v.v; + return r; +} + +static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) { + __vec16_i1 r; + r.v = ~a.v & b.v; + return r; +} + +static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { + __vec16_i1 r; + r.v = a.v & ~b.v; + return r; +} + +static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a, + __vec16_i1 b) { + __vec16_i1 r; + r.v = (a.v & mask.v) | (b.v & ~mask.v); + return r; +} + +static FORCEINLINE __vec16_i1 __select(bool cond, __vec16_i1 a, __vec16_i1 b) { + return cond ? a : b; +} + +static FORCEINLINE bool __extract_element(__vec16_i1 vec, int index) { + return (vec.v & (1 << index)) ? 
true : false; +} + +static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index, + bool val) { + if (val == false) + vec->v &= ~(1 << index); + else + vec->v |= (1 << index); +} + +template static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) { + uint16_t *ptr = (uint16_t *)p; + __vec16_i1 r; + r.v = *ptr; + return r; +} + +template static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v) { + uint16_t *ptr = (uint16_t *)p; + *ptr = v.v; +} + +template RetVecType __smear_i1(int i); +template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { + return i?0xFFFF:0x0; +} + +template RetVecType __setzero_i1(); +template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { + return 0; +} + +template __vec16_i1 __undef_i1(); +template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { + return __vec16_i1(); +} + + +/////////////////////////////////////////////////////////////////////////// +// int8 + +BINARY_OP(__vec16_i8, __add, +) +BINARY_OP(__vec16_i8, __sub, -) +BINARY_OP(__vec16_i8, __mul, *) + +BINARY_OP(__vec16_i8, __or, |) +BINARY_OP(__vec16_i8, __and, &) +BINARY_OP(__vec16_i8, __xor, ^) +BINARY_OP(__vec16_i8, __shl, <<) + +BINARY_OP_CAST(__vec16_i8, uint8_t, __udiv, /) +BINARY_OP_CAST(__vec16_i8, int8_t, __sdiv, /) + +BINARY_OP_CAST(__vec16_i8, uint8_t, __urem, %) +BINARY_OP_CAST(__vec16_i8, int8_t, __srem, %) +BINARY_OP_CAST(__vec16_i8, uint8_t, __lshr, >>) +BINARY_OP_CAST(__vec16_i8, int8_t, __ashr, >>) + +SHIFT_UNIFORM(__vec16_i8, uint8_t, __lshr, >>) +SHIFT_UNIFORM(__vec16_i8, int8_t, __ashr, >>) +SHIFT_UNIFORM(__vec16_i8, int8_t, __shl, <<) + +CMP_OP(__vec16_i8, i8, int8_t, __equal, ==) +CMP_OP(__vec16_i8, i8, int8_t, __not_equal, !=) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i8, i8, int8_t, __signed_less_equal, <=) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_than, <) +CMP_OP(__vec16_i8, i8, int8_t, __signed_less_than, <) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_than, >) + +SELECT(__vec16_i8) +INSERT_EXTRACT(__vec16_i8, int8_t) +SMEAR(__vec16_i8, i8, int8_t) +SETZERO(__vec16_i8, i8) +UNDEF(__vec16_i8, i8) +BROADCAST(__vec16_i8, i8, int8_t) +ROTATE(__vec16_i8, i8, int8_t) +SHUFFLES(__vec16_i8, i8, int8_t) +LOAD_STORE(__vec16_i8, int8_t) + +/////////////////////////////////////////////////////////////////////////// +// int16 + +BINARY_OP(__vec16_i16, __add, +) +BINARY_OP(__vec16_i16, __sub, -) +BINARY_OP(__vec16_i16, __mul, *) + +BINARY_OP(__vec16_i16, __or, |) +BINARY_OP(__vec16_i16, __and, &) +BINARY_OP(__vec16_i16, __xor, ^) +BINARY_OP(__vec16_i16, __shl, <<) + +BINARY_OP_CAST(__vec16_i16, uint16_t, __udiv, /) +BINARY_OP_CAST(__vec16_i16, int16_t, __sdiv, /) + +BINARY_OP_CAST(__vec16_i16, uint16_t, __urem, %) +BINARY_OP_CAST(__vec16_i16, int16_t, __srem, %) +BINARY_OP_CAST(__vec16_i16, uint16_t, __lshr, >>) +BINARY_OP_CAST(__vec16_i16, int16_t, __ashr, >>) + +SHIFT_UNIFORM(__vec16_i16, uint16_t, __lshr, >>) +SHIFT_UNIFORM(__vec16_i16, int16_t, __ashr, >>) +SHIFT_UNIFORM(__vec16_i16, int16_t, __shl, <<) + +CMP_OP(__vec16_i16, i16, int16_t, __equal, ==) +CMP_OP(__vec16_i16, i16, int16_t, __not_equal, !=) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i16, i16, int16_t, __signed_less_equal, <=) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i16, i16, 
int16_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_than, <) +CMP_OP(__vec16_i16, i16, int16_t, __signed_less_than, <) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_than, >) + +SELECT(__vec16_i16) +INSERT_EXTRACT(__vec16_i16, int16_t) +SMEAR(__vec16_i16, i16, int16_t) +SETZERO(__vec16_i16, i16) +UNDEF(__vec16_i16, i16) +BROADCAST(__vec16_i16, i16, int16_t) +ROTATE(__vec16_i16, i16, int16_t) +SHUFFLES(__vec16_i16, i16, int16_t) +LOAD_STORE(__vec16_i16, int16_t) + +#if 0 /* evghenii::int32 */ +/////////////////////////////////////////////////////////////////////////// +// int32 + +BINARY_OP(__vec16_i32, __add, +) +BINARY_OP(__vec16_i32, __sub, -) +BINARY_OP(__vec16_i32, __mul, *) + +BINARY_OP(__vec16_i32, __or, |) +BINARY_OP(__vec16_i32, __and, &) +BINARY_OP(__vec16_i32, __xor, ^) +BINARY_OP(__vec16_i32, __shl, <<) + +BINARY_OP_CAST(__vec16_i32, uint32_t, __udiv, /) +BINARY_OP_CAST(__vec16_i32, int32_t, __sdiv, /) + +BINARY_OP_CAST(__vec16_i32, uint32_t, __urem, %) +BINARY_OP_CAST(__vec16_i32, int32_t, __srem, %) +BINARY_OP_CAST(__vec16_i32, uint32_t, __lshr, >>) +BINARY_OP_CAST(__vec16_i32, int32_t, __ashr, >>) + +SHIFT_UNIFORM(__vec16_i32, uint32_t, __lshr, >>) +SHIFT_UNIFORM(__vec16_i32, int32_t, __ashr, >>) +SHIFT_UNIFORM(__vec16_i32, int32_t, __shl, <<) + +CMP_OP(__vec16_i32, i32, int32_t, __equal, ==) +CMP_OP(__vec16_i32, i32, int32_t, __not_equal, !=) +CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i32, i32, int32_t, __signed_less_equal, <=) +CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_than, <) +CMP_OP(__vec16_i32, i32, int32_t, __signed_less_than, <) +CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_than, >) + +SELECT(__vec16_i32) +INSERT_EXTRACT(__vec16_i32, int32_t) +SMEAR(__vec16_i32, i32, int32_t) +SETZERO(__vec16_i32, i32) +UNDEF(__vec16_i32, i32) +BROADCAST(__vec16_i32, i32, int32_t) +ROTATE(__vec16_i32, i32, int32_t) +SHUFFLES(__vec16_i32, i32, int32_t) +LOAD_STORE(__vec16_i32, int32_t) + +#else /* evghenii::int32 */ +/////////////////////////////////////////////////////////////////////////// +// int32 +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_i32 __add(__vec16_i32 a, __vec16_i32 b) { + return _mm512_add_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __sub(__vec16_i32 a, __vec16_i32 b) { + return _mm512_sub_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __mul(__vec16_i32 a, __vec16_i32 b) { + return _mm512_mullo_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __udiv(__vec16_i32 a, __vec16_i32 b) { + return _mm512_div_epu32(a, b); +} + +static FORCEINLINE __vec16_i32 __sdiv(__vec16_i32 a, __vec16_i32 b) { + return _mm512_div_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __urem(__vec16_i32 a, __vec16_i32 b) { + return _mm512_rem_epu32(a, b); +} + +static FORCEINLINE __vec16_i32 __srem(__vec16_i32 a, __vec16_i32 b) { + return _mm512_rem_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __or(__vec16_i32 a, __vec16_i32 b) { + return _mm512_or_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __and(__vec16_i32 a, __vec16_i32 b) { + return _mm512_and_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __xor(__vec16_i32 a, __vec16_i32 b) { + return _mm512_xor_epi32(a, b); +} + 
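/* Note (illustrative comment, not from the committed header): the shift
   helpers that follow come in two forms, a per-lane variable shift whose
   count is a __vec16_i32 (mapped to _mm512_sllv/srlv/srav_epi32) and a
   uniform shift whose count is a single int32_t (mapped to
   _mm512_slli/srli/srai_epi32).  A rough usage sketch, assuming v already
   holds 16 lane values:

       __vec16_i32 halved = __ashr(v, 1);                 // same shift in every lane
       __vec16_i32 counts = __smear_i32<__vec16_i32>(3);  // helper defined later in this header
       __vec16_i32 scaled = __shl(v, counts);             // per-lane shift counts
*/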
+static FORCEINLINE __vec16_i32 __shl(__vec16_i32 a, __vec16_i32 b) { + return _mm512_sllv_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, __vec16_i32 b) { + return _mm512_srlv_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, __vec16_i32 b) { + return _mm512_srav_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __shl(__vec16_i32 a, int32_t n) { + return _mm512_slli_epi32(a, n); +} + +static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, int32_t n) { + return _mm512_srli_epi32(a, n); +} + +static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, int32_t n) { + return _mm512_srai_epi32(a, n); +} + +static FORCEINLINE __vec16_i1 __equal_i32(const __vec16_i32 &a, const __vec16_i32 &b) { + return _mm512_cmpeq_epi32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __equal_i32_and_mask(const __vec16_i32 &a, const __vec16_i32 &b, + __vec16_i1 m) { + return _mm512_mask_cmpeq_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __not_equal_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmpneq_epi32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __not_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmpneq_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmple_epu32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmple_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __signed_less_equal_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmple_epi32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __signed_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmple_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmpge_epu32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmpge_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __signed_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmpge_epi32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __signed_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmpge_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __unsigned_less_than_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmplt_epu32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __unsigned_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmplt_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __signed_less_than_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmplt_epi32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __signed_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmplt_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmpgt_epu32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmpgt_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __signed_greater_than_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmpgt_epi32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __signed_greater_than_i32_and_mask(__vec16_i32 a, 
__vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmpgt_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i32 __select(__vec16_i1 mask, + __vec16_i32 a, __vec16_i32 b) { + return _mm512_mask_mov_epi32(b.v, mask, a.v); +} + +static FORCEINLINE __vec16_i32 __select(bool cond, __vec16_i32 a, __vec16_i32 b) { + return cond ? a : b; +} + +static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int index) { //uint32_t index) { + return ((int32_t *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec16_i32 *v, uint32_t index, int32_t val) { + ((int32_t *)v)[index] = val; +} + +template RetVecType __smear_i32(int32_t i); +template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { + return _mm512_set1_epi32(i); +} + +static const __vec16_i32 __ispc_one = __smear_i32<__vec16_i32>(1); +static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32); +static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1); +static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + +template RetVecType __setzero_i32(); +template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { + return _mm512_setzero_epi32(); +} + +template RetVecType __undef_i32(); +template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { + return __vec16_i32(); +} + +static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) { + int32_t val = __extract_element(v, index & 0xf); + return _mm512_set1_epi32(val); +} + +#if 0 /* evghenii::doesn't work */ +static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index) { + __vec16_i32 idx = __smear_i32<__vec16_i32>(index); + __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0x7)); + return _mm512_mask_permutevar_epi32(v, 0xffff, shuffle, v); +} +#else +ROTATE(__vec16_i32, i32, int32_t) +#endif + +static FORCEINLINE __vec16_i32 __shuffle_i32(__vec16_i32 v, __vec16_i32 index) { + return _mm512_mask_permutevar_epi32(v, 0xffff, index, v); +} +SHUFFLE2(__vec16_i32, i32, int32_t) /* evghenii::to implement */ + +template static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return _mm512_load_epi32(p); +#else + __vec16_i32 v; + v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_extloadunpackhi_epi32(v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + return v; +#endif +} + + +template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 v) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_store_epi32(p, v); +#else + _mm512_extpackstorelo_epi32(p, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +#endif +} + +#if 0 +template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) { + return _mm512_load_epi32(p); +} +template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) { + _mm512_store_epi32(p, v); +} +#endif +#endif /* evghenii::int32 */ + +/////////////////////////////////////////////////////////////////////////// +// int64 + +BINARY_OP(__vec16_i64, __add, +) +BINARY_OP(__vec16_i64, __sub, -) +BINARY_OP(__vec16_i64, __mul, *) + +BINARY_OP(__vec16_i64, __or, |) +BINARY_OP(__vec16_i64, __and, &) +BINARY_OP(__vec16_i64, __xor, ^) +BINARY_OP(__vec16_i64, __shl, <<) + +BINARY_OP_CAST(__vec16_i64, uint64_t, __udiv, /) +BINARY_OP_CAST(__vec16_i64, int64_t, __sdiv, /) + 
+BINARY_OP_CAST(__vec16_i64, uint64_t, __urem, %) +BINARY_OP_CAST(__vec16_i64, int64_t, __srem, %) +BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>) +BINARY_OP_CAST(__vec16_i64, int64_t, __ashr, >>) + +SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>) +SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>) +SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<) + +CMP_OP(__vec16_i64, i64, int64_t, __equal, ==) +CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i64, i64, int64_t, __signed_less_equal, <=) +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_than, <) +CMP_OP(__vec16_i64, i64, int64_t, __signed_less_than, <) +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >) + +SELECT(__vec16_i64) +INSERT_EXTRACT(__vec16_i64, int64_t) +SMEAR(__vec16_i64, i64, int64_t) +SETZERO(__vec16_i64, i64) +UNDEF(__vec16_i64, i64) +BROADCAST(__vec16_i64, i64, int64_t) +ROTATE(__vec16_i64, i64, int64_t) +SHUFFLES(__vec16_i64, i64, int64_t) +LOAD_STORE(__vec16_i64, int64_t) + + +#if 0 /* evghenii::float */ +/////////////////////////////////////////////////////////////////////////// +// float + +BINARY_OP(__vec16_f, __add, +) +BINARY_OP(__vec16_f, __sub, -) +BINARY_OP(__vec16_f, __mul, *) +BINARY_OP(__vec16_f, __div, /) + +CMP_OP(__vec16_f, float, float, __equal, ==) +CMP_OP(__vec16_f, float, float, __not_equal, !=) +CMP_OP(__vec16_f, float, float, __less_than, <) +CMP_OP(__vec16_f, float, float, __less_equal, <=) +CMP_OP(__vec16_f, float, float, __greater_than, >) +CMP_OP(__vec16_f, float, float, __greater_equal, >=) + +static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) { + __vec16_i1 ret; + ret.v = 0; + for (int i = 0; i < 16; ++i) + ret.v |= ((a[i] == a[i]) && (b[i] == b[i])) ? (1 << i) : 0; + return ret; +} + +static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) { + __vec16_i1 ret; + ret.v = 0; + for (int i = 0; i < 16; ++i) + ret.v |= ((a[i] != a[i]) || (b[i] != b[i])) ? 
(1 << i) : 0; + return ret; +} + +#if 0 + case Instruction::FRem: intrinsic = "__frem"; break; +#endif + +SELECT(__vec16_f) +INSERT_EXTRACT(__vec16_f, float) +SMEAR(__vec16_f, float, float) +SETZERO(__vec16_f, float) +UNDEF(__vec16_f, float) +BROADCAST(__vec16_f, float, float) +ROTATE(__vec16_f, float, float) +SHUFFLES(__vec16_f, float, float) +LOAD_STORE(__vec16_f, float) +#else /* evghenii::float */ + +/////////////////////////////////////////////////////////////////////////// +// float +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_f __add(__vec16_f a, __vec16_f b) { + return _mm512_add_ps(a, b); +} + +static FORCEINLINE __vec16_f __sub(__vec16_f a, __vec16_f b) { + return _mm512_sub_ps(a, b); +} + +#if 1 /* evghenii::this two fails assert-3.ispc test */ +static FORCEINLINE __vec16_f __mul(__vec16_f a, __vec16_f b) { + return _mm512_mul_ps(a, b); +} + +static FORCEINLINE __vec16_f __div(__vec16_f a, __vec16_f b) { + return _mm512_div_ps(a, b); +} +#else +BINARY_OP(__vec16_f, __mul, *) +BINARY_OP(__vec16_f, __div, /) +#endif + + +static FORCEINLINE __vec16_i1 __equal_float(__vec16_f a, __vec16_f b) { + return _mm512_cmpeq_ps_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __equal_float_and_mask(__vec16_f a, __vec16_f b, + __vec16_i1 m) { + return _mm512_mask_cmpeq_ps_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __not_equal_float(__vec16_f a, __vec16_f b) { + return _mm512_cmpneq_ps_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __not_equal_float_and_mask(__vec16_f a, __vec16_f b, + __vec16_i1 m) { + return _mm512_mask_cmpneq_ps_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __less_than_float(__vec16_f a, __vec16_f b) { + return _mm512_cmplt_ps_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __less_than_float_and_mask(__vec16_f a, __vec16_f b, + __vec16_i1 m) { + return _mm512_mask_cmplt_ps_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __less_equal_float(__vec16_f a, __vec16_f b) { + return _mm512_cmple_ps_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __less_equal_float_and_mask(__vec16_f a, __vec16_f b, + __vec16_i1 m) { + return _mm512_mask_cmple_ps_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __greater_than_float(__vec16_f a, __vec16_f b) { +// return _mm512_cmpnle_ps_mask(a, b); + return _mm512_cmp_ps_mask(a, b,_CMP_GT_OS); +} + +static FORCEINLINE __vec16_i1 __greater_than_float_and_mask(__vec16_f a, __vec16_f b, + __vec16_i1 m) { +// return _mm512_mask_cmpnle_ps_mask(m, a, b); + return _mm512_mask_cmp_ps_mask(m,a, b,_CMP_GT_OS); +} + +static FORCEINLINE __vec16_i1 __greater_equal_float(__vec16_f a, __vec16_f b) { +// return _mm512_cmpnlt_ps_mask(a, b); + return _mm512_cmp_ps_mask(a, b,_CMP_GE_OS); +} + +static FORCEINLINE __vec16_i1 __greater_equal_float_and_mask(__vec16_f a, __vec16_f b, + __vec16_i1 m) { +// return _mm512_mask_cmpnlt_ps_mask(m, a, b); + return _mm512_mask_cmp_ps_mask(m,a, b,_CMP_GE_OS); +} + +static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) { + return _mm512_cmpord_ps_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) { + return _mm512_cmpunord_ps_mask(a, b); +} + +static FORCEINLINE __vec16_f __select(__vec16_i1 mask, __vec16_f a, __vec16_f b) { + return _mm512_mask_mov_ps(b, mask, a); +} + +static FORCEINLINE __vec16_f __select(bool cond, __vec16_f a, __vec16_f b) { + return cond ? 
a : b; +} + +static FORCEINLINE float __extract_element(__vec16_f v, uint32_t index) { + return v[index]; + // return ((float *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec16_f *v, uint32_t index, float val) { + (*v)[index] = val; +// ((float *)v)[index] = val; +} + +template RetVecType __smear_float(float f); +template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { + return _mm512_set_1to16_ps(f); +} + +template RetVecType __setzero_float(); +template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { + return _mm512_setzero_ps(); +} + +template RetVecType __undef_float(); +template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() { + return __vec16_f(); +} + +static FORCEINLINE __vec16_f __broadcast_float(__vec16_f v, int index) { + float val = __extract_element(v, index & 0xf); + return _mm512_set1_ps(val); +} + +#if 1 +static FORCEINLINE __vec16_f __shuffle_float(__vec16_f v, __vec16_i32 index) { + return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v))); +} +#endif +ROTATE(__vec16_f, float, float) +SHUFFLE2(__vec16_f, float, float) + +template static FORCEINLINE __vec16_f __load(const __vec16_f *p) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return _mm512_load_ps(p); +#else + __vec16_f v; + v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + v = _mm512_extloadunpackhi_ps(v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + return v; +#endif +} + +template static FORCEINLINE void __store(__vec16_f *p, __vec16_f v) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_store_ps(p, v); +#else + _mm512_extpackstorelo_ps( p, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_ps((uint8_t*)p+64, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); +#endif +} + +#if 0 +template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) { + _mm512_store_ps(p, v); +} +template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) { + return _mm512_load_ps(p); +} +#endif + +#endif /* evghenii::float */ + +static FORCEINLINE float __exp_uniform_float(float v) { return expf(v);} +static FORCEINLINE __vec16_f __exp_varying_float(__vec16_f v) { return _mm512_exp_ps(v); } + +static FORCEINLINE float __log_uniform_float(float v) { return logf(v);} +static FORCEINLINE __vec16_f __log_varying_float(__vec16_f v) { return _mm512_log_ps(v); } + +static FORCEINLINE float __pow_uniform_float(float a, float b) { return powf(a, b);} +static FORCEINLINE __vec16_f __pow_varying_float(__vec16_f a, __vec16_f b) { return _mm512_pow_ps(a,b); } + +static FORCEINLINE int __intbits(float v) { + union { + float f; + int i; + } u; + u.f = v; + return u.i; +} + +static FORCEINLINE float __floatbits(int v) { + union { + float f; + int i; + } u; + u.i = v; + return u.f; +} + +static FORCEINLINE float __half_to_float_uniform(int16_t h) { + static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift + + int32_t o = ((int32_t)(h & 0x7fff)) << 13; // exponent/mantissa bits + uint32_t exp = shifted_exp & o; // just the exponent + o += (127 - 15) << 23; // exponent adjust + + // handle exponent special cases + if (exp == shifted_exp) // Inf/NaN? + o += (128 - 16) << 23; // extra exp adjust + else if (exp == 0) { // Zero/Denormal? 
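+        // denormal halves have no implicit leading 1; bumping the exponent by one and
+        // subtracting 2^-14 (whose float bit pattern is 113 << 23) renormalizes the value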
+ o += 1 << 23; // extra exp adjust + o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize + } + + o |= ((int32_t)(h & 0x8000)) << 16; // sign bit + return __floatbits(o); +} + + +static FORCEINLINE __vec16_f __half_to_float_varying(__vec16_i16 v) { + __vec16_f ret; + for (int i = 0; i < 16; ++i) + ret[i] = __half_to_float_uniform(v[i]); + return ret; +} + + +static FORCEINLINE int16_t __float_to_half_uniform(float f) { + uint32_t sign_mask = 0x80000000u; + int32_t o; + + int32_t fint = __intbits(f); + int32_t sign = fint & sign_mask; + fint ^= sign; + + int32_t f32infty = 255 << 23; + o = (fint > f32infty) ? 0x7e00 : 0x7c00; + + // (De)normalized number or zero + // update fint unconditionally to save the blending; we don't need it + // anymore for the Inf/NaN case anyway. + const uint32_t round_mask = ~0xfffu; + const int32_t magic = 15 << 23; + const int32_t f16infty = 31 << 23; + + int32_t fint2 = __intbits(__floatbits(fint & round_mask) * __floatbits(magic)) - round_mask; + fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed + + if (fint < f32infty) + o = fint2 >> 13; // Take the bits! + + return (o | (sign >> 16)); +} + + +static FORCEINLINE __vec16_i16 __float_to_half_varying(__vec16_f v) { + __vec16_i16 ret; + for (int i = 0; i < 16; ++i) + ret[i] = __float_to_half_uniform(v[i]); + return ret; +} + + +#if 0 /* evghenii::double */ +/////////////////////////////////////////////////////////////////////////// +// double + +BINARY_OP(__vec16_d, __add, +) +BINARY_OP(__vec16_d, __sub, -) +BINARY_OP(__vec16_d, __mul, *) +BINARY_OP(__vec16_d, __div, /) + +CMP_OP(__vec16_d, double, double, __equal, ==) +CMP_OP(__vec16_d, double, double, __not_equal, !=) +CMP_OP(__vec16_d, double, double, __less_than, <) +CMP_OP(__vec16_d, double, double, __less_equal, <=) +CMP_OP(__vec16_d, double, double, __greater_than, >) +CMP_OP(__vec16_d, double, double, __greater_equal, >=) + +static FORCEINLINE __vec16_i1 __ordered_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret; + ret.v = 0; + for (int i = 0; i < 16; ++i) + ret.v |= ((a[i] == a[i]) && (b[i] == b[i])) ? (1 << i) : 0; + return ret; +} + +static FORCEINLINE __vec16_i1 __unordered_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret; + ret.v = 0; + for (int i = 0; i < 16; ++i) + ret.v |= ((a[i] != a[i]) || (b[i] != b[i])) ? 
(1 << i) : 0; + return ret; +} + +#if 0 + case Instruction::FRem: intrinsic = "__frem"; break; +#endif + +SELECT(__vec16_d) +INSERT_EXTRACT(__vec16_d, double) +SMEAR(__vec16_d, double, double) +SETZERO(__vec16_d, double) +UNDEF(__vec16_d, double) +BROADCAST(__vec16_d, double, double) +ROTATE(__vec16_d, double, double) +SHUFFLES(__vec16_d, double, double) +LOAD_STORE(__vec16_d, double) +#else /* evghenii::double */ +/////////////////////////////////////////////////////////////////////////// +// double +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_d __add(__vec16_d a, __vec16_d b) { + __vec16_d ret; + ret.v1 = _mm512_add_pd(a.v1, b.v1); + ret.v2 = _mm512_add_pd(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec16_d __sub(__vec16_d a, __vec16_d b) { + __vec16_d ret; + ret.v1 = _mm512_sub_pd(a.v1, b.v1); + ret.v2 = _mm512_sub_pd(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec16_d __mul(__vec16_d a, __vec16_d b) { + __vec16_d ret; + ret.v1 = _mm512_mul_pd(a.v1, b.v1); + ret.v2 = _mm512_mul_pd(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec16_d __div(__vec16_d a, __vec16_d b) { + __vec16_d ret; + ret.v1 = _mm512_div_pd(a.v1, b.v1); + ret.v2 = _mm512_div_pd(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec16_i1 __equal_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpeq_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpeq_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __equal_double_and_mask(__vec16_d a, __vec16_d b, + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_mask_cmpeq_pd_mask(m, a.v1, b.v1); + __vec16_i1 tmp_m = m; + ret2 = _mm512_mask_cmpeq_pd_mask(_mm512_kswapb(tmp_m,tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __not_equal_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpneq_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpneq_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __not_equal_double_and_mask(__vec16_d a, __vec16_d b, + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + __vec16_i1 tmp_m = m; + ret1 = _mm512_mask_cmpneq_pd_mask(m, a.v1, b.v1); + ret2 = _mm512_mask_cmpneq_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __less_than_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmplt_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmplt_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __less_than_double_and_mask(__vec16_d a, __vec16_d b, + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + __vec16_i1 tmp_m = m; + ret1 = _mm512_mask_cmplt_pd_mask(m, a.v1, b.v1); + ret2 = _mm512_mask_cmplt_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __less_equal_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmple_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmple_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __less_equal_double_and_mask(__vec16_d a, __vec16_d b, + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + __vec16_i1 tmp_m = m; + ret1 = _mm512_mask_cmple_pd_mask(m, a.v1, b.v1); + ret2 = _mm512_mask_cmple_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, 
b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __greater_than_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpnle_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpnle_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __greater_than_double_and_mask(__vec16_d a, __vec16_d b, + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + __vec16_i1 tmp_m = m; + ret1 = _mm512_mask_cmpnle_pd_mask(m, a.v1, b.v1); + ret2 = _mm512_mask_cmpnle_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __greater_equal_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpnlt_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpnlt_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __greater_equal_double_and_mask(__vec16_d a, __vec16_d b, + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + __vec16_i1 tmp_m = m; + ret1 = _mm512_mask_cmpnlt_pd_mask(m, a.v1, b.v1); + ret2 = _mm512_mask_cmpnlt_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __ordered_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpord_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpord_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __unordered_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpunord_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpunord_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_d __select(__vec16_i1 mask, __vec16_d a, __vec16_d b) { + __vec16_d ret; + __vec16_i1 tmp_m = mask; + ret.v1 = _mm512_mask_mov_pd(b.v1, mask, a.v1); + ret.v2 = _mm512_mask_mov_pd(b.v2, _mm512_kswapb(tmp_m, tmp_m), a.v2); + return ret; +} + + +static FORCEINLINE __vec16_d __select(bool cond, __vec16_d a, __vec16_d b) { + return cond ? 
a : b; +} + +static FORCEINLINE double __extract_element(__vec16_d v, uint32_t index) { + return ((double *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec16_d *v, uint32_t index, double val) { + ((double *)v)[index] = val; +} + +template RetVecType __smear_double(double d); +template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { + __vec16_d ret; + ret.v1 = _mm512_set1_pd(d); + ret.v2 = _mm512_set1_pd(d); + return ret; +} + +template RetVecType __setzero_double(); +template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { + __vec16_d ret; + ret.v1 = _mm512_setzero_pd(); + ret.v2 = _mm512_setzero_pd(); + return ret; +} + +template RetVecType __undef_double(); +template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() { + return __vec16_d(); +} + +static FORCEINLINE __vec16_d __broadcast_double(__vec16_d v, int index) { + __vec16_d ret; + double val = __extract_element(v, index & 0xf); + ret.v1 = _mm512_set1_pd(val); + ret.v2 = _mm512_set1_pd(val); + return ret; +} + +ROTATE(__vec16_d, double, double) +SHUFFLES(__vec16_d, double, double) + +template static FORCEINLINE __vec16_d __load(const __vec16_d *p) { + __vec16_d ret; + ret.v1 = _mm512_extloadunpacklo_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v1 = _mm512_extloadunpackhi_pd(ret.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v2 = _mm512_extloadunpacklo_pd(ret.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v2 = _mm512_extloadunpackhi_pd(ret.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + return ret; +} + +template static FORCEINLINE void __store(__vec16_d *p, __vec16_d v) { + _mm512_extpackstorelo_pd(p, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_pd((uint8_t*)p+64, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+128, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); +} + + +#if 0 +template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) { + __vec16_d ret; + ret.v1 = _mm512_load_pd(p); + ret.v2 = _mm512_load_pd(((uint8_t*)p)+64); + return ret; +} +template <> static FORCEINLINE __vec16_d __load<128>(const __vec16_d *p) { + return __load<64>(p); +} +template <> static FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) { + _mm512_store_pd(p, v.v1); + _mm512_store_pd(((uint8_t*)p)+64, v.v2); +} +template <> static FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { + __store<64>(p, v); +} +#endif +#endif /* evghenii::double */ + +/////////////////////////////////////////////////////////////////////////// +// casts + + +#define CAST(TO, STO, FROM, SFROM, FUNC) \ +static FORCEINLINE TO FUNC(TO, FROM val) { \ + TO ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = (STO)((SFROM)(val[i])); \ + return ret; \ +} + +// sign extension conversions +CAST(__vec16_i64, int64_t, __vec16_i32, int32_t, __cast_sext) +CAST(__vec16_i64, int64_t, __vec16_i16, int16_t, __cast_sext) +CAST(__vec16_i64, int64_t, __vec16_i8, int8_t, __cast_sext) +CAST(__vec16_i32, int32_t, __vec16_i16, int16_t, __cast_sext) +CAST(__vec16_i32, int32_t, __vec16_i8, int8_t, __cast_sext) +CAST(__vec16_i16, int16_t, __vec16_i8, int8_t, __cast_sext) + +#define CAST_SEXT_I1(TYPE) \ +static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) { \ + ret[i] = 0; \ + if (v.v & (1 << i)) \ + ret[i] = ~ret[i]; \ + } \ + return ret; \ +} + +CAST_SEXT_I1(__vec16_i8) 
+CAST_SEXT_I1(__vec16_i16) +#if 0 +CAST_SEXT_I1(__vec16_i32) +#else +static FORCEINLINE __vec16_i32 __cast_sext(const __vec16_i32 &, const __vec16_i1 &val) +{ + __vec16_i32 ret = _mm512_setzero_epi32(); + __vec16_i32 one = _mm512_set1_epi32(-1); + return _mm512_mask_mov_epi32(ret, val, one); +} +#endif +CAST_SEXT_I1(__vec16_i64) + +// zero extension +CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext) +CAST(__vec16_i64, uint64_t, __vec16_i16, uint16_t, __cast_zext) +CAST(__vec16_i64, uint64_t, __vec16_i8, uint8_t, __cast_zext) +CAST(__vec16_i32, uint32_t, __vec16_i16, uint16_t, __cast_zext) +CAST(__vec16_i32, uint32_t, __vec16_i8, uint8_t, __cast_zext) +CAST(__vec16_i16, uint16_t, __vec16_i8, uint8_t, __cast_zext) + +#define CAST_ZEXT_I1(TYPE) \ +static FORCEINLINE TYPE __cast_zext(TYPE, __vec16_i1 v) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = (v.v & (1 << i)) ? 1 : 0; \ + return ret; \ +} + +CAST_ZEXT_I1(__vec16_i8) +CAST_ZEXT_I1(__vec16_i16) +#if 0 +CAST_ZEXT_I1(__vec16_i32) +#else +static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1 &val) +{ + __vec16_i32 ret = _mm512_setzero_epi32(); + __vec16_i32 one = _mm512_set1_epi32(1); + return _mm512_mask_mov_epi32(ret, val, one); +} +#endif +CAST_ZEXT_I1(__vec16_i64) + +// truncations +CAST(__vec16_i32, int32_t, __vec16_i64, int64_t, __cast_trunc) +CAST(__vec16_i16, int16_t, __vec16_i64, int64_t, __cast_trunc) +CAST(__vec16_i8, int8_t, __vec16_i64, int64_t, __cast_trunc) +CAST(__vec16_i16, int16_t, __vec16_i32, int32_t, __cast_trunc) +CAST(__vec16_i8, int8_t, __vec16_i32, int32_t, __cast_trunc) +CAST(__vec16_i8, int8_t, __vec16_i16, int16_t, __cast_trunc) + +// signed int to float/double +#if 0 +CAST(__vec16_f, float, __vec16_i8, int8_t, __cast_sitofp) +CAST(__vec16_f, float, __vec16_i16, int16_t, __cast_sitofp) +CAST(__vec16_f, float, __vec16_i32, int32_t, __cast_sitofp) +#else +static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i8 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i16 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i32 val) {return _mm512_cvtfxpnt_round_adjustepi32_ps(val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} +#endif +CAST(__vec16_f, float, __vec16_i64, int64_t, __cast_sitofp) +#if 0 +CAST(__vec16_d, double, __vec16_i8, int8_t, __cast_sitofp) +CAST(__vec16_d, double, __vec16_i16, int16_t, __cast_sitofp) +CAST(__vec16_d, double, __vec16_i32, int32_t, __cast_sitofp) +#else +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i8 val) { + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; +} + +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i16 val) { + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; +} + +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i32 val) { + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(val); + __vec16_i32 other8 = 
_mm512_permute4f128_epi32(val, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; +} +#endif +CAST(__vec16_d, double, __vec16_i64, int64_t, __cast_sitofp) + +// unsigned int to float/double +#if 0 +CAST(__vec16_f, float, __vec16_i8, uint8_t, __cast_uitofp) +CAST(__vec16_f, float, __vec16_i16, uint16_t, __cast_uitofp) +CAST(__vec16_f, float, __vec16_i32, uint32_t, __cast_uitofp) +#else +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i8 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i16 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i32 val) {return _mm512_cvtfxpnt_round_adjustepu32_ps(val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} +#endif +CAST(__vec16_f, float, __vec16_i64, uint64_t, __cast_uitofp) +#if 0 +CAST(__vec16_d, double, __vec16_i8, uint8_t, __cast_uitofp) +CAST(__vec16_d, double, __vec16_i16, uint16_t, __cast_uitofp) +CAST(__vec16_d, double, __vec16_i32, uint32_t, __cast_uitofp) +#else +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i8 val) { + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; +} + +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i16 val) { + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; +} + +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i32 val) { + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(val); + __vec16_i32 other8 = _mm512_permute4f128_epi32(val, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; +} +#endif +CAST(__vec16_d, double, __vec16_i64, uint64_t, __cast_uitofp) + +#if 0 +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i1 v) { + __vec16_f ret; + for (int i = 0; i < 16; ++i) + ret[i] = (v.v & (1 << i)) ? 1. 
: 0.; + return ret; +} +#else +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i1 v) +{ + const __m512 ret = _mm512_setzero_ps(); + const __m512 one = _mm512_set1_ps(1.0); + return _mm512_mask_mov_ps(ret, v, one); +} +#endif + +// float/double to signed int +CAST(__vec16_i8, int8_t, __vec16_f, float, __cast_fptosi) +CAST(__vec16_i16, int16_t, __vec16_f, float, __cast_fptosi) +#if 0 +CAST(__vec16_i32, int32_t, __vec16_f, float, __cast_fptosi) +#else +static FORCEINLINE __vec16_i32 __cast_fptosi(__vec16_i32, __vec16_f val) { + return _mm512_cvtfxpnt_round_adjustps_epi32(val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); +} +#endif +CAST(__vec16_i64, int64_t, __vec16_f, float, __cast_fptosi) +CAST(__vec16_i8, int8_t, __vec16_d, double, __cast_fptosi) +CAST(__vec16_i16, int16_t, __vec16_d, double, __cast_fptosi) +#if 1 +CAST(__vec16_i32, int32_t, __vec16_d, double, __cast_fptosi) +#else +#endif +CAST(__vec16_i64, int64_t, __vec16_d, double, __cast_fptosi) + +// float/double to unsigned int +CAST(__vec16_i8, uint8_t, __vec16_f, float, __cast_fptoui) +CAST(__vec16_i16, uint16_t, __vec16_f, float, __cast_fptoui) +#if 0 +CAST(__vec16_i32, uint32_t, __vec16_f, float, __cast_fptoui) +#else +static FORCEINLINE __vec16_i32 __cast_fptoui(__vec16_i32, __vec16_f val) { + return _mm512_cvtfxpnt_round_adjustps_epu32(val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); +} +#endif +CAST(__vec16_i64, uint64_t, __vec16_f, float, __cast_fptoui) +CAST(__vec16_i8, uint8_t, __vec16_d, double, __cast_fptoui) +CAST(__vec16_i16, uint16_t, __vec16_d, double, __cast_fptoui) +#if 1 +CAST(__vec16_i32, uint32_t, __vec16_d, double, __cast_fptoui) +#else +#endif +CAST(__vec16_i64, uint64_t, __vec16_d, double, __cast_fptoui) + +// float/double conversions +#if 1 +CAST(__vec16_f, float, __vec16_d, double, __cast_fptrunc) +CAST(__vec16_d, double, __vec16_f, float, __cast_fpext) +#else +static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) { + __m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1)); + __m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2)); + + return _mm512_mask_permute4f128_epi32(r0i, 0xFF00, r1i, _MM_PERM_BABA); +} +static FORCEINLINE __vec16_d __cast_fpext(__vec16_d, __vec16_f val) { + __vec16_d ret; + ret.v2 = _mm512_cvtpslo_pd(val.v); + __vec16_f other8 = _mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC); + ret.v1 = _mm512_cvtpslo_pd(other8); + return ret; +} +#endif + +typedef union { + int32_t i32; + float f; + int64_t i64; + double d; +} BitcastUnion; + +#define CAST_BITS(TO, TO_ELT, FROM, FROM_ELT) \ +static FORCEINLINE TO __cast_bits(TO, FROM val) { \ + TO r; \ + for (int i = 0; i < 16; ++i) { \ + BitcastUnion u; \ + u.FROM_ELT = val[i]; \ + r[i] = u.TO_ELT; \ + } \ + return r; \ +} + +#if 0 +CAST_BITS(__vec16_f, f, __vec16_i32, i32) +CAST_BITS(__vec16_i32, i32, __vec16_f, f) +#else +static FORCEINLINE __vec16_f __cast_bits(__vec16_f, __vec16_i32 val) { + return _mm512_castsi512_ps(val); +} +static FORCEINLINE __vec16_i32 __cast_bits(__vec16_i32, __vec16_f val) { + return _mm512_castps_si512(val); +} +#endif + +#if 0 +CAST_BITS(__vec16_d, d, __vec16_i64, i64) +CAST_BITS(__vec16_i64, i64, __vec16_d, d) +#else +static FORCEINLINE __vec16_i64 __cast_bits(__vec16_i64, __vec16_d val) { + return *(__vec16_i64*)&val; +} +static FORCEINLINE __vec16_d __cast_bits(__vec16_d, __vec16_i64 val) { + return *(__vec16_d*)&val; +} +#endif + +#define CAST_BITS_SCALAR(TO, FROM) \ +static FORCEINLINE TO __cast_bits(TO, FROM v) { \ + union { \ + TO to; \ + FROM 
from;                             \
+    } u;                                    \
+    u.from = v;                             \
+    return u.to;                            \
+}
+
+CAST_BITS_SCALAR(uint32_t, float)
+CAST_BITS_SCALAR(int32_t, float)
+CAST_BITS_SCALAR(float, uint32_t)
+CAST_BITS_SCALAR(float, int32_t)
+CAST_BITS_SCALAR(uint64_t, double)
+CAST_BITS_SCALAR(int64_t, double)
+CAST_BITS_SCALAR(double, uint64_t)
+CAST_BITS_SCALAR(double, int64_t)
+
+///////////////////////////////////////////////////////////////////////////
+// various math functions
+
+static FORCEINLINE void __fastmath() {
+}
+
+static FORCEINLINE float __round_uniform_float(float v) {
+    return roundf(v);
+}
+
+static FORCEINLINE float __floor_uniform_float(float v) {
+    return floorf(v);
+}
+
+static FORCEINLINE float __ceil_uniform_float(float v) {
+    return ceilf(v);
+}
+
+static FORCEINLINE double __round_uniform_double(double v) {
+    return round(v);
+}
+
+static FORCEINLINE double __floor_uniform_double(double v) {
+    return floor(v);
+}
+
+static FORCEINLINE double __ceil_uniform_double(double v) {
+    return ceil(v);
+}
+
+#if 0
+UNARY_OP(__vec16_f, __round_varying_float, roundf)
+UNARY_OP(__vec16_f, __floor_varying_float, floorf)
+UNARY_OP(__vec16_f, __ceil_varying_float, ceilf)
+#else
+static FORCEINLINE __vec16_f __round_varying_float(__vec16_f v) {
+    return _mm512_round_ps(v, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);
+}
+
+static FORCEINLINE __vec16_f __floor_varying_float(__vec16_f v) {
+    return _mm512_floor_ps(v);
+}
+
+static FORCEINLINE __vec16_f __ceil_varying_float(__vec16_f v) {
+    return _mm512_ceil_ps(v);
+}
+#endif
+
+#if 0
+UNARY_OP(__vec16_d, __round_varying_double, round)
+UNARY_OP(__vec16_d, __floor_varying_double, floor)
+UNARY_OP(__vec16_d, __ceil_varying_double, ceil)
+#else
+static FORCEINLINE __vec16_d __round_varying_double(__vec16_d v) {
+    __vec16_d ret;
+    ret.v1 = _mm512_svml_round_pd(v.v1);
+    ret.v2 = _mm512_svml_round_pd(v.v2);
+    return ret;
+}
+
+static FORCEINLINE __vec16_d __floor_varying_double(__vec16_d v) {
+    __vec16_d ret;
+    ret.v1 = _mm512_floor_pd(v.v1);
+    ret.v2 = _mm512_floor_pd(v.v2);
+    return ret;
+}
+
+static FORCEINLINE __vec16_d __ceil_varying_double(__vec16_d v) {
+    __vec16_d ret;
+    ret.v1 = _mm512_ceil_pd(v.v1);
+    ret.v2 = _mm512_ceil_pd(v.v2);
+    return ret;
+}
+#endif
+
+
+// min/max
+
+static FORCEINLINE float __min_uniform_float(float a, float b) { return (a<b) ? a : b; }
+static FORCEINLINE float __max_uniform_float(float a, float b) { return (a>b) ? a : b; }
+static FORCEINLINE double __min_uniform_double(double a, double b) { return (a<b) ? a : b; }
+static FORCEINLINE double __max_uniform_double(double a, double b) { return (a>b) ? a : b; }
+
+static FORCEINLINE int32_t __min_uniform_int32(int32_t a, int32_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int32_t __max_uniform_int32(int32_t a, int32_t b) { return (a>b) ? a : b; }
+static FORCEINLINE int32_t __min_uniform_uint32(uint32_t a, uint32_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int32_t __max_uniform_uint32(uint32_t a, uint32_t b) { return (a>b) ? a : b; }
+
+static FORCEINLINE int64_t __min_uniform_int64(int64_t a, int64_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int64_t __max_uniform_int64(int64_t a, int64_t b) { return (a>b) ? a : b; }
+static FORCEINLINE int64_t __min_uniform_uint64(uint64_t a, uint64_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int64_t __max_uniform_uint64(uint64_t a, uint64_t b) { return (a>b) ?
a : b; } + + +#if 0 +BINARY_OP_FUNC(__vec16_f, __max_varying_float, __max_uniform_float) +BINARY_OP_FUNC(__vec16_f, __min_varying_float, __min_uniform_float) +BINARY_OP_FUNC(__vec16_d, __max_varying_double, __max_uniform_double) +BINARY_OP_FUNC(__vec16_d, __min_varying_double, __min_uniform_double) +#else +static FORCEINLINE __vec16_f __max_varying_float (__vec16_f v1, __vec16_f v2) { return _mm512_gmax_ps(v1, v2);} +static FORCEINLINE __vec16_f __min_varying_float (__vec16_f v1, __vec16_f v2) { return _mm512_gmin_ps(v1, v2);} +static FORCEINLINE __vec16_d __max_varying_double(__vec16_d v1, __vec16_d v2) { return __vec16_d(_mm512_gmax_pd(v1.v1, v2.v1),_mm512_gmax_pd(v1.v2,v2.v2));} +static FORCEINLINE __vec16_d __min_varying_double(__vec16_d v1, __vec16_d v2) { return __vec16_d(_mm512_gmin_pd(v1.v1, v2.v1),_mm512_gmin_pd(v1.v2,v2.v2));} +#endif + +#if 0 +BINARY_OP_FUNC(__vec16_i32, __max_varying_int32, __max_uniform_int32) +BINARY_OP_FUNC(__vec16_i32, __min_varying_int32, __min_uniform_int32) +BINARY_OP_FUNC(__vec16_i32, __max_varying_uint32, __max_uniform_uint32) +BINARY_OP_FUNC(__vec16_i32, __min_varying_uint32, __min_uniform_uint32) +#else +static FORCEINLINE __vec16_i32 __max_varying_int32 (__vec16_i32 v1, __vec16_i32 v2) { return _mm512_max_epi32(v1, v2);} +static FORCEINLINE __vec16_i32 __min_varying_int32 (__vec16_i32 v1, __vec16_i32 v2) { return _mm512_min_epi32(v1, v2);} +static FORCEINLINE __vec16_i32 __max_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) { return _mm512_max_epu32(v1, v2);} +static FORCEINLINE __vec16_i32 __min_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) { return _mm512_min_epu32(v1, v2);} +#endif + +BINARY_OP_FUNC(__vec16_i64, __max_varying_int64, __max_uniform_int64) +BINARY_OP_FUNC(__vec16_i64, __min_varying_int64, __min_uniform_int64) +BINARY_OP_FUNC(__vec16_i64, __max_varying_uint64, __max_uniform_uint64) +BINARY_OP_FUNC(__vec16_i64, __min_varying_uint64, __min_uniform_uint64) + +// sqrt/rsqrt/rcp + +static FORCEINLINE float __rsqrt_uniform_float(float v) { + return 1.f / sqrtf(v); +} + +static FORCEINLINE float __rcp_uniform_float(float v) { + return 1.f / v; +} + +static FORCEINLINE float __sqrt_uniform_float(float v) { + return sqrtf(v); +} + +static FORCEINLINE double __sqrt_uniform_double(double v) { + return sqrt(v); +} + +#if 0 +UNARY_OP(__vec16_f, __rcp_varying_float, __rcp_uniform_float) +UNARY_OP(__vec16_f, __rsqrt_varying_float, __rsqrt_uniform_float) +UNARY_OP(__vec16_f, __sqrt_varying_float, __sqrt_uniform_float) +UNARY_OP(__vec16_d, __sqrt_varying_double, __sqrt_uniform_double) +#else +static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) { +#ifdef ISPC_FAST_MATH + return _mm512_rcp23_ps(v); // Approximation with 23 bits of accuracy. 
+#else + return _mm512_recip_ps(v); +#endif +} + +static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) { +#ifdef ISPC_FAST_MATH + return _mm512_rsqrt23_ps(v); // Approximation with 0.775ULP accuracy +#else + return _mm512_invsqrt_ps(v); +#endif +} +static FORCEINLINE __vec16_f __sqrt_varying_float (__vec16_f v) { return _mm512_sqrt_ps(v);} +static FORCEINLINE __vec16_d __sqrt_varying_double(__vec16_d v) { return __vec16_d(_mm512_sqrt_pd(v.v1),_mm512_sqrt_pd(v.v2));} +#endif + +/////////////////////////////////////////////////////////////////////////// +// svml +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_f __svml_logf(__vec16_f v) { return _mm512_log_ps(v); } +static FORCEINLINE __vec16_f __svml_expf(__vec16_f v) { return _mm512_exp_ps(v); } +static FORCEINLINE __vec16_f __svml_cosf(__vec16_f v) { return _mm512_cos_ps(v); } +static FORCEINLINE __vec16_f __svml_powf(__vec16_f a, __vec16_f b) { return _mm512_pow_ps(a,b); } + +static FORCEINLINE __vec16_d __svml_logd(__vec16_d v) { return __vec16_d(_mm512_log_pd(v.v1), _mm512_log_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_expd(__vec16_d v) { return __vec16_d(_mm512_exp_pd(v.v1), _mm512_exp_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_cosd(__vec16_d v) { return __vec16_d(_mm512_cos_pd(v.v1), _mm512_cos_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_powd(__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_pow_pd(a.v1,b.v1), _mm512_pow_pd(a.v2,b.v2)); } + +/////////////////////////////////////////////////////////////////////////// +// bit ops + +static FORCEINLINE int32_t __popcnt_int32(uint32_t v) { + int count = 0; + for (; v != 0; v >>= 1) + count += (v & 1); + return count; +} + +static FORCEINLINE int32_t __popcnt_int64(uint64_t v) { + int count = 0; + for (; v != 0; v >>= 1) + count += (v & 1); + return count; +} + +static FORCEINLINE int32_t __count_trailing_zeros_i32(uint32_t v) { + if (v == 0) + return 32; + + int count = 0; + while ((v & 1) == 0) { + ++count; + v >>= 1; + } + return count; +} + +static FORCEINLINE int64_t __count_trailing_zeros_i64(uint64_t v) { + if (v == 0) + return 64; + + int count = 0; + while ((v & 1) == 0) { + ++count; + v >>= 1; + } + return count; +} + +static FORCEINLINE int32_t __count_leading_zeros_i32(uint32_t v) { + if (v == 0) + return 32; + + int count = 0; + while ((v & (1<<31)) == 0) { + ++count; + v <<= 1; + } + return count; +} + +static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { + if (v == 0) + return 64; + + int count = 0; + while ((v & (1ull<<63)) == 0) { + ++count; + v <<= 1; + } + return count; +} + +/////////////////////////////////////////////////////////////////////////// +// reductions + +#if 0 +REDUCE_ADD(float, __vec16_f, __reduce_add_float) +REDUCE_MINMAX(float, __vec16_f, __reduce_min_float, <) +REDUCE_MINMAX(float, __vec16_f, __reduce_max_float, >) +#else +static FORCEINLINE float __reduce_add_float(__vec16_f v) { return _mm512_reduce_add_ps(v); } +static FORCEINLINE float __reduce_min_float(__vec16_f v) { return _mm512_reduce_min_ps(v); } +static FORCEINLINE float __reduce_max_float(__vec16_f v) { return _mm512_reduce_max_ps(v); } +#endif + +#if 0 +REDUCE_ADD(double, __vec16_d, __reduce_add_double) +REDUCE_MINMAX(double, __vec16_d, __reduce_min_double, <) +REDUCE_MINMAX(double, __vec16_d, __reduce_max_double, >) +#else +static FORCEINLINE float __reduce_add_double(__vec16_d v) { return _mm512_reduce_add_pd(v.v1) + _mm512_reduce_add_pd(v.v2); } +static FORCEINLINE float 
__reduce_min_double(__vec16_d v) { return std::min(_mm512_reduce_min_pd(v.v1), _mm512_reduce_min_pd(v.v2)); } +static FORCEINLINE float __reduce_max_double(__vec16_d v) { return std::max(_mm512_reduce_max_pd(v.v1), _mm512_reduce_max_pd(v.v2)); } +#endif + + + +#if 0 +REDUCE_ADD (int64_t, __vec16_i32, __reduce_add_int32) +REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_min_int32, <) +REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_max_int32, >) +REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_min_uint32, <) +REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_max_uint32, >) +#else +static FORCEINLINE int64_t __reduce_add_int32 (__vec16_i32 v) { return _mm512_reduce_add_epi32(v);} +static FORCEINLINE int32_t __reduce_min_int32 (__vec16_i32 v) { return _mm512_reduce_min_epi32(v);} +static FORCEINLINE int32_t __reduce_max_int32 (__vec16_i32 v) { return _mm512_reduce_max_epi32(v);} +static FORCEINLINE uint32_t __reduce_min_uint32 (__vec16_i32 v) { return _mm512_reduce_min_epu32(v);} +static FORCEINLINE uint32_t __reduce_max_uint32 (__vec16_i32 v) { return _mm512_reduce_max_epu32(v);} +#endif + +REDUCE_ADD ( int16_t, __vec16_i8, __reduce_add_int8) +REDUCE_ADD ( int32_t, __vec16_i16, __reduce_add_int16) +REDUCE_ADD ( int64_t, __vec16_i64, __reduce_add_int64) +REDUCE_MINMAX( int64_t, __vec16_i64, __reduce_min_int64, <) +REDUCE_MINMAX( int64_t, __vec16_i64, __reduce_max_int64, >) +REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_min_uint64, <) +REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >) + +/////////////////////////////////////////////////////////////////////////// +// masked load/store + +static FORCEINLINE __vec16_i8 __masked_load_i8(void *p, + __vec16_i1 mask) { + __vec16_i8 ret; + int8_t *ptr = (int8_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +static FORCEINLINE __vec16_i16 __masked_load_i16(void *p, + __vec16_i1 mask) { + __vec16_i16 ret; + int16_t *ptr = (int16_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +#if 0 +static FORCEINLINE __vec16_i32 __masked_load_i32(void *p, + __vec16_i1 mask) { + __vec16_i32 ret; + int32_t *ptr = (int32_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} +#else +static FORCEINLINE __vec16_i32 __masked_load_i32(void *p, __vec16_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return _mm512_mask_load_epi32(__vec16_i32(), mask, p); +#else + __vec16_i32 tmp; + tmp.v = _mm512_mask_extloadunpacklo_epi32(tmp.v, 0xFFFF, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_extloadunpackhi_epi32(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __vec16_i32 ret; + return _mm512_mask_mov_epi32(ret.v, mask, tmp.v); +#endif +} +#endif + +#if 0 +static FORCEINLINE __vec16_f __masked_load_float(void *p, + __vec16_i1 mask) { + __vec16_f ret; + float *ptr = (float *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} +#else +static FORCEINLINE __vec16_f __masked_load_float(void *p, __vec16_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return _mm512_mask_load_ps(_mm512_undefined_ps(), mask,p); +#else + __vec16_f tmp; + tmp.v = _mm512_mask_extloadunpacklo_ps(tmp.v, 0xFFFF, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_extloadunpackhi_ps(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + __vec16_f ret; + return _mm512_mask_mov_ps(ret.v, mask, tmp.v); +#endif +} +#endif + +static FORCEINLINE 
__vec16_i64 __masked_load_i64(void *p, + __vec16_i1 mask) { + __vec16_i64 ret; + int64_t *ptr = (int64_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +#if 0 +static FORCEINLINE __vec16_d __masked_load_double(void *p, + __vec16_i1 mask) { + __vec16_d ret; + double *ptr = (double *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} +#else +static FORCEINLINE __vec16_d __masked_load_double(void *p, __vec16_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + __vec16_d ret; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + ret.v1 = _mm512_mask_load_pd(ret.v1, mask, p); + ret.v2 = _mm512_mask_load_pd(ret.v2, tmp_m, (uint8_t*)p+64); + return ret; +#else + __vec16_d tmp; + tmp.v1 = _mm512_mask_extloadunpacklo_pd(tmp.v1, 0xFF, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_mask_extloadunpackhi_pd(tmp.v1, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_mask_extloadunpacklo_pd(tmp.v2, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_mask_extloadunpackhi_pd(tmp.v2, 0xFF, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + __vec16_d ret; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + ret.v1 = _mm512_mask_mov_pd(ret.v1, mask, tmp.v1); + ret.v2 = _mm512_mask_mov_pd(ret.v2, tmp_m, tmp.v2); + return ret; +#endif +} +#endif + + +static FORCEINLINE void __masked_store_i8(void *p, __vec16_i8 val, + __vec16_i1 mask) { + int8_t *ptr = (int8_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +static FORCEINLINE void __masked_store_i16(void *p, __vec16_i16 val, + __vec16_i1 mask) { + int16_t *ptr = (int16_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +#if 0 +static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, + __vec16_i1 mask) { + int32_t *ptr = (int32_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} +#else +static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_mask_store_epi32(p, mask, val.v); +#else + __vec16_i32 tmp; + tmp.v = _mm512_extloadunpacklo_epi32(tmp.v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_extloadunpackhi_epi32(tmp.v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_mov_epi32(tmp.v, mask, val.v); + _mm512_extpackstorelo_epi32(p, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +#endif +} +#endif + +#if 0 +static FORCEINLINE void __masked_store_float(void *p, __vec16_f val, + __vec16_i1 mask) { + float *ptr = (float *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} +#else +static FORCEINLINE void __masked_store_float(void *p, __vec16_f val, + __vec16_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_mask_store_ps(p, mask, val.v); +#else + __vec16_f tmp; + tmp.v = _mm512_extloadunpacklo_ps(tmp.v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_extloadunpackhi_ps(tmp.v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_mov_ps(tmp.v, mask, val.v); + _mm512_extpackstorelo_ps(p, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_ps((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); +#endif +} +#endif + +static FORCEINLINE void 
__masked_store_i64(void *p, __vec16_i64 val, + __vec16_i1 mask) { + int64_t *ptr = (int64_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +#if 0 +static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, + __vec16_i1 mask) { + double *ptr = (double *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} +#else +static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, + __vec16_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + _mm512_mask_store_pd(p, mask, val.v1); + _mm512_mask_store_pd((uint8_t*)p+64, tmp_m, val.v2); +#else + __vec16_d tmp; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + tmp.v1 = _mm512_extloadunpacklo_pd(tmp.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_extloadunpackhi_pd(tmp.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_extloadunpacklo_pd(tmp.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_extloadunpackhi_pd(tmp.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_mask_mov_pd(tmp.v1, mask, val.v1); + tmp.v2 = _mm512_mask_mov_pd(tmp.v2, tmp_m, val.v2); + _mm512_extpackstorelo_pd(p, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_pd((uint8_t*)p+64, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+128, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); +#endif +} +#endif + +static FORCEINLINE void __masked_store_blend_i8(void *p, __vec16_i8 val, + __vec16_i1 mask) { + __masked_store_i8(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i16(void *p, __vec16_i16 val, + __vec16_i1 mask) { + __masked_store_i16(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i32(void *p, __vec16_i32 val, + __vec16_i1 mask) { + __masked_store_i32(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val, + __vec16_i1 mask) { + __masked_store_float(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i64(void *p, __vec16_i64 val, + __vec16_i1 mask) { + __masked_store_i64(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val, + __vec16_i1 mask) { + __masked_store_double(p, val, mask); +} + +/////////////////////////////////////////////////////////////////////////// +// gather/scatter + +// offsets * offsetScale is in bytes (for all of these) + +#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, __vec16_i1 mask) { \ + VTYPE ret; \ + int8_t *base = (int8_t *)b; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset[i]); \ + ret[i] = *ptr; \ + } \ + return ret; \ +} + + +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) +#else +static FORCEINLINE __vec16_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) +{ + // (iw): need to temporarily store as int because gathers can only return ints. 
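+  // _MM_UPCONV_EPI32_SINT8 tells the gather to read one byte per lane and
+  // sign-extend it to 32 bits; the extstore below packs the lanes back to int8.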
+ __vec16_i32 tmp = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, base, + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + // now, downconverting to chars into temporary char vector + __vec16_i8 ret; + _mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; +} +#endif +GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) +/****************/ +GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) +GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32) +#else +static FORCEINLINE __vec16_i32 __gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) +{ + return _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, + base, _MM_UPCONV_EPI32_NONE, scale, + _MM_HINT_NONE); +} +#endif +GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float) +#else +static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) +{ + return _mm512_mask_i32extgather_ps(_mm512_undefined_ps(), mask, offsets, + base, _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); +} +#endif +GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float) +/****************/ +GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) +GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_base_offsets32_double) +#else +static FORCEINLINE __vec16_d __gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) +{ + __vec16_d ret; + ret.v1 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); + __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); + const __mmask8 mask8 = 0x00FF & (mask >> 8); /* evghenii::testme */ + ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask8, shuffled_offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); + return ret; +} +#endif +GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_base_offsets64_double) + +#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)ptrs[i]; \ + ret[i] = *ptr; \ + } \ + return ret; \ +} +#define GATHER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ +static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ + return FUNC1(0, 1, ptrs, mask); \ +} + + +#if 1 +/***********/ +GATHER_GENERALF(__vec16_i8, int8_t, __vec16_i32, __gather32_i8, __gather_base_offsets32_i8) +GATHER_GENERALF(__vec16_i16, int16_t, __vec16_i32, __gather32_i16, __gather_base_offsets32_i16) +GATHER_GENERALF(__vec16_i32, int32_t, __vec16_i32, __gather32_i32, __gather_base_offsets32_i32) +GATHER_GENERALF(__vec16_i64, int64_t, __vec16_i32, __gather32_i64, __gather_base_offsets32_i64) +GATHER_GENERALF(__vec16_f, float, __vec16_i32, __gather32_float, __gather_base_offsets32_float) +GATHER_GENERALF(__vec16_d, double, 
__vec16_i32, __gather32_double, __gather_base_offsets32_double) +/***********/ +GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __gather64_i8); +GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __gather64_i16); +GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __gather64_i32); +GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64); +GATHER_GENERAL(__vec16_f, float, __vec16_i64, __gather64_float); +GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double); +/***********/ +#endif + +// scatter + +#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, VTYPE val, \ + __vec16_i1 mask) { \ + int8_t *base = (int8_t *)b; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset[i]); \ + *ptr = val[i]; \ + } \ +} + + +/*****************/ +SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8) +SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8) +/*****************/ +SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16) +SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16) +/*****************/ +#if 0 +SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32) +#else +static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec16_i32 offsets, __vec16_i32 val, __vec16_i1 mask) +{ + _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); +} +#endif +SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) +/*****************/ +#if 0 +SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float) +#else +static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets, + __vec16_f val, __vec16_i1 mask) +{ + _mm512_mask_i32extscatter_ps(base, mask, offsets, val, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); +} +#endif +SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float) +/*****************/ +SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) +SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) +/*****************/ +#if 0 /* evghenii::to implement */ +SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_base_offsets32_double) +#else /* evghenii:testme */ +static FORCEINLINE void __scatter_base_offsets32_double(void *base, uint32_t scale, __vec16_i32 offsets, + __vec16_d val, __vec16_i1 mask) +{ + _mm512_mask_i32loextscatter_pd(base, mask, offsets, val.v1, + _MM_DOWNCONV_PD_NONE, scale, + _MM_HINT_NONE); + __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); + const __mmask8 mask8 = 0x00FF & (mask >> 8); /* evghenii::testme */ + _mm512_mask_i32loextscatter_pd(base, mask8, shuffled_offsets, val.v2, + _MM_DOWNCONV_PD_NONE, scale, + _MM_HINT_NONE); +} +#endif +SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_base_offsets64_double) + +#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ +static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)ptrs[i]; \ + *ptr = val[i]; \ + } \ +} +#define SCATTER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ +static 
FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ + return FUNC1(0, 1, ptrs, val, mask); \ +} + +#if 1 +/***********/ +SCATTER_GENERALF(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8, __scatter_base_offsets32_i8) +SCATTER_GENERALF(__vec16_i16, int16_t, __vec16_i32, __scatter32_i16, __scatter_base_offsets32_i16) +SCATTER_GENERALF(__vec16_i32, int32_t, __vec16_i32, __scatter32_i32, __scatter_base_offsets32_i32) +SCATTER_GENERALF(__vec16_i64, int64_t, __vec16_i32, __scatter32_i64, __scatter_base_offsets32_i64) +SCATTER_GENERALF(__vec16_f, float, __vec16_i32, __scatter32_float, __scatter_base_offsets32_float) +SCATTER_GENERALF(__vec16_d, double, __vec16_i32, __scatter32_double, __scatter_base_offsets32_double) +/***********/ +SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __scatter64_i8) +SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __scatter64_i16) +SCATTER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __scatter64_i32) +SCATTER_GENERAL(__vec16_f, float, __vec16_i64, __scatter64_float) +SCATTER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __scatter64_i64) +SCATTER_GENERAL(__vec16_d, double, __vec16_i64, __scatter64_double) +/***********/ +#endif + +/////////////////////////////////////////////////////////////////////////// +// packed load/store + +#if 0 +static FORCEINLINE int32_t __packed_load_active(int32_t *ptr, __vec16_i32 *val, + __vec16_i1 mask) { + int count = 0; + for (int i = 0; i < 16; ++i) { + if ((mask.v & (1 << i)) != 0) { + val->operator[](i) = *ptr++; + ++count; + } + } + return count; +} +#endif + +#if 0 +static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, + __vec16_i32 val, + __vec16_i1 mask) { + int count = 0; + for (int i = 0; i < 16; ++i) { + if ((mask.v & (1 << i)) != 0) { + *ptr++ = val[i]; + ++count; + } + } + return count; +} +#endif + +#if 0 +static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, + __vec16_i32 *val, + __vec16_i1 mask) { + int count = 0; + for (int i = 0; i < 16; ++i) { + if ((mask.v & (1 << i)) != 0) { + val->operator[](i) = *ptr++; + ++count; + } + } + return count; +} +static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, + __vec16_i32 val, + __vec16_i1 mask) { + int count = 0; + for (int i = 0; i < 16; ++i) { + if ((mask.v & (1 << i)) != 0) { + *ptr++ = val[i]; + ++count; + } + } + return count; +} +#endif + +#if 1 +static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec16_i32 *val, + __vec16_i1 mask) { + __vec16_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(mask)); +} +#endif + +#if 1 +static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, + __vec16_i1 mask) { + _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(mask)); +} +#endif + +#if 1 +static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec16_i32 *val, + __vec16_i1 mask) { + __vec16_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(mask)); +} +#endif + +#if 1 +static 
FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec16_i32 val, + __vec16_i1 mask) { + _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(mask)); +} +#endif + +/////////////////////////////////////////////////////////////////////////// +// aos/soa + +static FORCEINLINE void __soa_to_aos3_float(__vec16_f v0, __vec16_f v1, __vec16_f v2, + float *ptr) { + for (int i = 0; i < 16; ++i) { + *ptr++ = __extract_element(v0, i); + *ptr++ = __extract_element(v1, i); + *ptr++ = __extract_element(v2, i); + } +} + +static FORCEINLINE void __aos_to_soa3_float(float *ptr, __vec16_f *out0, __vec16_f *out1, + __vec16_f *out2) { + for (int i = 0; i < 16; ++i) { + __insert_element(out0, i, *ptr++); + __insert_element(out1, i, *ptr++); + __insert_element(out2, i, *ptr++); + } +} + +static FORCEINLINE void __soa_to_aos4_float(__vec16_f v0, __vec16_f v1, __vec16_f v2, + __vec16_f v3, float *ptr) { + for (int i = 0; i < 16; ++i) { + *ptr++ = __extract_element(v0, i); + *ptr++ = __extract_element(v1, i); + *ptr++ = __extract_element(v2, i); + *ptr++ = __extract_element(v3, i); + } +} + +static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec16_f *out0, __vec16_f *out1, + __vec16_f *out2, __vec16_f *out3) { + for (int i = 0; i < 16; ++i) { + __insert_element(out0, i, *ptr++); + __insert_element(out1, i, *ptr++); + __insert_element(out2, i, *ptr++); + __insert_element(out3, i, *ptr++); + } +} + +/////////////////////////////////////////////////////////////////////////// +// prefetch + +static FORCEINLINE void __prefetch_read_uniform_1(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T0); // prefetch into L1$ +} + +static FORCEINLINE void __prefetch_read_uniform_2(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T1); // prefetch into L2$ +} + +static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *p) { + // There is no L3$ on KNC, don't want to pollute L2$ unecessarily +} + +static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T2); // prefetch into L2$ with non-temporal hint + // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint +} + +/////////////////////////////////////////////////////////////////////////// +// atomics + +static FORCEINLINE uint32_t __atomic_add(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAdd((LONG volatile *)p, v) - v; +#else + return __sync_fetch_and_add(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_sub(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAdd((LONG volatile *)p, -v) + v; +#else + return __sync_fetch_and_sub(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_and(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAnd((LONG volatile *)p, v); +#else + return __sync_fetch_and_and(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_or(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedOr((LONG volatile *)p, v); +#else + return __sync_fetch_and_or(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_xor(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedXor((LONG volatile *)p, v); +#else + return __sync_fetch_and_xor(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_min(uint32_t *p, uint32_t v) { + int32_t old, min; + do { + old = *((volatile int32_t *)p); + min = (old < (int32_t)v) ? 
old : (int32_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_max(uint32_t *p, uint32_t v) { + int32_t old, max; + do { + old = *((volatile int32_t *)p); + max = (old > (int32_t)v) ? old : (int32_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_umin(uint32_t *p, uint32_t v) { + uint32_t old, min; + do { + old = *((volatile uint32_t *)p); + min = (old < v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_umax(uint32_t *p, uint32_t v) { + uint32_t old, max; + do { + old = *((volatile uint32_t *)p); + max = (old > v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_xchg(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedExchange((LONG volatile *)p, v); +#else + return __sync_lock_test_and_set(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_cmpxchg(uint32_t *p, uint32_t cmpval, + uint32_t newval) { +#ifdef _MSC_VER + return InterlockedCompareExchange((LONG volatile *)p, newval, cmpval); +#else + return __sync_val_compare_and_swap(p, cmpval, newval); +#endif +} + +static FORCEINLINE uint64_t __atomic_add(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAdd64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_add(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_sub(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAdd64((LONGLONG volatile *)p, -v) + v; +#else + return __sync_fetch_and_sub(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_and(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAnd64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_and(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_or(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedOr64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_or(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_xor(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedXor64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_xor(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_min(uint64_t *p, uint64_t v) { + int64_t old, min; + do { + old = *((volatile int64_t *)p); + min = (old < (int64_t)v) ? old : (int64_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_max(uint64_t *p, uint64_t v) { + int64_t old, max; + do { + old = *((volatile int64_t *)p); + max = (old > (int64_t)v) ? 
old : (int64_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_umin(uint64_t *p, uint64_t v) { + uint64_t old, min; + do { + old = *((volatile uint64_t *)p); + min = (old < v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_umax(uint64_t *p, uint64_t v) { + uint64_t old, max; + do { + old = *((volatile uint64_t *)p); + max = (old > v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_xchg(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedExchange64((LONGLONG volatile *)p, v); +#else + return __sync_lock_test_and_set(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, + uint64_t newval) { +#ifdef _MSC_VER + return InterlockedCompareExchange64((LONGLONG volatile *)p, newval, cmpval); +#else + return __sync_val_compare_and_swap(p, cmpval, newval); +#endif +} + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} + +#endif // !WIN32 + +#undef FORCEINLINE +#undef PRE_ALIGN +#undef POST_ALIGN diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h new file mode 100644 index 00000000..de9bddcc --- /dev/null +++ b/examples/intrinsics/knc-i1x8.h @@ -0,0 +1,2862 @@ +/** + Copyright (c) 2010-2012, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include +#include +#include +#include + +#if 0 +#define __ZMM32BIT__ +#endif + + +#ifdef _MSC_VER +#define FORCEINLINE __forceinline +#define PRE_ALIGN(x) /*__declspec(align(x))*/ +#define POST_ALIGN(x) +#define roundf(x) (floorf(x + .5f)) +#define round(x) (floor(x + .5)) +#else +#define FORCEINLINE __attribute__((always_inline)) +#define PRE_ALIGN(x) +#define POST_ALIGN(x) __attribute__ ((aligned(x))) +#endif + +#if 0 +#define KNC 1 +extern "C" +{ + int printf(const unsigned char *, ...); + int puts(unsigned char *); + unsigned int putchar(unsigned int); + int fflush(void *); + uint8_t *memcpy(uint8_t *, uint8_t *, uint64_t); + uint8_t *memset(uint8_t *, uint8_t, uint64_t); + void memset_pattern16(void *, const void *, uint64_t); +} +#endif + +typedef float __vec1_f; +typedef double __vec1_d; +typedef int8_t __vec1_i8; +typedef int16_t __vec1_i16; +typedef int32_t __vec1_i32; +typedef int64_t __vec1_i64; + +struct __vec8_i1 { + __vec8_i1() { } + __vec8_i1(const __mmask16 &vv) : v(vv) { } + __vec8_i1(bool v0, bool v1, bool v2, bool v3, + bool v4, bool v5, bool v6, bool v7) { + v = ((v0 & 1) | + ((v1 & 1) << 1) | + ((v2 & 1) << 2) | + ((v3 & 1) << 3) | + ((v4 & 1) << 4) | + ((v5 & 1) << 5) | + ((v6 & 1) << 6) | + ((v7 & 1) << 7) ); + } + + __mmask8 v; + FORCEINLINE operator __mmask8() const { return v; } +}; + + +template +struct vec8 { + vec8() { } + vec8(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) { + data[0] = v0; data[1] = v1; data[2] = v2; data[3] = v3; + data[4] = v4; data[5] = v5; data[6] = v6; data[7] = v7; + } + T data[8]; + FORCEINLINE const T& operator[](const int i) const { return data[i]; } + FORCEINLINE T& operator[](const int i) { return data[i]; } +}; + +/****************/ + +#ifndef __ZMM32BIT__ +struct PRE_ALIGN(32) __vec8_i32 : public vec8 { + __vec8_i32() { } + FORCEINLINE __vec8_i32(int32_t v0, int32_t v1, int32_t v2, int32_t v3, + int32_t v4, int32_t v5, int32_t v6, int32_t v7) + : vec8(v0, v1, v2, v3, v4, v5, v6, v7) { } + FORCEINLINE __vec8_i32(__m512i v) + { + union { __m512i v; int32_t s[8]; } val = {v}; + data[0] = val.s[0]; + data[1] = val.s[1]; + data[2] = val.s[2]; + data[3] = val.s[3]; + data[4] = val.s[4]; + data[5] = val.s[5]; + data[6] = val.s[6]; + data[7] = val.s[7]; + } + FORCEINLINE operator __m512i() const + { + return _mm512_set_16to16_pi( + 0,0,0,0, 0,0,0,0, + data[7],data[6],data[5],data[4],data[3],data[2],data[1],data[0]); + } +} POST_ALIGN(32); +#else /* __ZMM32BIT__ */ +struct PRE_ALIGN(32) __vec8_i32 +{ + __m512i v; + FORCEINLINE operator __m512i() const { return v; } + FORCEINLINE __vec8_i32() : v(_mm512_undefined_epi32()) {} + FORCEINLINE __vec8_i32(const __m512i &in) : v(in) {} + FORCEINLINE __vec8_i32(const __vec8_i32 &o) : v(o.v) {} + FORCEINLINE __vec8_i32& operator =(const __vec8_i32 &o) { v=o.v; return *this; } + FORCEINLINE __vec8_i32(int32_t v00, int32_t v01, int32_t v02, int32_t v03, + int32_t v04, int32_t v05, int32_t v06, int32_t v07) : + v ( _mm512_set_16to16_pi(0,0,0,0,0,0,0,0, 
v07, v06, v05, v04, v03, v02, v01, v00) ) {} + FORCEINLINE const int32_t& operator[](const int i) const { return ((int32_t*)this)[i]; } + FORCEINLINE int32_t& operator[](const int i) { return ((int32_t*)this)[i]; } +} POST_ALIGN(32); +#endif /* __ZMM32BIT__ */ + +#ifndef __ZMM32BIT__ /* __ZMM32BIT__ */ +PRE_ALIGN(32) struct __vec8_f : public vec8 { + __vec8_f() { } + FORCEINLINE __vec8_f(float v0, float v1, float v2, float v3, + float v4, float v5, float v6, float v7) + : vec8(v0, v1, v2, v3, v4, v5, v6, v7) { } + FORCEINLINE operator __m512() const + { + return _mm512_set_16to16_ps( + 0,0,0,0,0,0,0,0, + data[7],data[6],data[5],data[4],data[3],data[2],data[1],data[0]); + } + FORCEINLINE __vec8_f(__m512 v) + { + union { __m512 v; float s[8]; } val = {v}; + data[0] = val.s[0]; + data[1] = val.s[1]; + data[2] = val.s[2]; + data[3] = val.s[3]; + data[4] = val.s[4]; + data[5] = val.s[5]; + data[6] = val.s[6]; + data[7] = val.s[7]; + } +} POST_ALIGN(32); +#else /* __ZMM32BIT__ */ +PRE_ALIGN(32) struct __vec8_f +{ + __m512 v; + FORCEINLINE operator __m512() const { return v; } + FORCEINLINE __vec8_f() : v(_mm512_undefined_ps()) { } + FORCEINLINE __vec8_f(const __m512 &in) : v(in) {} + FORCEINLINE __vec8_f(const __vec8_f &o) : v(o.v) {} + FORCEINLINE __vec8_f& operator =(const __vec8_f &o) { v=o.v; return *this; } + FORCEINLINE __vec8_f(float v00, float v01, float v02, float v03, + float v04, float v05, float v06, float v07) : + v ( _mm512_set_16to16_ps(0,0,0,0,0,0,0,0, v07, v06, v05, v04, v03, v02, v01, v00) ) {} + FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; } + FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } +} POST_ALIGN(32); +#endif /* __ZMM32BIT__ */ + +struct PRE_ALIGN(64) __vec8_d +{ + __m512d v; + FORCEINLINE __vec8_d() : v(_mm512_undefined_pd()) {} + FORCEINLINE __vec8_d(const __m512d _v) : v(_v) {} + FORCEINLINE __vec8_d(const __vec8_d &o) : v(o.v) {} + FORCEINLINE __vec8_d& operator =(const __vec8_d &o) { v=o.v; return *this; } + FORCEINLINE operator __m512d() const { return v; } + FORCEINLINE __vec8_d(double v00, double v01, double v02, double v03, + double v04, double v05, double v06, double v07) : + v ( _mm512_set_8to8_pd(v07, v06, v05, v04, v03, v02, v01, v00) ) {} + FORCEINLINE const double& operator[](const int i) const { return ((double*)this)[i]; } + FORCEINLINE double& operator[](const int i) { return ((double*)this)[i]; } +} POST_ALIGN(64); + +/****************/ + +PRE_ALIGN(64) struct __vec8_i64 : public vec8 { + __vec8_i64() { } + __vec8_i64(int64_t v0, int64_t v1, int64_t v2, int64_t v3, + int64_t v4, int64_t v5, int64_t v6, int64_t v7) + : vec8(v0, v1, v2, v3, v4, v5, v6, v7) { } +} POST_ALIGN(64); + +PRE_ALIGN(16) struct __vec8_i8 : public vec8 { + __vec8_i8() { } + __vec8_i8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, + int8_t v4, int8_t v5, int8_t v6, int8_t v7) + : vec8(v0, v1, v2, v3, v4, v5, v6, v7) { } +} POST_ALIGN(16); + +PRE_ALIGN(32) struct __vec8_i16 : public vec8 { + __vec8_i16() { } + __vec8_i16(int16_t v0, int16_t v1, int16_t v2, int16_t v3, + int16_t v4, int16_t v5, int16_t v6, int16_t v7) + : vec8(v0, v1, v2, v3, v4, v5, v6, v7) { } +} POST_ALIGN(32); + +static inline int32_t __extract_element(__vec8_i32, int); + + +/////////////////////////////////////////////////////////////////////////// +// macros... 
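+// The helper macros below generate scalar fallback implementations that
+// simply loop over the eight lanes; the int32/float/double sections later
+// in this header replace many of these with native 512-bit intrinsics.
+// As a rough sketch (whitespace aside), BINARY_OP(__vec8_i8, __add, +)
+// expands to:
+//
+//   static FORCEINLINE __vec8_i8 __add(__vec8_i8 a, __vec8_i8 b) {
+//       __vec8_i8 ret;
+//       for (int i = 0; i < 8; ++i)
+//           ret[i] = a[i] + b[i];   // per-lane scalar add
+//       return ret;
+//   }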
+ +#define UNARY_OP(TYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE v) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = OP(v[i]); \ + return ret; \ +} + +#define BINARY_OP(TYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = a[i] OP b[i]; \ + return ret; \ +} + +#define BINARY_OP_CAST(TYPE, CAST, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = (CAST)(a[i]) OP (CAST)(b[i]); \ + return ret; \ +} + +#define BINARY_OP_FUNC(TYPE, NAME, FUNC) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = FUNC(a[i], b[i]); \ + return ret; \ +} + +#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP) \ +static FORCEINLINE __vec8_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \ + __vec8_i1 ret; \ + ret.v = 0; \ + for (int i = 0; i < 8; ++i) \ + ret.v |= ((CAST)(a[i]) OP (CAST)(b[i])) << i; \ + return ret; \ +} \ +static FORCEINLINE __vec8_i1 NAME##_##SUFFIX##_and_mask(TYPE a, TYPE b, \ + __vec8_i1 mask) { \ + __vec8_i1 ret; \ + ret.v = 0; \ + for (int i = 0; i < 8; ++i) \ + ret.v |= ((CAST)(a[i]) OP (CAST)(b[i])) << i; \ + ret.v &= mask.v; \ + return ret; \ +} + +#define INSERT_EXTRACT(VTYPE, STYPE) \ +static FORCEINLINE STYPE __extract_element(VTYPE v, int index) { \ + return ((STYPE *)&v)[index]; \ +} \ +static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \ + ((STYPE *)v)[index] = val; \ +} + +#define LOAD_STORE(VTYPE, STYPE) \ +template \ +static FORCEINLINE VTYPE __load(const VTYPE *p) { \ + STYPE *ptr = (STYPE *)p; \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = ptr[i]; \ + return ret; \ +} \ +template \ +static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \ + STYPE *ptr = (STYPE *)p; \ + for (int i = 0; i < 8; ++i) \ + ptr[i] = v[i]; \ +} + +#define LOADS(VTYPE, STYPE) \ +template \ +static FORCEINLINE VTYPE __load(const VTYPE *p) { \ + STYPE *ptr = (STYPE *)p; \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = ptr[i]; \ + return ret; \ +} \ + +#define STORES(VTYPE, STYPE) \ +template \ +static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \ + STYPE *ptr = (STYPE *)p; \ + for (int i = 0; i < 8; ++i) \ + ptr[i] = v[i]; \ +} + +#define REDUCE_ADD(TYPE, VTYPE, NAME) \ +static FORCEINLINE TYPE NAME(VTYPE v) { \ + TYPE ret = v[0]; \ + for (int i = 1; i < 8; ++i) \ + ret = ret + v[i]; \ + return ret; \ +} + +#define REDUCE_MINMAX(TYPE, VTYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(VTYPE v) { \ + TYPE ret = v[0]; \ + for (int i = 1; i < 8; ++i) \ + ret = (ret OP (TYPE)v[i]) ? 
ret : (TYPE)v[i]; \ + return ret; \ +} + +#define SELECT(TYPE) \ +static FORCEINLINE TYPE __select(__vec8_i1 mask, TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = (mask.v & (1< VTYPE __smear_##NAME(STYPE); \ +template <> FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = v; \ + return ret; \ +} + +#define SETZERO(VTYPE, NAME) \ +template VTYPE __setzero_##NAME(); \ +template <> FORCEINLINE VTYPE __setzero_##NAME() { \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = 0; \ + return ret; \ +} + +#define UNDEF(VTYPE, NAME) \ +template VTYPE __undef_##NAME(); \ +template <> FORCEINLINE VTYPE __undef_##NAME() { \ + return VTYPE(); \ +} + +#define BROADCAST(VTYPE, NAME, STYPE) \ +static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = v[index & 0x7]; \ + return ret; \ +} \ + +#define ROTATE(VTYPE, NAME, STYPE) \ +static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) { \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = v[(i+index) & 0x7]; \ + return ret; \ +} \ + +#define SHUFFLES(VTYPE, NAME, STYPE) \ +static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec8_i32 index) { \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = v[__extract_element(index, i) & 0x7]; \ + return ret; \ +} \ +static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec8_i32 index) { \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) { \ + int ii = __extract_element(index, i) & 0xf; \ + ret[i] = (ii < 8) ? v0[ii] : v1[ii-8]; \ + } \ + return ret; \ +} + +#define SHUFFLE2(VTYPE, NAME, STYPE) \ +static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec8_i32 index) { \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) { \ + int ii = __extract_element(index, i) & 0xf; \ + ret[i] = (ii < 8) ? 
v0[ii] : v1[ii-8]; \ + } \ + return ret; \ +} + +/////////////////////////////////////////////////////////////////////////// + +INSERT_EXTRACT(__vec1_i8, int8_t) +INSERT_EXTRACT(__vec1_i16, int16_t) +INSERT_EXTRACT(__vec1_i32, int32_t) +INSERT_EXTRACT(__vec1_i64, int64_t) +INSERT_EXTRACT(__vec1_f, float) +INSERT_EXTRACT(__vec1_d, double) + +/////////////////////////////////////////////////////////////////////////// +// mask ops + +static FORCEINLINE uint64_t __movmsk(__vec8_i1 mask) { + return (uint64_t)mask.v; +} + +static FORCEINLINE bool __any(__vec8_i1 mask) { + return (mask.v!=0); +} + +static FORCEINLINE bool __all(__vec8_i1 mask) { + return (mask.v==0xFF); +} + +static FORCEINLINE bool __none(__vec8_i1 mask) { + return (mask.v==0); +} + +static FORCEINLINE __vec8_i1 __equal_i1(__vec8_i1 a, __vec8_i1 b) { + __vec8_i1 r; + r.v = (a.v & b.v) | (~a.v & ~b.v); + return r; +} + +static FORCEINLINE __vec8_i1 __and(__vec8_i1 a, __vec8_i1 b) { + __vec8_i1 r; + r.v = a.v & b.v; + return r; +} + +static FORCEINLINE __vec8_i1 __xor(__vec8_i1 a, __vec8_i1 b) { + __vec8_i1 r; + r.v = a.v ^ b.v; + return r; +} + +static FORCEINLINE __vec8_i1 __or(__vec8_i1 a, __vec8_i1 b) { + __vec8_i1 r; + r.v = a.v | b.v; + return r; +} + +static FORCEINLINE __vec8_i1 __not(__vec8_i1 v) { + __vec8_i1 r; + r.v = ~v.v; + return r; +} + +static FORCEINLINE __vec8_i1 __and_not1(__vec8_i1 a, __vec8_i1 b) { + __vec8_i1 r; + r.v = ~a.v & b.v; + return r; +} + +static FORCEINLINE __vec8_i1 __and_not2(__vec8_i1 a, __vec8_i1 b) { + __vec8_i1 r; + r.v = a.v & ~b.v; + return r; +} + +static FORCEINLINE __vec8_i1 __select(__vec8_i1 mask, __vec8_i1 a, + __vec8_i1 b) { + __vec8_i1 r; + r.v = (a.v & mask.v) | (b.v & ~mask.v); + return r; +} + +static FORCEINLINE __vec8_i1 __select(bool cond, __vec8_i1 a, __vec8_i1 b) { + return cond ? a : b; +} + +static FORCEINLINE bool __extract_element(__vec8_i1 vec, int index) { + return (vec.v & (1 << index)) ? 
true : false; +} + +static FORCEINLINE void __insert_element(__vec8_i1 *vec, int index, + bool val) { + if (val == false) + vec->v &= ~(1 << index); + else + vec->v |= (1 << index); +} + +template static FORCEINLINE __vec8_i1 __load(const __vec8_i1 *p) { + uint8_t *ptr = (uint8_t *)p; + __vec8_i1 r; + r.v = *ptr; + return r; +} + +template static FORCEINLINE void __store(__vec8_i1 *p, __vec8_i1 v) { + uint8_t *ptr = (uint8_t *)p; + *ptr = v.v; +} + +template RetVecType __smear_i1(int i); +template <> static FORCEINLINE __vec8_i1 __smear_i1<__vec8_i1>(int i) { + return i?0xFF:0x0; +} + +template RetVecType __setzero_i1(); +template <> static FORCEINLINE __vec8_i1 __setzero_i1<__vec8_i1>() { + return 0; +} + +template __vec8_i1 __undef_i1(); +template <> FORCEINLINE __vec8_i1 __undef_i1<__vec8_i1>() { + return __vec8_i1(); +} + + +/////////////////////////////////////////////////////////////////////////// +// int8 + +BINARY_OP(__vec8_i8, __add, +) +BINARY_OP(__vec8_i8, __sub, -) +BINARY_OP(__vec8_i8, __mul, *) + +BINARY_OP(__vec8_i8, __or, |) +BINARY_OP(__vec8_i8, __and, &) +BINARY_OP(__vec8_i8, __xor, ^) +BINARY_OP(__vec8_i8, __shl, <<) + +BINARY_OP_CAST(__vec8_i8, uint8_t, __udiv, /) +BINARY_OP_CAST(__vec8_i8, int8_t, __sdiv, /) + +BINARY_OP_CAST(__vec8_i8, uint8_t, __urem, %) +BINARY_OP_CAST(__vec8_i8, int8_t, __srem, %) +BINARY_OP_CAST(__vec8_i8, uint8_t, __lshr, >>) +BINARY_OP_CAST(__vec8_i8, int8_t, __ashr, >>) + +SHIFT_UNIFORM(__vec8_i8, uint8_t, __lshr, >>) +SHIFT_UNIFORM(__vec8_i8, int8_t, __ashr, >>) +SHIFT_UNIFORM(__vec8_i8, int8_t, __shl, <<) + +CMP_OP(__vec8_i8, i8, int8_t, __equal, ==) +CMP_OP(__vec8_i8, i8, int8_t, __not_equal, !=) +CMP_OP(__vec8_i8, i8, uint8_t, __unsigned_less_equal, <=) +CMP_OP(__vec8_i8, i8, int8_t, __signed_less_equal, <=) +CMP_OP(__vec8_i8, i8, uint8_t, __unsigned_greater_equal, >=) +CMP_OP(__vec8_i8, i8, int8_t, __signed_greater_equal, >=) +CMP_OP(__vec8_i8, i8, uint8_t, __unsigned_less_than, <) +CMP_OP(__vec8_i8, i8, int8_t, __signed_less_than, <) +CMP_OP(__vec8_i8, i8, uint8_t, __unsigned_greater_than, >) +CMP_OP(__vec8_i8, i8, int8_t, __signed_greater_than, >) + +SELECT(__vec8_i8) +INSERT_EXTRACT(__vec8_i8, int8_t) +SMEAR(__vec8_i8, i8, int8_t) +SETZERO(__vec8_i8, i8) +UNDEF(__vec8_i8, i8) +BROADCAST(__vec8_i8, i8, int8_t) +ROTATE(__vec8_i8, i8, int8_t) +SHUFFLES(__vec8_i8, i8, int8_t) +LOAD_STORE(__vec8_i8, int8_t) + +/////////////////////////////////////////////////////////////////////////// +// int16 + +BINARY_OP(__vec8_i16, __add, +) +BINARY_OP(__vec8_i16, __sub, -) +BINARY_OP(__vec8_i16, __mul, *) + +BINARY_OP(__vec8_i16, __or, |) +BINARY_OP(__vec8_i16, __and, &) +BINARY_OP(__vec8_i16, __xor, ^) +BINARY_OP(__vec8_i16, __shl, <<) + +BINARY_OP_CAST(__vec8_i16, uint16_t, __udiv, /) +BINARY_OP_CAST(__vec8_i16, int16_t, __sdiv, /) + +BINARY_OP_CAST(__vec8_i16, uint16_t, __urem, %) +BINARY_OP_CAST(__vec8_i16, int16_t, __srem, %) +BINARY_OP_CAST(__vec8_i16, uint16_t, __lshr, >>) +BINARY_OP_CAST(__vec8_i16, int16_t, __ashr, >>) + +SHIFT_UNIFORM(__vec8_i16, uint16_t, __lshr, >>) +SHIFT_UNIFORM(__vec8_i16, int16_t, __ashr, >>) +SHIFT_UNIFORM(__vec8_i16, int16_t, __shl, <<) + +CMP_OP(__vec8_i16, i16, int16_t, __equal, ==) +CMP_OP(__vec8_i16, i16, int16_t, __not_equal, !=) +CMP_OP(__vec8_i16, i16, uint16_t, __unsigned_less_equal, <=) +CMP_OP(__vec8_i16, i16, int16_t, __signed_less_equal, <=) +CMP_OP(__vec8_i16, i16, uint16_t, __unsigned_greater_equal, >=) +CMP_OP(__vec8_i16, i16, int16_t, __signed_greater_equal, >=) +CMP_OP(__vec8_i16, i16, uint16_t, 
__unsigned_less_than, <) +CMP_OP(__vec8_i16, i16, int16_t, __signed_less_than, <) +CMP_OP(__vec8_i16, i16, uint16_t, __unsigned_greater_than, >) +CMP_OP(__vec8_i16, i16, int16_t, __signed_greater_than, >) + +SELECT(__vec8_i16) +INSERT_EXTRACT(__vec8_i16, int16_t) +SMEAR(__vec8_i16, i16, int16_t) +SETZERO(__vec8_i16, i16) +UNDEF(__vec8_i16, i16) +BROADCAST(__vec8_i16, i16, int16_t) +ROTATE(__vec8_i16, i16, int16_t) +SHUFFLES(__vec8_i16, i16, int16_t) +LOAD_STORE(__vec8_i16, int16_t) + +#if 0 /* evghenii::int32 */ +/////////////////////////////////////////////////////////////////////////// +// int32 + +BINARY_OP(__vec8_i32, __add, +) +BINARY_OP(__vec8_i32, __sub, -) +BINARY_OP(__vec8_i32, __mul, *) + +BINARY_OP(__vec8_i32, __or, |) +BINARY_OP(__vec8_i32, __and, &) +BINARY_OP(__vec8_i32, __xor, ^) +BINARY_OP(__vec8_i32, __shl, <<) + +BINARY_OP_CAST(__vec8_i32, uint32_t, __udiv, /) +BINARY_OP_CAST(__vec8_i32, int32_t, __sdiv, /) + +BINARY_OP_CAST(__vec8_i32, uint32_t, __urem, %) +BINARY_OP_CAST(__vec8_i32, int32_t, __srem, %) +BINARY_OP_CAST(__vec8_i32, uint32_t, __lshr, >>) +BINARY_OP_CAST(__vec8_i32, int32_t, __ashr, >>) + +SHIFT_UNIFORM(__vec8_i32, uint32_t, __lshr, >>) +SHIFT_UNIFORM(__vec8_i32, int32_t, __ashr, >>) +SHIFT_UNIFORM(__vec8_i32, int32_t, __shl, <<) + +CMP_OP(__vec8_i32, i32, int32_t, __equal, ==) +CMP_OP(__vec8_i32, i32, int32_t, __not_equal, !=) +CMP_OP(__vec8_i32, i32, uint32_t, __unsigned_less_equal, <=) +CMP_OP(__vec8_i32, i32, int32_t, __signed_less_equal, <=) +CMP_OP(__vec8_i32, i32, uint32_t, __unsigned_greater_equal, >=) +CMP_OP(__vec8_i32, i32, int32_t, __signed_greater_equal, >=) +CMP_OP(__vec8_i32, i32, uint32_t, __unsigned_less_than, <) +CMP_OP(__vec8_i32, i32, int32_t, __signed_less_than, <) +CMP_OP(__vec8_i32, i32, uint32_t, __unsigned_greater_than, >) +CMP_OP(__vec8_i32, i32, int32_t, __signed_greater_than, >) + +SELECT(__vec8_i32) +INSERT_EXTRACT(__vec8_i32, int32_t) +SMEAR(__vec8_i32, i32, int32_t) +SETZERO(__vec8_i32, i32) +UNDEF(__vec8_i32, i32) +BROADCAST(__vec8_i32, i32, int32_t) +ROTATE(__vec8_i32, i32, int32_t) +SHUFFLES(__vec8_i32, i32, int32_t) +LOAD_STORE(__vec8_i32, int32_t) + +#else /* evghenii::int32 */ +/////////////////////////////////////////////////////////////////////////// +// int32 +/////////////////////////////////////////////////////////////////////////// + +#define IZERO _mm512_setzero_epi32() +static FORCEINLINE __vec8_i32 __add(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_add_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __sub(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_sub_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __mul(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_mullo_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __udiv(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_div_epu32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __sdiv(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_div_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __urem(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_rem_epu32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __srem(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_rem_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __or(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_or_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __and(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_and_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __xor(__vec8_i32 a, __vec8_i32 b) 
{ + return _mm512_mask_xor_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __shl(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_sllv_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __lshr(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_srlv_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __ashr(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_srav_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __shl(__vec8_i32 a, int32_t n) { + return _mm512_mask_slli_epi32(IZERO,0xFF, a, n); +} + +static FORCEINLINE __vec8_i32 __lshr(__vec8_i32 a, int32_t n) { + return _mm512_mask_srli_epi32(IZERO,0xFF, a, n); +} + +static FORCEINLINE __vec8_i32 __ashr(__vec8_i32 a, int32_t n) { + return _mm512_mask_srai_epi32(IZERO,0xFF, a, n); +} + +static FORCEINLINE __vec8_i1 __equal_i32(const __vec8_i32 &a, const __vec8_i32 &b) { + return _mm512_mask_cmpeq_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __equal_i32_and_mask(const __vec8_i32 &a, const __vec8_i32 &b, + __vec8_i1 m) { + return _mm512_mask_cmpeq_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpneq_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmpneq_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_less_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmple_epu32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_less_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmple_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __signed_less_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmple_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __signed_less_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmple_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_greater_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpge_epu32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_greater_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmpge_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __signed_greater_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpge_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __signed_greater_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmpge_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_less_than_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmplt_epu32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_less_than_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmplt_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __signed_less_than_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmplt_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __signed_less_than_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmplt_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_greater_than_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpgt_epu32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_greater_than_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmpgt_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 
__signed_greater_than_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpgt_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __signed_greater_than_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmpgt_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i32 __select(__vec8_i1 mask, + __vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_mov_epi32(b, mask, a); +} + +static FORCEINLINE __vec8_i32 __select(bool cond, __vec8_i32 a, __vec8_i32 b) { + return cond ? a : b; +} + +static FORCEINLINE int32_t __extract_element(__vec8_i32 v, int index) { //uint32_t index) { + return ((int32_t *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec8_i32 *v, uint32_t index, int32_t val) { + ((int32_t *)v)[index] = val; +} + +template RetVecType __smear_i32(int32_t i); +template <> static FORCEINLINE __vec8_i32 __smear_i32<__vec8_i32>(int32_t i) { + return _mm512_set_16to16_epi32(0,0,0,0,0,0,0,0, i,i,i,i,i,i,i,i); +} + +static const __vec8_i32 __ispc_one = __smear_i32<__vec8_i32>(1); +static const __vec8_i32 __ispc_thirty_two = __smear_i32<__vec8_i32>(32); +static const __vec8_i32 __ispc_ffffffff = __smear_i32<__vec8_i32>(-1); +static const __vec8_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7); + +template RetVecType __setzero_i32(); +template <> static FORCEINLINE __vec8_i32 __setzero_i32<__vec8_i32>() { + return _mm512_setzero_epi32(); +} + +template RetVecType __undef_i32(); +template <> static FORCEINLINE __vec8_i32 __undef_i32<__vec8_i32>() { + return __vec8_i32(); +} + +static FORCEINLINE __vec8_i32 __broadcast_i32(__vec8_i32 v, int index) { + int32_t val = __extract_element(v, index & 0xf); + return _mm512_set1_epi32(val); +} + +#if 0 /* evghenii::doesn't work */ +static FORCEINLINE __vec8_i32 __rotate_i32(__vec8_i32 v, int index) { + __vec8_i32 idx = __smear_i32<__vec8_i32>(index); + __vec8_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec8_i32>(0x7)); + return _mm512_mask_permutevar_epi32(v, 0xffff, shuffle, v); +} +#else +ROTATE(__vec8_i32, i32, int32_t) +#endif + +static FORCEINLINE __vec8_i32 __shuffle_i32(__vec8_i32 v, __vec8_i32 index) { + return _mm512_mask_permutevar_epi32(v, 0xffff, index, v); +} +SHUFFLE2(__vec8_i32, i32, int32_t) /* evghenii::to implement */ + +template static FORCEINLINE __vec8_i32 __load(const __vec8_i32 *p) { + __vec8_i32 v; + v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_extloadunpackhi_epi32(v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + return __select(0xFF,v,IZERO); +} + + +template static FORCEINLINE void __store(__vec8_i32 *p, __vec8_i32 v) { + _mm512_mask_extpackstorelo_epi32( p, 0xFF, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +} + +#if 0 +template <> static FORCEINLINE __vec8_i32 __load<64>(const __vec8_i32 *p) { + return _mm512_load_epi32(p); +} +template <> static FORCEINLINE void __store<64>(__vec8_i32 *p, __vec8_i32 v) { + _mm512_store_epi32(p, v); +} +#endif +#endif /* evghenii::int32 */ + +/////////////////////////////////////////////////////////////////////////// +// int64 + +BINARY_OP(__vec8_i64, __add, +) +BINARY_OP(__vec8_i64, __sub, -) +BINARY_OP(__vec8_i64, __mul, *) + +BINARY_OP(__vec8_i64, __or, |) +BINARY_OP(__vec8_i64, __and, &) +BINARY_OP(__vec8_i64, __xor, ^) +BINARY_OP(__vec8_i64, __shl, <<) + +BINARY_OP_CAST(__vec8_i64, uint64_t, __udiv, /) +BINARY_OP_CAST(__vec8_i64, int64_t, __sdiv, /) + 
+BINARY_OP_CAST(__vec8_i64, uint64_t, __urem, %) +BINARY_OP_CAST(__vec8_i64, int64_t, __srem, %) +BINARY_OP_CAST(__vec8_i64, uint64_t, __lshr, >>) +BINARY_OP_CAST(__vec8_i64, int64_t, __ashr, >>) + +SHIFT_UNIFORM(__vec8_i64, uint64_t, __lshr, >>) +SHIFT_UNIFORM(__vec8_i64, int64_t, __ashr, >>) +SHIFT_UNIFORM(__vec8_i64, int64_t, __shl, <<) + +CMP_OP(__vec8_i64, i64, int64_t, __equal, ==) +CMP_OP(__vec8_i64, i64, int64_t, __not_equal, !=) +CMP_OP(__vec8_i64, i64, uint64_t, __unsigned_less_equal, <=) +CMP_OP(__vec8_i64, i64, int64_t, __signed_less_equal, <=) +CMP_OP(__vec8_i64, i64, uint64_t, __unsigned_greater_equal, >=) +CMP_OP(__vec8_i64, i64, int64_t, __signed_greater_equal, >=) +CMP_OP(__vec8_i64, i64, uint64_t, __unsigned_less_than, <) +CMP_OP(__vec8_i64, i64, int64_t, __signed_less_than, <) +CMP_OP(__vec8_i64, i64, uint64_t, __unsigned_greater_than, >) +CMP_OP(__vec8_i64, i64, int64_t, __signed_greater_than, >) + +SELECT(__vec8_i64) +INSERT_EXTRACT(__vec8_i64, int64_t) +SMEAR(__vec8_i64, i64, int64_t) +SETZERO(__vec8_i64, i64) +UNDEF(__vec8_i64, i64) +BROADCAST(__vec8_i64, i64, int64_t) +ROTATE(__vec8_i64, i64, int64_t) +SHUFFLES(__vec8_i64, i64, int64_t) +LOAD_STORE(__vec8_i64, int64_t) + + +#if 0 /* evghenii::float */ +/////////////////////////////////////////////////////////////////////////// +// float + +BINARY_OP(__vec8_f, __add, +) +BINARY_OP(__vec8_f, __sub, -) +BINARY_OP(__vec8_f, __mul, *) +BINARY_OP(__vec8_f, __div, /) + +CMP_OP(__vec8_f, float, float, __equal, ==) +CMP_OP(__vec8_f, float, float, __not_equal, !=) +CMP_OP(__vec8_f, float, float, __less_than, <) +CMP_OP(__vec8_f, float, float, __less_equal, <=) +CMP_OP(__vec8_f, float, float, __greater_than, >) +CMP_OP(__vec8_f, float, float, __greater_equal, >=) + +static FORCEINLINE __vec8_i1 __ordered_float(__vec8_f a, __vec8_f b) { + __vec8_i1 ret; + ret.v = 0; + for (int i = 0; i < 8; ++i) + ret.v |= ((a[i] == a[i]) && (b[i] == b[i])) ? (1 << i) : 0; + return ret; +} + +static FORCEINLINE __vec8_i1 __unordered_float(__vec8_f a, __vec8_f b) { + __vec8_i1 ret; + ret.v = 0; + for (int i = 0; i < 8; ++i) + ret.v |= ((a[i] != a[i]) || (b[i] != b[i])) ? 
(1 << i) : 0; + return ret; +} + +#if 0 + case Instruction::FRem: intrinsic = "__frem"; break; +#endif + +SELECT(__vec8_f) +INSERT_EXTRACT(__vec8_f, float) +SMEAR(__vec8_f, float, float) +SETZERO(__vec8_f, float) +UNDEF(__vec8_f, float) +BROADCAST(__vec8_f, float, float) +ROTATE(__vec8_f, float, float) +SHUFFLES(__vec8_f, float, float) +LOAD_STORE(__vec8_f, float) +#else /* evghenii::float */ + +/////////////////////////////////////////////////////////////////////////// +// float +/////////////////////////////////////////////////////////////////////////// + +#define FZERO _mm512_setzero_ps() +static FORCEINLINE __vec8_f __add(__vec8_f a, __vec8_f b) { + return _mm512_mask_add_ps(FZERO, 0xFF, a, b); +} + +static FORCEINLINE __vec8_f __sub(__vec8_f a, __vec8_f b) { + return _mm512_mask_sub_ps(FZERO, 0xFF, a, b); +} + +static FORCEINLINE __vec8_f __mul(__vec8_f a, __vec8_f b) { + return _mm512_mask_mul_ps(FZERO, 0xFF, a, b); +} + +static FORCEINLINE __vec8_f __div(__vec8_f a, __vec8_f b) { + return _mm512_mask_div_ps(FZERO, 0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __equal_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmpeq_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __equal_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmpeq_ps_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmpneq_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmpneq_ps_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __less_than_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmplt_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __less_than_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmplt_ps_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __less_equal_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmple_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __less_equal_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmple_ps_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __greater_than_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmp_ps_mask(0xFF, a, b,_CMP_GT_OS); +} + +static FORCEINLINE __vec8_i1 __greater_than_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmp_ps_mask(m,a, b,_CMP_GT_OS); +} + +static FORCEINLINE __vec8_i1 __greater_equal_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmp_ps_mask(0xFF, a, b,_CMP_GE_OS); +} + +static FORCEINLINE __vec8_i1 __greater_equal_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmp_ps_mask(m,a, b,_CMP_GE_OS); +} + +static FORCEINLINE __vec8_i1 __ordered_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmpord_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __unordered_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmpunord_ps_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_f __select(__vec8_i1 mask, __vec8_f a, __vec8_f b) { + return _mm512_mask_mov_ps(b, mask & 0xFF, a); +} + +static FORCEINLINE __vec8_f __select(bool cond, __vec8_f a, __vec8_f b) { + return cond ? 
a : b; +} + +static FORCEINLINE float __extract_element(__vec8_f v, uint32_t index) { + return v[index]; + // return ((float *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec8_f *v, uint32_t index, float val) { + (*v)[index] = val; +// ((float *)v)[index] = val; +} + +template RetVecType __smear_float(float f); +template <> static FORCEINLINE __vec8_f __smear_float<__vec8_f>(float f) { + return _mm512_set_16to16_ps(0,0,0,0,0,0,0,0, f,f,f,f,f,f,f,f); +} + +template RetVecType __setzero_float(); +template <> static FORCEINLINE __vec8_f __setzero_float<__vec8_f>() { + return _mm512_setzero_ps(); +} + +template RetVecType __undef_float(); +template <> static FORCEINLINE __vec8_f __undef_float<__vec8_f>() { + return __vec8_f(); +} + +static FORCEINLINE __vec8_f __broadcast_float(__vec8_f v, int index) { + float val = __extract_element(v, index & 0x7); + return _mm512_set_16to16_ps(0,0,0,0,0,0,0,0, val,val,val,val,val,val,val,val); +} + +#if 1 +static FORCEINLINE __vec8_f __shuffle_float(__vec8_f v, __vec8_i32 index) { + return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v))); +} +#endif +ROTATE(__vec8_f, float, float) +SHUFFLE2(__vec8_f, float, float) + +#if 0 +LOADS(__vec8_f, float) +#else +template static FORCEINLINE __vec8_f __load(const __vec8_f *p) { + __vec8_f v; + v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + v = _mm512_extloadunpackhi_ps(v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + return __select(0xFF,v,FZERO); +} +#endif + +#if 0 +STORES(__vec8_f, float) +#else +template static FORCEINLINE void __store(__vec8_f *p, __vec8_f v) +{ + _mm512_mask_extpackstorelo_ps( p, 0xFF, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_ps((uint8_t*)p+64, 0xFF, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); +} +#endif + +#endif /* evghenii::float */ + +static FORCEINLINE float __exp_uniform_float(float v) { return expf(v);} +static FORCEINLINE __vec8_f __exp_varying_float(__vec8_f v) { return _mm512_mask_exp_ps(FZERO, 0xFF, v); } + + +static FORCEINLINE float __log_uniform_float(float v) { return logf(v);} +static FORCEINLINE __vec8_f __log_varying_float(__vec8_f v) { return _mm512_mask_log_ps(FZERO, 0xFF, v); } + +static FORCEINLINE float __pow_uniform_float(float a, float b) { return powf(a, b);} +static FORCEINLINE __vec8_f __pow_varying_float(__vec8_f a, __vec8_f b) { return _mm512_mask_pow_ps(FZERO, 0xFF, a,b); } + + +static FORCEINLINE int __intbits(float v) { + union { + float f; + int i; + } u; + u.f = v; + return u.i; +} + +static FORCEINLINE float __floatbits(int v) { + union { + float f; + int i; + } u; + u.i = v; + return u.f; +} + +static FORCEINLINE float __half_to_float_uniform(int16_t h) { + static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift + + int32_t o = ((int32_t)(h & 0x7fff)) << 13; // exponent/mantissa bits + uint32_t exp = shifted_exp & o; // just the exponent + o += (127 - 15) << 23; // exponent adjust + + // handle exponent special cases + if (exp == shifted_exp) // Inf/NaN? + o += (128 - 16) << 23; // extra exp adjust + else if (exp == 0) { // Zero/Denormal? 
+ o += 1 << 23; // extra exp adjust + o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize + } + + o |= ((int32_t)(h & 0x8000)) << 16; // sign bit + return __floatbits(o); +} + + +static FORCEINLINE __vec8_f __half_to_float_varying(__vec8_i16 v) { + __vec8_f ret; + for (int i = 0; i < 8; ++i) + ret[i] = __half_to_float_uniform(v[i]); + return ret; +} + + +static FORCEINLINE int16_t __float_to_half_uniform(float f) { + uint32_t sign_mask = 0x80000000u; + int32_t o; + + int32_t fint = __intbits(f); + int32_t sign = fint & sign_mask; + fint ^= sign; + + int32_t f32infty = 255 << 23; + o = (fint > f32infty) ? 0x7e00 : 0x7c00; + + // (De)normalized number or zero + // update fint unconditionally to save the blending; we don't need it + // anymore for the Inf/NaN case anyway. + const uint32_t round_mask = ~0xfffu; + const int32_t magic = 15 << 23; + const int32_t f16infty = 31 << 23; + + int32_t fint2 = __intbits(__floatbits(fint & round_mask) * __floatbits(magic)) - round_mask; + fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed + + if (fint < f32infty) + o = fint2 >> 13; // Take the bits! + + return (o | (sign >> 16)); +} + + +static FORCEINLINE __vec8_i16 __float_to_half_varying(__vec8_f v) { + __vec8_i16 ret; + for (int i = 0; i < 8; ++i) + ret[i] = __float_to_half_uniform(v[i]); + return ret; +} + + +#if 0 /* evghenii::double */ +/////////////////////////////////////////////////////////////////////////// +// double + +BINARY_OP(__vec8_d, __add, +) +BINARY_OP(__vec8_d, __sub, -) +BINARY_OP(__vec8_d, __mul, *) +BINARY_OP(__vec8_d, __div, /) + +CMP_OP(__vec8_d, double, double, __equal, ==) +CMP_OP(__vec8_d, double, double, __not_equal, !=) +CMP_OP(__vec8_d, double, double, __less_than, <) +CMP_OP(__vec8_d, double, double, __less_equal, <=) +CMP_OP(__vec8_d, double, double, __greater_than, >) +CMP_OP(__vec8_d, double, double, __greater_equal, >=) + +static FORCEINLINE __vec8_i1 __ordered_double(__vec8_d a, __vec8_d b) { + __vec8_i1 ret; + ret.v = 0; + for (int i = 0; i < 8; ++i) + ret.v |= ((a[i] == a[i]) && (b[i] == b[i])) ? (1 << i) : 0; + return ret; +} + +static FORCEINLINE __vec8_i1 __unordered_double(__vec8_d a, __vec8_d b) { + __vec8_i1 ret; + ret.v = 0; + for (int i = 0; i < 8; ++i) + ret.v |= ((a[i] != a[i]) || (b[i] != b[i])) ? 
(1 << i) : 0; + return ret; +} + +#if 0 + case Instruction::FRem: intrinsic = "__frem"; break; +#endif + +SELECT(__vec8_d) +INSERT_EXTRACT(__vec8_d, double) +SMEAR(__vec8_d, double, double) +SETZERO(__vec8_d, double) +UNDEF(__vec8_d, double) +BROADCAST(__vec8_d, double, double) +ROTATE(__vec8_d, double, double) +SHUFFLES(__vec8_d, double, double) +LOAD_STORE(__vec8_d, double) +#else /* evghenii::double */ +/////////////////////////////////////////////////////////////////////////// +// double +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec8_d __add(__vec8_d a, __vec8_d b) { + return _mm512_add_pd(a, b); +} +static FORCEINLINE __vec8_d __sub(__vec8_d a, __vec8_d b) { + return _mm512_sub_pd(a, b); +} +static FORCEINLINE __vec8_d __mul(__vec8_d a, __vec8_d b) { + return _mm512_mul_pd(a, b); +} + +static FORCEINLINE __vec8_d __div(__vec8_d a, __vec8_d b) { + return _mm512_div_pd(a, b); +} + +static FORCEINLINE __vec8_i1 __equal_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpeq_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __equal_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmpeq_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpneq_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmpneq_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __less_than_double(__vec8_d a, __vec8_d b) { + return _mm512_cmplt_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __less_than_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmplt_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __less_equal_double(__vec8_d a, __vec8_d b) { + return _mm512_cmple_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __less_equal_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmple_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __greater_than_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpnle_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __greater_than_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmpnle_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __greater_equal_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpnlt_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __greater_equal_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmpnlt_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __ordered_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpord_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __unordered_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpunord_pd_mask(a, b); +} + +static FORCEINLINE __vec8_d __select(__vec8_i1 mask, __vec8_d a, __vec8_d b) { + return _mm512_mask_mov_pd(b, mask, a); +} + + +static FORCEINLINE __vec8_d __select(bool cond, __vec8_d a, __vec8_d b) { + return cond ? 
a : b; +} + +static FORCEINLINE double __extract_element(__vec8_d v, uint32_t index) { + return ((double *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec8_d *v, uint32_t index, double val) { + ((double *)v)[index] = val; +} + +template RetVecType __smear_double(double d); +template <> static FORCEINLINE __vec8_d __smear_double<__vec8_d>(double d) { return _mm512_set1_pd(d); } + +template RetVecType __setzero_double(); +template <> static FORCEINLINE __vec8_d __setzero_double<__vec8_d>() { return _mm512_setzero_pd(); } + +template RetVecType __undef_double(); +template <> static FORCEINLINE __vec8_d __undef_double<__vec8_d>() { return __vec8_d();} + +static FORCEINLINE __vec8_d __broadcast_double(__vec8_d v, int index) { + double val = __extract_element(v, index & 0xf); + return _mm512_set1_pd(val); +} + +ROTATE(__vec8_d, double, double) +SHUFFLES(__vec8_d, double, double) + +template static FORCEINLINE __vec8_d __load(const __vec8_d *p) { + __vec8_d ret; + ret.v = _mm512_extloadunpacklo_pd(ret.v, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v = _mm512_extloadunpackhi_pd(ret.v, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + return ret; +} + +template static FORCEINLINE void __store(__vec8_d *p, __vec8_d v) { + _mm512_extpackstorelo_pd(p, v.v, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, v.v, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); +} + + +#if 0 +template <> static FORCEINLINE __vec8_d __load<64>(const __vec8_d *p) { + return _mm512_load_pd(p); +} +template <> static FORCEINLINE __vec8_d __load<128>(const __vec8_d *p) { + return __load<64>(p); +} +template <> static FORCEINLINE void __store<64>(__vec8_d *p, __vec8_d v) { + _mm512_store_pd(p, v.v); +} +template <> static FORCEINLINE void __store<128>(__vec8_d *p, __vec8_d v) { + __store<64>(p, v); +} +#endif +#endif /* evghenii::double */ + +/////////////////////////////////////////////////////////////////////////// +// casts + + +#define CAST(TO, STO, FROM, SFROM, FUNC) \ +static FORCEINLINE TO FUNC(TO, FROM val) { \ + TO ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = (STO)((SFROM)(val[i])); \ + return ret; \ +} + +// sign extension conversions +CAST(__vec8_i64, int64_t, __vec8_i32, int32_t, __cast_sext) +CAST(__vec8_i64, int64_t, __vec8_i16, int16_t, __cast_sext) +CAST(__vec8_i64, int64_t, __vec8_i8, int8_t, __cast_sext) +CAST(__vec8_i32, int32_t, __vec8_i16, int16_t, __cast_sext) +CAST(__vec8_i32, int32_t, __vec8_i8, int8_t, __cast_sext) +CAST(__vec8_i16, int16_t, __vec8_i8, int8_t, __cast_sext) + +#define CAST_SEXT_I1(TYPE) \ +static FORCEINLINE TYPE __cast_sext(TYPE, __vec8_i1 v) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) { \ + ret[i] = 0; \ + if (v.v & (1 << i)) \ + ret[i] = ~ret[i]; \ + } \ + return ret; \ +} + +CAST_SEXT_I1(__vec8_i8) +CAST_SEXT_I1(__vec8_i16) +#if 0 +CAST_SEXT_I1(__vec8_i32) +#else +static FORCEINLINE __vec8_i32 __cast_sext(const __vec8_i32 &, const __vec8_i1 &val) +{ + __vec8_i32 ret = _mm512_setzero_epi32(); + __vec8_i32 one = _mm512_set1_epi32(-1); + return _mm512_mask_mov_epi32(ret, 0xFF & val, one); +} +#endif +CAST_SEXT_I1(__vec8_i64) + +// zero extension +CAST(__vec8_i64, uint64_t, __vec8_i32, uint32_t, __cast_zext) +CAST(__vec8_i64, uint64_t, __vec8_i16, uint16_t, __cast_zext) +CAST(__vec8_i64, uint64_t, __vec8_i8, uint8_t, __cast_zext) +CAST(__vec8_i32, uint32_t, __vec8_i16, uint16_t, __cast_zext) +CAST(__vec8_i32, uint32_t, __vec8_i8, uint8_t, __cast_zext) +CAST(__vec8_i16, uint16_t, __vec8_i8, uint8_t, __cast_zext) + +#define 
CAST_ZEXT_I1(TYPE) \ +static FORCEINLINE TYPE __cast_zext(TYPE, __vec8_i1 v) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = (v.v & (1 << i)) ? 1 : 0; \ + return ret; \ +} + +CAST_ZEXT_I1(__vec8_i8) +CAST_ZEXT_I1(__vec8_i16) +#if 0 +CAST_ZEXT_I1(__vec8_i32) +#else +static FORCEINLINE __vec8_i32 __cast_zext(const __vec8_i32 &, const __vec8_i1 &val) +{ + __vec8_i32 ret = _mm512_setzero_epi32(); + __vec8_i32 one = _mm512_set1_epi32(1); + return _mm512_mask_mov_epi32(ret, 0xFF & val, one); +} +#endif +CAST_ZEXT_I1(__vec8_i64) + +// truncations +CAST(__vec8_i32, int32_t, __vec8_i64, int64_t, __cast_trunc) +CAST(__vec8_i16, int16_t, __vec8_i64, int64_t, __cast_trunc) +CAST(__vec8_i8, int8_t, __vec8_i64, int64_t, __cast_trunc) +CAST(__vec8_i16, int16_t, __vec8_i32, int32_t, __cast_trunc) +CAST(__vec8_i8, int8_t, __vec8_i32, int32_t, __cast_trunc) +CAST(__vec8_i8, int8_t, __vec8_i16, int16_t, __cast_trunc) + +// signed int to float/double +#if 0 +CAST(__vec8_f, float, __vec8_i8, int8_t, __cast_sitofp) +CAST(__vec8_f, float, __vec8_i16, int16_t, __cast_sitofp) +CAST(__vec8_f, float, __vec8_i32, int32_t, __cast_sitofp) +#else +static FORCEINLINE __vec8_f __cast_sitofp(__vec8_f, __vec8_i8 val) {return _mm512_mask_extload_ps(FZERO, 0xFF, &val, _MM_UPCONV_PS_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec8_f __cast_sitofp(__vec8_f, __vec8_i16 val) {return _mm512_mask_extload_ps(FZERO, 0xFF, &val, _MM_UPCONV_PS_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec8_f __cast_sitofp(__vec8_f, __vec8_i32 val) {return _mm512_mask_cvtfxpnt_round_adjustepi32_ps(FZERO, 0xFF, val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} +#endif +CAST(__vec8_f, float, __vec8_i64, int64_t, __cast_sitofp) +#if 0 +CAST(__vec8_d, double, __vec8_i8, int8_t, __cast_sitofp) +CAST(__vec8_d, double, __vec8_i16, int16_t, __cast_sitofp) +CAST(__vec8_d, double, __vec8_i32, int32_t, __cast_sitofp) +#else +static FORCEINLINE __vec8_d __cast_sitofp(__vec8_d, __vec8_i8 val) { + __vec8_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + return _mm512_cvtepi32lo_pd(vi); +} + +static FORCEINLINE __vec8_d __cast_sitofp(__vec8_d, __vec8_i16 val) { + __vec8_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + return _mm512_cvtepi32lo_pd(vi); +} + +static FORCEINLINE __vec8_d __cast_sitofp(__vec8_d, __vec8_i32 val) { + __vec8_d ret; + return _mm512_cvtepi32lo_pd(val); +} +#endif +CAST(__vec8_d, double, __vec8_i64, int64_t, __cast_sitofp) + +// unsigned int to float/double +#if 0 +CAST(__vec8_f, float, __vec8_i8, uint8_t, __cast_uitofp) +CAST(__vec8_f, float, __vec8_i16, uint16_t, __cast_uitofp) +CAST(__vec8_f, float, __vec8_i32, uint32_t, __cast_uitofp) +#else +static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i8 val) {return _mm512_mask_extload_ps(FZERO, 0xFF, &val, _MM_UPCONV_PS_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i16 val) {return _mm512_mask_extload_ps(FZERO, 0xFF, &val, _MM_UPCONV_PS_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i32 val) {return _mm512_mask_cvtfxpnt_round_adjustepu32_ps(FZERO, 0xFF, val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} +#endif +CAST(__vec8_f, float, __vec8_i64, uint64_t, __cast_uitofp) +#if 0 +CAST(__vec8_d, double, __vec8_i8, uint8_t, __cast_uitofp) +CAST(__vec8_d, double, __vec8_i16, uint16_t, __cast_uitofp) +CAST(__vec8_d, double, 
__vec8_i32, uint32_t, __cast_uitofp) +#else +static FORCEINLINE __vec8_d __cast_uitofp(__vec8_d, __vec8_i8 val) { + __vec8_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + return _mm512_cvtepu32lo_pd(vi); +} + +static FORCEINLINE __vec8_d __cast_uitofp(__vec8_d, __vec8_i16 val) { + __vec8_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + return _mm512_cvtepu32lo_pd(vi); +} + +static FORCEINLINE __vec8_d __cast_uitofp(__vec8_d, __vec8_i32 val) { + __vec8_d ret; + return _mm512_cvtepu32lo_pd(val); +} +#endif +CAST(__vec8_d, double, __vec8_i64, uint64_t, __cast_uitofp) + +#if 0 +static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i1 v) { + __vec8_f ret; + for (int i = 0; i < 8; ++i) + ret[i] = (v.v & (1 << i)) ? 1. : 0.; + return ret; +} +#else +static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i1 v) +{ + const __m512 ret = _mm512_setzero_ps(); + const __m512 one = _mm512_set1_ps(1.0); + return _mm512_mask_mov_ps(ret, v & 0xFF, one); +} +#endif + +// float/double to signed int +CAST(__vec8_i8, int8_t, __vec8_f, float, __cast_fptosi) +CAST(__vec8_i16, int16_t, __vec8_f, float, __cast_fptosi) +#if 0 +CAST(__vec8_i32, int32_t, __vec8_f, float, __cast_fptosi) +#else +static FORCEINLINE __vec8_i32 __cast_fptosi(__vec8_i32, __vec8_f val) { + return _mm512_mask_cvtfxpnt_round_adjustps_epi32(IZERO, 0xFF, val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); +} +#endif +CAST(__vec8_i64, int64_t, __vec8_f, float, __cast_fptosi) +CAST(__vec8_i8, int8_t, __vec8_d, double, __cast_fptosi) +CAST(__vec8_i16, int16_t, __vec8_d, double, __cast_fptosi) +#if 1 +CAST(__vec8_i32, int32_t, __vec8_d, double, __cast_fptosi) +#else +#endif +CAST(__vec8_i64, int64_t, __vec8_d, double, __cast_fptosi) + +// float/double to unsigned int +CAST(__vec8_i8, uint8_t, __vec8_f, float, __cast_fptoui) +CAST(__vec8_i16, uint16_t, __vec8_f, float, __cast_fptoui) +#if 0 +CAST(__vec8_i32, uint32_t, __vec8_f, float, __cast_fptoui) +#else +static FORCEINLINE __vec8_i32 __cast_fptoui(__vec8_i32, __vec8_f val) { + return _mm512_mask_cvtfxpnt_round_adjustps_epu32(IZERO, 0xFF, val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); +} +#endif +CAST(__vec8_i64, uint64_t, __vec8_f, float, __cast_fptoui) +CAST(__vec8_i8, uint8_t, __vec8_d, double, __cast_fptoui) +CAST(__vec8_i16, uint16_t, __vec8_d, double, __cast_fptoui) +#if 1 +CAST(__vec8_i32, uint32_t, __vec8_d, double, __cast_fptoui) +#else +#endif +CAST(__vec8_i64, uint64_t, __vec8_d, double, __cast_fptoui) + +// float/double conversions +#if 0 +CAST(__vec8_f, float, __vec8_d, double, __cast_fptrunc) +CAST(__vec8_d, double, __vec8_f, float, __cast_fpext) +#else +static FORCEINLINE __vec8_f __cast_fptrunc(__vec8_f, __vec8_d val) { + return _mm512_mask_cvtpd_pslo(FZERO, 0xFF, val); +} +static FORCEINLINE __vec8_d __cast_fpext(__vec8_d, __vec8_f val) { + return _mm512_cvtpslo_pd(val); +} +#endif + +typedef union { + int32_t i32; + float f; + int64_t i64; + double d; +} BitcastUnion; + +#define CAST_BITS(TO, TO_ELT, FROM, FROM_ELT) \ +static FORCEINLINE TO __cast_bits(TO, FROM val) { \ + TO r; \ + for (int i = 0; i < 8; ++i) { \ + BitcastUnion u; \ + u.FROM_ELT = val[i]; \ + r[i] = u.TO_ELT; \ + } \ + return r; \ +} + +#if 0 +CAST_BITS(__vec8_f, f, __vec8_i32, i32) +CAST_BITS(__vec8_i32, i32, __vec8_f, f) +#else +static FORCEINLINE __vec8_f __cast_bits(__vec8_f, __vec8_i32 val) { + return _mm512_castsi512_ps(val); +} +static FORCEINLINE __vec8_i32 __cast_bits(__vec8_i32, __vec8_f val) 
{ + return _mm512_castps_si512(val); +} +#endif + +#if 0 +CAST_BITS(__vec8_d, d, __vec8_i64, i64) +CAST_BITS(__vec8_i64, i64, __vec8_d, d) +#else +static FORCEINLINE __vec8_i64 __cast_bits(__vec8_i64, __vec8_d val) { + return *(__vec8_i64*)&val; +} +static FORCEINLINE __vec8_d __cast_bits(__vec8_d, __vec8_i64 val) { + return *(__vec8_d*)&val; +} +#endif + +#define CAST_BITS_SCALAR(TO, FROM) \ +static FORCEINLINE TO __cast_bits(TO, FROM v) { \ + union { \ + TO to; \ + FROM from; \ + } u; \ + u.from = v; \ + return u.to; \ +} + +CAST_BITS_SCALAR(uint32_t, float) +CAST_BITS_SCALAR(int32_t, float) +CAST_BITS_SCALAR(float, uint32_t) +CAST_BITS_SCALAR(float, int32_t) +CAST_BITS_SCALAR(uint64_t, double) +CAST_BITS_SCALAR(int64_t, double) +CAST_BITS_SCALAR(double, uint64_t) +CAST_BITS_SCALAR(double, int64_t) + +/////////////////////////////////////////////////////////////////////////// +// various math functions + +static FORCEINLINE void __fastmath() { +} + +static FORCEINLINE float __round_uniform_float(float v) { + return roundf(v); +} + +static FORCEINLINE float __floor_uniform_float(float v) { + return floorf(v); +} + +static FORCEINLINE float __ceil_uniform_float(float v) { + return ceilf(v); +} + +static FORCEINLINE double __round_uniform_double(double v) { + return round(v); +} + +static FORCEINLINE double __floor_uniform_double(double v) { + return floor(v); +} + +static FORCEINLINE double __ceil_uniform_double(double v) { + return ceil(v); +} + +#if 0 +UNARY_OP(__vec8_f, __round_varying_float, roundf) +UNARY_OP(__vec8_f, __floor_varying_float, floorf) +UNARY_OP(__vec8_f, __ceil_varying_float, ceilf) +#else +static FORCEINLINE __vec8_f __round_varying_float(__vec8_f v) { + return _mm512_mask_round_ps(FZERO, 0xFF, v, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE); +} + +static FORCEINLINE __vec8_f __floor_varying_float(__vec8_f v) { + return _mm512_mask_floor_ps(FZERO, 0xFF, v); +} + +static FORCEINLINE __vec8_f __ceil_varying_float(__vec8_f v) { + return _mm512_mask_ceil_ps(FZERO, 0xFF, v); +} +#endif + +#if 0 +UNARY_OP(__vec8_d, __round_varying_double, round) +UNARY_OP(__vec8_d, __floor_varying_double, floor) +UNARY_OP(__vec8_d, __ceil_varying_double, ceil) +#else +static FORCEINLINE __vec8_d __round_varying_float(__vec8_d v) { + return _mm512_svml_round_pd(v); +} + +static FORCEINLINE __vec8_d __floor_varying_float(__vec8_d v) { + return _mm512_floor_pd(v); +} + +static FORCEINLINE __vec8_d __ceil_varying_float(__vec8_d v) { + return _mm512_ceil_pd(v); +} +#endif + + +// min/max + +static FORCEINLINE float __min_uniform_float(float a, float b) { return (ab) ? a : b; } +static FORCEINLINE double __min_uniform_double(double a, double b) { return (ab) ? a : b; } + +static FORCEINLINE int32_t __min_uniform_int32(int32_t a, int32_t b) { return (ab) ? a : b; } +static FORCEINLINE int32_t __min_uniform_uint32(uint32_t a, uint32_t b) { return (ab) ? a : b; } + +static FORCEINLINE int64_t __min_uniform_int64(int64_t a, int64_t b) { return (ab) ? a : b; } +static FORCEINLINE int64_t __min_uniform_uint64(uint64_t a, uint64_t b) { return (ab) ? 
a : b; } + + +#if 0 +BINARY_OP_FUNC(__vec8_f, __max_varying_float, __max_uniform_float) +BINARY_OP_FUNC(__vec8_f, __min_varying_float, __min_uniform_float) +#else +static FORCEINLINE __vec8_f __max_varying_float (__vec8_f v1, __vec8_f v2) { return _mm512_mask_gmax_ps(FZERO, 0xFF, v1, v2);} +static FORCEINLINE __vec8_f __min_varying_float (__vec8_f v1, __vec8_f v2) { return _mm512_mask_gmin_ps(FZERO, 0xFF, v1, v2);} +#endif + +#if 0 +BINARY_OP_FUNC(__vec8_d, __max_varying_double, __max_uniform_double) +BINARY_OP_FUNC(__vec8_d, __min_varying_double, __min_uniform_double) +#else +static FORCEINLINE __vec8_d __max_varying_double(__vec8_d v1, __vec8_d v2) { return _mm512_gmax_pd(v1,v2); } +static FORCEINLINE __vec8_d __min_varying_double(__vec8_d v1, __vec8_d v2) { return _mm512_gmin_pd(v1,v2); } +#endif + +#if 0 +BINARY_OP_FUNC(__vec8_i32, __max_varying_int32, __max_uniform_int32) +BINARY_OP_FUNC(__vec8_i32, __min_varying_int32, __min_uniform_int32) +BINARY_OP_FUNC(__vec8_i32, __max_varying_uint32, __max_uniform_uint32) +BINARY_OP_FUNC(__vec8_i32, __min_varying_uint32, __min_uniform_uint32) +#else +static FORCEINLINE __vec8_i32 __max_varying_int32 (__vec8_i32 v1, __vec8_i32 v2) { return _mm512_mask_max_epi32(IZERO,0xFF, v1, v2);} +static FORCEINLINE __vec8_i32 __min_varying_int32 (__vec8_i32 v1, __vec8_i32 v2) { return _mm512_mask_min_epi32(IZERO,0xFF, v1, v2);} +static FORCEINLINE __vec8_i32 __max_varying_uint32(__vec8_i32 v1, __vec8_i32 v2) { return _mm512_mask_max_epu32(IZERO,0xFF, v1, v2);} +static FORCEINLINE __vec8_i32 __min_varying_uint32(__vec8_i32 v1, __vec8_i32 v2) { return _mm512_mask_min_epu32(IZERO,0xFF, v1, v2);} +#endif + +BINARY_OP_FUNC(__vec8_i64, __max_varying_int64, __max_uniform_int64) +BINARY_OP_FUNC(__vec8_i64, __min_varying_int64, __min_uniform_int64) +BINARY_OP_FUNC(__vec8_i64, __max_varying_uint64, __max_uniform_uint64) +BINARY_OP_FUNC(__vec8_i64, __min_varying_uint64, __min_uniform_uint64) + +// sqrt/rsqrt/rcp + +static FORCEINLINE float __rsqrt_uniform_float(float v) { + return 1.f / sqrtf(v); +} + +static FORCEINLINE float __rcp_uniform_float(float v) { + return 1.f / v; +} + +static FORCEINLINE float __sqrt_uniform_float(float v) { + return sqrtf(v); +} + +static FORCEINLINE double __sqrt_uniform_double(double v) { + return sqrt(v); +} + +#if 0 +UNARY_OP(__vec8_f, __rcp_varying_float, __rcp_uniform_float) +UNARY_OP(__vec8_f, __rsqrt_varying_float, __rsqrt_uniform_float) +UNARY_OP(__vec8_f, __sqrt_varying_float, __sqrt_uniform_float) +#else +static FORCEINLINE __vec8_f __rcp_varying_float(__vec8_f v) { +#ifdef ISPC_FAST_MATH + return _mm512_mask_rcp23_ps(FZERO, 0xFF, v); // Approximation with 23 bits of accuracy. 
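+  // Without ISPC_FAST_MATH, the #else branch below uses _mm512_mask_recip_ps()
+  // instead, favoring precision over the faster 23-bit approximation above.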
+#else + return _mm512_mask_recip_ps(FZERO, 0xFF, v); +#endif +} + +static FORCEINLINE __vec8_f __rsqrt_varying_float(__vec8_f v) { +#ifdef ISPC_FAST_MATH + return _mm512_mask_rsqrt23_ps(FZERO,0xFF,v); // Approximation with 0.775ULP accuracy +#else + return _mm512_mask_invsqrt_ps(FZERO,0xFF,v); +#endif +} +static FORCEINLINE __vec8_f __sqrt_varying_float (__vec8_f v) { return _mm512_mask_sqrt_ps(FZERO,0xFF,v);} +#endif + +#if 0 +UNARY_OP(__vec8_d, __sqrt_varying_double, __sqrt_uniform_double) +#else +static FORCEINLINE __vec8_d __sqrt_varying_double(__vec8_d v) { return _mm512_sqrt_pd(v); } +#endif + +/////////////////////////////////////////////////////////////////////////// +// svml +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec8_f __svml_logf(__vec8_f v) { return _mm512_mask_log_ps(FZERO,0xFF,v); } +static FORCEINLINE __vec8_f __svml_expf(__vec8_f v) { return _mm512_mask_exp_ps(FZERO,0xFF,v); } +static FORCEINLINE __vec8_f __svml_cosf(__vec8_f v) { return _mm512_mask_cos_ps(FZERO,0xFF,v); } +static FORCEINLINE __vec8_f __svml_powf(__vec8_f a, __vec8_f b) { return _mm512_mask_pow_ps(FZERO,0xFF,a,b); } + +static FORCEINLINE __vec8_d __svml_logd(__vec8_d v) { return _mm512_log_pd(v); } +static FORCEINLINE __vec8_d __svml_expd(__vec8_d v) { return _mm512_exp_pd(v); } +static FORCEINLINE __vec8_d __svml_cosd(__vec8_d v) { return _mm512_cos_pd(v); } +static FORCEINLINE __vec8_d __svml_powd(__vec8_d a, __vec8_d b) { return _mm512_pow_pd(a,b); } + +/////////////////////////////////////////////////////////////////////////// +// bit ops + +static FORCEINLINE int32_t __popcnt_int32(uint32_t v) { + int count = 0; + for (; v != 0; v >>= 1) + count += (v & 1); + return count; +} + +static FORCEINLINE int32_t __popcnt_int64(uint64_t v) { + int count = 0; + for (; v != 0; v >>= 1) + count += (v & 1); + return count; +} + +static FORCEINLINE int32_t __count_trailing_zeros_i32(uint32_t v) { + if (v == 0) + return 32; + + int count = 0; + while ((v & 1) == 0) { + ++count; + v >>= 1; + } + return count; +} + +static FORCEINLINE int64_t __count_trailing_zeros_i64(uint64_t v) { + if (v == 0) + return 64; + + int count = 0; + while ((v & 1) == 0) { + ++count; + v >>= 1; + } + return count; +} + +static FORCEINLINE int32_t __count_leading_zeros_i32(uint32_t v) { + if (v == 0) + return 32; + + int count = 0; + while ((v & (1<<31)) == 0) { + ++count; + v <<= 1; + } + return count; +} + +static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { + if (v == 0) + return 64; + + int count = 0; + while ((v & (1ull<<63)) == 0) { + ++count; + v <<= 1; + } + return count; +} + +/////////////////////////////////////////////////////////////////////////// +// reductions + +#if 0 +REDUCE_ADD(float, __vec8_f, __reduce_add_float) +REDUCE_MINMAX(float, __vec8_f, __reduce_min_float, <) +REDUCE_MINMAX(float, __vec8_f, __reduce_max_float, >) +#else +static FORCEINLINE float __reduce_add_float(__vec8_f v) { return _mm512_mask_reduce_add_ps(0xFF,v); } +static FORCEINLINE float __reduce_min_float(__vec8_f v) { return _mm512_mask_reduce_min_ps(0xFF,v); } +static FORCEINLINE float __reduce_max_float(__vec8_f v) { return _mm512_mask_reduce_max_ps(0xFF,v); } +#endif + +#if 0 +REDUCE_ADD(double, __vec8_d, __reduce_add_double) +REDUCE_MINMAX(double, __vec8_d, __reduce_min_double, <) +REDUCE_MINMAX(double, __vec8_d, __reduce_max_double, >) +#else +static FORCEINLINE float __reduce_add_double(__vec8_d v) { return _mm512_reduce_add_pd(v); } +static FORCEINLINE float 
__reduce_min_double(__vec8_d v) { return _mm512_reduce_min_pd(v); } +static FORCEINLINE float __reduce_max_double(__vec8_d v) { return _mm512_reduce_max_pd(v); } +#endif + + + +#if 0 +REDUCE_ADD (int64_t, __vec8_i32, __reduce_add_int32) +REDUCE_MINMAX(int32_t, __vec8_i32, __reduce_min_int32, <) +REDUCE_MINMAX(int32_t, __vec8_i32, __reduce_max_int32, >) +REDUCE_MINMAX(uint32_t, __vec8_i32, __reduce_min_uint32, <) +REDUCE_MINMAX(uint32_t, __vec8_i32, __reduce_max_uint32, >) +#else +static FORCEINLINE int64_t __reduce_add_int32 (__vec8_i32 v) { return _mm512_mask_reduce_add_epi32(0xFF, v);} +static FORCEINLINE int32_t __reduce_min_int32 (__vec8_i32 v) { return _mm512_mask_reduce_min_epi32(0xFF, v);} +static FORCEINLINE int32_t __reduce_max_int32 (__vec8_i32 v) { return _mm512_mask_reduce_max_epi32(0xFF, v);} +static FORCEINLINE uint32_t __reduce_min_uint32 (__vec8_i32 v) { return _mm512_mask_reduce_min_epu32(0xFF, v);} +static FORCEINLINE uint32_t __reduce_max_uint32 (__vec8_i32 v) { return _mm512_mask_reduce_max_epu32(0xFF, v);} +#endif + +REDUCE_ADD ( int16_t, __vec8_i8, __reduce_add_int8) +REDUCE_ADD ( int32_t, __vec8_i16, __reduce_add_int16) +REDUCE_ADD ( int64_t, __vec8_i64, __reduce_add_int64) +REDUCE_MINMAX( int64_t, __vec8_i64, __reduce_min_int64, <) +REDUCE_MINMAX( int64_t, __vec8_i64, __reduce_max_int64, >) +REDUCE_MINMAX(uint64_t, __vec8_i64, __reduce_min_uint64, <) +REDUCE_MINMAX(uint64_t, __vec8_i64, __reduce_max_uint64, >) + +/////////////////////////////////////////////////////////////////////////// +// masked load/store + +static FORCEINLINE __vec8_i8 __masked_load_i8(void *p, + __vec8_i1 mask) { + __vec8_i8 ret; + int8_t *ptr = (int8_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +static FORCEINLINE __vec8_i16 __masked_load_i16(void *p, + __vec8_i1 mask) { + __vec8_i16 ret; + int16_t *ptr = (int16_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +#if 0 +static FORCEINLINE __vec8_i32 __masked_load_i32(void *p, + __vec8_i1 mask) { + __vec8_i32 ret; + int32_t *ptr = (int32_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} +#else +static FORCEINLINE __vec8_i32 __masked_load_i32(void *p, __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return _mm512_mask_load_epi32(__vec8_i32(), mask, p); +#else + __vec8_i32 tmp; + tmp = _mm512_mask_extloadunpacklo_epi32(tmp, 0xFF, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp = _mm512_mask_extloadunpackhi_epi32(tmp, 0xFF, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __vec8_i32 ret; + return _mm512_mask_mov_epi32(ret, 0xFF & mask, tmp); +#endif +} +#endif + +#if 0 +static FORCEINLINE __vec8_f __masked_load_float(void *p, + __vec8_i1 mask) { + __vec8_f ret; + float *ptr = (float *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} +#else +static FORCEINLINE __vec8_f __masked_load_float(void *p, __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return _mm512_mask_load_ps(_mm512_undefined_ps(), mask,p); +#else + __vec8_f tmp; + tmp = _mm512_mask_extloadunpacklo_ps(tmp, 0xFF, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp = _mm512_mask_extloadunpackhi_ps(tmp, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + __vec8_f ret; + return _mm512_mask_mov_ps(ret, 0xFF & mask, tmp); +#endif +} +#endif + +static FORCEINLINE __vec8_i64 __masked_load_i64(void *p, + __vec8_i1 mask) { + __vec8_i64 ret; + int64_t *ptr 
= (int64_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +#if 0 +static FORCEINLINE __vec8_d __masked_load_double(void *p, + __vec8_i1 mask) { + __vec8_d ret; + double *ptr = (double *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} +#else +static FORCEINLINE __vec8_d __masked_load_double(void *p, __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + __vec8_d ret = FZERO; + ret = _mm512_mask_load_pd(ret, 0xFF & mask, p); + return ret; +#else + __vec8_d tmp = FZERO; + tmp.v = _mm512_mask_extloadunpacklo_pd(tmp.v, 0xFF, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_extloadunpackhi_pd(tmp.v, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + __vec8_d ret = FZERO; + ret.v = _mm512_mask_mov_pd(ret.v, mask, tmp.v); + return ret; +#endif +} +#endif + + +static FORCEINLINE void __masked_store_i8(void *p, __vec8_i8 val, + __vec8_i1 mask) { + int8_t *ptr = (int8_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +static FORCEINLINE void __masked_store_i16(void *p, __vec8_i16 val, + __vec8_i1 mask) { + int16_t *ptr = (int16_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +#if 0 +static FORCEINLINE void __masked_store_i32(void *p, __vec8_i32 val, + __vec8_i1 mask) { + int32_t *ptr = (int32_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} +#else +static FORCEINLINE void __masked_store_i32(void *p, __vec8_i32 val, __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_mask_store_epi32(p, mask, val.v); +#else + __vec8_i32 tmp; + tmp = _mm512_extloadunpacklo_epi32(tmp, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp = _mm512_extloadunpackhi_epi32(tmp, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp = _mm512_mask_mov_epi32(tmp, 0xFF & mask, val); + _mm512_mask_extpackstorelo_epi32( p, 0xFF, tmp, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF, tmp, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +#endif +} +#endif + +#if 0 +static FORCEINLINE void __masked_store_float(void *p, __vec8_f val, + __vec8_i1 mask) { + float *ptr = (float *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} +#else +static FORCEINLINE void __masked_store_float(void *p, __vec8_f val, + __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_mask_store_ps(p, 0xFF & mask, val.v); +#else + __vec8_f tmp = FZERO; + tmp = _mm512_extloadunpacklo_ps(tmp, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp = _mm512_extloadunpackhi_ps(tmp, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp = _mm512_mask_mov_ps(tmp, 0xFF & mask, val); + _mm512_mask_extpackstorelo_ps( p, 0xFF, tmp, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_ps((uint8_t*)p+64, 0xFF, tmp, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); +#endif +} +#endif + +static FORCEINLINE void __masked_store_i64(void *p, __vec8_i64 val, + __vec8_i1 mask) { + int64_t *ptr = (int64_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +#if 0 +static FORCEINLINE void __masked_store_double(void *p, __vec8_d val, + __vec8_i1 mask) { + double *ptr = (double *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} +#else +static FORCEINLINE void __masked_store_double(void *p, __vec8_d val, + __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + 
_mm512_mask_store_pd(p, mask, val.v); +#else + __vec8_d tmp; + tmp.v = _mm512_extloadunpacklo_pd(tmp.v, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v = _mm512_extloadunpackhi_pd(tmp.v, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_mov_pd(tmp.v, mask, val.v); + _mm512_extpackstorelo_pd(p, tmp.v, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); +#endif +} +#endif + +static FORCEINLINE void __masked_store_blend_i8(void *p, __vec8_i8 val, + __vec8_i1 mask) { + __masked_store_i8(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i16(void *p, __vec8_i16 val, + __vec8_i1 mask) { + __masked_store_i16(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i32(void *p, __vec8_i32 val, + __vec8_i1 mask) { + __masked_store_i32(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_float(void *p, __vec8_f val, + __vec8_i1 mask) { + __masked_store_float(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i64(void *p, __vec8_i64 val, + __vec8_i1 mask) { + __masked_store_i64(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_double(void *p, __vec8_d val, + __vec8_i1 mask) { + __masked_store_double(p, val, mask); +} + +/////////////////////////////////////////////////////////////////////////// +// gather/scatter + +// offsets * offsetScale is in bytes (for all of these) + +#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, __vec8_i1 mask) { \ + VTYPE ret; \ + int8_t *base = (int8_t *)b; \ + for (int i = 0; i < 8; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset[i]); \ + ret[i] = *ptr; \ + } \ + return ret; \ +} + + +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec8_i8, int8_t, __vec8_i32, __gather_base_offsets32_i8) +#else +static FORCEINLINE __vec8_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t scale, __vec8_i32 offsets, __vec8_i1 mask) +{ + // (iw): need to temporarily store as int because gathers can only return ints. 
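+  // The gather up-converts each byte to a sign-extended 32-bit lane
+  // (_MM_UPCONV_EPI32_SINT8); lanes that are off in the mask keep the
+  // undefined pass-through contents.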
+ __vec8_i32 tmp = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), 0xFF & mask, offsets, base, + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + // now, downconverting to chars into temporary char vector + __vec8_i8 ret; + _mm512_mask_extstore_epi32(ret.data,0xFF,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; +} +#endif +GATHER_BASE_OFFSETS(__vec8_i8, int8_t, __vec8_i64, __gather_base_offsets64_i8) +/****************/ +GATHER_BASE_OFFSETS(__vec8_i16, int16_t, __vec8_i32, __gather_base_offsets32_i16) +GATHER_BASE_OFFSETS(__vec8_i16, int16_t, __vec8_i64, __gather_base_offsets64_i16) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec8_i32, int32_t, __vec8_i32, __gather_base_offsets32_i32) +#else +static FORCEINLINE __vec8_i32 __gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec8_i32 offsets, __vec8_i1 mask) +{ + return _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), 0xFF & mask, offsets, + base, _MM_UPCONV_EPI32_NONE, scale, + _MM_HINT_NONE); +} +#endif +GATHER_BASE_OFFSETS(__vec8_i32, int32_t, __vec8_i64, __gather_base_offsets64_i32) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec8_f, float, __vec8_i32, __gather_base_offsets32_float) +#else +static FORCEINLINE __vec8_f __gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec8_i32 offsets, __vec8_i1 mask) +{ + return _mm512_mask_i32extgather_ps(_mm512_undefined_ps(), 0xFF & mask, offsets, + base, _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); +} +#endif +GATHER_BASE_OFFSETS(__vec8_f, float, __vec8_i64, __gather_base_offsets64_float) +/****************/ +GATHER_BASE_OFFSETS(__vec8_i64, int64_t, __vec8_i32, __gather_base_offsets32_i64) +GATHER_BASE_OFFSETS(__vec8_i64, int64_t, __vec8_i64, __gather_base_offsets64_i64) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec8_d, double, __vec8_i32, __gather_base_offsets32_double) +#else +static FORCEINLINE __vec8_d __gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec8_i32 offsets, __vec8_i1 mask) +{ + __vec8_d ret; + ret.v = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); +#if 0 + __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); + const __mmask8 mask8 = 0x00FF & (mask >> 8); /* evghenii::testme */ + ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask8, shuffled_offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); +#endif + return ret; +} +#endif +GATHER_BASE_OFFSETS(__vec8_d, double, __vec8_i64, __gather_base_offsets64_double) + +#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec8_i1 mask) { \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)ptrs[i]; \ + ret[i] = *ptr; \ + } \ + return ret; \ +} +#define GATHER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ +static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec8_i1 mask) { \ + return FUNC1(0, 1, ptrs, mask); \ +} + + +#if 1 +/***********/ +GATHER_GENERALF(__vec8_i8, int8_t, __vec8_i32, __gather32_i8, __gather_base_offsets32_i8) +GATHER_GENERALF(__vec8_i16, int16_t, __vec8_i32, __gather32_i16, __gather_base_offsets32_i16) +GATHER_GENERALF(__vec8_i32, int32_t, __vec8_i32, __gather32_i32, __gather_base_offsets32_i32) +GATHER_GENERALF(__vec8_i64, int64_t, __vec8_i32, __gather32_i64, __gather_base_offsets32_i64) +GATHER_GENERALF(__vec8_f, float, __vec8_i32, __gather32_float, __gather_base_offsets32_float) +GATHER_GENERALF(__vec8_d, double, 
__vec8_i32, __gather32_double, __gather_base_offsets32_double) +/***********/ +GATHER_GENERAL(__vec8_i8, int8_t, __vec8_i64, __gather64_i8); +GATHER_GENERAL(__vec8_i16, int16_t, __vec8_i64, __gather64_i16); +GATHER_GENERAL(__vec8_i32, int32_t, __vec8_i64, __gather64_i32); +GATHER_GENERAL(__vec8_i64, int64_t, __vec8_i64, __gather64_i64); +GATHER_GENERAL(__vec8_f, float, __vec8_i64, __gather64_float); +GATHER_GENERAL(__vec8_d, double, __vec8_i64, __gather64_double); +/***********/ +#endif + +// scatter + +#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, VTYPE val, \ + __vec8_i1 mask) { \ + int8_t *base = (int8_t *)b; \ + for (int i = 0; i < 8; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset[i]); \ + *ptr = val[i]; \ + } \ +} + + +/*****************/ +SCATTER_BASE_OFFSETS(__vec8_i8, int8_t, __vec8_i32, __scatter_base_offsets32_i8) +SCATTER_BASE_OFFSETS(__vec8_i8, int8_t, __vec8_i64, __scatter_base_offsets64_i8) +/*****************/ +SCATTER_BASE_OFFSETS(__vec8_i16, int16_t, __vec8_i32, __scatter_base_offsets32_i16) +SCATTER_BASE_OFFSETS(__vec8_i16, int16_t, __vec8_i64, __scatter_base_offsets64_i16) +/*****************/ +#if 0 +SCATTER_BASE_OFFSETS(__vec8_i32, int32_t, __vec8_i32, __scatter_base_offsets32_i32) +#else +static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec8_i32 offsets, __vec8_i32 val, __vec8_i1 mask) +{ + _mm512_mask_i32extscatter_epi32(b, 0xFF & mask, offsets, val, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); +} +#endif +SCATTER_BASE_OFFSETS(__vec8_i32, int32_t, __vec8_i64, __scatter_base_offsets64_i32) +/*****************/ +#if 0 +SCATTER_BASE_OFFSETS(__vec8_f, float, __vec8_i32, __scatter_base_offsets32_float) +#else +static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scale, __vec8_i32 offsets, + __vec8_f val, __vec8_i1 mask) +{ + _mm512_mask_i32extscatter_ps(base, 0xFF & mask, offsets, val, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); +} +#endif +SCATTER_BASE_OFFSETS(__vec8_f, float, __vec8_i64, __scatter_base_offsets64_float) +/*****************/ +SCATTER_BASE_OFFSETS(__vec8_i64, int64_t, __vec8_i32, __scatter_base_offsets32_i64) +SCATTER_BASE_OFFSETS(__vec8_i64, int64_t, __vec8_i64, __scatter_base_offsets64_i64) +/*****************/ +#if 0 /* evghenii::to implement */ +SCATTER_BASE_OFFSETS(__vec8_d, double, __vec8_i32, __scatter_base_offsets32_double) +#else /* evghenii:testme */ +static FORCEINLINE void __scatter_base_offsets32_double(void *base, uint32_t scale, __vec8_i32 offsets, + __vec8_d val, __vec8_i1 mask) +{ + _mm512_mask_i32loextscatter_pd(base, mask, offsets, val.v, + _MM_DOWNCONV_PD_NONE, scale, + _MM_HINT_NONE); +} +#endif +SCATTER_BASE_OFFSETS(__vec8_d, double, __vec8_i64, __scatter_base_offsets64_double) + +#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ +static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec8_i1 mask) { \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)ptrs[i]; \ + *ptr = val[i]; \ + } \ +} +#define SCATTER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ +static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec8_i1 mask) { \ + return FUNC1(0, 1, ptrs, val, mask); \ +} + +#if 1 +/***********/ +SCATTER_GENERALF(__vec8_i8, int8_t, __vec8_i32, __scatter32_i8, __scatter_base_offsets32_i8) +SCATTER_GENERALF(__vec8_i16, int16_t, __vec8_i32, __scatter32_i16, 
__scatter_base_offsets32_i16) +SCATTER_GENERALF(__vec8_i32, int32_t, __vec8_i32, __scatter32_i32, __scatter_base_offsets32_i32) +SCATTER_GENERALF(__vec8_i64, int64_t, __vec8_i32, __scatter32_i64, __scatter_base_offsets32_i64) +SCATTER_GENERALF(__vec8_f, float, __vec8_i32, __scatter32_float, __scatter_base_offsets32_float) +SCATTER_GENERALF(__vec8_d, double, __vec8_i32, __scatter32_double, __scatter_base_offsets32_double) +/***********/ +SCATTER_GENERAL(__vec8_i8, int8_t, __vec8_i64, __scatter64_i8) +SCATTER_GENERAL(__vec8_i16, int16_t, __vec8_i64, __scatter64_i16) +SCATTER_GENERAL(__vec8_i32, int32_t, __vec8_i64, __scatter64_i32) +SCATTER_GENERAL(__vec8_f, float, __vec8_i64, __scatter64_float) +SCATTER_GENERAL(__vec8_i64, int64_t, __vec8_i64, __scatter64_i64) +SCATTER_GENERAL(__vec8_d, double, __vec8_i64, __scatter64_double) +/***********/ +#endif + +/////////////////////////////////////////////////////////////////////////// +// packed load/store + +#if 0 +static FORCEINLINE int32_t __packed_load_active(int32_t *ptr, __vec8_i32 *val, + __vec8_i1 mask) { + int count = 0; + for (int i = 0; i < 8; ++i) { + if ((mask.v & (1 << i)) != 0) { + val->operator[](i) = *ptr++; + ++count; + } + } + return count; +} +static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, + __vec8_i32 val, + __vec8_i1 mask) { + int count = 0; + for (int i = 0; i < 8; ++i) { + if ((mask.v & (1 << i)) != 0) { + *ptr++ = val[i]; + ++count; + } + } + return count; +} +static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, + __vec8_i32 *val, + __vec8_i1 mask) { + int count = 0; + for (int i = 0; i < 8; ++i) { + if ((mask.v & (1 << i)) != 0) { + val->operator[](i) = *ptr++; + ++count; + } + } + return count; +} +static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, + __vec8_i32 val, + __vec8_i1 mask) { + int count = 0; + for (int i = 0; i < 8; ++i) { + if ((mask.v & (1 << i)) != 0) { + *ptr++ = val[i]; + ++count; + } + } + return count; +} +#else +static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec8_i32 *val, + __vec8_i1 mask) { + __vec8_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, 0xFF & mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, 0xFF & mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(0xFF & mask)); +} +static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec8_i32 val, + __vec8_i1 mask) { + _mm512_mask_extpackstorelo_epi32(p, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(0xFF & mask)); +} +static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec8_i32 *val, + __vec8_i1 mask) { + __vec8_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, 0xFF & mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, 0xFF & mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(0xFF & mask)); +} +static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec8_i32 val, + __vec8_i1 mask) { + _mm512_mask_extpackstorelo_epi32(p, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(0xFF & mask)); +} +#endif + 
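+// Minimal usage sketch for the packed stores above (illustrative only; it
+// assumes the __extract_element() overload for __vec8_i32 and the .v bitmask
+// member of __vec8_i1 used throughout this header): compact the positive
+// lanes of an i32 vector into consecutive memory.
+#if 0
+static FORCEINLINE int32_t __example_compact_positive(int32_t *dst, __vec8_i32 val) {
+    __vec8_i1 mask;
+    mask.v = 0;
+    // Build an execution mask with a bit set for each positive lane.
+    for (int i = 0; i < 8; ++i)
+        if (__extract_element(val, i) > 0)
+            mask.v |= (1 << i);
+    // Only the active lanes are written, contiguously; the return value is
+    // the number of lanes stored.
+    return __packed_store_active(dst, val, mask);
+}
+#endif
+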
+/////////////////////////////////////////////////////////////////////////// +// aos/soa + +static FORCEINLINE void __soa_to_aos3_float(__vec8_f v0, __vec8_f v1, __vec8_f v2, + float *ptr) { + for (int i = 0; i < 8; ++i) { + *ptr++ = __extract_element(v0, i); + *ptr++ = __extract_element(v1, i); + *ptr++ = __extract_element(v2, i); + } +} + +static FORCEINLINE void __aos_to_soa3_float(float *ptr, __vec8_f *out0, __vec8_f *out1, + __vec8_f *out2) { + for (int i = 0; i < 8; ++i) { + __insert_element(out0, i, *ptr++); + __insert_element(out1, i, *ptr++); + __insert_element(out2, i, *ptr++); + } +} + +static FORCEINLINE void __soa_to_aos4_float(__vec8_f v0, __vec8_f v1, __vec8_f v2, + __vec8_f v3, float *ptr) { + for (int i = 0; i < 8; ++i) { + *ptr++ = __extract_element(v0, i); + *ptr++ = __extract_element(v1, i); + *ptr++ = __extract_element(v2, i); + *ptr++ = __extract_element(v3, i); + } +} + +static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec8_f *out0, __vec8_f *out1, + __vec8_f *out2, __vec8_f *out3) { + for (int i = 0; i < 8; ++i) { + __insert_element(out0, i, *ptr++); + __insert_element(out1, i, *ptr++); + __insert_element(out2, i, *ptr++); + __insert_element(out3, i, *ptr++); + } +} + +/////////////////////////////////////////////////////////////////////////// +// prefetch + +static FORCEINLINE void __prefetch_read_uniform_1(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T0); // prefetch into L1$ +} + +static FORCEINLINE void __prefetch_read_uniform_2(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T1); // prefetch into L2$ +} + +static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *p) { + // There is no L3$ on KNC, don't want to pollute L2$ unecessarily +} + +static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T2); // prefetch into L2$ with non-temporal hint + // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint +} + +/////////////////////////////////////////////////////////////////////////// +// atomics + +static FORCEINLINE uint32_t __atomic_add(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAdd((LONG volatile *)p, v) - v; +#else + return __sync_fetch_and_add(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_sub(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAdd((LONG volatile *)p, -v) + v; +#else + return __sync_fetch_and_sub(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_and(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAnd((LONG volatile *)p, v); +#else + return __sync_fetch_and_and(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_or(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedOr((LONG volatile *)p, v); +#else + return __sync_fetch_and_or(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_xor(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedXor((LONG volatile *)p, v); +#else + return __sync_fetch_and_xor(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_min(uint32_t *p, uint32_t v) { + int32_t old, min; + do { + old = *((volatile int32_t *)p); + min = (old < (int32_t)v) ? old : (int32_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_max(uint32_t *p, uint32_t v) { + int32_t old, max; + do { + old = *((volatile int32_t *)p); + max = (old > (int32_t)v) ? 
old : (int32_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_umin(uint32_t *p, uint32_t v) { + uint32_t old, min; + do { + old = *((volatile uint32_t *)p); + min = (old < v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_umax(uint32_t *p, uint32_t v) { + uint32_t old, max; + do { + old = *((volatile uint32_t *)p); + max = (old > v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_xchg(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedExchange((LONG volatile *)p, v); +#else + return __sync_lock_test_and_set(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_cmpxchg(uint32_t *p, uint32_t cmpval, + uint32_t newval) { +#ifdef _MSC_VER + return InterlockedCompareExchange((LONG volatile *)p, newval, cmpval); +#else + return __sync_val_compare_and_swap(p, cmpval, newval); +#endif +} + +static FORCEINLINE uint64_t __atomic_add(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAdd64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_add(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_sub(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAdd64((LONGLONG volatile *)p, -v) + v; +#else + return __sync_fetch_and_sub(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_and(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAnd64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_and(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_or(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedOr64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_or(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_xor(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedXor64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_xor(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_min(uint64_t *p, uint64_t v) { + int64_t old, min; + do { + old = *((volatile int64_t *)p); + min = (old < (int64_t)v) ? old : (int64_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_max(uint64_t *p, uint64_t v) { + int64_t old, max; + do { + old = *((volatile int64_t *)p); + max = (old > (int64_t)v) ? old : (int64_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_umin(uint64_t *p, uint64_t v) { + uint64_t old, min; + do { + old = *((volatile uint64_t *)p); + min = (old < v) ? 
old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_umax(uint64_t *p, uint64_t v) { + uint64_t old, max; + do { + old = *((volatile uint64_t *)p); + max = (old > v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_xchg(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedExchange64((LONGLONG volatile *)p, v); +#else + return __sync_lock_test_and_set(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, + uint64_t newval) { +#ifdef _MSC_VER + return InterlockedCompareExchange64((LONGLONG volatile *)p, newval, cmpval); +#else + return __sync_val_compare_and_swap(p, cmpval, newval); +#endif +} + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} + +#endif // !WIN32 + +#undef FORCEINLINE +#undef PRE_ALIGN +#undef POST_ALIGN diff --git a/examples/intrinsics/knc-i1x8unsafe_fast.h b/examples/intrinsics/knc-i1x8unsafe_fast.h new file mode 100644 index 00000000..55d97566 --- /dev/null +++ b/examples/intrinsics/knc-i1x8unsafe_fast.h @@ -0,0 +1,2 @@ +#define __ZMM32BIT__ +#include "knc-i1x8.h" diff --git a/run_tests.py b/run_tests.py index 9729930f..2cca983e 100755 --- a/run_tests.py +++ b/run_tests.py @@ -362,10 +362,13 @@ def run_test(testname): gcc_isa="" if options.target == 'generic-4': gcc_isa = '-msse4.2' - if options.target == 'generic-8': + if (options.target == 'generic-8'): + if (options.include_file.find("knc-i1x8.h")!=-1 or options.include_file.find("knc-i1x8unsafe_fast.h")!=-1): + gcc_isa = '-mmic' + else: gcc_isa = '-mavx' if (options.target == 'generic-16' or options.target == 'generic-32' or options.target == 'generic-64') \ - and (options.include_file.find("knc.h")!=-1 or options.include_file.find("knc2x.h")!=-1): + and (options.include_file.find("knc-i1x16.h")!=-1 or options.include_file.find("knc.h")!=-1 or options.include_file.find("knc2x.h")!=-1): gcc_isa = '-mmic' cc_cmd = "%s -O2 -I. %s %s test_static.cpp -DTEST_SIG=%d %s -o %s" % \ From 4b1a0b4bc46f6a4503c1ebec8cbfa7b74ffc78a3 Mon Sep 17 00:00:00 2001 From: evghenii Date: Wed, 18 Sep 2013 18:41:22 +0300 Subject: [PATCH 078/124] added fails --- examples/intrinsics/knc-i1x8unsafe_fast.h | 67 +++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/examples/intrinsics/knc-i1x8unsafe_fast.h b/examples/intrinsics/knc-i1x8unsafe_fast.h index 55d97566..ce66ea11 100644 --- a/examples/intrinsics/knc-i1x8unsafe_fast.h +++ b/examples/intrinsics/knc-i1x8unsafe_fast.h @@ -1,2 +1,69 @@ #define __ZMM32BIT__ #include "knc-i1x8.h" + +/* the following tests fails because vec8_i32 and vec8_float are 512 and not 256 bit in size. + * not sure how it is possible to fix this, any suggestions? 
+33 / 1206 tests FAILED execution: + ./tests/array-gather-simple.ispc + ./tests/array-gather-vary.ispc + ./tests/array-multidim-gather-scatter.ispc + ./tests/array-scatter-vary.ispc + ./tests/atomics-5.ispc + ./tests/atomics-swap.ispc + ./tests/cfor-array-gather-vary.ispc + ./tests/cfor-gs-improve-varying-1.ispc + ./tests/cfor-struct-gather-2.ispc + ./tests/cfor-struct-gather-3.ispc + ./tests/cfor-struct-gather.ispc + ./tests/gather-struct-vector.ispc + ./tests/global-array-4.ispc + ./tests/gs-improve-varying-1.ispc + ./tests/half-1.ispc + ./tests/half-3.ispc + ./tests/half.ispc + ./tests/launch-3.ispc + ./tests/launch-4.ispc + ./tests/masked-scatter-vector.ispc + ./tests/masked-struct-scatter-varying.ispc + ./tests/new-delete-6.ispc + ./tests/ptr-24.ispc + ./tests/ptr-25.ispc + ./tests/short-vec-15.ispc + ./tests/struct-gather-2.ispc + ./tests/struct-gather-3.ispc + ./tests/struct-gather.ispc + ./tests/struct-ref-lvalue.ispc + ./tests/struct-test-118.ispc + ./tests/struct-vary-index-expr.ispc + ./tests/typedef-2.ispc + ./tests/vector-varying-scatter.ispc +*/ + +/* knc-i1x8.h has the following fails: +3 / 1206 tests FAILED execution: + ./tests/half-1.ispc + ./tests/half-3.ispc + ./tests/half.ispc +*/ + +/* knc-i1x16.h has the following fails: +5 / 1206 tests FAILED execution: + ./tests/assert-3.ispc + ./tests/half-1.ispc + ./tests/half-3.ispc + ./tests/half.ispc + ./tests/test-141.ispc +*/ + +/* generics-16, from which these knc-i1x*.h are derived, has the following fails: +6 / 1206 tests FAILED execution: + ./tests/func-overload-max.ispc + ./tests/half-1.ispc + ./tests/half-3.ispc + ./tests/half.ispc + ./tests/test-141.ispc + ./tests/test-143.ispc +*/ + + + From e4b1f585952d4748818d01995f24c04d35c4c0b0 Mon Sep 17 00:00:00 2001 From: evghenii Date: Wed, 18 Sep 2013 19:14:41 +0300 Subject: [PATCH 079/124] performance fix.. 
still some issues left with equal_i1 for __vec8_i1 --- examples/intrinsics/knc-i1x16.h | 52 ++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 8b1a2bb9..ebffa4d6 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -480,46 +480,63 @@ INSERT_EXTRACT(__vec1_d, double) /////////////////////////////////////////////////////////////////////////// // mask ops -static FORCEINLINE uint64_t __movmsk(__vec16_i1 mask) { - return (uint64_t)mask.v; +static FORCEINLINE __vec16_i1 __movmsk(__vec16_i1 mask) { + return _mm512_kmov(mask); } static FORCEINLINE bool __any(__vec16_i1 mask) { - return (mask.v!=0); + return !_mm512_kortestz(mask, mask); } static FORCEINLINE bool __all(__vec16_i1 mask) { - return (mask.v==0xFFFF); + return _mm512_kortestc(mask, mask); } static FORCEINLINE bool __none(__vec16_i1 mask) { - return (mask.v==0); + return _mm512_kortestz(mask, mask); } +#if 0 +static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { +#if 0 + return _mm512_kand(a,b); /* this fails some short circut tests */ +#else + return _mm512_knot( _mm512_kandn(a, b)); /* this fails some asin test */ +#endif +} +#else /* passes all the tests */ static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { __vec16_i1 r; r.v = (a.v & b.v) | (~a.v & ~b.v); return r; } +#endif static FORCEINLINE __vec16_i1 __and(__vec16_i1 a, __vec16_i1 b) { - __vec16_i1 r; - r.v = a.v & b.v; - return r; + return _mm512_kand(a, b); } static FORCEINLINE __vec16_i1 __xor(__vec16_i1 a, __vec16_i1 b) { - __vec16_i1 r; - r.v = a.v ^ b.v; - return r; + return _mm512_kxor(a, b); } static FORCEINLINE __vec16_i1 __or(__vec16_i1 a, __vec16_i1 b) { - __vec16_i1 r; - r.v = a.v | b.v; - return r; + return _mm512_kor(a, b); } +#if 0 +static FORCEINLINE __vec16_i1 __not(__vec16_i1 a) { + return _mm512_knot(a); +} + +static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) { + return _mm512_kandn(a, b); +} + +static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { + return _mm512_kandnr(a, b); +} +#else static FORCEINLINE __vec16_i1 __not(__vec16_i1 v) { __vec16_i1 r; r.v = ~v.v; @@ -537,18 +554,19 @@ static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { r.v = a.v & ~b.v; return r; } +#endif static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a, __vec16_i1 b) { - __vec16_i1 r; - r.v = (a.v & mask.v) | (b.v & ~mask.v); - return r; + return ((a & mask) | (b & ~mask)); + //return __or(__and(a, mask), __andnr(b, mask)); } static FORCEINLINE __vec16_i1 __select(bool cond, __vec16_i1 a, __vec16_i1 b) { return cond ? a : b; } + static FORCEINLINE bool __extract_element(__vec16_i1 vec, int index) { return (vec.v & (1 << index)) ? 
true : false; } From 3cf63362a4885056bf72e6daaad7ffc67d7a93dc Mon Sep 17 00:00:00 2001 From: evghenii Date: Wed, 18 Sep 2013 20:03:08 +0300 Subject: [PATCH 080/124] small tuning --- examples/intrinsics/knc-i1x16.h | 39 +++------------------------------ 1 file changed, 3 insertions(+), 36 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index ebffa4d6..b7d3a7f1 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -496,22 +496,9 @@ static FORCEINLINE bool __none(__vec16_i1 mask) { return _mm512_kortestz(mask, mask); } -#if 0 static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { -#if 0 - return _mm512_kand(a,b); /* this fails some short circut tests */ -#else - return _mm512_knot( _mm512_kandn(a, b)); /* this fails some asin test */ -#endif + return _mm512_kxnor(a,b); } -#else /* passes all the tests */ -static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { - __vec16_i1 r; - r.v = (a.v & b.v) | (~a.v & ~b.v); - return r; -} -#endif - static FORCEINLINE __vec16_i1 __and(__vec16_i1 a, __vec16_i1 b) { return _mm512_kand(a, b); } @@ -524,7 +511,6 @@ static FORCEINLINE __vec16_i1 __or(__vec16_i1 a, __vec16_i1 b) { return _mm512_kor(a, b); } -#if 0 static FORCEINLINE __vec16_i1 __not(__vec16_i1 a) { return _mm512_knot(a); } @@ -536,30 +522,11 @@ static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) { static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { return _mm512_kandnr(a, b); } -#else -static FORCEINLINE __vec16_i1 __not(__vec16_i1 v) { - __vec16_i1 r; - r.v = ~v.v; - return r; -} - -static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) { - __vec16_i1 r; - r.v = ~a.v & b.v; - return r; -} - -static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { - __vec16_i1 r; - r.v = a.v & ~b.v; - return r; -} -#endif static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a, __vec16_i1 b) { - return ((a & mask) | (b & ~mask)); - //return __or(__and(a, mask), __andnr(b, mask)); +// return ((a & mask) | (b & ~mask)); + return __or(__and(a, mask), __and_not2(b, mask)); } static FORCEINLINE __vec16_i1 __select(bool cond, __vec16_i1 a, __vec16_i1 b) { From 406e2eb8d0e9eaac0c1923c8a91837882b8f4610 Mon Sep 17 00:00:00 2001 From: egaburov Date: Thu, 19 Sep 2013 09:16:37 +0200 Subject: [PATCH 081/124] fix double precision input to support .123d321 type of input --- lex.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lex.ll b/lex.ll index f1dcaa6f..3655220f 100644 --- a/lex.ll +++ b/lex.ll @@ -345,7 +345,7 @@ INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]* INT_NUMBER_DOTDOTDOT (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*\.\.\. FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?) HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?) 
-FORTRAN_DOUBLE_NUMBER (([0-9]+\.[0-9]*[dD])|([0-9]+\.[0-9]*[dD][-+]?[0-9]+)|([0-9]+[dD][-+]?[0-9]+)) +FORTRAN_DOUBLE_NUMBER (([0-9]+\.[0-9]*[dD])|([0-9]+\.[0-9]*[dD][-+]?[0-9]+)|([0-9]+[dD][-+]?[0-9]+)|(\.[0-9]*[dD][-+]?[0-9]+)) From 00cd90c6b0c31a0d709c368db8b0dc42501577cc Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Tue, 17 Sep 2013 17:30:34 +0400 Subject: [PATCH 082/124] test system --- alloy.py | 600 ++++++++++++++++++ check_env.py | 102 +++ common.py | 120 ++++ examples/noise/Makefile | 2 +- examples/perf.py | 374 ----------- fail_db.txt | 1 + ...ER.patch => 3_3_r183327-AVX2-GATHER.patch} | 0 ...hift.patch => 3_3_r184575-x86-shift.patch} | 0 examples/perf.ini => perf.ini | 24 +- perf.py | 489 ++++++++++++++ run_tests.py | 570 +++++++++++------ 11 files changed, 1711 insertions(+), 571 deletions(-) create mode 100755 alloy.py create mode 100755 check_env.py create mode 100644 common.py delete mode 100755 examples/perf.py create mode 100644 fail_db.txt rename llvm_patches/{r183327-AVX2-GATHER.patch => 3_3_r183327-AVX2-GATHER.patch} (100%) rename llvm_patches/{r184575-x86-shift.patch => 3_3_r184575-x86-shift.patch} (100%) rename examples/perf.ini => perf.ini (84%) create mode 100755 perf.py diff --git a/alloy.py b/alloy.py new file mode 100755 index 00000000..67f534ca --- /dev/null +++ b/alloy.py @@ -0,0 +1,600 @@ +#!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
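+
+# alloy.py: driver for the ispc test system.  It checks out (or untars) and
+# builds the requested LLVM/clang versions, optionally via a two-stage
+# self-build, builds ispc against them, detects which targets the host CPU
+# (or SDE) can run, and then performs the stability/performance validation
+# runs implemented below.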
+ +# // Author: Filippov Ilia + +def attach_mail_file(msg, filename, name): + if os.path.exists(filename): + fp = open(filename, "rb") + to_attach = MIMEBase("application", "octet-stream") + to_attach.set_payload(fp.read()) + encode_base64(to_attach) + to_attach.add_header("Content-Disposition", "attachment", filename=name) + fp.close() + msg.attach(to_attach) + +def setting_paths(llvm, ispc, sde): + if llvm != "": + os.environ["LLVM_HOME"]=llvm + if ispc != "": + os.environ["ISPC_HOME"]=ispc + if sde != "": + os.environ["SDE_HOME"]=sde + +def check_LLVM(which_LLVM): + answer = [] + if which_LLVM[0] == " ": + return answer + p = os.environ["LLVM_HOME"] + for i in range(0,len(which_LLVM)): + if not os.path.exists(p + os.sep + "bin-" + which_LLVM[i] + os.sep + "bin"): + answer.append(which_LLVM[i]) + return answer + +def try_do_LLVM(text, command, from_validation): + if from_validation == True: + text = text + "\n" + print_debug("Trying to " + text, from_validation, alloy_build) + if os.system(command + " >> " + alloy_build + " 2>> " + alloy_build) != 0: + print_debug("ERROR.\n", from_validation, alloy_build) + error("can't " + text, 1) + print_debug("DONE.\n", from_validation, alloy_build) + +def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_validation, force): + print_debug("Building LLVM. Version: " + version_LLVM + ". ", from_validation, alloy_build) + if revision != "": + print_debug("Revision: " + revision + ".\n", from_validation, alloy_build) + else: + print_debug("\n", from_validation, alloy_build) + # Here we understand what and where do we want to build + current_path = os.getcwd() + llvm_home = os.environ["LLVM_HOME"] + os.chdir(llvm_home) + FOLDER_NAME=version_LLVM + if version_LLVM == "head": + SVN_PATH="trunk" + if version_LLVM == "3.3": + SVN_PATH="tags/RELEASE_33/final" + version_LLVM = "3_3" + if version_LLVM == "3.2": + SVN_PATH="tags/RELEASE_32/final" + version_LLVM = "3_2" + if version_LLVM == "3.1": + SVN_PATH="tags/RELEASE_31/final" + version_LLVM = "3_1" + if revision != "": + FOLDER_NAME = FOLDER_NAME + "_" + revision + revision = "-" + revision + if folder == "": + folder = FOLDER_NAME + LLVM_SRC="llvm-" + folder + LLVM_BUILD="build-" + folder + LLVM_BIN="bin-" + folder + if os.path.exists(LLVM_BIN) and not force: + print_debug("You have folder " + LLVM_BIN + ". 
If you want to rebuild use --force\n", False, "") + exit(0) + LLVM_BUILD_selfbuild = LLVM_BUILD + "_temp" + LLVM_BIN_selfbuild = LLVM_BIN + "_temp" + common.remove_if_exists(LLVM_SRC) + common.remove_if_exists(LLVM_BUILD) + common.remove_if_exists(LLVM_BIN) + if selfbuild: + common.remove_if_exists(LLVM_BUILD_selfbuild) + common.remove_if_exists(LLVM_BIN_selfbuild) + MAKE = "gmake" + print_debug("Using folders: " + LLVM_SRC + " " + LLVM_BUILD + " " + LLVM_BIN + " in " + + llvm_home + "\n", from_validation, alloy_build) + # load llvm + if tarball == "": + try_do_LLVM("load LLVM from http://llvm.org/svn/llvm-project/llvm/" + SVN_PATH + " ", + "svn co " + revision + " http://llvm.org/svn/llvm-project/llvm/" + SVN_PATH + " " + LLVM_SRC, + from_validation) + os.chdir(LLVM_SRC + "/tools") + try_do_LLVM("load clang from http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " ", + "svn co " + revision + " http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " clang", + from_validation) + os.chdir("../") + else: + tar = tarball.split(" ") + os.makedirs(LLVM_SRC) + os.chdir(LLVM_SRC) + try_do_LLVM("untar LLVM from " + tar[0] + " ", + "tar -xvzf " + tar[0] + " --strip-components 1", from_validation) + os.chdir("./tools") + os.makedirs("clang") + os.chdir("./clang") + try_do_LLVM("untar clang from " + tar[1] + " ", + "tar -xvzf " + tar[1] + " --strip-components 1", from_validation) + os.chdir("../../") + # paching llvm + patches = glob.glob(os.environ["ISPC_HOME"] + "/llvm_patches/*.*") + for patch in patches: + if version_LLVM in os.path.basename(patch): + try_do_LLVM("patch LLVM with patch" + patch + " ", "patch -p0 < " + patch, from_validation) + os.chdir("../") + # configuring llvm, build first part of selfbuild + os.makedirs(LLVM_BUILD) + os.makedirs(LLVM_BIN) + selfbuild_compiler = "" + if selfbuild: + print_debug("Making selfbuild and use folders " + LLVM_BUILD_selfbuild + " and " + + LLVM_BIN_selfbuild + "\n", from_validation, alloy_build) + os.makedirs(LLVM_BUILD_selfbuild) + os.makedirs(LLVM_BIN_selfbuild) + os.chdir(LLVM_BUILD_selfbuild) + try_do_LLVM("configure release version for selfbuild ", + "../" + LLVM_SRC + "/configure --prefix=" + llvm_home + "/" + + LLVM_BIN_selfbuild + " --enable-optimized", + from_validation) + try_do_LLVM("build release version for selfbuild ", + MAKE + " -j32", from_validation) + try_do_LLVM("install release version for selfbuild ", + MAKE + " install", + from_validation) + os.chdir("../") + selfbuild_compiler = " CC="+llvm_home+ "/" + LLVM_BIN_selfbuild + "/bin/clang" + print_debug("Now we have compiler for selfbuild: " + selfbuild_compiler + "\n", from_validation, alloy_build) + os.chdir(LLVM_BUILD) + if debug == False: + try_do_LLVM("configure release version ", + "../" + LLVM_SRC + "/configure --prefix=" + llvm_home + "/" + + LLVM_BIN + " --enable-optimized" + selfbuild_compiler, + from_validation) + else: + try_do_LLVM("configure debug version ", + "../" + LLVM_SRC + "/configure --prefix=" + llvm_home + "/" + LLVM_BIN + + " --enable-debug-runtime --enable-debug-symbols --enable-keep-symbols" + selfbuild_compiler, + from_validation) + # building llvm + try_do_LLVM("build LLVM ", MAKE + " -j32", from_validation) + try_do_LLVM("install LLVM ", MAKE + " install", from_validation) + os.chdir(current_path) + +def check_targets(): + answer = [] + answer_sde = [] + SSE2 = False; + SSE4 = False; + AVX = False; + AVX11 = False; + AVX2 = False; + cpu = open("/proc/cpuinfo") + f_lines = cpu.readlines() + cpu.close() + # check what native targets do we have + for i in 
range(0,len(f_lines)): + if SSE2 == False and "sse2" in f_lines[i]: + SSE2 = True; + answer = answer + ["sse2-i32x4", "sse2-i32x8"] + if SSE4 == False and "sse4_1" in f_lines[i]: + SSE4 = True; + answer = answer + ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"] + if AVX == False and "avx" in f_lines[i]: + AVX = True; + answer = answer + ["avx1-i32x8", "avx1-i32x16"] + if AVX11 == False and "rdrand" in f_lines[i]: + AVX11 = True; + answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"] + if AVX2 == False and "avx2" in f_lines[i]: + AVX2 = True; + answer = answer + ["avx2-i32x8", "avx2-i32x16"] + answer = answer + ["generic-4", "generic-16", "generic-8", "generic-1", "generic-32", "generic-64"] + # now check what targets we have with the help of SDE + sde_exists = "" + PATH_dir = string.split(os.getenv("PATH"), os.pathsep) + for counter in PATH_dir: + if os.path.exists(counter + os.sep + "sde") and sde_exists == "": + sde_exists = counter + os.sep + "sde" + if os.environ.get("SDE_HOME") != None: + if os.path.exists(os.environ.get("SDE_HOME") + os.sep + "sde"): + sde_exists = os.environ.get("SDE_HOME") + os.sep + "sde" + if sde_exists == "": + error("you haven't got sde neither in SDE_HOME nor in your PATH.\n" + + "To test all platforms please set SDE_HOME to path containing SDE.\n" + + "Please refer to http://www.intel.com/software/sde for SDE download information.", 2) + return [answer, answer_sde] + # here we have SDE + os.system(sde_exists + " -help > " + temp_alloy_file) + cpu = open(temp_alloy_file) + f_lines = cpu.readlines() + cpu.close() + for i in range(0,len(f_lines)): + if SSE4 == False and "wsm" in f_lines[i]: + answer_sde = answer_sde + [["-wsm", "sse4-i32x4"], ["-wsm", "sse4-i32x8"], ["-wsm", "sse4-i16x8"], ["-wsm", "sse4-i8x16"]] + if AVX == False and "snb" in f_lines[i]: + answer_sde = answer_sde + [["-snb", "avx1-i32x8"], ["-snb", "avx1-i32x16"]] + if AVX11 == False and "ivb" in f_lines[i]: + answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["ivb", "avx1.1-i32x16"]] + if AVX2 == False and "hsw" in f_lines[i]: + answer_sde = answer_sde + [["-hsw", "avx2-i32x8"], ["-hsw", "avx2-i32x16"]] + return [answer, answer_sde] + +def build_ispc(version_LLVM): + current_path = os.getcwd() + os.chdir(os.environ["ISPC_HOME"]) + p_temp = os.getenv("PATH") + os.environ["PATH"] = os.environ["LLVM_HOME"] + "/bin-" + version_LLVM + "/bin:" + os.environ["PATH"] + os.system("make clean >> " + alloy_build) + try_do_LLVM("build ISPC with LLVM version " + version_LLVM + " ", "make -j32", True) + os.environ["PATH"] = p_temp + os.chdir(current_path) + +def execute_stability(stability, R, print_version): + stability1 = copy.deepcopy(stability) + temp = run_tests.run_tests(stability1, [], print_version) + for j in range(0,4): + R[j][0] = R[j][0] + temp[j] + for i in range(0,len(temp[j])): + R[j][1].append(temp[4]) + number_of_fails = temp[5] + number_of_new_fails = len(temp[0]) + len(temp[1]) + if number_of_fails == 0: + str_fails = ". No fails" + else: + str_fails = ". 
Fails: " + str(number_of_fails) + if number_of_new_fails == 0: + str_new_fails = ", No new fails.\n" + else: + str_new_fails = ", New fails: " + str(number_of_new_fails) + ".\n" + print_debug(temp[4][1:-3] + str_fails + str_new_fails, False, stability_log) + +def run_special_tests(): + i = 5 + +def validation_run(only, only_targets, reference_branch, notify, update): + current_path = os.getcwd() + os.chdir(os.environ["ISPC_HOME"]) + os.environ["PATH"] = os.environ["ISPC_HOME"] + ":" + os.environ["PATH"] + if options.notify != "": + if os.environ.get("SMTP_ISPC") == None: + error("you have no SMTP_ISPC in your environment for option notify", 1) + common.remove_if_exists(os.environ["ISPC_HOME"] + os.sep + "all_answer.txt") + smtp_server = os.environ["SMTP_ISPC"] + msg = MIMEMultipart() + msg['Subject'] = 'ISPC test system results' + msg['From'] = 'ISPC_test_system' + msg['To'] = options.notify + print_debug("Command: " + ' '.join(sys.argv) + "\n", False, "") + print_debug("Folder: " + os.environ["ISPC_HOME"] + "\n", False, "") + date = datetime.datetime.now() + print_debug("Date: " + date.strftime('%H:%M %d/%m/%Y') + "\n", False, "") + class options_for_drivers: + pass +# *** *** *** +# Stability validation run +# *** *** *** + if ((("stability" in only) == True) or ("performance" in only) == False): + print_debug("\n\nStability validation run\n\n", False, "") + stability = options_for_drivers() +# stability constant options + stability.random = False + stability.ispc_flags = "" + stability.compiler_exe = None + stability.num_jobs = 1024 + stability.verbose = False + stability.time = False + stability.non_interactive = True + stability.update = update + stability.include_file = None + stability.silent = True + stability.in_file = "." + os.sep + f_date + os.sep + "run_tests_log.log" + stability.verify = False +# stability varying options + stability.target = "" + stability.arch = "" + stability.no_opt = False + stability.wrapexe = "" +# prepare parameters of run + common.check_tools(1) + [targets_t, sde_targets_t] = check_targets() + rebuild = True + opts = [] + archs = [] + LLVM = [] + targets = [] + sde_targets = [] +# parsing option only, update parameters of run + if "-O2" in only: + opts.append(False) + if "-O0" in only: + opts.append(True) + if "x86" in only and not ("x86-64" in only): + archs.append("x86") + if "x86-64" in only: + archs.append("x86-64") + if "native" in only: + sde_targets_t = [] + for i in ["3.1", "3.2", "3.3", "head"]: + if i in only: + LLVM.append(i) + if "current" in only: + LLVM = [" "] + rebuild = False + if only_targets != "": + only_targets_t = only_targets.split(" ") + for i in only_targets_t: + err = True + for j in range(0,len(targets_t)): + if i in targets_t[j]: + targets.append(targets_t[j]) + err = False + for j in range(0,len(sde_targets_t)): + if i in sde_targets_t[j][1]: + sde_targets.append(sde_targets_t[j]) + err = False + if err == True: + error("You haven't sde for target " + i, 1) + else: + targets = targets_t[:-4] + sde_targets = sde_targets_t + if "build" in only: + targets = [] + sde_targets = [] + only = only + " stability " +# finish parameters of run, prepare LLVM + if len(opts) == 0: + opts = [False] + if len(archs) == 0: + archs = ["x86", "x86-64"] + if len(LLVM) == 0: + LLVM = ["3.1", "3.2", "3.3", "head"] + gen_archs = ["x86-64"] + need_LLVM = check_LLVM(LLVM) + for i in range(0,len(need_LLVM)): + build_LLVM(need_LLVM[i], "", "", "", False, False, True, False) +# begin validation run for stabitily + 
common.remove_if_exists(stability.in_file) + R = [[[],[]],[[],[]],[[],[]],[[],[]]] + print_debug("\n_________________________STABILITY REPORT_________________________\n", False, stability_log) + for i in range(0,len(LLVM)): + print_version = 2 + if rebuild: + build_ispc(LLVM[i]) + for j in range(0,len(targets)): + stability.target = targets[j] + stability.wrapexe = "" + if "generic" in targets[j]: + arch = gen_archs + else: + arch = archs + for i1 in range(0,len(arch)): + for i2 in range(0,len(opts)): + stability.arch = arch[i1] + stability.no_opt = opts[i2] + execute_stability(stability, R, print_version) + print_version = 0 + for j in range(0,len(sde_targets)): + stability.target = sde_targets[j][1] + stability.wrapexe = os.environ["SDE_HOME"] + "/sde " + sde_targets[j][0] + " -- " + for i1 in range(0,len(archs)): + for i2 in range(0,len(opts)): + stability.arch = archs[i1] + stability.no_opt = opts[i2] + execute_stability(stability, R, print_version) + print_version = 0 +# run special tests like embree +# + run_special_tests() + ttt = ["NEW RUNFAILS: ", "NEW COMPFAILS: ", "NEW PASSES RUNFAILS: ", "NEW PASSES COMPFAILS: "] + for j in range(0,4): + if len(R[j][0]) == 0: + print_debug("NO " + ttt[j][:-2] + "\n", False, stability_log) + else: + print_debug(ttt[j] + str(len(R[j][0])) + "\n", False, stability_log) + temp5 = [[],[]] + for i in range(0,len(R[j][0])): + er = True + for k in range(0,len(temp5[0])): + if R[j][0][i] == temp5[0][k]: + temp5[1][k].append(R[j][1][i]) + er = False + if er == True: + temp5[0].append(R[j][0][i]) + temp5[1].append([R[j][1][i]]) + for i in range(0,len(temp5[0])): + print_debug("\t" + temp5[0][i] + "\n", True, stability_log) + for k in range(0,len(temp5[1][i])): + print_debug("\t\t\t" + temp5[1][i][k], True, stability_log) + print_debug("__________________Watch stability.log for details_________________\n", False, stability_log) + if options.notify != "": + attach_mail_file(msg, stability.in_file, "run_tests_log.log") + attach_mail_file(msg, stability_log, "stability.log") + +# *** *** *** +# Performance validation run +# *** *** *** + if ((("performance" in only) == True) or ("stability" in only) == False): + print_debug("\n\nPerformance validation run\n\n", False, "") + performance = options_for_drivers() +# performance constant options + performance.number = 5 + performance.config = "./perf.ini" + performance.path = "./" + performance.silent = True + performance.output = "" + performance.compiler = "" + performance.ref = "ispc_ref" + performance.in_file = "." + os.sep + f_date + os.sep + "performance.log" +# prepare LLVM 3.3 as newest LLVM + need_LLVM = check_LLVM(["3.3"]) + if len(need_LLVM) != 0: + build_LLVM(need_LLVM[i], "", "", "", False, False, True, False) +# prepare reference point. build both test and reference compilers + os.system("git branch > " + temp_alloy_file) + br = open(temp_alloy_file) + temp4 = br.readlines() + br.close() + for line in temp4: + if "*" in line: + current_branch = line[2:-1] + stashing = True + sys.stdout.write("Please, don't interrupt script here! 
You can have not sync git status after interruption!\n") + if "No local changes" in detect_version("git stash"): + stashing = False + #try_do_LLVM("stash current branch ", "git stash", True) + try_do_LLVM("checkout reference branch " + reference_branch + " ", "git checkout " + reference_branch, True) + sys.stdout.write(".\n") + build_ispc("3.3") + sys.stdout.write(".\n") + os.rename("ispc", "ispc_ref") + try_do_LLVM("checkout test branch " + current_branch + " ", "git checkout " + current_branch, True) + if stashing: + try_do_LLVM("return current branch ", "git stash pop", True) + sys.stdout.write("You can interrupt script now.\n") + build_ispc("3.3") +# begin validation run for performance. output is inserted into perf() + perf.perf(performance, []) + if options.notify != "": + attach_mail_file(msg, performance.in_file, "performance.log") + attach_mail_file(msg, "." + os.sep + "logs" + os.sep + "perf_build.log", "perf_build.log") + + print_debug("Logs are in alloy_results_[date]", False, "") + +# sending e-mail with results + if options.notify != "": + fp = open(os.environ["ISPC_HOME"] + os.sep + "all_answer.txt", 'rb') + f_lines = fp.readlines() + fp.close() + line = "" + for i in range(0,len(f_lines)): + line = line + f_lines[i][:-1] + line = line + ' \n' + text = MIMEText(line, "", "KOI-8") + msg.attach(text) + attach_mail_file(msg, alloy_build, "alloy_build.log") + s = smtplib.SMTP(smtp_server) + s.sendmail('ISPC_test_system', options.notify, msg.as_string()) + s.quit() +# exit of validation routine + common.remove_if_exists(temp_alloy_file) + os.chdir(current_path) + +def Main(): + if (platform.system() == 'Windows' or 'CYGWIN_NT' in platform.system()) == True: + error("Windows isn't supported now", 1) + if (options.build_llvm == False and + options.validation_run == False and + options.llvm_home == "" and + options.ispc_home == "" and + options.sde_home == ""): + parser.print_help() + exit(0) + global f_date + f_date = "logs" + common.remove_if_exists(f_date) + os.makedirs(f_date) + global temp_alloy_file + temp_alloy_file = os.getcwd() + os.sep + f_date + os.sep + "temp_detect_version" + global alloy_build + alloy_build = os.getcwd() + os.sep + f_date + os.sep + "alloy_build.log" + common.remove_if_exists(alloy_build) + global stability_log + stability_log = os.getcwd() + os.sep + f_date + os.sep + "stability.log" + common.remove_if_exists(stability_log) + setting_paths(options.llvm_home, options.ispc_home, options.sde_home) + if os.environ.get("LLVM_HOME") == None: + error("you have no LLVM_HOME", 1) + if os.environ.get("ISPC_HOME") == None: + error("you have no ISPC_HOME", 1) + if options.build_llvm: + build_LLVM(options.version, options.revision, options.folder, options.tarball, + options.debug, options.selfbuild, False, options.force) + if options.validation_run: + validation_run(options.only, options.only_targets, options.branch, options.notify, options.update) + os.rename(f_date, "alloy_results_" + datetime.datetime.now().strftime('%H_%M_%d_%m_%Y')) + +###Main### +from optparse import OptionParser +import sys +import os +import operator +import time +import glob +import string +import platform +import smtplib +import datetime +import copy +from email.MIMEMultipart import MIMEMultipart +from email.MIMEBase import MIMEBase +from email.mime.text import MIMEText +from email.Encoders import encode_base64 +# our drivers +import run_tests +import perf +import common +error = common.error +detect_version = common.detect_version +print_debug = common.print_debug +# parsing options 
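# typical invocations (illustrative only -- the exact option spellings are
# defined by the parser set up below, and the script name alloy.py is assumed):
#   alloy.py -b --version=3.3                        build LLVM 3.3 only
#   alloy.py -r --only="stability 3.3 -O2"           stability run with LLVM 3.3 at -O2
#   alloy.py -r --only-targets="avx2-i32x8 sse4"     restrict a validation run to chosen targets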
+parser = OptionParser() +# options for activity "build LLVM" +parser.add_option('-b', '--build-llvm', dest='build_llvm', + help='ask to build LLVM', default=False, action="store_true") +parser.add_option('--version', dest='version', + help='version of llvm to build', default="head") +parser.add_option('--revision', dest='revision', + help='revision of llvm to build', default="") +parser.add_option('--debug', dest='debug', + help='debug build of LLVM?', default=False, action="store_true") +parser.add_option('--folder', dest='folder', + help='folder to build LLVM in', default="") +parser.add_option('--tarball', dest='tarball', + help='"llvm_tarball clang_tarball"', default="") +parser.add_option('--selfbuild', dest='selfbuild', + help='make selfbuild of LLVM and clang', default=False, action="store_true") +parser.add_option('--force', dest='force', + help='rebuild LLVM', default=False, action='store_true') +# options for activity "setup PATHS" +parser.add_option('--llvm_home', dest='llvm_home',help='path to LLVM',default="") +parser.add_option('--ispc_home', dest='ispc_home',help='path to ISPC',default="") +parser.add_option('--sde_home', dest='sde_home',help='path to SDE',default="") +# options for activity "validation run" +parser.add_option('-r', '--run', dest='validation_run', + help='ask for validation run', default=False, action="store_true") +parser.add_option('--compare-with', dest='branch', + help='set performance reference point', default="master") +parser.add_option('--only-targets', dest='only_targets', + help='set list of targets to test. Possible values - all subnames of targets.\n' + + 'Example: --only-targets="avx2-i32x8 sse4 i32x16 sse2"', default="") +parser.add_option('--notify', dest='notify', + help='sent results to email', default="") +parser.add_option('--only', dest='only', + help='set types of tests. Possible values:\n' + + '-O0, -O2, x86, x86-64, stability (test only stability), performance (test only performance)\n' + + 'build (only build with different LLVM), 3.1, 3.2, 3.3, head, native (do not use SDE), current (do not rebuild ISPC).\n' + + 'Example: --only="3.2 -O0 stability 3.3"', default="") +parser.add_option('--update-errors', dest='update', + help='rewrite fail_db.txt file according to received results (F or FP)', default="") +(options, args) = parser.parse_args() +Main() diff --git a/check_env.py b/check_env.py new file mode 100755 index 00000000..98deb235 --- /dev/null +++ b/check_env.py @@ -0,0 +1,102 @@ +#!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# // Author: Filippov Ilia + +import common +import sys +import os +import string +print_debug = common.print_debug +error = common.error +detect_version = common.detect_version + +exists = [False, False, False, False, False, False, False, False] +names = ["m4", "bison", "flex", "sde", "ispc", "clang", "gcc", "icc"] + +PATH_dir = string.split(os.getenv("PATH"), os.pathsep) +for counter in PATH_dir: + for i in range(0,8): + if os.path.exists(counter + os.sep + names[i]): + exists[i] = True + +print_debug("=== in PATH: ===\n", False, "") +print_debug("Tools:\n", False, "") +for i in range(0,3): + if exists[i]: + print_debug(detect_version(names[i] + " --version"), False, "") + else: + error("you don't have " + names[i], 0) +if exists[0] and exists[1] and exists[2]: + if common.check_tools(2): + print_debug("versions are ok\n", False, "") +print_debug("\nSDE:\n", False, "") +if exists[3]: + print_debug(detect_version(names[3] + " --version"), False, "") +else: + error("you don't have " + names[3], 2) +print_debug("\nISPC:\n", False, "") +if exists[4]: + print_debug(detect_version(names[4] + " --version"), False, "") +else: + error("you don't have " + names[4], 2) +print_debug("\nC/C++ compilers:\n", False, "") +for i in range(5,8): + if exists[i]: + print_debug(detect_version(names[i] + " --version"), False, "") + else: + error("you don't have " + names[i], 2) + +print_debug("\n=== in ISPC specific environment variables: ===\n", False, "") +if os.environ.get("LLVM_HOME") == None: + error("you have no LLVM_HOME", 2) +else: + print_debug("Your LLVM_HOME:" + os.environ.get("LLVM_HOME") + "\n", False, "") +if os.environ.get("ISPC_HOME") == None: + error("you have no ISPC_HOME", 2) +else: + print_debug("Your ISPC_HOME:" + os.environ.get("ISPC_HOME") + "\n", False, "") + if os.path.exists(os.environ.get("ISPC_HOME") + os.sep + "ispc"): + print_debug("You have ISPC in your ISPC_HOME: " + + detect_version(os.environ.get("ISPC_HOME") + os.sep + "ispc" + " --version"), False, "") + else: + error("you don't have ISPC in your ISPC_HOME", 2) +if os.environ.get("SDE_HOME") == None: + error("You have no SDE_HOME", 2) +else: + print_debug("Your SDE_HOME:" + os.environ.get("SDE_HOME") + "\n", False, "") + if os.path.exists(os.environ.get("SDE_HOME") + os.sep + "sde"): + print_debug("You have sde in your SDE_HOME: " + + detect_version(os.environ.get("SDE_HOME") + os.sep + "sde" + " --version"), False, "") + else: + error("you don't have any SDE in your ISPC_HOME", 2) diff --git a/common.py b/common.py new file mode 100644 index 00000000..dd8fb388 --- /dev/null +++ b/common.py @@ -0,0 +1,120 @@ +#!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# // Author: Filippov Ilia +import sys +import os +import shutil + +def write_to_file(filename, line): + f = open(filename, 'a') + f.writelines(line) + f.close() + +#remove file if it exists +def remove_if_exists(filename): + if os.path.exists(filename): + if os.path.isdir(filename): + shutil.rmtree(filename) + else: + os.remove(filename) + +# detect version which is printed after command +def detect_version(command): + os.system(command + " > " + "temp_detect_version") + version = open("temp_detect_version") + answer = version.readline() + version.close() + remove_if_exists("temp_detect_version") + return answer + +# print versions of compilers +def print_version(ispc_test, ispc_ref, ref_compiler, s, perf_log, is_windows): + print_debug("\nUsing test compiler: " + detect_version(ispc_test + " --version"), s, perf_log) + if ispc_ref != "": + print_debug("Using ref compiler: " + detect_version(ispc_ref + " --version"), s, perf_log) + if is_windows == False: + temp1 = detect_version(ref_compiler + " --version") + else: + os.system(ref_compiler + " 2>&1" + " 2> temp_detect_version > temp_detect_version1" ) + version = open("temp_detect_version") + temp1 = version.readline() + version.close() + remove_if_exists("temp_detect_version") + remove_if_exists("temp_detect_version1") + print_debug("Using C/C++ compiler: " + temp1 + "\n", s, perf_log) + +# print everything from scripts instead errors +def print_debug(line, silent, filename): + if silent == False: + sys.stdout.write(line) + sys.stdout.flush() + if os.environ.get("ISPC_HOME") != None: + write_to_file(os.environ["ISPC_HOME"] + os.sep + "all_answer.txt", line) + if filename != "": + write_to_file(filename, line) + +# print errors from scripts +# type 1 for error in environment +# type 2 for warning +# type 3 for error of compiler or test which isn't the goal of script +def error(line, error_type): + line = line + "\n" + if error_type == 1: + sys.stderr.write("Fatal error: " + line) + sys.exit(1) + if error_type == 2: + sys.stderr.write("Warning: " + line) + if error_type == 0: + print_debug("FIND ERROR: " + line, False, "") + +def check_tools(m): + input_tools=[[[1,4],"m4 --version", "bad m4 version"], + [[2,4],"bison --version", "bad bison version"], + [[2,5], "flex --version", "bad flex version"]] + + for t in range(0,len(input_tools)): + t1 = 
((detect_version(input_tools[t][1]))[:-1].split(" ")) + for i in range(0,len(t1)): + t11 = t1[i].split(".") + f = True + for j in range(0,len(t11)): + if not t11[j].isdigit(): + f = False + if f == True: + for j in range(0,len(t11)): + if j < len(input_tools[t][0]): + if int(t11[j])> "+build_log) - return os.system("make CXX="+ref_compiler+" CC="+refc_compiler+" >> "+build_log+" 2>> "+build_log) - else: - os.system("msbuild /t:clean >> " + build_log) - return os.system("msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /t:rebuild >> " + build_log) - -def execute_test(command): - global perf_temp - r = 0 - if os.path.exists(perf_temp): - os.remove(perf_temp) - for k in range(int(options.number)): - r = r + os.system(command) - return r - -#gathers all tests results and made an item test from answer structure -def run_test(command, c1, c2, test, b_serial): - global perf_temp - if build_test() != 0: - sys.stdout.write("ERROR: Compilation fails\n") - return - if execute_test(command) != 0: - sys.stdout.write("ERROR: Execution fails\n") - return - tasks = [] #list of results with tasks, it will be test[2] - ispc = [] #list of results without tasks, it will be test[1] - absolute_tasks = [] #list of absolute results with tasks, it will be test[4] - absolute_ispc = [] #list of absolute results without tasks, ut will be test[3] - serial = [] #list serial times, it will be test[5] - j = 1 - for line in open(perf_temp): # we take test output - if "speedup" in line: # we are interested only in lines with speedup - if j == c1: # we are interested only in lines with c1 numbers - line = line.expandtabs(0) - line = line.replace("("," ") - line = line.split(",") - for i in range(len(line)): - subline = line[i].split(" ") - number = float(subline[1][:-1]) - if "speedup from ISPC + tasks" in line[i]: - tasks.append(number) - else: - ispc.append(number) - c1 = c1 + c2 - j+=1 - if "million cycles" in line: - if j == c1: - line = line.replace("]","[") - line = line.split("[") - number = float(line[3]) - if "tasks" in line[1]: - absolute_tasks.append(number) - else: - if "ispc" in line[1]: - absolute_ispc.append(number) - if "serial" in line[1]: - serial.append(number) - - if len(ispc) != 0: - if len(tasks) != 0: - print_debug("ISPC speedup / ISPC + tasks speedup / ISPC time / ISPC + tasks time / serial time\n") - for i in range(0,len(serial)): - print_debug("%10s /\t%10s\t /%9s / %10s\t /%10s\n" % - (ispc[i], tasks[i], absolute_ispc[i], absolute_tasks[i], serial[i])) - else: - print_debug("ISPC speedup / ISPC time / serial time\n") - for i in range(0,len(serial)): - print_debug("%10s /%9s /%10s\n" % (ispc[i], absolute_ispc[i], serial[i])) - else: - if len(tasks) != 0: - print_debug("ISPC + tasks speedup / ISPC + tasks time / serial time\n") - for i in range(0,len(serial)): - print_debug("%10s\t / %10s\t /%10s\n" % (tasks[i], absolute_tasks[i], serial[i])) - - test[1] = test[1] + ispc - test[2] = test[2] + tasks - test[3] = test[3] + absolute_ispc - test[4] = test[4] + absolute_tasks - if b_serial == True: - #if we concatenate outputs we should use only the first serial answer. 
- test[5] = test[5] + serial - -def cpu_get(): - p = open("/proc/stat", 'r') - cpu = p.readline() - p.close() - cpu = cpu.split(" ") - cpu_usage = (int(cpu[2]) + int(cpu[3]) + int(cpu[4])) - cpu_all = cpu_usage + int(cpu[5]) - return [cpu_usage, cpu_all] - -#returns cpu_usage -def cpu_check(): - if is_windows == False: - if is_mac == False: - cpu1 = cpu_get() - time.sleep(1) - cpu2 = cpu_get() - cpu_percent = (float(cpu1[0] - cpu2[0])/float(cpu1[1] - cpu2[1]))*100 - else: - os.system("sysctl -n vm.loadavg > cpu_temp") - c = open("cpu_temp", 'r') - c_line = c.readline() - c.close - os.remove("cpu_temp") - R = c_line.split(' ') - cpu_percent = float(R[1]) * 3 - else: - os.system("wmic cpu get loadpercentage /value > cpu_temp") - c = open("cpu_temp", 'r') - c_lines = c.readlines() - c.close() - os.remove("cpu_temp") - t = "0" - for i in c_lines[2]: - if i.isdigit(): - t = t + i - cpu_percent = int(t) - return cpu_percent - -#returns geomean of list -def geomean(par): - temp = 1 - l = len(par) - for i in range(l): - temp = temp * par[i] - temp = temp ** (1.0/l) - return round(temp, 2) - -#takes an answer struct and print it. -#answer struct: list answer contains lists test -#test[0] - name of test -#test[1] - list of results without tasks -#test[2] - list of results with tasks -#test[3] - list of absolute results without tasks -#test[4] - list of absolute results with tasks -#test[5] - list of absolute time without ISPC (serial) -#test[1..4] may be empty -def print_answer(answer): - filelist = [] - print_debug("--------------------------------------------------------------------------\n") - print_debug("test name:\t ISPC speedup: ISPC + tasks speedup: | " + - "ISPC time: ISPC + tasks time: serial:\n") - filelist.append("test name,ISPC speedup,diff," + - "ISPC + tasks speedup,diff,ISPC time,diff,ISPC + tasks time,diff,serial,diff\n") - max_t = [0,0,0,0,0] - diff_t = [0,0,0,0,0] - geomean_t = [0,0,0,0,0] - list_of_max = [[],[],[],[],[]] - for i in range(len(answer)): - for t in range(1,6): - if len(answer[i][t]) == 0: - max_t[t-1] = "n/a" - diff_t[t-1] = "n/a" - else: - if t < 3: - mm = max(answer[i][t]) - else: - mm = min(answer[i][t]) - max_t[t-1] = '%.2f' % mm - list_of_max[t-1].append(mm) - diff_t[t-1] = '%.2f' % (max(answer[i][t]) - min(answer[i][t])) - print_debug("%s:\n" % answer[i][0]) - print_debug("\t\tmax:\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % - (max_t[0], max_t[1], max_t[2], max_t[3], max_t[4])) - print_debug("\t\tdiff:\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % - (diff_t[0], diff_t[1], diff_t[2], diff_t[3], diff_t[4])) - for t in range(0,5): - if max_t[t] == "n/a": - max_t[t] = "" - if diff_t[t] == "n/a": - diff_t[t] = "" - filelist.append(answer[i][0] + "," + - max_t[0] + "," + diff_t[0] + "," + max_t[1] + "," + diff_t[1] + "," + - max_t[2] + "," + diff_t[2] + "," + max_t[3] + "," + diff_t[3] + "," + - max_t[4] + "," + diff_t[4] + "\n") - for i in range(0,5): - geomean_t[i] = geomean(list_of_max[i]) - print_debug("---------------------------------------------------------------------------------\n") - print_debug("Geomean:\t\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % - (geomean_t[0], geomean_t[1], geomean_t[2], geomean_t[3], geomean_t[4])) - filelist.append("Geomean," + str(geomean_t[0]) + ",," + str(geomean_t[1]) - + ",," + str(geomean_t[2]) + ",," + str(geomean_t[3]) + ",," + str(geomean_t[4]) + "\n") - print_file(filelist) - - -###Main### -# parsing options -parser = OptionParser() -parser.add_option('-n', '--number', dest='number', - help='number of repeats', default="3") 
-parser.add_option('-c', '--config', dest='config', - help='config file of tests', default="./perf.ini") -parser.add_option('-p', '--path', dest='path', - help='path to examples directory', default="./") -parser.add_option('-s', '--silent', dest='silent', - help='silent mode, only table output', default=False, action="store_true") -parser.add_option('-o', '--output', dest='output', - help='output file for script reading', default="") -parser.add_option('--compiler', dest='compiler', - help='reference compiler', default="") -(options, args) = parser.parse_args() - -global is_windows -is_windows = (platform.system() == 'Windows' or - 'CYGWIN_NT' in platform.system()) -global is_mac -is_mac = (platform.system() == 'Darwin') - -# save corrent path -pwd = os.getcwd() -pwd = pwd + os.sep -if is_windows: - pwd = "..\\" - -# check if cpu usage is low now -cpu_percent = cpu_check() -if cpu_percent > 20: - sys.stdout.write("Warning: CPU Usage is very high.\n") - sys.stdout.write("Close other applications.\n") - -# check that required compilers exist -PATH_dir = string.split(os.getenv("PATH"), os.pathsep) -compiler_exists = False -ref_compiler_exists = False -if is_windows == False: - compiler = "ispc" - ref_compiler = "g++" - refc_compiler = "gcc" - if options.compiler != "": - if options.compiler == "clang" or options.compiler == "clang++": - ref_compiler = "clang++" - refc_compiler = "clang" - if options.compiler == "icc" or options.compiler == "icpc": - ref_compiler = "icpc" - refc_compiler = "icc" -else: - compiler = "ispc.exe" - ref_compiler = "cl.exe" -for counter in PATH_dir: - if os.path.exists(counter + os.sep + compiler): - compiler_exists = True - if os.path.exists(counter + os.sep + ref_compiler): - ref_compiler_exists = True -if not compiler_exists: - sys.stderr.write("Fatal error: ISPC compiler not found.\n") - sys.stderr.write("Added path to ispc compiler to your PATH variable.\n") - sys.exit() -if not ref_compiler_exists: - sys.stderr.write("Fatal error: reference compiler %s not found.\n" % ref_compiler) - sys.stderr.write("Added path to %s compiler to your PATH variable.\n" % ref_compiler) - sys.exit() - -# checks that config file exists -path_config = os.path.normpath(options.config) -if os.path.exists(path_config) == False: - sys.stderr.write("Fatal error: config file not found: %s.\n" % options.config) - sys.stderr.write("Set path to your config file in --config.\n") - sys.exit() - -# read lines from config file except comments -f = open(path_config, 'r') -f_lines = f.readlines() -f.close() -lines =[] -for i in range(len(f_lines)): - if f_lines[i][0] != "%": - lines.append(f_lines[i]) -length = len(lines) - -# prepare build.log and perf_temp files -global build_log -build_log = pwd + "build.log" -if is_windows == False: - if os.path.exists(build_log): - os.remove(build_log) -else: - if os.path.exists("build.log"): - os.remove("build.log") -global perf_temp -perf_temp = pwd + "perf_temp" - -i = 0 -answer = [] -print_debug("Okey go go go!\n\n") -os.system(compiler + " --version >" + build_log) -version = open(build_log) -print_debug("Using test compiler: " + version.readline()) -version.close() - -if is_windows == False: - os.system(ref_compiler + " --version >" + build_log) -else: - os.system(ref_compiler + " 2>" + build_log + " 1>&2") - -version = open(build_log) -print_debug("Using reference compiler: " + version.readline()) -version.close() - - -# loop for all tests -while i < length-2: - # we read name of test - print_debug("%s" % lines[i]) - test = 
[lines[i][:-1],[],[],[],[],[]] - # read location of test - folder = lines[i+1] - folder = folder[:-1] - folder = os.path.normpath(options.path + os.sep + folder) - # check that test exists - if os.path.exists(folder) == False: - sys.stdout.write("Fatal error: Can't find test %s. Your path is: \"%s\".\n" % (lines[i][:-1], options.path)) - sys.stdout.write("Change current location to /examples or set path to /examples in --path.\n") - exit(0) - os.chdir(folder) - # read parameters of test - command = lines[i+2] - command = command[:-1] - if is_windows == False: - command = "./"+command + " >> " + perf_temp - else: - command = "x64\\Release\\"+command + " >> " + perf_temp - # parsing config parameters - next_line = lines[i+3] - if next_line[0] == "!": # we should take only one part of test output - R = next_line.split(' ') - c1 = int(R[1]) #c1 is a number of string which we want to use in test output - c2 = int(R[2]) #c2 is total number of strings in test output - i = i+1 - else: - c1 = 1 - c2 = 1 - next_line = lines[i+3] - if next_line[0] == "^": #we should concatenate result of this test with previous one - run_test(command, c1, c2, answer[len(answer)-1], False) - i = i+1 - else: #we run this test and append it's result to answer structure - run_test(command, c1, c2, test, True) - answer.append(test) - # preparing next loop iteration - os.chdir(pwd) - i+=4 - -# delete temp file -if os.path.exists(perf_temp): - os.remove(perf_temp) -#print collected answer -print_answer(answer) diff --git a/fail_db.txt b/fail_db.txt new file mode 100644 index 00000000..7adc3e41 --- /dev/null +++ b/fail_db.txt @@ -0,0 +1 @@ +% List of known fails diff --git a/llvm_patches/r183327-AVX2-GATHER.patch b/llvm_patches/3_3_r183327-AVX2-GATHER.patch similarity index 100% rename from llvm_patches/r183327-AVX2-GATHER.patch rename to llvm_patches/3_3_r183327-AVX2-GATHER.patch diff --git a/llvm_patches/r184575-x86-shift.patch b/llvm_patches/3_3_r184575-x86-shift.patch similarity index 100% rename from llvm_patches/r184575-x86-shift.patch rename to llvm_patches/3_3_r184575-x86-shift.patch diff --git a/examples/perf.ini b/perf.ini similarity index 84% rename from examples/perf.ini rename to perf.ini index d2a5c73e..d8c7fe71 100755 --- a/examples/perf.ini +++ b/perf.ini @@ -10,44 +10,48 @@ %**************************************************************************************************** AOBench aobench -ao 10 512 512 +10 512 512 #*** Deferred Shading deferred -deferred_shading data/pp1280x720.bin +data/pp1280x720.bin #*** Mandelbrot Set mandelbrot -mandelbrot + #*** Mandelbrot Set mandelbrot_tasks -mandelbrot_tasks + ^ #*** Perlin Noise Function noise -noise + #*** Binomial Options options -options + ! 1 2 #*** Black-Scholes Options options -options + ! 2 2 #*** Ray Tracer rt -rt sponza +sponza #*** 3D Stencil stencil -stencil + #*** Volume Rendering volume_rendering -volume camera.dat density_highres.vol +camera.dat density_highres.vol #*** +%Sort +%sort +% +%#*** diff --git a/perf.py b/perf.py new file mode 100755 index 00000000..d1d7654b --- /dev/null +++ b/perf.py @@ -0,0 +1,489 @@ +#!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# // Author: Filippov Ilia + +def print_file(line): + if options.output != "": + output = open(options.output, 'w') + output.writelines(line) + output.close() + +def build_test(commands): + os.system(commands[4]) + test = os.system(commands[1]) + if options.ref: + ref = os.system(commands[3]) + return (options.ref and ref) or test + +def execute_test(commands): + r = 0 + common.remove_if_exists(perf_temp+"_test") + common.remove_if_exists(perf_temp+"_ref") + for k in range(int(options.number)): + r = r + os.system(commands[0]) + if options.ref: + r = r + os.system(commands[2]) + return r + +#gathers all tests results and made an item test from answer structure +def run_test(commands, c1, c2, test, test_ref, b_serial): + if build_test(commands) != 0: + error("Compilation fails of test %s\n" % test[0], 0) + return + if execute_test(commands) != 0: + error("Execution fails of test %s\n" % test[0], 0) + return + print_debug("TEST COMPILER:\n", s, perf_log) + analyse_test(c1, c2, test, b_serial, perf_temp+"_test") + if options.ref: + print_debug("REFERENCE COMPILER:\n", s, perf_log) + analyse_test(c1, c2, test_ref, b_serial, perf_temp+"_ref") + + +def analyse_test(c1, c2, test, b_serial, perf_temp_n): + tasks = [] #list of results with tasks, it will be test[2] + ispc = [] #list of results without tasks, it will be test[1] + absolute_tasks = [] #list of absolute results with tasks, it will be test[4] + absolute_ispc = [] #list of absolute results without tasks, ut will be test[3] + serial = [] #list serial times, it will be test[5] + j = 1 + for line in open(perf_temp_n): # we take test output + if "speedup" in line: # we are interested only in lines with speedup + if j == c1: # we are interested only in lines with c1 numbers + line = line.expandtabs(0) + line = line.replace("("," ") + line = line.split(",") + for i in range(len(line)): + subline = line[i].split(" ") + number = float(subline[1][:-1]) + if "speedup from ISPC + tasks" in line[i]: + tasks.append(number) + else: + ispc.append(number) + c1 = c1 + c2 + j+=1 + if "million cycles" in line: + if j == c1: + line = line.replace("]","[") + line = line.split("[") + number = float(line[3]) + if "tasks" in line[1]: + absolute_tasks.append(number) + else: + if "ispc" in line[1]: + absolute_ispc.append(number) + if "serial" in line[1]: + 
serial.append(number) + + if len(ispc) != 0: + if len(tasks) != 0: + print_debug("ISPC speedup / ISPC + tasks speedup / ISPC time / ISPC + tasks time / serial time\n", s, perf_log) + for i in range(0,len(serial)): + print_debug("%10s /\t%10s\t /%9s / %10s\t /%10s\n" % + (ispc[i], tasks[i], absolute_ispc[i], absolute_tasks[i], serial[i]), s, perf_log) + else: + print_debug("ISPC speedup / ISPC time / serial time\n", s, perf_log) + for i in range(0,len(serial)): + print_debug("%10s /%9s /%10s\n" % (ispc[i], absolute_ispc[i], serial[i]), s, perf_log) + else: + if len(tasks) != 0: + print_debug("ISPC + tasks speedup / ISPC + tasks time / serial time\n", s, perf_log) + for i in range(0,len(serial)): + print_debug("%10s\t / %10s\t /%10s\n" % (tasks[i], absolute_tasks[i], serial[i]), s, perf_log) + + test[1] = test[1] + ispc + test[2] = test[2] + tasks + test[3] = test[3] + absolute_ispc + test[4] = test[4] + absolute_tasks + if b_serial == True: + #if we concatenate outputs we should use only the first serial answer. + test[5] = test[5] + serial + +def cpu_get(): + p = open("/proc/stat", 'r') + cpu = p.readline() + p.close() + cpu = cpu.split(" ") + cpu_usage = (int(cpu[2]) + int(cpu[3]) + int(cpu[4])) + cpu_all = cpu_usage + int(cpu[5]) + return [cpu_usage, cpu_all] + +#returns cpu_usage +def cpu_check(): + if is_windows == False: + if is_mac == False: + cpu1 = cpu_get() + time.sleep(1) + cpu2 = cpu_get() + cpu_percent = (float(cpu1[0] - cpu2[0])/float(cpu1[1] - cpu2[1]))*100 + else: + os.system("sysctl -n vm.loadavg > cpu_temp") + c = open("cpu_temp", 'r') + c_line = c.readline() + c.close + os.remove("cpu_temp") + R = c_line.split(' ') + cpu_percent = float(R[1]) * 3 + else: + os.system("wmic cpu get loadpercentage /value > cpu_temp") + c = open("cpu_temp", 'r') + c_lines = c.readlines() + c.close() + os.remove("cpu_temp") + t = "0" + for i in c_lines[2]: + if i.isdigit(): + t = t + i + cpu_percent = int(t) + return cpu_percent + +#returns geomean of list +def geomean(par): + temp = 1 + l = len(par) + for i in range(l): + temp = temp * par[i] + temp = temp ** (1.0/l) + return round(temp, 2) + +#takes an answer struct and print it. 
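+#besides printing, it returns the list_of_compare structure that compare() below
+#uses to match the test compiler results against the reference compiler.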
+#answer struct: list answer contains lists test +#test[0] - name of test +#test[1] - list of results without tasks +#test[2] - list of results with tasks +#test[3] - list of absolute results without tasks +#test[4] - list of absolute results with tasks +#test[5] - list of absolute time without ISPC (serial) +#test[1..4] may be empty +def print_answer(answer): + filelist = [] + print_debug("--------------------------------------------------------------------------\n", s, perf_log) + print_debug("test name:\t ISPC speedup: ISPC + tasks speedup: | " + + "ISPC time: ISPC + tasks time: serial:\n", s, perf_log) + filelist.append("test name,ISPC speedup,diff," + + "ISPC + tasks speedup,diff,ISPC time,diff,ISPC + tasks time,diff,serial,diff\n") + max_t = [0,0,0,0,0] + diff_t = [0,0,0,0,0] + geomean_t = [0,0,0,0,0] + list_of_max = [[],[],[],[],[]] + list_of_compare = [[],[],[],[],[],[]] + for i in range(len(answer)): + list_of_compare[0].append(answer[i][0]) + for t in range(1,6): + if len(answer[i][t]) == 0: + max_t[t-1] = "n/a" + diff_t[t-1] = "n/a" + list_of_compare[t].append(0); + else: + if t < 3: + mm = max(answer[i][t]) + else: + mm = min(answer[i][t]) + list_of_compare[t].append(mm) + max_t[t-1] = '%.2f' % mm + list_of_max[t-1].append(mm) + diff_t[t-1] = '%.2f' % (max(answer[i][t]) - min(answer[i][t])) + print_debug("%s:\n" % answer[i][0], s, perf_log) + print_debug("\t\tmax:\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % + (max_t[0], max_t[1], max_t[2], max_t[3], max_t[4]), s, perf_log) + print_debug("\t\tdiff:\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % + (diff_t[0], diff_t[1], diff_t[2], diff_t[3], diff_t[4]), s, perf_log) + for t in range(0,5): + if max_t[t] == "n/a": + max_t[t] = "" + if diff_t[t] == "n/a": + diff_t[t] = "" + filelist.append(answer[i][0] + "," + + max_t[0] + "," + diff_t[0] + "," + max_t[1] + "," + diff_t[1] + "," + + max_t[2] + "," + diff_t[2] + "," + max_t[3] + "," + diff_t[3] + "," + + max_t[4] + "," + diff_t[4] + "\n") + for i in range(0,5): + geomean_t[i] = geomean(list_of_max[i]) + print_debug("---------------------------------------------------------------------------------\n", s, perf_log) + print_debug("Geomean:\t\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % + (geomean_t[0], geomean_t[1], geomean_t[2], geomean_t[3], geomean_t[4]), s, perf_log) + filelist.append("Geomean," + str(geomean_t[0]) + ",," + str(geomean_t[1]) + + ",," + str(geomean_t[2]) + ",," + str(geomean_t[3]) + ",," + str(geomean_t[4]) + "\n") + print_file(filelist) + return list_of_compare + + +def compare(A, B): + print_debug("\n\n_____________________PERFORMANCE REPORT____________________________\n", False, "") + print_debug("test name: ISPC time: ISPC time ref: %:\n", False, "") + for i in range(0,len(A[0])): + if B[3][i] == 0: + p1 = 0 + else: + p1 = 100 - 100 * A[3][i]/B[3][i] + print_debug("%21s: %10.2f %10.2f %10.2f" % (A[0][i], A[3][i], B[3][i], p1), False, "") + if p1 < -1: + print_debug(" <-", False, "") + if p1 > 1: + print_debug(" <+", False, "") + print_debug("\n", False, "") + print_debug("\n", False, "") + + print_debug("test name: TASKS time: TASKS time ref: %:\n", False, "") + for i in range(0,len(A[0])): + if B[4][i] == 0: + p2 = 0 + else: + p2 = 100 - 100 * A[4][i]/B[4][i] + print_debug("%21s: %10.2f %10.2f %10.2f" % (A[0][i], A[4][i], B[4][i], p2), False, "") + if p2 < -1: + print_debug(" <-", False, "") + if p2 > 1: + print_debug(" <+", False, "") + print_debug("\n", False, "") + if "performance.log" in options.in_file: + print_debug("\n\n_________________Watch performance.log for 
details________________\n", False, "") + else: + print_debug("\n\n__________________________________________________________________\n", False, "") + + + +def perf(options1, args): + global options + options = options1 + global s + s = options.silent + + # save current OS + global is_windows + is_windows = (platform.system() == 'Windows' or + 'CYGWIN_NT' in platform.system()) + global is_mac + is_mac = (platform.system() == 'Darwin') + + # save current path + pwd = os.getcwd() + pwd = pwd + os.sep + pwd1 = pwd + if is_windows: + pwd1 = "..\\..\\" + + # check if cpu usage is low now + cpu_percent = cpu_check() + if cpu_percent > 20: + error("CPU Usage is very high.\nClose other applications.\n", 2) + + global ispc_test + global ispc_ref + global ref_compiler + global refc_compiler + # check that required compilers exist + PATH_dir = string.split(os.getenv("PATH"), os.pathsep) + ispc_test_exists = False + ispc_ref_exists = False + ref_compiler_exists = False + if is_windows == False: + ispc_test = "ispc" + ref_compiler = "g++" + refc_compiler = "gcc" + if options.compiler != "": + if options.compiler == "clang" or options.compiler == "clang++": + ref_compiler = "clang++" + refc_compiler = "clang" + if options.compiler == "icc" or options.compiler == "icpc": + ref_compiler = "icpc" + refc_compiler = "icc" + else: + ispc_test = "ispc.exe" + ref_compiler = "cl.exe" + ispc_ref = options.ref + if options.ref != "": + options.ref = True + for counter in PATH_dir: + if os.path.exists(counter + os.sep + ispc_test): + ispc_test_exists = True + if os.path.exists(counter + os.sep + ref_compiler): + ref_compiler_exists = True + if os.path.exists(counter + os.sep + ispc_ref): + ispc_ref_exists = True + if not ispc_test_exists: + error("ISPC compiler not found.\nAdded path to ispc compiler to your PATH variable.\n", 1) + if not ref_compiler_exists: + error("C/C++ compiler %s not found.\nAdded path to %s compiler to your PATH variable.\n" % (ref_compiler, ref_compiler), 1) + if options.ref: + if not ispc_ref_exists: + error("ISPC reference compiler not found.\nAdded path to ispc reference compiler to your PATH variable.\n", 1) + + # checks that config file exists + path_config = os.path.normpath(options.config) + if os.path.exists(path_config) == False: + error("config file not found: %s.\nSet path to your config file in --config.\n" % options.config, 1) + sys.exit() + + # read lines from config file except comments + f = open(path_config, 'r') + f_lines = f.readlines() + f.close() + lines =[] + for i in range(len(f_lines)): + if f_lines[i][0] != "%": + lines.append(f_lines[i]) + length = len(lines) + + # prepare build.log, perf_temp and perf.log files + global perf_log + if options.in_file: + perf_log = pwd + options.in_file + common.remove_if_exists(perf_log) + else: + perf_log = "" + global build_log + build_log = pwd + os.sep + "logs" + os.sep + "perf_build.log" + common.remove_if_exists(build_log) + if os.path.exists(pwd + os.sep + "logs") == False: + os.makedirs(pwd + os.sep + "logs") + + global perf_temp + perf_temp = pwd + "perf_temp" + # end of preparations + + print_debug("Okey go go go!\n\n", s, perf_log) + + #print compilers versions + common.print_version(ispc_test, ispc_ref, ref_compiler, False, perf_log, is_windows) + + # begin + i = 0 + answer = [] + answer_ref = [] + + # loop for all tests + while i < length-2: + # we read name of test + print_debug("%s" % lines[i], s, perf_log) + test = [lines[i][:-1],[],[],[],[],[]] + test_ref = [lines[i][:-1],[],[],[],[],[]] + # read location of test + 
folder = lines[i+1] + folder = folder[:-1] + folder = os.path.normpath(options.path + os.sep + "examples" + os.sep + folder) + # check that test exists + if os.path.exists(folder) == False: + error("Can't find test %s. Your path is: \"%s\".\nChange current location to ISPC_HOME or set path to ISPC_HOME in --path.\n" % + (lines[i][:-1], options.path), 1) + os.chdir(folder) + # read parameters of test + command = lines[i+2] + command = command[:-1] + if is_windows == False: + ex_command_ref = "./ref " + command + " >> " + perf_temp + "_ref" + ex_command = "./test " + command + " >> " + perf_temp + "_test" + bu_command_ref = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=ref ISPC="+ispc_ref+" >> "+build_log+" 2>> "+build_log + bu_command = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=test ISPC="+ispc_test+" >> "+build_log+" 2>> "+build_log + re_command = "make clean >> "+build_log + else: + ex_command_ref = "x64\\Release\\ref.exe " + command + " >> " + perf_temp + "_ref" + ex_command = "x64\\Release\\test.exe " + command + " >> " + perf_temp + "_test" + bu_command_ref = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=ref /t:rebuild >> " + build_log + bu_command = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=test /t:rebuild >> " + build_log + re_command = "msbuild /t:clean >> " + build_log + commands = [ex_command, bu_command, ex_command_ref, bu_command_ref, re_command] + # parsing config parameters + next_line = lines[i+3] + if next_line[0] == "!": # we should take only one part of test output + R = next_line.split(' ') + c1 = int(R[1]) #c1 is a number of string which we want to use in test output + c2 = int(R[2]) #c2 is total number of strings in test output + i = i+1 + else: + c1 = 1 + c2 = 1 + next_line = lines[i+3] + if next_line[0] == "^": #we should concatenate result of this test with previous one + run_test(commands, c1, c2, answer[len(answer)-1], answer_ref[len(answer)-1], False) + i = i+1 + else: #we run this test and append it's result to answer structure + run_test(commands, c1, c2, test, test_ref, True) + answer.append(test) + answer_ref.append(test_ref) + + # preparing next loop iteration + os.chdir(pwd1) + i+=4 + + # delete temp file + common.remove_if_exists(perf_temp+"_test") + common.remove_if_exists(perf_temp+"_ref") + + #print collected answer + print_debug("\n\nTEST COMPILER:\n", s, perf_log) + A = print_answer(answer) + if options.ref != "": + print_debug("\n\nREFERENCE COMPILER:\n", s, perf_log) + B = print_answer(answer_ref) + # print perf report + compare(A,B) + + + +###Main### +from optparse import OptionParser +import sys +import os +import operator +import time +import glob +import string +import platform +# our functions +import common +print_debug = common.print_debug +error = common.error + +if __name__ == "__main__": + # parsing options + parser = OptionParser() + parser.add_option('-n', '--number', dest='number', + help='number of repeats', default="3") + parser.add_option('-c', '--config', dest='config', + help='config file of tests', default="./perf.ini") + parser.add_option('-p', '--path', dest='path', + help='path to test_system directory', default=".") + parser.add_option('-s', '--silent', dest='silent', + help='silent mode, only table output', default=False, action="store_true") + parser.add_option('-o', '--output', dest='output', + help='output file for script reading', default="") + parser.add_option('--compiler', dest='compiler', + help='C/C++ compiler', 
default="") + parser.add_option('-r', '--ref', dest='ref', + help='set reference compiler for compare', default="") + parser.add_option('-f', '--file', dest='in_file', + help='file to save perf output', default="") + (options, args) = parser.parse_args() + perf(options, args) diff --git a/run_tests.py b/run_tests.py index 9729930f..2471b6cb 100755 --- a/run_tests.py +++ b/run_tests.py @@ -1,165 +1,37 @@ #!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # test-running driver for ispc - -from optparse import OptionParser -import multiprocessing -from ctypes import c_int -import os -import sys -import glob -import re -import signal -import random -import string -import subprocess -import shlex -import platform -import tempfile -import os.path -import time - -# disable fancy error/warning printing with ANSI colors, so grepping for error -# messages doesn't get confused -os.environ["TERM"] = "dumb" - -# This script is affected by http://bugs.python.org/issue5261 on OSX 10.5 Leopard -# git history has a workaround for that issue. 
- -is_windows = (platform.system() == 'Windows' or - 'CYGWIN_NT' in platform.system()) - -parser = OptionParser() -parser.add_option("-r", "--random-shuffle", dest="random", help="Randomly order tests", - default=False, action="store_true") -parser.add_option("-g", "--generics-include", dest="include_file", help="Filename for header implementing functions for generics", - default=None) -parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", - default="") -parser.add_option('-t', '--target', dest='target', - help='Set compilation target (sse2-i32x4, sse2-i32x8, sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, avx1-i32x8, avx1-i32x16, avx1.1-i32x8, avx1.1-i32x16, avx2-i32x8, avx2-i32x16, generic-x1, generic-x4, generic-x8, generic-x16, generic-x32, generic-x64)', - default="sse4") -parser.add_option('-a', '--arch', dest='arch', - help='Set architecture (arm, x86, x86-64)', - default="x86-64") -parser.add_option("-c", "--compiler", dest="compiler_exe", help="Compiler binary to use to run tests", - default=None) -parser.add_option('-o', '--no-opt', dest='no_opt', help='Disable optimization', - default=False, action="store_true") -parser.add_option('-j', '--jobs', dest='num_jobs', help='Maximum number of jobs to run in parallel', - default="1024", type="int") -parser.add_option('-v', '--verbose', dest='verbose', help='Enable verbose output', - default=False, action="store_true") -parser.add_option('--wrap-exe', dest='wrapexe', - help='Executable to wrap test runs with (e.g. "valgrind")', - default="") -parser.add_option('--time', dest='time', help='Enable time output', - default=False, action="store_true") -parser.add_option('--non-interactive', dest='non_interactive', help='Disable interactive status updates', - default=False, action="store_true") - -(options, args) = parser.parse_args() - -if options.target == 'neon': - options.arch = 'arm' - -# use relative path to not depend on host directory, which may possibly -# have white spaces and unicode characters. 
-if not is_windows: - ispc_exe = "./ispc" -else: - ispc_exe = ".\\Release\\ispc.exe" - -# checks the required ispc compiler otherwise prints an error message -if not os.path.exists(ispc_exe): - sys.stderr.write("Fatal error: missing ispc compiler: %s\n" % ispc_exe) - sys.exit() - -ispc_exe += " " + options.ispc_flags - -if __name__ == '__main__': - sys.stdout.write("ispc compiler: %s\n" % ispc_exe) - -is_generic_target = (options.target.find("generic-") != -1 and - options.target != "generic-1") -if is_generic_target and options.include_file == None: - if options.target == "generic-4": - sys.stderr.write("No generics #include specified; using examples/intrinsics/sse4.h\n") - options.include_file = "examples/intrinsics/sse4.h" - elif options.target == "generic-8": - sys.stderr.write("No generics #include specified and no default available for \"generic-8\" target.\n") - sys.exit(1) - elif options.target == "generic-16": - sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-16.h\n") - options.include_file = "examples/intrinsics/generic-16.h" - elif options.target == "generic-32": - sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-32.h\n") - options.include_file = "examples/intrinsics/generic-32.h" - elif options.target == "generic-64": - sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-64.h\n") - options.include_file = "examples/intrinsics/generic-64.h" - -if options.compiler_exe == None: - if is_windows: - options.compiler_exe = "cl.exe" - else: - options.compiler_exe = "g++" - -# checks the required compiler otherwise prints an error message -PATH_dir = string.split(os.getenv("PATH"), os.pathsep) -compiler_exists = False - -for counter in PATH_dir: - if os.path.exists(counter + os.sep + options.compiler_exe): - compiler_exists = True - break - -if not compiler_exists: - sys.stderr.write("Fatal error: missing the required compiler: %s \n" % - options.compiler_exe) - sys.exit() - -ispc_root = "." - -# if no specific test files are specified, run all of the tests in tests/, -# failing_tests/, and tests_errors/ -if len(args) == 0: - files = glob.glob(ispc_root + os.sep + "tests" + os.sep + "*ispc") + \ - glob.glob(ispc_root + os.sep + "failing_tests" + os.sep + "*ispc") + \ - glob.glob(ispc_root + os.sep + "tests_errors" + os.sep + "*ispc") -else: - if is_windows: - argfiles = [ ] - for f in args: - # we have to glob ourselves if this is being run under a DOS - # shell, as it passes wildcard as is. - argfiles += glob.glob(f) - else: - argfiles = args - - files = [ ] - for f in argfiles: - if os.path.splitext(string.lower(f))[1] != ".ispc": - sys.stdout.write("Ignoring file %s, which doesn't have an .ispc extension.\n" % f) - else: - files += [ f ] - -# max_test_length is used to issue exact number of whitespace characters when -# updating status. Otherwise update causes new lines standard 80 char terminal -# on both Linux and Windows. -max_test_length = 0 -for f in files: - max_test_length = max(max_test_length, len(f)) - -# randomly shuffle the tests if asked to do so -if (options.random): - random.seed() - random.shuffle(files) - -# counter -total_tests = 0 - - # utility routine to print an update on the number of tests that have been # finished. Should be called with the lock held.. 
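A note on the counter referred to here: the finished-test count is shared between the parent and the worker processes, so it must only be updated while the corresponding lock is held. A minimal, self-contained sketch of that pattern, illustrative only and not part of the patch:

    import multiprocessing
    from ctypes import c_int

    def bump(counter, lock):
        # increment the shared counter only while holding the lock
        with lock:
            counter.value += 1

    if __name__ == "__main__":
        counter = multiprocessing.Value(c_int, 0)
        lock = multiprocessing.Lock()
        workers = [multiprocessing.Process(target=bump, args=(counter, lock))
                   for _ in range(4)]
        for w in workers:
            w.start()
        for w in workers:
            w.join()
        print(counter.value)  # expected: 4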
def update_progress(fn, total_tests_arg, counter, max_test_length_arg): @@ -176,7 +48,7 @@ def update_progress(fn, total_tests_arg, counter, max_test_length_arg): def run_command(cmd): if options.verbose: - sys.stdout.write("Running: %s\n" % cmd) + print_debug("Running: %s\n" % cmd, s, run_tests_log) # Here's a bit tricky part. To pass a command for execution we should # break down the line in to arguments. shlex class is designed exactly @@ -204,9 +76,9 @@ def run_cmds(compile_cmds, run_cmd, filename, expect_failure): (return_code, output) = run_command(cmd) compile_failed = (return_code != 0) if compile_failed: - sys.stdout.write("Compilation of test %s failed \n" % filename) + print_debug("Compilation of test %s failed \n" % filename, s, run_tests_log) if output != "": - sys.stdout.write("%s" % output.encode("utf-8")) + print_debug("%s" % output.encode("utf-8"), s, run_tests_log) return (1, 0) (return_code, output) = run_command(run_cmd) @@ -215,11 +87,11 @@ def run_cmds(compile_cmds, run_cmd, filename, expect_failure): surprise = ((expect_failure and not run_failed) or (not expect_failure and run_failed)) if surprise == True: - sys.stderr.write("Test %s %s (return code %d) \n" % \ + print_debug("Test %s %s (return code %d) \n" % \ (filename, "unexpectedly passed" if expect_failure else "failed", - return_code)) + return_code), s, run_tests_log) if output != "": - sys.stdout.write("%s\n" % output.encode("utf-8")) + print_debug("%s\n" % output.encode("utf-8"), s, run_tests_log) if surprise == True: return (0, 1) else: @@ -298,11 +170,11 @@ def run_test(testname): file.close() if re.search(firstline, output) == None: - sys.stderr.write("Didn't see expected error message %s from test %s.\nActual output:\n%s\n" % \ - (firstline, testname, output)) + print_debug("Didn't see expected error message %s from test %s.\nActual output:\n%s\n" % \ + (firstline, testname, output), s, run_tests_log) return (1, 0) elif got_error == False: - sys.stderr.write("Unexpectedly no errors issued from test %s\n" % testname) + print_debug("Unexpectedly no errors issued from test %s\n" % testname, s, run_tests_log) return (1, 0) else: return (0, 0) @@ -328,8 +200,7 @@ def run_test(testname): break file.close() if match == -1: - sys.stderr.write("Fatal error: unable to find function signature " + \ - "in test %s\n" % testname) + error("unable to find function signature in test %s\n" % testname, 0) return (1, 0) else: global is_generic_target @@ -404,7 +275,21 @@ def run_test(testname): # pull tests to run from the given queue and run them. Multiple copies of # this function will be running in parallel across all of the CPU cores of # the system. 
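The reworked run_tasks_from_queue below takes an extra glob_var argument because, on Windows, multiprocessing starts workers as fresh processes that do not inherit globals the parent set at run time, so the state each worker needs has to be handed over explicitly. A minimal sketch of that pattern; the names and values are invented for illustration and are not the patch's code:

    import multiprocessing

    def worker(task_queue, state):
        # 'state' plays the role of glob_var: everything the worker needs,
        # passed explicitly instead of relying on module-level globals
        is_windows, options = state
        for task in iter(task_queue.get, None):   # None is the stop sentinel
            pass                                  # a real worker would compile and run the test here

    if __name__ == "__main__":
        q = multiprocessing.Queue()
        for name in ["tests/a.ispc", "tests/b.ispc"]:
            q.put(name)
        state = (False, {"target": "sse4"})
        workers = []
        for _ in range(2):
            q.put(None)                           # one sentinel per worker
            p = multiprocessing.Process(target=worker, args=(q, state))
            p.start()
            workers.append(p)
        for p in workers:
            p.join()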
-def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test_length_arg, counter, mutex): +def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test_length_arg, counter, mutex, glob_var): + # This is needed on windows because windows doen't copy globals from parent process whili multiprocessing + global is_windows + is_windows = glob_var[0] + global options + options = glob_var[1] + global s + s = glob_var[2] + global ispc_exe + ispc_exe = glob_var[3] + global is_generic_target + is_generic_target = glob_var[4] + global run_tests_log + run_tests_log = glob_var[5] + if is_windows: tmpdir = "tmp%d" % os.getpid() os.mkdir(tmpdir) @@ -454,7 +339,256 @@ def sigint(signum, frame): t.terminate() sys.exit(1) -if __name__ == '__main__': + +def file_check(compfails, runfails): + errors = len(compfails) + len(runfails) + new_compfails = [] + new_runfails = [] + new_passes_compfails = [] + new_passes_runfails = [] +# Open file fail_db.txt + f = open(test_states, 'r') + f_lines = f.readlines() + f.close() +# Detect OS + if platform.system() == 'Windows' or 'CYGWIN_NT' in platform.system(): + OS = "Windows" + else: + if platform.system() == 'Darwin': + OS = "Mac" + else: + OS = "Linux" +# Detect opt_set + if options.no_opt == True: + opt = "-O0" + else: + opt = "-O2" +# Detect LLVM version + temp1 = common.detect_version(ispc_exe + " --version") + llvm_version = temp1[-10:-2] +#Detect compiler version + if is_windows == False: + temp1 = common.detect_version(options.compiler_exe + " --version") + temp2 = temp1.split(" ") + compiler_version = temp2[0] + temp2[2][0:4] + else: + compiler_version = "cl" + new_line = " "+options.arch.rjust(6)+" "+options.target.rjust(14)+" "+OS.rjust(7)+" "+llvm_version+" "+compiler_version.rjust(10)+" "+opt+" *\n" + + new_compfails = compfails[:] + new_runfails = runfails[:] + new_f_lines = f_lines[:] + for j in range(0, len(f_lines)): + if (((" "+options.arch+" ") in f_lines[j]) and + ((" "+options.target+" ") in f_lines[j]) and + ((" "+OS+" ") in f_lines[j]) and + ((" "+llvm_version+" ") in f_lines[j]) and + ((" "+compiler_version+" ") in f_lines[j]) and + ((" "+opt+" ") in f_lines[j])): + if (" compfail " in f_lines[j]): + f = 0 + for i in range(0, len(compfails)): + if compfails[i] in f_lines[j]: + new_compfails.remove(compfails[i]) + else: + f = f + 1 + if f == len(compfails): + temp3 = f_lines[j].split(" ") + new_passes_compfails.append(temp3[0]) + if options.update == "FP": + new_f_lines.remove(f_lines[j]) + if (" runfail " in f_lines[j]): + f = 0 + for i in range(0, len(runfails)): + if runfails[i] in f_lines[j]: + new_runfails.remove(runfails[i]) + else: + f = f + 1 + if f == len(runfails): + temp3 = f_lines[j].split(" ") + new_passes_runfails.append(temp3[0]) + if options.update == "FP": + new_f_lines.remove(f_lines[j]) + if len(new_runfails) != 0: + print_debug("NEW RUNFAILS:\n", s, run_tests_log) + for i in range (0,len(new_runfails)): + new_f_lines.append(new_runfails[i] + " runfail " + new_line) + print_debug("\t" + new_runfails[i] + "\n", s, run_tests_log) + if len(new_compfails) != 0: + print_debug("NEW COMPFAILS:\n", s, run_tests_log) + for i in range (0,len(new_compfails)): + new_f_lines.append(new_compfails[i] + " compfail " + new_line) + print_debug("\t" + new_compfails[i] + "\n", s, run_tests_log) + if len(new_passes_runfails) != 0: + print_debug("NEW PASSES after RUNFAILS:\n", s, run_tests_log) + for i in range (0,len(new_passes_runfails)): + print_debug("\t" + new_passes_runfails[i] + "\n", s, run_tests_log) 
+ if len(new_passes_compfails) != 0: + print_debug("NEW PASSES after COMPFAILS:\n", s, run_tests_log) + for i in range (0,len(new_passes_compfails)): + print_debug("\t" + new_passes_compfails[i] + "\n", s, run_tests_log) + + if options.update != "": + output = open(test_states, 'w') + output.writelines(new_f_lines) + output.close() + return [new_runfails, new_compfails, new_passes_runfails, new_passes_compfails, new_line, errors] + +def verify(): + # Open file fail_db.txt + f = open(test_states, 'r') + f_lines = f.readlines() + f.close() + check = [["g++", "clang", "cl"],["-O0", "-O2"],["x86","x86-64"], + ["Linux","Windows","Mac"],["LLVM 3.1","LLVM 3.2","LLVM 3.3","LLVM head"], + ["sse2-i32x4", "sse2-i32x8", "sse4-i32x4", "sse4-i32x8", "sse4-i16x8", + "sse4-i8x16", "avx1-i32x8", "avx1-i32x16", "avx1.1-i32x8", "avx1.1-i32x16", + "avx2-i32x8", "avx2-i32x16", "generic-1", "generic-4", "generic-8", + "generic-16", "generic-32", "generic-64"]] + for i in range (0,len(f_lines)): + if f_lines[i][0] == "%": + continue + for j in range(0,len(check)): + temp = 0 + for t in range(0,len(check[j])): + if " " + check[j][t] + " " in f_lines[i]: + temp = temp + 1 + if temp != 1: + print_debug("error in line " + str(i) + "\n", False, run_tests_log) + break + + +def run_tests(options1, args, print_version): + global options + options = options1 + global s + s = options.silent + + # prepare run_tests_log and test_states files + global run_tests_log + if options.in_file: + run_tests_log = os.getcwd() + os.sep + options.in_file + if print_version == 1: + common.remove_if_exists(run_tests_log) + else: + run_tests_log = "" + global test_states + test_states = "fail_db.txt" + if options.verify: + verify() + return 0 + + # disable fancy error/warning printing with ANSI colors, so grepping for error + # messages doesn't get confused + os.environ["TERM"] = "dumb" + + # This script is affected by http://bugs.python.org/issue5261 on OSX 10.5 Leopard + # git history has a workaround for that issue. + global is_windows + is_windows = (platform.system() == 'Windows' or + 'CYGWIN_NT' in platform.system()) + + if options.target == 'neon': + options.arch = 'arm' + + # use relative path to not depend on host directory, which may possibly + # have white spaces and unicode characters. 
+ global ispc_exe + if not is_windows: + ispc_exe = "./ispc" + else: + ispc_exe = ".\\Release\\ispc.exe" + + # checks the required ispc compiler otherwise prints an error message + if not os.path.exists(ispc_exe): + error("missing ispc compiler: %s\n" % ispc_exe, 1) + ispc_exe += " " + options.ispc_flags + print_debug("ispc compiler: %s\n" % ispc_exe, s, run_tests_log) + + global is_generic_target + is_generic_target = (options.target.find("generic-") != -1 and + options.target != "generic-1" and options.target != "generic-x1") + if is_generic_target and options.include_file == None: + if options.target == "generic-4" or options.target == "generic-x4": + error("No generics #include specified; using examples/intrinsics/sse4.h\n", 2) + options.include_file = "examples/intrinsics/sse4.h" + options.target = "generic-4" + elif options.target == "generic-8" or options.target == "generic-x8": + error("No generics #include specified and no default available for \"generic-8\" target.\n", 1) + options.target = "generic-8" + elif options.target == "generic-16" or options.target == "generic-x16": + error("No generics #include specified; using examples/intrinsics/generic-16.h\n", 2) + options.include_file = "examples/intrinsics/generic-16.h" + options.target = "generic-16" + elif options.target == "generic-32" or options.target == "generic-x32": + error("No generics #include specified; using examples/intrinsics/generic-32.h\n", 2) + options.include_file = "examples/intrinsics/generic-32.h" + options.target = "generic-32" + elif options.target == "generic-64" or options.target == "generic-x64": + error("No generics #include specified; using examples/intrinsics/generic-64.h\n", 2) + options.include_file = "examples/intrinsics/generic-64.h" + options.target = "generic-64" + + if options.compiler_exe == None: + if is_windows: + options.compiler_exe = "cl.exe" + else: + options.compiler_exe = "g++" + + # checks the required compiler otherwise prints an error message + PATH_dir = string.split(os.getenv("PATH"), os.pathsep) + compiler_exists = False + + for counter in PATH_dir: + if os.path.exists(counter + os.sep + options.compiler_exe): + compiler_exists = True + break + + if not compiler_exists: + error("missing the required compiler: %s \n" % options.compiler_exe, 1) + + # print compilers versions + if print_version > 0: + common.print_version(ispc_exe, "", options.compiler_exe, False, run_tests_log, is_windows) + + ispc_root = "." + + # if no specific test files are specified, run all of the tests in tests/, + # failing_tests/, and tests_errors/ + if len(args) == 0: + files = glob.glob(ispc_root + os.sep + "tests" + os.sep + "*ispc") + \ + glob.glob(ispc_root + os.sep + "failing_tests" + os.sep + "*ispc") + \ + glob.glob(ispc_root + os.sep + "tests_errors" + os.sep + "*ispc") + else: + if is_windows: + argfiles = [ ] + for f in args: + # we have to glob ourselves if this is being run under a DOS + # shell, as it passes wildcard as is. + argfiles += glob.glob(f) + else: + argfiles = args + + files = [ ] + for f in argfiles: + if os.path.splitext(string.lower(f))[1] != ".ispc": + error("Ignoring file %s, which doesn't have an .ispc extension.\n" % f, 2) + else: + files += [ f ] + + # max_test_length is used to issue exact number of whitespace characters when + # updating status. Otherwise update causes new lines standard 80 char terminal + # on both Linux and Windows. 
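The comment above is the motivation for max_test_length: the interactive status line is rewritten in place, so each update is padded to a fixed width so that a short test name fully overwrites a longer one already on the terminal line. For illustration only (this is not the patch's code), the trick looks roughly like this:

    import sys

    def show_status(name, done, total, width):
        # pad to a fixed width so a short update fully overwrites a longer one
        line = "[%d/%d] %s" % (done, total, name)
        sys.stdout.write("\r" + line.ljust(width))
        sys.stdout.flush()

    if __name__ == "__main__":
        tests = ["tests/a.ispc", "tests/a-much-longer-name.ispc", "tests/b.ispc"]
        width = max(len(t) for t in tests) + 16
        for i, t in enumerate(tests):
            show_status(t, i + 1, len(tests), width)
        sys.stdout.write("\n")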
+ max_test_length = 0 + for f in files: + max_test_length = max(max_test_length, len(f)) + + # randomly shuffle the tests if asked to do so + if (options.random): + random.seed() + random.shuffle(files) + + # counter total_tests = len(files) compile_error_files = [ ] @@ -463,7 +597,7 @@ if __name__ == '__main__': nthreads = min(multiprocessing.cpu_count(), options.num_jobs) nthreads = min(nthreads, len(files)) - sys.stdout.write("Running %d jobs in parallel. Running %d tests.\n" % (nthreads, total_tests)) + print_debug("Running %d jobs in parallel. Running %d tests.\n" % (nthreads, total_tests), s, run_tests_log) # put each of the test filenames into a queue q = multiprocessing.Queue() @@ -483,8 +617,10 @@ if __name__ == '__main__': start_time = time.time() # launch jobs to run tests + glob_var = [is_windows, options, s, ispc_exe, is_generic_target, run_tests_log] for x in range(nthreads): - t = multiprocessing.Process(target=run_tasks_from_queue, args=(q, qret, qskip, total_tests, max_test_length, finished_tests_counter, finished_tests_counter_lock)) + t = multiprocessing.Process(target=run_tasks_from_queue, args=(q, qret, qskip, total_tests, + max_test_length, finished_tests_counter, finished_tests_counter_lock, glob_var)) task_threads.append(t) t.start() @@ -493,35 +629,97 @@ if __name__ == '__main__': for t in task_threads: t.join() if options.non_interactive == False: - sys.stdout.write("\n") + print_debug("\n", s, run_tests_log) elapsed_time = time.time() - start_time while not qret.empty(): - (c, r, s) = qret.get() + (c, r, skip) = qret.get() compile_error_files += c run_error_files += r - skip_files += s + skip_files += skip if options.non_interactive: - sys.stdout.write(" Done %d / %d\n" % (finished_tests_counter.value, total_tests)) + print_debug(" Done %d / %d\n" % (finished_tests_counter.value, total_tests), s, run_tests_log) if len(skip_files) > 0: skip_files.sort() - sys.stdout.write("%d / %d tests SKIPPED:\n" % (len(skip_files), total_tests)) + print_debug("%d / %d tests SKIPPED:\n" % (len(skip_files), total_tests), s, run_tests_log) for f in skip_files: - sys.stdout.write("\t%s\n" % f) + print_debug("\t%s\n" % f, s, run_tests_log) if len(compile_error_files) > 0: compile_error_files.sort() - sys.stdout.write("%d / %d tests FAILED compilation:\n" % (len(compile_error_files), total_tests)) + print_debug("%d / %d tests FAILED compilation:\n" % (len(compile_error_files), total_tests), s, run_tests_log) for f in compile_error_files: - sys.stdout.write("\t%s\n" % f) + print_debug("\t%s\n" % f, s, run_tests_log) if len(run_error_files) > 0: run_error_files.sort() - sys.stdout.write("%d / %d tests FAILED execution:\n" % (len(run_error_files), total_tests)) + print_debug("%d / %d tests FAILED execution:\n" % (len(run_error_files), total_tests), s, run_tests_log) for f in run_error_files: - sys.stdout.write("\t%s\n" % f) + print_debug("\t%s\n" % f, s, run_tests_log) + + R = file_check(compile_error_files, run_error_files) if options.time: - sys.stdout.write("Elapsed time: %d s\n" % elapsed_time) + print_debug("Elapsed time: %d s\n" % elapsed_time, s, run_tests_log) - sys.exit(len(compile_error_files) + len(run_error_files)) + return R + + +from optparse import OptionParser +import multiprocessing +from ctypes import c_int +import os +import sys +import glob +import re +import signal +import random +import string +import subprocess +import shlex +import platform +import tempfile +import os.path +import time +# our functions +import common +print_debug = common.print_debug +error = 
common.error + +if __name__ == "__main__": + parser = OptionParser() + parser.add_option("-r", "--random-shuffle", dest="random", help="Randomly order tests", + default=False, action="store_true") + parser.add_option("-g", "--generics-include", dest="include_file", help="Filename for header implementing functions for generics", + default=None) + parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", + default="") + parser.add_option('-t', '--target', dest='target', + help='Set compilation target (sse2-i32x4, sse2-i32x8, sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, avx1-i32x8, avx1-i32x16, avx1.1-i32x8, avx1.1-i32x16, avx2-i32x8, avx2-i32x16, generic-x1, generic-x4, generic-x8, generic-x16, generic-x32, generic-x64)', + default="sse4") + parser.add_option('-a', '--arch', dest='arch', + help='Set architecture (arm, x86, x86-64)', + default="x86-64") + parser.add_option("-c", "--compiler", dest="compiler_exe", help="C/C++ compiler binary to use to run tests", + default=None) + parser.add_option('-o', '--no-opt', dest='no_opt', help='Disable optimization', + default=False, action="store_true") + parser.add_option('-j', '--jobs', dest='num_jobs', help='Maximum number of jobs to run in parallel', + default="1024", type="int") + parser.add_option('-v', '--verbose', dest='verbose', help='Enable verbose output', + default=False, action="store_true") + parser.add_option('--wrap-exe', dest='wrapexe', + help='Executable to wrap test runs with (e.g. "valgrind")', + default="") + parser.add_option('--time', dest='time', help='Enable time output', + default=False, action="store_true") + parser.add_option('--non-interactive', dest='non_interactive', help='Disable interactive status updates', + default=False, action="store_true") + parser.add_option('-u', "--update", dest='update', help='Update file with fails (F of FP)', default="") + parser.add_option('-s', "--silent", dest='silent', help='enable silent mode without any output', default=False, + action = "store_true") + parser.add_option("--file", dest='in_file', help='file to save run_tests output', default="") + parser.add_option("--verify", dest='verify', help='verify the file fail_db.txt', default=False, action="store_true") + (options, args) = parser.parse_args() + L = run_tests(options, args, 1) + exit(0) From f45f6cb32a390d834e53037751365cd1932929e3 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 17 Sep 2013 23:36:16 +0400 Subject: [PATCH 083/124] Test, documentation and vim support for double precision constants --- contrib/ispc.vim | 5 +++++ docs/ispc.rst | 11 ++++++++++- tests/double-consts.ispc | 23 +++++++++++++++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 tests/double-consts.ispc diff --git a/contrib/ispc.vim b/contrib/ispc.vim index cc8493f0..4d870dcd 100644 --- a/contrib/ispc.vim +++ b/contrib/ispc.vim @@ -19,6 +19,11 @@ syn keyword ispcRepeat cdo cfor cwhile syn keyword ispcBuiltin programCount programIndex syn keyword ispcType export uniform varying int8 int16 int32 int64 +"double precision floating point number, with dot, optional exponent +syn match cFloat display contained "\d\+\.\d*d[-+]\=\d*\>" +"double precision floating point number, without dot, with exponent +syn match cFloat display contained "\d\+d[-+]\=\d\+\>" + " Default highlighting command -nargs=+ HiLink hi def link HiLink ispcStatement Statement diff --git a/docs/ispc.rst b/docs/ispc.rst index ff07f6d8..224faaa9 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -270,6 
+270,14 @@ new reserved words: ``unmasked``, ``foreach_unique``, ``foreach_active``, and
 ``in``. Any program that happens to have a variable or function with one of
 these names must be modified to rename that symbol.
 
+Updating ISPC Programs For Changes In ISPC 1.4.5
+------------------------------------------------
+
+This release adds support for double precision floating point constants.
+Double precision floating point constants are floating point numbers with
+a ``d`` suffix and an optional exponent part. Here are some examples: 3.14d,
+31.4d-1, 1.d, 1.0d, 1d-2. Note that a floating point number without a suffix
+is treated as a single precision constant.
 
 Getting Started with ISPC
 =========================
@@ -1349,7 +1357,8 @@ but are likely to be supported in future releases:
 
 * Bitfield members of ``struct`` types
 * Variable numbers of arguments to functions
 * Literal floating-point constants (even without a ``f`` suffix) are
-  currently treated as being ``float`` type, not ``double``
+  currently treated as being ``float`` type, not ``double``. To have a double
+  precision floating point constant, use the ``d`` suffix.
 * The ``volatile`` qualifier
 * The ``register`` storage class for variables. (Will be ignored).
diff --git a/tests/double-consts.ispc b/tests/double-consts.ispc
new file mode 100644
index 00000000..3259156a
--- /dev/null
+++ b/tests/double-consts.ispc
@@ -0,0 +1,23 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    double a = aFOO[programIndex];
+    // Test parsing of double constants.
+    double d1 = 1.0d40;
+    double d2 = 1.d40;
+    double d3 = 1d40;
+    double d4 = 10000000000000000000000000000000000000000.d;
+    double d5 = 10000000000000000000000000000000000000000.0d;
+
+    // All the constants should be equal and if it's evaluated as "float",
+    // then sqrt will evaluate to +inf.
+ if (d1 == d2 && d1 == d3 && d1 == d4 && d1 == d5 && + ((float)sqrt(d1)) < 2e20) { + RET[programIndex] = a; + } +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} From 1c527ae34cf7c257f8deaf0261af447b238cab56 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Wed, 18 Sep 2013 11:48:24 +0400 Subject: [PATCH 084/124] Adding tests and vim support for double constant of the form .1d41 --- contrib/ispc.vim | 2 ++ tests/double-consts.ispc | 7 ++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/contrib/ispc.vim b/contrib/ispc.vim index 4d870dcd..f3cb413b 100644 --- a/contrib/ispc.vim +++ b/contrib/ispc.vim @@ -21,6 +21,8 @@ syn keyword ispcType export uniform varying int8 int16 int32 int64 "double precision floating point number, with dot, optional exponent syn match cFloat display contained "\d\+\.\d*d[-+]\=\d*\>" +"double precision floating point number, starting with dot, optional exponent +syn match cFloat display contained ".\d*d[-+]\=\d*\>" "double precision floating point number, without dot, with exponent syn match cFloat display contained "\d\+d[-+]\=\d\+\>" diff --git a/tests/double-consts.ispc b/tests/double-consts.ispc index 3259156a..4096aa1c 100644 --- a/tests/double-consts.ispc +++ b/tests/double-consts.ispc @@ -7,12 +7,13 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { double d1 = 1.0d40; double d2 = 1.d40; double d3 = 1d40; - double d4 = 10000000000000000000000000000000000000000.d; - double d5 = 10000000000000000000000000000000000000000.0d; + double d4 = .1d41; + double d5 = 10000000000000000000000000000000000000000.d; + double d6 = 10000000000000000000000000000000000000000.0d; // All the constants should be equal and if it's evaluated as "float", // then sqrt will evaluate to +inf. 
- if (d1 == d2 && d1 == d3 && d1 == d4 && d1 == d5 && + if (d1 == d2 && d1 == d3 && d1 == d4 && d1 == d5 && d1 == d6 ((float)sqrt(d1)) < 2e20) { RET[programIndex] = a; } From bb8f7d4e3f2a226a8f4b7b7ae2de6fce7d609791 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Thu, 19 Sep 2013 14:37:26 +0400 Subject: [PATCH 085/124] removing LLVM 3.1 and 3.2 from default testing --- alloy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alloy.py b/alloy.py index 67f534ca..06025324 100755 --- a/alloy.py +++ b/alloy.py @@ -367,7 +367,7 @@ def validation_run(only, only_targets, reference_branch, notify, update): if len(archs) == 0: archs = ["x86", "x86-64"] if len(LLVM) == 0: - LLVM = ["3.1", "3.2", "3.3", "head"] + LLVM = ["3.3", "head"] gen_archs = ["x86-64"] need_LLVM = check_LLVM(LLVM) for i in range(0,len(need_LLVM)): From 6a21218c13aa14666d11150c265f542afd79818e Mon Sep 17 00:00:00 2001 From: evghenii Date: Thu, 19 Sep 2013 13:45:31 +0300 Subject: [PATCH 086/124] fix warrning and add KNC 1 --- examples/intrinsics/knc-i1x16.h | 4 ++-- examples/intrinsics/knc-i1x8.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index b7d3a7f1..c535e61a 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -45,13 +45,13 @@ #define roundf(x) (floorf(x + .5f)) #define round(x) (floor(x + .5)) #else -#define FORCEINLINE __attribute__((always_inline)) +#define FORCEINLINE __forceinline #define PRE_ALIGN(x) #define POST_ALIGN(x) __attribute__ ((aligned(x))) #endif -#if 0 #define KNC 1 +#if 0 extern "C" { int printf(const unsigned char *, ...); diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h index de9bddcc..573d232c 100644 --- a/examples/intrinsics/knc-i1x8.h +++ b/examples/intrinsics/knc-i1x8.h @@ -50,13 +50,13 @@ #define roundf(x) (floorf(x + .5f)) #define round(x) (floor(x + .5)) #else -#define FORCEINLINE __attribute__((always_inline)) +#define FORCEINLINE __forceinline #define PRE_ALIGN(x) #define POST_ALIGN(x) __attribute__ ((aligned(x))) #endif -#if 0 #define KNC 1 +#if 0 extern "C" { int printf(const unsigned char *, ...); From 43245bbc118c1b415c9c538c98555fc110ad1f3c Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Wed, 18 Sep 2013 14:24:46 +0400 Subject: [PATCH 087/124] Adding check for OS AVX support to auto-dispatch code --- builtins/dispatch.ll | 81 +++++++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 32 deletions(-) diff --git a/builtins/dispatch.ll b/builtins/dispatch.ll index f1d5a969..ba216df7 100644 --- a/builtins/dispatch.ll +++ b/builtins/dispatch.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2011, Intel Corporation +;; Copyright (c) 2011-2013, Intel Corporation ;; All rights reserved. ;; ;; Redistribution and use in source and binary forms, with or without @@ -41,15 +41,13 @@ @__system_best_isa = internal global i32 -1 -declare void @abort() noreturn - ;; The below is the result of running "clang -O2 -emit-llvm -c -o -" on the ;; following code... Specifically, __get_system_isa should return a value ;; corresponding to one of the Target::ISA enumerant values that gives the ;; most capable ISA that the curremt system can run. 
;; -;; Note: clang from LLVM 3.0 should be used if this is updated, for maximum -;; backwards compatibility for anyone building ispc with LLVM 3.0 +;; Note: clang from LLVM 3.1 should be used if this is updated, for maximum +;; backwards compatibility for anyone building ispc with LLVM 3.1 ;; ;; #include ;; #include @@ -60,7 +58,7 @@ declare void @abort() noreturn ;; : "0" (infoType)); ;; } ;; -;; /* Save %ebx in case it's the PIC register */ +;; // Save %ebx in case it's the PIC register. ;; static void __cpuid_count(int info[4], int level, int count) { ;; __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t" ;; "cpuid\n\t" @@ -69,13 +67,23 @@ declare void @abort() noreturn ;; : "0" (level), "2" (count)); ;; } ;; +;; static int __os_has_avx_support() { +;; // Check xgetbv; this uses a .byte sequence instead of the instruction +;; // directly because older assemblers do not include support for xgetbv and +;; // there is no easy way to conditionally compile based on the assembler used. +;; int rEAX, rEDX; +;; __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0)); +;; return (rEAX & 6) == 6; +;; } +;; ;; int32_t __get_system_isa() { ;; int info[4]; ;; __cpuid(info, 1); ;; -;; /* NOTE: the values returned below must be the same as the -;; corresponding enumerant values in Target::ISA. */ -;; if ((info[2] & (1 << 28)) != 0) { +;; // NOTE: the values returned below must be the same as the +;; // corresponding enumerant values in Target::ISA. +;; if ((info[2] & (1 << 28)) != 0 && +;; __os_has_avx_support()) { ;; if ((info[2] & (1 << 29)) != 0 && // F16C ;; (info[2] & (1 << 30)) != 0) { // RDRAND ;; // So far, so good. AVX2? @@ -98,47 +106,56 @@ declare void @abort() noreturn ;; abort(); ;; } -define i32 @__get_system_isa() nounwind uwtable ssp { +define i32 @__get_system_isa() nounwind uwtable { entry: %0 = tail call { i32, i32, i32, i32 } asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind %asmresult5.i = extractvalue { i32, i32, i32, i32 } %0, 2 %asmresult6.i = extractvalue { i32, i32, i32, i32 } %0, 3 %and = and i32 %asmresult5.i, 268435456 %cmp = icmp eq i32 %and, 0 - br i1 %cmp, label %if.else13, label %if.then + br i1 %cmp, label %if.else14, label %land.lhs.true -if.then: ; preds = %entry - %1 = and i32 %asmresult5.i, 1610612736 - %2 = icmp eq i32 %1, 1610612736 - br i1 %2, label %if.then7, label %return +land.lhs.true: ; preds = %entry + %1 = tail call { i32, i32 } asm sideeffect ".byte 0x0f, 0x01, 0xd0", "={ax},={dx},{cx},~{dirflag},~{fpsr},~{flags}"(i32 0) nounwind + %asmresult.i25 = extractvalue { i32, i32 } %1, 0 + %and.i = and i32 %asmresult.i25, 6 + %cmp.i = icmp eq i32 %and.i, 6 + br i1 %cmp.i, label %if.then, label %if.else14 -if.then7: ; preds = %if.then - %3 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind - %asmresult4.i28 = extractvalue { i32, i32, i32, i32 } %3, 1 - %and10 = lshr i32 %asmresult4.i28, 5 - %4 = and i32 %and10, 1 - %5 = add i32 %4, 3 +if.then: ; preds = %land.lhs.true + %2 = and i32 %asmresult5.i, 1610612736 + %3 = icmp eq i32 %2, 1610612736 + br i1 %3, label %if.then8, label %return + +if.then8: ; preds = %if.then + %4 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind + %asmresult4.i30 = 
extractvalue { i32, i32, i32, i32 } %4, 1 + %and11 = lshr i32 %asmresult4.i30, 5 + %5 = and i32 %and11, 1 + %6 = add i32 %5, 3 br label %return -if.else13: ; preds = %entry - %and15 = and i32 %asmresult5.i, 524288 - %cmp16 = icmp eq i32 %and15, 0 - br i1 %cmp16, label %if.else18, label %return +if.else14: ; preds = %land.lhs.true, %entry + %and16 = and i32 %asmresult5.i, 524288 + %cmp17 = icmp eq i32 %and16, 0 + br i1 %cmp17, label %if.else19, label %return -if.else18: ; preds = %if.else13 - %and20 = and i32 %asmresult6.i, 67108864 - %cmp21 = icmp eq i32 %and20, 0 - br i1 %cmp21, label %if.else23, label %return +if.else19: ; preds = %if.else14 + %and21 = and i32 %asmresult6.i, 67108864 + %cmp22 = icmp eq i32 %and21, 0 + br i1 %cmp22, label %if.else24, label %return -if.else23: ; preds = %if.else18 +if.else24: ; preds = %if.else19 tail call void @abort() noreturn nounwind unreachable -return: ; preds = %if.else18, %if.else13, %if.then7, %if.then - %retval.0 = phi i32 [ %5, %if.then7 ], [ 2, %if.then ], [ 1, %if.else13 ], [ 0, %if.else18 ] +return: ; preds = %if.else19, %if.else14, %if.then8, %if.then + %retval.0 = phi i32 [ %6, %if.then8 ], [ 2, %if.then ], [ 1, %if.else14 ], [ 0, %if.else19 ] ret i32 %retval.0 } +declare void @abort() noreturn nounwind + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; This function is called by each of the dispatch functions we generate; ;; it sets @__system_best_isa if it is unset. From dbef4fd7d7d270e350f8af26f76846ba24deb8a0 Mon Sep 17 00:00:00 2001 From: evghenii Date: Thu, 19 Sep 2013 14:52:22 +0300 Subject: [PATCH 088/124] fixed notation --- examples/intrinsics/knc-i1x8.h | 17 ++++++----------- examples/intrinsics/knc-i1x8unsafe_fast.h | 2 +- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h index 573d232c..c17b7238 100644 --- a/examples/intrinsics/knc-i1x8.h +++ b/examples/intrinsics/knc-i1x8.h @@ -38,11 +38,6 @@ #include #include -#if 0 -#define __ZMM32BIT__ -#endif - - #ifdef _MSC_VER #define FORCEINLINE __forceinline #define PRE_ALIGN(x) /*__declspec(align(x))*/ @@ -110,7 +105,7 @@ struct vec8 { /****************/ -#ifndef __ZMM32BIT__ +#ifndef __ZMM64BIT__ struct PRE_ALIGN(32) __vec8_i32 : public vec8 { __vec8_i32() { } FORCEINLINE __vec8_i32(int32_t v0, int32_t v1, int32_t v2, int32_t v3, @@ -135,7 +130,7 @@ struct PRE_ALIGN(32) __vec8_i32 : public vec8 { data[7],data[6],data[5],data[4],data[3],data[2],data[1],data[0]); } } POST_ALIGN(32); -#else /* __ZMM32BIT__ */ +#else /* __ZMM64BIT__ */ struct PRE_ALIGN(32) __vec8_i32 { __m512i v; @@ -150,9 +145,9 @@ struct PRE_ALIGN(32) __vec8_i32 FORCEINLINE const int32_t& operator[](const int i) const { return ((int32_t*)this)[i]; } FORCEINLINE int32_t& operator[](const int i) { return ((int32_t*)this)[i]; } } POST_ALIGN(32); -#endif /* __ZMM32BIT__ */ +#endif /* __ZMM64BIT__ */ -#ifndef __ZMM32BIT__ /* __ZMM32BIT__ */ +#ifndef __ZMM64BIT__ /* __ZMM64BIT__ */ PRE_ALIGN(32) struct __vec8_f : public vec8 { __vec8_f() { } FORCEINLINE __vec8_f(float v0, float v1, float v2, float v3, @@ -177,7 +172,7 @@ PRE_ALIGN(32) struct __vec8_f : public vec8 { data[7] = val.s[7]; } } POST_ALIGN(32); -#else /* __ZMM32BIT__ */ +#else /* __ZMM64BIT__ */ PRE_ALIGN(32) struct __vec8_f { __m512 v; @@ -192,7 +187,7 @@ PRE_ALIGN(32) struct __vec8_f FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; } FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } } POST_ALIGN(32); 
-#endif /* __ZMM32BIT__ */ +#endif /* __ZMM64BIT__ */ struct PRE_ALIGN(64) __vec8_d { diff --git a/examples/intrinsics/knc-i1x8unsafe_fast.h b/examples/intrinsics/knc-i1x8unsafe_fast.h index ce66ea11..2e00a567 100644 --- a/examples/intrinsics/knc-i1x8unsafe_fast.h +++ b/examples/intrinsics/knc-i1x8unsafe_fast.h @@ -1,4 +1,4 @@ -#define __ZMM32BIT__ +#define __ZMM64BIT__ #include "knc-i1x8.h" /* the following tests fails because vec8_i32 and vec8_float are 512 and not 256 bit in size. From 0c274212c2104a4547018fd3be31f33e153b82d3 Mon Sep 17 00:00:00 2001 From: evghenii Date: Thu, 19 Sep 2013 16:07:22 +0300 Subject: [PATCH 089/124] performance tuning for knc-i1x8.h. this gives goed enough performance for double only. float performance is terrible --- examples/intrinsics/knc-i1x8.h | 167 +++++++++++++-------------------- 1 file changed, 64 insertions(+), 103 deletions(-) diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h index c17b7238..d7696117 100644 --- a/examples/intrinsics/knc-i1x8.h +++ b/examples/intrinsics/knc-i1x8.h @@ -73,9 +73,9 @@ typedef int64_t __vec1_i64; struct __vec8_i1 { __vec8_i1() { } - __vec8_i1(const __mmask16 &vv) : v(vv) { } + __vec8_i1(const __mmask8 &vv) : v(vv) { } __vec8_i1(bool v0, bool v1, bool v2, bool v3, - bool v4, bool v5, bool v6, bool v7) { + bool v4, bool v5, bool v6, bool v7) { v = ((v0 & 1) | ((v1 & 1) << 1) | ((v2 & 1) << 2) | @@ -87,7 +87,7 @@ struct __vec8_i1 { } __mmask8 v; - FORCEINLINE operator __mmask8() const { return v; } + FORCEINLINE operator __mmask8() const { return v; }//0xFF & v; } }; @@ -105,89 +105,66 @@ struct vec8 { /****************/ -#ifndef __ZMM64BIT__ -struct PRE_ALIGN(32) __vec8_i32 : public vec8 { +struct PRE_ALIGN(32) __vec8_i32 +{ +#ifdef __ZMM64BIT__ + __m512i _data; + FORCEINLINE __vec8_i32(const __m512i &in) : _data(in) {} + FORCEINLINE operator __m512i() const { return _data; } +#else /* __ZMM64BIT__ */ + typedef int32_t _v8si __attribute__((vector_size(32))); + _v8si _data; + FORCEINLINE __vec8_i32(const __m512i &in) + { + _mm512_mask_extpackstorelo_epi32((__m512i*)&_data, 0xFF, in, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + } + FORCEINLINE operator __m512i() const + { + return _mm512_extloadunpacklo_epi32(_mm512_setzero_epi32(), (uint8_t*)&_data, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + } +#endif /* __ZMM64BIT__ */ + __vec8_i32() { } FORCEINLINE __vec8_i32(int32_t v0, int32_t v1, int32_t v2, int32_t v3, - int32_t v4, int32_t v5, int32_t v6, int32_t v7) - : vec8(v0, v1, v2, v3, v4, v5, v6, v7) { } - FORCEINLINE __vec8_i32(__m512i v) + int32_t v4, int32_t v5, int32_t v6, int32_t v7) { - union { __m512i v; int32_t s[8]; } val = {v}; - data[0] = val.s[0]; - data[1] = val.s[1]; - data[2] = val.s[2]; - data[3] = val.s[3]; - data[4] = val.s[4]; - data[5] = val.s[5]; - data[6] = val.s[6]; - data[7] = val.s[7]; + const __m512i v = _mm512_set_16to16_pi(0,0,0,0,0,0,0,0, v7, v6, v5, v4, v3, v2, v1, v0); + *this = __vec8_i32(v); } - FORCEINLINE operator __m512i() const - { - return _mm512_set_16to16_pi( - 0,0,0,0, 0,0,0,0, - data[7],data[6],data[5],data[4],data[3],data[2],data[1],data[0]); - } -} POST_ALIGN(32); -#else /* __ZMM64BIT__ */ -struct PRE_ALIGN(32) __vec8_i32 -{ - __m512i v; - FORCEINLINE operator __m512i() const { return v; } - FORCEINLINE __vec8_i32() : v(_mm512_undefined_epi32()) {} - FORCEINLINE __vec8_i32(const __m512i &in) : v(in) {} - FORCEINLINE __vec8_i32(const __vec8_i32 &o) : v(o.v) {} - FORCEINLINE __vec8_i32& operator =(const __vec8_i32 &o) { v=o.v; return *this; } - FORCEINLINE 
__vec8_i32(int32_t v00, int32_t v01, int32_t v02, int32_t v03, - int32_t v04, int32_t v05, int32_t v06, int32_t v07) : - v ( _mm512_set_16to16_pi(0,0,0,0,0,0,0,0, v07, v06, v05, v04, v03, v02, v01, v00) ) {} - FORCEINLINE const int32_t& operator[](const int i) const { return ((int32_t*)this)[i]; } - FORCEINLINE int32_t& operator[](const int i) { return ((int32_t*)this)[i]; } -} POST_ALIGN(32); -#endif /* __ZMM64BIT__ */ -#ifndef __ZMM64BIT__ /* __ZMM64BIT__ */ -PRE_ALIGN(32) struct __vec8_f : public vec8 { - __vec8_f() { } - FORCEINLINE __vec8_f(float v0, float v1, float v2, float v3, - float v4, float v5, float v6, float v7) - : vec8(v0, v1, v2, v3, v4, v5, v6, v7) { } - FORCEINLINE operator __m512() const - { - return _mm512_set_16to16_ps( - 0,0,0,0,0,0,0,0, - data[7],data[6],data[5],data[4],data[3],data[2],data[1],data[0]); - } - FORCEINLINE __vec8_f(__m512 v) - { - union { __m512 v; float s[8]; } val = {v}; - data[0] = val.s[0]; - data[1] = val.s[1]; - data[2] = val.s[2]; - data[3] = val.s[3]; - data[4] = val.s[4]; - data[5] = val.s[5]; - data[6] = val.s[6]; - data[7] = val.s[7]; - } + FORCEINLINE const int32_t& operator[](const int i) const { return ((int32_t*)this)[i]; } + FORCEINLINE int32_t& operator[](const int i) { return ((int32_t*)this)[i]; } } POST_ALIGN(32); -#else /* __ZMM64BIT__ */ + PRE_ALIGN(32) struct __vec8_f { - __m512 v; - FORCEINLINE operator __m512() const { return v; } - FORCEINLINE __vec8_f() : v(_mm512_undefined_ps()) { } - FORCEINLINE __vec8_f(const __m512 &in) : v(in) {} - FORCEINLINE __vec8_f(const __vec8_f &o) : v(o.v) {} - FORCEINLINE __vec8_f& operator =(const __vec8_f &o) { v=o.v; return *this; } - FORCEINLINE __vec8_f(float v00, float v01, float v02, float v03, - float v04, float v05, float v06, float v07) : - v ( _mm512_set_16to16_ps(0,0,0,0,0,0,0,0, v07, v06, v05, v04, v03, v02, v01, v00) ) {} - FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; } - FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } -} POST_ALIGN(32); +#ifdef __ZMM64BIT__ + __m512 _data; + FORCEINLINE __vec8_f(const __m512 &in) : _data(in) {} + FORCEINLINE operator __m512() const { return _data; } +#else /* __ZMM64BIT__ */ + typedef float _v8sf __attribute__((vector_size(32))); + _v8sf _data; + FORCEINLINE __vec8_f(const __m512 &in) + { + _mm512_mask_extpackstorelo_ps((__m512*)&_data, 0xFF, in, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + } + FORCEINLINE operator __m512() const + { + return _mm512_extloadunpacklo_ps(_mm512_setzero_ps(), (uint8_t*)&_data, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + } #endif /* __ZMM64BIT__ */ + FORCEINLINE __vec8_f() { } + FORCEINLINE __vec8_f(float v0, float v1, float v2, float v3, + float v4, float v5, float v6, float v7) + { + const __m512 v = _mm512_set_16to16_ps(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, v7, v6, v5, v4, v3, v2, v1, v0); + *this = __vec8_f(v); + } + + FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; } + FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } +} POST_ALIGN(32); struct PRE_ALIGN(64) __vec8_d { @@ -438,8 +415,8 @@ INSERT_EXTRACT(__vec1_d, double) /////////////////////////////////////////////////////////////////////////// // mask ops -static FORCEINLINE uint64_t __movmsk(__vec8_i1 mask) { - return (uint64_t)mask.v; +static FORCEINLINE __vec8_i1 __movmsk(__vec8_i1 mask) { + return mask.v; } static FORCEINLINE bool __any(__vec8_i1 mask) { @@ -455,52 +432,36 @@ static FORCEINLINE bool __none(__vec8_i1 mask) { } static FORCEINLINE __vec8_i1 
__equal_i1(__vec8_i1 a, __vec8_i1 b) { - __vec8_i1 r; - r.v = (a.v & b.v) | (~a.v & ~b.v); - return r; + return (a.v & b.v) | (~a.v & ~b.v); } static FORCEINLINE __vec8_i1 __and(__vec8_i1 a, __vec8_i1 b) { - __vec8_i1 r; - r.v = a.v & b.v; - return r; + return a.v & b.v; } static FORCEINLINE __vec8_i1 __xor(__vec8_i1 a, __vec8_i1 b) { - __vec8_i1 r; - r.v = a.v ^ b.v; - return r; + return a.v ^ b.v; } static FORCEINLINE __vec8_i1 __or(__vec8_i1 a, __vec8_i1 b) { - __vec8_i1 r; - r.v = a.v | b.v; - return r; + return a.v | b.v; } static FORCEINLINE __vec8_i1 __not(__vec8_i1 v) { - __vec8_i1 r; - r.v = ~v.v; - return r; + return ~v; } static FORCEINLINE __vec8_i1 __and_not1(__vec8_i1 a, __vec8_i1 b) { - __vec8_i1 r; - r.v = ~a.v & b.v; - return r; + return ~a.v & b.v; } static FORCEINLINE __vec8_i1 __and_not2(__vec8_i1 a, __vec8_i1 b) { - __vec8_i1 r; - r.v = a.v & ~b.v; - return r; + return a.v & ~b.v; } static FORCEINLINE __vec8_i1 __select(__vec8_i1 mask, __vec8_i1 a, __vec8_i1 b) { - __vec8_i1 r; - r.v = (a.v & mask.v) | (b.v & ~mask.v); - return r; + return (a.v & mask.v) | (b.v & ~mask.v); } static FORCEINLINE __vec8_i1 __select(bool cond, __vec8_i1 a, __vec8_i1 b) { From b2678b43388914e4eb94a9cd5845bfea16ae0e3e Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 19 Sep 2013 17:27:58 +0400 Subject: [PATCH 090/124] Typo fix is tests/double-consts.ispc --- tests/double-consts.ispc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/double-consts.ispc b/tests/double-consts.ispc index 4096aa1c..5f9a66d5 100644 --- a/tests/double-consts.ispc +++ b/tests/double-consts.ispc @@ -13,7 +13,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { // All the constants should be equal and if it's evaluated as "float", // then sqrt will evaluate to +inf. - if (d1 == d2 && d1 == d3 && d1 == d4 && d1 == d5 && d1 == d6 + if (d1 == d2 && d1 == d3 && d1 == d4 && d1 == d5 && d1 == d6 && ((float)sqrt(d1)) < 2e20) { RET[programIndex] = a; } From 0ed89e93fa309796867c0e8729c16dac0c27bbb8 Mon Sep 17 00:00:00 2001 From: evghenii Date: Thu, 19 Sep 2013 16:34:06 +0300 Subject: [PATCH 091/124] added fails info --- examples/intrinsics/knc-i1x8unsafe_fast.h | 103 +++++++++++++--------- 1 file changed, 60 insertions(+), 43 deletions(-) diff --git a/examples/intrinsics/knc-i1x8unsafe_fast.h b/examples/intrinsics/knc-i1x8unsafe_fast.h index 2e00a567..05be27bd 100644 --- a/examples/intrinsics/knc-i1x8unsafe_fast.h +++ b/examples/intrinsics/knc-i1x8unsafe_fast.h @@ -1,61 +1,78 @@ #define __ZMM64BIT__ #include "knc-i1x8.h" -/* the following tests fails because vec8_i32 and vec8_float are 512 and not 256 bit in size. - * not sure how it is possible to fix this, any suggestions? +/* the following tests fails because on KNC native vec8_i32 and vec8_float are 512 and not 256 bit in size. 
+ * + * Using test compiler: Intel(r) SPMD Program Compiler (ispc), 1.4.5dev (build commit d68dbbc7bce74803 @ 20130919, LLVM 3.3) + * Using C/C++ compiler: icpc (ICC) 14.0.0 20130728 + * + */ + +/* knc-i1x8unsafe_fast.h fails: + * ---------------------------- +1 / 1206 tests FAILED compilation: + ./tests/ptr-assign-lhs-math-1.ispc 33 / 1206 tests FAILED execution: - ./tests/array-gather-simple.ispc - ./tests/array-gather-vary.ispc - ./tests/array-multidim-gather-scatter.ispc - ./tests/array-scatter-vary.ispc - ./tests/atomics-5.ispc - ./tests/atomics-swap.ispc - ./tests/cfor-array-gather-vary.ispc - ./tests/cfor-gs-improve-varying-1.ispc - ./tests/cfor-struct-gather-2.ispc - ./tests/cfor-struct-gather-3.ispc - ./tests/cfor-struct-gather.ispc - ./tests/gather-struct-vector.ispc - ./tests/global-array-4.ispc - ./tests/gs-improve-varying-1.ispc - ./tests/half-1.ispc - ./tests/half-3.ispc - ./tests/half.ispc - ./tests/launch-3.ispc - ./tests/launch-4.ispc - ./tests/masked-scatter-vector.ispc - ./tests/masked-struct-scatter-varying.ispc - ./tests/new-delete-6.ispc - ./tests/ptr-24.ispc - ./tests/ptr-25.ispc - ./tests/short-vec-15.ispc - ./tests/struct-gather-2.ispc - ./tests/struct-gather-3.ispc - ./tests/struct-gather.ispc - ./tests/struct-ref-lvalue.ispc - ./tests/struct-test-118.ispc - ./tests/struct-vary-index-expr.ispc - ./tests/typedef-2.ispc - ./tests/vector-varying-scatter.ispc + ./tests/array-gather-simple.ispc + ./tests/array-gather-vary.ispc + ./tests/array-multidim-gather-scatter.ispc + ./tests/array-scatter-vary.ispc + ./tests/atomics-5.ispc + ./tests/atomics-swap.ispc + ./tests/cfor-array-gather-vary.ispc + ./tests/cfor-gs-improve-varying-1.ispc + ./tests/cfor-struct-gather-2.ispc + ./tests/cfor-struct-gather-3.ispc + ./tests/cfor-struct-gather.ispc + ./tests/gather-struct-vector.ispc + ./tests/global-array-4.ispc + ./tests/gs-improve-varying-1.ispc + ./tests/half-1.ispc + ./tests/half-3.ispc + ./tests/half.ispc + ./tests/launch-3.ispc + ./tests/launch-4.ispc + ./tests/masked-scatter-vector.ispc + ./tests/masked-struct-scatter-varying.ispc + ./tests/new-delete-6.ispc + ./tests/ptr-24.ispc + ./tests/ptr-25.ispc + ./tests/short-vec-15.ispc + ./tests/struct-gather-2.ispc + ./tests/struct-gather-3.ispc + ./tests/struct-gather.ispc + ./tests/struct-ref-lvalue.ispc + ./tests/struct-test-118.ispc + ./tests/struct-vary-index-expr.ispc + ./tests/typedef-2.ispc + ./tests/vector-varying-scatter.ispc */ -/* knc-i1x8.h has the following fails: +/* knc-i1x8.h fails: + * ---------------------------- +1 / 1206 tests FAILED compilation: + ./tests/ptr-assign-lhs-math-1.ispc 3 / 1206 tests FAILED execution: - ./tests/half-1.ispc - ./tests/half-3.ispc - ./tests/half.ispc + ./tests/half-1.ispc + ./tests/half-3.ispc + ./tests/half.ispc */ -/* knc-i1x16.h has the following fails: -5 / 1206 tests FAILED execution: - ./tests/assert-3.ispc +/* knc-i1x8.h fails: + * ---------------------------- +1 / 1206 tests FAILED compilation: + ./tests/ptr-assign-lhs-math-1.ispc +4 / 1206 tests FAILED execution: ./tests/half-1.ispc ./tests/half-3.ispc ./tests/half.ispc ./tests/test-141.ispc */ -/* generics-16, from which these knc-i1x*.h are derived, has the following fails: +/* generic-16.h fails: (from these knc-i1x8.h & knc-i1x16.h are derived + * ---------------------------- +1 / 1206 tests FAILED compilation: + ./tests/ptr-assign-lhs-math-1.ispc 6 / 1206 tests FAILED execution: ./tests/func-overload-max.ispc ./tests/half-1.ispc From 491c58aef374a1de7987ba8d5919a641a65cb853 Mon Sep 17 00:00:00 2001 From: Ilia 
Filippov Date: Thu, 19 Sep 2013 17:47:10 +0400 Subject: [PATCH 092/124] change head to trunk --- alloy.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/alloy.py b/alloy.py index 06025324..119874b8 100755 --- a/alloy.py +++ b/alloy.py @@ -81,7 +81,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_v llvm_home = os.environ["LLVM_HOME"] os.chdir(llvm_home) FOLDER_NAME=version_LLVM - if version_LLVM == "head": + if version_LLVM == "trunk": SVN_PATH="trunk" if version_LLVM == "3.3": SVN_PATH="tags/RELEASE_33/final" @@ -334,7 +334,7 @@ def validation_run(only, only_targets, reference_branch, notify, update): archs.append("x86-64") if "native" in only: sde_targets_t = [] - for i in ["3.1", "3.2", "3.3", "head"]: + for i in ["3.1", "3.2", "3.3", "trunk"]: if i in only: LLVM.append(i) if "current" in only: @@ -367,7 +367,7 @@ def validation_run(only, only_targets, reference_branch, notify, update): if len(archs) == 0: archs = ["x86", "x86-64"] if len(LLVM) == 0: - LLVM = ["3.3", "head"] + LLVM = ["3.3", "trunk"] gen_archs = ["x86-64"] need_LLVM = check_LLVM(LLVM) for i in range(0,len(need_LLVM)): @@ -562,9 +562,9 @@ parser = OptionParser() parser.add_option('-b', '--build-llvm', dest='build_llvm', help='ask to build LLVM', default=False, action="store_true") parser.add_option('--version', dest='version', - help='version of llvm to build', default="head") + help='version of llvm to build: 3.1 3.2 3.3 trunk', default="trunk") parser.add_option('--revision', dest='revision', - help='revision of llvm to build', default="") + help='revision of llvm to build in format r172870', default="") parser.add_option('--debug', dest='debug', help='debug build of LLVM?', default=False, action="store_true") parser.add_option('--folder', dest='folder', @@ -592,7 +592,7 @@ parser.add_option('--notify', dest='notify', parser.add_option('--only', dest='only', help='set types of tests. Possible values:\n' + '-O0, -O2, x86, x86-64, stability (test only stability), performance (test only performance)\n' + - 'build (only build with different LLVM), 3.1, 3.2, 3.3, head, native (do not use SDE), current (do not rebuild ISPC).\n' + + 'build (only build with different LLVM), 3.1, 3.2, 3.3, trunk, native (do not use SDE), current (do not rebuild ISPC).\n' + 'Example: --only="3.2 -O0 stability 3.3"', default="") parser.add_option('--update-errors', dest='update', help='rewrite fail_db.txt file according to received results (F or FP)', default="") From 5cabf0bef06af579571046cae63dcd82768c1220 Mon Sep 17 00:00:00 2001 From: evghenii Date: Fri, 20 Sep 2013 14:13:40 +0300 Subject: [PATCH 093/124] adding int64 support form knc.h, phase 1. 
bugs: __lshr & __ashr fail idiv.ispc test, __equal_i64 & __equal_i64_and_mask fails reduce_equal_8.ispc test --- examples/intrinsics/knc-i1x16.h | 290 ++++++++++++++++++++++++++++---- 1 file changed, 259 insertions(+), 31 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index c535e61a..628a38b8 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -208,7 +208,7 @@ struct PRE_ALIGN(128) __vec16_d } POST_ALIGN(128); #endif /* evghenii::d */ -#if 1 /* evghenii::i64 */ +#if 0 /* evghenii::i64 */ PRE_ALIGN(128) struct __vec16_i64 : public vec16 { __vec16_i64() { } __vec16_i64(int64_t v0, int64_t v1, int64_t v2, int64_t v3, @@ -219,34 +219,66 @@ PRE_ALIGN(128) struct __vec16_i64 : public vec16 { v8, v9, v10, v11, v12, v13, v14, v15) { } } POST_ALIGN(128); #else /* evghenii::i64 */ -struct PRE_ALIGN(64) __vec16_i64 { - FORCEINLINE __vec16_i64() : v_lo(_mm512_undefined_epi32()), v_hi(_mm512_undefined_epi32()) {} - FORCEINLINE __vec16_i64(const __vec16_i64 &o) : v_lo(o.v_lo), v_hi(o.v_hi) {} - FORCEINLINE __vec16_i64(__m512i l, __m512i h) : v_lo(l), v_hi(h) {} - FORCEINLINE __vec16_i64& operator =(const __vec16_i64 &o) { v_lo=o.v_lo; v_hi=o.v_hi; return *this; } - FORCEINLINE __vec16_i64(int64_t v00, int64_t v01, int64_t v02, int64_t v03, - int64_t v04, int64_t v05, int64_t v06, int64_t v07, - int64_t v08, int64_t v09, int64_t v10, int64_t v11, - int64_t v12, int64_t v13, int64_t v14, int64_t v15) { - __m512i v1 = _mm512_set_8to8_epi64(v15, v14, v13, v12, v11, v10, v09, v08); - __m512i v2 = _mm512_set_8to8_epi64(v07, v06, v05, v04, v03, v02, v01, v00); - v_hi = _mm512_mask_permutevar_epi32(v_hi, 0xFF00, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v1); - v_hi = _mm512_mask_permutevar_epi32(v_hi, 0x00FF, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v2); - v_lo = _mm512_mask_permutevar_epi32(v_lo, 0xFF00, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v1); - v_lo = _mm512_mask_permutevar_epi32(v_lo, 0x00FF, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v2); - } +struct PRE_ALIGN(128) __vec16_i64 +{ + union { + __m512i v1; __m512i v_hi; + }; + union + { + __m512i v2; __m512i v_lo; -} POST_ALIGN(64); + }; + FORCEINLINE __vec16_i64() : v1(_mm512_undefined_epi32()), v2(_mm512_undefined_epi32()) {} + FORCEINLINE __vec16_i64(const __m512i _v1, const __m512i _v2) : v1(_v1), v2(_v2) {} + FORCEINLINE __vec16_i64(const __vec16_i64 &o) : v1(o.v1), v2(o.v2) {} + FORCEINLINE __vec16_i64& operator =(const __vec16_i64 &o) { v1=o.v1; v2=o.v2; return *this; } + FORCEINLINE __vec16_i64(int64_t v00, int64_t v01, int64_t v02, int64_t v03, + int64_t v04, int64_t v05, int64_t v06, int64_t v07, + int64_t v08, int64_t v09, int64_t v10, int64_t v11, + int64_t v12, int64_t v13, int64_t v14, int64_t v15) { + v2 = _mm512_set_8to8_epi64(v15, v14, v13, v12, v11, v10, v09, v08); + v1 = _mm512_set_8to8_epi64(v07, v06, v05, v04, v03, v02, v01, v00); + } + FORCEINLINE const int64_t& operator[](const int i) const { return ((int64_t*)this)[i]; } + FORCEINLINE int64_t& operator[](const int i) { return ((int64_t*)this)[i]; } + FORCEINLINE __vec16_i64 cvt2hilo() const + { + __m512i _hi, _lo; + _hi = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v1); + _hi = _mm512_mask_permutevar_epi32(_hi, 0x00FF, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v2); + _lo = 
_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v1); + _lo = _mm512_mask_permutevar_epi32(_lo, 0x00FF, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v2); + return __vec16_i64(_hi, _lo); + } + FORCEINLINE __vec16_i64 cvt2zmm() const + { + __m512i _v1, _v2; + _v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v_hi); + _v1 = _mm512_mask_permutevar_epi32(_v1, 0x5555, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v_lo); + _v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v_hi); + _v2 = _mm512_mask_permutevar_epi32(_v2, 0x5555, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v_lo); + return __vec16_i64(_v1, _v2); + } +} POST_ALIGN(128); #endif /* evghenii::i64 */ PRE_ALIGN(16) struct __vec16_i8 : public vec16 { @@ -959,30 +991,162 @@ template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) { /////////////////////////////////////////////////////////////////////////// // int64 +// evghenii::int64 +#if 0 BINARY_OP(__vec16_i64, __add, +) BINARY_OP(__vec16_i64, __sub, -) BINARY_OP(__vec16_i64, __mul, *) +#else +static FORCEINLINE __vec16_i64 __add(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_add_epi64(a.v1, b.v1), _mm512_add_epi64(a.v2,b.v2)); +} +static FORCEINLINE __vec16_i64 __sub(__vec16_i64 _a, __vec16_i64 _b) { +// return __vec16_i64(_mm512_sub_epi64(_a.v1, _b.v1), _mm512_sub_epi64(_a.v2,_b.v2)); + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + __vec16_i64 ret; + __mmask16 borrow = 0; + ret.v_lo = _mm512_subsetb_epi32(a.v_lo, b.v_lo, &borrow); + ret.v_hi = _mm512_sbb_epi32 (a.v_hi, borrow, b.v_hi, &borrow); + return ret.cvt2zmm(); +} + +static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &_b) +{ + const __vec16_i64 b = _b.cvt2hilo(); + return __vec16_i64(_mm512_mullo_epi32(a.v,b.v_lo), + _mm512_add_epi32(_mm512_mullo_epi32(a.v, b.v_hi), + _mm512_mulhi_epi32(a.v, b.v_lo))).cvt2zmm(); +} + +static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_mullox_epi64(a.v1, b.v1), _mm512_mullox_epi64(a.v2,b.v2)); +} +#endif + +#if 0 BINARY_OP(__vec16_i64, __or, |) BINARY_OP(__vec16_i64, __and, &) BINARY_OP(__vec16_i64, __xor, ^) BINARY_OP(__vec16_i64, __shl, <<) +#else +static FORCEINLINE __vec16_i64 __or(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_or_epi64(a.v1, b.v1), _mm512_or_epi64(a.v2, b.v2)); +} +static FORCEINLINE __vec16_i64 __and(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_and_epi64(a.v1, b.v1), _mm512_and_epi64(a.v2, b.v2)); +} + +static FORCEINLINE __vec16_i64 __xor(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_xor_epi64(a.v1, b.v1), _mm512_xor_epi64(a.v2, b.v2)); +} + +static FORCEINLINE __vec16_i64 __shl(__vec16_i64 _a, __vec16_i64 _b) { + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + __vec16_i32 xfer = _mm512_srlv_epi32(a.v_lo, _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); + __vec16_i32 hi = _mm512_or_epi32(_mm512_sllv_epi32(a.v_hi, b.v_lo), xfer); + __vec16_i32 lo = _mm512_sllv_epi32(a.v_lo, b.v_lo); + return __vec16_i64(hi,lo).cvt2zmm(); +} +#endif + +#if 0 BINARY_OP_CAST(__vec16_i64, uint64_t, __udiv, /) BINARY_OP_CAST(__vec16_i64, int64_t, __sdiv, /) +#else +static FORCEINLINE 
__vec16_i64 __udiv(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_div_epu64(a.v1,b.v1), _mm512_div_epu64(a.v2,b.v2)); +} +static FORCEINLINE __vec16_i64 __sdiv(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_div_epi64(a.v1,b.v1), _mm512_div_epi64(a.v2,b.v2)); +} +#endif +#if 0 BINARY_OP_CAST(__vec16_i64, uint64_t, __urem, %) BINARY_OP_CAST(__vec16_i64, int64_t, __srem, %) +#else +static FORCEINLINE __vec16_i64 __urem(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_rem_epu64(a.v1,b.v1), _mm512_rem_epu64(a.v2,b.v2)); +} +static FORCEINLINE __vec16_i64 __srem(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_rem_epi64(a.v1,b.v1), _mm512_rem_epi64(a.v2,b.v2)); +} +#endif + +#if 1 BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>) +#else /* evghenii::fails idiv.ispc */ +static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 _a, __vec16_i64 _b) { + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + __vec16_i32 shift = _mm512_sub_epi32(__ispc_thirty_two, b.v_lo); +#if 0 + __vec16_i32 xfer = _mm512_and_epi32(_mm512_sllv_epi32(__ispc_ffffffff, shift), _mm512_sllv_epi32(a.v_hi, shift)); +#else + __vec16_i32 xfer = _mm512_sllv_epi32(_mm512_and_epi32(a.v_hi, + _mm512_sub_epi32(_mm512_sllv_epi32(__ispc_one, b.v_lo), __ispc_one)), + _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); +#endif + __vec16_i32 hi = _mm512_srlv_epi32(a.v_hi, b.v_lo); + __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo)); + return __vec16_i64(hi,lo).cvt2zmm(); +} + +#endif + +#if 1 BINARY_OP_CAST(__vec16_i64, int64_t, __ashr, >>) +#else /* evghenii::fails idiv.ispc */ +static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 _a, __vec16_i64 _b) { + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + __vec16_i32 xfer = _mm512_sllv_epi32(_mm512_and_epi32(a.v_hi, + _mm512_sub_epi32(_mm512_sllv_epi32(__ispc_one, b.v_lo), __ispc_one)), + _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); + __vec16_i32 hi = _mm512_srav_epi32(a.v_hi, b.v_lo); + __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo)); + return __vec16_i64(hi,lo).cvt2zmm(); +} +#endif SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>) SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>) SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<) +#if 1 CMP_OP(__vec16_i64, i64, int64_t, __equal, ==) CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) +#else /* evghenii::fails ./tests/reduce-equal-8.ispc, some other test hang... 
*/ +static FORCEINLINE __vec16_i1 __equal_i64(const __vec16_i64 &_a, const __vec16_i64 &_b) { + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + const __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo); + return _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); +} +static FORCEINLINE __vec16_i1 __equal_i64_and_mask(const __vec16_i64 &_a, const __vec16_i64 &_b, + __vec16_i1 mask) { + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo); + __mmask16 full_match = _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); + return _mm512_kand(full_match, (__mmask16)mask); +} + +static FORCEINLINE __vec16_i1 __not_equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) { + return __not(__equal_i64(a,b)); +} +static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(const __vec16_i64 &a, const __vec16_i64 &b, + __vec16_i1 mask) { + return __and(__not(__equal_i64(a,b)), mask); +} +#endif + + CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_equal, <=) CMP_OP(__vec16_i64, i64, int64_t, __signed_less_equal, <=) CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_equal, >=) @@ -992,15 +1156,84 @@ CMP_OP(__vec16_i64, i64, int64_t, __signed_less_than, <) CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_than, >) CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >) +#if 0 SELECT(__vec16_i64) +#else +static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask, + __vec16_i64 a, __vec16_i64 b) { + __vec16_i64 ret; + ret.v_hi = _mm512_mask_mov_epi64(b.v_hi, mask, a.v_hi); + ret.v_lo = _mm512_mask_mov_epi64(b.v_lo, mask >> 8, a.v_lo); + return ret; +} +#endif + INSERT_EXTRACT(__vec16_i64, int64_t) +#if 0 SMEAR(__vec16_i64, i64, int64_t) SETZERO(__vec16_i64, i64) UNDEF(__vec16_i64, i64) BROADCAST(__vec16_i64, i64, int64_t) +#else +template RetVecType __smear_i64(const int64_t &l); +template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) { return __vec16_i64(_mm512_set1_epi64(l), _mm512_set1_epi64(l)); } + +template RetVecType __setzero_i64(); +template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { return __vec16_i64(_mm512_setzero_epi32(), _mm512_setzero_epi32()); } + +template RetVecType __undef_i64(); +template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { return __vec16_i64(_mm512_undefined_epi32(), _mm512_undefined_epi32()); } + +static FORCEINLINE __vec16_i64 __broadcast_i64(__vec16_i64 v, int index) { + int64_t val = __extract_element(v, index & 0xf); + return __smear_i64<__vec16_i64>(val); +} +#endif ROTATE(__vec16_i64, i64, int64_t) SHUFFLES(__vec16_i64, i64, int64_t) +#if 0 LOAD_STORE(__vec16_i64, int64_t) +#else +template static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) +{ + __vec16_i32 v1; + __vec16_i32 v2; + v2 = _mm512_extloadunpacklo_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v2 = _mm512_extloadunpackhi_epi32(v2, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v1 = _mm512_extloadunpacklo_epi32(v1, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v1 = _mm512_extloadunpackhi_epi32(v1, (uint8_t*)p+128, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + return __vec16_i64(v2,v1); +} + +template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) +{ + __m512i v2 = _mm512_load_epi32(p); + __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64); + return __vec16_i64(v2,v1); +} + +template <> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { return __load<64>(p); } + +template 
static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64 v) +{ + __m512i v1 = v.v2; + __m512i v2 = v.v1; + _mm512_extpackstorelo_epi32(p, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_epi32((uint8_t*)p+64, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+128, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +} + +template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) +{ + __m512i v1 = v.v2; + __m512i v2 = v.v1; + _mm512_store_epi64(p, v2); + _mm512_store_epi64(((uint8_t*)p)+64, v1); +} + +template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); } +#endif #if 0 /* evghenii::float */ @@ -1062,7 +1295,6 @@ static FORCEINLINE __vec16_f __sub(__vec16_f a, __vec16_f b) { return _mm512_sub_ps(a, b); } -#if 1 /* evghenii::this two fails assert-3.ispc test */ static FORCEINLINE __vec16_f __mul(__vec16_f a, __vec16_f b) { return _mm512_mul_ps(a, b); } @@ -1070,10 +1302,6 @@ static FORCEINLINE __vec16_f __mul(__vec16_f a, __vec16_f b) { static FORCEINLINE __vec16_f __div(__vec16_f a, __vec16_f b) { return _mm512_div_ps(a, b); } -#else -BINARY_OP(__vec16_f, __mul, *) -BINARY_OP(__vec16_f, __div, /) -#endif static FORCEINLINE __vec16_i1 __equal_float(__vec16_f a, __vec16_f b) { From ddecdeb8349e1d3db7d6c4ef949c9fb86734609d Mon Sep 17 00:00:00 2001 From: evghenii Date: Fri, 20 Sep 2013 14:55:15 +0300 Subject: [PATCH 094/124] move remaining int64 from knc.h some of fails to pass tests, grep for evghenii::fails to find out which functions fail and on what tests --- examples/intrinsics/knc-i1x16.h | 170 +++++++++++++++++++++++++++++--- 1 file changed, 157 insertions(+), 13 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 628a38b8..1f5a6056 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -1120,7 +1120,6 @@ SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<) #if 1 CMP_OP(__vec16_i64, i64, int64_t, __equal, ==) -CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) #else /* evghenii::fails ./tests/reduce-equal-8.ispc, some other test hang... */ static FORCEINLINE __vec16_i1 __equal_i64(const __vec16_i64 &_a, const __vec16_i64 &_b) { const __vec16_i64 a = _a.cvt2hilo(); @@ -1128,6 +1127,14 @@ static FORCEINLINE __vec16_i1 __equal_i64(const __vec16_i64 &_a, const __vec16_i const __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo); return _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); } +static FORCEINLINE __vec16_i1 __not_equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) { + return __not(__equal_i64(a,b)); +} +#endif + +#if 1 +CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) +#else /* evghenii::fails ./tests/reduce-equal-8.ispc, some other test hang... 
*/ static FORCEINLINE __vec16_i1 __equal_i64_and_mask(const __vec16_i64 &_a, const __vec16_i64 &_b, __vec16_i1 mask) { const __vec16_i64 a = _a.cvt2hilo(); @@ -1136,10 +1143,6 @@ static FORCEINLINE __vec16_i1 __equal_i64_and_mask(const __vec16_i64 &_a, const __mmask16 full_match = _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); return _mm512_kand(full_match, (__mmask16)mask); } - -static FORCEINLINE __vec16_i1 __not_equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) { - return __not(__equal_i64(a,b)); -} static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(const __vec16_i64 &a, const __vec16_i64 &b, __vec16_i1 mask) { return __and(__not(__equal_i64(a,b)), mask); @@ -1147,6 +1150,7 @@ static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(const __vec16_i64 &a, con #endif + CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_equal, <=) CMP_OP(__vec16_i64, i64, int64_t, __signed_less_equal, <=) CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_equal, >=) @@ -1843,7 +1847,14 @@ static FORCEINLINE TO FUNC(TO, FROM val) { \ } // sign extension conversions +#if 1 CAST(__vec16_i64, int64_t, __vec16_i32, int32_t, __cast_sext) +#else /* evghenii::fails on soa-9 soa-13 soa-10 soa-29 soa-3 ... and others */ +static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val) +{ + return __vec16_i64(_mm512_srai_epi32(val.v,31), val.v).cvt2zmm(); +} +#endif CAST(__vec16_i64, int64_t, __vec16_i16, int16_t, __cast_sext) CAST(__vec16_i64, int64_t, __vec16_i8, int8_t, __cast_sext) CAST(__vec16_i32, int32_t, __vec16_i16, int16_t, __cast_sext) @@ -1868,15 +1879,23 @@ CAST_SEXT_I1(__vec16_i32) #else static FORCEINLINE __vec16_i32 __cast_sext(const __vec16_i32 &, const __vec16_i1 &val) { - __vec16_i32 ret = _mm512_setzero_epi32(); - __vec16_i32 one = _mm512_set1_epi32(-1); - return _mm512_mask_mov_epi32(ret, val, one); + __vec16_i32 ret = _mm512_setzero_epi32(); + __vec16_i32 one = _mm512_set1_epi32(-1); + return _mm512_mask_mov_epi32(ret, val, one); } #endif CAST_SEXT_I1(__vec16_i64) // zero extension +#if 0 CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext) +#else +static FORCEINLINE __vec16_i64 __cast_zext(const __vec16_i64 &, const __vec16_i32 &val) +{ + return __vec16_i64(_mm512_setzero_epi32(), val.v).cvt2zmm(); +} + +#endif CAST(__vec16_i64, uint64_t, __vec16_i16, uint16_t, __cast_zext) CAST(__vec16_i64, uint64_t, __vec16_i8, uint8_t, __cast_zext) CAST(__vec16_i32, uint32_t, __vec16_i16, uint16_t, __cast_zext) @@ -2714,8 +2733,34 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t _mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); return ret; } -#endif +#if 0 /* evghenii::fails on gather-int8-2 & gather-int8-4 */ +static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + __vec16_i1 still_to_do = mask; + __vec16_i32 tmp; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + tmp = _mm512_mask_i32extgather_epi32(tmp, match, offsets.v_lo, base, + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match,still_to_do); + } + __vec16_i8 ret; + 
_mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; +} +#else GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) +#endif +#endif /****************/ GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) @@ -2729,8 +2774,35 @@ static FORCEINLINE __vec16_i32 __gather_base_offsets32_i32(uint8_t *base, uint32 base, _MM_UPCONV_EPI32_NONE, scale, _MM_HINT_NONE); } -#endif +#if 0 /* evghenii::fails on gather-int32-2 & gather-int32-4 */ +static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + // There is no gather instruction with 64-bit offsets in KNC. + // We have to manually iterate over the upper 32 bits ;-) + __vec16_i1 still_to_do = mask; + __vec16_i32 ret; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + ret = _mm512_mask_i32extgather_epi32(ret, match, offsets.v_lo, base, + _MM_UPCONV_EPI32_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match, still_to_do); + } + + return ret; +} +#else GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) +#endif +#endif /****************/ #if 0 GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float) @@ -2741,8 +2813,35 @@ static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32 base, _MM_UPCONV_PS_NONE, scale, _MM_HINT_NONE); } -#endif +#if 0 /* evghenii::fails on gather-float-2 gather-float-4 & soa-14 */ +static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + // There is no gather instruction with 64-bit offsets in KNC. 
+ // We have to manually iterate over the upper 32 bits ;-) + __vec16_i1 still_to_do = mask; + __vec16_f ret; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + ret = _mm512_mask_i32extgather_ps(ret, match, offsets.v_lo, base, + _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match, still_to_do); + } + + return ret; +} +#else GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float) +#endif +#endif /****************/ GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) @@ -2824,6 +2923,7 @@ SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64 /*****************/ #if 0 SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32) +SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) #else static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec16_i32 offsets, __vec16_i32 val, __vec16_i1 mask) { @@ -2831,8 +2931,28 @@ static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, _MM_DOWNCONV_EPI32_NONE, scale, _MM_HINT_NONE); } +static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i32 value, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + + __vec16_i1 still_to_do = mask; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + _mm512_mask_i32extscatter_epi32(base, match, offsets.v_lo, + value, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match,still_to_do); + } +} #endif -SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) /*****************/ #if 0 SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float) @@ -2844,8 +2964,32 @@ static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scal _MM_DOWNCONV_PS_NONE, scale, _MM_HINT_NONE); } -#endif +#if 0 /* evghenii::fails on soa-10 & soa-13 , it is very similar to __scatter_base_offsets64_it32, but that passes tests, why ?!? 
*/ +static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_f value, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + + __vec16_i1 still_to_do = mask; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + _mm512_mask_i32extscatter_ps(base, match, offsets.v_lo, + value, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match,still_to_do); + } +} +#else SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float) +#endif +#endif /*****************/ SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) From 87cecddabb69f0a5794c6d6c325c8ccd329165c9 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Fri, 20 Sep 2013 18:57:20 +0400 Subject: [PATCH 095/124] adding sort to performance checking --- examples/sort/sort.cpp | 16 +++++++++------- perf.ini | 8 ++++---- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/examples/sort/sort.cpp b/examples/sort/sort.cpp index 4f402c75..f5e4264a 100644 --- a/examples/sort/sort.cpp +++ b/examples/sort/sort.cpp @@ -86,7 +86,8 @@ int main (int argc, char *argv[]) tISPC1 += get_elapsed_mcycles(); - progressbar (i, m); + if (argc != 3) + progressbar (i, m); } printf("[sort ispc]:\t[%.3f] million cycles\n", tISPC1); @@ -103,10 +104,11 @@ int main (int argc, char *argv[]) tISPC2 += get_elapsed_mcycles(); - progressbar (i, m); + if (argc != 3) + progressbar (i, m); } - - printf("[sort ispc+tasks]:\t[%.3f] million cycles\n", tISPC2); + + printf("[sort ispc + tasks]:\t[%.3f] million cycles\n", tISPC2); srand (0); @@ -120,13 +122,13 @@ int main (int argc, char *argv[]) tSerial += get_elapsed_mcycles(); - progressbar (i, m); + if (argc != 3) + progressbar (i, m); } printf("[sort serial]:\t\t[%.3f] million cycles\n", tSerial); - printf("\t\t\t\t(%.2fx speedup from ISPC serial)\n", tSerial/tISPC1); - printf("\t\t\t\t(%.2fx speedup from ISPC with tasks)\n", tSerial/tISPC2); + printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", tSerial/tISPC1, tSerial/tISPC2); delete code; delete order; diff --git a/perf.ini b/perf.ini index d8c7fe71..249c25f4 100755 --- a/perf.ini +++ b/perf.ini @@ -51,7 +51,7 @@ Volume Rendering volume_rendering camera.dat density_highres.vol #*** -%Sort -%sort -% -%#*** +Sort +sort +1000000 1 +#*** From 9e0e9dbecc484fdbc6fd16a3fca283df71572f65 Mon Sep 17 00:00:00 2001 From: Preston Gurd Date: Fri, 20 Sep 2013 14:42:46 -0400 Subject: [PATCH 096/124] - Add Silvermont (--cpu=slm) option for llvm 3.4+. - Change default Sandybridge isa name to avx1-i32x8 from avx-i32x8, to conform with replacement of avx-i32x8 by avx1-i32x8 everywhere else. - Add "target-cpu" attribute, when using AttrBuilder, to correct a problem whereby llvm would switch from the command line cpu setting to the native (auto-detected) cpu setting on second and subsequent functions. e.g. if I wanted to build for Silvermont on a Sandy Bridge machine, ispc/llvm would correctly use Silvermont and turn on the Silvermont scheduler. 
For the second and subsequent functions, it would auto-detect Sandy Bridge, but still run the Silvermont scheduler. --- ispc.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/ispc.cpp b/ispc.cpp index 82f0518b..ea7bfcd7 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -126,7 +126,7 @@ lGetSystemISA() { return "avx1.1-i32x8"; } // Regular AVX - return "avx-i32x8"; + return "avx1-i32x8"; } else if ((info[2] & (1 << 19)) != 0) return "sse4-i32x4"; @@ -149,8 +149,11 @@ static const char *supportedCPUs[] = { #endif "atom", "penryn", "core2", "corei7", "corei7-avx" #if !defined(LLVM_3_1) - , "core-avx-i", "core-avx2" + , "core-avx-i", "core-avx2", "slm" #endif // LLVM 3.2+ +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) + , "slm" +#endif // LLVM 3.4+ }; Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : @@ -196,9 +199,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : isa = "avx1.1-i32x8"; else if (!strcmp(cpu, "sandybridge") || !strcmp(cpu, "corei7-avx")) - isa = "avx-i32x8"; + isa = "avx1-i32x8"; else if (!strcmp(cpu, "corei7") || - !strcmp(cpu, "penryn")) + !strcmp(cpu, "penryn") || + !strcmp(cpu, "slm")) isa = "sse4-i32x4"; else isa = "sse2-i32x4"; @@ -660,6 +664,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // Initialize target-specific "target-feature" attribute. if (!m_attributes.empty()) { llvm::AttrBuilder attrBuilder; + attrBuilder.addAttribute("target-cpu", this->m_cpu); attrBuilder.addAttribute("target-features", this->m_attributes); this->m_tf_attributes = new llvm::AttributeSet( llvm::AttributeSet::get( From 4b26b8b4309ffb3295db16815620d2ab751c61c7 Mon Sep 17 00:00:00 2001 From: Preston Gurd Date: Fri, 20 Sep 2013 16:44:01 -0400 Subject: [PATCH 097/124] Remove redundant "slm". --- ispc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ispc.cpp b/ispc.cpp index ea7bfcd7..bec7baf7 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -149,7 +149,7 @@ static const char *supportedCPUs[] = { #endif "atom", "penryn", "core2", "corei7", "corei7-avx" #if !defined(LLVM_3_1) - , "core-avx-i", "core-avx2", "slm" + , "core-avx-i", "core-avx2" #endif // LLVM 3.2+ #if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) , "slm" From 019043f55ee13865fe6f672fcce544028ff63e2f Mon Sep 17 00:00:00 2001 From: evghenii Date: Mon, 23 Sep 2013 09:55:55 +0300 Subject: [PATCH 098/124] patched half2float & float2half to pass the tests. Now only test-141 is failed. 
but it seems to be test rather than knc-i1x16.h related --- examples/intrinsics/knc-i1x16.h | 138 +++++++++++++++++++++++++++----- 1 file changed, 117 insertions(+), 21 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 1f5a6056..2ee6d2f5 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -1478,23 +1478,101 @@ static FORCEINLINE float __floatbits(int v) { return u.f; } +/* source : + * http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion */ +class Float16Compressor +{ + union Bits + { + float f; + int32_t si; + uint32_t ui; + }; + + static int const shift = 13; + static int const shiftSign = 16; + + static int32_t const infN = 0x7F800000; // flt32 infinity + static int32_t const maxN = 0x477FE000; // max flt16 normal as a flt32 + static int32_t const minN = 0x38800000; // min flt16 normal as a flt32 + static int32_t const signN = 0x80000000; // flt32 sign bit + + static int32_t const infC = infN >> shift; + static int32_t const nanN = (infC + 1) << shift; // minimum flt16 nan as a flt32 + static int32_t const maxC = maxN >> shift; + static int32_t const minC = minN >> shift; + static int32_t const signC = signN >> shiftSign; // flt16 sign bit + + static int32_t const mulN = 0x52000000; // (1 << 23) / minN + static int32_t const mulC = 0x33800000; // minN / (1 << (23 - shift)) + + static int32_t const subC = 0x003FF; // max flt32 subnormal down shifted + static int32_t const norC = 0x00400; // min flt32 normal down shifted + + static int32_t const maxD = infC - maxC - 1; + static int32_t const minD = minC - subC - 1; + + public: + + static uint16_t compress(float value) + { + Bits v, s; + v.f = value; + uint32_t sign = v.si & signN; + v.si ^= sign; + sign >>= shiftSign; // logical shift + s.si = mulN; + s.si = s.f * v.f; // correct subnormals + v.si ^= (s.si ^ v.si) & -(minN > v.si); + v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); + v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); + v.ui >>= shift; // logical shift + v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); + v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC); + return v.ui | sign; + } + + static float decompress(uint16_t value) + { + Bits v; + v.ui = value; + int32_t sign = v.si & signC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + } +}; + static FORCEINLINE float __half_to_float_uniform(int16_t h) { - static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift +#if 0 + static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift - int32_t o = ((int32_t)(h & 0x7fff)) << 13; // exponent/mantissa bits - uint32_t exp = shifted_exp & o; // just the exponent - o += (127 - 15) << 23; // exponent adjust + int32_t o = ((int32_t)(h & 0x7fff)) << 13; // exponent/mantissa bits + uint32_t exp = shifted_exp & o; // just the exponent + o += (127 - 15) << 23; // exponent adjust - // handle exponent special cases - if (exp == shifted_exp) // Inf/NaN? - o += (128 - 16) << 23; // extra exp adjust - else if (exp == 0) { // Zero/Denormal? - o += 1 << 23; // extra exp adjust - o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize - } + // handle exponent special cases + if (exp == shifted_exp) // Inf/NaN? 
+ o += (128 - 16) << 23; // extra exp adjust + else if (exp == 0) { // Zero/Denormal? + o += 1 << 23; // extra exp adjust + o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize + } - o |= ((int32_t)(h & 0x8000)) << 16; // sign bit - return __floatbits(o); + o |= ((int32_t)(h & 0x8000)) << 16; // sign bit + return __floatbits(o); +#else + return Float16Compressor::decompress(h); +#endif } @@ -1507,6 +1585,7 @@ static FORCEINLINE __vec16_f __half_to_float_varying(__vec16_i16 v) { static FORCEINLINE int16_t __float_to_half_uniform(float f) { +#if 0 uint32_t sign_mask = 0x80000000u; int32_t o; @@ -1531,6 +1610,9 @@ static FORCEINLINE int16_t __float_to_half_uniform(float f) { o = fint2 >> 13; // Take the bits! return (o | (sign >> 16)); +#else + return Float16Compressor::compress(f); +#endif } @@ -2075,9 +2157,8 @@ CAST(__vec16_i32, uint32_t, __vec16_d, double, __cast_fptoui) CAST(__vec16_i64, uint64_t, __vec16_d, double, __cast_fptoui) // float/double conversions -#if 1 +#if 0 CAST(__vec16_f, float, __vec16_d, double, __cast_fptrunc) -CAST(__vec16_d, double, __vec16_f, float, __cast_fpext) #else static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) { __m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1)); @@ -2085,11 +2166,16 @@ static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) { return _mm512_mask_permute4f128_epi32(r0i, 0xFF00, r1i, _MM_PERM_BABA); } +#endif + +#if 0 +CAST(__vec16_d, double, __vec16_f, float, __cast_fpext) +#else static FORCEINLINE __vec16_d __cast_fpext(__vec16_d, __vec16_f val) { __vec16_d ret; - ret.v2 = _mm512_cvtpslo_pd(val.v); + ret.v1 = _mm512_cvtpslo_pd(val.v); __vec16_f other8 = _mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC); - ret.v1 = _mm512_cvtpslo_pd(other8); + ret.v2 = _mm512_cvtpslo_pd(other8); return ret; } #endif @@ -2325,14 +2411,24 @@ static FORCEINLINE __vec16_d __sqrt_varying_double(__vec16_d v) { return __ve // svml /////////////////////////////////////////////////////////////////////////// -static FORCEINLINE __vec16_f __svml_logf(__vec16_f v) { return _mm512_log_ps(v); } -static FORCEINLINE __vec16_f __svml_expf(__vec16_f v) { return _mm512_exp_ps(v); } +static FORCEINLINE __vec16_f __svml_sinf(__vec16_f v) { return _mm512_sin_ps(v); } +static FORCEINLINE __vec16_f __svml_asinf(__vec16_f v) { return _mm512_asin_ps(v); } static FORCEINLINE __vec16_f __svml_cosf(__vec16_f v) { return _mm512_cos_ps(v); } +static FORCEINLINE __vec16_f __svml_tanf(__vec16_f v) { return _mm512_tan_ps(v); } +static FORCEINLINE __vec16_f __svml_atanf(__vec16_f v) { return _mm512_atan_ps(v); } +static FORCEINLINE __vec16_f __svml_atan2f(__vec16_f a, __vec16_f b) { return _mm512_atan2_ps(a,b); } +static FORCEINLINE __vec16_f __svml_expf(__vec16_f v) { return _mm512_exp_ps(v); } +static FORCEINLINE __vec16_f __svml_logf(__vec16_f v) { return _mm512_log_ps(v); } static FORCEINLINE __vec16_f __svml_powf(__vec16_f a, __vec16_f b) { return _mm512_pow_ps(a,b); } -static FORCEINLINE __vec16_d __svml_logd(__vec16_d v) { return __vec16_d(_mm512_log_pd(v.v1), _mm512_log_pd(v.v2)); } -static FORCEINLINE __vec16_d __svml_expd(__vec16_d v) { return __vec16_d(_mm512_exp_pd(v.v1), _mm512_exp_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_sind(__vec16_d v) { return __vec16_d(_mm512_sin_pd(v.v1), _mm512_sin_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_asind(__vec16_d v) { return __vec16_d(_mm512_asin_pd(v.v1), _mm512_asin_pd(v.v2)); } static FORCEINLINE __vec16_d __svml_cosd(__vec16_d v) { return 
__vec16_d(_mm512_cos_pd(v.v1), _mm512_cos_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_tand(__vec16_d v) { return __vec16_d(_mm512_tan_pd(v.v1), _mm512_tan_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_atand(__vec16_d v) { return __vec16_d(_mm512_atan_pd(v.v1), _mm512_atan_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_atan2d(__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_atan2_pd(a.v1,b.v1), _mm512_atan2_pd(a.v2,b.v2)); } +static FORCEINLINE __vec16_d __svml_expd(__vec16_d v) { return __vec16_d(_mm512_exp_pd(v.v1), _mm512_exp_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_logd(__vec16_d v) { return __vec16_d(_mm512_log_pd(v.v1), _mm512_log_pd(v.v2)); } static FORCEINLINE __vec16_d __svml_powd(__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_pow_pd(a.v1,b.v1), _mm512_pow_pd(a.v2,b.v2)); } /////////////////////////////////////////////////////////////////////////// From 5a9b3b3abb592d19fbe298467bcb631b25c8bd76 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Fri, 20 Sep 2013 19:03:58 +0400 Subject: [PATCH 099/124] adding patch for LLVM 3.3 which increases performance after regression --- .../3_3_r172868-vmovups-vinsertf128.patch | 102 ++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 llvm_patches/3_3_r172868-vmovups-vinsertf128.patch diff --git a/llvm_patches/3_3_r172868-vmovups-vinsertf128.patch b/llvm_patches/3_3_r172868-vmovups-vinsertf128.patch new file mode 100644 index 00000000..36bb5572 --- /dev/null +++ b/llvm_patches/3_3_r172868-vmovups-vinsertf128.patch @@ -0,0 +1,102 @@ +This patch needs to be applied to LLVM 3.3 to fix performance regression after r172868 revision. +This regression is due to increased register pressure after revision causing spills in case of multiple loads +This regression is fixed in 3.4 but the changes in 3.4 is not back portable, +so we roll back r172868 to avoid regression with 3.3. + +Index: test/CodeGen/X86/sandybridge-loads.ll +=================================================================== +--- test/CodeGen/X86/sandybridge-loads.ll (revision 191082) ++++ test/CodeGen/X86/sandybridge-loads.ll (working copy) +@@ -1,24 +1,5 @@ + ; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s + +-;CHECK: wideloads +-;CHECK: vmovaps +-;CHECK: vinsertf128 +-;CHECK: vmovaps +-;CHECK-NOT: vinsertf128 +-;CHECK: ret +- +-define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp { +- %v0 = load <8 x float>* %a, align 16 ; <---- unaligned! +- %v1 = load <8 x float>* %b, align 32 ; <---- aligned! +- %m0 = fcmp olt <8 x float> %v1, %v0 +- %v2 = load <8 x float>* %c, align 32 ; <---- aligned! 
+- %m1 = fcmp olt <8 x float> %v2, %v0 +- %mand = and <8 x i1> %m1, %m0 +- %r = zext <8 x i1> %mand to <8 x i32> +- store <8 x i32> %r, <8 x i32>* undef, align 32 +- ret void +-} +- + ; CHECK: widestores + ; loads: + ; CHECK: vmovaps +Index: test/CodeGen/X86/v8i1-masks.ll +=================================================================== +--- test/CodeGen/X86/v8i1-masks.ll (revision 172868) ++++ test/CodeGen/X86/v8i1-masks.ll (revision 172866) +@@ -1,7 +1,7 @@ + ; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s + + ;CHECK: and_masks +-;CHECK: vmovaps ++;CHECK: vmovups + ;CHECK: vcmpltp + ;CHECK: vcmpltp + ;CHECK: vandps +Index: lib/Target/X86/X86ISelLowering.cpp +=================================================================== +--- lib/Target/X86/X86ISelLowering.cpp (revision 191077) ++++ lib/Target/X86/X86ISelLowering.cpp (working copy) +@@ -16756,42 +16756,9 @@ + EVT MemVT = Ld->getMemoryVT(); + DebugLoc dl = Ld->getDebugLoc(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); +- unsigned RegSz = RegVT.getSizeInBits(); + +- // On Sandybridge unaligned 256bit loads are inefficient. + ISD::LoadExtType Ext = Ld->getExtensionType(); +- unsigned Alignment = Ld->getAlignment(); +- bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8; +- if (RegVT.is256BitVector() && !Subtarget->hasInt256() && +- !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) { +- unsigned NumElems = RegVT.getVectorNumElements(); +- if (NumElems < 2) +- return SDValue(); + +- SDValue Ptr = Ld->getBasePtr(); +- SDValue Increment = DAG.getConstant(16, TLI.getPointerTy()); +- +- EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), +- NumElems/2); +- SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, +- Ld->getPointerInfo(), Ld->isVolatile(), +- Ld->isNonTemporal(), Ld->isInvariant(), +- Alignment); +- Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); +- SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, +- Ld->getPointerInfo(), Ld->isVolatile(), +- Ld->isNonTemporal(), Ld->isInvariant(), +- std::min(16U, Alignment)); +- SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, +- Load1.getValue(1), +- Load2.getValue(1)); +- +- SDValue NewVec = DAG.getUNDEF(RegVT); +- NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl); +- NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl); +- return DCI.CombineTo(N, NewVec, TF, true); +- } +- + // If this is a vector EXT Load then attempt to optimize it using a + // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the + // expansion is still better than scalar code. 
+@@ -16805,6 +16772,7 @@ + assert(MemVT.isVector() && "Must load a vector from memory"); + + unsigned NumElems = RegVT.getVectorNumElements(); ++ unsigned RegSz = RegVT.getSizeInBits(); + unsigned MemSz = MemVT.getSizeInBits(); + assert(RegSz > MemSz && "Register size must be greater than the mem size"); + From af5da885a56b32798f4c6dc94ccbbe60bc40b28e Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Fri, 20 Sep 2013 17:28:07 +0400 Subject: [PATCH 100/124] small corrections of test system --- .gitignore | 6 ++ alloy.py | 223 ++++++++++++++++++++++++++++++--------------------- check_env.py | 16 ++-- common.py | 23 +++--- perf.py | 12 +-- run_tests.py | 6 +- 6 files changed, 169 insertions(+), 117 deletions(-) diff --git a/.gitignore b/.gitignore index 88fb0197..429199bb 100644 --- a/.gitignore +++ b/.gitignore @@ -3,14 +3,20 @@ depend ispc ispc_test +ispc_ref objs docs/doxygen docs/*.html tests*/*cpp tests*/*run +logs/ +notify_log.log +alloy_results_* examples/*/*.png examples/*/*.ppm examples/*/objs/* +examples/*/ref +examples/*/test *.swp diff --git a/alloy.py b/alloy.py index 119874b8..31399a37 100755 --- a/alloy.py +++ b/alloy.py @@ -101,8 +101,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_v LLVM_BUILD="build-" + folder LLVM_BIN="bin-" + folder if os.path.exists(LLVM_BIN) and not force: - print_debug("You have folder " + LLVM_BIN + ". If you want to rebuild use --force\n", False, "") - exit(0) + error("you have folder " + LLVM_BIN + ".\nIf you want to rebuild use --force", 1) LLVM_BUILD_selfbuild = LLVM_BUILD + "_temp" LLVM_BIN_selfbuild = LLVM_BIN + "_temp" common.remove_if_exists(LLVM_SRC) @@ -188,26 +187,45 @@ def check_targets(): AVX = False; AVX11 = False; AVX2 = False; - cpu = open("/proc/cpuinfo") - f_lines = cpu.readlines() - cpu.close() - # check what native targets do we have - for i in range(0,len(f_lines)): - if SSE2 == False and "sse2" in f_lines[i]: + if current_OS == "Linux": + cpu = open("/proc/cpuinfo") + f_lines = cpu.readlines() + cpu.close() + # check what native targets do we have + for i in range(0,len(f_lines)): + if SSE2 == False and "sse2" in f_lines[i]: + SSE2 = True; + answer = answer + ["sse2-i32x4", "sse2-i32x8"] + if SSE4 == False and "sse4_1" in f_lines[i]: + SSE4 = True; + answer = answer + ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"] + if AVX == False and "avx" in f_lines[i]: + AVX = True; + answer = answer + ["avx1-i32x8", "avx1-i32x16"] + if AVX11 == False and "rdrand" in f_lines[i]: + AVX11 = True; + answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"] + if AVX2 == False and "avx2" in f_lines[i]: + AVX2 = True; + answer = answer + ["avx2-i32x8", "avx2-i32x16"] + if current_OS == "MacOS": + f_lines = take_lines("sysctl machdep.cpu.features", "first") + if "SSE2" in f_lines: SSE2 = True; answer = answer + ["sse2-i32x4", "sse2-i32x8"] - if SSE4 == False and "sse4_1" in f_lines[i]: + if "SSE4.1" in f_lines: SSE4 = True; answer = answer + ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"] - if AVX == False and "avx" in f_lines[i]: + if "AVX1.0" in f_lines: AVX = True; answer = answer + ["avx1-i32x8", "avx1-i32x16"] - if AVX11 == False and "rdrand" in f_lines[i]: + if "RDRAND" in f_lines: AVX11 = True; answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"] - if AVX2 == False and "avx2" in f_lines[i]: + if "AVX2.0" in f_lines: AVX2 = True; answer = answer + ["avx2-i32x8", "avx2-i32x16"] + answer = answer + ["generic-4", "generic-16", "generic-8", "generic-1", "generic-32", "generic-64"] # now check what 
targets we have with the help of SDE sde_exists = "" @@ -224,17 +242,14 @@ def check_targets(): "Please refer to http://www.intel.com/software/sde for SDE download information.", 2) return [answer, answer_sde] # here we have SDE - os.system(sde_exists + " -help > " + temp_alloy_file) - cpu = open(temp_alloy_file) - f_lines = cpu.readlines() - cpu.close() + f_lines = take_lines(sde_exists + " -help", "all") for i in range(0,len(f_lines)): if SSE4 == False and "wsm" in f_lines[i]: answer_sde = answer_sde + [["-wsm", "sse4-i32x4"], ["-wsm", "sse4-i32x8"], ["-wsm", "sse4-i16x8"], ["-wsm", "sse4-i8x16"]] if AVX == False and "snb" in f_lines[i]: answer_sde = answer_sde + [["-snb", "avx1-i32x8"], ["-snb", "avx1-i32x16"]] if AVX11 == False and "ivb" in f_lines[i]: - answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["ivb", "avx1.1-i32x16"]] + answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["-ivb", "avx1.1-i32x16"]] if AVX2 == False and "hsw" in f_lines[i]: answer_sde = answer_sde + [["-hsw", "avx2-i32x8"], ["-hsw", "avx2-i32x16"]] return [answer, answer_sde] @@ -271,14 +286,11 @@ def execute_stability(stability, R, print_version): def run_special_tests(): i = 5 -def validation_run(only, only_targets, reference_branch, notify, update): - current_path = os.getcwd() +def validation_run(only, only_targets, reference_branch, number, notify, update): os.chdir(os.environ["ISPC_HOME"]) os.environ["PATH"] = os.environ["ISPC_HOME"] + ":" + os.environ["PATH"] if options.notify != "": - if os.environ.get("SMTP_ISPC") == None: - error("you have no SMTP_ISPC in your environment for option notify", 1) - common.remove_if_exists(os.environ["ISPC_HOME"] + os.sep + "all_answer.txt") + common.remove_if_exists(os.environ["ISPC_HOME"] + os.sep + "notify_log.log") smtp_server = os.environ["SMTP_ISPC"] msg = MIMEMultipart() msg['Subject'] = 'ISPC test system results' @@ -437,7 +449,7 @@ def validation_run(only, only_targets, reference_branch, notify, update): print_debug("\n\nPerformance validation run\n\n", False, "") performance = options_for_drivers() # performance constant options - performance.number = 5 + performance.number = number performance.config = "./perf.ini" performance.path = "./" performance.silent = True @@ -450,16 +462,13 @@ def validation_run(only, only_targets, reference_branch, notify, update): if len(need_LLVM) != 0: build_LLVM(need_LLVM[i], "", "", "", False, False, True, False) # prepare reference point. build both test and reference compilers - os.system("git branch > " + temp_alloy_file) - br = open(temp_alloy_file) - temp4 = br.readlines() - br.close() + temp4 = take_lines("git branch", "all") for line in temp4: if "*" in line: current_branch = line[2:-1] stashing = True sys.stdout.write("Please, don't interrupt script here! You can have not sync git status after interruption!\n") - if "No local changes" in detect_version("git stash"): + if "No local changes" in take_lines("git stash", "first"): stashing = False #try_do_LLVM("stash current branch ", "git stash", True) try_do_LLVM("checkout reference branch " + reference_branch + " ", "git checkout " + reference_branch, True) @@ -478,11 +487,9 @@ def validation_run(only, only_targets, reference_branch, notify, update): attach_mail_file(msg, performance.in_file, "performance.log") attach_mail_file(msg, "." 
+ os.sep + "logs" + os.sep + "perf_build.log", "perf_build.log") - print_debug("Logs are in alloy_results_[date]", False, "") - # sending e-mail with results if options.notify != "": - fp = open(os.environ["ISPC_HOME"] + os.sep + "all_answer.txt", 'rb') + fp = open(os.environ["ISPC_HOME"] + os.sep + "notify_log.log", 'rb') f_lines = fp.readlines() fp.close() line = "" @@ -495,46 +502,56 @@ def validation_run(only, only_targets, reference_branch, notify, update): s = smtplib.SMTP(smtp_server) s.sendmail('ISPC_test_system', options.notify, msg.as_string()) s.quit() -# exit of validation routine - common.remove_if_exists(temp_alloy_file) - os.chdir(current_path) def Main(): + global current_OS if (platform.system() == 'Windows' or 'CYGWIN_NT' in platform.system()) == True: + current_OS = "Windows" error("Windows isn't supported now", 1) - if (options.build_llvm == False and - options.validation_run == False and - options.llvm_home == "" and - options.ispc_home == "" and - options.sde_home == ""): + else: + if (platform.system() == 'Darwin'): + current_OS = "MacOS" + else: + current_OS = "Linux" + + if (options.build_llvm == False and options.validation_run == False): parser.print_help() exit(0) - global f_date - f_date = "logs" - common.remove_if_exists(f_date) - os.makedirs(f_date) - global temp_alloy_file - temp_alloy_file = os.getcwd() + os.sep + f_date + os.sep + "temp_detect_version" - global alloy_build - alloy_build = os.getcwd() + os.sep + f_date + os.sep + "alloy_build.log" - common.remove_if_exists(alloy_build) - global stability_log - stability_log = os.getcwd() + os.sep + f_date + os.sep + "stability.log" - common.remove_if_exists(stability_log) + setting_paths(options.llvm_home, options.ispc_home, options.sde_home) if os.environ.get("LLVM_HOME") == None: error("you have no LLVM_HOME", 1) if os.environ.get("ISPC_HOME") == None: error("you have no ISPC_HOME", 1) - if options.build_llvm: - build_LLVM(options.version, options.revision, options.folder, options.tarball, + if options.notify != "": + if os.environ.get("SMTP_ISPC") == None: + error("you have no SMTP_ISPC in your environment for option notify", 1) + + global f_date + f_date = "logs" + common.remove_if_exists(f_date) + os.makedirs(f_date) + global alloy_build + alloy_build = os.getcwd() + os.sep + f_date + os.sep + "alloy_build.log" + global stability_log + stability_log = os.getcwd() + os.sep + f_date + os.sep + "stability.log" + current_path = os.getcwd() + try: + if options.build_llvm: + build_LLVM(options.version, options.revision, options.folder, options.tarball, options.debug, options.selfbuild, False, options.force) - if options.validation_run: - validation_run(options.only, options.only_targets, options.branch, options.notify, options.update) - os.rename(f_date, "alloy_results_" + datetime.datetime.now().strftime('%H_%M_%d_%m_%Y')) + if options.validation_run: + validation_run(options.only, options.only_targets, options.branch, + options.number_for_performance, options.notify, options.update) + finally: + os.chdir(current_path) + date_name = "alloy_results_" + datetime.datetime.now().strftime('%H_%M_%d_%m_%Y') + os.rename(f_date, date_name) + print_debug("Logs are in " + date_name + "\n", False, "") ###Main### from optparse import OptionParser +from optparse import OptionGroup import sys import os import operator @@ -554,47 +571,73 @@ import run_tests import perf import common error = common.error -detect_version = common.detect_version +take_lines = common.take_lines print_debug = common.print_debug # parsing 
options -parser = OptionParser() -# options for activity "build LLVM" +class MyParser(OptionParser): + def format_epilog(self, formatter): + return self.epilog +examples = ("Examples:\n" + +"Load and build LLVM from trunk\n\talloy.py -b\n" + +"Load and build LLVM 3.3. Rewrite LLVM folders\n\talloy.py -b --version=3.3 --force\n" + +"Untar files llvm.tgz clang.tgz, build LLVM from them in folder bin-from_tar\n\talloy.py -b --tarball='llvm.tgz clang.tgz' --folder=from_tar\n" + +"Load LLVM from trunk, revision r172870. Build it. Do selfbuild\n\talloy.py -b --revision=r172870 --selfbuild\n" + +"Validation run with LLVM 3.3, trunk; x86, x86-64; -O2;\nall supported targets; performance\n\talloy.py -r\n" + +"Validation run with all avx targets and sse4-i8x16 without performance\n\talloy.py -r --only=stability --only-targets='avx sse4-i8x16'\n" + +"Validation run with avx2-i32x8, all sse4 and sse2 targets\nand all targets with i32x16\n\talloy.py -r --only-targets='avx2-i32x8 sse4 i32x16 sse2'\n" + +"Stability validation run with LLVM 3.2, 3.3; -O0; x86,\nupdate fail_db.txt with passes and fails\n\talloy.py -r --only='3.2 -O0 stability 3.3 x86' --update-errors=FP\n" + +"Try to build compiler with all LLVM\n\talloy.py -r --only=build\n" + +"Performance validation run with 10 runs of each test and comparing to branch 'old'\n\talloy.py -r --only=performance --compare-with=old --number=10\n" + +"Validation run. Update fail_db.txt with new fails, send results to my@my.com\n\talloy.py -r --update-errors=F --notify='my@my.com'\n") +parser = MyParser(usage="Usage: alloy.py -r/-b [options]", epilog=examples) parser.add_option('-b', '--build-llvm', dest='build_llvm', help='ask to build LLVM', default=False, action="store_true") -parser.add_option('--version', dest='version', - help='version of llvm to build: 3.1 3.2 3.3 trunk', default="trunk") -parser.add_option('--revision', dest='revision', - help='revision of llvm to build in format r172870', default="") -parser.add_option('--debug', dest='debug', - help='debug build of LLVM?', default=False, action="store_true") -parser.add_option('--folder', dest='folder', - help='folder to build LLVM in', default="") -parser.add_option('--tarball', dest='tarball', - help='"llvm_tarball clang_tarball"', default="") -parser.add_option('--selfbuild', dest='selfbuild', - help='make selfbuild of LLVM and clang', default=False, action="store_true") -parser.add_option('--force', dest='force', - help='rebuild LLVM', default=False, action='store_true') -# options for activity "setup PATHS" -parser.add_option('--llvm_home', dest='llvm_home',help='path to LLVM',default="") -parser.add_option('--ispc_home', dest='ispc_home',help='path to ISPC',default="") -parser.add_option('--sde_home', dest='sde_home',help='path to SDE',default="") -# options for activity "validation run" parser.add_option('-r', '--run', dest='validation_run', help='ask for validation run', default=False, action="store_true") -parser.add_option('--compare-with', dest='branch', - help='set performance reference point', default="master") -parser.add_option('--only-targets', dest='only_targets', - help='set list of targets to test. 
Possible values - all subnames of targets.\n' + - 'Example: --only-targets="avx2-i32x8 sse4 i32x16 sse2"', default="") -parser.add_option('--notify', dest='notify', - help='sent results to email', default="") -parser.add_option('--only', dest='only', +# options for activity "build LLVM" +llvm_group = OptionGroup(parser, "Options for building LLVM", + "These options must be used with -b option.") +llvm_group.add_option('--version', dest='version', + help='version of llvm to build: 3.1 3.2 3.3 trunk. Default: trunk', default="trunk") +llvm_group.add_option('--revision', dest='revision', + help='revision of llvm to build in format r172870', default="") +llvm_group.add_option('--debug', dest='debug', + help='debug build of LLVM?', default=False, action="store_true") +llvm_group.add_option('--folder', dest='folder', + help='folder to build LLVM in', default="") +llvm_group.add_option('--tarball', dest='tarball', + help='"llvm_tarball clang_tarball"', default="") +llvm_group.add_option('--selfbuild', dest='selfbuild', + help='make selfbuild of LLVM and clang', default=False, action="store_true") +llvm_group.add_option('--force', dest='force', + help='rebuild LLVM', default=False, action='store_true') +parser.add_option_group(llvm_group) +# options for activity "validation run" +run_group = OptionGroup(parser, "Options for validation run", + "These options must be used with -r option.") +run_group.add_option('--compare-with', dest='branch', + help='set performance reference point. Dafault: master', default="master") +run_group.add_option('--number', dest='number_for_performance', + help='number of performance runs for each test. Default: 5', default=5) +run_group.add_option('--notify', dest='notify', + help='email to sent results to', default="") +run_group.add_option('--update-errors', dest='update', + help='rewrite fail_db.txt file according to received results (F or FP)', default="") +run_group.add_option('--only-targets', dest='only_targets', + help='set list of targets to test. Possible values - all subnames of targets.', + default="") +run_group.add_option('--only', dest='only', help='set types of tests. 
Possible values:\n' + '-O0, -O2, x86, x86-64, stability (test only stability), performance (test only performance)\n' + - 'build (only build with different LLVM), 3.1, 3.2, 3.3, trunk, native (do not use SDE), current (do not rebuild ISPC).\n' + - 'Example: --only="3.2 -O0 stability 3.3"', default="") -parser.add_option('--update-errors', dest='update', - help='rewrite fail_db.txt file according to received results (F or FP)', default="") + 'build (only build with different LLVM), 3.1, 3.2, 3.3, trunk, native (do not use SDE), current (do not rebuild ISPC).', + default="") +parser.add_option_group(run_group) +# options for activity "setup PATHS" +setup_group = OptionGroup(parser, "Options for setup", + "These options must be use with -r or -b to setup environment variables") +setup_group.add_option('--llvm_home', dest='llvm_home',help='path to LLVM',default="") +setup_group.add_option('--ispc_home', dest='ispc_home',help='path to ISPC',default="") +setup_group.add_option('--sde_home', dest='sde_home',help='path to SDE',default="") +parser.add_option_group(setup_group) (options, args) = parser.parse_args() Main() diff --git a/check_env.py b/check_env.py index 98deb235..8c90d895 100755 --- a/check_env.py +++ b/check_env.py @@ -39,7 +39,7 @@ import os import string print_debug = common.print_debug error = common.error -detect_version = common.detect_version +take_lines = common.take_lines exists = [False, False, False, False, False, False, False, False] names = ["m4", "bison", "flex", "sde", "ispc", "clang", "gcc", "icc"] @@ -54,26 +54,26 @@ print_debug("=== in PATH: ===\n", False, "") print_debug("Tools:\n", False, "") for i in range(0,3): if exists[i]: - print_debug(detect_version(names[i] + " --version"), False, "") + print_debug(take_lines(names[i] + " --version", "first"), False, "") else: error("you don't have " + names[i], 0) if exists[0] and exists[1] and exists[2]: if common.check_tools(2): - print_debug("versions are ok\n", False, "") + print_debug("Tools' versions are ok\n", False, "") print_debug("\nSDE:\n", False, "") if exists[3]: - print_debug(detect_version(names[3] + " --version"), False, "") + print_debug(take_lines(names[3] + " --version", "first"), False, "") else: error("you don't have " + names[3], 2) print_debug("\nISPC:\n", False, "") if exists[4]: - print_debug(detect_version(names[4] + " --version"), False, "") + print_debug(take_lines(names[4] + " --version", "first"), False, "") else: error("you don't have " + names[4], 2) print_debug("\nC/C++ compilers:\n", False, "") for i in range(5,8): if exists[i]: - print_debug(detect_version(names[i] + " --version"), False, "") + print_debug(take_lines(names[i] + " --version", "first"), False, "") else: error("you don't have " + names[i], 2) @@ -88,7 +88,7 @@ else: print_debug("Your ISPC_HOME:" + os.environ.get("ISPC_HOME") + "\n", False, "") if os.path.exists(os.environ.get("ISPC_HOME") + os.sep + "ispc"): print_debug("You have ISPC in your ISPC_HOME: " + - detect_version(os.environ.get("ISPC_HOME") + os.sep + "ispc" + " --version"), False, "") + take_lines(os.environ.get("ISPC_HOME") + os.sep + "ispc" + " --version", "first"), False, "") else: error("you don't have ISPC in your ISPC_HOME", 2) if os.environ.get("SDE_HOME") == None: @@ -97,6 +97,6 @@ else: print_debug("Your SDE_HOME:" + os.environ.get("SDE_HOME") + "\n", False, "") if os.path.exists(os.environ.get("SDE_HOME") + os.sep + "sde"): print_debug("You have sde in your SDE_HOME: " + - detect_version(os.environ.get("SDE_HOME") + os.sep + "sde" + " --version"), False, 
"") + take_lines(os.environ.get("SDE_HOME") + os.sep + "sde" + " --version", "first"), False, "") else: error("you don't have any SDE in your ISPC_HOME", 2) diff --git a/common.py b/common.py index dd8fb388..19d09e4d 100644 --- a/common.py +++ b/common.py @@ -50,21 +50,24 @@ def remove_if_exists(filename): os.remove(filename) # detect version which is printed after command -def detect_version(command): +def take_lines(command, which): os.system(command + " > " + "temp_detect_version") version = open("temp_detect_version") - answer = version.readline() + if which == "first": + answer = version.readline() + if which == "all": + answer = version.readlines() version.close() remove_if_exists("temp_detect_version") return answer # print versions of compilers def print_version(ispc_test, ispc_ref, ref_compiler, s, perf_log, is_windows): - print_debug("\nUsing test compiler: " + detect_version(ispc_test + " --version"), s, perf_log) + print_debug("\nUsing test compiler: " + take_lines(ispc_test + " --version", "first"), s, perf_log) if ispc_ref != "": - print_debug("Using ref compiler: " + detect_version(ispc_ref + " --version"), s, perf_log) + print_debug("Using ref compiler: " + take_lines(ispc_ref + " --version", "first"), s, perf_log) if is_windows == False: - temp1 = detect_version(ref_compiler + " --version") + temp1 = take_lines(ref_compiler + " --version", "first") else: os.system(ref_compiler + " 2>&1" + " 2> temp_detect_version > temp_detect_version1" ) version = open("temp_detect_version") @@ -80,7 +83,7 @@ def print_debug(line, silent, filename): sys.stdout.write(line) sys.stdout.flush() if os.environ.get("ISPC_HOME") != None: - write_to_file(os.environ["ISPC_HOME"] + os.sep + "all_answer.txt", line) + write_to_file(os.environ["ISPC_HOME"] + os.sep + "notify_log.log", line) if filename != "": write_to_file(filename, line) @@ -102,9 +105,9 @@ def check_tools(m): input_tools=[[[1,4],"m4 --version", "bad m4 version"], [[2,4],"bison --version", "bad bison version"], [[2,5], "flex --version", "bad flex version"]] - + ret = 1 for t in range(0,len(input_tools)): - t1 = ((detect_version(input_tools[t][1]))[:-1].split(" ")) + t1 = ((take_lines(input_tools[t][1], "first"))[:-1].split(" ")) for i in range(0,len(t1)): t11 = t1[i].split(".") f = True @@ -116,5 +119,5 @@ def check_tools(m): if j < len(input_tools[t][0]): if int(t11[j]) 1: print_debug(" <+", False, "") + if p1 > 1: + print_debug(" <-", False, "") print_debug("\n", False, "") print_debug("\n", False, "") @@ -261,11 +261,11 @@ def compare(A, B): p2 = 0 else: p2 = 100 - 100 * A[4][i]/B[4][i] - print_debug("%21s: %10.2f %10.2f %10.2f" % (A[0][i], A[4][i], B[4][i], p2), False, "") + print_debug("%21s: %10.2f %10.2f %10.2f" % (A[0][i], A[4][i], B[4][i], abs(p2)), False, "") if p2 < -1: - print_debug(" <-", False, "") - if p2 > 1: print_debug(" <+", False, "") + if p2 > 1: + print_debug(" <-", False, "") print_debug("\n", False, "") if "performance.log" in options.in_file: print_debug("\n\n_________________Watch performance.log for details________________\n", False, "") diff --git a/run_tests.py b/run_tests.py index 2471b6cb..914f22a7 100755 --- a/run_tests.py +++ b/run_tests.py @@ -364,11 +364,11 @@ def file_check(compfails, runfails): else: opt = "-O2" # Detect LLVM version - temp1 = common.detect_version(ispc_exe + " --version") + temp1 = common.take_lines(ispc_exe + " --version", "first") llvm_version = temp1[-10:-2] -#Detect compiler version +# Detect compiler version if is_windows == False: - temp1 = 
common.detect_version(options.compiler_exe + " --version") + temp1 = common.take_lines(options.compiler_exe + " --version", "first") temp2 = temp1.split(" ") compiler_version = temp2[0] + temp2[2][0:4] else: From 1c858c34f795c1b2fb29d9c07ae5c448dab287a0 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Tue, 24 Sep 2013 17:37:39 +0400 Subject: [PATCH 101/124] correction of test system --- alloy.py | 53 +++++++++++++++++++++++++++++----------------- examples/common.mk | 2 +- perf.py | 10 ++++----- run_tests.py | 12 +++++++---- 4 files changed, 47 insertions(+), 30 deletions(-) diff --git a/alloy.py b/alloy.py index 31399a37..7ae972b4 100755 --- a/alloy.py +++ b/alloy.py @@ -70,7 +70,7 @@ def try_do_LLVM(text, command, from_validation): error("can't " + text, 1) print_debug("DONE.\n", from_validation, alloy_build) -def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_validation, force): +def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_validation, force, make): print_debug("Building LLVM. Version: " + version_LLVM + ". ", from_validation, alloy_build) if revision != "": print_debug("Revision: " + revision + ".\n", from_validation, alloy_build) @@ -100,7 +100,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_v LLVM_SRC="llvm-" + folder LLVM_BUILD="build-" + folder LLVM_BIN="bin-" + folder - if os.path.exists(LLVM_BIN) and not force: + if os.path.exists(LLVM_BIN + os.sep + "bin") and not force: error("you have folder " + LLVM_BIN + ".\nIf you want to rebuild use --force", 1) LLVM_BUILD_selfbuild = LLVM_BUILD + "_temp" LLVM_BIN_selfbuild = LLVM_BIN + "_temp" @@ -110,7 +110,6 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_v if selfbuild: common.remove_if_exists(LLVM_BUILD_selfbuild) common.remove_if_exists(LLVM_BIN_selfbuild) - MAKE = "gmake" print_debug("Using folders: " + LLVM_SRC + " " + LLVM_BUILD + " " + LLVM_BIN + " in " + llvm_home + "\n", from_validation, alloy_build) # load llvm @@ -156,9 +155,9 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_v LLVM_BIN_selfbuild + " --enable-optimized", from_validation) try_do_LLVM("build release version for selfbuild ", - MAKE + " -j32", from_validation) + make, from_validation) try_do_LLVM("install release version for selfbuild ", - MAKE + " install", + "make install", from_validation) os.chdir("../") selfbuild_compiler = " CC="+llvm_home+ "/" + LLVM_BIN_selfbuild + "/bin/clang" @@ -175,8 +174,8 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_v " --enable-debug-runtime --enable-debug-symbols --enable-keep-symbols" + selfbuild_compiler, from_validation) # building llvm - try_do_LLVM("build LLVM ", MAKE + " -j32", from_validation) - try_do_LLVM("install LLVM ", MAKE + " install", from_validation) + try_do_LLVM("build LLVM ", make, from_validation) + try_do_LLVM("install LLVM ", "make install", from_validation) os.chdir(current_path) def check_targets(): @@ -254,13 +253,13 @@ def check_targets(): answer_sde = answer_sde + [["-hsw", "avx2-i32x8"], ["-hsw", "avx2-i32x16"]] return [answer, answer_sde] -def build_ispc(version_LLVM): +def build_ispc(version_LLVM, make): current_path = os.getcwd() os.chdir(os.environ["ISPC_HOME"]) p_temp = os.getenv("PATH") os.environ["PATH"] = os.environ["LLVM_HOME"] + "/bin-" + version_LLVM + "/bin:" + os.environ["PATH"] - os.system("make clean >> " + alloy_build) - try_do_LLVM("build ISPC with LLVM version " + version_LLVM + " ", 
"make -j32", True) + try_do_LLVM("clean ISPC for building", "make clean", True) + try_do_LLVM("build ISPC with LLVM version " + version_LLVM + " ", make, True) os.environ["PATH"] = p_temp os.chdir(current_path) @@ -286,7 +285,7 @@ def execute_stability(stability, R, print_version): def run_special_tests(): i = 5 -def validation_run(only, only_targets, reference_branch, number, notify, update): +def validation_run(only, only_targets, reference_branch, number, notify, update, make): os.chdir(os.environ["ISPC_HOME"]) os.environ["PATH"] = os.environ["ISPC_HOME"] + ":" + os.environ["PATH"] if options.notify != "": @@ -327,7 +326,6 @@ def validation_run(only, only_targets, reference_branch, number, notify, update) stability.no_opt = False stability.wrapexe = "" # prepare parameters of run - common.check_tools(1) [targets_t, sde_targets_t] = check_targets() rebuild = True opts = [] @@ -352,6 +350,8 @@ def validation_run(only, only_targets, reference_branch, number, notify, update) if "current" in only: LLVM = [" "] rebuild = False + else: + common.check_tools(1) if only_targets != "": only_targets_t = only_targets.split(" ") for i in only_targets_t: @@ -383,7 +383,7 @@ def validation_run(only, only_targets, reference_branch, number, notify, update) gen_archs = ["x86-64"] need_LLVM = check_LLVM(LLVM) for i in range(0,len(need_LLVM)): - build_LLVM(need_LLVM[i], "", "", "", False, False, True, False) + build_LLVM(need_LLVM[i], "", "", "", False, False, True, False, make) # begin validation run for stabitily common.remove_if_exists(stability.in_file) R = [[[],[]],[[],[]],[[],[]],[[],[]]] @@ -391,7 +391,7 @@ def validation_run(only, only_targets, reference_branch, number, notify, update) for i in range(0,len(LLVM)): print_version = 2 if rebuild: - build_ispc(LLVM[i]) + build_ispc(LLVM[i], make) for j in range(0,len(targets)): stability.target = targets[j] stability.wrapexe = "" @@ -447,6 +447,7 @@ def validation_run(only, only_targets, reference_branch, number, notify, update) # *** *** *** if ((("performance" in only) == True) or ("stability" in only) == False): print_debug("\n\nPerformance validation run\n\n", False, "") + common.check_tools(1) performance = options_for_drivers() # performance constant options performance.number = number @@ -460,8 +461,9 @@ def validation_run(only, only_targets, reference_branch, number, notify, update) # prepare LLVM 3.3 as newest LLVM need_LLVM = check_LLVM(["3.3"]) if len(need_LLVM) != 0: - build_LLVM(need_LLVM[i], "", "", "", False, False, True, False) + build_LLVM(need_LLVM[i], "", "", "", False, False, True, False, make) # prepare reference point. build both test and reference compilers + try_do_LLVM("apply git", "git branch", True) temp4 = take_lines("git branch", "all") for line in temp4: if "*" in line: @@ -473,14 +475,14 @@ def validation_run(only, only_targets, reference_branch, number, notify, update) #try_do_LLVM("stash current branch ", "git stash", True) try_do_LLVM("checkout reference branch " + reference_branch + " ", "git checkout " + reference_branch, True) sys.stdout.write(".\n") - build_ispc("3.3") + build_ispc("3.3", make) sys.stdout.write(".\n") os.rename("ispc", "ispc_ref") try_do_LLVM("checkout test branch " + current_branch + " ", "git checkout " + current_branch, True) if stashing: try_do_LLVM("return current branch ", "git stash pop", True) sys.stdout.write("You can interrupt script now.\n") - build_ispc("3.3") + build_ispc("3.3", make) # begin validation run for performance. 
output is inserted into perf() perf.perf(performance, []) if options.notify != "": @@ -526,6 +528,12 @@ def Main(): if options.notify != "": if os.environ.get("SMTP_ISPC") == None: error("you have no SMTP_ISPC in your environment for option notify", 1) + if options.only != "": + test_only_r = " 3.1 3.2 3.3 trunk current build stability performance x86 x86-64 -O0 -O2 native " + test_only = options.only.split(" ") + for iterator in test_only: + if not (" " + iterator + " " in test_only_r): + error("unknow option for only: " + iterator, 1) global f_date f_date = "logs" @@ -536,16 +544,19 @@ def Main(): global stability_log stability_log = os.getcwd() + os.sep + f_date + os.sep + "stability.log" current_path = os.getcwd() + make = "make -j" + options.speed try: if options.build_llvm: build_LLVM(options.version, options.revision, options.folder, options.tarball, - options.debug, options.selfbuild, False, options.force) + options.debug, options.selfbuild, False, options.force, make) if options.validation_run: validation_run(options.only, options.only_targets, options.branch, - options.number_for_performance, options.notify, options.update) + options.number_for_performance, options.notify, options.update, make) finally: os.chdir(current_path) - date_name = "alloy_results_" + datetime.datetime.now().strftime('%H_%M_%d_%m_%Y') + date_name = "alloy_results_" + datetime.datetime.now().strftime('%d_%m_%Y_%H_%M_%S') + if os.path.exists(date_name): + error("It's forbidden to run alloy two times in a second, logs are in ./logs", 1) os.rename(f_date, date_name) print_debug("Logs are in " + date_name + "\n", False, "") @@ -594,6 +605,8 @@ parser.add_option('-b', '--build-llvm', dest='build_llvm', help='ask to build LLVM', default=False, action="store_true") parser.add_option('-r', '--run', dest='validation_run', help='ask for validation run', default=False, action="store_true") +parser.add_option('-j', dest='speed', + help='set -j for make', default="8") # options for activity "build LLVM" llvm_group = OptionGroup(parser, "Options for building LLVM", "These options must be used with -b option.") diff --git a/examples/common.mk b/examples/common.mk index cdfc4c6a..95ec7ccb 100644 --- a/examples/common.mk +++ b/examples/common.mk @@ -44,7 +44,7 @@ dirs: objs/%.cpp objs/%.o objs/%.h: dirs clean: - /bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 + /bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 ref test $(EXAMPLE): $(OBJS) $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) diff --git a/perf.py b/perf.py index b33e1f25..576a5c7d 100755 --- a/perf.py +++ b/perf.py @@ -190,7 +190,7 @@ def print_answer(answer): filelist = [] print_debug("--------------------------------------------------------------------------\n", s, perf_log) print_debug("test name:\t ISPC speedup: ISPC + tasks speedup: | " + - "ISPC time: ISPC + tasks time: serial:\n", s, perf_log) + " ISPC time: ISPC + tasks time: serial:\n", s, perf_log) filelist.append("test name,ISPC speedup,diff," + "ISPC + tasks speedup,diff,ISPC time,diff,ISPC + tasks time,diff,serial,diff\n") max_t = [0,0,0,0,0] @@ -215,9 +215,9 @@ def print_answer(answer): list_of_max[t-1].append(mm) diff_t[t-1] = '%.2f' % (max(answer[i][t]) - min(answer[i][t])) print_debug("%s:\n" % answer[i][0], s, perf_log) - print_debug("\t\tmax:\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % + print_debug("\t\tmax:\t%5s\t\t%10s\t|min:%10s\t%10s\t%10s\n" % (max_t[0], max_t[1], max_t[2], max_t[3], max_t[4]), s, perf_log) - print_debug("\t\tdiff:\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % + 
print_debug("\t\tdiff:\t%5s\t\t%10s\t|%14s\t%10s\t%10s\n" % (diff_t[0], diff_t[1], diff_t[2], diff_t[3], diff_t[4]), s, perf_log) for t in range(0,5): if max_t[t] == "n/a": @@ -231,7 +231,7 @@ def print_answer(answer): for i in range(0,5): geomean_t[i] = geomean(list_of_max[i]) print_debug("---------------------------------------------------------------------------------\n", s, perf_log) - print_debug("Geomean:\t\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % + print_debug("Geomean:\t\t%5s\t\t%10s\t|%14s\t%10s\t%10s\n" % (geomean_t[0], geomean_t[1], geomean_t[2], geomean_t[3], geomean_t[4]), s, perf_log) filelist.append("Geomean," + str(geomean_t[0]) + ",," + str(geomean_t[1]) + ",," + str(geomean_t[2]) + ",," + str(geomean_t[3]) + ",," + str(geomean_t[4]) + "\n") @@ -474,7 +474,7 @@ if __name__ == "__main__": parser.add_option('-c', '--config', dest='config', help='config file of tests', default="./perf.ini") parser.add_option('-p', '--path', dest='path', - help='path to test_system directory', default=".") + help='path to ispc root', default=".") parser.add_option('-s', '--silent', dest='silent', help='silent mode, only table output', default=False, action="store_true") parser.add_option('-o', '--output', dest='output', diff --git a/run_tests.py b/run_tests.py index 914f22a7..abc9b656 100755 --- a/run_tests.py +++ b/run_tests.py @@ -369,8 +369,12 @@ def file_check(compfails, runfails): # Detect compiler version if is_windows == False: temp1 = common.take_lines(options.compiler_exe + " --version", "first") - temp2 = temp1.split(" ") - compiler_version = temp2[0] + temp2[2][0:4] + temp2 = re.search("[0-9]*\.[0-9]*\.[0-9]", temp1) + if temp2 == None: + temp3 = re.search("[0-9]*\.[0-9]*", temp1) + else: + temp3 = re.search("[0-9]*\.[0-9]*", temp2.group()) + compiler_version = options.compiler_exe + temp3.group() else: compiler_version = "cl" new_line = " "+options.arch.rjust(6)+" "+options.target.rjust(14)+" "+OS.rjust(7)+" "+llvm_version+" "+compiler_version.rjust(10)+" "+opt+" *\n" @@ -464,7 +468,7 @@ def run_tests(options1, args, print_version): global s s = options.silent - # prepare run_tests_log and test_states files + # prepare run_tests_log and fail_db files global run_tests_log if options.in_file: run_tests_log = os.getcwd() + os.sep + options.in_file @@ -715,7 +719,7 @@ if __name__ == "__main__": default=False, action="store_true") parser.add_option('--non-interactive', dest='non_interactive', help='Disable interactive status updates', default=False, action="store_true") - parser.add_option('-u', "--update", dest='update', help='Update file with fails (F of FP)', default="") + parser.add_option('-u', "--update-errors", dest='update', help='Update file with fails (F of FP)', default="") parser.add_option('-s', "--silent", dest='silent', help='enable silent mode without any output', default=False, action = "store_true") parser.add_option("--file", dest='in_file', help='file to save run_tests output', default="") From dfc723bc1958f39d4526897fdfd5173a936c09f7 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Mon, 23 Sep 2013 21:35:33 +0400 Subject: [PATCH 102/124] Add fails with gcc 4.4 on Linux --- fail_db.txt | 326 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 326 insertions(+) diff --git a/fail_db.txt b/fail_db.txt index 7adc3e41..23a6c8ca 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -1 +1,327 @@ % List of known fails +./tests/masked-scatter-vector.ispc runfail x86-64 sse2-i32x4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i16x8 Linux LLVM 
3.3 g++4.4 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-11.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-11.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-uint16.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-uint8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-up-uint16.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-up-uint8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/broadcast-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * 
+./tests/broadcast-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/count-leading-trailing-zeros-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/count-leading-trailing-zeros-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/exclusive-scan-add-10.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/exclusive-scan-add-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/exclusive-scan-add-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/exclusive-scan-and-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/exclusive-scan-or-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-uniform-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-uniform-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-uniform-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-varying-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-varying-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-varying-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-varying-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/half-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/idiv.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/int64-max-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/int64-max.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/int64-min-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/int64-min.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-11.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-12.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-13.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/new-delete-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/phi-opts-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/phi-opts-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/popcnt-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/popcnt-2.ispc 
runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/popcnt-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-add-int16-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-add-int16.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-10.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-12.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-13.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/rotate-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/rotate-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/rotate-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/rotate-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/rotate-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/rotate.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/short-vec-14.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle-flatten.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-11.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/soa-27.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/soa-28.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-128.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-129.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-130.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-57.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/uint64-max-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/uint64-max.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/uint64-min-1.ispc runfail x86-64 generic-4 
Linux LLVM 3.3 g++4.4 -O2 * +./tests/uint64-min.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-gather-ifs.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-gather-multi-unif.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-gather-unif.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-mixed-unif-vary-indexing-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-mixed-unif-vary-indexing-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-mixed-unif-vary-indexing.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-multidim-gather.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-scatter-unif-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-scatter-vary.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-struct-gather.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-down-uint16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-down-uint8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-up-uint16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-up-uint8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/broadcast-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * 
+./tests/broadcast-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/cfor-array-gather-ifs.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/cfor-array-gather-unif.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/cfor-array-multidim-gather.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/cfor-array-struct-gather.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/cfor-struct-test-114.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/cfor-unif-struct-test-114.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/count-leading-trailing-zeros-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/count-leading-trailing-zeros-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/exclusive-scan-add-10.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/exclusive-scan-add-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/exclusive-scan-add-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/exclusive-scan-and-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/exclusive-scan-or-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-uniform-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-uniform-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-uniform-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-varying-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-varying-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-varying-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-varying-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/gather-int16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/gather-to-vload-neg-offset.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/global-array-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/half-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/idiv.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/int64-max-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/int64-max.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/int64-min-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/int64-min.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-11.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-12.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-13.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 
g++4.4 -O2 * +./tests/local-atomics-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/masked-scatter-struct.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/masked-scatter-vector.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/nested-structs-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/new-delete-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/pass-varying-lvalue-to-ref.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/phi-opts-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/phi-opts-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/popcnt-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/popcnt-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/popcnt-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-add-int16-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-add-int16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-10.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-12.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-13.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/rotate-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/rotate-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/rotate-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/rotate-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/rotate-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/rotate.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/scatter-int16-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/scatter-int16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/scatter-mask-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/scatter-mask-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/short-vec-12.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/short-vec-14.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle-flatten.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-1.ispc runfail 
x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-11.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/soa-28.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/struct-test-114.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-128.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-129.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-130.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-57.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/uint64-max-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/uint64-max.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/uint64-min-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/uint64-min.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/unif-struct-test-114.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/varying-struct-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/varying-struct-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/varying-struct-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/write-same-loc.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-11.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * 
+./tests/local-atomics-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-8.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-swap.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/memset-varying.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-1.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/masked-scatter-struct.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * From 2a83cefd5b0d3f19f968e9f91702e073211375bb Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 26 Sep 2013 19:07:38 +0400 Subject: [PATCH 103/124] Add fails with gcc 4.7 on Linux --- fail_db.txt | 175 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) diff --git a/fail_db.txt b/fail_db.txt index 23a6c8ca..9cc7a884 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -325,3 +325,178 @@ ./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.4 -O2 * ./tests/atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * ./tests/masked-scatter-struct.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * 
+./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/broadcast-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/half-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-11.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-12.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-13.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/memset-varying.ispc runfail x86-64 generic-4 
Linux LLVM 3.3 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/rotate-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/rotate-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle-flatten.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-10.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-11.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/test-129.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/test-130.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/rotate.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shift1.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shift1.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 
avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/broadcast-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/half-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-11.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-12.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-13.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/memset-varying.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/rotate-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/rotate-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle-flatten.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-10.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-11.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/test-129.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/test-130.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/test-143.ispc 
runfail x86-64 generic-16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-11.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-8.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-swap.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/memset-varying.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-1.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shift1.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shift1.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 * From 5855ae746021553cea0cb4c81c913a71e4fc71f9 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Fri, 27 Sep 2013 02:32:01 +0400 Subject: [PATCH 104/124] Add fails with gcc 4.7 on Mac --- fail_db.txt | 146 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) diff --git a/fail_db.txt b/fail_db.txt index 9cc7a884..b8e58d8b 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -500,3 +500,149 @@ ./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 * ./tests/shift1.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 * ./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86 
sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/broadcast-1.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/half-3.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-1.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-13.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-5.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-9.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 
-O2 * +./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/memset-varying.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/rotate-2.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle-4.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-1.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-10.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-11.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-2.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-3.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-4.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-5.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-6.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-7.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-8.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-9.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/test-129.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/test-130.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-11.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-12.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-13.ispc compfail x86-64 avx2-i32x8 Mac 
LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-5.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-6.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-7.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-8.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-swap.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/memset-varying.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-1.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-12.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-2.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-3.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-7.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/broadcast-1.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/half-3.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-1.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-13.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-5.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-9.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/memset-varying.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/rotate-2.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle-4.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-1.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-10.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-11.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-2.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-3.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-4.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-5.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-6.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-7.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-8.ispc runfail x86-64 generic-4 Mac LLVM 
3.4 g++4.7 -O2 * +./tests/shuffle2-9.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/test-129.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/test-130.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 * From 396aaae098abc2e7a5ed5a02c97254a9f292086e Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Fri, 27 Sep 2013 17:00:17 +0400 Subject: [PATCH 105/124] Add fails with VS2010 on Windows --- fail_db.txt | 216 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 216 insertions(+) diff --git a/fail_db.txt b/fail_db.txt index b8e58d8b..a6608c12 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -646,3 +646,219 @@ ./tests/test-141.ispc runfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 * ./tests/test-143.ispc runfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 * ./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 sse4-i32x4 Windows 
LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\atomics-13.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-11.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-13.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-5.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-6.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-4.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-5.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-6.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\atomics-13.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * 
+.\tests\reduce-equal-10.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-11.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-13.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-5.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-6.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 
avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\avg-down-int8.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\avg-up-int8.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 
avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\avg-down-int8.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\avg-up-int8.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\atomics-13.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-11.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-13.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-5.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-6.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\atomics-13.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-11.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\avg-down-int8.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\avg-up-int8.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * 
+.\tests\switch-8.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * From da52ae844f95ef617ef81af0f0588395109d2994 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Fri, 27 Sep 2013 18:06:28 +0400 Subject: [PATCH 106/124] Adding AVX2 fails on Windows --- fail_db.txt | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/fail_db.txt b/fail_db.txt index a6608c12..59e0a7a6 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -862,3 +862,65 @@ .\tests\switch-12.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-8.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-9.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\test-141.ispc runfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl 
-O2 * +.\tests\uint64-min-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\test-141.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * From 8e71dbd6c12b0fde77ed58c21e4083c84227114e Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Fri, 27 Sep 2013 18:12:12 +0400 Subject: [PATCH 107/124] Adding comments to fail_db.txt --- fail_db.txt | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fail_db.txt b/fail_db.txt index 59e0a7a6..eb3c0fe9 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -1,4 +1,11 @@ -% List of known fails +% List of known fails. +% The list is unordered and contains information about commonly used platforms / configurations. +% Our goas is to maintain this list for Linux, MacOS and Windows with reasonably new compilers. +% Note, that it's important which C++ compiler was used. For example, gcc 4.4 is know to produce +% considerably more fails with generic targets, than gcc 4.7 or later. +% Using old compilers (gcc 4.4 is considered to be relatively old) may cause LLVM bugs. 
+% To avoid them you can use an LLVM self-build.
+%
 ./tests/masked-scatter-vector.ispc runfail x86-64 sse2-i32x4 Linux LLVM 3.3 g++4.4 -O2 *
 ./tests/atomics-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 *
 ./tests/reduce-equal-10.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 *

From 8a39af8f7204640fa802f6eb07403526523d1ea3 Mon Sep 17 00:00:00 2001
From: Dmitry Babokin
Date: Fri, 27 Sep 2013 23:27:05 +0400
Subject: [PATCH 108/124] Release 1.5.0

---
 docs/ReleaseNotes.txt | 60 +++++++++++++++++++++++++++++++++++++++++++
 docs/news.rst         |  8 ++++++
 doxygen.cfg           |  2 +-
 ispc.h                |  2 +-
 4 files changed, 70 insertions(+), 2 deletions(-)

diff --git a/docs/ReleaseNotes.txt b/docs/ReleaseNotes.txt
index 007f283e..a8575ea0 100644
--- a/docs/ReleaseNotes.txt
+++ b/docs/ReleaseNotes.txt
@@ -1,3 +1,63 @@
+=== v1.5.0 === (27 September 2013)
+
+A major new version of ISPC with several new targets and important bug fixes.
+Here's a list of the most important changes, if you are using pre-built
+binaries (which are based on a patched version of LLVM 3.3):
+
+* The naming of targets was changed to explicitly include the data type width
+  and the number of threads in the gang. For example, avx2-i32x8 is an avx2
+  target, which uses 32 bit types as a base and has 8 threads in a gang. The
+  old naming scheme is still supported, but deprecated.
+
+* New SSE4 targets for calculations based on 8 bit and 16 bit data types:
+  sse4-i8x16 and sse4-i16x8.
+
+* New AVX1 target for calculations based on 64 bit data types: avx1-i64x4.
+
+* SVML support was extended and improved.
+
+* The behavior of the -g switch was changed to not affect the optimization level.
+
+* The ISPC debug infrastructure was redesigned. See --help-dev for more info
+  and enjoy the capabilities of the new --debug-phase= and --off-phase=
+  switches.
+
+* Fixed an auto-dispatch bug, which caused AVX code execution when the OS
+  doesn't support AVX (but the hardware does).
+
+* Fixed a bug which discarded the uniform/varying keyword in typedefs.
+
+* Several performance regressions were fixed.
+
+If you are building ISPC yourself, then the following changes are also
+available to you:
+
+* --cpu=slm for targeting Intel Atom codename Silvermont (if LLVM 3.4 is used).
+
+* ARM NEON targets are available (if enabled in the build system).
+
+* --debug-ir= is available to generate debug information based on LLVM
+  IR (if LLVM 3.4 is used). In the debugger you'll see LLVM IR instead of the
+  source code.
+
+* A redesigned and improved test and configuration management system is
+  available to facilitate the process of building LLVM and testing the ISPC
+  compiler.
+
+Standard library changes/fixes:
+
+* The __pause() function was removed from the standard library.
+
+* Fixed the reduce_[min|max]_[float|double] intrinsics, which were producing
+  incorrect code under some conditions.
+
+Language changes:
+
+* By default, a floating point constant without a suffix is a single precision
+  constant (32 bit). A new suffix "d" was introduced to allow double precision
+  constants (64 bit). Please refer to tests/double-consts.ispc for syntax
+  examples.
+
 === v1.4.4 === (19 July 2013)
 
 A minor version update with several stability fixes requested by the customers.
diff --git a/docs/news.rst b/docs/news.rst
index c1c35de3..7d78a662 100644
--- a/docs/news.rst
+++ b/docs/news.rst
@@ -2,6 +2,14 @@
 ispc News
 =========
 
+ispc 1.5.0 is Released
+----------------------
+
+A major update of ``ispc`` has been released with several new targets available
+and a bunch of performance and stability fixes.
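As an illustration of the double precision constant syntax described in the v1.5.0 notes above, here is a minimal hypothetical ISPC kernel; it is not taken from tests/double-consts.ispc, and the function name and the target named in the comment are assumptions:

    // Compile with, e.g.: ispc -O2 --target=avx2-i32x8 scale.ispc -o scale_ispc.o
    export void scale(uniform double vals[], uniform int count) {
        foreach (i = 0 ... count) {
            // Without a suffix, 0.5 is a single precision (32 bit) constant;
            // the "d" suffix makes it a double precision (64 bit) constant.
            vals[i] = vals[i] * 0.5d + 1.0d;
        }
    }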
The released binaries are built +with patched version of LLVM 3.3. Please refer to Release Notes for complete +set of changes. + ispc 1.4.4 is Released ---------------------- diff --git a/doxygen.cfg b/doxygen.cfg index 480d9331..ab4eec20 100644 --- a/doxygen.cfg +++ b/doxygen.cfg @@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler" # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = 1.4.5dev +PROJECT_NUMBER = 1.5.0 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. diff --git a/ispc.h b/ispc.h index 4804832f..4b7ae732 100644 --- a/ispc.h +++ b/ispc.h @@ -38,7 +38,7 @@ #ifndef ISPC_H #define ISPC_H -#define ISPC_VERSION "1.4.5dev" +#define ISPC_VERSION "1.5.0" #if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) #error "Only LLVM 3.1, 3.2, 3.3 and the 3.4 development branch are supported" From 3b4cc9080046983932ea461345344deccd0ad33e Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Sat, 28 Sep 2013 01:32:00 +0400 Subject: [PATCH 109/124] Changing ISPC to 1.5.dev --- doxygen.cfg | 2 +- ispc.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doxygen.cfg b/doxygen.cfg index ab4eec20..a0ad3176 100644 --- a/doxygen.cfg +++ b/doxygen.cfg @@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler" # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = 1.5.0 +PROJECT_NUMBER = 1.5.1dev # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. diff --git a/ispc.h b/ispc.h index 4b7ae732..82cb9050 100644 --- a/ispc.h +++ b/ispc.h @@ -38,7 +38,7 @@ #ifndef ISPC_H #define ISPC_H -#define ISPC_VERSION "1.5.0" +#define ISPC_VERSION "1.5.1dev" #if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) #error "Only LLVM 3.1, 3.2, 3.3 and the 3.4 development branch are supported" From 758efebb3cc166e46169931490fbb42c5f9ffd65 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Mon, 30 Sep 2013 17:54:59 +0400 Subject: [PATCH 110/124] Add missing testing support for avx1-i64x4 target --- alloy.py | 6 +++--- ispc.cpp | 4 ++-- run_tests.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/alloy.py b/alloy.py index 7ae972b4..3f05f4fd 100755 --- a/alloy.py +++ b/alloy.py @@ -200,7 +200,7 @@ def check_targets(): answer = answer + ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"] if AVX == False and "avx" in f_lines[i]: AVX = True; - answer = answer + ["avx1-i32x8", "avx1-i32x16"] + answer = answer + ["avx1-i32x8", "avx1-i32x16", "avx1-i64x4"] if AVX11 == False and "rdrand" in f_lines[i]: AVX11 = True; answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"] @@ -217,7 +217,7 @@ def check_targets(): answer = answer + ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"] if "AVX1.0" in f_lines: AVX = True; - answer = answer + ["avx1-i32x8", "avx1-i32x16"] + answer = answer + ["avx1-i32x8", "avx1-i32x16", "avx1-i64x4"] if "RDRAND" in f_lines: AVX11 = True; answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"] @@ -246,7 +246,7 @@ def check_targets(): if SSE4 == False and "wsm" in f_lines[i]: answer_sde = answer_sde + [["-wsm", "sse4-i32x4"], ["-wsm", "sse4-i32x8"], ["-wsm", "sse4-i16x8"], ["-wsm", "sse4-i8x16"]] if AVX == False and "snb" in f_lines[i]: - answer_sde = answer_sde + [["-snb", 
"avx1-i32x8"], ["-snb", "avx1-i32x16"]] + answer_sde = answer_sde + [["-snb", "avx1-i32x8"], ["-snb", "avx1-i32x16"], ["-snb", "avx1-i64x4"]] if AVX11 == False and "ivb" in f_lines[i]: answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["-ivb", "avx1.1-i32x16"]] if AVX2 == False and "hsw" in f_lines[i]: diff --git a/ispc.cpp b/ispc.cpp index bec7baf7..56b0a25f 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -714,11 +714,11 @@ Target::SupportedTargets() { #endif "sse2-i32x4, sse2-i32x8, " "sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, " - "avx1-i32x8, avx1-i32x16, " + "avx1-i32x8, avx1-i32x16, avx1-i64x4, " "avx1.1-i32x8, avx1.1-i32x16, " "avx2-i32x8, avx2-i32x16, " "generic-x1, generic-x4, generic-x8, generic-x16, " - "generic-x32, generic-x64"; + "generic-x32, generic-x64"; } diff --git a/run_tests.py b/run_tests.py index 64d3462a..4146576c 100755 --- a/run_tests.py +++ b/run_tests.py @@ -449,7 +449,7 @@ def verify(): check = [["g++", "clang", "cl"],["-O0", "-O2"],["x86","x86-64"], ["Linux","Windows","Mac"],["LLVM 3.1","LLVM 3.2","LLVM 3.3","LLVM head"], ["sse2-i32x4", "sse2-i32x8", "sse4-i32x4", "sse4-i32x8", "sse4-i16x8", - "sse4-i8x16", "avx1-i32x8", "avx1-i32x16", "avx1.1-i32x8", "avx1.1-i32x16", + "sse4-i8x16", "avx1-i32x8", "avx1-i32x16", "avx1-i64x4", "avx1.1-i32x8", "avx1.1-i32x16", "avx2-i32x8", "avx2-i32x16", "generic-1", "generic-4", "generic-8", "generic-16", "generic-32", "generic-64"]] for i in range (0,len(f_lines)): From 7942bdb728f8fc9b6cc560303cf6193ed5aba647 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Mon, 30 Sep 2013 18:09:59 +0400 Subject: [PATCH 111/124] Typo fix and copyright update --- docs/ispc.rst | 4 ++-- docs/template-news.txt | 2 +- docs/template-perf.txt | 2 +- docs/template.txt | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/ispc.rst b/docs/ispc.rst index 224faaa9..eac9b24e 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -270,8 +270,8 @@ new reserved words: ``unmasked``, ``foreach_unique``, ``foreach_active``, and ``in``. Any program that happens to have a variable or function with one of these names must be modified to rename that symbol. -Updating ISPC Programs For Changes In ISPC 1.4.5 ----------------------------------------------- +Updating ISPC Programs For Changes In ISPC 1.5.0 +------------------------------------------------ This release adds support for double precision floating point constants. Double precision floating point constants are floating point number with diff --git a/docs/template-news.txt b/docs/template-news.txt index 9a41fbdb..d5eebdd1 100644 --- a/docs/template-news.txt +++ b/docs/template-news.txt @@ -57,7 +57,7 @@ %(body)s
diff --git a/docs/template-perf.txt b/docs/template-perf.txt index 4932e332..9537a836 100644 --- a/docs/template-perf.txt +++ b/docs/template-perf.txt @@ -57,7 +57,7 @@ %(body)s
diff --git a/docs/template.txt b/docs/template.txt index 8cb4f5ab..b9041f19 100644 --- a/docs/template.txt +++ b/docs/template.txt @@ -57,7 +57,7 @@ %(body)s
- From 49cefc2e972bb3d742f74f855cd40b09b57f029b Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Mon, 30 Sep 2013 19:20:18 +0400 Subject: [PATCH 112/124] Updating fail_db for new target --- fail_db.txt | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fail_db.txt b/fail_db.txt index eb3c0fe9..31db9961 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -931,3 +931,21 @@ .\tests\switch-12.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-8.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-9.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1-i64x4 Windows LLVM 3.4 cl -O2 * From 2d6f7a7c93bcbe89c2ec55e99a995d309c2d85b5 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 1 Oct 2013 17:37:34 +0400 Subject: [PATCH 113/124] Support i686 architecture recognition as x86 and enable 32 bit x86 platforms --- examples/common.mk | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/examples/common.mk b/examples/common.mk index 95ec7ccb..330a2453 100644 --- a/examples/common.mk +++ b/examples/common.mk @@ -12,15 +12,22 @@ LIBS=-lm $(TASK_LIB) -lstdc++ ISPC=ispc -O2 $(ISPC_FLAGS) ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h) -ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/arm.*/arm/ -e s/sa110/arm/) +ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/) ifeq ($(ARCH),x86) ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o $(ISPC_SRC:.ispc=)_ispc_sse2.o \ $(ISPC_SRC:.ispc=)_ispc_sse4.o $(ISPC_SRC:.ispc=)_ispc_avx.o) ISPC_TARGETS=$(ISPC_IA_TARGETS) - ISPC_FLAGS += --arch=x86-64 - CXXFLAGS += -m64 - CCFLAGS += -m64 + ARCH_BIT:=$(shell getconf LONG_BIT) + ifeq ($(ARCH_BIT),32) + ISPC_FLAGS += --arch=x86 + CXXFLAGS += -m32 + CCFLAGS += -m32 + else + ISPC_FLAGS += --arch=x86-64 + CXXFLAGS += -m64 + CCFLAGS += -m64 + endif else ifeq ($(ARCH),arm) ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=_ispc.o)) ISPC_TARGETS=$(ISPC_ARM_TARGETS) From b2cf0209b153c072f5e531e23203a68e05d47d87 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Tue, 1 Oct 2013 18:01:29 +0400 Subject: [PATCH 114/124] pipe correction and some other small 
changes in test system --- alloy.py | 4 ++++ common.py | 3 ++- run_tests.py | 15 ++++++++------- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/alloy.py b/alloy.py index 7ae972b4..6b55f85b 100755 --- a/alloy.py +++ b/alloy.py @@ -353,8 +353,12 @@ def validation_run(only, only_targets, reference_branch, number, notify, update, else: common.check_tools(1) if only_targets != "": + only_targets += " " + only_targets = only_targets.replace("generic "," generic-4 generic-16 ") only_targets_t = only_targets.split(" ") for i in only_targets_t: + if i == "": + continue err = True for j in range(0,len(targets_t)): if i in targets_t[j]: diff --git a/common.py b/common.py index 19d09e4d..be3e9526 100644 --- a/common.py +++ b/common.py @@ -83,7 +83,8 @@ def print_debug(line, silent, filename): sys.stdout.write(line) sys.stdout.flush() if os.environ.get("ISPC_HOME") != None: - write_to_file(os.environ["ISPC_HOME"] + os.sep + "notify_log.log", line) + if os.path.exists(os.environ.get("ISPC_HOME")): + write_to_file(os.environ["ISPC_HOME"] + os.sep + "notify_log.log", line) if filename != "": write_to_file(filename, line) diff --git a/run_tests.py b/run_tests.py index abc9b656..7b2f5f29 100755 --- a/run_tests.py +++ b/run_tests.py @@ -332,8 +332,6 @@ def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test skip_files += [ filename ] -task_threads = [] - def sigint(signum, frame): for t in task_threads: t.terminate() @@ -423,6 +421,8 @@ def file_check(compfails, runfails): for i in range (0,len(new_compfails)): new_f_lines.append(new_compfails[i] + " compfail " + new_line) print_debug("\t" + new_compfails[i] + "\n", s, run_tests_log) + if len(new_runfails) == 0 and len(new_compfails) == 0: + print_debug("No new fails\n", s, run_tests_log) if len(new_passes_runfails) != 0: print_debug("NEW PASSES after RUNFAILS:\n", s, run_tests_log) for i in range (0,len(new_passes_runfails)): @@ -561,7 +561,6 @@ def run_tests(options1, args, print_version): # failing_tests/, and tests_errors/ if len(args) == 0: files = glob.glob(ispc_root + os.sep + "tests" + os.sep + "*ispc") + \ - glob.glob(ispc_root + os.sep + "failing_tests" + os.sep + "*ispc") + \ glob.glob(ispc_root + os.sep + "tests_errors" + os.sep + "*ispc") else: if is_windows: @@ -622,12 +621,12 @@ def run_tests(options1, args, print_version): start_time = time.time() # launch jobs to run tests glob_var = [is_windows, options, s, ispc_exe, is_generic_target, run_tests_log] + global task_threads + task_threads = [0] * nthreads for x in range(nthreads): - t = multiprocessing.Process(target=run_tasks_from_queue, args=(q, qret, qskip, total_tests, + task_threads[x] = multiprocessing.Process(target=run_tasks_from_queue, args=(q, qret, qskip, total_tests, max_test_length, finished_tests_counter, finished_tests_counter_lock, glob_var)) - task_threads.append(t) - t.start() - + task_threads[x].start() # wait for them to all finish and then return the number that failed # (i.e. 
return 0 if all is ok) for t in task_threads: @@ -660,6 +659,8 @@ def run_tests(options1, args, print_version): print_debug("%d / %d tests FAILED execution:\n" % (len(run_error_files), total_tests), s, run_tests_log) for f in run_error_files: print_debug("\t%s\n" % f, s, run_tests_log) + if len(compile_error_files) == 0 and len(run_error_files) == 0: + print_debug("No fails\n", s, run_tests_log) R = file_check(compile_error_files, run_error_files) From c7b4164122f7a9cf45a1a2ea30c90064650258dd Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 1 Oct 2013 18:40:26 +0400 Subject: [PATCH 115/124] Redefining ISPC should not discard ISPC_FLAGS --- examples/common.mk | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/common.mk b/examples/common.mk index 330a2453..367d3eb3 100644 --- a/examples/common.mk +++ b/examples/common.mk @@ -9,7 +9,8 @@ CC=gcc CCFLAGS=-Iobjs/ -O2 LIBS=-lm $(TASK_LIB) -lstdc++ -ISPC=ispc -O2 $(ISPC_FLAGS) +ISPC=ispc +ISPC_FLAGS=-O2 ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h) ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/) @@ -68,10 +69,10 @@ objs/%.o: ../%.cpp dirs objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc - $(ISPC) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h + $(ISPC) $(ISPC_FLAGS) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h objs/$(ISPC_SRC:.ispc=)_sse4.cpp: $(ISPC_SRC) - $(ISPC) $< -o $@ --target=generic-4 --emit-c++ --c++-include-file=sse4.h + $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-4 --emit-c++ --c++-include-file=sse4.h objs/$(ISPC_SRC:.ispc=)_sse4.o: objs/$(ISPC_SRC:.ispc=)_sse4.cpp $(CXX) -I../intrinsics -msse4.2 $< $(CXXFLAGS) -c -o $@ @@ -80,7 +81,7 @@ $(EXAMPLE)-sse4: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_sse4.o $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) objs/$(ISPC_SRC:.ispc=)_generic16.cpp: $(ISPC_SRC) - $(ISPC) $< -o $@ --target=generic-16 --emit-c++ --c++-include-file=generic-16.h + $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-16 --emit-c++ --c++-include-file=generic-16.h objs/$(ISPC_SRC:.ispc=)_generic16.o: objs/$(ISPC_SRC:.ispc=)_generic16.cpp $(CXX) -I../intrinsics $< $(CXXFLAGS) -c -o $@ @@ -89,7 +90,7 @@ $(EXAMPLE)-generic16: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_generic16.o $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) objs/$(ISPC_SRC:.ispc=)_scalar.o: $(ISPC_SRC) - $(ISPC) $< -o $@ --target=generic-1 + $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-1 $(EXAMPLE)-scalar: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_scalar.o $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) From dc8895352af94d7042e9e7658035c3c9d35ba8b7 Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Tue, 1 Oct 2013 11:53:56 -0400 Subject: [PATCH 116/124] Adding missing typecasts and guarding i64 __mul with compiler version check --- examples/intrinsics/knc-i1x16.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 2ee6d2f5..ae9c4130 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -1021,9 +1021,13 @@ static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &_b _mm512_mulhi_epi32(a.v, b.v_lo))).cvt2zmm(); } +#if __ICC_VERSION == 1400 static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_mullox_epi64(a.v1, b.v1), _mm512_mullox_epi64(a.v2,b.v2)); } +#else +BINARY_OP(__vec16_i64, __mul, *) +#endif #endif #if 0 @@ 
-2164,7 +2168,7 @@ static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) { __m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1)); __m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2)); - return _mm512_mask_permute4f128_epi32(r0i, 0xFF00, r1i, _MM_PERM_BABA); + return _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(r0i, 0xFF00, r1i, _MM_PERM_BABA)); } #endif @@ -2174,7 +2178,7 @@ CAST(__vec16_d, double, __vec16_f, float, __cast_fpext) static FORCEINLINE __vec16_d __cast_fpext(__vec16_d, __vec16_f val) { __vec16_d ret; ret.v1 = _mm512_cvtpslo_pd(val.v); - __vec16_f other8 = _mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC); + __vec16_f other8 = _mm512_castsi512_ps(_mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC)); ret.v2 = _mm512_cvtpslo_pd(other8); return ret; } From 32c77be2f3537b24890e1334b1a7d2579c58d2c1 Mon Sep 17 00:00:00 2001 From: evghenii Date: Fri, 4 Oct 2013 11:42:52 +0300 Subject: [PATCH 117/124] cleaned mask & int32, only test141 fails --- examples/intrinsics/knc-i1x16.h | 656 +++++++++----------------------- 1 file changed, 190 insertions(+), 466 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index ae9c4130..aae4be57 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -64,69 +64,48 @@ extern "C" } #endif -typedef float __vec1_f; -typedef double __vec1_d; -typedef int8_t __vec1_i8; +typedef float __vec1_f; +typedef double __vec1_d; +typedef int8_t __vec1_i8; typedef int16_t __vec1_i16; typedef int32_t __vec1_i32; typedef int64_t __vec1_i64; -struct __vec16_i1 { - __vec16_i1() { } - __vec16_i1(const __mmask16 &vv) : v(vv) { } - __vec16_i1(bool v0, bool v1, bool v2, bool v3, - bool v4, bool v5, bool v6, bool v7, - bool v8, bool v9, bool v10, bool v11, - bool v12, bool v13, bool v14, bool v15) { - v = ((v0 & 1) | - ((v1 & 1) << 1) | - ((v2 & 1) << 2) | - ((v3 & 1) << 3) | - ((v4 & 1) << 4) | - ((v5 & 1) << 5) | - ((v6 & 1) << 6) | - ((v7 & 1) << 7) | - ((v8 & 1) << 8) | - ((v9 & 1) << 9) | - ((v10 & 1) << 10) | - ((v11 & 1) << 11) | - ((v12 & 1) << 12) | - ((v13 & 1) << 13) | - ((v14 & 1) << 14) | - ((v15 & 1) << 15)); - } - - __mmask16 v; - FORCEINLINE operator __mmask16() const { return v; } +/************ mask **************/ + +struct __vec16_i1 +{ + __mmask16 v; + + FORCEINLINE __vec16_i1() { } + FORCEINLINE __vec16_i1(const __mmask16 &vv) : v(vv) { } + FORCEINLINE __vec16_i1(bool v0, bool v1, bool v2, bool v3, + bool v4, bool v5, bool v6, bool v7, + bool v8, bool v9, bool v10, bool v11, + bool v12, bool v13, bool v14, bool v15) { + v = ((v0 & 1) | + ((v1 & 1) << 1) | + ((v2 & 1) << 2) | + ((v3 & 1) << 3) | + ((v4 & 1) << 4) | + ((v5 & 1) << 5) | + ((v6 & 1) << 6) | + ((v7 & 1) << 7) | + ((v8 & 1) << 8) | + ((v9 & 1) << 9) | + ((v10 & 1) << 10) | + ((v11 & 1) << 11) | + ((v12 & 1) << 12) | + ((v13 & 1) << 13) | + ((v14 & 1) << 14) | + ((v15 & 1) << 15)); + } + + FORCEINLINE operator __mmask16() const { return v; } }; +/************ vector **************/ -template -struct vec16 { - vec16() { } - vec16(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, - T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) { - data[0] = v0; data[1] = v1; data[2] = v2; data[3] = v3; - data[4] = v4; data[5] = v5; data[6] = v6; data[7] = v7; - data[8] = v8; data[9] = v9; data[10] = v10; data[11] = v11; - data[12] = v12; data[13] = v13; data[14] = v14; data[15] = v15; - } - T data[16]; - FORCEINLINE const T& operator[](const int i) const { return 
data[i]; } - FORCEINLINE T& operator[](const int i) { return data[i]; } -}; - -#if 0 /* evghenii:i32 */ -struct PRE_ALIGN(64) __vec16_i32 : public vec16 { - __vec16_i32() { } - __vec16_i32(int32_t v0, int32_t v1, int32_t v2, int32_t v3, - int32_t v4, int32_t v5, int32_t v6, int32_t v7, - int32_t v8, int32_t v9, int32_t v10, int32_t v11, - int32_t v12, int32_t v13, int32_t v14, int32_t v15) - : vec16(v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15) { } -} POST_ALIGN(64); -#else /* evghenii:i32 */ struct PRE_ALIGN(64) __vec16_i32 { __m512i v; @@ -144,81 +123,43 @@ struct PRE_ALIGN(64) __vec16_i32 FORCEINLINE const int32_t& operator[](const int i) const { return ((int32_t*)this)[i]; } FORCEINLINE int32_t& operator[](const int i) { return ((int32_t*)this)[i]; } } POST_ALIGN(64); -#endif /* evghenii:i32 */ -#if 0 /* evghenii::f */ -PRE_ALIGN(64) struct __vec16_f : public vec16 { - __vec16_f() { } - __vec16_f(float v0, float v1, float v2, float v3, - float v4, float v5, float v6, float v7, - float v8, float v9, float v10, float v11, - float v12, float v13, float v14, float v15) - : vec16(v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15) { } - -} POST_ALIGN(64); -#else /* evghenii::f */ PRE_ALIGN(64) struct __vec16_f { - __m512 v; - FORCEINLINE operator __m512() const { return v; } - FORCEINLINE __vec16_f() : v(_mm512_undefined_ps()) { } - FORCEINLINE __vec16_f(const __m512 &in) : v(in) {} - FORCEINLINE __vec16_f(const __vec16_f &o) : v(o.v) {} - FORCEINLINE __vec16_f& operator =(const __vec16_f &o) { v=o.v; return *this; } - FORCEINLINE __vec16_f(float v00, float v01, float v02, float v03, - float v04, float v05, float v06, float v07, - float v08, float v09, float v10, float v11, - float v12, float v13, float v14, float v15) : - v ( _mm512_set_16to16_ps(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00) ) {} - FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; } - FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } + __m512 v; + FORCEINLINE operator __m512() const { return v; } + FORCEINLINE __vec16_f() : v(_mm512_undefined_ps()) { } + FORCEINLINE __vec16_f(const __m512 &in) : v(in) {} + FORCEINLINE __vec16_f(const __vec16_f &o) : v(o.v) {} + FORCEINLINE __vec16_f& operator =(const __vec16_f &o) { v=o.v; return *this; } + FORCEINLINE __vec16_f(float v00, float v01, float v02, float v03, + float v04, float v05, float v06, float v07, + float v08, float v09, float v10, float v11, + float v12, float v13, float v14, float v15) : + v ( _mm512_set_16to16_ps(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00) ) {} + FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; } + FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } } POST_ALIGN(64); -#endif /* evghenii::f */ -#if 0 /* evghenii::d */ -PRE_ALIGN(128) struct __vec16_d : public vec16 { - __vec16_d() { } - __vec16_d(double v0, double v1, double v2, double v3, - double v4, double v5, double v6, double v7, - double v8, double v9, double v10, double v11, - double v12, double v13, double v14, double v15) - : vec16(v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15) { } - -} POST_ALIGN(128); -#else /* evghenii::d */ struct PRE_ALIGN(128) __vec16_d { - __m512d v1; - __m512d v2; - FORCEINLINE __vec16_d() : v1(_mm512_undefined_pd()), v2(_mm512_undefined_pd()) {} - FORCEINLINE __vec16_d(const __m512d _v1, const __m512d _v2) : v1(_v1), 
v2(_v2) {} - FORCEINLINE __vec16_d(const __vec16_d &o) : v1(o.v1), v2(o.v2) {} - FORCEINLINE __vec16_d& operator =(const __vec16_d &o) { v1=o.v1; v2=o.v2; return *this; } - FORCEINLINE __vec16_d(double v00, double v01, double v02, double v03, - double v04, double v05, double v06, double v07, - double v08, double v09, double v10, double v11, - double v12, double v13, double v14, double v15) { - v1 = _mm512_set_8to8_pd(v15, v14, v13, v12, v11, v10, v09, v08); - v2 = _mm512_set_8to8_pd(v07, v06, v05, v04, v03, v02, v01, v00); - } - FORCEINLINE const double& operator[](const int i) const { return ((double*)this)[i]; } - FORCEINLINE double& operator[](const int i) { return ((double*)this)[i]; } + __m512d v1; + __m512d v2; + FORCEINLINE __vec16_d() : v1(_mm512_undefined_pd()), v2(_mm512_undefined_pd()) {} + FORCEINLINE __vec16_d(const __m512d _v1, const __m512d _v2) : v1(_v1), v2(_v2) {} + FORCEINLINE __vec16_d(const __vec16_d &o) : v1(o.v1), v2(o.v2) {} + FORCEINLINE __vec16_d& operator =(const __vec16_d &o) { v1=o.v1; v2=o.v2; return *this; } + FORCEINLINE __vec16_d(double v00, double v01, double v02, double v03, + double v04, double v05, double v06, double v07, + double v08, double v09, double v10, double v11, + double v12, double v13, double v14, double v15) { + v1 = _mm512_set_8to8_pd(v15, v14, v13, v12, v11, v10, v09, v08); + v2 = _mm512_set_8to8_pd(v07, v06, v05, v04, v03, v02, v01, v00); + } + FORCEINLINE const double& operator[](const int i) const { return ((double*)this)[i]; } + FORCEINLINE double& operator[](const int i) { return ((double*)this)[i]; } } POST_ALIGN(128); -#endif /* evghenii::d */ -#if 0 /* evghenii::i64 */ -PRE_ALIGN(128) struct __vec16_i64 : public vec16 { - __vec16_i64() { } - __vec16_i64(int64_t v0, int64_t v1, int64_t v2, int64_t v3, - int64_t v4, int64_t v5, int64_t v6, int64_t v7, - int64_t v8, int64_t v9, int64_t v10, int64_t v11, - int64_t v12, int64_t v13, int64_t v14, int64_t v15) - : vec16(v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15) { } -} POST_ALIGN(128); -#else /* evghenii::i64 */ struct PRE_ALIGN(128) __vec16_i64 { union { @@ -279,7 +220,24 @@ struct PRE_ALIGN(128) __vec16_i64 return __vec16_i64(_v1, _v2); } } POST_ALIGN(128); -#endif /* evghenii::i64 */ + +/************ scalar **************/ + +template +struct vec16 +{ + FORCEINLINE vec16() { } + FORCEINLINE vec16(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) { + data[0] = v0; data[1] = v1; data[2] = v2; data[3] = v3; + data[4] = v4; data[5] = v5; data[6] = v6; data[7] = v7; + data[8] = v8; data[9] = v9; data[10] = v10; data[11] = v11; + data[12] = v12; data[13] = v13; data[14] = v14; data[15] = v15; + } + T data[16]; + FORCEINLINE const T& operator[](const int i) const { return data[i]; } + FORCEINLINE T& operator[](const int i) { return data[i]; } +}; PRE_ALIGN(16) struct __vec16_i8 : public vec16 { __vec16_i8() { } @@ -510,104 +468,54 @@ INSERT_EXTRACT(__vec1_f, float) INSERT_EXTRACT(__vec1_d, double) /////////////////////////////////////////////////////////////////////////// -// mask ops +// mask +/////////////////////////////////////////////////////////////////////////// -static FORCEINLINE __vec16_i1 __movmsk(__vec16_i1 mask) { - return _mm512_kmov(mask); +static FORCEINLINE __vec16_i1 __movmsk(__vec16_i1 mask) { return _mm512_kmov (mask); } +static FORCEINLINE bool __any (__vec16_i1 mask) { return !_mm512_kortestz(mask, mask); } +static FORCEINLINE bool __all (__vec16_i1 mask) { return _mm512_kortestc(mask, 
mask); } +static FORCEINLINE bool __none (__vec16_i1 mask) { return _mm512_kortestz(mask, mask); } +static FORCEINLINE __vec16_i1 __not (__vec16_i1 mask) { return _mm512_knot (mask); } + +static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { return _mm512_kxnor (a,b); } +static FORCEINLINE __vec16_i1 __and (__vec16_i1 a, __vec16_i1 b) { return _mm512_kand (a,b); } +static FORCEINLINE __vec16_i1 __xor (__vec16_i1 a, __vec16_i1 b) { return _mm512_kxor (a,b); } +static FORCEINLINE __vec16_i1 __or (__vec16_i1 a, __vec16_i1 b) { return _mm512_kor (a,b); } +static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) { return _mm512_kandn (a,b); } +static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { return _mm512_kandnr(a,b); } + +static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a, __vec16_i1 b) { return __or(__and(a, mask), __and_not2(b, mask)); } +static FORCEINLINE __vec16_i1 __select( bool cond, __vec16_i1 a, __vec16_i1 b) { return cond ? a : b; } + +static FORCEINLINE bool __extract_element(__vec16_i1 vec, int index) { return (vec.v & (1 << index)) ? true : false; } +static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index, bool val) +{ + if (val == false) vec->v &= ~(1 << index); + else vec->v |= (1 << index); } -static FORCEINLINE bool __any(__vec16_i1 mask) { - return !_mm512_kortestz(mask, mask); +template static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) +{ + return *p; } -static FORCEINLINE bool __all(__vec16_i1 mask) { - return _mm512_kortestc(mask, mask); -} - -static FORCEINLINE bool __none(__vec16_i1 mask) { - return _mm512_kortestz(mask, mask); -} - -static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kxnor(a,b); -} -static FORCEINLINE __vec16_i1 __and(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kand(a, b); -} - -static FORCEINLINE __vec16_i1 __xor(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kxor(a, b); -} - -static FORCEINLINE __vec16_i1 __or(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kor(a, b); -} - -static FORCEINLINE __vec16_i1 __not(__vec16_i1 a) { - return _mm512_knot(a); -} - -static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kandn(a, b); -} - -static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kandnr(a, b); -} - -static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a, - __vec16_i1 b) { -// return ((a & mask) | (b & ~mask)); - return __or(__and(a, mask), __and_not2(b, mask)); -} - -static FORCEINLINE __vec16_i1 __select(bool cond, __vec16_i1 a, __vec16_i1 b) { - return cond ? a : b; -} - - -static FORCEINLINE bool __extract_element(__vec16_i1 vec, int index) { - return (vec.v & (1 << index)) ? 
true : false; -} - -static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index, - bool val) { - if (val == false) - vec->v &= ~(1 << index); - else - vec->v |= (1 << index); -} - -template static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) { - uint16_t *ptr = (uint16_t *)p; - __vec16_i1 r; - r.v = *ptr; - return r; -} - -template static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v) { - uint16_t *ptr = (uint16_t *)p; - *ptr = v.v; +template static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v) +{ + *p = v; } template RetVecType __smear_i1(int i); -template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { - return i?0xFFFF:0x0; -} +template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { return i?0xFFFF:0x0; } template RetVecType __setzero_i1(); -template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { - return 0; -} +template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { return 0; } template __vec16_i1 __undef_i1(); -template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { - return __vec16_i1(); -} - +template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { return __vec16_i1(); } /////////////////////////////////////////////////////////////////////////// // int8 +/////////////////////////////////////////////////////////////////////////// BINARY_OP(__vec16_i8, __add, +) BINARY_OP(__vec16_i8, __sub, -) @@ -653,6 +561,7 @@ LOAD_STORE(__vec16_i8, int8_t) /////////////////////////////////////////////////////////////////////////// // int16 +/////////////////////////////////////////////////////////////////////////// BINARY_OP(__vec16_i16, __add, +) BINARY_OP(__vec16_i16, __sub, -) @@ -696,232 +605,57 @@ ROTATE(__vec16_i16, i16, int16_t) SHUFFLES(__vec16_i16, i16, int16_t) LOAD_STORE(__vec16_i16, int16_t) -#if 0 /* evghenii::int32 */ -/////////////////////////////////////////////////////////////////////////// -// int32 - -BINARY_OP(__vec16_i32, __add, +) -BINARY_OP(__vec16_i32, __sub, -) -BINARY_OP(__vec16_i32, __mul, *) - -BINARY_OP(__vec16_i32, __or, |) -BINARY_OP(__vec16_i32, __and, &) -BINARY_OP(__vec16_i32, __xor, ^) -BINARY_OP(__vec16_i32, __shl, <<) - -BINARY_OP_CAST(__vec16_i32, uint32_t, __udiv, /) -BINARY_OP_CAST(__vec16_i32, int32_t, __sdiv, /) - -BINARY_OP_CAST(__vec16_i32, uint32_t, __urem, %) -BINARY_OP_CAST(__vec16_i32, int32_t, __srem, %) -BINARY_OP_CAST(__vec16_i32, uint32_t, __lshr, >>) -BINARY_OP_CAST(__vec16_i32, int32_t, __ashr, >>) - -SHIFT_UNIFORM(__vec16_i32, uint32_t, __lshr, >>) -SHIFT_UNIFORM(__vec16_i32, int32_t, __ashr, >>) -SHIFT_UNIFORM(__vec16_i32, int32_t, __shl, <<) - -CMP_OP(__vec16_i32, i32, int32_t, __equal, ==) -CMP_OP(__vec16_i32, i32, int32_t, __not_equal, !=) -CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_equal, <=) -CMP_OP(__vec16_i32, i32, int32_t, __signed_less_equal, <=) -CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_equal, >=) -CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_equal, >=) -CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_than, <) -CMP_OP(__vec16_i32, i32, int32_t, __signed_less_than, <) -CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_than, >) -CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_than, >) - -SELECT(__vec16_i32) -INSERT_EXTRACT(__vec16_i32, int32_t) -SMEAR(__vec16_i32, i32, int32_t) -SETZERO(__vec16_i32, i32) -UNDEF(__vec16_i32, i32) -BROADCAST(__vec16_i32, i32, int32_t) -ROTATE(__vec16_i32, i32, int32_t) -SHUFFLES(__vec16_i32, i32, int32_t) -LOAD_STORE(__vec16_i32, int32_t) - -#else /* 
evghenii::int32 */ /////////////////////////////////////////////////////////////////////////// // int32 /////////////////////////////////////////////////////////////////////////// -static FORCEINLINE __vec16_i32 __add(__vec16_i32 a, __vec16_i32 b) { - return _mm512_add_epi32(a, b); -} +static FORCEINLINE __vec16_i32 __add (__vec16_i32 a, __vec16_i32 b) { return _mm512_add_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __sub (__vec16_i32 a, __vec16_i32 b) { return _mm512_sub_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __mul (__vec16_i32 a, __vec16_i32 b) { return _mm512_mullo_epi32(a,b); } +static FORCEINLINE __vec16_i32 __udiv(__vec16_i32 a, __vec16_i32 b) { return _mm512_div_epu32 (a,b); } +static FORCEINLINE __vec16_i32 __sdiv(__vec16_i32 a, __vec16_i32 b) { return _mm512_div_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __urem(__vec16_i32 a, __vec16_i32 b) { return _mm512_rem_epu32 (a,b); } +static FORCEINLINE __vec16_i32 __srem(__vec16_i32 a, __vec16_i32 b) { return _mm512_rem_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __or (__vec16_i32 a, __vec16_i32 b) { return _mm512_or_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __and (__vec16_i32 a, __vec16_i32 b) { return _mm512_and_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __xor (__vec16_i32 a, __vec16_i32 b) { return _mm512_xor_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __shl (__vec16_i32 a, __vec16_i32 b) { return _mm512_sllv_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, __vec16_i32 b) { return _mm512_srlv_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, __vec16_i32 b) { return _mm512_srav_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __shl (__vec16_i32 a, int32_t n) { return _mm512_slli_epi32 (a,n); } +static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, int32_t n) { return _mm512_srli_epi32 (a,n); } +static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, int32_t n) { return _mm512_srai_epi32 (a,n); } -static FORCEINLINE __vec16_i32 __sub(__vec16_i32 a, __vec16_i32 b) { - return _mm512_sub_epi32(a, b); -} +static FORCEINLINE __vec16_i1 __equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpeq_epi32_mask (a,b); } +static FORCEINLINE __vec16_i1 __not_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpneq_epi32_mask(a,b); } +static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmple_epu32_mask (a,b); } +static FORCEINLINE __vec16_i1 __signed_less_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmple_epi32_mask (a,b); } +static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpge_epu32_mask (a,b); } +static FORCEINLINE __vec16_i1 __signed_greater_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpge_epi32_mask (a,b); } +static FORCEINLINE __vec16_i1 __unsigned_less_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmplt_epu32_mask (a,b); } +static FORCEINLINE __vec16_i1 __signed_less_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmplt_epi32_mask (a,b); } +static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpgt_epu32_mask (a,b); } +static FORCEINLINE __vec16_i1 __signed_greater_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpgt_epi32_mask (a,b); } -static FORCEINLINE __vec16_i32 __mul(__vec16_i32 a, __vec16_i32 b) { - return _mm512_mullo_epi32(a, b); -} +static FORCEINLINE __vec16_i1 __equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return 
_mm512_mask_cmpeq_epi32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __not_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpneq_epi32_mask(m,a,b); } +static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmple_epu32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __signed_less_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmple_epi32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpge_epu32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __signed_greater_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpge_epi32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __unsigned_less_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmplt_epu32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __signed_less_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmplt_epi32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpgt_epu32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __signed_greater_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpgt_epi32_mask (m,a,b); } -static FORCEINLINE __vec16_i32 __udiv(__vec16_i32 a, __vec16_i32 b) { - return _mm512_div_epu32(a, b); -} +static FORCEINLINE __vec16_i32 __select(__vec16_i1 mask, __vec16_i32 a, __vec16_i32 b) { return _mm512_mask_mov_epi32(b, mask, a); } +static FORCEINLINE __vec16_i32 __select( bool cond, __vec16_i32 a, __vec16_i32 b) { return cond ? 
a : b; } -static FORCEINLINE __vec16_i32 __sdiv(__vec16_i32 a, __vec16_i32 b) { - return _mm512_div_epi32(a, b); -} - -static FORCEINLINE __vec16_i32 __urem(__vec16_i32 a, __vec16_i32 b) { - return _mm512_rem_epu32(a, b); -} - -static FORCEINLINE __vec16_i32 __srem(__vec16_i32 a, __vec16_i32 b) { - return _mm512_rem_epi32(a, b); -} - -static FORCEINLINE __vec16_i32 __or(__vec16_i32 a, __vec16_i32 b) { - return _mm512_or_epi32(a, b); -} - -static FORCEINLINE __vec16_i32 __and(__vec16_i32 a, __vec16_i32 b) { - return _mm512_and_epi32(a, b); -} - -static FORCEINLINE __vec16_i32 __xor(__vec16_i32 a, __vec16_i32 b) { - return _mm512_xor_epi32(a, b); -} - -static FORCEINLINE __vec16_i32 __shl(__vec16_i32 a, __vec16_i32 b) { - return _mm512_sllv_epi32(a, b); -} - -static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, __vec16_i32 b) { - return _mm512_srlv_epi32(a, b); -} - -static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, __vec16_i32 b) { - return _mm512_srav_epi32(a, b); -} - -static FORCEINLINE __vec16_i32 __shl(__vec16_i32 a, int32_t n) { - return _mm512_slli_epi32(a, n); -} - -static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, int32_t n) { - return _mm512_srli_epi32(a, n); -} - -static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, int32_t n) { - return _mm512_srai_epi32(a, n); -} - -static FORCEINLINE __vec16_i1 __equal_i32(const __vec16_i32 &a, const __vec16_i32 &b) { - return _mm512_cmpeq_epi32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __equal_i32_and_mask(const __vec16_i32 &a, const __vec16_i32 &b, - __vec16_i1 m) { - return _mm512_mask_cmpeq_epi32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __not_equal_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmpneq_epi32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __not_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmpneq_epi32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmple_epu32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmple_epu32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __signed_less_equal_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmple_epi32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __signed_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmple_epi32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmpge_epu32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmpge_epu32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __signed_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmpge_epi32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __signed_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmpge_epi32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __unsigned_less_than_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmplt_epu32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __unsigned_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmplt_epu32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __signed_less_than_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmplt_epi32_mask(a, b); -} - -static FORCEINLINE 
__vec16_i1 __signed_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmplt_epi32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmpgt_epu32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmpgt_epu32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __signed_greater_than_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmpgt_epi32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __signed_greater_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmpgt_epi32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i32 __select(__vec16_i1 mask, - __vec16_i32 a, __vec16_i32 b) { - return _mm512_mask_mov_epi32(b.v, mask, a.v); -} - -static FORCEINLINE __vec16_i32 __select(bool cond, __vec16_i32 a, __vec16_i32 b) { - return cond ? a : b; -} - -static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int index) { //uint32_t index) { - return ((int32_t *)&v)[index]; -} - -static FORCEINLINE void __insert_element(__vec16_i32 *v, uint32_t index, int32_t val) { - ((int32_t *)v)[index] = val; -} +static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int32_t index) { return v[index]; } +static FORCEINLINE void __insert_element (__vec16_i32 *v, uint32_t index, int32_t val) { (*v)[index] = val; } template RetVecType __smear_i32(int32_t i); -template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { - return _mm512_set1_epi32(i); -} +template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); } static const __vec16_i32 __ispc_one = __smear_i32<__vec16_i32>(1); static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32); @@ -929,66 +663,56 @@ static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1); static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); template RetVecType __setzero_i32(); -template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { - return _mm512_setzero_epi32(); -} +template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { return _mm512_setzero_epi32(); } template RetVecType __undef_i32(); -template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { - return __vec16_i32(); +template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { return __vec16_i32(); } + +static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) { return _mm512_mask_permutevar_epi32(v, 0xFFFF, _mm512_set1_epi32(index), v); } + +static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index) +{ + __vec16_i32 idx = __smear_i32<__vec16_i32>(index); + __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0xF)); + return _mm512_mask_permutevar_epi32(v, 0xFFFF, shuffle, v); } -static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) { - int32_t val = __extract_element(v, index & 0xf); - return _mm512_set1_epi32(val); +static FORCEINLINE __vec16_i32 __shuffle_i32 (__vec16_i32 v, __vec16_i32 index) +{ + return _mm512_mask_permutevar_epi32(v, 0xFFFF, __and(index, __smear_i32<__vec16_i32>(0xF)), v); +} +static FORCEINLINE __vec16_i32 __shuffle2_i32(__vec16_i32 v0, __vec16_i32 v1, __vec16_i32 index) +{ + const __vec16_i1 mask = __signed_less_than_i32(index, 
__smear_i32<__vec16_i32>(0x10)); + index = __and(index, __smear_i32<__vec16_i32>(0xF)); + __vec16_i32 ret = __undef_i32<__vec16_i32>(); + ret = _mm512_mask_permutevar_epi32(ret, mask, index, v0); + ret = _mm512_mask_permutevar_epi32(ret, __not(mask), index, v1); + return ret; } -#if 0 /* evghenii::doesn't work */ -static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index) { - __vec16_i32 idx = __smear_i32<__vec16_i32>(index); - __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0x7)); - return _mm512_mask_permutevar_epi32(v, 0xffff, shuffle, v); -} -#else -ROTATE(__vec16_i32, i32, int32_t) -#endif - -static FORCEINLINE __vec16_i32 __shuffle_i32(__vec16_i32 v, __vec16_i32 index) { - return _mm512_mask_permutevar_epi32(v, 0xffff, index, v); -} -SHUFFLE2(__vec16_i32, i32, int32_t) /* evghenii::to implement */ - -template static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) { +template static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - return _mm512_load_epi32(p); + return __load<64>(p); #else - __vec16_i32 v; - v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - v = _mm512_extloadunpackhi_epi32(v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - return v; + __vec16_i32 v; + v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_extloadunpackhi_epi32(v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + return v; #endif } - -template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 v) { +template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 v) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - _mm512_store_epi32(p, v); + __store<64>(p,v); #else - _mm512_extpackstorelo_epi32(p, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_epi32((uint8_t*)p+64, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_epi32( p, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); #endif } -#if 0 -template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) { - return _mm512_load_epi32(p); -} -template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) { - _mm512_store_epi32(p, v); -} -#endif -#endif /* evghenii::int32 */ - /////////////////////////////////////////////////////////////////////////// // int64 // evghenii::int64 From 57f019a6e02db5b90f9310b1f19114c0c93926ee Mon Sep 17 00:00:00 2001 From: evghenii Date: Fri, 4 Oct 2013 13:39:15 +0300 Subject: [PATCH 118/124] cleaned int64 added fails info --- examples/intrinsics/knc-i1x16.h | 162 +++++++++++++------------------- 1 file changed, 67 insertions(+), 95 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index aae4be57..934d90b6 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -715,19 +715,18 @@ template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 /////////////////////////////////////////////////////////////////////////// // int64 -// evghenii::int64 +/////////////////////////////////////////////////////////////////////////// -#if 0 -BINARY_OP(__vec16_i64, __add, +) -BINARY_OP(__vec16_i64, __sub, -) -BINARY_OP(__vec16_i64, __mul, *) -#else -static FORCEINLINE __vec16_i64 __add(__vec16_i64 a, __vec16_i64 b) { +static FORCEINLINE __vec16_i64 __add(__vec16_i64 a, __vec16_i64 b) +{ return 
__vec16_i64(_mm512_add_epi64(a.v1, b.v1), _mm512_add_epi64(a.v2,b.v2)); } -static FORCEINLINE __vec16_i64 __sub(__vec16_i64 _a, __vec16_i64 _b) { -// return __vec16_i64(_mm512_sub_epi64(_a.v1, _b.v1), _mm512_sub_epi64(_a.v2,_b.v2)); +static FORCEINLINE __vec16_i64 __sub(__vec16_i64 _a, __vec16_i64 _b) +{ + // this intrinsic doesn't exist :S + // return __vec16_i64(_mm512_sub_epi64(_a.v1, _b.v1), _mm512_sub_epi64(_a.v2,_b.v2)); + // use knc.h implementation const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); __vec16_i64 ret; @@ -745,34 +744,30 @@ static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &_b _mm512_mulhi_epi32(a.v, b.v_lo))).cvt2zmm(); } -#if __ICC_VERSION == 1400 -static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_mullox_epi64(a.v1, b.v1), _mm512_mullox_epi64(a.v2,b.v2)); -} +static FORCEINLINE __vec16_i64 __mul(__vec16_i64 _a, __vec16_i64 _b) +{ +#if __ICC >= 1400 + return __vec16_i64(_mm512_mullox_epi64(_a.v1,_b.v1), _mm512_mullox_epi64(_a.v2,_b.v2)); #else -BINARY_OP(__vec16_i64, __mul, *) + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + __vec16_i32 lo = _mm512_mullo_epi32(a.v_lo,b.v_lo); + __vec16_i32 hi_m1 = _mm512_mulhi_epi32(a.v_lo, b.v_lo); + __vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo); + __vec16_i32 hi_m3 = _mm512_mullo_epi32(a.v_lo, b.v_hi); + __mmask16 carry = 0; + __vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m1, &carry); + __vec16_i32 hi = _mm512_adc_epi32(hi_m3, carry, hi_p23, &carry); + return __vec16_i64(hi,lo).cvt2zmm(); #endif -#endif - -#if 0 -BINARY_OP(__vec16_i64, __or, |) -BINARY_OP(__vec16_i64, __and, &) -BINARY_OP(__vec16_i64, __xor, ^) -BINARY_OP(__vec16_i64, __shl, <<) -#else -static FORCEINLINE __vec16_i64 __or(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_or_epi64(a.v1, b.v1), _mm512_or_epi64(a.v2, b.v2)); } -static FORCEINLINE __vec16_i64 __and(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_and_epi64(a.v1, b.v1), _mm512_and_epi64(a.v2, b.v2)); -} +static FORCEINLINE __vec16_i64 __or (__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_or_epi64 (a.v1, b.v1), _mm512_or_epi64 (a.v2, b.v2)); } +static FORCEINLINE __vec16_i64 __and(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_and_epi64(a.v1, b.v1), _mm512_and_epi64(a.v2, b.v2)); } +static FORCEINLINE __vec16_i64 __xor(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_xor_epi64(a.v1, b.v1), _mm512_xor_epi64(a.v2, b.v2)); } -static FORCEINLINE __vec16_i64 __xor(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_xor_epi64(a.v1, b.v1), _mm512_xor_epi64(a.v2, b.v2)); -} - -static FORCEINLINE __vec16_i64 __shl(__vec16_i64 _a, __vec16_i64 _b) { +static FORCEINLINE __vec16_i64 __shl(__vec16_i64 _a, __vec16_i64 _b) +{ const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); __vec16_i32 xfer = _mm512_srlv_epi32(a.v_lo, _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); @@ -780,35 +775,16 @@ static FORCEINLINE __vec16_i64 __shl(__vec16_i64 _a, __vec16_i64 _b) { __vec16_i32 lo = _mm512_sllv_epi32(a.v_lo, b.v_lo); return __vec16_i64(hi,lo).cvt2zmm(); } -#endif -#if 0 -BINARY_OP_CAST(__vec16_i64, uint64_t, __udiv, /) -BINARY_OP_CAST(__vec16_i64, int64_t, __sdiv, /) -#else -static FORCEINLINE __vec16_i64 __udiv(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_div_epu64(a.v1,b.v1), _mm512_div_epu64(a.v2,b.v2)); -} -static FORCEINLINE __vec16_i64 __sdiv(__vec16_i64 a, __vec16_i64 b) { - return 
__vec16_i64(_mm512_div_epi64(a.v1,b.v1), _mm512_div_epi64(a.v2,b.v2)); -} -#endif +static FORCEINLINE __vec16_i64 __udiv(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_div_epu64(a.v1,b.v1), _mm512_div_epu64(a.v2,b.v2)); } +static FORCEINLINE __vec16_i64 __sdiv(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_div_epi64(a.v1,b.v1), _mm512_div_epi64(a.v2,b.v2)); } -#if 0 -BINARY_OP_CAST(__vec16_i64, uint64_t, __urem, %) -BINARY_OP_CAST(__vec16_i64, int64_t, __srem, %) -#else -static FORCEINLINE __vec16_i64 __urem(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_rem_epu64(a.v1,b.v1), _mm512_rem_epu64(a.v2,b.v2)); -} -static FORCEINLINE __vec16_i64 __srem(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_rem_epi64(a.v1,b.v1), _mm512_rem_epi64(a.v2,b.v2)); -} -#endif +static FORCEINLINE __vec16_i64 __urem(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_rem_epu64(a.v1,b.v1), _mm512_rem_epu64(a.v2,b.v2)); } +static FORCEINLINE __vec16_i64 __srem(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_rem_epi64(a.v1,b.v1), _mm512_rem_epi64(a.v2,b.v2)); } #if 1 BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>) -#else /* evghenii::fails idiv.ispc */ +#else /* knc::fails ./tests/idiv.ispc */ static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 _a, __vec16_i64 _b) { const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); @@ -829,7 +805,7 @@ static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 _a, __vec16_i64 _b) { #if 1 BINARY_OP_CAST(__vec16_i64, int64_t, __ashr, >>) -#else /* evghenii::fails idiv.ispc */ +#else /* knc::fails ./tests/idiv.ispc */ static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 _a, __vec16_i64 _b) { const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); @@ -848,31 +824,30 @@ SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<) #if 1 CMP_OP(__vec16_i64, i64, int64_t, __equal, ==) -#else /* evghenii::fails ./tests/reduce-equal-8.ispc, some other test hang... */ -static FORCEINLINE __vec16_i1 __equal_i64(const __vec16_i64 &_a, const __vec16_i64 &_b) { +CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) +#else /* knc::fails ./tests/reduce-equal-8.ispc , knc::hangs foreach-unique-6.ispc funcptr-null-[2-6].ispc funcptr-uniform-9.ispc funcptr-varying-5.ispc */ +static FORCEINLINE __vec16_i1 __equal_i64(__vec16_i64 _a, __vec16_i64 _b) +{ const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); const __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo); return _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); } -static FORCEINLINE __vec16_i1 __not_equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) { - return __not(__equal_i64(a,b)); -} -#endif - -#if 1 -CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) -#else /* evghenii::fails ./tests/reduce-equal-8.ispc, some other test hang... 
*/ -static FORCEINLINE __vec16_i1 __equal_i64_and_mask(const __vec16_i64 &_a, const __vec16_i64 &_b, - __vec16_i1 mask) { +static FORCEINLINE __vec16_i1 __equal_i64_and_mask(__vec16_i64 _a, __vec16_i64 _b, __vec16_i1 mask) +{ const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo); __mmask16 full_match = _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); return _mm512_kand(full_match, (__mmask16)mask); } -static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(const __vec16_i64 &a, const __vec16_i64 &b, - __vec16_i1 mask) { + +static FORCEINLINE __vec16_i1 __not_equal_i64(__vec16_i64 a, __vec16_i64 b) +{ + return __not(__equal_i64(a,b)); +} +static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(__vec16_i64 a, __vec16_i64 b, __vec16_i1 mask) +{ return __and(__not(__equal_i64(a,b)), mask); } #endif @@ -888,46 +863,39 @@ CMP_OP(__vec16_i64, i64, int64_t, __signed_less_than, <) CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_than, >) CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >) -#if 0 -SELECT(__vec16_i64) -#else -static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask, - __vec16_i64 a, __vec16_i64 b) { +static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask, __vec16_i64 a, __vec16_i64 b) +{ __vec16_i64 ret; ret.v_hi = _mm512_mask_mov_epi64(b.v_hi, mask, a.v_hi); ret.v_lo = _mm512_mask_mov_epi64(b.v_lo, mask >> 8, a.v_lo); return ret; } -#endif INSERT_EXTRACT(__vec16_i64, int64_t) -#if 0 -SMEAR(__vec16_i64, i64, int64_t) -SETZERO(__vec16_i64, i64) -UNDEF(__vec16_i64, i64) -BROADCAST(__vec16_i64, i64, int64_t) -#else + template RetVecType __smear_i64(const int64_t &l); -template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) { return __vec16_i64(_mm512_set1_epi64(l), _mm512_set1_epi64(l)); } +template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) { return __vec16_i64(_mm512_set1_epi64(l), _mm512_set1_epi64(l)); } template RetVecType __setzero_i64(); -template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { return __vec16_i64(_mm512_setzero_epi32(), _mm512_setzero_epi32()); } +template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { return __vec16_i64(_mm512_setzero_epi32(), _mm512_setzero_epi32()); } template RetVecType __undef_i64(); -template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { return __vec16_i64(_mm512_undefined_epi32(), _mm512_undefined_epi32()); } +template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { return __vec16_i64(_mm512_undefined_epi32(), _mm512_undefined_epi32()); } -static FORCEINLINE __vec16_i64 __broadcast_i64(__vec16_i64 v, int index) { - int64_t val = __extract_element(v, index & 0xf); - return __smear_i64<__vec16_i64>(val); +static FORCEINLINE __vec16_i64 __broadcast_i64(__vec16_i64 v, int index) +{ + int64_t val = __extract_element(v, index & 0xf); + return __smear_i64<__vec16_i64>(val); } -#endif -ROTATE(__vec16_i64, i64, int64_t) + +ROTATE (__vec16_i64, i64, int64_t) SHUFFLES(__vec16_i64, i64, int64_t) -#if 0 -LOAD_STORE(__vec16_i64, int64_t) -#else + template static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return __load<128>(p); +#else __vec16_i32 v1; __vec16_i32 v2; v2 = _mm512_extloadunpacklo_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); @@ -935,6 +903,7 @@ template static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) v1 = _mm512_extloadunpacklo_epi32(v1, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); v1 
= _mm512_extloadunpackhi_epi32(v1, (uint8_t*)p+128, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); return __vec16_i64(v2,v1); +#endif } template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) @@ -948,12 +917,16 @@ template <> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { template static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64 v) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return __store<128>(p,v); +#else __m512i v1 = v.v2; __m512i v2 = v.v1; _mm512_extpackstorelo_epi32(p, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); _mm512_extpackstorehi_epi32((uint8_t*)p+64, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); _mm512_extpackstorelo_epi32((uint8_t*)p+64, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); _mm512_extpackstorehi_epi32((uint8_t*)p+128, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +#endif } template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) @@ -965,7 +938,6 @@ template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) } template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); } -#endif #if 0 /* evghenii::float */ From 8a6789ef61e006866ead9e0c5d0cfa1db39cd8c5 Mon Sep 17 00:00:00 2001 From: evghenii Date: Fri, 4 Oct 2013 14:11:09 +0300 Subject: [PATCH 119/124] cleaned float added fails info --- examples/intrinsics/knc-i1x16.h | 348 ++++++++++---------------------- 1 file changed, 107 insertions(+), 241 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 934d90b6..87f54dfa 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -940,217 +940,113 @@ template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); } -#if 0 /* evghenii::float */ -/////////////////////////////////////////////////////////////////////////// -// float - -BINARY_OP(__vec16_f, __add, +) -BINARY_OP(__vec16_f, __sub, -) -BINARY_OP(__vec16_f, __mul, *) -BINARY_OP(__vec16_f, __div, /) - -CMP_OP(__vec16_f, float, float, __equal, ==) -CMP_OP(__vec16_f, float, float, __not_equal, !=) -CMP_OP(__vec16_f, float, float, __less_than, <) -CMP_OP(__vec16_f, float, float, __less_equal, <=) -CMP_OP(__vec16_f, float, float, __greater_than, >) -CMP_OP(__vec16_f, float, float, __greater_equal, >=) - -static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) { - __vec16_i1 ret; - ret.v = 0; - for (int i = 0; i < 16; ++i) - ret.v |= ((a[i] == a[i]) && (b[i] == b[i])) ? (1 << i) : 0; - return ret; -} - -static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) { - __vec16_i1 ret; - ret.v = 0; - for (int i = 0; i < 16; ++i) - ret.v |= ((a[i] != a[i]) || (b[i] != b[i])) ? 
(1 << i) : 0; - return ret; -} - -#if 0 - case Instruction::FRem: intrinsic = "__frem"; break; -#endif - -SELECT(__vec16_f) -INSERT_EXTRACT(__vec16_f, float) -SMEAR(__vec16_f, float, float) -SETZERO(__vec16_f, float) -UNDEF(__vec16_f, float) -BROADCAST(__vec16_f, float, float) -ROTATE(__vec16_f, float, float) -SHUFFLES(__vec16_f, float, float) -LOAD_STORE(__vec16_f, float) -#else /* evghenii::float */ - /////////////////////////////////////////////////////////////////////////// // float /////////////////////////////////////////////////////////////////////////// -static FORCEINLINE __vec16_f __add(__vec16_f a, __vec16_f b) { - return _mm512_add_ps(a, b); -} +static FORCEINLINE __vec16_f __add(__vec16_f a, __vec16_f b) { return _mm512_add_ps(a,b); } +static FORCEINLINE __vec16_f __sub(__vec16_f a, __vec16_f b) { return _mm512_sub_ps(a,b); } +static FORCEINLINE __vec16_f __mul(__vec16_f a, __vec16_f b) { return _mm512_mul_ps(a,b); } +static FORCEINLINE __vec16_f __div(__vec16_f a, __vec16_f b) { return _mm512_div_ps(a,b); } -static FORCEINLINE __vec16_f __sub(__vec16_f a, __vec16_f b) { - return _mm512_sub_ps(a, b); -} +static FORCEINLINE __vec16_i1 __equal_float (__vec16_f a, __vec16_f b) { return _mm512_cmpeq_ps_mask (a,b); } +static FORCEINLINE __vec16_i1 __not_equal_float (__vec16_f a, __vec16_f b) { return _mm512_cmpneq_ps_mask(a,b); } +static FORCEINLINE __vec16_i1 __less_than_float (__vec16_f a, __vec16_f b) { return _mm512_cmplt_ps_mask (a,b); } +static FORCEINLINE __vec16_i1 __less_equal_float (__vec16_f a, __vec16_f b) { return _mm512_cmple_ps_mask (a,b); } +static FORCEINLINE __vec16_i1 __greater_than_float (__vec16_f a, __vec16_f b) { return _mm512_cmp_ps_mask (a,b,_CMP_GT_OS); } +static FORCEINLINE __vec16_i1 __greater_equal_float(__vec16_f a, __vec16_f b) { return _mm512_cmp_ps_mask (a,b,_CMP_GE_OS); } -static FORCEINLINE __vec16_f __mul(__vec16_f a, __vec16_f b) { - return _mm512_mul_ps(a, b); -} +static FORCEINLINE __vec16_i1 __equal_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmpeq_ps_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __not_equal_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmpneq_ps_mask(m,a,b); } +static FORCEINLINE __vec16_i1 __less_than_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmplt_ps_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __less_equal_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmple_ps_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __greater_than_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmp_ps_mask (m,a,b,_CMP_GT_OS); } +static FORCEINLINE __vec16_i1 __greater_equal_float_and_mask(__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmp_ps_mask (m,a,b,_CMP_GE_OS); } -static FORCEINLINE __vec16_f __div(__vec16_f a, __vec16_f b) { - return _mm512_div_ps(a, b); -} +static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) { return _mm512_cmpord_ps_mask (a,b); } +static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) { return _mm512_cmpunord_ps_mask(a,b); } +static FORCEINLINE __vec16_f __select(__vec16_i1 mask, __vec16_f a, __vec16_f b) { return _mm512_mask_mov_ps(b, mask, a); } +static FORCEINLINE __vec16_f __select( bool cond, __vec16_f a, __vec16_f b) { return cond ? 
a : b; } -static FORCEINLINE __vec16_i1 __equal_float(__vec16_f a, __vec16_f b) { - return _mm512_cmpeq_ps_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __equal_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { - return _mm512_mask_cmpeq_ps_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __not_equal_float(__vec16_f a, __vec16_f b) { - return _mm512_cmpneq_ps_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __not_equal_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { - return _mm512_mask_cmpneq_ps_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __less_than_float(__vec16_f a, __vec16_f b) { - return _mm512_cmplt_ps_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __less_than_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { - return _mm512_mask_cmplt_ps_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __less_equal_float(__vec16_f a, __vec16_f b) { - return _mm512_cmple_ps_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __less_equal_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { - return _mm512_mask_cmple_ps_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __greater_than_float(__vec16_f a, __vec16_f b) { -// return _mm512_cmpnle_ps_mask(a, b); - return _mm512_cmp_ps_mask(a, b,_CMP_GT_OS); -} - -static FORCEINLINE __vec16_i1 __greater_than_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { -// return _mm512_mask_cmpnle_ps_mask(m, a, b); - return _mm512_mask_cmp_ps_mask(m,a, b,_CMP_GT_OS); -} - -static FORCEINLINE __vec16_i1 __greater_equal_float(__vec16_f a, __vec16_f b) { -// return _mm512_cmpnlt_ps_mask(a, b); - return _mm512_cmp_ps_mask(a, b,_CMP_GE_OS); -} - -static FORCEINLINE __vec16_i1 __greater_equal_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { -// return _mm512_mask_cmpnlt_ps_mask(m, a, b); - return _mm512_mask_cmp_ps_mask(m,a, b,_CMP_GE_OS); -} - -static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) { - return _mm512_cmpord_ps_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) { - return _mm512_cmpunord_ps_mask(a, b); -} - -static FORCEINLINE __vec16_f __select(__vec16_i1 mask, __vec16_f a, __vec16_f b) { - return _mm512_mask_mov_ps(b, mask, a); -} - -static FORCEINLINE __vec16_f __select(bool cond, __vec16_f a, __vec16_f b) { - return cond ? 
a : b; -} - -static FORCEINLINE float __extract_element(__vec16_f v, uint32_t index) { - return v[index]; - // return ((float *)&v)[index]; -} - -static FORCEINLINE void __insert_element(__vec16_f *v, uint32_t index, float val) { - (*v)[index] = val; -// ((float *)v)[index] = val; -} +static FORCEINLINE float __extract_element(__vec16_f v, uint32_t index) { return v[index]; } +static FORCEINLINE void __insert_element(__vec16_f *v, uint32_t index, float val) { (*v)[index] = val; } template RetVecType __smear_float(float f); -template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { - return _mm512_set_1to16_ps(f); -} +template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { return _mm512_set_1to16_ps(f); } template RetVecType __setzero_float(); -template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { - return _mm512_setzero_ps(); -} +template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { return _mm512_setzero_ps(); } template RetVecType __undef_float(); -template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() { - return __vec16_f(); -} +template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() { return __vec16_f(); } -static FORCEINLINE __vec16_f __broadcast_float(__vec16_f v, int index) { - float val = __extract_element(v, index & 0xf); - return _mm512_set1_ps(val); +static FORCEINLINE __vec16_f __broadcast_float(__vec16_f _v, int index) +{ + const __vec16_i32 v = _mm512_castps_si512(_v); + return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(v, 0xFFFF, _mm512_set1_epi32(index), v)); } -#if 1 -static FORCEINLINE __vec16_f __shuffle_float(__vec16_f v, __vec16_i32 index) { - return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v))); +static FORCEINLINE __vec16_f __rotate_float(__vec16_f _v, int index) +{ + const __vec16_i32 v = _mm512_castps_si512(_v); + const __vec16_i32 idx = __smear_i32<__vec16_i32>(index); + const __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0xF)); + return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(v, 0xFFFF, shuffle, v)); +} +static FORCEINLINE __vec16_f __shuffle_float(__vec16_f v, __vec16_i32 index) +{ + return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v))); +} +static FORCEINLINE __vec16_f __shuffle2_float(__vec16_f _v0, __vec16_f _v1, __vec16_i32 index) +{ + const __vec16_i32 v0 = _mm512_castps_si512(_v0); + const __vec16_i32 v1 = _mm512_castps_si512(_v1); + const __vec16_i1 mask = __signed_less_than_i32(index, __smear_i32<__vec16_i32>(0x10)); + index = __and(index, __smear_i32<__vec16_i32>(0xF)); + __vec16_i32 ret = __undef_i32<__vec16_i32>(); + ret = _mm512_mask_permutevar_epi32(ret, mask, index, v0); + ret = _mm512_mask_permutevar_epi32(ret, __not(mask), index, v1); + return _mm512_castsi512_ps(ret); } -#endif -ROTATE(__vec16_f, float, float) -SHUFFLE2(__vec16_f, float, float) -template static FORCEINLINE __vec16_f __load(const __vec16_f *p) { +template static FORCEINLINE __vec16_f __load(const __vec16_f *p) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - return _mm512_load_ps(p); + return __load<64>(p); #else - __vec16_f v; - v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - v = _mm512_extloadunpackhi_ps(v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - return v; + __vec16_f v; + v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + v = 
_mm512_extloadunpackhi_ps(v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + return v; #endif } -template static FORCEINLINE void __store(__vec16_f *p, __vec16_f v) { +template static FORCEINLINE void __store(__vec16_f *p, __vec16_f v) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - _mm512_store_ps(p, v); + __store<64>(p,v); #else - _mm512_extpackstorelo_ps( p, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_ps((uint8_t*)p+64, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_ps( p, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_ps((uint8_t*)p+64, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); #endif } -#if 0 -template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) { - _mm512_store_ps(p, v); -} -template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) { +#if 0 /* knc::fails ./tests/gs-improve-progindex.ispc with segfault */ +template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) +{ return _mm512_load_ps(p); } +/* this one doesn't fail but it is commented out for completenes, no aligned load/stores */ +template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) +{ + _mm512_store_ps(p, v); +} #endif -#endif /* evghenii::float */ +/******** math ******/ +/*** float ***/ static FORCEINLINE float __exp_uniform_float(float v) { return expf(v);} static FORCEINLINE __vec16_f __exp_varying_float(__vec16_f v) { return _mm512_exp_ps(v); } @@ -1160,6 +1056,18 @@ static FORCEINLINE __vec16_f __log_varying_float(__vec16_f v) { return _mm512_lo static FORCEINLINE float __pow_uniform_float(float a, float b) { return powf(a, b);} static FORCEINLINE __vec16_f __pow_varying_float(__vec16_f a, __vec16_f b) { return _mm512_pow_ps(a,b); } +/*** double ***/ +static FORCEINLINE double __exp_uniform_double(double v) { return exp(v);} +static FORCEINLINE __vec16_d __exp_varying_double(__vec16_d v) { return __vec16_d(_mm512_exp_pd(v.v1),_mm512_exp_pd(v.v2)); } + +static FORCEINLINE double __log_uniform_double(double v) { return log(v);} +static FORCEINLINE __vec16_d __log_varying_double(__vec16_d v) { return __vec16_d(_mm512_log_pd(v.v1),_mm512_log_pd(v.v2)); } + +static FORCEINLINE double __pow_uniform_double(double a, double b) { return pow(a,b);} +static FORCEINLINE __vec16_d __pow_varying_double(__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_pow_pd(a.v1,b.v1),_mm512_pow_pd(a.v2,b.v2)); } + +/******** bitcast ******/ + static FORCEINLINE int __intbits(float v) { union { float f; @@ -1178,8 +1086,11 @@ static FORCEINLINE float __floatbits(int v) { return u.f; } -/* source : - * http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion */ +/////////////////////////////////////////////////////////////////////////// +// half<->float : this one passes the tests +// source : +// http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion +/////////////////////////////////////////////////////////////////////////// class Float16Compressor { union Bits @@ -1252,81 +1163,36 @@ class Float16Compressor } }; -static FORCEINLINE float __half_to_float_uniform(int16_t h) { -#if 0 - static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift - - int32_t o = ((int32_t)(h & 0x7fff)) << 13; // exponent/mantissa bits - uint32_t exp = shifted_exp & o; // just the exponent - o += (127 - 15) << 23; // exponent adjust - - // handle exponent special cases - if (exp == shifted_exp) // Inf/NaN? 
- o += (128 - 16) << 23; // extra exp adjust - else if (exp == 0) { // Zero/Denormal? - o += 1 << 23; // extra exp adjust - o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize - } - - o |= ((int32_t)(h & 0x8000)) << 16; // sign bit - return __floatbits(o); -#else +static FORCEINLINE float __half_to_float_uniform(int16_t h) +{ return Float16Compressor::decompress(h); -#endif +} +static FORCEINLINE __vec16_f __half_to_float_varying(__vec16_i16 v) +{ + __vec16_f ret; + for (int i = 0; i < 16; ++i) + ret[i] = __half_to_float_uniform(v[i]); + return ret; } -static FORCEINLINE __vec16_f __half_to_float_varying(__vec16_i16 v) { - __vec16_f ret; - for (int i = 0; i < 16; ++i) - ret[i] = __half_to_float_uniform(v[i]); - return ret; -} - - -static FORCEINLINE int16_t __float_to_half_uniform(float f) { -#if 0 - uint32_t sign_mask = 0x80000000u; - int32_t o; - - int32_t fint = __intbits(f); - int32_t sign = fint & sign_mask; - fint ^= sign; - - int32_t f32infty = 255 << 23; - o = (fint > f32infty) ? 0x7e00 : 0x7c00; - - // (De)normalized number or zero - // update fint unconditionally to save the blending; we don't need it - // anymore for the Inf/NaN case anyway. - const uint32_t round_mask = ~0xfffu; - const int32_t magic = 15 << 23; - const int32_t f16infty = 31 << 23; - - int32_t fint2 = __intbits(__floatbits(fint & round_mask) * __floatbits(magic)) - round_mask; - fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed - - if (fint < f32infty) - o = fint2 >> 13; // Take the bits! - - return (o | (sign >> 16)); -#else +static FORCEINLINE int16_t __float_to_half_uniform(float f) +{ return Float16Compressor::compress(f); -#endif } - - -static FORCEINLINE __vec16_i16 __float_to_half_varying(__vec16_f v) { - __vec16_i16 ret; - for (int i = 0; i < 16; ++i) - ret[i] = __float_to_half_uniform(v[i]); - return ret; +static FORCEINLINE __vec16_i16 __float_to_half_varying(__vec16_f v) +{ + __vec16_i16 ret; + for (int i = 0; i < 16; ++i) + ret[i] = __float_to_half_uniform(v[i]); + return ret; } #if 0 /* evghenii::double */ /////////////////////////////////////////////////////////////////////////// // double +/////////////////////////////////////////////////////////////////////////// BINARY_OP(__vec16_d, __add, +) BINARY_OP(__vec16_d, __sub, -) From 8b0fc558cb88a1675f903058a1695b70b60efefe Mon Sep 17 00:00:00 2001 From: evghenii Date: Sat, 5 Oct 2013 14:15:33 +0300 Subject: [PATCH 120/124] complete cleaning --- examples/intrinsics/knc-i1x16.h | 1322 ++++++++++--------------------- 1 file changed, 438 insertions(+), 884 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 87f54dfa..e712c969 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -1,5 +1,5 @@ /** - Copyright (c) 2010-2012, Intel Corporation + Copyright (c) 2010-2013, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -265,6 +265,7 @@ static inline int32_t __extract_element(__vec16_i32, int); /////////////////////////////////////////////////////////////////////////// // macros... 
+/* knc::macro::not used */ #define UNARY_OP(TYPE, NAME, OP) \ static FORCEINLINE TYPE NAME(TYPE v) { \ TYPE ret; \ @@ -273,6 +274,7 @@ static FORCEINLINE TYPE NAME(TYPE v) { \ return ret; \ } +/* knc::macro::used */ #define BINARY_OP(TYPE, NAME, OP) \ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ TYPE ret; \ @@ -281,6 +283,7 @@ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ return ret; \ } +/* knc::macro::used */ #define BINARY_OP_CAST(TYPE, CAST, NAME, OP) \ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ TYPE ret; \ @@ -289,6 +292,7 @@ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ return ret; \ } +/* knc::macro::used */ #define BINARY_OP_FUNC(TYPE, NAME, FUNC) \ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ TYPE ret; \ @@ -297,6 +301,7 @@ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ return ret; \ } +/* knc::macro::used */ #define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP) \ static FORCEINLINE __vec16_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \ __vec16_i1 ret; \ @@ -315,6 +320,7 @@ static FORCEINLINE __vec16_i1 NAME##_##SUFFIX##_and_mask(TYPE a, TYPE b, \ return ret; \ } +/* knc::macro::used */ #define INSERT_EXTRACT(VTYPE, STYPE) \ static FORCEINLINE STYPE __extract_element(VTYPE v, int index) { \ return ((STYPE *)&v)[index]; \ @@ -323,6 +329,7 @@ static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \ ((STYPE *)v)[index] = val; \ } +/* knc::macro::used */ #define LOAD_STORE(VTYPE, STYPE) \ template \ static FORCEINLINE VTYPE __load(const VTYPE *p) { \ @@ -339,24 +346,7 @@ static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \ ptr[i] = v[i]; \ } -#define LOADS(VTYPE, STYPE) \ -template \ -static FORCEINLINE VTYPE __load(const VTYPE *p) { \ - STYPE *ptr = (STYPE *)p; \ - VTYPE ret; \ - for (int i = 0; i < 16; ++i) \ - ret[i] = ptr[i]; \ - return ret; \ -} \ - -#define STORES(VTYPE, STYPE) \ -template \ -static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \ - STYPE *ptr = (STYPE *)p; \ - for (int i = 0; i < 16; ++i) \ - ptr[i] = v[i]; \ -} - +/* knc::macro::used */ #define REDUCE_ADD(TYPE, VTYPE, NAME) \ static FORCEINLINE TYPE NAME(VTYPE v) { \ TYPE ret = v[0]; \ @@ -365,6 +355,7 @@ static FORCEINLINE TYPE NAME(VTYPE v) { \ return ret; \ } +/* knc::macro::used */ #define REDUCE_MINMAX(TYPE, VTYPE, NAME, OP) \ static FORCEINLINE TYPE NAME(VTYPE v) { \ TYPE ret = v[0]; \ @@ -373,6 +364,7 @@ static FORCEINLINE TYPE NAME(VTYPE v) { \ return ret; \ } +/* knc::macro::used */ #define SELECT(TYPE) \ static FORCEINLINE TYPE __select(__vec16_i1 mask, TYPE a, TYPE b) { \ TYPE ret; \ @@ -384,6 +376,7 @@ static FORCEINLINE TYPE __select(bool cond, TYPE a, TYPE b) { \ return cond ? 
a : b; \ } +/* knc::macro::used */ #define SHIFT_UNIFORM(TYPE, CAST, NAME, OP) \ static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \ TYPE ret; \ @@ -392,6 +385,7 @@ static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \ return ret; \ } +/* knc::macro::used */ #define SMEAR(VTYPE, NAME, STYPE) \ template VTYPE __smear_##NAME(STYPE); \ template <> FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \ @@ -401,6 +395,7 @@ template <> FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \ return ret; \ } +/* knc::macro::used */ #define SETZERO(VTYPE, NAME) \ template VTYPE __setzero_##NAME(); \ template <> FORCEINLINE VTYPE __setzero_##NAME() { \ @@ -410,12 +405,14 @@ template <> FORCEINLINE VTYPE __setzero_##NAME() { \ return ret; \ } +/* knc::macro::used */ #define UNDEF(VTYPE, NAME) \ template VTYPE __undef_##NAME(); \ template <> FORCEINLINE VTYPE __undef_##NAME() { \ return VTYPE(); \ } +/* knc::macro::used */ #define BROADCAST(VTYPE, NAME, STYPE) \ static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \ VTYPE ret; \ @@ -424,6 +421,7 @@ static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \ return ret; \ } \ +/* knc::macro::used */ #define ROTATE(VTYPE, NAME, STYPE) \ static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) { \ VTYPE ret; \ @@ -432,6 +430,7 @@ static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) { \ return ret; \ } \ +/* knc::macro::used */ #define SHUFFLES(VTYPE, NAME, STYPE) \ static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec16_i32 index) { \ VTYPE ret; \ @@ -448,16 +447,6 @@ static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec16_i32 index return ret; \ } -#define SHUFFLE2(VTYPE, NAME, STYPE) \ -static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec16_i32 index) { \ - VTYPE ret; \ - for (int i = 0; i < 16; ++i) { \ - int ii = __extract_element(index, i) & 0x1f; \ - ret[i] = (ii < 16) ? 
v0[ii] : v1[ii-16]; \ - } \ - return ret; \ -} - /////////////////////////////////////////////////////////////////////////// INSERT_EXTRACT(__vec1_i8, int8_t) @@ -724,9 +713,9 @@ static FORCEINLINE __vec16_i64 __add(__vec16_i64 a, __vec16_i64 b) static FORCEINLINE __vec16_i64 __sub(__vec16_i64 _a, __vec16_i64 _b) { - // this intrinsic doesn't exist :S - // return __vec16_i64(_mm512_sub_epi64(_a.v1, _b.v1), _mm512_sub_epi64(_a.v2,_b.v2)); - // use knc.h implementation +#if __ICC >= 99999 /* compiler gate, icc >= 99999 will hopefully support _mm512_sub_epi64 */ + return __vec16_i64(_mm512_sub_epi64(_a.v1, _b.v1), _mm512_sub_epi64(_a.v2,_b.v2)); +#else const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); __vec16_i64 ret; @@ -734,6 +723,7 @@ static FORCEINLINE __vec16_i64 __sub(__vec16_i64 _a, __vec16_i64 _b) ret.v_lo = _mm512_subsetb_epi32(a.v_lo, b.v_lo, &borrow); ret.v_hi = _mm512_sbb_epi32 (a.v_hi, borrow, b.v_hi, &borrow); return ret.cvt2zmm(); +#endif } static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &_b) @@ -744,11 +734,15 @@ static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &_b _mm512_mulhi_epi32(a.v, b.v_lo))).cvt2zmm(); } +#if __ICC >= 1400 /* compiler gate, icc >= 14.0.0 support _mm512_mullox_epi64 */ +static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b) +{ + return __vec16_i64(_mm512_mullox_epi64(a.v1,b.v1), _mm512_mullox_epi64(a.v2,b.v2)); +} +#else /* __ICC >= 1400 */ +#if 0 /* knc::fails ./tests/int64-min-1.ispc ./tests/idiv.ispc */ static FORCEINLINE __vec16_i64 __mul(__vec16_i64 _a, __vec16_i64 _b) { -#if __ICC >= 1400 - return __vec16_i64(_mm512_mullox_epi64(_a.v1,_b.v1), _mm512_mullox_epi64(_a.v2,_b.v2)); -#else const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); __vec16_i32 lo = _mm512_mullo_epi32(a.v_lo,b.v_lo); @@ -759,8 +753,11 @@ static FORCEINLINE __vec16_i64 __mul(__vec16_i64 _a, __vec16_i64 _b) __vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m1, &carry); __vec16_i32 hi = _mm512_adc_epi32(hi_m3, carry, hi_p23, &carry); return __vec16_i64(hi,lo).cvt2zmm(); -#endif } +#else +BINARY_OP(__vec16_i64, __mul, *) +#endif +#endif /* __ICC >= 1400 */ static FORCEINLINE __vec16_i64 __or (__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_or_epi64 (a.v1, b.v1), _mm512_or_epi64 (a.v2, b.v2)); } static FORCEINLINE __vec16_i64 __and(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_and_epi64(a.v1, b.v1), _mm512_and_epi64(a.v2, b.v2)); } @@ -782,9 +779,7 @@ static FORCEINLINE __vec16_i64 __sdiv(__vec16_i64 a, __vec16_i64 b) { return __v static FORCEINLINE __vec16_i64 __urem(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_rem_epu64(a.v1,b.v1), _mm512_rem_epu64(a.v2,b.v2)); } static FORCEINLINE __vec16_i64 __srem(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_rem_epi64(a.v1,b.v1), _mm512_rem_epi64(a.v2,b.v2)); } -#if 1 -BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>) -#else /* knc::fails ./tests/idiv.ispc */ +#if 0 /* knc::fails ./tests/idiv.ispc */ static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 _a, __vec16_i64 _b) { const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); @@ -800,12 +795,11 @@ static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 _a, __vec16_i64 _b) { __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo)); return __vec16_i64(hi,lo).cvt2zmm(); } - +#else +BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>) #endif -#if 1 -BINARY_OP_CAST(__vec16_i64, int64_t, __ashr, >>) -#else /* 
knc::fails ./tests/idiv.ispc */ +#if 0 /* knc::fails ./tests/idiv.ispc */ static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 _a, __vec16_i64 _b) { const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); @@ -816,16 +810,15 @@ static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 _a, __vec16_i64 _b) { __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo)); return __vec16_i64(hi,lo).cvt2zmm(); } +#else +BINARY_OP_CAST(__vec16_i64, int64_t, __ashr, >>) #endif SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>) SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>) SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<) -#if 1 -CMP_OP(__vec16_i64, i64, int64_t, __equal, ==) -CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) -#else /* knc::fails ./tests/reduce-equal-8.ispc , knc::hangs foreach-unique-6.ispc funcptr-null-[2-6].ispc funcptr-uniform-9.ispc funcptr-varying-5.ispc */ +#if 0 /* knc::fails ./tests/reduce-equal-8.ispc , knc::hangs foreach-unique-6.ispc funcptr-null-[2-6].ispc funcptr-uniform-9.ispc funcptr-varying-5.ispc */ static FORCEINLINE __vec16_i1 __equal_i64(__vec16_i64 _a, __vec16_i64 _b) { const __vec16_i64 a = _a.cvt2hilo(); @@ -850,6 +843,9 @@ static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(__vec16_i64 a, __vec16_i6 { return __and(__not(__equal_i64(a,b)), mask); } +#else +CMP_OP(__vec16_i64, i64, int64_t, __equal, ==) +CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) #endif @@ -1037,7 +1033,7 @@ template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) { return _mm512_load_ps(p); } -/* this one doesn't fail but it is commented out for completenes, no aligned load/stores */ +/* this one doesn't fail but it is commented out for completeness, no aligned load/stores */ template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) { _mm512_store_ps(p, v); @@ -1189,303 +1185,110 @@ static FORCEINLINE __vec16_i16 __float_to_half_varying(__vec16_f v) } -#if 0 /* evghenii::double */ /////////////////////////////////////////////////////////////////////////// // double /////////////////////////////////////////////////////////////////////////// -BINARY_OP(__vec16_d, __add, +) -BINARY_OP(__vec16_d, __sub, -) -BINARY_OP(__vec16_d, __mul, *) -BINARY_OP(__vec16_d, __div, /) +#define VECOP(OP) __vec16_d(_mm512_##OP(a.v1,b.v1),_mm512_##OP(a.v2,b.v2)) +static FORCEINLINE __vec16_d __add(__vec16_d a, __vec16_d b) { return VECOP(add_pd); } +static FORCEINLINE __vec16_d __sub(__vec16_d a, __vec16_d b) { return VECOP(sub_pd); } +static FORCEINLINE __vec16_d __mul(__vec16_d a, __vec16_d b) { return VECOP(mul_pd); } +static FORCEINLINE __vec16_d __div(__vec16_d a, __vec16_d b) { return VECOP(div_pd); } +#undef VECOP -CMP_OP(__vec16_d, double, double, __equal, ==) -CMP_OP(__vec16_d, double, double, __not_equal, !=) -CMP_OP(__vec16_d, double, double, __less_than, <) -CMP_OP(__vec16_d, double, double, __less_equal, <=) -CMP_OP(__vec16_d, double, double, __greater_than, >) -CMP_OP(__vec16_d, double, double, __greater_equal, >=) +#define CMPOP(OP) _mm512_kmovlhb(_mm512_##OP(a.v1,b.v1),_mm512_##OP(a.v2,b.v2)) +static FORCEINLINE __vec16_i1 __equal_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpeq_pd_mask); } +static FORCEINLINE __vec16_i1 __not_equal_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpneq_pd_mask); } +static FORCEINLINE __vec16_i1 __less_than_double (__vec16_d a, __vec16_d b) { return CMPOP(cmplt_pd_mask); } +static FORCEINLINE __vec16_i1 __less_equal_double (__vec16_d a, __vec16_d b) { return CMPOP(cmple_pd_mask); } +static 
FORCEINLINE __vec16_i1 __greater_than_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpnle_pd_mask); } +static FORCEINLINE __vec16_i1 __greater_equal_double(__vec16_d a, __vec16_d b) { return CMPOP(cmpnlt_pd_mask); } +static FORCEINLINE __vec16_i1 __ordered_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpord_pd_mask); } +static FORCEINLINE __vec16_i1 __unordered_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpunord_pd_mask); } +#undef CMPOP -static FORCEINLINE __vec16_i1 __ordered_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret; - ret.v = 0; - for (int i = 0; i < 16; ++i) - ret.v |= ((a[i] == a[i]) && (b[i] == b[i])) ? (1 << i) : 0; - return ret; +#define CMPOPMASK(OP) _mm512_kmovlhb(_mm512_mask_##OP(m,a.v1,b.v1),_mm512_mask_##OP(_mm512_kswapb(m,m),a.v2,b.v2)) +static FORCEINLINE __vec16_i1 __equal_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmpeq_pd_mask); } +static FORCEINLINE __vec16_i1 __not_equal_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmpneq_pd_mask); } +static FORCEINLINE __vec16_i1 __less_than_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmplt_pd_mask); } +static FORCEINLINE __vec16_i1 __less_equal_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmple_pd_mask); } +static FORCEINLINE __vec16_i1 __greater_than_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmpnle_pd_mask); } +static FORCEINLINE __vec16_i1 __greater_equal_double_and_mask(__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmpnlt_pd_mask); } +#undef CMOPMASK + + +static FORCEINLINE __vec16_d __select(__vec16_i1 m, __vec16_d a, __vec16_d b) +{ + return __vec16_d(_mm512_mask_mov_pd(b.v1, m, a.v1), _mm512_mask_mov_pd(b.v2, _mm512_kswapb(m, m), a.v2)); } - -static FORCEINLINE __vec16_i1 __unordered_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret; - ret.v = 0; - for (int i = 0; i < 16; ++i) - ret.v |= ((a[i] != a[i]) || (b[i] != b[i])) ? 
(1 << i) : 0; - return ret; -} - -#if 0 - case Instruction::FRem: intrinsic = "__frem"; break; -#endif - -SELECT(__vec16_d) -INSERT_EXTRACT(__vec16_d, double) -SMEAR(__vec16_d, double, double) -SETZERO(__vec16_d, double) -UNDEF(__vec16_d, double) -BROADCAST(__vec16_d, double, double) -ROTATE(__vec16_d, double, double) -SHUFFLES(__vec16_d, double, double) -LOAD_STORE(__vec16_d, double) -#else /* evghenii::double */ -/////////////////////////////////////////////////////////////////////////// -// double -/////////////////////////////////////////////////////////////////////////// - -static FORCEINLINE __vec16_d __add(__vec16_d a, __vec16_d b) { - __vec16_d ret; - ret.v1 = _mm512_add_pd(a.v1, b.v1); - ret.v2 = _mm512_add_pd(a.v2, b.v2); - return ret; -} - -static FORCEINLINE __vec16_d __sub(__vec16_d a, __vec16_d b) { - __vec16_d ret; - ret.v1 = _mm512_sub_pd(a.v1, b.v1); - ret.v2 = _mm512_sub_pd(a.v2, b.v2); - return ret; -} - -static FORCEINLINE __vec16_d __mul(__vec16_d a, __vec16_d b) { - __vec16_d ret; - ret.v1 = _mm512_mul_pd(a.v1, b.v1); - ret.v2 = _mm512_mul_pd(a.v2, b.v2); - return ret; -} - -static FORCEINLINE __vec16_d __div(__vec16_d a, __vec16_d b) { - __vec16_d ret; - ret.v1 = _mm512_div_pd(a.v1, b.v1); - ret.v2 = _mm512_div_pd(a.v2, b.v2); - return ret; -} - -static FORCEINLINE __vec16_i1 __equal_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpeq_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpeq_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __equal_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_mask_cmpeq_pd_mask(m, a.v1, b.v1); - __vec16_i1 tmp_m = m; - ret2 = _mm512_mask_cmpeq_pd_mask(_mm512_kswapb(tmp_m,tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __not_equal_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpneq_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpneq_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __not_equal_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - __vec16_i1 tmp_m = m; - ret1 = _mm512_mask_cmpneq_pd_mask(m, a.v1, b.v1); - ret2 = _mm512_mask_cmpneq_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __less_than_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmplt_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmplt_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __less_than_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - __vec16_i1 tmp_m = m; - ret1 = _mm512_mask_cmplt_pd_mask(m, a.v1, b.v1); - ret2 = _mm512_mask_cmplt_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __less_equal_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmple_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmple_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __less_equal_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - __vec16_i1 tmp_m = m; - ret1 = _mm512_mask_cmple_pd_mask(m, a.v1, b.v1); - ret2 = _mm512_mask_cmple_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, 
b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __greater_than_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpnle_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpnle_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __greater_than_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - __vec16_i1 tmp_m = m; - ret1 = _mm512_mask_cmpnle_pd_mask(m, a.v1, b.v1); - ret2 = _mm512_mask_cmpnle_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __greater_equal_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpnlt_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpnlt_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __greater_equal_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - __vec16_i1 tmp_m = m; - ret1 = _mm512_mask_cmpnlt_pd_mask(m, a.v1, b.v1); - ret2 = _mm512_mask_cmpnlt_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __ordered_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpord_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpord_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __unordered_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpunord_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpunord_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_d __select(__vec16_i1 mask, __vec16_d a, __vec16_d b) { - __vec16_d ret; - __vec16_i1 tmp_m = mask; - ret.v1 = _mm512_mask_mov_pd(b.v1, mask, a.v1); - ret.v2 = _mm512_mask_mov_pd(b.v2, _mm512_kswapb(tmp_m, tmp_m), a.v2); - return ret; -} - - -static FORCEINLINE __vec16_d __select(bool cond, __vec16_d a, __vec16_d b) { +static FORCEINLINE __vec16_d __select(bool cond, __vec16_d a, __vec16_d b) +{ return cond ? 
a : b; } -static FORCEINLINE double __extract_element(__vec16_d v, uint32_t index) { - return ((double *)&v)[index]; -} - -static FORCEINLINE void __insert_element(__vec16_d *v, uint32_t index, double val) { - ((double *)v)[index] = val; -} +static FORCEINLINE double __extract_element(__vec16_d v, uint32_t index) { return v[index]; } +static FORCEINLINE void __insert_element(__vec16_d *v, uint32_t index, double val) { (*v)[index] = val; } template RetVecType __smear_double(double d); -template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { - __vec16_d ret; - ret.v1 = _mm512_set1_pd(d); - ret.v2 = _mm512_set1_pd(d); - return ret; -} +template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { return __vec16_d(_mm512_set1_pd(d), _mm512_set1_pd(d)); } template RetVecType __setzero_double(); -template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { - __vec16_d ret; - ret.v1 = _mm512_setzero_pd(); - ret.v2 = _mm512_setzero_pd(); - return ret; -} +template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { return __vec16_d(_mm512_setzero_pd(), _mm512_setzero_pd()); } template RetVecType __undef_double(); -template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() { - return __vec16_d(); -} +template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() { return __vec16_d(); } -static FORCEINLINE __vec16_d __broadcast_double(__vec16_d v, int index) { - __vec16_d ret; - double val = __extract_element(v, index & 0xf); - ret.v1 = _mm512_set1_pd(val); - ret.v2 = _mm512_set1_pd(val); - return ret; +static FORCEINLINE __vec16_d __broadcast_double(__vec16_d v, int index) +{ + __vec16_d ret; + double val = __extract_element(v, index & 0xf); + ret.v1 = _mm512_set1_pd(val); + ret.v2 = _mm512_set1_pd(val); + return ret; } ROTATE(__vec16_d, double, double) SHUFFLES(__vec16_d, double, double) -template static FORCEINLINE __vec16_d __load(const __vec16_d *p) { - __vec16_d ret; - ret.v1 = _mm512_extloadunpacklo_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - ret.v1 = _mm512_extloadunpackhi_pd(ret.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - ret.v2 = _mm512_extloadunpacklo_pd(ret.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - ret.v2 = _mm512_extloadunpackhi_pd(ret.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - return ret; +template static FORCEINLINE __vec16_d __load(const __vec16_d *p) \ +{ + __vec16_d ret; + ret.v1 = _mm512_extloadunpacklo_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v1 = _mm512_extloadunpackhi_pd(ret.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v2 = _mm512_extloadunpacklo_pd(ret.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v2 = _mm512_extloadunpackhi_pd(ret.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + return ret; } -template static FORCEINLINE void __store(__vec16_d *p, __vec16_d v) { - _mm512_extpackstorelo_pd(p, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_pd((uint8_t*)p+64, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorelo_pd((uint8_t*)p+64, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_pd((uint8_t*)p+128, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); +template static FORCEINLINE void __store(__vec16_d *p, __vec16_d v) +{ + _mm512_extpackstorelo_pd(p, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_pd((uint8_t*)p+64, v.v2, 
_MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+128, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); } -#if 0 -template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) { - __vec16_d ret; - ret.v1 = _mm512_load_pd(p); - ret.v2 = _mm512_load_pd(((uint8_t*)p)+64); - return ret; +#if 1 +template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) +{ + return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64)); } -template <> static FORCEINLINE __vec16_d __load<128>(const __vec16_d *p) { - return __load<64>(p); -} -template <> static FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) { - _mm512_store_pd(p, v.v1); - _mm512_store_pd(((uint8_t*)p)+64, v.v2); -} -template <> static FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { - __store<64>(p, v); +template <> static FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) +{ + _mm512_store_pd(p, v.v1); + _mm512_store_pd(((uint8_t*)p)+64, v.v2); } +template <> static FORCEINLINE __vec16_d __load <128>(const __vec16_d *p) { return __load<64>(p); } +template <> static FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { __store<64>(p, v); } #endif -#endif /* evghenii::double */ /////////////////////////////////////////////////////////////////////////// // casts +/////////////////////////////////////////////////////////////////////////// +/* knc::macro::used */ #define CAST(TO, STO, FROM, SFROM, FUNC) \ static FORCEINLINE TO FUNC(TO, FROM val) { \ TO ret; \ @@ -1495,13 +1298,13 @@ static FORCEINLINE TO FUNC(TO, FROM val) { \ } // sign extension conversions -#if 1 -CAST(__vec16_i64, int64_t, __vec16_i32, int32_t, __cast_sext) -#else /* evghenii::fails on soa-9 soa-13 soa-10 soa-29 soa-3 ... and others */ +#if 0 /* knc::fails on soa-9 soa-13 soa-10 soa-29 soa-3 ... 
and others */ static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val) { return __vec16_i64(_mm512_srai_epi32(val.v,31), val.v).cvt2zmm(); } +#else +CAST(__vec16_i64, int64_t, __vec16_i32, int32_t, __cast_sext) #endif CAST(__vec16_i64, int64_t, __vec16_i16, int16_t, __cast_sext) CAST(__vec16_i64, int64_t, __vec16_i8, int8_t, __cast_sext) @@ -1509,6 +1312,7 @@ CAST(__vec16_i32, int32_t, __vec16_i16, int16_t, __cast_sext) CAST(__vec16_i32, int32_t, __vec16_i8, int8_t, __cast_sext) CAST(__vec16_i16, int16_t, __vec16_i8, int8_t, __cast_sext) +/* knc::macro::used */ #define CAST_SEXT_I1(TYPE) \ static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) { \ TYPE ret; \ @@ -1522,34 +1326,31 @@ static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) { \ CAST_SEXT_I1(__vec16_i8) CAST_SEXT_I1(__vec16_i16) -#if 0 -CAST_SEXT_I1(__vec16_i32) -#else + +//CAST_SEXT_I1(__vec16_i32) static FORCEINLINE __vec16_i32 __cast_sext(const __vec16_i32 &, const __vec16_i1 &val) { __vec16_i32 ret = _mm512_setzero_epi32(); __vec16_i32 one = _mm512_set1_epi32(-1); return _mm512_mask_mov_epi32(ret, val, one); } -#endif + CAST_SEXT_I1(__vec16_i64) // zero extension -#if 0 -CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext) -#else +// CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext) static FORCEINLINE __vec16_i64 __cast_zext(const __vec16_i64 &, const __vec16_i32 &val) { return __vec16_i64(_mm512_setzero_epi32(), val.v).cvt2zmm(); } -#endif CAST(__vec16_i64, uint64_t, __vec16_i16, uint16_t, __cast_zext) CAST(__vec16_i64, uint64_t, __vec16_i8, uint8_t, __cast_zext) CAST(__vec16_i32, uint32_t, __vec16_i16, uint16_t, __cast_zext) CAST(__vec16_i32, uint32_t, __vec16_i8, uint8_t, __cast_zext) CAST(__vec16_i16, uint16_t, __vec16_i8, uint8_t, __cast_zext) +/* knc::macro::used */ #define CAST_ZEXT_I1(TYPE) \ static FORCEINLINE TYPE __cast_zext(TYPE, __vec16_i1 v) { \ TYPE ret; \ @@ -1560,16 +1361,15 @@ static FORCEINLINE TYPE __cast_zext(TYPE, __vec16_i1 v) { \ CAST_ZEXT_I1(__vec16_i8) CAST_ZEXT_I1(__vec16_i16) -#if 0 -CAST_ZEXT_I1(__vec16_i32) -#else + +//CAST_ZEXT_I1(__vec16_i32) static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1 &val) { - __vec16_i32 ret = _mm512_setzero_epi32(); - __vec16_i32 one = _mm512_set1_epi32(1); - return _mm512_mask_mov_epi32(ret, val, one); + __vec16_i32 ret = _mm512_setzero_epi32(); + __vec16_i32 one = _mm512_set1_epi32(1); + return _mm512_mask_mov_epi32(ret, val, one); } -#endif + CAST_ZEXT_I1(__vec16_i64) // truncations @@ -1581,170 +1381,160 @@ CAST(__vec16_i8, int8_t, __vec16_i32, int32_t, __cast_trunc) CAST(__vec16_i8, int8_t, __vec16_i16, int16_t, __cast_trunc) // signed int to float/double -#if 0 -CAST(__vec16_f, float, __vec16_i8, int8_t, __cast_sitofp) -CAST(__vec16_f, float, __vec16_i16, int16_t, __cast_sitofp) -CAST(__vec16_f, float, __vec16_i32, int32_t, __cast_sitofp) -#else + +//CAST(__vec16_f, float, __vec16_i8, int8_t, __cast_sitofp) static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i8 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +//CAST(__vec16_f, float, __vec16_i16, int16_t, __cast_sitofp) static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i16 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +//CAST(__vec16_f, float, __vec16_i32, int32_t, __cast_sitofp) static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i32 val) {return _mm512_cvtfxpnt_round_adjustepi32_ps(val, 
_MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} -#endif + CAST(__vec16_f, float, __vec16_i64, int64_t, __cast_sitofp) -#if 0 -CAST(__vec16_d, double, __vec16_i8, int8_t, __cast_sitofp) -CAST(__vec16_d, double, __vec16_i16, int16_t, __cast_sitofp) -CAST(__vec16_d, double, __vec16_i32, int32_t, __cast_sitofp) -#else -static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i8 val) { - __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); - __vec16_d ret; - ret.v1 = _mm512_cvtepi32lo_pd(vi); - __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); - ret.v2 = _mm512_cvtepi32lo_pd(other8); - return ret; + +//CAST(__vec16_d, double, __vec16_i8, int8_t, __cast_sitofp) +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i8 val) +{ + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; } -static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i16 val) { - __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); - __vec16_d ret; - ret.v1 = _mm512_cvtepi32lo_pd(vi); - __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); - ret.v2 = _mm512_cvtepi32lo_pd(other8); - return ret; +// CAST(__vec16_d, double, __vec16_i16, int16_t, __cast_sitofp) +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i16 val) +{ + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; } -static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i32 val) { - __vec16_d ret; - ret.v1 = _mm512_cvtepi32lo_pd(val); - __vec16_i32 other8 = _mm512_permute4f128_epi32(val, _MM_PERM_DCDC); - ret.v2 = _mm512_cvtepi32lo_pd(other8); - return ret; +// CAST(__vec16_d, double, __vec16_i32, int32_t, __cast_sitofp) +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i32 val) +{ + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(val); + __vec16_i32 other8 = _mm512_permute4f128_epi32(val, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; } -#endif + CAST(__vec16_d, double, __vec16_i64, int64_t, __cast_sitofp) // unsigned int to float/double -#if 0 -CAST(__vec16_f, float, __vec16_i8, uint8_t, __cast_uitofp) -CAST(__vec16_f, float, __vec16_i16, uint16_t, __cast_uitofp) -CAST(__vec16_f, float, __vec16_i32, uint32_t, __cast_uitofp) -#else + +// CAST(__vec16_f, float, __vec16_i8, uint8_t, __cast_uitofp) static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i8 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +//CAST(__vec16_f, float, __vec16_i16, uint16_t, __cast_uitofp) static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i16 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +//CAST(__vec16_f, float, __vec16_i32, uint32_t, __cast_uitofp) static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i32 val) {return _mm512_cvtfxpnt_round_adjustepu32_ps(val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} -#endif + CAST(__vec16_f, float, __vec16_i64, uint64_t, __cast_uitofp) -#if 0 -CAST(__vec16_d, double, __vec16_i8, 
uint8_t, __cast_uitofp) -CAST(__vec16_d, double, __vec16_i16, uint16_t, __cast_uitofp) -CAST(__vec16_d, double, __vec16_i32, uint32_t, __cast_uitofp) -#else -static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i8 val) { - __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); - __vec16_d ret; - ret.v1 = _mm512_cvtepu32lo_pd(vi); - __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); - ret.v2 = _mm512_cvtepu32lo_pd(other8); - return ret; + +// CAST(__vec16_d, double, __vec16_i8, uint8_t, __cast_uitofp) +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i8 val) +{ + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; } -static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i16 val) { - __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); - __vec16_d ret; - ret.v1 = _mm512_cvtepu32lo_pd(vi); - __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); - ret.v2 = _mm512_cvtepu32lo_pd(other8); - return ret; +// CAST(__vec16_d, double, __vec16_i16, uint16_t, __cast_uitofp) +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i16 val) +{ + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; } -static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i32 val) { - __vec16_d ret; - ret.v1 = _mm512_cvtepu32lo_pd(val); - __vec16_i32 other8 = _mm512_permute4f128_epi32(val, _MM_PERM_DCDC); - ret.v2 = _mm512_cvtepu32lo_pd(other8); - return ret; +// CAST(__vec16_d, double, __vec16_i32, uint32_t, __cast_uitofp) +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i32 val) +{ + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(val); + __vec16_i32 other8 = _mm512_permute4f128_epi32(val, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; } -#endif + CAST(__vec16_d, double, __vec16_i64, uint64_t, __cast_uitofp) -#if 0 -static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i1 v) { - __vec16_f ret; - for (int i = 0; i < 16; ++i) - ret[i] = (v.v & (1 << i)) ? 1. 
: 0.; - return ret; -} -#else static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i1 v) { - const __m512 ret = _mm512_setzero_ps(); - const __m512 one = _mm512_set1_ps(1.0); - return _mm512_mask_mov_ps(ret, v, one); + const __m512 ret = _mm512_setzero_ps(); + const __m512 one = _mm512_set1_ps(1.0); + return _mm512_mask_mov_ps(ret, v, one); } -#endif // float/double to signed int CAST(__vec16_i8, int8_t, __vec16_f, float, __cast_fptosi) CAST(__vec16_i16, int16_t, __vec16_f, float, __cast_fptosi) -#if 0 -CAST(__vec16_i32, int32_t, __vec16_f, float, __cast_fptosi) -#else -static FORCEINLINE __vec16_i32 __cast_fptosi(__vec16_i32, __vec16_f val) { + +// CAST(__vec16_i32, int32_t, __vec16_f, float, __cast_fptosi) +static FORCEINLINE __vec16_i32 __cast_fptosi(__vec16_i32, __vec16_f val) +{ return _mm512_cvtfxpnt_round_adjustps_epi32(val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); } -#endif + CAST(__vec16_i64, int64_t, __vec16_f, float, __cast_fptosi) CAST(__vec16_i8, int8_t, __vec16_d, double, __cast_fptosi) CAST(__vec16_i16, int16_t, __vec16_d, double, __cast_fptosi) -#if 1 -CAST(__vec16_i32, int32_t, __vec16_d, double, __cast_fptosi) +#if 0 /* knc::2implement */ #else +CAST(__vec16_i32, int32_t, __vec16_d, double, __cast_fptosi) #endif CAST(__vec16_i64, int64_t, __vec16_d, double, __cast_fptosi) // float/double to unsigned int CAST(__vec16_i8, uint8_t, __vec16_f, float, __cast_fptoui) CAST(__vec16_i16, uint16_t, __vec16_f, float, __cast_fptoui) -#if 0 -CAST(__vec16_i32, uint32_t, __vec16_f, float, __cast_fptoui) -#else -static FORCEINLINE __vec16_i32 __cast_fptoui(__vec16_i32, __vec16_f val) { + +// CAST(__vec16_i32, uint32_t, __vec16_f, float, __cast_fptoui) +static FORCEINLINE __vec16_i32 __cast_fptoui(__vec16_i32, __vec16_f val) +{ return _mm512_cvtfxpnt_round_adjustps_epu32(val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); } -#endif + CAST(__vec16_i64, uint64_t, __vec16_f, float, __cast_fptoui) CAST(__vec16_i8, uint8_t, __vec16_d, double, __cast_fptoui) CAST(__vec16_i16, uint16_t, __vec16_d, double, __cast_fptoui) -#if 1 -CAST(__vec16_i32, uint32_t, __vec16_d, double, __cast_fptoui) +#if 0 /* knc::2implement */ #else +CAST(__vec16_i32, uint32_t, __vec16_d, double, __cast_fptoui) #endif CAST(__vec16_i64, uint64_t, __vec16_d, double, __cast_fptoui) // float/double conversions -#if 0 -CAST(__vec16_f, float, __vec16_d, double, __cast_fptrunc) -#else -static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) { - __m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1)); - __m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2)); - return _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(r0i, 0xFF00, r1i, _MM_PERM_BABA)); -} -#endif +// CAST(__vec16_f, float, __vec16_d, double, __cast_fptrunc) +static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) +{ + __m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1)); + __m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2)); -#if 0 -CAST(__vec16_d, double, __vec16_f, float, __cast_fpext) -#else -static FORCEINLINE __vec16_d __cast_fpext(__vec16_d, __vec16_f val) { - __vec16_d ret; - ret.v1 = _mm512_cvtpslo_pd(val.v); - __vec16_f other8 = _mm512_castsi512_ps(_mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC)); - ret.v2 = _mm512_cvtpslo_pd(other8); - return ret; + return _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(r0i, 0xFF00, r1i, _MM_PERM_BABA)); +} + +// CAST(__vec16_d, double, __vec16_f, float, __cast_fpext) +static FORCEINLINE __vec16_d __cast_fpext(__vec16_d, 
__vec16_f val) +{ + __vec16_d ret; + ret.v1 = _mm512_cvtpslo_pd(val.v); + __vec16_f other8 = _mm512_castsi512_ps(_mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC)); + ret.v2 = _mm512_cvtpslo_pd(other8); + return ret; } -#endif typedef union { int32_t i32; @@ -1753,6 +1543,7 @@ typedef union { double d; } BitcastUnion; +/* knc::macro::not used */ #define CAST_BITS(TO, TO_ELT, FROM, FROM_ELT) \ static FORCEINLINE TO __cast_bits(TO, FROM val) { \ TO r; \ @@ -1764,30 +1555,17 @@ static FORCEINLINE TO __cast_bits(TO, FROM val) { \ return r; \ } -#if 0 -CAST_BITS(__vec16_f, f, __vec16_i32, i32) -CAST_BITS(__vec16_i32, i32, __vec16_f, f) -#else -static FORCEINLINE __vec16_f __cast_bits(__vec16_f, __vec16_i32 val) { - return _mm512_castsi512_ps(val); -} -static FORCEINLINE __vec16_i32 __cast_bits(__vec16_i32, __vec16_f val) { - return _mm512_castps_si512(val); -} -#endif +// CAST_BITS(__vec16_f, f, __vec16_i32, i32) +static FORCEINLINE __vec16_f __cast_bits(__vec16_f, __vec16_i32 val) { return _mm512_castsi512_ps(val); } +// CAST_BITS(__vec16_i32, i32, __vec16_f, f) +static FORCEINLINE __vec16_i32 __cast_bits(__vec16_i32, __vec16_f val) { return _mm512_castps_si512(val); } -#if 0 -CAST_BITS(__vec16_d, d, __vec16_i64, i64) -CAST_BITS(__vec16_i64, i64, __vec16_d, d) -#else -static FORCEINLINE __vec16_i64 __cast_bits(__vec16_i64, __vec16_d val) { - return *(__vec16_i64*)&val; -} -static FORCEINLINE __vec16_d __cast_bits(__vec16_d, __vec16_i64 val) { - return *(__vec16_d*)&val; -} -#endif +// CAST_BITS(__vec16_d, d, __vec16_i64, i64) +static FORCEINLINE __vec16_i64 __cast_bits(__vec16_i64, __vec16_d val) { return *(__vec16_i64*)&val; } +// CAST_BITS(__vec16_i64, i64, __vec16_d, d) +static FORCEINLINE __vec16_d __cast_bits(__vec16_d, __vec16_i64 val) { return *(__vec16_d*)&val; } +/* knc::macro::used */ #define CAST_BITS_SCALAR(TO, FROM) \ static FORCEINLINE TO __cast_bits(TO, FROM v) { \ union { \ @@ -1809,6 +1587,7 @@ CAST_BITS_SCALAR(double, int64_t) /////////////////////////////////////////////////////////////////////////// // various math functions +/////////////////////////////////////////////////////////////////////////// static FORCEINLINE void __fastmath() { } @@ -1837,168 +1616,100 @@ static FORCEINLINE double __ceil_uniform_double(double v) { return ceil(v); } -#if 0 -UNARY_OP(__vec16_f, __round_varying_float, roundf) -UNARY_OP(__vec16_f, __floor_varying_float, floorf) -UNARY_OP(__vec16_f, __ceil_varying_float, ceilf) -#else -static FORCEINLINE __vec16_f __round_varying_float(__vec16_f v) { - return _mm512_round_ps(v, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE); -} - -static FORCEINLINE __vec16_f __floor_varying_float(__vec16_f v) { - return _mm512_floor_ps(v); -} - -static FORCEINLINE __vec16_f __ceil_varying_float(__vec16_f v) { - return _mm512_ceil_ps(v); -} -#endif - -#if 0 -UNARY_OP(__vec16_d, __round_varying_double, round) -UNARY_OP(__vec16_d, __floor_varying_double, floor) -UNARY_OP(__vec16_d, __ceil_varying_double, ceil) -#else -static FORCEINLINE __vec16_d __round_varying_float(__vec16_d v) { - __vec16_d ret; - ret.v1 = _mm512_svml_round_pd(v.v1); - ret.v2 = _mm512_svml_round_pd(v.v2); - return ret; -} - -static FORCEINLINE __vec16_d __floor_varying_float(__vec16_d v) { - __vec16_d ret; - ret.v1 = _mm512_floor_pd(v.v1); - ret.v2 = _mm512_floor_pd(v.v2); - return ret; -} - -static FORCEINLINE __vec16_d __ceil_varying_float(__vec16_d v) { - __vec16_d ret; - ret.v1 = _mm512_ceil_pd(v.v1); - ret.v2 = _mm512_ceil_pd(v.v2); - return ret; -} -#endif +static FORCEINLINE 
__vec16_f __round_varying_float(__vec16_f v) { return _mm512_round_ps(v, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE); } +static FORCEINLINE __vec16_f __floor_varying_float(__vec16_f v) { return _mm512_floor_ps(v); } +static FORCEINLINE __vec16_f __ceil_varying_float(__vec16_f v) { return _mm512_ceil_ps(v); } +static FORCEINLINE __vec16_d __round_varying_float(__vec16_d v) { return __vec16_d(_mm512_svml_round_pd(v.v1), _mm512_svml_round_pd(v.v2)); } +static FORCEINLINE __vec16_d __floor_varying_float(__vec16_d v) { return __vec16_d(_mm512_floor_pd(v.v1), _mm512_floor_pd(v.v2)); } +static FORCEINLINE __vec16_d __ceil_varying_float(__vec16_d v) { return __vec16_d(_mm512_ceil_pd(v.v1), _mm512_ceil_pd(v.v2)); } // min/max -static FORCEINLINE float __min_uniform_float(float a, float b) { return (ab) ? a : b; } +static FORCEINLINE float __min_uniform_float (float a, float b) { return (ab) ? a : b; } static FORCEINLINE double __min_uniform_double(double a, double b) { return (ab) ? a : b; } -static FORCEINLINE int32_t __min_uniform_int32(int32_t a, int32_t b) { return (ab) ? a : b; } +static FORCEINLINE int32_t __min_uniform_int32 ( int32_t a, int32_t b) { return (ab) ? a : b; } static FORCEINLINE int32_t __min_uniform_uint32(uint32_t a, uint32_t b) { return (ab) ? a : b; } -static FORCEINLINE int64_t __min_uniform_int64(int64_t a, int64_t b) { return (ab) ? a : b; } +static FORCEINLINE int64_t __min_uniform_int64 ( int64_t a, int64_t b) { return (ab) ? a : b; } static FORCEINLINE int64_t __min_uniform_uint64(uint64_t a, uint64_t b) { return (ab) ? a : b; } - -#if 0 -BINARY_OP_FUNC(__vec16_f, __max_varying_float, __max_uniform_float) -BINARY_OP_FUNC(__vec16_f, __min_varying_float, __min_uniform_float) -BINARY_OP_FUNC(__vec16_d, __max_varying_double, __max_uniform_double) -BINARY_OP_FUNC(__vec16_d, __min_varying_double, __min_uniform_double) -#else static FORCEINLINE __vec16_f __max_varying_float (__vec16_f v1, __vec16_f v2) { return _mm512_gmax_ps(v1, v2);} static FORCEINLINE __vec16_f __min_varying_float (__vec16_f v1, __vec16_f v2) { return _mm512_gmin_ps(v1, v2);} static FORCEINLINE __vec16_d __max_varying_double(__vec16_d v1, __vec16_d v2) { return __vec16_d(_mm512_gmax_pd(v1.v1, v2.v1),_mm512_gmax_pd(v1.v2,v2.v2));} static FORCEINLINE __vec16_d __min_varying_double(__vec16_d v1, __vec16_d v2) { return __vec16_d(_mm512_gmin_pd(v1.v1, v2.v1),_mm512_gmin_pd(v1.v2,v2.v2));} -#endif -#if 0 -BINARY_OP_FUNC(__vec16_i32, __max_varying_int32, __max_uniform_int32) -BINARY_OP_FUNC(__vec16_i32, __min_varying_int32, __min_uniform_int32) -BINARY_OP_FUNC(__vec16_i32, __max_varying_uint32, __max_uniform_uint32) -BINARY_OP_FUNC(__vec16_i32, __min_varying_uint32, __min_uniform_uint32) -#else static FORCEINLINE __vec16_i32 __max_varying_int32 (__vec16_i32 v1, __vec16_i32 v2) { return _mm512_max_epi32(v1, v2);} static FORCEINLINE __vec16_i32 __min_varying_int32 (__vec16_i32 v1, __vec16_i32 v2) { return _mm512_min_epi32(v1, v2);} static FORCEINLINE __vec16_i32 __max_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) { return _mm512_max_epu32(v1, v2);} static FORCEINLINE __vec16_i32 __min_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) { return _mm512_min_epu32(v1, v2);} -#endif -BINARY_OP_FUNC(__vec16_i64, __max_varying_int64, __max_uniform_int64) -BINARY_OP_FUNC(__vec16_i64, __min_varying_int64, __min_uniform_int64) +BINARY_OP_FUNC(__vec16_i64, __max_varying_int64, __max_uniform_int64) +BINARY_OP_FUNC(__vec16_i64, __min_varying_int64, __min_uniform_int64) BINARY_OP_FUNC(__vec16_i64, __max_varying_uint64, 
__max_uniform_uint64) BINARY_OP_FUNC(__vec16_i64, __min_varying_uint64, __min_uniform_uint64) // sqrt/rsqrt/rcp -static FORCEINLINE float __rsqrt_uniform_float(float v) { - return 1.f / sqrtf(v); -} +static FORCEINLINE float __rsqrt_uniform_float(float v) { return 1.f / sqrtf(v); } +static FORCEINLINE float __rcp_uniform_float (float v) { return 1.f / v; } +static FORCEINLINE float __sqrt_uniform_float (float v) { return sqrtf(v); } +static FORCEINLINE double __sqrt_uniform_double(double v) { return sqrt (v); } -static FORCEINLINE float __rcp_uniform_float(float v) { - return 1.f / v; -} - -static FORCEINLINE float __sqrt_uniform_float(float v) { - return sqrtf(v); -} - -static FORCEINLINE double __sqrt_uniform_double(double v) { - return sqrt(v); -} - -#if 0 -UNARY_OP(__vec16_f, __rcp_varying_float, __rcp_uniform_float) -UNARY_OP(__vec16_f, __rsqrt_varying_float, __rsqrt_uniform_float) -UNARY_OP(__vec16_f, __sqrt_varying_float, __sqrt_uniform_float) -UNARY_OP(__vec16_d, __sqrt_varying_double, __sqrt_uniform_double) -#else -static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) { +static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) +{ #ifdef ISPC_FAST_MATH - return _mm512_rcp23_ps(v); // Approximation with 23 bits of accuracy. + return _mm512_rcp23_ps(v); // Approximation with 23 bits of accuracy. #else - return _mm512_recip_ps(v); + return _mm512_recip_ps(v); #endif } -static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) { +static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) +{ #ifdef ISPC_FAST_MATH - return _mm512_rsqrt23_ps(v); // Approximation with 0.775ULP accuracy + return _mm512_rsqrt23_ps(v); // Approximation with 0.775ULP accuracy #else - return _mm512_invsqrt_ps(v); + return _mm512_invsqrt_ps(v); #endif } -static FORCEINLINE __vec16_f __sqrt_varying_float (__vec16_f v) { return _mm512_sqrt_ps(v);} -static FORCEINLINE __vec16_d __sqrt_varying_double(__vec16_d v) { return __vec16_d(_mm512_sqrt_pd(v.v1),_mm512_sqrt_pd(v.v2));} -#endif +static FORCEINLINE __vec16_f __sqrt_varying_float (__vec16_f v) { return _mm512_sqrt_ps(v);} +static FORCEINLINE __vec16_d __sqrt_varying_double(__vec16_d v) { return __vec16_d(_mm512_sqrt_pd(v.v1),_mm512_sqrt_pd(v.v2));} /////////////////////////////////////////////////////////////////////////// // svml /////////////////////////////////////////////////////////////////////////// -static FORCEINLINE __vec16_f __svml_sinf(__vec16_f v) { return _mm512_sin_ps(v); } -static FORCEINLINE __vec16_f __svml_asinf(__vec16_f v) { return _mm512_asin_ps(v); } -static FORCEINLINE __vec16_f __svml_cosf(__vec16_f v) { return _mm512_cos_ps(v); } -static FORCEINLINE __vec16_f __svml_tanf(__vec16_f v) { return _mm512_tan_ps(v); } -static FORCEINLINE __vec16_f __svml_atanf(__vec16_f v) { return _mm512_atan_ps(v); } +static FORCEINLINE __vec16_f __svml_sinf (__vec16_f v) { return _mm512_sin_ps(v); } +static FORCEINLINE __vec16_f __svml_asinf (__vec16_f v) { return _mm512_asin_ps(v); } +static FORCEINLINE __vec16_f __svml_cosf (__vec16_f v) { return _mm512_cos_ps(v); } +static FORCEINLINE __vec16_f __svml_tanf (__vec16_f v) { return _mm512_tan_ps(v); } +static FORCEINLINE __vec16_f __svml_atanf (__vec16_f v) { return _mm512_atan_ps(v); } static FORCEINLINE __vec16_f __svml_atan2f(__vec16_f a, __vec16_f b) { return _mm512_atan2_ps(a,b); } -static FORCEINLINE __vec16_f __svml_expf(__vec16_f v) { return _mm512_exp_ps(v); } -static FORCEINLINE __vec16_f __svml_logf(__vec16_f v) { return _mm512_log_ps(v); } -static FORCEINLINE __vec16_f 
__svml_powf(__vec16_f a, __vec16_f b) { return _mm512_pow_ps(a,b); } +static FORCEINLINE __vec16_f __svml_expf (__vec16_f v) { return _mm512_exp_ps(v); } +static FORCEINLINE __vec16_f __svml_logf (__vec16_f v) { return _mm512_log_ps(v); } +static FORCEINLINE __vec16_f __svml_powf (__vec16_f a, __vec16_f b) { return _mm512_pow_ps(a,b); } -static FORCEINLINE __vec16_d __svml_sind(__vec16_d v) { return __vec16_d(_mm512_sin_pd(v.v1), _mm512_sin_pd(v.v2)); } -static FORCEINLINE __vec16_d __svml_asind(__vec16_d v) { return __vec16_d(_mm512_asin_pd(v.v1), _mm512_asin_pd(v.v2)); } -static FORCEINLINE __vec16_d __svml_cosd(__vec16_d v) { return __vec16_d(_mm512_cos_pd(v.v1), _mm512_cos_pd(v.v2)); } -static FORCEINLINE __vec16_d __svml_tand(__vec16_d v) { return __vec16_d(_mm512_tan_pd(v.v1), _mm512_tan_pd(v.v2)); } -static FORCEINLINE __vec16_d __svml_atand(__vec16_d v) { return __vec16_d(_mm512_atan_pd(v.v1), _mm512_atan_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_sind (__vec16_d v) { return __vec16_d(_mm512_sin_pd(v.v1), _mm512_sin_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_asind (__vec16_d v) { return __vec16_d(_mm512_asin_pd(v.v1), _mm512_asin_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_cosd (__vec16_d v) { return __vec16_d(_mm512_cos_pd(v.v1), _mm512_cos_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_tand (__vec16_d v) { return __vec16_d(_mm512_tan_pd(v.v1), _mm512_tan_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_atand (__vec16_d v) { return __vec16_d(_mm512_atan_pd(v.v1), _mm512_atan_pd(v.v2)); } static FORCEINLINE __vec16_d __svml_atan2d(__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_atan2_pd(a.v1,b.v1), _mm512_atan2_pd(a.v2,b.v2)); } -static FORCEINLINE __vec16_d __svml_expd(__vec16_d v) { return __vec16_d(_mm512_exp_pd(v.v1), _mm512_exp_pd(v.v2)); } -static FORCEINLINE __vec16_d __svml_logd(__vec16_d v) { return __vec16_d(_mm512_log_pd(v.v1), _mm512_log_pd(v.v2)); } -static FORCEINLINE __vec16_d __svml_powd(__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_pow_pd(a.v1,b.v1), _mm512_pow_pd(a.v2,b.v2)); } +static FORCEINLINE __vec16_d __svml_expd (__vec16_d v) { return __vec16_d(_mm512_exp_pd(v.v1), _mm512_exp_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_logd (__vec16_d v) { return __vec16_d(_mm512_log_pd(v.v1), _mm512_log_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_powd (__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_pow_pd(a.v1,b.v1), _mm512_pow_pd(a.v2,b.v2)); } /////////////////////////////////////////////////////////////////////////// // bit ops +/////////////////////////////////////////////////////////////////////////// static FORCEINLINE int32_t __popcnt_int32(uint32_t v) { int count = 0; @@ -2064,42 +1775,23 @@ static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { /////////////////////////////////////////////////////////////////////////// // reductions +/////////////////////////////////////////////////////////////////////////// -#if 0 -REDUCE_ADD(float, __vec16_f, __reduce_add_float) -REDUCE_MINMAX(float, __vec16_f, __reduce_min_float, <) -REDUCE_MINMAX(float, __vec16_f, __reduce_max_float, >) -#else static FORCEINLINE float __reduce_add_float(__vec16_f v) { return _mm512_reduce_add_ps(v); } static FORCEINLINE float __reduce_min_float(__vec16_f v) { return _mm512_reduce_min_ps(v); } static FORCEINLINE float __reduce_max_float(__vec16_f v) { return _mm512_reduce_max_ps(v); } -#endif -#if 0 -REDUCE_ADD(double, __vec16_d, __reduce_add_double) -REDUCE_MINMAX(double, __vec16_d, __reduce_min_double, <) -REDUCE_MINMAX(double, 
__vec16_d, __reduce_max_double, >) -#else static FORCEINLINE float __reduce_add_double(__vec16_d v) { return _mm512_reduce_add_pd(v.v1) + _mm512_reduce_add_pd(v.v2); } static FORCEINLINE float __reduce_min_double(__vec16_d v) { return std::min(_mm512_reduce_min_pd(v.v1), _mm512_reduce_min_pd(v.v2)); } static FORCEINLINE float __reduce_max_double(__vec16_d v) { return std::max(_mm512_reduce_max_pd(v.v1), _mm512_reduce_max_pd(v.v2)); } -#endif -#if 0 -REDUCE_ADD (int64_t, __vec16_i32, __reduce_add_int32) -REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_min_int32, <) -REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_max_int32, >) -REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_min_uint32, <) -REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_max_uint32, >) -#else static FORCEINLINE int64_t __reduce_add_int32 (__vec16_i32 v) { return _mm512_reduce_add_epi32(v);} static FORCEINLINE int32_t __reduce_min_int32 (__vec16_i32 v) { return _mm512_reduce_min_epi32(v);} static FORCEINLINE int32_t __reduce_max_int32 (__vec16_i32 v) { return _mm512_reduce_max_epi32(v);} static FORCEINLINE uint32_t __reduce_min_uint32 (__vec16_i32 v) { return _mm512_reduce_min_epu32(v);} static FORCEINLINE uint32_t __reduce_max_uint32 (__vec16_i32 v) { return _mm512_reduce_max_epu32(v);} -#endif REDUCE_ADD ( int16_t, __vec16_i8, __reduce_add_int8) REDUCE_ADD ( int32_t, __vec16_i16, __reduce_add_int16) @@ -2111,6 +1803,7 @@ REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >) /////////////////////////////////////////////////////////////////////////// // masked load/store +/////////////////////////////////////////////////////////////////////////// static FORCEINLINE __vec16_i8 __masked_load_i8(void *p, __vec16_i1 mask) { @@ -2132,53 +1825,31 @@ static FORCEINLINE __vec16_i16 __masked_load_i16(void *p, return ret; } -#if 0 -static FORCEINLINE __vec16_i32 __masked_load_i32(void *p, - __vec16_i1 mask) { - __vec16_i32 ret; - int32_t *ptr = (int32_t *)p; - for (int i = 0; i < 16; ++i) - if ((mask.v & (1 << i)) != 0) - ret[i] = ptr[i]; - return ret; -} -#else -static FORCEINLINE __vec16_i32 __masked_load_i32(void *p, __vec16_i1 mask) { +static FORCEINLINE __vec16_i32 __masked_load_i32(void *p, __vec16_i1 mask) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - return _mm512_mask_load_epi32(__vec16_i32(), mask, p); + return _mm512_mask_load_epi32(__vec16_i32(), mask, p); #else - __vec16_i32 tmp; - tmp.v = _mm512_mask_extloadunpacklo_epi32(tmp.v, 0xFFFF, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - tmp.v = _mm512_mask_extloadunpackhi_epi32(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - __vec16_i32 ret; - return _mm512_mask_mov_epi32(ret.v, mask, tmp.v); + __vec16_i32 tmp; + tmp.v = _mm512_mask_extloadunpacklo_epi32(tmp.v, 0xFFFF, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_extloadunpackhi_epi32(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __vec16_i32 ret; + return _mm512_mask_mov_epi32(ret.v, mask, tmp.v); #endif } -#endif -#if 0 -static FORCEINLINE __vec16_f __masked_load_float(void *p, - __vec16_i1 mask) { - __vec16_f ret; - float *ptr = (float *)p; - for (int i = 0; i < 16; ++i) - if ((mask.v & (1 << i)) != 0) - ret[i] = ptr[i]; - return ret; -} -#else -static FORCEINLINE __vec16_f __masked_load_float(void *p, __vec16_i1 mask) { +static FORCEINLINE __vec16_f __masked_load_float(void *p, __vec16_i1 mask) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - return _mm512_mask_load_ps(_mm512_undefined_ps(), mask,p); + return _mm512_mask_load_ps(_mm512_undefined_ps(), mask,p); #else - __vec16_f 
tmp; - tmp.v = _mm512_mask_extloadunpacklo_ps(tmp.v, 0xFFFF, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - tmp.v = _mm512_mask_extloadunpackhi_ps(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - __vec16_f ret; - return _mm512_mask_mov_ps(ret.v, mask, tmp.v); + __vec16_f tmp; + tmp.v = _mm512_mask_extloadunpacklo_ps(tmp.v, 0xFFFF, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_extloadunpackhi_ps(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + __vec16_f ret; + return _mm512_mask_mov_ps(ret.v, mask, tmp.v); #endif } -#endif static FORCEINLINE __vec16_i64 __masked_load_i64(void *p, __vec16_i1 mask) { @@ -2190,40 +1861,29 @@ static FORCEINLINE __vec16_i64 __masked_load_i64(void *p, return ret; } -#if 0 -static FORCEINLINE __vec16_d __masked_load_double(void *p, - __vec16_i1 mask) { - __vec16_d ret; - double *ptr = (double *)p; - for (int i = 0; i < 16; ++i) - if ((mask.v & (1 << i)) != 0) - ret[i] = ptr[i]; - return ret; -} -#else -static FORCEINLINE __vec16_d __masked_load_double(void *p, __vec16_i1 mask) { +static FORCEINLINE __vec16_d __masked_load_double(void *p, __vec16_i1 mask) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - __vec16_d ret; - __vec16_i1 tmp_m = mask; - tmp_m = _mm512_kswapb(tmp_m, tmp_m); - ret.v1 = _mm512_mask_load_pd(ret.v1, mask, p); - ret.v2 = _mm512_mask_load_pd(ret.v2, tmp_m, (uint8_t*)p+64); - return ret; + __vec16_d ret; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + ret.v1 = _mm512_mask_load_pd(ret.v1, mask, p); + ret.v2 = _mm512_mask_load_pd(ret.v2, tmp_m, (uint8_t*)p+64); + return ret; #else - __vec16_d tmp; - tmp.v1 = _mm512_mask_extloadunpacklo_pd(tmp.v1, 0xFF, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v1 = _mm512_mask_extloadunpackhi_pd(tmp.v1, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v2 = _mm512_mask_extloadunpacklo_pd(tmp.v2, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v2 = _mm512_mask_extloadunpackhi_pd(tmp.v2, 0xFF, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - __vec16_d ret; - __vec16_i1 tmp_m = mask; - tmp_m = _mm512_kswapb(tmp_m, tmp_m); - ret.v1 = _mm512_mask_mov_pd(ret.v1, mask, tmp.v1); - ret.v2 = _mm512_mask_mov_pd(ret.v2, tmp_m, tmp.v2); - return ret; + __vec16_d tmp; + tmp.v1 = _mm512_mask_extloadunpacklo_pd(tmp.v1, 0xFF, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_mask_extloadunpackhi_pd(tmp.v1, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_mask_extloadunpacklo_pd(tmp.v2, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_mask_extloadunpackhi_pd(tmp.v2, 0xFF, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + __vec16_d ret; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + ret.v1 = _mm512_mask_mov_pd(ret.v1, mask, tmp.v1); + ret.v2 = _mm512_mask_mov_pd(ret.v2, tmp_m, tmp.v2); + return ret; #endif } -#endif static FORCEINLINE void __masked_store_i8(void *p, __vec16_i8 val, @@ -2242,52 +1902,33 @@ static FORCEINLINE void __masked_store_i16(void *p, __vec16_i16 val, ptr[i] = val[i]; } -#if 0 -static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, - __vec16_i1 mask) { - int32_t *ptr = (int32_t *)p; - for (int i = 0; i < 16; ++i) - if ((mask.v & (1 << i)) != 0) - ptr[i] = val[i]; -} -#else -static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1 mask) { +static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1 mask) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - _mm512_mask_store_epi32(p, mask, 
val.v); + _mm512_mask_store_epi32(p, mask, val.v); #else - __vec16_i32 tmp; - tmp.v = _mm512_extloadunpacklo_epi32(tmp.v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - tmp.v = _mm512_extloadunpackhi_epi32(tmp.v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - tmp.v = _mm512_mask_mov_epi32(tmp.v, mask, val.v); - _mm512_extpackstorelo_epi32(p, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_epi32((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + __vec16_i32 tmp; + tmp.v = _mm512_extloadunpacklo_epi32(tmp.v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_extloadunpackhi_epi32(tmp.v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_mov_epi32(tmp.v, mask, val.v); + _mm512_extpackstorelo_epi32(p, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); #endif } -#endif -#if 0 -static FORCEINLINE void __masked_store_float(void *p, __vec16_f val, - __vec16_i1 mask) { - float *ptr = (float *)p; - for (int i = 0; i < 16; ++i) - if ((mask.v & (1 << i)) != 0) - ptr[i] = val[i]; -} -#else -static FORCEINLINE void __masked_store_float(void *p, __vec16_f val, - __vec16_i1 mask) { +static FORCEINLINE void __masked_store_float(void *p, __vec16_f val, __vec16_i1 mask) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - _mm512_mask_store_ps(p, mask, val.v); + _mm512_mask_store_ps(p, mask, val.v); #else - __vec16_f tmp; - tmp.v = _mm512_extloadunpacklo_ps(tmp.v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - tmp.v = _mm512_extloadunpackhi_ps(tmp.v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - tmp.v = _mm512_mask_mov_ps(tmp.v, mask, val.v); - _mm512_extpackstorelo_ps(p, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_ps((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + __vec16_f tmp; + tmp.v = _mm512_extloadunpacklo_ps(tmp.v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_extloadunpackhi_ps(tmp.v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_mov_ps(tmp.v, mask, val.v); + _mm512_extpackstorelo_ps(p, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_ps((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); #endif } -#endif static FORCEINLINE void __masked_store_i64(void *p, __vec16_i64 val, __vec16_i1 mask) { @@ -2297,39 +1938,29 @@ static FORCEINLINE void __masked_store_i64(void *p, __vec16_i64 val, ptr[i] = val[i]; } -#if 0 -static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, - __vec16_i1 mask) { - double *ptr = (double *)p; - for (int i = 0; i < 16; ++i) - if ((mask.v & (1 << i)) != 0) - ptr[i] = val[i]; -} -#else -static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, - __vec16_i1 mask) { +static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, __vec16_i1 mask) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - __vec16_i1 tmp_m = mask; - tmp_m = _mm512_kswapb(tmp_m, tmp_m); - _mm512_mask_store_pd(p, mask, val.v1); - _mm512_mask_store_pd((uint8_t*)p+64, tmp_m, val.v2); + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + _mm512_mask_store_pd(p, mask, val.v1); + _mm512_mask_store_pd((uint8_t*)p+64, tmp_m, val.v2); #else - __vec16_d tmp; - __vec16_i1 tmp_m = mask; - tmp_m = _mm512_kswapb(tmp_m, tmp_m); - tmp.v1 = _mm512_extloadunpacklo_pd(tmp.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v1 = _mm512_extloadunpackhi_pd(tmp.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v2 = 
_mm512_extloadunpacklo_pd(tmp.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v2 = _mm512_extloadunpackhi_pd(tmp.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v1 = _mm512_mask_mov_pd(tmp.v1, mask, val.v1); - tmp.v2 = _mm512_mask_mov_pd(tmp.v2, tmp_m, val.v2); - _mm512_extpackstorelo_pd(p, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_pd((uint8_t*)p+64, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorelo_pd((uint8_t*)p+64, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_pd((uint8_t*)p+128, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + __vec16_d tmp; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + tmp.v1 = _mm512_extloadunpacklo_pd(tmp.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_extloadunpackhi_pd(tmp.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_extloadunpacklo_pd(tmp.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_extloadunpackhi_pd(tmp.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_mask_mov_pd(tmp.v1, mask, val.v1); + tmp.v2 = _mm512_mask_mov_pd(tmp.v2, tmp_m, val.v2); + _mm512_extpackstorelo_pd(p, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_pd((uint8_t*)p+64, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+128, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); #endif } -#endif static FORCEINLINE void __masked_store_blend_i8(void *p, __vec16_i8 val, __vec16_i1 mask) { @@ -2363,9 +1994,11 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val, /////////////////////////////////////////////////////////////////////////// // gather/scatter +/////////////////////////////////////////////////////////////////////////// // offsets * offsetScale is in bytes (for all of these) +/* knc::macro::used */ #define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ OTYPE offset, __vec16_i1 mask) { \ @@ -2381,21 +2014,19 @@ static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ /****************/ -#if 0 -GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) -#else +// GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) static FORCEINLINE __vec16_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) { - // (iw): need to temporarily store as int because gathers can only return ints. - __vec16_i32 tmp = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, base, - _MM_UPCONV_EPI32_SINT8, scale, - _MM_HINT_NONE); - // now, downconverting to chars into temporary char vector - __vec16_i8 ret; - _mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); - return ret; + // (iw): need to temporarily store as int because gathers can only return ints. 
+ __vec16_i32 tmp = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, base, + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + // now, downconverting to chars into temporary char vector + __vec16_i8 ret; + _mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; } -#if 0 /* evghenii::fails on gather-int8-2 & gather-int8-4 */ +#if 0 /* knc::fails on gather-int8-2 & gather-int8-4 */ static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2422,21 +2053,18 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_ #else GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) #endif -#endif /****************/ GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) /****************/ -#if 0 -GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32) -#else +// GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32) static FORCEINLINE __vec16_i32 __gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) { - return _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, - base, _MM_UPCONV_EPI32_NONE, scale, - _MM_HINT_NONE); + return _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, + base, _MM_UPCONV_EPI32_NONE, scale, + _MM_HINT_NONE); } -#if 0 /* evghenii::fails on gather-int32-2 & gather-int32-4 */ +#if 0 /* knc::fails on gather-int32-2 & gather-int32-4 */ static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2464,18 +2092,15 @@ static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint3 #else GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) #endif -#endif /****************/ -#if 0 -GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float) -#else +// GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float) static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) { - return _mm512_mask_i32extgather_ps(_mm512_undefined_ps(), mask, offsets, - base, _MM_UPCONV_PS_NONE, scale, - _MM_HINT_NONE); + return _mm512_mask_i32extgather_ps(_mm512_undefined_ps(), mask, offsets, + base, _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); } -#if 0 /* evghenii::fails on gather-float-2 gather-float-4 & soa-14 */ +#if 0 /* knc::fails on gather-float-2 gather-float-4 & soa-14 */ static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2503,30 +2128,27 @@ static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint3 #else GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float) #endif -#endif /****************/ GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) /****************/ -#if 0 -GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_base_offsets32_double) 
-#else +// GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_base_offsets32_double) static FORCEINLINE __vec16_d __gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) { - __vec16_d ret; - ret.v1 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets, - base, _MM_UPCONV_PD_NONE, scale, - _MM_HINT_NONE); - __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); - const __mmask8 mask8 = 0x00FF & (mask >> 8); /* evghenii::testme */ - ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask8, shuffled_offsets, - base, _MM_UPCONV_PD_NONE, scale, - _MM_HINT_NONE); - return ret; + __vec16_d ret; + ret.v1 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); + __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); + const __mmask8 mask8 = 0x00FF & (mask >> 8); /* knc::testme */ + ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask8, shuffled_offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); + return ret; } -#endif GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_base_offsets64_double) +/* knc::macro::used */ #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ VTYPE ret; \ @@ -2537,13 +2159,13 @@ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ } \ return ret; \ } +/* knc::macro::used */ #define GATHER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ return FUNC1(0, 1, ptrs, mask); \ } -#if 1 /***********/ GATHER_GENERALF(__vec16_i8, int8_t, __vec16_i32, __gather32_i8, __gather_base_offsets32_i8) GATHER_GENERALF(__vec16_i16, int16_t, __vec16_i32, __gather32_i16, __gather_base_offsets32_i16) @@ -2559,10 +2181,10 @@ GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64); GATHER_GENERAL(__vec16_f, float, __vec16_i64, __gather64_float); GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double); /***********/ -#endif // scatter +/* knc::macro::used */ #define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \ OTYPE offset, VTYPE val, \ @@ -2583,16 +2205,14 @@ SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64 SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16) SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16) /*****************/ -#if 0 -SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32) -SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) -#else +// SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32) static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec16_i32 offsets, __vec16_i32 val, __vec16_i1 mask) { - _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, - _MM_DOWNCONV_EPI32_NONE, scale, - _MM_HINT_NONE); + _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); } +// SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i32 value, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2614,19 +2234,16 
@@ static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t sc still_to_do = _mm512_kxor(match,still_to_do); } } -#endif /*****************/ -#if 0 -SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float) -#else +// SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float) static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets, __vec16_f val, __vec16_i1 mask) { - _mm512_mask_i32extscatter_ps(base, mask, offsets, val, - _MM_DOWNCONV_PS_NONE, scale, - _MM_HINT_NONE); + _mm512_mask_i32extscatter_ps(base, mask, offsets, val, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); } -#if 0 /* evghenii::fails on soa-10 & soa-13 , it is very similar to __scatter_base_offsets64_it32, but that passes tests, why ?!? */ +#if 0 /* knc::fails on soa-10 & soa-13 , it is very similar to __scatter_base_offsets64_it32, but that passes tests, why ?!? */ static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_f value, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2651,29 +2268,26 @@ static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t #else SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float) #endif -#endif /*****************/ SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) /*****************/ -#if 0 /* evghenii::to implement */ -SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_base_offsets32_double) -#else /* evghenii:testme */ +// SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_base_offsets32_double) static FORCEINLINE void __scatter_base_offsets32_double(void *base, uint32_t scale, __vec16_i32 offsets, __vec16_d val, __vec16_i1 mask) { - _mm512_mask_i32loextscatter_pd(base, mask, offsets, val.v1, + _mm512_mask_i32loextscatter_pd(base, mask, offsets, val.v1, _MM_DOWNCONV_PD_NONE, scale, _MM_HINT_NONE); - __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); - const __mmask8 mask8 = 0x00FF & (mask >> 8); /* evghenii::testme */ - _mm512_mask_i32loextscatter_pd(base, mask8, shuffled_offsets, val.v2, + __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); + const __mmask8 mask8 = 0x00FF & (mask >> 8); /* knc::testme */ + _mm512_mask_i32loextscatter_pd(base, mask8, shuffled_offsets, val.v2, _MM_DOWNCONV_PD_NONE, scale, _MM_HINT_NONE); } -#endif SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_base_offsets64_double) +/* knc::macro::used */ #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ VTYPE ret; \ @@ -2683,12 +2297,12 @@ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ *ptr = val[i]; \ } \ } +/* knc::macro::used */ #define SCATTER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ return FUNC1(0, 1, ptrs, val, mask); \ } -#if 1 /***********/ SCATTER_GENERALF(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8, __scatter_base_offsets32_i8) SCATTER_GENERALF(__vec16_i16, int16_t, __vec16_i32, __scatter32_i16, __scatter_base_offsets32_i16) @@ -2704,109 +2318,47 @@ SCATTER_GENERAL(__vec16_f, float, __vec16_i64, __scatter64_float) SCATTER_GENERAL(__vec16_i64, int64_t, 
__vec16_i64, __scatter64_i64) SCATTER_GENERAL(__vec16_d, double, __vec16_i64, __scatter64_double) /***********/ -#endif /////////////////////////////////////////////////////////////////////////// // packed load/store +/////////////////////////////////////////////////////////////////////////// -#if 0 -static FORCEINLINE int32_t __packed_load_active(int32_t *ptr, __vec16_i32 *val, - __vec16_i1 mask) { - int count = 0; - for (int i = 0; i < 16; ++i) { - if ((mask.v & (1 << i)) != 0) { - val->operator[](i) = *ptr++; - ++count; - } - } - return count; -} -#endif -#if 0 -static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, - __vec16_i32 val, - __vec16_i1 mask) { - int count = 0; - for (int i = 0; i < 16; ++i) { - if ((mask.v & (1 << i)) != 0) { - *ptr++ = val[i]; - ++count; - } - } - return count; +static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec16_i32 *val, __vec16_i1 mask) +{ + __vec16_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(mask)); } -#endif -#if 0 -static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, - __vec16_i32 *val, - __vec16_i1 mask) { - int count = 0; - for (int i = 0; i < 16; ++i) { - if ((mask.v & (1 << i)) != 0) { - val->operator[](i) = *ptr++; - ++count; - } - } - return count; +static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, __vec16_i1 mask) +{ + _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(mask)); } -static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, - __vec16_i32 val, - __vec16_i1 mask) { - int count = 0; - for (int i = 0; i < 16; ++i) { - if ((mask.v & (1 << i)) != 0) { - *ptr++ = val[i]; - ++count; - } - } - return count; -} -#endif -#if 1 -static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec16_i32 *val, - __vec16_i1 mask) { - __vec16_i32 v = __load<64>(val); - v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - __store<64>(val, v); - return _mm_countbits_32(uint32_t(mask)); +static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec16_i32 *val, __vec16_i1 mask) +{ + __vec16_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(mask)); } -#endif -#if 1 -static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, - __vec16_i1 mask) { - _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - return _mm_countbits_32(uint32_t(mask)); +static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec16_i32 val, __vec16_i1 mask) +{ + _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return 
_mm_countbits_32(uint32_t(mask)); } -#endif - -#if 1 -static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec16_i32 *val, - __vec16_i1 mask) { - __vec16_i32 v = __load<64>(val); - v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - __store<64>(val, v); - return _mm_countbits_32(uint32_t(mask)); -} -#endif - -#if 1 -static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec16_i32 val, - __vec16_i1 mask) { - _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - return _mm_countbits_32(uint32_t(mask)); -} -#endif /////////////////////////////////////////////////////////////////////////// // aos/soa +/////////////////////////////////////////////////////////////////////////// static FORCEINLINE void __soa_to_aos3_float(__vec16_f v0, __vec16_f v1, __vec16_f v2, float *ptr) { @@ -2848,6 +2400,7 @@ static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec16_f *out0, __vec16 /////////////////////////////////////////////////////////////////////////// // prefetch +/////////////////////////////////////////////////////////////////////////// static FORCEINLINE void __prefetch_read_uniform_1(unsigned char *p) { _mm_prefetch((char *)p, _MM_HINT_T0); // prefetch into L1$ @@ -2868,6 +2421,7 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) { /////////////////////////////////////////////////////////////////////////// // atomics +/////////////////////////////////////////////////////////////////////////// static FORCEINLINE uint32_t __atomic_add(uint32_t *p, uint32_t v) { #ifdef _MSC_VER From 10223cfac3a8d0f5d80bd5eff095055e593764cd Mon Sep 17 00:00:00 2001 From: evghenii Date: Sat, 5 Oct 2013 15:23:55 +0300 Subject: [PATCH 121/124] workong on shuffle/rotate for double, there seems to be a bug in cvt2zmm cvt2hilo --- examples/intrinsics/knc-i1x16.h | 85 ++++++++++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 2 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index e712c969..807781f0 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -143,8 +143,14 @@ PRE_ALIGN(64) struct __vec16_f struct PRE_ALIGN(128) __vec16_d { - __m512d v1; - __m512d v2; + union { + __m512d v1; + __m512d v_hi; + }; + union { + __m512d v2; + __m512d v_lo; + }; FORCEINLINE __vec16_d() : v1(_mm512_undefined_pd()), v2(_mm512_undefined_pd()) {} FORCEINLINE __vec16_d(const __m512d _v1, const __m512d _v2) : v1(_v1), v2(_v2) {} FORCEINLINE __vec16_d(const __vec16_d &o) : v1(o.v1), v2(o.v2) {} @@ -158,6 +164,40 @@ struct PRE_ALIGN(128) __vec16_d } FORCEINLINE const double& operator[](const int i) const { return ((double*)this)[i]; } FORCEINLINE double& operator[](const int i) { return ((double*)this)[i]; } + FORCEINLINE __vec16_d cvt2hilo() const + { + __m512i _hi, _lo; + _hi = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + _mm512_castpd_si512(v1)); + _hi = _mm512_mask_permutevar_epi32(_hi, 0x00FF, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + _mm512_castpd_si512(v2)); + _lo = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + _mm512_castpd_si512(v1)); + _lo = 
_mm512_mask_permutevar_epi32(_lo, 0x00FF, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + _mm512_castpd_si512(v2)); + return __vec16_d(_mm512_castsi512_pd(_hi), _mm512_castsi512_pd(_lo)); + } + FORCEINLINE __vec16_d cvt2zmm() const + { + __m512i _v1, _v2; + _v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + _mm512_castpd_si512(v_hi)); + _v1 = _mm512_mask_permutevar_epi32(_v1, 0x5555, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + _mm512_castpd_si512(v_lo)); + _v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + _mm512_castpd_si512(v_hi)); + _v2 = _mm512_mask_permutevar_epi32(_v2, 0x5555, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + _mm512_castpd_si512(v_lo)); + return __vec16_d(_mm512_castsi512_pd(_v1), _mm512_castsi512_pd(_v2)); + } } POST_ALIGN(128); struct PRE_ALIGN(128) __vec16_i64 @@ -1247,8 +1287,49 @@ static FORCEINLINE __vec16_d __broadcast_double(__vec16_d v, int index) return ret; } +#define CASTD2F(_v_, _v_hi_, _v_lo_) \ + __vec16_f _v_hi_, _v_lo_; \ + { \ + const __vec16_d v = _v_.cvt2hilo(); \ + _v_hi_ = _mm512_castpd_ps(v.v_hi); \ + _v_lo_ = _mm512_castpd_ps(v.v_lo); } +#define CASTF2D(_ret_hi_, _ret_lo_) \ + __vec16_d(_mm512_castps_pd(_ret_hi_), _mm512_castps_pd(_ret_lo_)).cvt2zmm() + +#if 0 /* knc::testme there appears to be no tests in ./tests for checking this functionality */ +static FORCEINLINE __vec16_d __rotate_double(const __vec16_d _v, const int index) +{ +// return _v; /* this one passes all tests , but most not */ + CASTD2F(_v, v_hi, v_lo); + const __vec16_f ret_hi = __rotate_float(v_hi, index); + const __vec16_f ret_lo = __rotate_float(v_lo, index); + return CASTF2D(ret_hi, ret_lo); +} +#else ROTATE(__vec16_d, double, double) +#endif + +#if 0 /* knc::fails ./tests/shuffle2-4.ispc ./tests/shuffle2-5.ispc */ +static FORCEINLINE __vec16_d __shuffle_double(__vec16_d _v, const __vec16_i32 index) +{ + CASTD2F(_v, v_hi, v_lo); + const __vec16_f ret_hi = __shuffle_float(v_hi, index); + const __vec16_f ret_lo = __shuffle_float(v_lo, index); + return CASTF2D(ret_hi, ret_lo); +} +static FORCEINLINE __vec16_d __shuffle2_double(__vec16_d _v0, __vec16_d _v1, const __vec16_i32 index) +{ + CASTD2F(_v0, v0_hi, v0_lo); + CASTD2F(_v1, v1_hi, v1_lo); + const __vec16_f ret_hi = __shuffle2_float(v0_hi, v1_hi, index); + const __vec16_f ret_lo = __shuffle2_float(v0_lo, v1_lo, index); + return CASTF2D(ret_hi, ret_lo); +} +#else SHUFFLES(__vec16_d, double, double) +#endif +#undef CASTD2F +#undef CASTF2D template static FORCEINLINE __vec16_d __load(const __vec16_d *p) \ { From 1b196520f6877c14203e5bc88ab37db6deeb88a7 Mon Sep 17 00:00:00 2001 From: evghenii Date: Sat, 5 Oct 2013 22:10:05 +0300 Subject: [PATCH 122/124] knc-i1x16.h is cleaned: int32,float,double are complete, int64 is partially complete --- examples/intrinsics/knc-i1x16.h | 271 ++++++++++++++++---------------- 1 file changed, 133 insertions(+), 138 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 807781f0..fb2cf618 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -141,6 +141,37 @@ PRE_ALIGN(64) struct __vec16_f FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } } POST_ALIGN(64); +static void zmm2hilo(const __m512i v1, const __m512i v2, __m512i &_hi, __m512i &_lo) +{ + _hi = 
_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v2); + _hi = _mm512_mask_permutevar_epi32(_hi, 0x00FF, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v1); + _lo = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v2); + _lo = _mm512_mask_permutevar_epi32(_lo, 0x00FF, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v1); +} +static void hilo2zmm(const __m512i v_hi, const __m512i v_lo, __m512i &_v1, __m512i &_v2) +{ + _v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v_hi); + _v2 = _mm512_mask_permutevar_epi32(_v2, 0x5555, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v_lo); + _v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v_hi); + _v1 = _mm512_mask_permutevar_epi32(_v1, 0x5555, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v_lo); +} + struct PRE_ALIGN(128) __vec16_d { union { @@ -166,36 +197,18 @@ struct PRE_ALIGN(128) __vec16_d FORCEINLINE double& operator[](const int i) { return ((double*)this)[i]; } FORCEINLINE __vec16_d cvt2hilo() const { + const __m512i _v1 = _mm512_castpd_si512(v1); + const __m512i _v2 = _mm512_castpd_si512(v2); __m512i _hi, _lo; - _hi = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - _mm512_castpd_si512(v1)); - _hi = _mm512_mask_permutevar_epi32(_hi, 0x00FF, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - _mm512_castpd_si512(v2)); - _lo = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - _mm512_castpd_si512(v1)); - _lo = _mm512_mask_permutevar_epi32(_lo, 0x00FF, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - _mm512_castpd_si512(v2)); + zmm2hilo(_v1, _v2, _hi, _lo); return __vec16_d(_mm512_castsi512_pd(_hi), _mm512_castsi512_pd(_lo)); } FORCEINLINE __vec16_d cvt2zmm() const { + const __m512i _hi = _mm512_castpd_si512(v_hi); + const __m512i _lo = _mm512_castpd_si512(v_lo); __m512i _v1, _v2; - _v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, - _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), - _mm512_castpd_si512(v_hi)); - _v1 = _mm512_mask_permutevar_epi32(_v1, 0x5555, - _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), - _mm512_castpd_si512(v_lo)); - _v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, - _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), - _mm512_castpd_si512(v_hi)); - _v2 = _mm512_mask_permutevar_epi32(_v2, 0x5555, - _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), - _mm512_castpd_si512(v_lo)); + hilo2zmm(_hi,_lo, _v1,_v2); return __vec16_d(_mm512_castsi512_pd(_v1), _mm512_castsi512_pd(_v2)); } } POST_ALIGN(128); @@ -226,38 +239,15 @@ struct PRE_ALIGN(128) __vec16_i64 FORCEINLINE int64_t& operator[](const int i) { return ((int64_t*)this)[i]; } FORCEINLINE __vec16_i64 cvt2hilo() const { - __m512i _hi, _lo; - _hi = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v1); - _hi = _mm512_mask_permutevar_epi32(_hi, 0x00FF, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v2); - _lo = 
_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v1); - _lo = _mm512_mask_permutevar_epi32(_lo, 0x00FF, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v2); - return __vec16_i64(_hi, _lo); + __vec16_i64 ret; + zmm2hilo(v1,v2,ret.v_hi,ret.v_lo); + return ret; } FORCEINLINE __vec16_i64 cvt2zmm() const { - __m512i _v1, _v2; - _v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, - _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), - v_hi); - _v1 = _mm512_mask_permutevar_epi32(_v1, 0x5555, - _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), - v_lo); - - _v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, - _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), - v_hi); - _v2 = _mm512_mask_permutevar_epi32(_v2, 0x5555, - _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), - v_lo); - return __vec16_i64(_v1, _v2); + __vec16_i64 ret; + hilo2zmm(v_hi,v_lo, ret.v1, ret.v2); + return ret; } } POST_ALIGN(128); @@ -305,15 +295,6 @@ static inline int32_t __extract_element(__vec16_i32, int); /////////////////////////////////////////////////////////////////////////// // macros... -/* knc::macro::not used */ -#define UNARY_OP(TYPE, NAME, OP) \ -static FORCEINLINE TYPE NAME(TYPE v) { \ - TYPE ret; \ - for (int i = 0; i < 16; ++i) \ - ret[i] = OP(v[i]); \ - return ret; \ -} - /* knc::macro::used */ #define BINARY_OP(TYPE, NAME, OP) \ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ @@ -722,7 +703,7 @@ static FORCEINLINE __vec16_i32 __shuffle2_i32(__vec16_i32 v0, __vec16_i32 v1, __ template static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) { -#ifdef ISPC_FORCE_ALIGNED_MEMORY +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED return __load<64>(p); #else __vec16_i32 v; @@ -734,7 +715,7 @@ template static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 v) { -#ifdef ISPC_FORCE_ALIGNED_MEMORY +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED __store<64>(p,v); #else _mm512_extpackstorelo_epi32( p, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); @@ -742,6 +723,17 @@ template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 #endif } +#if 0 /* knc::fails ./tests/foreach-25.ispc ./tests/forach-26.ispc ./tests/foreach-27.ispc */ +template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) +{ + return _mm512_load_epi32(p); +} +template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) +{ + _mm512_store_epi32(p, v); +} +#endif + /////////////////////////////////////////////////////////////////////////// // int64 /////////////////////////////////////////////////////////////////////////// @@ -783,8 +775,8 @@ static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b) #if 0 /* knc::fails ./tests/int64-min-1.ispc ./tests/idiv.ispc */ static FORCEINLINE __vec16_i64 __mul(__vec16_i64 _a, __vec16_i64 _b) { - const __vec16_i64 a = _a.cvt2hilo(); - const __vec16_i64 b = _b.cvt2hilo(); + __vec16_i64 a = _a.cvt2hilo(); + __vec16_i64 b = _b.cvt2hilo(); __vec16_i32 lo = _mm512_mullo_epi32(a.v_lo,b.v_lo); __vec16_i32 hi_m1 = _mm512_mulhi_epi32(a.v_lo, b.v_lo); __vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo); @@ -858,7 +850,6 @@ SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>) SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>) SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<) -#if 0 /* knc::fails ./tests/reduce-equal-8.ispc , 
knc::hangs foreach-unique-6.ispc funcptr-null-[2-6].ispc funcptr-uniform-9.ispc funcptr-varying-5.ispc */ static FORCEINLINE __vec16_i1 __equal_i64(__vec16_i64 _a, __vec16_i64 _b) { const __vec16_i64 a = _a.cvt2hilo(); @@ -874,22 +865,14 @@ static FORCEINLINE __vec16_i1 __equal_i64_and_mask(__vec16_i64 _a, __vec16_i64 _ __mmask16 full_match = _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); return _mm512_kand(full_match, (__mmask16)mask); } - static FORCEINLINE __vec16_i1 __not_equal_i64(__vec16_i64 a, __vec16_i64 b) { - return __not(__equal_i64(a,b)); + return __not(__equal_i64(a,b)); } static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(__vec16_i64 a, __vec16_i64 b, __vec16_i1 mask) { - return __and(__not(__equal_i64(a,b)), mask); + return __and(__not(__equal_i64(a,b)), mask); } -#else -CMP_OP(__vec16_i64, i64, int64_t, __equal, ==) -CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) -#endif - - - CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_equal, <=) CMP_OP(__vec16_i64, i64, int64_t, __signed_less_equal, <=) CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_equal, >=) @@ -918,18 +901,49 @@ template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { return __vec template RetVecType __undef_i64(); template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { return __vec16_i64(_mm512_undefined_epi32(), _mm512_undefined_epi32()); } -static FORCEINLINE __vec16_i64 __broadcast_i64(__vec16_i64 v, int index) +#define CASTL2I(_v_, _v_hi_, _v_lo_) \ + __vec16_i32 _v_hi_, _v_lo_; \ + { \ + const __vec16_i64 v = _v_.cvt2hilo(); \ + _v_hi_ = v.v_hi; \ + _v_lo_ = v.v_lo; } +#define CASTI2L(_ret_hi_, _ret_lo_) \ + __vec16_i64(_ret_hi_, _ret_lo_).cvt2zmm() +static FORCEINLINE __vec16_i64 __broadcast_i64(__vec16_i64 _v, int index) { - int64_t val = __extract_element(v, index & 0xf); - return __smear_i64<__vec16_i64>(val); + CASTL2I(_v, v_hi, v_lo); + const __vec16_i32 ret_hi = __broadcast_i32(v_hi, index); + const __vec16_i32 ret_lo = __broadcast_i32(v_lo, index); + return CASTI2L(ret_hi, ret_lo); } - -ROTATE (__vec16_i64, i64, int64_t) -SHUFFLES(__vec16_i64, i64, int64_t) +static FORCEINLINE __vec16_i64 __rotate_i64(const __vec16_i64 _v, const int index) +{ + CASTL2I(_v, v_hi, v_lo); + const __vec16_i32 ret_hi = __rotate_i32(v_hi, index); + const __vec16_i32 ret_lo = __rotate_i32(v_lo, index); + return CASTI2L(ret_hi, ret_lo); +} +static FORCEINLINE __vec16_i64 __shuffle_double(__vec16_i64 _v, const __vec16_i32 index) +{ + CASTL2I(_v, v_hi, v_lo); + const __vec16_i32 ret_hi = __shuffle_i32(v_hi, index); + const __vec16_i32 ret_lo = __shuffle_i32(v_lo, index); + return CASTI2L(ret_hi, ret_lo); +} +static FORCEINLINE __vec16_i64 __shuffle2_double(__vec16_i64 _v0, __vec16_i64 _v1, const __vec16_i32 index) +{ + CASTL2I(_v0, v0_hi, v0_lo); + CASTL2I(_v1, v1_hi, v1_lo); + const __vec16_i32 ret_hi = __shuffle2_i32(v0_hi, v1_hi, index); + const __vec16_i32 ret_lo = __shuffle2_i32(v0_lo, v1_lo, index); + return CASTI2L(ret_hi, ret_lo); +} +#undef CASTI2L +#undef CASTL2I template static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) { -#ifdef ISPC_FORCE_ALIGNED_MEMORY +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED return __load<128>(p); #else __vec16_i32 v1; @@ -942,18 +956,10 @@ template static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) #endif } -template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) -{ - __m512i v2 = _mm512_load_epi32(p); - __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64); - return __vec16_i64(v2,v1); -} - -template 
<> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { return __load<64>(p); } template static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64 v) { -#ifdef ISPC_FORCE_ALIGNED_MEMORY +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED return __store<128>(p,v); #else __m512i v1 = v.v2; @@ -965,6 +971,14 @@ template static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64 #endif } +#if 0 /* knc::fails as with _i32 this may generate fails ... so commetining it out */ +template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) +{ + __m512i v2 = _mm512_load_epi32(p); + __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64); + return __vec16_i64(v2,v1); +} +template <> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { return __load<64>(p); } template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) { __m512i v1 = v.v2; @@ -972,8 +986,8 @@ template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) _mm512_store_epi64(p, v2); _mm512_store_epi64(((uint8_t*)p)+64, v1); } - template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); } +#endif /////////////////////////////////////////////////////////////////////////// @@ -1048,7 +1062,7 @@ static FORCEINLINE __vec16_f __shuffle2_float(__vec16_f _v0, __vec16_f _v1, __ve template static FORCEINLINE __vec16_f __load(const __vec16_f *p) { -#ifdef ISPC_FORCE_ALIGNED_MEMORY +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED return __load<64>(p); #else __vec16_f v; @@ -1060,7 +1074,7 @@ template static FORCEINLINE __vec16_f __load(const __vec16_f *p) template static FORCEINLINE void __store(__vec16_f *p, __vec16_f v) { -#ifdef ISPC_FORCE_ALIGNED_MEMORY +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED __store<64>(p,v); #else _mm512_extpackstorelo_ps( p, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); @@ -1278,15 +1292,6 @@ template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { return template RetVecType __undef_double(); template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() { return __vec16_d(); } -static FORCEINLINE __vec16_d __broadcast_double(__vec16_d v, int index) -{ - __vec16_d ret; - double val = __extract_element(v, index & 0xf); - ret.v1 = _mm512_set1_pd(val); - ret.v2 = _mm512_set1_pd(val); - return ret; -} - #define CASTD2F(_v_, _v_hi_, _v_lo_) \ __vec16_f _v_hi_, _v_lo_; \ { \ @@ -1295,21 +1300,20 @@ static FORCEINLINE __vec16_d __broadcast_double(__vec16_d v, int index) _v_lo_ = _mm512_castpd_ps(v.v_lo); } #define CASTF2D(_ret_hi_, _ret_lo_) \ __vec16_d(_mm512_castps_pd(_ret_hi_), _mm512_castps_pd(_ret_lo_)).cvt2zmm() - -#if 0 /* knc::testme there appears to be no tests in ./tests for checking this functionality */ +static FORCEINLINE __vec16_d __broadcast_double(__vec16_d _v, int index) +{ + CASTD2F(_v, v_hi, v_lo); + const __vec16_f ret_hi = __broadcast_float(v_hi, index); + const __vec16_f ret_lo = __broadcast_float(v_lo, index); + return CASTF2D(ret_hi, ret_lo); +} static FORCEINLINE __vec16_d __rotate_double(const __vec16_d _v, const int index) { -// return _v; /* this one passes all tests , but most not */ CASTD2F(_v, v_hi, v_lo); const __vec16_f ret_hi = __rotate_float(v_hi, index); const __vec16_f ret_lo = __rotate_float(v_lo, index); return CASTF2D(ret_hi, ret_lo); } -#else -ROTATE(__vec16_d, double, double) -#endif - -#if 0 /* knc::fails ./tests/shuffle2-4.ispc ./tests/shuffle2-5.ispc */ static FORCEINLINE __vec16_d __shuffle_double(__vec16_d _v, const __vec16_i32 
index) { CASTD2F(_v, v_hi, v_lo); @@ -1325,32 +1329,37 @@ static FORCEINLINE __vec16_d __shuffle2_double(__vec16_d _v0, __vec16_d _v1, con const __vec16_f ret_lo = __shuffle2_float(v0_lo, v1_lo, index); return CASTF2D(ret_hi, ret_lo); } -#else -SHUFFLES(__vec16_d, double, double) -#endif -#undef CASTD2F #undef CASTF2D +#undef CASTD2F template static FORCEINLINE __vec16_d __load(const __vec16_d *p) \ { +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED + return __load<128>(p); +#else __vec16_d ret; ret.v1 = _mm512_extloadunpacklo_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); ret.v1 = _mm512_extloadunpackhi_pd(ret.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); ret.v2 = _mm512_extloadunpacklo_pd(ret.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); ret.v2 = _mm512_extloadunpackhi_pd(ret.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); return ret; +#endif } template static FORCEINLINE void __store(__vec16_d *p, __vec16_d v) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED + return __store<128>(p,v); +#else _mm512_extpackstorelo_pd(p, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); _mm512_extpackstorehi_pd((uint8_t*)p+64, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); _mm512_extpackstorelo_pd((uint8_t*)p+64, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); _mm512_extpackstorehi_pd((uint8_t*)p+128, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); +#endif } -#if 1 +#if 0 /* knc::fails as with _f this may generate fails ... so commetining it out */ template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) { return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64)); @@ -1379,14 +1388,12 @@ static FORCEINLINE TO FUNC(TO, FROM val) { \ } // sign extension conversions -#if 0 /* knc::fails on soa-9 soa-13 soa-10 soa-29 soa-3 ... 
and others */ + +// CAST(__vec16_i64, int64_t, __vec16_i32, int32_t, __cast_sext) static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val) { return __vec16_i64(_mm512_srai_epi32(val.v,31), val.v).cvt2zmm(); } -#else -CAST(__vec16_i64, int64_t, __vec16_i32, int32_t, __cast_sext) -#endif CAST(__vec16_i64, int64_t, __vec16_i16, int16_t, __cast_sext) CAST(__vec16_i64, int64_t, __vec16_i8, int8_t, __cast_sext) CAST(__vec16_i32, int32_t, __vec16_i16, int16_t, __cast_sext) @@ -2107,7 +2114,7 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t _mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); return ret; } -#if 0 /* knc::fails on gather-int8-2 & gather-int8-4 */ +// GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2131,9 +2138,6 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_ _mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); return ret; } -#else -GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) -#endif /****************/ GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) @@ -2145,7 +2149,7 @@ static FORCEINLINE __vec16_i32 __gather_base_offsets32_i32(uint8_t *base, uint32 base, _MM_UPCONV_EPI32_NONE, scale, _MM_HINT_NONE); } -#if 0 /* knc::fails on gather-int32-2 & gather-int32-4 */ +// GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2170,9 +2174,6 @@ static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint3 return ret; } -#else -GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) -#endif /****************/ // GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float) static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) @@ -2181,7 +2182,7 @@ static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32 base, _MM_UPCONV_PS_NONE, scale, _MM_HINT_NONE); } -#if 0 /* knc::fails on gather-float-2 gather-float-4 & soa-14 */ +// GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float) static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2206,9 +2207,6 @@ static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint3 return ret; } -#else -GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float) -#endif /****************/ GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) @@ -2324,7 +2322,7 @@ static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scal _MM_DOWNCONV_PS_NONE, scale, _MM_HINT_NONE); } -#if 0 /* knc::fails on soa-10 & soa-13 , it is very similar to 
__scatter_base_offsets64_it32, but that passes tests, why ?!? */ +//SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float) static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_f value, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2346,9 +2344,6 @@ static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t still_to_do = _mm512_kxor(match,still_to_do); } } -#else -SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float) -#endif /*****************/ SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) From 4222605f873e589aa9dc905fb0c2e6dcb9353d01 Mon Sep 17 00:00:00 2001 From: evghenii Date: Mon, 7 Oct 2013 14:24:27 +0300 Subject: [PATCH 123/124] fixed lshr/ashr/shl shifts. __mul i64 vector version for icc < 14.0.0 works only on signed, so commented it out in favour of sequential --- examples/intrinsics/knc-i1x16.h | 115 ++++++++++++++++---------------- 1 file changed, 58 insertions(+), 57 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index fb2cf618..ffe8fb56 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -668,6 +668,7 @@ template RetVecType __smear_i32(int32_t i); template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); } static const __vec16_i32 __ispc_one = __smear_i32<__vec16_i32>(1); +static const __vec16_i32 __ispc_zero = __smear_i32<__vec16_i32>(0); static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32); static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1); static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); @@ -772,18 +773,18 @@ static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b) return __vec16_i64(_mm512_mullox_epi64(a.v1,b.v1), _mm512_mullox_epi64(a.v2,b.v2)); } #else /* __ICC >= 1400 */ -#if 0 /* knc::fails ./tests/int64-min-1.ispc ./tests/idiv.ispc */ +#if 0 /* knc::fails ./tests/int64-min-1.ispc ./tests/idiv.ispc cause: if one or both numbers are negative multiplication fails */ static FORCEINLINE __vec16_i64 __mul(__vec16_i64 _a, __vec16_i64 _b) { - __vec16_i64 a = _a.cvt2hilo(); - __vec16_i64 b = _b.cvt2hilo(); - __vec16_i32 lo = _mm512_mullo_epi32(a.v_lo,b.v_lo); - __vec16_i32 hi_m1 = _mm512_mulhi_epi32(a.v_lo, b.v_lo); - __vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo); - __vec16_i32 hi_m3 = _mm512_mullo_epi32(a.v_lo, b.v_hi); - __mmask16 carry = 0; - __vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m1, &carry); - __vec16_i32 hi = _mm512_adc_epi32(hi_m3, carry, hi_p23, &carry); + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + const __vec16_i32 lo = _mm512_mullo_epi32(a.v_lo, b.v_lo); + const __vec16_i32 hi_m1 = _mm512_mulhi_epi32(a.v_lo, b.v_lo); + const __vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo); + const __vec16_i32 hi_m3 = _mm512_mullo_epi32(a.v_lo, b.v_hi); + __mmask16 carry; + const __vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m3, &carry); + const __vec16_i32 hi = _mm512_adc_epi32(hi_p23, carry, hi_m1, &carry); return __vec16_i64(hi,lo).cvt2zmm(); } #else @@ -795,60 +796,68 @@ static FORCEINLINE __vec16_i64 __or (__vec16_i64 a, __vec16_i64 b) { return __ve static FORCEINLINE __vec16_i64 __and(__vec16_i64 
a, __vec16_i64 b) { return __vec16_i64(_mm512_and_epi64(a.v1, b.v1), _mm512_and_epi64(a.v2, b.v2)); } static FORCEINLINE __vec16_i64 __xor(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_xor_epi64(a.v1, b.v1), _mm512_xor_epi64(a.v2, b.v2)); } -static FORCEINLINE __vec16_i64 __shl(__vec16_i64 _a, __vec16_i64 _b) -{ - const __vec16_i64 a = _a.cvt2hilo(); - const __vec16_i64 b = _b.cvt2hilo(); - __vec16_i32 xfer = _mm512_srlv_epi32(a.v_lo, _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); - __vec16_i32 hi = _mm512_or_epi32(_mm512_sllv_epi32(a.v_hi, b.v_lo), xfer); - __vec16_i32 lo = _mm512_sllv_epi32(a.v_lo, b.v_lo); - return __vec16_i64(hi,lo).cvt2zmm(); -} - static FORCEINLINE __vec16_i64 __udiv(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_div_epu64(a.v1,b.v1), _mm512_div_epu64(a.v2,b.v2)); } static FORCEINLINE __vec16_i64 __sdiv(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_div_epi64(a.v1,b.v1), _mm512_div_epi64(a.v2,b.v2)); } static FORCEINLINE __vec16_i64 __urem(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_rem_epu64(a.v1,b.v1), _mm512_rem_epu64(a.v2,b.v2)); } static FORCEINLINE __vec16_i64 __srem(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_rem_epi64(a.v1,b.v1), _mm512_rem_epi64(a.v2,b.v2)); } -#if 0 /* knc::fails ./tests/idiv.ispc */ -static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 _a, __vec16_i64 _b) { + +static FORCEINLINE __vec16_i64 __shl(__vec16_i64 _a, __vec16_i64 _b) +{ const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); - __vec16_i32 shift = _mm512_sub_epi32(__ispc_thirty_two, b.v_lo); -#if 0 - __vec16_i32 xfer = _mm512_and_epi32(_mm512_sllv_epi32(__ispc_ffffffff, shift), _mm512_sllv_epi32(a.v_hi, shift)); -#else - __vec16_i32 xfer = _mm512_sllv_epi32(_mm512_and_epi32(a.v_hi, - _mm512_sub_epi32(_mm512_sllv_epi32(__ispc_one, b.v_lo), __ispc_one)), - _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); -#endif - __vec16_i32 hi = _mm512_srlv_epi32(a.v_hi, b.v_lo); - __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo)); + /* this is a safety gate in case b-shift >= 32 */ + const __vec16_i32 xfer = __select( + __signed_less_than_i32(b.v_lo, __ispc_thirty_two), + __lshr(a.v_lo, __sub(__ispc_thirty_two, b.v_lo)), + __shl (a.v_lo, __sub(b.v_lo, __ispc_thirty_two)) + ); + const __vec16_i32 hi = __or(__shl(a.v_hi, b.v_lo), xfer); + const __vec16_i32 lo = __shl(a.v_lo, b.v_lo); return __vec16_i64(hi,lo).cvt2zmm(); } -#else -BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>) -#endif - -#if 0 /* knc::fails ./tests/idiv.ispc */ -static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 _a, __vec16_i64 _b) { +static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 _a, __vec16_i64 _b) +{ const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); - __vec16_i32 xfer = _mm512_sllv_epi32(_mm512_and_epi32(a.v_hi, - _mm512_sub_epi32(_mm512_sllv_epi32(__ispc_one, b.v_lo), __ispc_one)), - _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); - __vec16_i32 hi = _mm512_srav_epi32(a.v_hi, b.v_lo); - __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo)); + /* this is a safety gate in case b-shift >= 32 */ + const __vec16_i32 xfer = __select( + __signed_less_than_i32(b.v_lo, __ispc_thirty_two), + __shl (a.v_hi, __sub(__ispc_thirty_two, b.v_lo)), + __lshr(a.v_hi, __sub(b.v_lo, __ispc_thirty_two)) + ); + const __vec16_i32 lo = __or(__lshr(a.v_lo, b.v_lo), xfer); + const __vec16_i32 hi = __lshr(a.v_hi, b.v_lo); + return __vec16_i64(hi,lo).cvt2zmm(); +} +static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 
_a, __vec16_i64 _b) +{ + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + /* this is a safety gate in case b-shift >= 32 */ + const __vec16_i32 xfer = __select( + __signed_less_than_i32(b.v_lo, __ispc_thirty_two), + __shl (a.v_hi, __sub(__ispc_thirty_two, b.v_lo)), + __ashr(a.v_hi, __sub(b.v_lo, __ispc_thirty_two)) + ); + const __vec16_i32 lo = __or(__lshr(a.v_lo, b.v_lo), xfer); + const __vec16_i32 hi = __ashr(a.v_hi, b.v_lo); return __vec16_i64(hi,lo).cvt2zmm(); } -#else -BINARY_OP_CAST(__vec16_i64, int64_t, __ashr, >>) -#endif -SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>) -SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>) -SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<) +template RetVecType __smear_i64(const int64_t &l); +template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) { return __vec16_i64(_mm512_set1_epi64(l), _mm512_set1_epi64(l)); } + +template RetVecType __setzero_i64(); +template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { return __vec16_i64(_mm512_setzero_epi32(), _mm512_setzero_epi32()); } + +template RetVecType __undef_i64(); +template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { return __vec16_i64(_mm512_undefined_epi32(), _mm512_undefined_epi32()); } + +static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 a, uint64_t shift) { return __lshr(a, __smear_i64<__vec16_i64>(shift)); } +static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 a, int64_t shift) { return __ashr(a, __smear_i64<__vec16_i64>(shift)); } +static FORCEINLINE __vec16_i64 __shl (__vec16_i64 a, int64_t shift) { return __shl (a, __smear_i64<__vec16_i64>(shift)); } static FORCEINLINE __vec16_i1 __equal_i64(__vec16_i64 _a, __vec16_i64 _b) { @@ -892,14 +901,6 @@ static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask, __vec16_i64 a, __vec16_ INSERT_EXTRACT(__vec16_i64, int64_t) -template RetVecType __smear_i64(const int64_t &l); -template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) { return __vec16_i64(_mm512_set1_epi64(l), _mm512_set1_epi64(l)); } - -template RetVecType __setzero_i64(); -template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { return __vec16_i64(_mm512_setzero_epi32(), _mm512_setzero_epi32()); } - -template RetVecType __undef_i64(); -template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { return __vec16_i64(_mm512_undefined_epi32(), _mm512_undefined_epi32()); } #define CASTL2I(_v_, _v_hi_, _v_lo_) \ __vec16_i32 _v_hi_, _v_lo_; \ From 3da152a150d5b99f856368317031f181835afb9e Mon Sep 17 00:00:00 2001 From: evghenii Date: Mon, 7 Oct 2013 18:30:22 +0300 Subject: [PATCH 124/124] fixed zmm __mul for i64 with icc < 14.0.0, 4 knc::fails lefts, but I doubt these are due to this include.. 
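
The low 64 bits of the product are assembled from 32-bit partial products of
the absolute values, and the result is negated when exactly one operand is
negative. A scalar C++ model of that approach, for reference only (the names
below are illustrative and not part of knc-i1x16.h):

    #include <cstdint>
    #include <cstdio>

    // Build a signed 64x64 -> 64-bit multiply out of 32-bit operations the
    // same way the vector code does: take abs of both operands, form the
    // unsigned partial products that reach the low 64 bits, restore the sign.
    static int64_t mul64_via_32(int64_t a, int64_t b) {
        bool negate = ((a ^ b) < 0);                            // signs differ => negative product
        uint64_t ua = (a < 0) ? 0 - uint64_t(a) : uint64_t(a);  // abs(a)
        uint64_t ub = (b < 0) ? 0 - uint64_t(b) : uint64_t(b);  // abs(b)

        uint32_t a_lo = uint32_t(ua), a_hi = uint32_t(ua >> 32);
        uint32_t b_lo = uint32_t(ub), b_hi = uint32_t(ub >> 32);

        uint32_t lo    = a_lo * b_lo;                             // mullo(a_lo, b_lo)
        uint32_t hi_m1 = uint32_t((uint64_t(a_lo) * b_lo) >> 32); // mulhi_epu32(a_lo, b_lo)
        uint32_t hi_m2 = a_hi * b_lo;                             // mullo(a_hi, b_lo)
        uint32_t hi_m3 = a_lo * b_hi;                             // mullo(a_lo, b_hi)
        uint32_t hi    = hi_m1 + hi_m2 + hi_m3;

        uint64_t abs_result = (uint64_t(hi) << 32) | lo;
        return int64_t(negate ? 0 - abs_result : abs_result);
    }

    int main() {
        printf("%lld\n", (long long)mul64_via_32(-123456789LL, 987654321LL));
        return 0;
    }

The vector version below does the same per lane, using _mm512_mulhi_epu32 for
the unsigned high half after __abs_i32i64, and selecting per lane (via
__select) between the result and its negation, a subtract from zero, when the
operand signs differ.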
--- examples/intrinsics/knc-i1x16.h | 50 ++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index ffe8fb56..78d35ddc 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -767,31 +767,56 @@ static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &_b _mm512_mulhi_epi32(a.v, b.v_lo))).cvt2zmm(); } +static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask, __vec16_i64 a, __vec16_i64 b) +{ + __vec16_i64 ret; + ret.v1 = _mm512_mask_mov_epi64(b.v1, mask, a.v1); + ret.v2 = _mm512_mask_mov_epi64(b.v2, mask >> 8, a.v2); + return ret; +} + #if __ICC >= 1400 /* compiler gate, icc >= 14.0.0 support _mm512_mullox_epi64 */ static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_mullox_epi64(a.v1,b.v1), _mm512_mullox_epi64(a.v2,b.v2)); } #else /* __ICC >= 1400 */ -#if 0 /* knc::fails ./tests/int64-min-1.ispc ./tests/idiv.ispc cause: if one or both numbers are negative multiplication fails */ +static FORCEINLINE void __abs_i32i64(__m512i &_hi, __m512i &_lo) +{ + /* abs(x) : + * mask = x >> 32; + * abs(x) = (x^mask) - mask + */ + const __vec16_i32 mask = __ashr(_hi, __ispc_thirty_two); + __vec16_i32 hi = __xor(_hi, mask); + __vec16_i32 lo = __xor(_lo, mask); + __mmask16 borrow = 0; + _lo = _mm512_subsetb_epi32(lo, mask, &borrow); + _hi = _mm512_sbb_epi32 (hi, borrow, mask, &borrow); +} static FORCEINLINE __vec16_i64 __mul(__vec16_i64 _a, __vec16_i64 _b) { - const __vec16_i64 a = _a.cvt2hilo(); - const __vec16_i64 b = _b.cvt2hilo(); - const __vec16_i32 lo = _mm512_mullo_epi32(a.v_lo, b.v_lo); - const __vec16_i32 hi_m1 = _mm512_mulhi_epi32(a.v_lo, b.v_lo); + __vec16_i64 a = _a.cvt2hilo(); + __vec16_i64 b = _b.cvt2hilo(); + /* sign = (a^b) >> 32, if sign == 0 then a*b >= 0, otherwise a*b < 0 */ + const __vec16_i1 sign = __not_equal_i32(__ashr(__xor(a.v_hi, b.v_hi), __ispc_thirty_two), __ispc_zero); + __abs_i32i64(a.v_hi, a.v_lo); /* abs(a) */ + __abs_i32i64(b.v_hi, b.v_lo); /* abs(b) */ + const __vec16_i32 lo_m1 = _mm512_mullo_epi32(a.v_lo, b.v_lo); + const __vec16_i32 hi_m1 = _mm512_mulhi_epu32(a.v_lo, b.v_lo); const __vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo); const __vec16_i32 hi_m3 = _mm512_mullo_epi32(a.v_lo, b.v_hi); __mmask16 carry; const __vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m3, &carry); const __vec16_i32 hi = _mm512_adc_epi32(hi_p23, carry, hi_m1, &carry); - return __vec16_i64(hi,lo).cvt2zmm(); + const __vec16_i32 lo = lo_m1; + const __vec16_i64 ret_abs = __vec16_i64(hi,lo).cvt2zmm(); + /* if sign != 0, means either a or b is negative, then negate the result */ + return __select(sign, __sub(__vec16_i64(__ispc_zero, __ispc_zero), ret_abs), ret_abs); } -#else -BINARY_OP(__vec16_i64, __mul, *) -#endif #endif /* __ICC >= 1400 */ + static FORCEINLINE __vec16_i64 __or (__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_or_epi64 (a.v1, b.v1), _mm512_or_epi64 (a.v2, b.v2)); } static FORCEINLINE __vec16_i64 __and(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_and_epi64(a.v1, b.v1), _mm512_and_epi64(a.v2, b.v2)); } static FORCEINLINE __vec16_i64 __xor(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_xor_epi64(a.v1, b.v1), _mm512_xor_epi64(a.v2, b.v2)); } @@ -891,13 +916,6 @@ CMP_OP(__vec16_i64, i64, int64_t, __signed_less_than, <) CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_than, >) CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >) -static 
FORCEINLINE __vec16_i64 __select(__vec16_i1 mask, __vec16_i64 a, __vec16_i64 b) -{ - __vec16_i64 ret; - ret.v_hi = _mm512_mask_mov_epi64(b.v_hi, mask, a.v_hi); - ret.v_lo = _mm512_mask_mov_epi64(b.v_lo, mask >> 8, a.v_lo); - return ret; -} INSERT_EXTRACT(__vec16_i64, int64_t)
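
The 64-bit shifts fixed earlier in this series (__lshr/__ashr/__shl for
__vec16_i64) split each lane into 32-bit hi/lo halves and gate the cross-half
transfer on whether the shift amount is below 32. Plain C++ shifts are
undefined for counts of 32 or more, so the scalar sketch below models the
zeroing behaviour that the vector code appears to rely on; the helper names
are hypothetical and not part of the header:

    #include <cstdint>
    #include <cstdio>

    // Model the per-lane variable shift as producing 0 once the count
    // reaches 32, matching what the hi/lo recombination below assumes.
    static uint32_t shl32(uint32_t x, uint32_t n)  { return n < 32 ? x << n : 0; }
    static uint32_t lshr32(uint32_t x, uint32_t n) { return n < 32 ? x >> n : 0; }

    // Logical right shift of a 64-bit value held as two 32-bit halves,
    // for shift amounts 0..63.
    static void lshr64(uint32_t &hi, uint32_t &lo, uint32_t n) {
        // "safety gate": for n >= 32 the bits entering the low half come
        // from hi >> (n - 32) rather than hi << (32 - n)
        uint32_t xfer = (n < 32) ? shl32(hi, 32 - n) : lshr32(hi, n - 32);
        lo = lshr32(lo, n) | xfer;
        hi = lshr32(hi, n);
    }

    int main() {
        uint32_t hi = 0x12345678u, lo = 0x9ABCDEF0u;
        lshr64(hi, lo, 36);                 // expect 00000000 01234567
        printf("%08x %08x\n", hi, lo);
        return 0;
    }

__shl follows the same pattern with the transfer moving from the low half into
the high half, and __ashr uses an arithmetic shift of the high half while the
low half is recombined as above.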