From 0277ba1aaa8a3c2b9441b149942bbc9c0ed3be5d Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Tue, 23 Jul 2013 16:49:07 -0700
Subject: [PATCH 01/34] Improve warnings for right shift by varying amounts.

Fixes:

- Don't issue a warning when the shift is by the same amount in all vector lanes.
- Do issue a warning when it's a compile-time constant but the values are different in different lanes.

Previously, we warned iff the shift amount wasn't a compile-time constant.
---
 expr.cpp | 39 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

diff --git a/expr.cpp b/expr.cpp
index fc3d295a..894942d2 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -1911,6 +1911,40 @@ lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1,
 }
 
 
+/* Returns true if shifting right by the given amount will lead to
+   inefficient code. (Assumes x86 target. May also warn inaccurately if
+   later optimizations simplify the shift amount more than we are able to
+   see at this point.) */
+static bool
+lIsDifficultShiftAmount(Expr *expr) {
+    // Uniform shifts (of uniform values) are no problem.
+    if (expr->GetType()->IsVaryingType() == false)
+        return false;
+
+    ConstExpr *ce = dynamic_cast<ConstExpr *>(expr);
+    if (ce) {
+        // If the shift is by a constant amount, *and* it's the same amount
+        // in all vector lanes, we're in good shape.
+        uint32_t amount[ISPC_MAX_NVEC];
+        int count = ce->GetValues(amount);
+        for (int i = 1; i < count; ++i)
+            if (amount[i] != amount[0])
+                return true;
+        return false;
+    }
+
+    TypeCastExpr *tce = dynamic_cast<TypeCastExpr *>(expr);
+    if (tce && tce->expr) {
+        // Finally, if the shift amount is given by a uniform value that's
+        // been smeared out into a varying, we have the same shift for all
+        // lanes and are also in good shape.
+        return (tce->expr->GetType()->IsUniformType() == false);
+    }
+
+    return true;
+}
+
+
 llvm::Value *
 BinaryExpr::GetValue(FunctionEmitContext *ctx) const {
     if (!arg0 || !arg1) {
@@ -1951,9 +1985,8 @@ BinaryExpr::GetValue(FunctionEmitContext *ctx) const {
     case BitAnd:
     case BitXor:
     case BitOr: {
-        if (op == Shr && arg1->GetType()->IsVaryingType() &&
-            dynamic_cast<ConstExpr *>(arg1) == NULL)
-            PerformanceWarning(pos, "Shift right is extremely inefficient for "
+        if (op == Shr && lIsDifficultShiftAmount(arg1))
+            PerformanceWarning(pos, "Shift right is inefficient for "
                                "varying shift amounts.");
         return lEmitBinaryBitOp(op, value0, value1,
                                 arg0->GetType()->IsUnsignedType(), ctx);

From 83e1630fbcfde4aa67b50245cd96e36cbe033660 Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Tue, 23 Jul 2013 16:49:56 -0700
Subject: [PATCH 02/34] Add support for fast division of varying int values by small constants.

For varying int8/16/32 types, divides by small constants can be
implemented efficiently through multiplies and shifts with integer types
of twice the bit-width; this commit adds this optimization.
(Implementation is based on Halide.)
---
 expr.cpp | 69 +++++
 stdlib.ispc | 675 ++++++++++++++++++++++++++++++++++++++++++++++++
 tests/idiv.ispc | 75 ++++++
 3 files changed, 819 insertions(+)
 create mode 100644 tests/idiv.ispc

diff --git a/expr.cpp b/expr.cpp
index 894942d2..3baaabaf 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -2240,6 +2240,49 @@ lConstFoldBinaryIntOp(ConstExpr *constArg0, ConstExpr *constArg1,
 }
 
 
+/* Returns true if the given arguments (which are assumed to be the
+   operands of a divide) represent a divide that can be performed by one of
+   the __fast_idiv functions.
+ */
+static bool
+lCanImproveVectorDivide(Expr *arg0, Expr *arg1, int *divisor) {
+    const Type *type = arg0->GetType();
+    if (!type)
+        return false;
+
+    // The value being divided must be an int8/16/32.
+    if (!(Type::EqualIgnoringConst(type, AtomicType::VaryingInt8) ||
+          Type::EqualIgnoringConst(type, AtomicType::VaryingUInt8) ||
+          Type::EqualIgnoringConst(type, AtomicType::VaryingInt16) ||
+          Type::EqualIgnoringConst(type, AtomicType::VaryingUInt16) ||
+          Type::EqualIgnoringConst(type, AtomicType::VaryingInt32) ||
+          Type::EqualIgnoringConst(type, AtomicType::VaryingUInt32)))
+        return false;
+
+    // The divisor must be the same compile-time constant value for all of
+    // the vector lanes.
+    ConstExpr *ce = dynamic_cast<ConstExpr *>(arg1);
+    if (!ce)
+        return false;
+    int64_t div[ISPC_MAX_NVEC];
+    int count = ce->GetValues(div);
+    for (int i = 1; i < count; ++i)
+        if (div[i] != div[0])
+            return false;
+    *divisor = div[0];
+
+    // And finally, the divisor must be >= 2 and <128 (for 8-bit divides),
+    // and <256 otherwise.
+    if (*divisor < 2)
+        return false;
+    if (Type::EqualIgnoringConst(type, AtomicType::VaryingInt8) ||
+        Type::EqualIgnoringConst(type, AtomicType::VaryingUInt8))
+        return *divisor < 128;
+    else
+        return *divisor < 256;
+}
+
+
 Expr *
 BinaryExpr::Optimize() {
     if (arg0 == NULL || arg1 == NULL)
@@ -2302,6 +2345,32 @@ BinaryExpr::Optimize() {
     }
     }
 
+    int divisor;
+    if (op == Div && lCanImproveVectorDivide(arg0, arg1, &divisor)) {
+        Debug(pos, "Improving vector divide by constant %d", divisor);
+
+        std::vector<Symbol *> idivFuns;
+        m->symbolTable->LookupFunction("__fast_idiv", &idivFuns);
+        if (idivFuns.size() == 0) {
+            Warning(pos, "Couldn't find __fast_idiv to optimize integer divide. "
+                    "Are you compiling with --nostdlib?");
+            return this;
+        }
+
+        Expr *idivSymExpr = new FunctionSymbolExpr("__fast_idiv", idivFuns, pos);
+        ExprList *args = new ExprList(arg0, pos);
+        args->exprs.push_back(new ConstExpr(AtomicType::UniformInt32, divisor, arg1->pos));
+        Expr *idivCall = new FunctionCallExpr(idivSymExpr, args, pos);
+
+        idivCall = ::TypeCheck(idivCall);
+        if (idivCall == NULL)
+            return NULL;
+
+        Assert(Type::EqualIgnoringConst(GetType(), idivCall->GetType()));
+        idivCall = new TypeCastExpr(GetType(), idivCall, pos);
+        return ::Optimize(idivCall);
+    }
+
     // From here on out, we're just doing constant folding, so if both args
     // aren't constants then we're done...
     if (constArg0 == NULL || constArg1 == NULL)

diff --git a/stdlib.ispc b/stdlib.ispc
index 4e06f5da..b8ed2057 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -4264,3 +4264,678 @@ static inline bool rdrand(int64 * ptr) {
         return success;
     }
 }
+
+///////////////////////////////////////////////////////////////////////////
+// Fast vector integer division
+
+/* These tables and the algorithms in the __fast_idiv() functions below are
+   from Halide; the idea is based on the paper "Division by Invariant
+   Integers using Multiplication" by Granlund and Montgomery.
+ + Copyright (c) 2012 MIT CSAIL + + Developed by: + + The Halide team + MIT CSAIL + http://halide-lang.org + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +static const uniform int64 __idiv_table_u8[][3] = { + {0, 0LL, 1}, {1, 171LL, 1}, {0, 0LL, 2}, + {1, 205LL, 2}, {1, 171LL, 2}, {2, 37LL, 2}, + {0, 0LL, 3}, {1, 57LL, 1}, {1, 205LL, 3}, + {2, 117LL, 3}, {1, 171LL, 3}, {1, 79LL, 2}, + {2, 37LL, 3}, {1, 137LL, 3}, {0, 0LL, 4}, + {1, 241LL, 4}, {1, 57LL, 2}, {1, 27LL, 1}, + {1, 205LL, 4}, {2, 135LL, 4}, {2, 117LL, 4}, + {2, 101LL, 4}, {1, 171LL, 4}, {1, 41LL, 2}, + {1, 79LL, 3}, {1, 19LL, 1}, {2, 37LL, 4}, + {2, 27LL, 4}, {1, 137LL, 4}, {2, 9LL, 4}, + {0, 0LL, 5}, {1, 249LL, 5}, {1, 241LL, 5}, + {1, 235LL, 5}, {1, 57LL, 3}, {1, 111LL, 4}, + {1, 27LL, 2}, {2, 165LL, 5}, {1, 205LL, 5}, + {1, 25LL, 2}, {2, 135LL, 5}, {1, 191LL, 5}, + {1, 187LL, 5}, {2, 109LL, 5}, {2, 101LL, 5}, + {1, 175LL, 5}, {1, 171LL, 5}, {2, 79LL, 5}, + {1, 41LL, 3}, {1, 161LL, 5}, {1, 79LL, 4}, + {1, 155LL, 5}, {1, 19LL, 2}, {1, 149LL, 5}, + {2, 37LL, 5}, {1, 9LL, 1}, {2, 27LL, 5}, + {1, 139LL, 5}, {1, 137LL, 5}, {2, 13LL, 5}, + {2, 9LL, 5}, {2, 5LL, 5}, {0, 0LL, 6}, + {1, 253LL, 6}, {1, 249LL, 6}, {1, 245LL, 6}, + {1, 121LL, 5}, {1, 119LL, 5}, {1, 235LL, 6}, + {1, 231LL, 6}, {1, 57LL, 4}, {1, 225LL, 6}, + {1, 111LL, 5}, {1, 219LL, 6}, {1, 27LL, 3}, + {1, 213LL, 6}, {2, 165LL, 6}, {1, 13LL, 2}, + {1, 205LL, 6}, {1, 203LL, 6}, {1, 25LL, 3}, + {1, 99LL, 5}, {2, 135LL, 6}, {1, 193LL, 6}, + {1, 191LL, 6}, {1, 189LL, 6}, {1, 187LL, 6}, + {1, 185LL, 6}, {1, 183LL, 6}, {1, 181LL, 6}, + {1, 179LL, 6}, {1, 177LL, 6}, {1, 175LL, 6}, + {1, 173LL, 6}, {1, 171LL, 6}, {1, 169LL, 6}, + {1, 21LL, 3}, {1, 83LL, 5}, {1, 41LL, 4}, + {1, 163LL, 6}, {1, 161LL, 6}, {2, 63LL, 6}, + {1, 79LL, 5}, {2, 57LL, 6}, {1, 155LL, 6}, + {2, 51LL, 6}, {1, 19LL, 3}, {1, 151LL, 6}, + {1, 149LL, 6}, {1, 37LL, 4}, {2, 37LL, 6}, + {1, 145LL, 6}, {1, 9LL, 2}, {1, 143LL, 6}, + {2, 27LL, 6}, {2, 25LL, 6}, {1, 139LL, 6}, + {1, 69LL, 5}, {1, 137LL, 6}, {2, 15LL, 6}, + {2, 13LL, 6}, {2, 11LL, 6}, {2, 9LL, 6}, + {2, 7LL, 6}, {2, 5LL, 6}, {2, 3LL, 6}, + {0, 0LL, 7}, {1, 255LL, 7}, {1, 127LL, 6}, + {1, 63LL, 5}, {1, 125LL, 6}, {1, 31LL, 4}, + {1, 123LL, 6}, {1, 61LL, 5}, {1, 121LL, 6}, + {1, 15LL, 3}, {1, 119LL, 6}, {1, 59LL, 5}, + {1, 235LL, 7}, {1, 117LL, 6}, {1, 29LL, 4}, + {1, 115LL, 6}, {1, 57LL, 5}, {1, 113LL, 6}, + {1, 225LL, 7}, {1, 7LL, 2}, {1, 111LL, 6}, + {1, 55LL, 5}, {1, 219LL, 7}, {1, 109LL, 6}, + {1, 27LL, 4}, {1, 215LL, 7}, {1, 107LL, 6}, + {1, 53LL, 5}, 
{1, 211LL, 7}, {1, 105LL, 6}, + {1, 13LL, 3}, {1, 207LL, 7}, {1, 103LL, 6}, + {1, 51LL, 5}, {1, 203LL, 7}, {1, 101LL, 6}, + {1, 25LL, 4}, {1, 199LL, 7}, {1, 99LL, 6}, + {1, 197LL, 7}, {1, 49LL, 5}, {1, 97LL, 6}, + {1, 193LL, 7}, {1, 3LL, 1}, {1, 191LL, 7}, + {1, 95LL, 6}, {1, 189LL, 7}, {1, 47LL, 5}, + {1, 187LL, 7}, {1, 93LL, 6}, {1, 185LL, 7}, + {1, 23LL, 4}, {1, 183LL, 7}, {1, 91LL, 6}, + {1, 181LL, 7}, {1, 45LL, 5}, {1, 179LL, 7}, + {1, 89LL, 6}, {1, 177LL, 7}, {1, 11LL, 3}, + {1, 175LL, 7}, {1, 87LL, 6}, {1, 173LL, 7}, + {1, 43LL, 5}, {1, 171LL, 7}, {1, 85LL, 6}, + {1, 169LL, 7}, {2, 81LL, 7}, {1, 21LL, 4}, + {1, 167LL, 7}, {1, 83LL, 6}, {1, 165LL, 7}, + {1, 41LL, 5}, {2, 71LL, 7}, {1, 163LL, 7}, + {1, 81LL, 6}, {1, 161LL, 7}, {1, 5LL, 2}, + {2, 63LL, 7}, {1, 159LL, 7}, {1, 79LL, 6}, + {1, 157LL, 7}, {2, 57LL, 7}, {1, 39LL, 5}, + {1, 155LL, 7}, {1, 77LL, 6}, {2, 51LL, 7}, + {1, 153LL, 7}, {1, 19LL, 4}, {2, 47LL, 7}, + {1, 151LL, 7}, {1, 75LL, 6}, {1, 149LL, 7}, + {2, 41LL, 7}, {1, 37LL, 5}, {1, 147LL, 7}, + {2, 37LL, 7}, {1, 73LL, 6}, {1, 145LL, 7}, + {2, 33LL, 7}, {1, 9LL, 3}, {2, 31LL, 7}, + {1, 143LL, 7}, {1, 71LL, 6}, {2, 27LL, 7}, + {1, 141LL, 7}, {2, 25LL, 7}, {1, 35LL, 5}, + {1, 139LL, 7}, {2, 21LL, 7}, {1, 69LL, 6}, + {2, 19LL, 7}, {1, 137LL, 7}, {1, 17LL, 4}, + {2, 15LL, 7}, {1, 135LL, 7}, {2, 13LL, 7}, + {1, 67LL, 6}, {2, 11LL, 7}, {1, 133LL, 7}, + {2, 9LL, 7}, {1, 33LL, 5}, {2, 7LL, 7}, + {1, 131LL, 7}, {2, 5LL, 7}, {1, 65LL, 6}, + {2, 3LL, 7}, {1, 129LL, 7}, {0, 0LL, 8}, +}; +static const uniform int64 __idiv_table_s8[][3] = { + {0, 0LL, 1}, {1, 86LL, 0}, {0, 0LL, 2}, + {1, 103LL, 1}, {1, 43LL, 0}, {1, 147LL, 2}, + {0, 0LL, 3}, {1, 57LL, 1}, {1, 103LL, 2}, + {1, 187LL, 3}, {1, 43LL, 1}, {1, 79LL, 2}, + {1, 147LL, 3}, {1, 137LL, 3}, {0, 0LL, 4}, + {1, 121LL, 3}, {1, 57LL, 2}, {1, 27LL, 1}, + {1, 103LL, 3}, {1, 49LL, 2}, {1, 187LL, 4}, + {1, 179LL, 4}, {1, 43LL, 2}, {1, 41LL, 2}, + {1, 79LL, 3}, {1, 19LL, 1}, {1, 147LL, 4}, + {1, 71LL, 3}, {1, 137LL, 4}, {1, 133LL, 4}, + {0, 0LL, 5}, {1, 125LL, 4}, {1, 121LL, 4}, + {1, 59LL, 3}, {1, 57LL, 3}, {1, 111LL, 4}, + {1, 27LL, 2}, {1, 211LL, 5}, {1, 103LL, 4}, + {1, 25LL, 2}, {1, 49LL, 3}, {1, 6LL, 0}, + {1, 47LL, 3}, {1, 23LL, 2}, {1, 45LL, 3}, + {1, 11LL, 1}, {1, 43LL, 3}, {1, 21LL, 2}, + {1, 41LL, 3}, {1, 81LL, 4}, {1, 79LL, 4}, + {1, 39LL, 3}, {1, 19LL, 2}, {1, 75LL, 4}, + {1, 147LL, 5}, {1, 9LL, 1}, {1, 71LL, 4}, + {1, 35LL, 3}, {1, 137LL, 5}, {1, 135LL, 5}, + {1, 133LL, 5}, {1, 131LL, 5}, {0, 0LL, 6}, + {1, 127LL, 5}, {1, 63LL, 4}, {1, 31LL, 3}, + {1, 61LL, 4}, {1, 15LL, 2}, {1, 59LL, 4}, + {1, 29LL, 3}, {1, 57LL, 4}, {1, 113LL, 5}, + {1, 7LL, 1}, {1, 55LL, 4}, {1, 27LL, 3}, + {1, 107LL, 5}, {1, 53LL, 4}, {1, 13LL, 2}, + {1, 103LL, 5}, {1, 51LL, 4}, {1, 25LL, 3}, + {1, 99LL, 5}, {1, 49LL, 4}, {1, 97LL, 5}, + {1, 3LL, 0}, {1, 95LL, 5}, {1, 47LL, 4}, + {1, 93LL, 5}, {1, 23LL, 3}, {1, 91LL, 5}, + {1, 45LL, 4}, {1, 89LL, 5}, {1, 11LL, 2}, + {1, 87LL, 5}, {1, 43LL, 4}, {1, 85LL, 5}, + {1, 21LL, 3}, {1, 83LL, 5}, {1, 41LL, 4}, + {1, 163LL, 6}, {1, 81LL, 5}, {1, 5LL, 1}, + {1, 79LL, 5}, {1, 157LL, 6}, {1, 39LL, 4}, + {1, 77LL, 5}, {1, 19LL, 3}, {1, 151LL, 6}, + {1, 75LL, 5}, {1, 37LL, 4}, {1, 147LL, 6}, + {1, 73LL, 5}, {1, 9LL, 2}, {1, 143LL, 6}, + {1, 71LL, 5}, {1, 141LL, 6}, {1, 35LL, 4}, + {1, 69LL, 5}, {1, 137LL, 6}, {1, 17LL, 3}, + {1, 135LL, 6}, {1, 67LL, 5}, {1, 133LL, 6}, + {1, 33LL, 4}, {1, 131LL, 6}, {1, 65LL, 5}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, 
{0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, +}; +static const uniform int64 __idiv_table_u16[][3] = { + {0, 0LL, 1}, {1, 43691LL, 1}, {0, 0LL, 2}, + {1, 52429LL, 2}, {1, 43691LL, 2}, {2, 9363LL, 2}, + {0, 0LL, 3}, {1, 58255LL, 3}, {1, 52429LL, 3}, + {1, 47663LL, 3}, {1, 43691LL, 3}, {1, 20165LL, 2}, + {2, 9363LL, 3}, {1, 34953LL, 3}, {0, 0LL, 4}, + {1, 61681LL, 4}, {1, 58255LL, 4}, {1, 55189LL, 4}, + {1, 52429LL, 4}, {2, 34329LL, 4}, {1, 47663LL, 4}, + {2, 25645LL, 4}, {1, 43691LL, 4}, {2, 18351LL, 4}, + {1, 20165LL, 3}, {2, 12137LL, 4}, {2, 9363LL, 4}, + {1, 18079LL, 3}, {1, 34953LL, 4}, {2, 2115LL, 4}, + {0, 0LL, 5}, {1, 63551LL, 5}, {1, 61681LL, 5}, + {1, 59919LL, 5}, {1, 58255LL, 5}, {1, 7085LL, 2}, + {1, 55189LL, 5}, {2, 42011LL, 5}, {1, 52429LL, 5}, + {2, 36765LL, 5}, {2, 34329LL, 5}, {1, 48771LL, 5}, + {1, 47663LL, 5}, {1, 11651LL, 3}, {2, 25645LL, 5}, + {2, 23705LL, 5}, {1, 43691LL, 5}, {2, 20063LL, 5}, + {2, 18351LL, 5}, {1, 41121LL, 5}, {1, 20165LL, 4}, + {1, 39569LL, 5}, {2, 12137LL, 5}, {2, 10725LL, 5}, + {2, 9363LL, 5}, {2, 8049LL, 5}, {1, 18079LL, 4}, + {1, 35545LL, 5}, {1, 34953LL, 5}, {1, 8595LL, 3}, + {2, 2115LL, 5}, {2, 1041LL, 5}, {0, 0LL, 6}, + {1, 4033LL, 2}, {1, 63551LL, 6}, {1, 31301LL, 5}, + {1, 61681LL, 6}, {2, 56039LL, 6}, {1, 59919LL, 6}, + {1, 59075LL, 6}, {1, 58255LL, 6}, {1, 57457LL, 6}, + {1, 7085LL, 3}, {2, 46313LL, 6}, {1, 55189LL, 6}, + {1, 6809LL, 3}, {2, 42011LL, 6}, {1, 53093LL, 6}, + {1, 52429LL, 6}, {1, 25891LL, 5}, {2, 36765LL, 6}, + {1, 25267LL, 5}, {2, 34329LL, 6}, {1, 49345LL, 6}, + {1, 48771LL, 6}, {1, 48211LL, 6}, {1, 47663LL, 6}, + {2, 28719LL, 6}, {1, 11651LL, 4}, {2, 26647LL, 6}, + {2, 25645LL, 6}, {2, 24665LL, 6}, {2, 23705LL, 6}, + {1, 44151LL, 6}, {1, 43691LL, 6}, {2, 20945LL, 6}, + {2, 20063LL, 6}, {1, 42367LL, 6}, {2, 18351LL, 6}, + {1, 5191LL, 3}, {1, 41121LL, 6}, {1, 20361LL, 5}, + {1, 20165LL, 5}, {1, 19973LL, 5}, {1, 39569LL, 6}, + {2, 12863LL, 6}, {2, 
12137LL, 6}, {1, 2405LL, 2}, + {2, 10725LL, 6}, {1, 37787LL, 6}, {2, 9363LL, 6}, + {1, 18559LL, 5}, {2, 8049LL, 6}, {2, 7409LL, 6}, + {1, 18079LL, 5}, {1, 35849LL, 6}, {1, 35545LL, 6}, + {2, 4957LL, 6}, {1, 34953LL, 6}, {1, 4333LL, 3}, + {1, 8595LL, 4}, {2, 2665LL, 6}, {2, 2115LL, 6}, + {2, 1573LL, 6}, {2, 1041LL, 6}, {2, 517LL, 6}, + {0, 0LL, 7}, {1, 16257LL, 5}, {1, 4033LL, 3}, + {1, 16009LL, 5}, {1, 63551LL, 7}, {1, 63073LL, 7}, + {1, 31301LL, 6}, {1, 31069LL, 6}, {1, 61681LL, 7}, + {1, 61231LL, 7}, {2, 56039LL, 7}, {1, 30175LL, 6}, + {1, 59919LL, 7}, {1, 29747LL, 6}, {1, 59075LL, 7}, + {1, 29331LL, 6}, {1, 58255LL, 7}, {1, 57853LL, 7}, + {1, 57457LL, 7}, {1, 28533LL, 6}, {1, 7085LL, 4}, + {1, 14075LL, 5}, {2, 46313LL, 7}, {1, 27777LL, 6}, + {1, 55189LL, 7}, {1, 13707LL, 5}, {1, 6809LL, 4}, + {2, 42705LL, 7}, {2, 42011LL, 7}, {1, 53431LL, 7}, + {1, 53093LL, 7}, {1, 52759LL, 7}, {1, 52429LL, 7}, + {2, 38671LL, 7}, {1, 25891LL, 6}, {1, 6433LL, 4}, + {2, 36765LL, 7}, {2, 36145LL, 7}, {1, 25267LL, 6}, + {2, 34927LL, 7}, {2, 34329LL, 7}, {1, 49637LL, 7}, + {1, 49345LL, 7}, {2, 32577LL, 7}, {1, 48771LL, 7}, + {2, 31443LL, 7}, {1, 48211LL, 7}, {1, 47935LL, 7}, + {1, 47663LL, 7}, {2, 29251LL, 7}, {2, 28719LL, 7}, + {1, 2929LL, 3}, {1, 11651LL, 5}, {1, 23173LL, 6}, + {2, 26647LL, 7}, {1, 2865LL, 3}, {2, 25645LL, 7}, + {1, 1417LL, 2}, {2, 24665LL, 7}, {1, 44859LL, 7}, + {2, 23705LL, 7}, {2, 23233LL, 7}, {1, 44151LL, 7}, + {1, 2745LL, 3}, {1, 43691LL, 7}, {2, 21393LL, 7}, + {2, 20945LL, 7}, {1, 43019LL, 7}, {2, 20063LL, 7}, + {1, 21291LL, 6}, {1, 42367LL, 7}, {1, 21077LL, 6}, + {2, 18351LL, 7}, {1, 41735LL, 7}, {1, 5191LL, 4}, + {2, 17111LL, 7}, {1, 41121LL, 7}, {2, 16305LL, 7}, + {1, 20361LL, 6}, {1, 40525LL, 7}, {1, 20165LL, 6}, + {1, 40137LL, 7}, {1, 19973LL, 6}, {1, 39757LL, 7}, + {1, 39569LL, 7}, {2, 13231LL, 7}, {2, 12863LL, 7}, + {1, 39017LL, 7}, {2, 12137LL, 7}, {2, 11779LL, 7}, + {1, 2405LL, 3}, {2, 11073LL, 7}, {2, 10725LL, 7}, + {1, 18979LL, 6}, {1, 37787LL, 7}, {2, 9699LL, 7}, + {2, 9363LL, 7}, {1, 37283LL, 7}, {1, 18559LL, 6}, + {2, 8373LL, 7}, {2, 8049LL, 7}, {1, 4579LL, 4}, + {2, 7409LL, 7}, {2, 7093LL, 7}, {1, 18079LL, 6}, + {1, 36003LL, 7}, {1, 35849LL, 7}, {2, 5857LL, 7}, + {1, 35545LL, 7}, {1, 35395LL, 7}, {2, 4957LL, 7}, + {1, 35099LL, 7}, {1, 34953LL, 7}, {1, 4351LL, 4}, + {1, 4333LL, 4}, {2, 3507LL, 7}, {1, 8595LL, 5}, + {2, 2943LL, 7}, {2, 2665LL, 7}, {1, 16981LL, 6}, + {2, 2115LL, 7}, {2, 1843LL, 7}, {2, 1573LL, 7}, + {1, 33421LL, 7}, {2, 1041LL, 7}, {1, 33157LL, 7}, + {2, 517LL, 7}, {1, 32897LL, 7}, {0, 0LL, 8}, +}; +static const uniform int64 __idiv_table_s16[][3] = { + {0, 0LL, 1}, {1, 21846LL, 0}, {0, 0LL, 2}, + {1, 26215LL, 1}, {1, 10923LL, 0}, {1, 18725LL, 1}, + {0, 0LL, 3}, {1, 7282LL, 0}, {1, 26215LL, 2}, + {1, 5958LL, 0}, {1, 10923LL, 1}, {1, 20165LL, 2}, + {1, 18725LL, 2}, {1, 34953LL, 3}, {0, 0LL, 4}, + {1, 30841LL, 3}, {1, 3641LL, 0}, {1, 55189LL, 4}, + {1, 26215LL, 3}, {1, 49933LL, 4}, {1, 2979LL, 0}, + {1, 45591LL, 4}, {1, 10923LL, 2}, {1, 5243LL, 1}, + {1, 20165LL, 3}, {1, 38837LL, 4}, {1, 18725LL, 3}, + {1, 18079LL, 3}, {1, 34953LL, 4}, {1, 16913LL, 3}, + {0, 0LL, 5}, {1, 1986LL, 0}, {1, 30841LL, 4}, + {1, 3745LL, 1}, {1, 3641LL, 1}, {1, 7085LL, 2}, + {1, 55189LL, 5}, {1, 26887LL, 4}, {1, 26215LL, 4}, + {1, 51151LL, 5}, {1, 49933LL, 5}, {1, 12193LL, 3}, + {1, 2979LL, 1}, {1, 11651LL, 3}, {1, 45591LL, 5}, + {1, 44621LL, 5}, {1, 10923LL, 3}, {1, 2675LL, 1}, + {1, 5243LL, 2}, {1, 41121LL, 5}, {1, 20165LL, 4}, + {1, 19785LL, 4}, {1, 38837LL, 5}, {1, 38131LL, 5}, 
+ {1, 18725LL, 4}, {1, 36793LL, 5}, {1, 18079LL, 4}, + {1, 17773LL, 4}, {1, 34953LL, 5}, {1, 8595LL, 3}, + {1, 16913LL, 4}, {1, 33289LL, 5}, {0, 0LL, 6}, + {1, 4033LL, 2}, {1, 993LL, 0}, {1, 31301LL, 5}, + {1, 30841LL, 5}, {1, 15197LL, 4}, {1, 3745LL, 2}, + {1, 14769LL, 4}, {1, 3641LL, 2}, {1, 57457LL, 6}, + {1, 7085LL, 3}, {1, 55925LL, 6}, {1, 55189LL, 6}, + {1, 6809LL, 3}, {1, 26887LL, 5}, {1, 26547LL, 5}, + {1, 26215LL, 5}, {1, 25891LL, 5}, {1, 51151LL, 6}, + {1, 25267LL, 5}, {1, 49933LL, 6}, {1, 24673LL, 5}, + {1, 12193LL, 4}, {1, 48211LL, 6}, {1, 2979LL, 2}, + {1, 5891LL, 3}, {1, 11651LL, 4}, {1, 11523LL, 4}, + {1, 45591LL, 6}, {1, 45101LL, 6}, {1, 44621LL, 6}, + {1, 44151LL, 6}, {1, 10923LL, 4}, {1, 43241LL, 6}, + {1, 2675LL, 2}, {1, 662LL, 0}, {1, 5243LL, 3}, + {1, 5191LL, 3}, {1, 41121LL, 6}, {1, 20361LL, 5}, + {1, 20165LL, 5}, {1, 19973LL, 5}, {1, 19785LL, 5}, + {1, 1225LL, 1}, {1, 38837LL, 6}, {1, 2405LL, 2}, + {1, 38131LL, 6}, {1, 37787LL, 6}, {1, 18725LL, 5}, + {1, 18559LL, 5}, {1, 36793LL, 6}, {1, 36473LL, 6}, + {1, 18079LL, 5}, {1, 35849LL, 6}, {1, 17773LL, 5}, + {1, 35247LL, 6}, {1, 34953LL, 6}, {1, 4333LL, 3}, + {1, 8595LL, 4}, {1, 34101LL, 6}, {1, 16913LL, 5}, + {1, 33555LL, 6}, {1, 33289LL, 6}, {1, 33027LL, 6}, + {0, 0LL, 7}, {1, 16257LL, 5}, {1, 4033LL, 3}, + {1, 16009LL, 5}, {1, 993LL, 1}, {1, 31537LL, 6}, + {1, 31301LL, 6}, {1, 31069LL, 6}, {1, 30841LL, 6}, + {1, 3827LL, 3}, {1, 15197LL, 5}, {1, 30175LL, 6}, + {1, 3745LL, 3}, {1, 29747LL, 6}, {1, 14769LL, 5}, + {1, 29331LL, 6}, {1, 3641LL, 3}, {1, 28927LL, 6}, + {1, 57457LL, 7}, {1, 28533LL, 6}, {1, 7085LL, 4}, + {1, 14075LL, 5}, {1, 55925LL, 7}, {1, 27777LL, 6}, + {1, 55189LL, 7}, {1, 13707LL, 5}, {1, 6809LL, 4}, + {1, 54121LL, 7}, {1, 26887LL, 6}, {1, 6679LL, 4}, + {1, 26547LL, 6}, {1, 6595LL, 4}, {1, 26215LL, 6}, + {1, 6513LL, 4}, {1, 25891LL, 6}, {1, 6433LL, 4}, + {1, 51151LL, 7}, {1, 50841LL, 7}, {1, 25267LL, 6}, + {1, 6279LL, 4}, {1, 49933LL, 7}, {1, 24819LL, 6}, + {1, 24673LL, 6}, {1, 49057LL, 7}, {1, 12193LL, 5}, + {1, 24245LL, 6}, {1, 48211LL, 7}, {1, 749LL, 1}, + {1, 2979LL, 3}, {1, 23697LL, 6}, {1, 5891LL, 4}, + {1, 2929LL, 3}, {1, 11651LL, 5}, {1, 23173LL, 6}, + {1, 11523LL, 5}, {1, 2865LL, 3}, {1, 45591LL, 7}, + {1, 1417LL, 2}, {1, 45101LL, 7}, {1, 11215LL, 5}, + {1, 44621LL, 7}, {1, 44385LL, 7}, {1, 44151LL, 7}, + {1, 2745LL, 3}, {1, 10923LL, 5}, {1, 43465LL, 7}, + {1, 43241LL, 7}, {1, 43019LL, 7}, {1, 2675LL, 3}, + {1, 21291LL, 6}, {1, 331LL, 0}, {1, 21077LL, 6}, + {1, 5243LL, 4}, {1, 41735LL, 7}, {1, 5191LL, 4}, + {1, 10331LL, 5}, {1, 41121LL, 7}, {1, 40921LL, 7}, + {1, 20361LL, 6}, {1, 40525LL, 7}, {1, 20165LL, 6}, + {1, 20069LL, 6}, {1, 19973LL, 6}, {1, 39757LL, 7}, + {1, 19785LL, 6}, {1, 4923LL, 4}, {1, 1225LL, 2}, + {1, 39017LL, 7}, {1, 38837LL, 7}, {1, 19329LL, 6}, + {1, 2405LL, 3}, {1, 38305LL, 7}, {1, 38131LL, 7}, + {1, 18979LL, 6}, {1, 37787LL, 7}, {1, 18809LL, 6}, + {1, 18725LL, 6}, {1, 37283LL, 7}, {1, 18559LL, 6}, + {1, 36955LL, 7}, {1, 36793LL, 7}, {1, 4579LL, 4}, + {1, 36473LL, 7}, {1, 36315LL, 7}, {1, 18079LL, 6}, + {1, 36003LL, 7}, {1, 35849LL, 7}, {1, 35697LL, 7}, + {1, 17773LL, 6}, {1, 8849LL, 5}, {1, 35247LL, 7}, + {1, 35099LL, 7}, {1, 34953LL, 7}, {1, 4351LL, 4}, + {1, 4333LL, 4}, {1, 17261LL, 6}, {1, 8595LL, 5}, + {1, 535LL, 1}, {1, 34101LL, 7}, {1, 16981LL, 6}, + {1, 16913LL, 6}, {1, 16845LL, 6}, {1, 33555LL, 7}, + {1, 33421LL, 7}, {1, 33289LL, 7}, {1, 33157LL, 7}, + {1, 33027LL, 7}, {1, 32897LL, 7}, {1, 32769LL, 7}, +}; +static const uniform int64 __idiv_table_u32[][3] = { + {0, 
0LL, 1}, {1, 2863311531LL, 1}, {0, 0LL, 2}, + {1, 3435973837LL, 2}, {1, 2863311531LL, 2}, {2, 613566757LL, 2}, + {0, 0LL, 3}, {1, 954437177LL, 1}, {1, 3435973837LL, 3}, + {1, 3123612579LL, 3}, {1, 2863311531LL, 3}, {1, 1321528399LL, 2}, + {2, 613566757LL, 3}, {1, 2290649225LL, 3}, {0, 0LL, 4}, + {1, 4042322161LL, 4}, {1, 954437177LL, 2}, {2, 2938661835LL, 4}, + {1, 3435973837LL, 4}, {2, 2249744775LL, 4}, {1, 3123612579LL, 4}, + {1, 2987803337LL, 4}, {1, 2863311531LL, 4}, {1, 1374389535LL, 3}, + {1, 1321528399LL, 3}, {2, 795364315LL, 4}, {2, 613566757LL, 4}, + {1, 2369637129LL, 4}, {1, 2290649225LL, 4}, {2, 138547333LL, 4}, + {0, 0LL, 5}, {1, 1041204193LL, 3}, {1, 4042322161LL, 5}, + {2, 3558687189LL, 5}, {1, 954437177LL, 3}, {2, 3134165325LL, 5}, + {2, 2938661835LL, 5}, {2, 2753184165LL, 5}, {1, 3435973837LL, 5}, + {1, 3352169597LL, 5}, {2, 2249744775LL, 5}, {1, 799063683LL, 3}, + {1, 3123612579LL, 5}, {2, 1813430637LL, 5}, {1, 2987803337LL, 5}, + {1, 2924233053LL, 5}, {1, 2863311531LL, 5}, {1, 1402438301LL, 4}, + {1, 1374389535LL, 4}, {1, 2694881441LL, 5}, {1, 1321528399LL, 4}, + {2, 891408307LL, 5}, {2, 795364315LL, 5}, {2, 702812831LL, 5}, + {2, 613566757LL, 5}, {2, 527452125LL, 5}, {1, 2369637129LL, 5}, + {1, 582368447LL, 3}, {1, 2290649225LL, 5}, {1, 1126548799LL, 4}, + {2, 138547333LL, 5}, {2, 68174085LL, 5}, {0, 0LL, 6}, + {1, 4228890877LL, 6}, {1, 1041204193LL, 4}, {1, 128207979LL, 1}, + {1, 4042322161LL, 6}, {1, 1991868891LL, 5}, {2, 3558687189LL, 6}, + {1, 3871519817LL, 6}, {1, 954437177LL, 4}, {2, 3235934265LL, 6}, + {2, 3134165325LL, 6}, {1, 458129845LL, 3}, {2, 2938661835LL, 6}, + {1, 892460737LL, 4}, {2, 2753184165LL, 6}, {1, 3479467177LL, 6}, + {1, 3435973837LL, 6}, {1, 3393554407LL, 6}, {1, 3352169597LL, 6}, + {1, 827945503LL, 4}, {2, 2249744775LL, 6}, {1, 3233857729LL, 6}, + {1, 799063683LL, 4}, {1, 789879043LL, 4}, {1, 3123612579LL, 6}, + {1, 3088515809LL, 6}, {2, 1813430637LL, 6}, {2, 1746305385LL, 6}, + {1, 2987803337LL, 6}, {1, 2955676419LL, 6}, {1, 2924233053LL, 6}, + {2, 1491936009LL, 6}, {1, 2863311531LL, 6}, {2, 1372618415LL, 6}, + {1, 1402438301LL, 5}, {1, 2776544515LL, 6}, {1, 1374389535LL, 5}, + {2, 1148159575LL, 6}, {1, 2694881441LL, 6}, {2, 1042467791LL, 6}, + {1, 1321528399LL, 5}, {2, 940802361LL, 6}, {2, 891408307LL, 6}, + {2, 842937507LL, 6}, {2, 795364315LL, 6}, {2, 748664025LL, 6}, + {2, 702812831LL, 6}, {2, 657787785LL, 6}, {2, 613566757LL, 6}, + {2, 570128403LL, 6}, {2, 527452125LL, 6}, {2, 485518043LL, 6}, + {1, 2369637129LL, 6}, {2, 403800345LL, 6}, {1, 582368447LL, 4}, + {1, 1154949189LL, 5}, {1, 2290649225LL, 6}, {2, 248469183LL, 6}, + {1, 1126548799LL, 5}, {2, 174592167LL, 6}, {2, 138547333LL, 6}, + {1, 274877907LL, 3}, {2, 68174085LL, 6}, {2, 33818641LL, 6}, + {0, 0LL, 7}, {1, 266354561LL, 3}, {1, 4228890877LL, 7}, + {1, 4196609267LL, 7}, {1, 1041204193LL, 5}, {1, 4133502361LL, 7}, + {1, 128207979LL, 2}, {1, 4072265289LL, 7}, {1, 4042322161LL, 7}, + {1, 125400505LL, 2}, {1, 1991868891LL, 6}, {1, 1977538899LL, 6}, + {2, 3558687189LL, 7}, {1, 974744351LL, 5}, {1, 3871519817LL, 7}, + {1, 3844446251LL, 7}, {1, 954437177LL, 5}, {1, 3791419407LL, 7}, + {2, 3235934265LL, 7}, {1, 3739835469LL, 7}, {2, 3134165325LL, 7}, + {1, 3689636335LL, 7}, {1, 458129845LL, 4}, {1, 910191745LL, 5}, + {2, 2938661835LL, 7}, {1, 3593175255LL, 7}, {1, 892460737LL, 5}, + {1, 3546811703LL, 7}, {2, 2753184165LL, 7}, {1, 875407347LL, 5}, + {1, 3479467177LL, 7}, {2, 2620200175LL, 7}, {1, 3435973837LL, 7}, + {1, 3414632385LL, 7}, {1, 3393554407LL, 7}, {1, 3372735055LL, 7}, + {1, 
3352169597LL, 7}, {1, 1665926709LL, 6}, {1, 827945503LL, 5}, + {1, 1645975491LL, 6}, {2, 2249744775LL, 7}, {1, 1626496491LL, 6}, + {1, 3233857729LL, 7}, {2, 2134925265LL, 7}, {1, 799063683LL, 5}, + {2, 2060591247LL, 7}, {1, 789879043LL, 5}, {1, 1570730897LL, 6}, + {1, 3123612579LL, 7}, {2, 1916962805LL, 7}, {1, 3088515809LL, 7}, + {2, 1847555765LL, 7}, {2, 1813430637LL, 7}, {1, 3037324939LL, 7}, + {2, 1746305385LL, 7}, {1, 3004130131LL, 7}, {1, 2987803337LL, 7}, + {2, 1648338801LL, 7}, {1, 2955676419LL, 7}, {1, 2939870663LL, 7}, + {1, 2924233053LL, 7}, {2, 1522554545LL, 7}, {2, 1491936009LL, 7}, + {1, 2878302691LL, 7}, {1, 2863311531LL, 7}, {1, 356059465LL, 4}, + {2, 1372618415LL, 7}, {2, 1343553873LL, 7}, {1, 1402438301LL, 6}, + {2, 1286310003LL, 7}, {1, 2776544515LL, 7}, {1, 1381296015LL, 6}, + {1, 1374389535LL, 6}, {1, 42735993LL, 1}, {2, 1148159575LL, 7}, + {1, 2708156719LL, 7}, {1, 2694881441LL, 7}, {1, 1340867839LL, 6}, + {2, 1042467791LL, 7}, {1, 663956297LL, 5}, {1, 1321528399LL, 6}, + {1, 2630410593LL, 7}, {2, 940802361LL, 7}, {1, 2605477791LL, 7}, + {2, 891408307LL, 7}, {1, 2581013211LL, 7}, {2, 842937507LL, 7}, + {1, 1278501893LL, 6}, {2, 795364315LL, 7}, {2, 771906565LL, 7}, + {2, 748664025LL, 7}, {2, 725633745LL, 7}, {2, 702812831LL, 7}, + {2, 680198441LL, 7}, {2, 657787785LL, 7}, {2, 635578121LL, 7}, + {2, 613566757LL, 7}, {1, 2443359173LL, 7}, {2, 570128403LL, 7}, + {2, 548696263LL, 7}, {2, 527452125LL, 7}, {1, 1200340205LL, 6}, + {2, 485518043LL, 7}, {2, 464823301LL, 7}, {1, 2369637129LL, 7}, + {2, 423966729LL, 7}, {2, 403800345LL, 7}, {2, 383805589LL, 7}, + {1, 582368447LL, 5}, {2, 344322273LL, 7}, {1, 1154949189LL, 6}, + {1, 2300233531LL, 7}, {1, 2290649225LL, 7}, {1, 285143057LL, 4}, + {2, 248469183LL, 7}, {1, 2262369605LL, 7}, {1, 1126548799LL, 6}, + {2, 192835267LL, 7}, {2, 174592167LL, 7}, {2, 156496785LL, 7}, + {2, 138547333LL, 7}, {2, 120742053LL, 7}, {1, 274877907LL, 4}, + {1, 2190262207LL, 7}, {2, 68174085LL, 7}, {1, 2172947881LL, 7}, + {2, 33818641LL, 7}, {1, 2155905153LL, 7}, {0, 0LL, 8}, +}; +static const uniform int64 __idiv_table_s32[][3] = { + {0, 0LL, 1}, {1, 1431655766LL, 0}, {0, 0LL, 2}, + {1, 1717986919LL, 1}, {1, 715827883LL, 0}, {1, 2454267027LL, 2}, + {0, 0LL, 3}, {1, 954437177LL, 1}, {1, 1717986919LL, 2}, + {1, 780903145LL, 1}, {1, 715827883LL, 1}, {1, 1321528399LL, 2}, + {1, 2454267027LL, 3}, {1, 2290649225LL, 3}, {0, 0LL, 4}, + {1, 2021161081LL, 3}, {1, 954437177LL, 2}, {1, 1808407283LL, 3}, + {1, 1717986919LL, 3}, {1, 818089009LL, 2}, {1, 780903145LL, 2}, + {1, 2987803337LL, 4}, {1, 715827883LL, 2}, {1, 1374389535LL, 3}, + {1, 1321528399LL, 3}, {1, 1272582903LL, 3}, {1, 2454267027LL, 4}, + {1, 2369637129LL, 4}, {1, 2290649225LL, 4}, {1, 2216757315LL, 4}, + {0, 0LL, 5}, {1, 1041204193LL, 3}, {1, 2021161081LL, 4}, + {1, 3926827243LL, 5}, {1, 954437177LL, 3}, {1, 3714566311LL, 5}, + {1, 1808407283LL, 4}, {1, 3524075731LL, 5}, {1, 1717986919LL, 4}, + {1, 1676084799LL, 4}, {1, 818089009LL, 3}, {1, 799063683LL, 3}, + {1, 780903145LL, 3}, {1, 3054198967LL, 5}, {1, 2987803337LL, 5}, + {1, 2924233053LL, 5}, {1, 715827883LL, 3}, {1, 1402438301LL, 4}, + {1, 1374389535LL, 4}, {1, 2694881441LL, 5}, {1, 1321528399LL, 4}, + {1, 1296593901LL, 4}, {1, 1272582903LL, 4}, {1, 156180629LL, 1}, + {1, 2454267027LL, 5}, {1, 2411209711LL, 5}, {1, 2369637129LL, 5}, + {1, 582368447LL, 3}, {1, 2290649225LL, 5}, {1, 1126548799LL, 4}, + {1, 2216757315LL, 5}, {1, 2181570691LL, 5}, {0, 0LL, 6}, + {1, 2114445439LL, 5}, {1, 1041204193LL, 4}, {1, 128207979LL, 1}, + {1, 
2021161081LL, 5}, {1, 1991868891LL, 5}, {1, 3926827243LL, 6}, + {1, 3871519817LL, 6}, {1, 954437177LL, 4}, {1, 3765450781LL, 6}, + {1, 3714566311LL, 6}, {1, 458129845LL, 3}, {1, 1808407283LL, 5}, + {1, 892460737LL, 4}, {1, 3524075731LL, 6}, {1, 1739733589LL, 5}, + {1, 1717986919LL, 5}, {1, 424194301LL, 3}, {1, 1676084799LL, 5}, + {1, 827945503LL, 4}, {1, 818089009LL, 4}, {1, 1616928865LL, 5}, + {1, 799063683LL, 4}, {1, 789879043LL, 4}, {1, 780903145LL, 4}, + {1, 3088515809LL, 6}, {1, 3054198967LL, 6}, {1, 3020636341LL, 6}, + {1, 2987803337LL, 6}, {1, 738919105LL, 4}, {1, 2924233053LL, 6}, + {1, 2893451653LL, 6}, {1, 715827883LL, 4}, {1, 354224107LL, 3}, + {1, 1402438301LL, 5}, {1, 2776544515LL, 6}, {1, 1374389535LL, 5}, + {1, 680390859LL, 4}, {1, 2694881441LL, 6}, {1, 333589693LL, 3}, + {1, 1321528399LL, 5}, {1, 2617884829LL, 6}, {1, 1296593901LL, 5}, + {1, 1284476201LL, 5}, {1, 1272582903LL, 5}, {1, 2521815661LL, 6}, + {1, 156180629LL, 2}, {1, 2476377541LL, 6}, {1, 2454267027LL, 6}, + {1, 1216273925LL, 5}, {1, 2411209711LL, 6}, {1, 1195121335LL, 5}, + {1, 2369637129LL, 6}, {1, 2349383821LL, 6}, {1, 582368447LL, 4}, + {1, 1154949189LL, 5}, {1, 2290649225LL, 6}, {1, 70991195LL, 1}, + {1, 1126548799LL, 5}, {1, 558694933LL, 4}, {1, 2216757315LL, 6}, + {1, 274877907LL, 3}, {1, 2181570691LL, 6}, {1, 2164392969LL, 6}, + {0, 0LL, 7}, {1, 266354561LL, 3}, {1, 2114445439LL, 6}, + {1, 1049152317LL, 5}, {1, 1041204193LL, 5}, {1, 4133502361LL, 7}, + {1, 128207979LL, 2}, {1, 4072265289LL, 7}, {1, 2021161081LL, 6}, + {1, 125400505LL, 2}, {1, 1991868891LL, 6}, {1, 1977538899LL, 6}, + {1, 3926827243LL, 7}, {1, 974744351LL, 5}, {1, 3871519817LL, 7}, + {1, 961111563LL, 5}, {1, 954437177LL, 5}, {1, 3791419407LL, 7}, + {1, 3765450781LL, 7}, {1, 1869917735LL, 6}, {1, 3714566311LL, 7}, + {1, 230602271LL, 3}, {1, 458129845LL, 4}, {1, 910191745LL, 5}, + {1, 1808407283LL, 6}, {1, 3593175255LL, 7}, {1, 892460737LL, 5}, + {1, 443351463LL, 4}, {1, 3524075731LL, 7}, {1, 875407347LL, 5}, + {1, 1739733589LL, 6}, {1, 432197967LL, 4}, {1, 1717986919LL, 6}, + {1, 3414632385LL, 7}, {1, 424194301LL, 4}, {1, 210795941LL, 3}, + {1, 1676084799LL, 6}, {1, 1665926709LL, 6}, {1, 827945503LL, 5}, + {1, 1645975491LL, 6}, {1, 818089009LL, 5}, {1, 1626496491LL, 6}, + {1, 1616928865LL, 6}, {1, 3214946281LL, 7}, {1, 799063683LL, 5}, + {1, 397222409LL, 4}, {1, 789879043LL, 5}, {1, 1570730897LL, 6}, + {1, 780903145LL, 5}, {1, 3105965051LL, 7}, {1, 3088515809LL, 7}, + {1, 3071261531LL, 7}, {1, 3054198967LL, 7}, {1, 759331235LL, 5}, + {1, 3020636341LL, 7}, {1, 3004130131LL, 7}, {1, 2987803337LL, 7}, + {1, 2971653049LL, 7}, {1, 738919105LL, 5}, {1, 2939870663LL, 7}, + {1, 2924233053LL, 7}, {1, 2908760921LL, 7}, {1, 2893451653LL, 7}, + {1, 2878302691LL, 7}, {1, 715827883LL, 5}, {1, 356059465LL, 4}, + {1, 354224107LL, 4}, {1, 2819260585LL, 7}, {1, 1402438301LL, 6}, + {1, 1395319325LL, 6}, {1, 2776544515LL, 7}, {1, 1381296015LL, 6}, + {1, 1374389535LL, 6}, {1, 42735993LL, 1}, {1, 680390859LL, 5}, + {1, 2708156719LL, 7}, {1, 2694881441LL, 7}, {1, 1340867839LL, 6}, + {1, 333589693LL, 4}, {1, 663956297LL, 5}, {1, 1321528399LL, 6}, + {1, 2630410593LL, 7}, {1, 2617884829LL, 7}, {1, 81421181LL, 2}, + {1, 1296593901LL, 6}, {1, 2581013211LL, 7}, {1, 1284476201LL, 6}, + {1, 1278501893LL, 6}, {1, 1272582903LL, 6}, {1, 2533436931LL, 7}, + {1, 2521815661LL, 7}, {1, 2510300521LL, 7}, {1, 156180629LL, 3}, + {1, 2487582869LL, 7}, {1, 2476377541LL, 7}, {1, 2465272709LL, 7}, + {1, 2454267027LL, 7}, {1, 2443359173LL, 7}, {1, 1216273925LL, 6}, + {1, 605457945LL, 
5}, {1, 2411209711LL, 7}, {1, 1200340205LL, 6}, + {1, 1195121335LL, 6}, {1, 2379895299LL, 7}, {1, 2369637129LL, 7}, + {1, 2359467013LL, 7}, {1, 2349383821LL, 7}, {1, 2339386443LL, 7}, + {1, 582368447LL, 5}, {1, 2319644785LL, 7}, {1, 1154949189LL, 6}, + {1, 2300233531LL, 7}, {1, 2290649225LL, 7}, {1, 285143057LL, 4}, + {1, 70991195LL, 2}, {1, 2262369605LL, 7}, {1, 1126548799LL, 6}, + {1, 1121950641LL, 6}, {1, 558694933LL, 5}, {1, 2225732041LL, 7}, + {1, 2216757315LL, 7}, {1, 2207854675LL, 7}, {1, 274877907LL, 4}, + {1, 2190262207LL, 7}, {1, 2181570691LL, 7}, {1, 2172947881LL, 7}, + {1, 2164392969LL, 7}, {1, 2155905153LL, 7}, {1, 2147483649LL, 7}, +}; + +__declspec(safe) +static unmasked unsigned int8 __fast_idiv(unsigned int8 numerator, + uniform unsigned int8 divisor) { + uniform int64 method = __idiv_table_u8[divisor-2][0]; + uniform int64 multiplier = __idiv_table_u8[divisor-2][1]; + uniform int64 shift = __idiv_table_u8[divisor-2][2]; + + unsigned int16 mult = multiplier; + unsigned int16 val = numerator; + if (method == 0) + return numerator >> shift; + else if (method == 1) + return (val * mult) >> (8 + shift); + else { + val *= mult; + val >>= 8; + val += (numerator-val)>>1; + return (val >> shift); + } +} + +__declspec(safe) +static unmasked int8 __fast_idiv(int8 numerator, uniform int8 divisor) { + uniform int8 method = __idiv_table_s8[divisor-2][0]; + uniform int16 multiplier = __idiv_table_s8[divisor-2][1]; + uniform int8 shift = __idiv_table_s8[divisor-2][2]; + + if (method == 0) + return numerator >> shift; + else { + unsigned int8 sign = numerator >> 7; + numerator ^= sign; + int16 mul = (int16)numerator * (int16)multiplier; + mul >>= 8 + shift; + return (int8)mul ^ sign; + } +} + +__declspec(safe) +static unmasked unsigned int16 __fast_idiv(unsigned int16 numerator, + uniform unsigned int16 divisor) { + uniform int64 method = __idiv_table_u16[divisor-2][0]; + uniform int64 multiplier = __idiv_table_u16[divisor-2][1]; + uniform int64 shift = __idiv_table_u16[divisor-2][2]; + + unsigned int32 mult = multiplier; + unsigned int32 val = numerator; + if (method == 0) + return numerator >> shift; + else if (method == 1) + return (val * mult) >> (16 + shift); + else { + val *= mult; + val >>= 16; + val += (numerator-val)>>1; + return val >> shift; + } +} + +__declspec(safe) +static unmasked int16 __fast_idiv(int16 numerator, uniform int16 divisor) { + uniform int64 method = __idiv_table_s16[divisor-2][0]; + uniform int64 multiplier = __idiv_table_s16[divisor-2][1]; + uniform int64 shift = __idiv_table_s16[divisor-2][2]; + + if (method == 0) + return numerator >> shift; + else { + unsigned int16 sign = numerator >> 15; + numerator ^= sign; + int32 mul = (int32)numerator * (int32)multiplier; + mul >>= 16 + shift; + int16 result = mul; + return result ^ sign; + } +} + +__declspec(safe) +static unmasked inline unsigned int32 __fast_idiv(unsigned int32 numerator, + uniform unsigned int32 divisor) { + uniform int64 method = __idiv_table_u32[divisor-2][0]; + uniform int64 multiplier = __idiv_table_u32[divisor-2][1]; + uniform int64 shift = __idiv_table_u32[divisor-2][2]; + + unsigned int64 mult = multiplier; + unsigned int64 val = numerator; + if (method == 0) + return numerator >> shift; + else if (method == 1) + return (val * mult) >> (32 + shift); + else { + val *= mult; + val >>= 32; + val += (numerator-val)>>1; + return val >> shift; + } +} + +__declspec(safe) +static unmasked int32 __fast_idiv(int32 numerator, uniform int32 divisor) { + uniform int64 method = 
__idiv_table_s32[divisor-2][0]; + uniform int64 multiplier = __idiv_table_s32[divisor-2][1]; + uniform int64 shift = __idiv_table_s32[divisor-2][2]; + + if (method == 0) + return numerator >> shift; + else { + unsigned int32 sign = numerator >> 31; + numerator ^= sign; + int64 mul = (int64)numerator * (int64)multiplier; + mul >>= 32 + shift; + int32 result = mul; + return result ^ sign; + } +} + diff --git a/tests/idiv.ispc b/tests/idiv.ispc new file mode 100644 index 00000000..b7bd78dc --- /dev/null +++ b/tests/idiv.ispc @@ -0,0 +1,75 @@ + +export uniform int width() { return programCount; } + + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int errorCount = 0; + + for (unsigned int8 num = 0; num < 255; ++num) { + for (uniform unsigned int8 div = 2; div < 255; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + } + } + } + + for (int8 num = 0; num < 127; ++num) { + for (uniform int8 div = 2; div < 127; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + } + } + } + + for (int16 num = 0; num < 32767; ++num) { + for (uniform int16 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + } + } + } + + for (unsigned int16 num = 0; num < 0xffff; ++num) { + for (uniform unsigned int16 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + } + } + } + + // randomly sample int32s... + uniform RNGState state; + seed_rng(&state, 1234); + for (uniform int i = 0; i < 1M; ++i) { + unsigned int32 num = random(&state); + for (uniform unsigned int32 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("ui32 error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + } + } + } + + for (uniform int64 i = 0; i < 1M; ++i) { + int32 num = random(&state); + if (num < 0) + continue; + for (uniform int32 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("si32 error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + } + } + } + + RET[programIndex] = errorCount; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} + From e7abf3f2eacd50b0b8cb194fc87e878bdc25ddec Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 23 Jul 2013 16:38:10 -0700 Subject: [PATCH 03/34] Add support for mask vectors of 8 and 16-bit element types. There were a number of places throughout the system that assumed that the execution mask would only have either 32-bit or 1-bit elements. This commit makes it possible to have a target with an 8- or 16-bit mask. 
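As a concrete illustration of the new representation (an illustrative sketch only, for
a hypothetical 8-wide target with an 8-bit mask, mirroring the convertmask helpers this
patch adds to builtins/util.m4; the value names are made up for the example): a varying
comparison still produces an i1 vector, which is then sign-extended to the mask's
element width rather than always to i32.

    %lt   = icmp slt <8 x i32> %a, %b        ; comparison result is <8 x i1>
    %mask = sext <8 x i1> %lt to <8 x i8>    ; execution mask with 8-bit elements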
--- Makefile | 29 ++++++--- builtins.cpp | 35 +++++++---- builtins/util.m4 | 161 ++++++++++++++++++++++++++++++++--------------- ctx.cpp | 26 +++----- expr.cpp | 36 +++++------ llvmutil.cpp | 73 +++++++++++++++++---- parse.yy | 20 +++++- stdlib.ispc | 37 ++++++----- 8 files changed, 284 insertions(+), 133 deletions(-) diff --git a/Makefile b/Makefile index 835f8e15..043ab4cf 100644 --- a/Makefile +++ b/Makefile @@ -137,7 +137,7 @@ BISON_SRC=parse.yy FLEX_SRC=lex.ll OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \ - stdlib_generic_ispc.o stdlib_x86_ispc.o \ + stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o \ $(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o)) default: ispc @@ -243,12 +243,23 @@ objs/builtins-c-64.cpp: builtins/builtins.c @echo Creating C++ source from builtins definition file $< @$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c 64 > $@ -objs/stdlib_generic_ispc.cpp: stdlib.ispc - @echo Creating C++ source from $< for generic - @$(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \ - python stdlib2cpp.py generic > $@ +objs/stdlib_mask1_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask1 + @$(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + python stdlib2cpp.py mask1 > $@ + +objs/stdlib_mask8_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask8 + @$(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + python stdlib2cpp.py mask8 > $@ + +objs/stdlib_mask16_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask16 + @$(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + python stdlib2cpp.py mask16 > $@ + +objs/stdlib_mask32_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask32 + @$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + python stdlib2cpp.py mask32 > $@ -objs/stdlib_x86_ispc.cpp: stdlib.ispc - @echo Creating C++ source from $< for x86 - @$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \ - python stdlib2cpp.py x86 > $@ diff --git a/builtins.cpp b/builtins.cpp index 3e03de10..d3bbaa6a 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -112,10 +112,7 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) { return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64; // varying - if (LLVMTypes::MaskType != LLVMTypes::Int32VectorType && - t == LLVMTypes::MaskType) - return AtomicType::VaryingBool; - else if (t == LLVMTypes::Int8VectorType) + if (t == LLVMTypes::Int8VectorType) return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8; else if (t == LLVMTypes::Int16VectorType) return intAsUnsigned ? AtomicType::VaryingUInt16 : AtomicType::VaryingInt16; @@ -127,6 +124,8 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) { return AtomicType::VaryingDouble; else if (t == LLVMTypes::Int64VectorType) return intAsUnsigned ? AtomicType::VaryingUInt64 : AtomicType::VaryingInt64; + else if (t == LLVMTypes::MaskType) + return AtomicType::VaryingBool; // pointers to uniform else if (t == LLVMTypes::Int8PointerType) @@ -1038,16 +1037,30 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // If the user wants the standard library to be included, parse the // serialized version of the stdlib.ispc file to get its // definitions added. 
+ extern char stdlib_mask1_code[], stdlib_mask8_code[]; + extern char stdlib_mask16_code[], stdlib_mask32_code[]; if (g->target->getISA() == Target::GENERIC && - g->target->getVectorWidth() != 1) { // 1 wide uses x86 stdlib - extern char stdlib_generic_code[]; - yy_scan_string(stdlib_generic_code); - yyparse(); + g->target->getVectorWidth() == 1) { // 1 wide uses 32 stdlib + yy_scan_string(stdlib_mask32_code); } else { - extern char stdlib_x86_code[]; - yy_scan_string(stdlib_x86_code); - yyparse(); + switch (g->target->getMaskBitCount()) { + case 1: + yy_scan_string(stdlib_mask1_code); + break; + case 8: + yy_scan_string(stdlib_mask8_code); + break; + case 16: + yy_scan_string(stdlib_mask16_code); + break; + case 32: + yy_scan_string(stdlib_mask32_code); + break; + default: + FATAL("Unhandled mask bit size for stdlib.ispc"); + } } + yyparse(); } } diff --git a/builtins/util.m4 b/builtins/util.m4 index c19d4930..d6f3e5c3 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -690,6 +690,75 @@ shuffles(i64, 8) ;; $4: return type of the LLVM atomic type, in ispc naming paralance (e.g. int32) ;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...) +define(`mask_converts', ` +define internal <$1 x i8> @convertmask_i1_i8_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i1_i16_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i1_i32_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i1_i64_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i8_i8_$1(<$1 x i8>) { + ret <$1 x i8> %0 +} +define internal <$1 x i16> @convertmask_i8_i86_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i8_i32_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i8_i64_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i16_i8_$1(<$1 x i16>) { + %r = trunc <$1 x i16> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i16_i16_$1(<$1 x i16>) { + ret <$1 x i16> %0 +} +define internal <$1 x i32> @convertmask_i16_i32_$1(<$1 x i16>) { + %r = sext <$1 x i16> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i16_i64_$1(<$1 x i16>) { + %r = sext <$1 x i16> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i32_i8_$1(<$1 x i32>) { + %r = trunc <$1 x i32> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i32_i16_$1(<$1 x i32>) { + %r = trunc <$1 x i32> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i32_i32_$1(<$1 x i32>) { + ret <$1 x i32> %0 +} +define internal <$1 x i64> @convertmask_i32_i64_$1(<$1 x i32>) { + %r = sext <$1 x i32> %0 to <$1 x i64> + ret <$1 x i64> %r +} +') + +mask_converts(WIDTH) + define(`global_atomic_associative', ` define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val, @@ -697,17 +766,10 @@ define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val, ; first, for any lanes where the mask is off, compute a vector where those lanes ; hold the identity value.. 
- ; for the bit tricks below, we need the mask to be sign extended to be - ; the size of the element type. - ifelse( - MASK,i1,`%mask = sext <$1 x MASK> %m to <$1 x $3>', - $3,i64, `%mask = sext <$1 x MASK> %m to <$1 x i64>', - $3,i32, ` - ; silly workaround to do %mask = %m, which is not possible directly.. - %maskmem = alloca <$1 x i32> - store <$1 x i32> %m, <$1 x i32> * %maskmem - %mask = load <$1 x i32> * %maskmem' - ) + ; for the bit tricks below, we need the mask to have the + ; the same element size as the element type. + %mask = call <$1 x $3> @convertmask_`'MASK`'_$3_$1(<$1 x MASK> %m) + ; zero out any lanes that are off %valoff = and <$1 x $3> %val, %mask @@ -2440,13 +2502,12 @@ define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline { } define @__sext_varying_bool() nounwind readnone alwaysinline { - ifelse(MASK,i1, ` - %se = sext %0 to - ret %se - ', ` - ret %0') + ifelse(MASK,i32, `ret %0', + `%se = sext %0 to + ret %se') } + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; memcpy/memmove/memset @@ -3201,8 +3262,8 @@ return: ;; $1: llvm type of elements (and suffix for function name) define(`gen_masked_store', ` -define void @__masked_store_$1(* nocapture, , ) nounwind alwaysinline { - per_lane(WIDTH, %2, ` +define void @__masked_store_$1(* nocapture, , ) nounwind alwaysinline { + per_lane(WIDTH, %2, ` %ptr_LANE_ID = getelementptr * %0, i32 0, i32 LANE %storeval_LANE_ID = extractelement %1, i32 LANE store $1 %storeval_LANE_ID, $1 * %ptr_LANE_ID') @@ -3378,10 +3439,10 @@ define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, define(`packed_load_and_store', ` define i32 @__packed_load_active(i32 * %startptr, * %val_ptr, - %full_mask) nounwind alwaysinline { + %full_mask) nounwind alwaysinline { entry: - %mask = call i64 @__movmsk( %full_mask) - %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) + %mask = call i64 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: @@ -3432,10 +3493,10 @@ done: } define i32 @__packed_store_active(i32 * %startptr, %vals, - %full_mask) nounwind alwaysinline { + %full_mask) nounwind alwaysinline { entry: - %mask = call i64 @__movmsk( %full_mask) - %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) + %mask = call i64 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: @@ -3544,10 +3605,10 @@ check_neighbors: %castvr = call <$1 x $4> @__rotate_i$6(<$1 x $4> %castvec, i32 1) %vr = bitcast <$1 x $4> %castvr to <$1 x $2> %eq = $5 $7 <$1 x $2> %vec, %vr - ifelse(MASK,i32, ` - %eq32 = sext <$1 x i1> %eq to <$1 x i32> - %eqmm = call i64 @__movmsk(<$1 x i32> %eq32)', ` - %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)') + ifelse(MASK,i1, ` + %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)', + `%eqm = sext <$1 x i1> %eq to <$1 x MASK> + %eqmm = call i64 @__movmsk(<$1 x MASK> %eqm)') %alleq = icmp eq i64 %eqmm, ALL_ON_MASK br i1 %alleq, label %all_equal, label %not_all_equal ', ` @@ -3722,9 +3783,9 @@ pl_done: define(`gen_gather_general', ` ; fully general 32-bit gather, takes array of pointers encoded as vector of i32s define @__gather32_$1( %ptrs, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %ret_ptr = alloca - per_lane(WIDTH, %vecmask, ` + per_lane(WIDTH, %vecmask, ` %iptr_LANE_ID = extractelement %ptrs, i32 
LANE %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * %val_LANE_ID = load $1 * %ptr_LANE_ID @@ -3738,9 +3799,9 @@ define @__gather32_$1( %ptrs, ; fully general 64-bit gather, takes array of pointers encoded as vector of i32s define @__gather64_$1( %ptrs, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %ret_ptr = alloca - per_lane(WIDTH, %vecmask, ` + per_lane(WIDTH, %vecmask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * %val_LANE_ID = load $1 * %ptr_LANE_ID @@ -3804,7 +3865,7 @@ define @__gather_elt64_$1(i8 * %ptr, %offsets, i32 %o define @__gather_factored_base_offsets32_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing ; to require that the 0th element of the array being gathered from is always ; legal to read from (and we do indeed require that, given the benefits!) @@ -3813,13 +3874,13 @@ define @__gather_factored_base_offsets32_$1(i8 * %ptr, store zeroinitializer, * %offsetsPtr call void @__masked_store_blend_i32( * %offsetsPtr, %offsets, - %vecmask) + %vecmask) %newOffsets = load * %offsetsPtr %deltaPtr = alloca store zeroinitializer, * %deltaPtr call void @__masked_store_blend_i32( * %deltaPtr, %offset_delta, - %vecmask) + %vecmask) %newDelta = load * %deltaPtr %ret0 = call @__gather_elt32_$1(i8 * %ptr, %newOffsets, @@ -3835,7 +3896,7 @@ define @__gather_factored_base_offsets32_$1(i8 * %ptr, @__gather_factored_base_offsets64_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing ; to require that the 0th element of the array being gathered from is always ; legal to read from (and we do indeed require that, given the benefits!) 
@@ -3844,13 +3905,13 @@ define @__gather_factored_base_offsets64_$1(i8 * %ptr, store zeroinitializer, * %offsetsPtr call void @__masked_store_blend_i64( * %offsetsPtr, %offsets, - %vecmask) + %vecmask) %newOffsets = load * %offsetsPtr %deltaPtr = alloca store zeroinitializer, * %deltaPtr call void @__masked_store_blend_i64( * %deltaPtr, %offset_delta, - %vecmask) + %vecmask) %newDelta = load * %deltaPtr %ret0 = call @__gather_elt64_$1(i8 * %ptr, %newOffsets, @@ -3876,27 +3937,27 @@ gen_gather_factored($1) define @__gather_base_offsets32_$1(i8 * %ptr, i32 %offset_scale, %offsets, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %scale_vec = bitcast i32 %offset_scale to <1 x i32> %smear_scale = shufflevector <1 x i32> %scale_vec, <1 x i32> undef, < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > %scaled_offsets = mul %smear_scale, %offsets %v = call @__gather_factored_base_offsets32_$1(i8 * %ptr, %scaled_offsets, i32 1, - zeroinitializer, %vecmask) + zeroinitializer, %vecmask) ret %v } define @__gather_base_offsets64_$1(i8 * %ptr, i32 %offset_scale, %offsets, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %scale64 = zext i32 %offset_scale to i64 %scale_vec = bitcast i64 %scale64 to <1 x i64> %smear_scale = shufflevector <1 x i64> %scale_vec, <1 x i64> undef, < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > %scaled_offsets = mul %smear_scale, %offsets %v = call @__gather_factored_base_offsets64_$1(i8 * %ptr, %scaled_offsets, - i32 1, zeroinitializer, %vecmask) + i32 1, zeroinitializer, %vecmask) ret %v } @@ -3955,9 +4016,9 @@ define void @__scatter_elt64_$1(i8 * %ptr, %offsets, i32 %offset_s define void @__scatter_factored_base_offsets32_$1(i8* %base, %offsets, i32 %offset_scale, %offset_delta, %values, - %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... - per_lane(WIDTH, %mask, ` + per_lane(WIDTH, %mask, ` call void @__scatter_elt32_$1(i8 * %base, %offsets, i32 %offset_scale, %offset_delta, %values, i32 LANE)') ret void @@ -3965,9 +4026,9 @@ define void @__scatter_factored_base_offsets32_$1(i8* %base, %offs define void @__scatter_factored_base_offsets64_$1(i8* %base, %offsets, i32 %offset_scale, %offset_delta, %values, - %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... 
- per_lane(WIDTH, %mask, ` + per_lane(WIDTH, %mask, ` call void @__scatter_elt64_$1(i8 * %base, %offsets, i32 %offset_scale, %offset_delta, %values, i32 LANE)') ret void @@ -3975,8 +4036,8 @@ define void @__scatter_factored_base_offsets64_$1(i8* %base, %offs ; fully general 32-bit scatter, takes array of pointers encoded as vector of i32s define void @__scatter32_$1( %ptrs, %values, - %mask) nounwind alwaysinline { - per_lane(WIDTH, %mask, ` + %mask) nounwind alwaysinline { + per_lane(WIDTH, %mask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * %val_LANE_ID = extractelement %values, i32 LANE @@ -3987,8 +4048,8 @@ define void @__scatter32_$1( %ptrs, %values, ; fully general 64-bit scatter, takes array of pointers encoded as vector of i64s define void @__scatter64_$1( %ptrs, %values, - %mask) nounwind alwaysinline { - per_lane(WIDTH, %mask, ` + %mask) nounwind alwaysinline { + per_lane(WIDTH, %mask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * %val_LANE_ID = extractelement %values, i32 LANE diff --git a/ctx.cpp b/ctx.cpp index 1e79c97b..c50d22f9 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -1456,13 +1456,13 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) { for (unsigned int i = 0; i < at->getNumElements(); ++i) { llvm::Value *elt = ExtractInst(b, i); llvm::Value *sext = SExtInst(elt, LLVMTypes::BoolVectorType, - LLVMGetName(elt, "_to_boolvec32")); + LLVMGetName(elt, "_to_boolvec")); ret = InsertInst(ret, sext, i); } return ret; } else - return SExtInst(b, LLVMTypes::BoolVectorType, LLVMGetName(b, "_to_i32")); + return SExtInst(b, LLVMTypes::BoolVectorType, LLVMGetName(b, "_to_boolvec")); } @@ -2781,6 +2781,7 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, // Figure out if we need a 8, 16, 32 or 64-bit masked store. 
llvm::Function *maskedStoreFunc = NULL; + llvm::Type *llvmValueType = value->getType(); const PointerType *pt = CastType(valueType); if (pt != NULL) { @@ -2809,8 +2810,7 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, else maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64"); } - else if (Type::Equal(valueType, AtomicType::VaryingBool) && - g->target->getMaskBitCount() == 1) { + else if (llvmValueType == LLVMTypes::Int1VectorType) { llvm::Value *notMask = BinaryOperator(llvm::Instruction::Xor, mask, LLVMMaskAllOn, "~mask"); llvm::Value *old = LoadInst(ptr); @@ -2823,28 +2823,22 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, StoreInst(final, ptr); return; } - else if (Type::Equal(valueType, AtomicType::VaryingDouble)) { + else if (llvmValueType == LLVMTypes::DoubleVectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_double"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt64) || - Type::Equal(valueType, AtomicType::VaryingUInt64)) { + else if (llvmValueType == LLVMTypes::Int64VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64"); } - else if (Type::Equal(valueType, AtomicType::VaryingFloat)) { + else if (llvmValueType == LLVMTypes::FloatVectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_float"); } - else if (Type::Equal(valueType, AtomicType::VaryingBool) || - Type::Equal(valueType, AtomicType::VaryingInt32) || - Type::Equal(valueType, AtomicType::VaryingUInt32) || - CastType(valueType) != NULL) { + else if (llvmValueType == LLVMTypes::Int32VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i32"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt16) || - Type::Equal(valueType, AtomicType::VaryingUInt16)) { + else if (llvmValueType == LLVMTypes::Int16VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i16"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt8) || - Type::Equal(valueType, AtomicType::VaryingUInt8)) { + else if (llvmValueType == LLVMTypes::Int8VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i8"); } AssertPos(currentPos, maskedStoreFunc != NULL); diff --git a/expr.cpp b/expr.cpp index 3baaabaf..6bde2acb 100644 --- a/expr.cpp +++ b/expr.cpp @@ -6161,9 +6161,9 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) - // If we have a bool vector of i32 elements, first truncate - // down to a single bit + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) + // If we have a bool vector of non-i1 elements, first + // truncate down to a single bit. 
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); // And then do an unisgned int->float cast cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int @@ -6205,8 +6205,8 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) - // truncate i32 bool vector values to i1s + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) + // truncate bool vector values to i1s exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int to double exprVal, targetType, cOpName); @@ -6243,7 +6243,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6279,7 +6279,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6321,7 +6321,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6361,7 +6361,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6407,7 +6407,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6447,7 +6447,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6493,7 +6493,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != 
LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6531,7 +6531,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6625,12 +6625,12 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, if (fromType->IsUniformType()) { if (toType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) { - // extend out to i32 bool values from i1 here. then we'll - // turn into a vector below, the way it does for everyone - // else... + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) { + // extend out to an bool as an i8/i16/i32 from the i1 here. + // Then we'll turn that into a vector below, the way it + // does for everyone else... cast = ctx->SExtInst(cast, LLVMTypes::BoolVectorType->getElementType(), - LLVMGetName(cast, "to_i32bool")); + LLVMGetName(cast, "to_i_bool")); } } else diff --git a/llvmutil.cpp b/llvmutil.cpp index 26c18bf5..180c8676 100644 --- a/llvmutil.cpp +++ b/llvmutil.cpp @@ -115,13 +115,25 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0); LLVMTypes::DoublePointerType = llvm::PointerType::get(LLVMTypes::DoubleType, 0); - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: LLVMTypes::MaskType = LLVMTypes::BoolVectorType = llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.getVectorWidth()); - else { - Assert(target.getMaskBitCount() == 32); + break; + case 8: + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt8Ty(*ctx), target.getVectorWidth()); + break; + case 16: + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt16Ty(*ctx), target.getVectorWidth()); + break; + case 32: LLVMTypes::MaskType = LLVMTypes::BoolVectorType = llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.getVectorWidth()); + break; + default: + FATAL("Unhandled mask width for initializing MaskType"); } LLVMTypes::Int1VectorType = @@ -154,12 +166,26 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { std::vector maskOnes; llvm::Constant *onMask = NULL; - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: onMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 1, false /*unsigned*/); // 0x1 - else + break; + case 8: + onMask = llvm::ConstantInt::get(llvm::Type::getInt8Ty(*ctx), -1, + true /*signed*/); // 0xff + break; + case 16: + onMask = llvm::ConstantInt::get(llvm::Type::getInt16Ty(*ctx), -1, + true /*signed*/); // 0xffff + break; + case 32: onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1, true /*signed*/); // 0xffffffff + break; + default: + FATAL("Unhandled mask width for onMask"); + } for (int i = 0; i < target.getVectorWidth(); ++i) maskOnes.push_back(onMask); @@ -167,13 +193,26 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { std::vector maskZeros; llvm::Constant *offMask = NULL; - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: offMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 0, 
true /*signed*/); - else + break; + case 8: + offMask = llvm::ConstantInt::get(llvm::Type::getInt8Ty(*ctx), 0, + true /*signed*/); + break; + case 16: + offMask = llvm::ConstantInt::get(llvm::Type::getInt16Ty(*ctx), 0, + true /*signed*/); + break; + case 32: offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0, true /*signed*/); - + break; + default: + FATAL("Unhandled mask width for offMask"); + } for (int i = 0; i < target.getVectorWidth(); ++i) maskZeros.push_back(offMask); LLVMMaskAllOff = llvm::ConstantVector::get(maskZeros); @@ -444,9 +483,14 @@ LLVMBoolVector(bool b) { if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) v = llvm::ConstantInt::get(LLVMTypes::Int32Type, b ? 0xffffffff : 0, false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int16Type, b ? 0xffff : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int8VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int8Type, b ? 0xff : 0, + false /*unsigned*/); else { - Assert(LLVMTypes::BoolVectorType->getElementType() == - llvm::Type::getInt1Ty(*g->ctx)); + Assert(LLVMTypes::BoolVectorType == LLVMTypes::Int1VectorType); v = b ? LLVMTrue : LLVMFalse; } @@ -465,9 +509,14 @@ LLVMBoolVector(const bool *bvec) { if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) v = llvm::ConstantInt::get(LLVMTypes::Int32Type, bvec[i] ? 0xffffffff : 0, false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int16Type, bvec[i] ? 0xffff : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int8VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int8Type, bvec[i] ? 0xff : 0, + false /*unsigned*/); else { - Assert(LLVMTypes::BoolVectorType->getElementType() == - llvm::Type::getInt1Ty(*g->ctx)); + Assert(LLVMTypes::BoolVectorType == LLVMTypes::Int1VectorType); v = bvec[i] ? LLVMTrue : LLVMFalse; } diff --git a/parse.yy b/parse.yy index 3ad815cf..488c864a 100644 --- a/parse.yy +++ b/parse.yy @@ -2148,8 +2148,24 @@ lAddFunctionParams(Declarator *decl) { /** Add a symbol for the built-in mask variable to the symbol table */ static void lAddMaskToSymbolTable(SourcePos pos) { - const Type *t = g->target->getMaskBitCount() == 1 ? 
- AtomicType::VaryingBool : AtomicType::VaryingUInt32; + const Type *t; + switch (g->target->getMaskBitCount()) { + case 1: + t = AtomicType::VaryingBool; + break; + case 8: + t = AtomicType::VaryingUInt8; + break; + case 16: + t = AtomicType::VaryingUInt16; + break; + case 32: + t = AtomicType::VaryingUInt32; + break; + default: + FATAL("Unhandled mask bitsize in lAddMaskToSymbolTable"); + } + t = t->GetAsConstType(); Symbol *maskSymbol = new Symbol("__mask", pos, t); m->symbolTable->AddVariable(maskSymbol); diff --git a/stdlib.ispc b/stdlib.ispc index b8ed2057..8ad5aa49 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -38,12 +38,20 @@ ispc code */ -#ifdef ISPC_TARGET_GENERIC -#define IntMaskType bool -#define UIntMaskType bool +#if (ISPC_MASK_BITS == 1) + #define IntMaskType bool + #define UIntMaskType bool +#elif (ISPC_MASK_BITS == 8) + #define IntMaskType int8 + #define UIntMaskType unsigned int8 +#elif (ISPC_MASK_BITS == 16) + #define IntMaskType int16 + #define UIntMaskType unsigned int16 +#elif (ISPC_MASK_BITS == 32) + #define IntMaskType int32 + #define UIntMaskType unsigned int32 #else -#define IntMaskType int32 -#define UIntMaskType unsigned int32 + #error Unknown value of ISPC_MASK_BITS #endif /////////////////////////////////////////////////////////////////////////// @@ -335,14 +343,15 @@ static inline int32 sign_extend(bool v) { return __sext_varying_bool(v); } + __declspec(safe) static inline uniform bool any(bool v) { // We only care about whether "any" is true for the active program instances, // so we have to make v with the current program mask. -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __any(v & __mask); #else - return __any(__sext_varying_bool(v) & __mask); + return __any((UIntMaskType)__sext_varying_bool(v) & __mask); #endif } @@ -350,11 +359,10 @@ __declspec(safe) static inline uniform bool all(bool v) { // As with any(), we need to explicitly mask v with the current program mask // so we're only looking at the current lanes - -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __all(v | !__mask); #else - return __all(__sext_varying_bool(v) | !__mask); + return __all((UIntMaskType)__sext_varying_bool(v) | !__mask); #endif } @@ -362,11 +370,10 @@ __declspec(safe) static inline uniform bool none(bool v) { // As with any(), we need to explicitly mask v with the current program mask // so we're only looking at the current lanes - -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __none(v & __mask); #else - return __none(__sext_varying_bool(v) & __mask); + return __none((UIntMaskType)__sext_varying_bool(v) & __mask); #endif } @@ -399,10 +406,10 @@ static inline int popcnt(int64 v) { __declspec(safe) static inline uniform int popcnt(bool v) { // As with any() and all(), only count across the active lanes -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __popcnt_int64(__movmsk(v & __mask)); #else - return __popcnt_int64(__movmsk(__sext_varying_bool(v) & __mask)); + return __popcnt_int64(__movmsk((UIntMaskType)__sext_varying_bool(v) & __mask)); #endif } From 9ba49eabb21c7971f529fda25bad5fc1e84a6e3e Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 23 Jul 2013 16:52:43 -0700 Subject: [PATCH 04/34] Reduce estimated costs for 8 and 16-bit min() and max() in stdlib. These actually compile to a single instruction. 
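For context, the lower cost estimate reflects the fact that SSE provides packed byte/word min and max directly: signed int8 min/max are single pminsb/pmaxsb instructions on SSE4.1, and the unsigned int8 and int16 variants are likewise single instructions. A rough C++ intrinsics sketch of the signed int8 case (illustrative only, assuming SSE4.1 and <smmintrin.h>; this is not code from the patch):

    #include <smmintrin.h>   // SSE4.1 intrinsics

    // Sixteen lanes of signed int8 min in a single instruction (pminsb).
    static inline __m128i min_int8x16(__m128i a, __m128i b) {
        return _mm_min_epi8(a, b);
    }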
--- stdlib.ispc | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/stdlib.ispc b/stdlib.ispc index 8ad5aa49..9a2b191f 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1332,88 +1332,88 @@ static inline uniform double max(uniform double a, uniform double b) { // int8 -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int8 min(uniform unsigned int8 a, uniform unsigned int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int8 max(uniform unsigned int8 a, uniform unsigned int8 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int8 min(uniform int8 a, uniform int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int8 max(uniform int8 a, uniform int8 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int8 min(unsigned int8 a, unsigned int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int8 max(unsigned int8 a, unsigned int8 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int8 min(int8 a, int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int8 max(int8 a, int8 b) { return (a > b) ? a : b; } // int16 -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int16 min(uniform unsigned int16 a, uniform unsigned int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int16 max(uniform unsigned int16 a, uniform unsigned int16 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int16 min(uniform int16 a, uniform int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int16 max(uniform int16 a, uniform int16 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int16 min(unsigned int16 a, unsigned int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int16 max(unsigned int16 a, unsigned int16 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int16 min(int16 a, int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int16 max(int16 a, int16 b) { return (a > b) ? a : b; } From f7f281a256c38c1986860baec81736fcb4f5b6d1 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 23 Jul 2013 17:01:03 -0700 Subject: [PATCH 05/34] Choose type for integer literals to match the target mask size (if possible). On a target with a 16-bit mask (for example), we would choose the type of an integer literal "1024" to be an int16. Previously, we used an int32, which is a worse fit and leads to less efficient code than an int16 on a 16-bit mask target. (However, we'd still give an integer literal 1000000 the type int32, even in a 16-bit target.) Updated the tests to still pass with 8 and 16-bit targets, given this change. 
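To make the rule concrete, the following sketch summarizes the literal-typing logic that the lex.ll change below implements; the helper name and enum are hypothetical and only for illustration, and the 64-bit fallback mirrors the pre-existing behavior that is outside the hunk:

    #include <cstdint>

    enum class LitType { Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64 };

    // Pick the narrowest type that fits an unsuffixed literal and matches the
    // target's mask width; otherwise fall back to the usual 32-/64-bit choice.
    static LitType classifyIntLiteral(uint64_t v, int maskBitCount) {
        if (maskBitCount == 8) {
            if (v <= 0x7full)               return LitType::Int8;
            if (v <= 0xffull)               return LitType::UInt8;
        }
        if (maskBitCount == 16) {
            if (v <= 0x7fffull)             return LitType::Int16;
            if (v <= 0xffffull)             return LitType::UInt16;
        }
        if (v <= 0x7fffffffull)             return LitType::Int32;
        if (v <= 0xffffffffull)             return LitType::UInt32;
        if (v <= 0x7fffffffffffffffull)     return LitType::Int64;
        return LitType::UInt64;
    }

Under this rule a 16-bit mask target classifies 1024 as int16 but still classifies 1000000 as int32, matching the description above.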
--- lex.ll | 27 +++++++- parse.yy | 23 ++++++- run_tests.py | 4 +- stdlib.ispc | 74 ++++++++++----------- tests/aossoa-1.ispc | 4 +- tests/aossoa-2.ispc | 4 +- tests/aossoa-5.ispc | 4 +- tests/aossoa-6.ispc | 4 +- tests/atomics-12.ispc | 4 +- tests/atomics-13.ispc | 2 +- tests/atomics-4.ispc | 4 +- tests/coalesce-1.ispc | 4 +- tests/coalesce-2.ispc | 4 +- tests/coalesce-3.ispc | 4 +- tests/coalesce-4.ispc | 4 +- tests/coalesce-5.ispc | 4 +- tests/coalesce-6.ispc | 4 +- tests/coalesce-7.ispc | 4 +- tests/coalesce-8.ispc | 4 +- tests/count-leading-trailing-zeros-1.ispc | 2 +- tests/count-leading-trailing-zeros-4.ispc | 2 +- tests/exclusive-scan-and-2.ispc | 4 +- tests/exclusive-scan-or-1.ispc | 4 +- tests/frexp-double-1.ispc | 2 +- tests/frexp-double.ispc | 2 +- tests/frexp-float-1.ispc | 2 +- tests/frexp-float.ispc | 2 +- tests/kilo-mega-giga-2.ispc | 2 +- tests/ldexp-double.ispc | 4 +- tests/ldexp-float.ispc | 4 +- tests/local-atomics-12.ispc | 4 +- tests/local-atomics-13.ispc | 2 +- tests/local-atomics-14.ispc | 4 +- tests/local-atomics-4.ispc | 4 +- tests/rand-distrib-1.ispc | 2 +- tests/sizeof-9.ispc | 2 +- tests/test-83.ispc | 2 +- tests/test-84.ispc | 2 +- tests/test-85.ispc | 2 +- tests_errors/array-plus-equals.ispc | 2 +- tests_errors/array-pointer-assign.ispc | 2 +- tests_errors/float-logical.ispc | 2 +- tests_errors/fptr-typecheck-2.ispc | 2 +- tests_errors/fptr-typecheck-3.ispc | 2 +- tests_errors/initexpr-2.ispc | 2 +- tests_errors/int-ptr-fail.ispc | 4 +- tests_errors/lvalue-2.ispc | 2 +- tests_errors/lvalue-3.ispc | 2 +- tests_errors/new-delete-3.ispc | 2 +- tests_errors/new-delete-6.ispc | 2 +- tests_errors/ptr-1.ispc | 2 +- tests_errors/ptr-const-1.ispc | 2 +- tests_errors/ptrcast-lose-info.ispc | 2 +- tests_errors/ref-3.ispc | 2 +- tests_errors/soa-11.ispc | 2 +- tests_errors/soa-12.ispc | 2 +- tests_errors/soa-3.ispc | 2 +- tests_errors/soa-4.ispc | 2 +- tests_errors/soa-9.ispc | 2 +- tests_errors/struct_arith.ispc | 2 +- tests_errors/vec-size-compile-constant.ispc | 2 +- 61 files changed, 166 insertions(+), 120 deletions(-) diff --git a/lex.ll b/lex.ll index f6633fce..8baa627a 100644 --- a/lex.ll +++ b/lex.ll @@ -77,6 +77,8 @@ static int allTokens[] = { TOKEN_UNSIGNED, TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE, TOKEN_STRING_C_LITERAL, TOKEN_DOTDOTDOT, TOKEN_FLOAT_CONSTANT, + TOKEN_INT8_CONSTANT, TOKEN_UINT8_CONSTANT, + TOKEN_INT16_CONSTANT, TOKEN_UINT16_CONSTANT, TOKEN_INT32_CONSTANT, TOKEN_UINT32_CONSTANT, TOKEN_INT64_CONSTANT, TOKEN_UINT64_CONSTANT, TOKEN_INC_OP, TOKEN_DEC_OP, TOKEN_LEFT_OP, TOKEN_RIGHT_OP, TOKEN_LE_OP, @@ -150,6 +152,10 @@ void ParserInit() { tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\""; tokenToName[TOKEN_DOTDOTDOT] = "..."; tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT"; + tokenToName[TOKEN_INT8_CONSTANT] = "TOKEN_INT8_CONSTANT"; + tokenToName[TOKEN_UINT8_CONSTANT] = "TOKEN_UINT8_CONSTANT"; + tokenToName[TOKEN_INT16_CONSTANT] = "TOKEN_INT16_CONSTANT"; + tokenToName[TOKEN_UINT16_CONSTANT] = "TOKEN_UINT16_CONSTANT"; tokenToName[TOKEN_INT32_CONSTANT] = "TOKEN_INT32_CONSTANT"; tokenToName[TOKEN_UINT32_CONSTANT] = "TOKEN_UINT32_CONSTANT"; tokenToName[TOKEN_INT64_CONSTANT] = "TOKEN_INT64_CONSTANT"; @@ -260,6 +266,10 @@ void ParserInit() { tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\""; tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'"; tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant"; + tokenNameRemap["TOKEN_INT8_CONSTANT"] = "int8 constant"; + tokenNameRemap["TOKEN_UINT8_CONSTANT"] = "unsigned int8 constant"; + 
tokenNameRemap["TOKEN_INT16_CONSTANT"] = "int16 constant"; + tokenNameRemap["TOKEN_UINT16_CONSTANT"] = "unsigned int16 constant"; tokenNameRemap["TOKEN_INT32_CONSTANT"] = "int32 constant"; tokenNameRemap["TOKEN_UINT32_CONSTANT"] = "unsigned int32 constant"; tokenNameRemap["TOKEN_INT64_CONSTANT"] = "int64 constant"; @@ -599,7 +609,22 @@ lParseInteger(bool dotdotdot) { } else { // No u or l suffix - // First, see if we can fit this into a 32-bit integer... + // If we're compiling to an 8-bit mask target and the constant + // fits into 8 bits, return an 8-bit int. + if (g->target->getMaskBitCount() == 8) { + if (yylval.intVal <= 0x7fULL) + return TOKEN_INT8_CONSTANT; + else if (yylval.intVal <= 0xffULL) + return TOKEN_UINT8_CONSTANT; + } + // And similarly for 16-bit masks and constants + if (g->target->getMaskBitCount() == 16) { + if (yylval.intVal <= 0x7fffULL) + return TOKEN_INT16_CONSTANT; + else if (yylval.intVal <= 0xffffULL) + return TOKEN_UINT16_CONSTANT; + } + // Otherwise, see if we can fit this into a 32-bit integer... if (yylval.intVal <= 0x7fffffffULL) return TOKEN_INT32_CONSTANT; else if (yylval.intVal <= 0xffffffffULL) diff --git a/parse.yy b/parse.yy index 488c864a..6ed2a43d 100644 --- a/parse.yy +++ b/parse.yy @@ -179,6 +179,8 @@ struct ForeachDimension { } +%token TOKEN_INT8_CONSTANT TOKEN_UINT8_CONSTANT +%token TOKEN_INT16_CONSTANT TOKEN_UINT16_CONSTANT %token TOKEN_INT32_CONSTANT TOKEN_UINT32_CONSTANT %token TOKEN_INT64_CONSTANT TOKEN_UINT64_CONSTANT %token TOKEN_INT32DOTDOTDOT_CONSTANT TOKEN_UINT32DOTDOTDOT_CONSTANT @@ -291,6 +293,22 @@ primary_expression Error(@1, "Undeclared symbol \"%s\".%s", name, alts.c_str()); } } + | TOKEN_INT8_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformInt8->GetAsConstType(), + (int8_t)yylval.intVal, @1); + } + | TOKEN_UINT8_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformUInt8->GetAsConstType(), + (uint8_t)yylval.intVal, @1); + } + | TOKEN_INT16_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformInt16->GetAsConstType(), + (int16_t)yylval.intVal, @1); + } + | TOKEN_UINT16_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformUInt16->GetAsConstType(), + (uint16_t)yylval.intVal, @1); + } | TOKEN_INT32_CONSTANT { $$ = new ConstExpr(AtomicType::UniformInt32->GetAsConstType(), (int32_t)yylval.intVal, @1); @@ -1233,7 +1251,10 @@ declarator ; int_constant - : TOKEN_INT32_CONSTANT { $$ = yylval.intVal; } + : TOKEN_INT8_CONSTANT { $$ = yylval.intVal; } + | TOKEN_INT16_CONSTANT { $$ = yylval.intVal; } + | TOKEN_INT32_CONSTANT { $$ = yylval.intVal; } + | TOKEN_INT64_CONSTANT { $$ = yylval.intVal; } ; direct_declarator diff --git a/run_tests.py b/run_tests.py index 7c6b1eb8..296db867 100755 --- a/run_tests.py +++ b/run_tests.py @@ -37,7 +37,7 @@ parser.add_option("-g", "--generics-include", dest="include_file", help="Filenam parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", default="") parser.add_option('-t', '--target', dest='target', - help='Set compilation target (neon, sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', + help='Set compilation target (neon, sse2, sse2-x2, sse4, sse4-x2, sse4-8, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', default="sse4") parser.add_option('-a', '--arch', dest='arch', help='Set architecture (arm, x86, x86-64)', @@ -294,7 +294,7 @@ def run_test(testname): firstline = firstline.rstrip() file.close() - if (output.find(firstline) == -1): + if re.search(firstline, output) == None: sys.stderr.write("Didn't see 
expected error message %s from test %s.\nActual output:\n%s\n" % \ (firstline, testname, output)) return (1, 0) diff --git a/stdlib.ispc b/stdlib.ispc index 9a2b191f..7e848481 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -3126,7 +3126,7 @@ static inline void __range_reduce_log(float input, varying float * uniform reduc static const int nonexponent_mask = 0x807FFFFF; // We want the reduced version to have an exponent of -1 which is -1 + 127 after biasing or 126 - static const int exponent_neg1 = (126 << 23); + static const int exponent_neg1 = (126l << 23); // NOTE(boulos): We don't need to mask anything out since we know // the sign bit has to be 0. If it's 1, we need to return infinity/nan // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN). @@ -3149,7 +3149,7 @@ static inline void __range_reduce_log(uniform float input, uniform float * unifo uniform int int_version = intbits(input); static const uniform int nonexponent_mask = 0x807FFFFF; - static const uniform int exponent_neg1 = (126 << 23); + static const uniform int exponent_neg1 = (126ul << 23); uniform int biased_exponent = int_version >> 23; uniform int offset_exponent = biased_exponent + 1; *exponent = offset_exponent - 127; // get the real value @@ -3647,18 +3647,18 @@ static inline uniform float half_to_float(uniform unsigned int16 h) { else { // https://gist.github.com/2144712 // Fabian "ryg" Giesen. - static const uniform unsigned int32 shifted_exp = 0x7c00 << 13; // exponent mask after shift + static const uniform unsigned int32 shifted_exp = 0x7c00ul << 13; // exponent mask after shift uniform int32 o = ((int32)(h & 0x7fff)) << 13; // exponent/mantissa bits uniform unsigned int32 exp = shifted_exp & o; // just the exponent - o += (127 - 15) << 23; // exponent adjust + o += (uniform int32)(127 - 15) << 23; // exponent adjust // handle exponent special cases if (exp == shifted_exp) // Inf/NaN? - o += (128 - 16) << 23; // extra exp adjust + o += (uniform unsigned int32)(128 - 16) << 23; // extra exp adjust else if (exp == 0) { // Zero/Denormal? - o += 1 << 23; // extra exp adjust - o = intbits(floatbits(o) - floatbits(113 << 23)); // renormalize + o += 1ul << 23; // extra exp adjust + o = intbits(floatbits(o) - floatbits(113ul << 23)); // renormalize } o |= ((int32)(h & 0x8000)) << 16; // sign bit @@ -3675,17 +3675,17 @@ static inline float half_to_float(unsigned int16 h) { // https://gist.github.com/2144712 // Fabian "ryg" Giesen. - const unsigned int32 shifted_exp = 0x7c00 << 13; // exponent mask after shift + const unsigned int32 shifted_exp = 0x7c00ul << 13; // exponent mask after shift - int32 o = ((int32)(h & 0x7fff)) << 13; // exponent/mantissa bits + int32 o = ((int32)(h & 0x7ffful)) << 13; // exponent/mantissa bits unsigned int32 exp = shifted_exp & o; // just the exponent - o += (127 - 15) << 23; // exponent adjust + o += (int32)(127 - 15) << 23; // exponent adjust - int32 infnan_val = o + ((128 - 16) << 23); - int32 zerodenorm_val = intbits(floatbits(o + (1<<23)) - floatbits(113 << 23)); + int32 infnan_val = o + ((int32)(128 - 16) << 23); + int32 zerodenorm_val = intbits(floatbits(o + (1ul<<23)) - floatbits(113ul << 23)); int32 reg_val = (exp == 0) ? zerodenorm_val : o; - int32 sign_bit = ((int32)(h & 0x8000)) << 16; + int32 sign_bit = ((int32)(h & 0x8000ul)) << 16; return floatbits(((exp == shifted_exp) ? 
infnan_val : reg_val) | sign_bit); } } @@ -3715,16 +3715,16 @@ static inline uniform int16 float_to_half(uniform float f) { // NaN->qNaN and Inf->Inf // unconditional assignment here, will override with right value for // the regular case below. - uniform int32 f32infty = 255 << 23; - o = (fint > f32infty) ? 0x7e00 : 0x7c00; + uniform int32 f32infty = 255ul << 23; + o = (fint > f32infty) ? 0x7e00u : 0x7c00u; // (De)normalized number or zero // update fint unconditionally to save the blending; we don't need it // anymore for the Inf/NaN case anyway. - const uniform unsigned int32 round_mask = ~0xfffu; - const uniform int32 magic = 15 << 23; - const uniform int32 f16infty = 31 << 23; + const uniform unsigned int32 round_mask = ~0xffful; + const uniform int32 magic = 15ul << 23; + const uniform int32 f16infty = 31ul << 23; uniform int32 fint2 = intbits(floatbits(fint & round_mask) * floatbits(magic)) - round_mask; fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed @@ -3761,16 +3761,16 @@ static inline int16 float_to_half(float f) { // NaN->qNaN and Inf->Inf // unconditional assignment here, will override with right value for // the regular case below. - int32 f32infty = 255 << 23; - o = (fint > f32infty) ? 0x7e00 : 0x7c00; + int32 f32infty = 255ul << 23; + o = (fint > f32infty) ? 0x7e00u : 0x7c00u; // (De)normalized number or zero // update fint unconditionally to save the blending; we don't need it // anymore for the Inf/NaN case anyway. - const unsigned int32 round_mask = ~0xfffu; - const int32 magic = 15 << 23; - const int32 f16infty = 31 << 23; + const unsigned int32 round_mask = ~0xffful; + const int32 magic = 15ul << 23; + const int32 f16infty = 31ul << 23; // Shift exponent down, denormalize if necessary. // NOTE This represents half-float denormals using single precision denormals. @@ -3789,7 +3789,7 @@ static inline int16 float_to_half(float f) { // FP16 denormals are rare in practice, I don't know. Whatever slow path your HW // may or may not have for denormals, this may well hit it. 
float fscale = floatbits(fint & round_mask) * floatbits(magic); - fscale = min(fscale, floatbits((31 << 23) - 0x1000)); + fscale = min(fscale, floatbits((31ul << 23) - 0x1000ul)); int32 fint2 = intbits(fscale) - round_mask; if (fint < f32infty) @@ -3956,7 +3956,7 @@ float_to_srgb8(float inval) // Do the table lookup and unpack bias, scale unsigned int tab = table[(intbits(inval) - 0x39000000u) >> 20]; unsigned int bias = (tab >> 16) << 9; - unsigned int scale = tab & 0xffff; + unsigned int scale = tab & 0xfffful; // Grab next-highest mantissa bits and perform linear interpolation unsigned int t = (intbits(inval) >> 12) & 0xff; @@ -4006,7 +4006,7 @@ float_to_srgb8(uniform float inval) // Do the table lookup and unpack bias, scale uniform unsigned int tab = table[(intbits(inval) - 0x39000000u) >> 20]; uniform unsigned int bias = (tab >> 16) << 9; - uniform unsigned int scale = tab & 0xffff; + uniform unsigned int scale = tab & 0xfffful; // Grab next-highest mantissa bits and perform linear interpolation uniform unsigned int t = (intbits(inval) >> 12) & 0xff; @@ -4053,14 +4053,14 @@ static inline uniform unsigned int random(uniform RNGState * uniform state) static inline float frandom(varying RNGState * uniform state) { unsigned int irand = random(state); - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; return floatbits(0x3F800000 | irand)-1.0f; } static inline uniform float frandom(uniform RNGState * uniform state) { uniform unsigned int irand = random(state); - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; return floatbits(0x3F800000 | irand)-1.0f; } @@ -4068,18 +4068,18 @@ static inline void seed_rng(varying RNGState * uniform state, unsigned int seed) { state->z1 = seed; state->z2 = seed ^ 0xbeeff00d; - state->z3 = ((seed & 0xffff) << 16) | (seed >> 16); - state->z4 = (((seed & 0xff) << 24) | ((seed & 0xff00) << 8) | - ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24); + state->z3 = ((seed & 0xfffful) << 16) | (seed >> 16); + state->z4 = (((seed & 0xfful) << 24) | ((seed & 0xff00ul) << 8) | + ((seed & 0xff0000ul) >> 8) | (seed & 0xff000000ul) >> 24); } static inline void seed_rng(uniform RNGState * uniform state, uniform unsigned int seed) { state->z1 = seed; state->z2 = seed ^ 0xbeeff00d; - state->z3 = ((seed & 0xffff) << 16) | (seed >> 16); - state->z4 = (((seed & 0xff) << 24) | ((seed & 0xff00) << 8) | - ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24); + state->z3 = ((seed & 0xfffful) << 16) | (seed >> 16); + state->z4 = (((seed & 0xfful) << 24) | ((seed & 0xff00ul) << 8) | + ((seed & 0xff0000ul) >> 8) | (seed & 0xff000000ul) >> 24); } @@ -4097,7 +4097,7 @@ static inline uniform bool rdrand(float * uniform ptr) { uniform int32 irand; uniform bool success = __rdrand_i32(&irand); if (success) { - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; *ptr = floatbits(0x3F800000 | irand)-1.0f; } return success; @@ -4117,7 +4117,7 @@ static inline bool rdrand(varying float * uniform ptr) { // in vector form. However, we need to be careful to not // clobber any existing already-set values in *ptr with // inactive lanes here... 
- irand &= (1<<23)-1; + irand &= (1ul<<23)-1; *ptr = floatbits(0x3F800000 | irand)-1.0f; success = true; } @@ -4137,7 +4137,7 @@ static inline bool rdrand(float * ptr) { foreach_active (index) { uniform int32 irand; if (__rdrand_i32(&irand)) { - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; *ptrs[index] = floatbits(0x3F800000 | irand)-1.0f; success = true; } diff --git a/tests/aossoa-1.ispc b/tests/aossoa-1.ispc index 59964d6d..32d3bcba 100644 --- a/tests/aossoa-1.ispc +++ b/tests/aossoa-1.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 3 -#define maxProgramCount 64 +#define width 3ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 3; diff --git a/tests/aossoa-2.ispc b/tests/aossoa-2.ispc index 9ff82226..df8eae5c 100644 --- a/tests/aossoa-2.ispc +++ b/tests/aossoa-2.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 4 -#define maxProgramCount 64 +#define width 4ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 4; diff --git a/tests/aossoa-5.ispc b/tests/aossoa-5.ispc index eb4fed3a..d6346455 100644 --- a/tests/aossoa-5.ispc +++ b/tests/aossoa-5.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 3 -#define maxProgramCount 64 +#define width 3ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 3; diff --git a/tests/aossoa-6.ispc b/tests/aossoa-6.ispc index b64cd10b..7c177fde 100644 --- a/tests/aossoa-6.ispc +++ b/tests/aossoa-6.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 4 -#define maxProgramCount 64 +#define width 4ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 4; diff --git a/tests/atomics-12.ispc b/tests/atomics-12.ispc index c27ad99c..d6359555 100644 --- a/tests/atomics-12.ispc +++ b/tests/atomics-12.ispc @@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; float b = 0; if (programIndex < 30 && programIndex & 1) - b = atomic_or_global(&s, (1 << programIndex)); + b = atomic_or_global(&s, (1ul << programIndex)); RET[programIndex] = s; } @@ -15,6 +15,6 @@ export void result(uniform float RET[]) { uniform int sum = 0; for (uniform int i = 0; i < min(30, programCount); ++i) if (i & 1) - sum += (1 << i); + sum += (1ul << i); RET[programIndex] = sum; } diff --git a/tests/atomics-13.ispc b/tests/atomics-13.ispc index 86faaddb..dea3bfc3 100644 --- a/tests/atomics-13.ispc +++ b/tests/atomics-13.ispc @@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; int32 b = 0; if (programIndex < 32 && programIndex & 1) - b = atomic_or_global(&s, (1 << programIndex)); + b = atomic_or_global(&s, (1ul << programIndex)); RET[programIndex] = popcnt(reduce_max((int32)b)); } diff --git a/tests/atomics-4.ispc b/tests/atomics-4.ispc index 30b343d1..ac746ad2 100644 --- a/tests/atomics-4.ispc +++ b/tests/atomics-4.ispc @@ -5,10 +5,10 @@ uniform int32 s = 0; export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; - float b = atomic_or_global(&s, (1<> 2) * 16 + (programIndex & 3)]; diff --git a/tests/coalesce-4.ispc b/tests/coalesce-4.ispc index 
1ddd4b89..182a4d4f 100644 --- a/tests/coalesce-4.ispc +++ b/tests/coalesce-4.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[2*programIndex]; diff --git a/tests/coalesce-5.ispc b/tests/coalesce-5.ispc index 2dd8d44e..385e8526 100644 --- a/tests/coalesce-5.ispc +++ b/tests/coalesce-5.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[4*programIndex]; diff --git a/tests/coalesce-6.ispc b/tests/coalesce-6.ispc index 2a54a2db..8c630a45 100644 --- a/tests/coalesce-6.ispc +++ b/tests/coalesce-6.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[4*programIndex]; diff --git a/tests/coalesce-7.ispc b/tests/coalesce-7.ispc index 8ed628bd..29b56b8d 100644 --- a/tests/coalesce-7.ispc +++ b/tests/coalesce-7.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[4*programIndex]; diff --git a/tests/coalesce-8.ispc b/tests/coalesce-8.ispc index dfefaa19..f01ca9c3 100644 --- a/tests/coalesce-8.ispc +++ b/tests/coalesce-8.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; int index = (programIndex < 4) ? 
(programIndex & 1) : diff --git a/tests/count-leading-trailing-zeros-1.ispc b/tests/count-leading-trailing-zeros-1.ispc index 221d066d..3f12c07d 100644 --- a/tests/count-leading-trailing-zeros-1.ispc +++ b/tests/count-leading-trailing-zeros-1.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - RET[programIndex] = count_trailing_zeros(0xf0); + RET[programIndex] = count_trailing_zeros(0xf0ul); } export void result(uniform float RET[]) { diff --git a/tests/count-leading-trailing-zeros-4.ispc b/tests/count-leading-trailing-zeros-4.ispc index 475c18ca..4b849018 100644 --- a/tests/count-leading-trailing-zeros-4.ispc +++ b/tests/count-leading-trailing-zeros-4.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - int32 i = (1 << (programIndex % 28)); + int32 i = (1ul << (programIndex % 28)); RET[programIndex] = count_leading_zeros(i); } diff --git a/tests/exclusive-scan-and-2.ispc b/tests/exclusive-scan-and-2.ispc index 5d2bcd1f..b742a91e 100644 --- a/tests/exclusive-scan-and-2.ispc +++ b/tests/exclusive-scan-and-2.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { RET[programIndex] = -1; - int32 a = ~(1 << programIndex); + int32 a = ~(1ul << programIndex); if ((programIndex < 32) && (programIndex & 1) == 0) { RET[programIndex] = exclusive_scan_and(a); } @@ -15,7 +15,7 @@ export void result(uniform float RET[]) { if ((programIndex & 1) == 0 && programIndex > 0 && programIndex < 32) { int val = 0xffffffff; for (int i = 0; i < programIndex-1; i += 2) - val &= ~(1<>20); } @@ -15,6 +15,6 @@ export void result(uniform float RET[]) { uniform int sum = 0; for (uniform int i = 0; i < min(32, programCount); ++i) if (i & 1) - sum += (1 << i); + sum += (1ul << i); RET[programIndex] = ((unsigned int64)(0xffffffffff000000 | sum)) >> 20; } diff --git a/tests/local-atomics-4.ispc b/tests/local-atomics-4.ispc index f7f6a04a..b3648ab5 100644 --- a/tests/local-atomics-4.ispc +++ b/tests/local-atomics-4.ispc @@ -7,10 +7,10 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; float b = 0; if (programIndex < 29) - atomic_or_local(&s, (1< struct Foo" for assignment operator is not possible +// Type conversion from "const uniform int[0-9]*" to "soa<4> struct Foo" for assignment operator is not possible struct Pt { float x, y, z; }; diff --git a/tests_errors/soa-12.ispc b/tests_errors/soa-12.ispc index e2cd3242..c0420614 100644 --- a/tests_errors/soa-12.ispc +++ b/tests_errors/soa-12.ispc @@ -1,4 +1,4 @@ -// Can't convert between types "const uniform int32" and "soa<4> float" with different SOA widths +// Can't convert between types "const uniform int[0-9]*" and "soa<4> float" with different SOA widths struct Pt { float x, y, z; }; diff --git a/tests_errors/soa-3.ispc b/tests_errors/soa-3.ispc index b2be1b59..04dc84bc 100644 --- a/tests_errors/soa-3.ispc +++ b/tests_errors/soa-3.ispc @@ -1,4 +1,4 @@ -// syntax error, unexpected '-', expecting int32 constant +// syntax error, unexpected '-', expecting int struct F { float a, b, c; }; diff --git a/tests_errors/soa-4.ispc b/tests_errors/soa-4.ispc index b2be1b59..04dc84bc 100644 --- a/tests_errors/soa-4.ispc +++ b/tests_errors/soa-4.ispc @@ -1,4 +1,4 @@ -// syntax error, unexpected '-', expecting int32 constant +// syntax error, unexpected '-', expecting int struct F { float a, b, c; }; 
diff --git a/tests_errors/soa-9.ispc b/tests_errors/soa-9.ispc index 7c6a1df9..e9e7509a 100644 --- a/tests_errors/soa-9.ispc +++ b/tests_errors/soa-9.ispc @@ -1,4 +1,4 @@ -// Can't convert from pointer to SOA type "soa<8> struct A * uniform" to pointer to non-SOA type "void * varying" +// Can't convert from pointer to SOA type "soa<8> struct A \* uniform" to pointer to non-SOA type "void \* varying" struct A { float a, b; }; diff --git a/tests_errors/struct_arith.ispc b/tests_errors/struct_arith.ispc index 9d942880..df729d02 100644 --- a/tests_errors/struct_arith.ispc +++ b/tests_errors/struct_arith.ispc @@ -1,4 +1,4 @@ -// Assignment operator "+=" is illegal with struct type +// Assignment operator "\+=" is illegal with struct type struct Point { float x, y, z; }; diff --git a/tests_errors/vec-size-compile-constant.ispc b/tests_errors/vec-size-compile-constant.ispc index b9e61721..0eb6f90e 100644 --- a/tests_errors/vec-size-compile-constant.ispc +++ b/tests_errors/vec-size-compile-constant.ispc @@ -1,4 +1,4 @@ -// syntax error, unexpected identifier, expecting int32 constant +// syntax error, unexpected identifier, expecting int void foo(uniform int i) { float a; From c14659c6754f4d91a3bec3cbb48c4e67b7421d13 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 23 Jul 2013 17:02:49 -0700 Subject: [PATCH 06/34] Fix bug in lGetConstantInt() in parse.yy. Previously, we weren't handling signed/unsigned constant types correctly. --- parse.yy | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/parse.yy b/parse.yy index 6ed2a43d..4b315776 100644 --- a/parse.yy +++ b/parse.yy @@ -2278,7 +2278,11 @@ lGetConstantInt(Expr *expr, int *value, SourcePos pos, const char *usage) { Error(pos, "%s must be representable with a 32-bit integer.", usage); return false; } - *value = (int)ci->getZExtValue(); + const Type *type = expr->GetType(); + if (type->IsUnsignedType()) + *value = (int)ci->getZExtValue(); + else + *value = (int)ci->getSExtValue(); return true; } } From 15a3ef370a433eedcf6e6650f07ec81775d0322d Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 23 Jul 2013 17:11:01 -0700 Subject: [PATCH 07/34] Use @llvm.readcyclecounter to implement stdlib clock() function. Also added a test for the clock builtin. 
--- builtins/util.m4 | 14 ++++---------- tests/clock.ispc | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 10 deletions(-) create mode 100644 tests/clock.ispc diff --git a/builtins/util.m4 b/builtins/util.m4 index d6f3e5c3..8c379781 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -2891,17 +2891,11 @@ m4exit(`1') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; read hw clock +declare i64 @llvm.readcyclecounter() + define i64 @__clock() nounwind { -entry: - tail call void asm sideeffect "xorl %eax,%eax \0A cpuid", "~{rax},~{rbx},~{rcx},~{rdx},~{dirflag},~{fpsr},~{flags}"() nounwind - %0 = tail call { i32, i32 } asm sideeffect "rdtsc", "={ax},={dx},~{dirflag},~{fpsr},~{flags}"() nounwind - %asmresult = extractvalue { i32, i32 } %0, 0 - %asmresult1 = extractvalue { i32, i32 } %0, 1 - %conv = zext i32 %asmresult1 to i64 - %shl = shl nuw i64 %conv, 32 - %conv2 = zext i32 %asmresult to i64 - %or = or i64 %shl, %conv2 - ret i64 %or + %r = call i64 @llvm.readcyclecounter() + ret i64 %r } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/tests/clock.ispc b/tests/clock.ispc new file mode 100644 index 00000000..0e95379b --- /dev/null +++ b/tests/clock.ispc @@ -0,0 +1,14 @@ + +export uniform int width() { return programCount; } + + +export void f_f(uniform float RET[], uniform float aFOO[]) { + unsigned uniform int64 a = clock(); + float x = pow(sqrt(aFOO[programIndex]), 5.5); + unsigned uniform int64 b = clock(); + RET[programIndex] = (b - a) > 0 ? 1 : 0; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1; +} From 53414f12e6ce7d1615cd650cc7b2152063da6556 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 23 Jul 2013 17:30:32 -0700 Subject: [PATCH 08/34] Add SSE4 target optimized for computation with 8-bit datatypes. This change adds a new 'sse4-8' target, where programCount is 16 and the mask element size is 8-bits. (i.e. the most appropriate sizing of the mask for SIMD computation with 8-bit datatypes.) --- Makefile | 2 +- builtins.cpp | 9 + builtins/target-sse4-8.ll | 444 ++++++++++++++++++++++++++++++++++++++ builtins/util.m4 | 104 ++++++++- expr.cpp | 5 + ispc.cpp | 8 + opt.cpp | 13 +- 7 files changed, 578 insertions(+), 7 deletions(-) create mode 100644 builtins/target-sse4-8.ll diff --git a/Makefile b/Makefile index 043ab4cf..054a3da1 100644 --- a/Makefile +++ b/Makefile @@ -123,7 +123,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h TARGETS=neon avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \ - generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 + sse4-8 generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 # These files need to be compiled in two versions - 32 and 64 bits. BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) # These are files to be compiled in single version. 
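The sizing falls out of the register width: 16 lanes times an 8-bit mask element fills exactly one 128-bit XMM register, so the whole execution mask can be manipulated with byte-oriented SSE instructions. As a rough illustration (plain C++ with SSE2 intrinsics, not code from this patch), collapsing all 16 mask lanes to bits is a single pmovmskb, which is what the new target's __movmsk/__any/__all/__none built-ins below rely on:

    #include <emmintrin.h>   // SSE2 intrinsics
    #include <cstdint>

    // With 8-bit mask elements and 16 program instances, the mask is one
    // <16 x i8> vector; pmovmskb yields one bit per lane (bits 0..15).
    static inline uint32_t movmsk16(__m128i mask) {
        return static_cast<uint32_t>(_mm_movemask_epi8(mask));
    }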
diff --git a/builtins.cpp b/builtins.cpp index d3bbaa6a..6c586595 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -868,6 +868,15 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod EXPORT_MODULE(builtins_bitcode_sse4_x2_64bit); } break; + case 16: + Assert(g->target->getMaskBitCount() == 8); + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_sse4_8_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_sse4_8_64bit); + } + break; default: FATAL("logic error in DefineStdlib"); } diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll new file mode 100644 index 00000000..c85209ba --- /dev/null +++ b/builtins/target-sse4-8.ll @@ -0,0 +1,444 @@ +;; Copyright (c) 2013, Google, Inc. +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Google, Inc. nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Define common 4-wide stuff +define(`WIDTH',`16') +define(`MASK',`i8') +include(`util.m4') + +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse4-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare float @__half_to_float_uniform(i16 %v) nounwind readnone +declare @__half_to_float_varying( %v) nounwind readnone +declare i16 @__float_to_half_uniform(float %v) nounwind readnone +declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + +define @__rcp_varying_float() nounwind readonly alwaysinline { + unary4to16(call, float, @llvm.x86.sse.rcp.ps, %0) + ; do one N-R iteration to improve precision + ; float iv = __rcp_v(v); + ; return iv * (2. 
- v * iv); + %v_iv = fmul <16 x float> %0, %call + %two_minus = fsub <16 x float> , %v_iv + %iv_mul = fmul <16 x float> %call, %two_minus + ret <16 x float> %iv_mul +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; rsqrt + +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + unary4to16(is, float, @llvm.x86.sse.rsqrt.ps, %v) + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. - (v * is) * is); + %v_is = fmul <16 x float> %v, %is + %v_is_is = fmul <16 x float> %v_is, %is + %three_sub = fsub <16 x float> , %v_is_is + %is_mul = fmul <16 x float> %is, %three_sub + %half_scale = fmul <16 x float> , %is_mul + ret <16 x float> %half_scale +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; sqrt + +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + +define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline { + unary4to16(call, float, @llvm.x86.sse.sqrt.ps, %0) + ret <16 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone + +define <16 x double> @__sqrt_varying_double(<16 x double>) nounwind +alwaysinline { + unary2to16(ret, double, @llvm.x86.sse2.sqrt.pd, %0) + ret <16 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding floats + +declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone + +define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 + round4to16(%0, 8) +} + +define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + round4to16(%0, 9) +} + +define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + round4to16(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding doubles + +declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone + +define <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline { +; XXXround2to4double(%0, 8) + ; FIXME: need round2to16double in util.m4... 
+ ret <16 x double> undef +} + +define <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline { + ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 +; XXXround2to4double(%0, 9) + ret <16 x double> undef +} + +define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline { + ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10 +; XXXround2to4double(%0, 10) + ret <16 x double> undef +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float min/max + +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + +define <16 x float> @__max_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline { + binary4to16(call, float, @llvm.x86.sse.max.ps, %0, %1) + ret <16 x float> %call +} + +define <16 x float> @__min_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline { + binary4to16(call, float, @llvm.x86.sse.min.ps, %0, %1) + ret <16 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 min/max + +define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pminsd, %0, %1) + ret <16 x i32> %call +} + +define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1) + ret <16 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unsigned int min/max + +define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pminud, %0, %1) + ret <16 x i32> %call +} + +define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pmaxud, %0, %1) + ret <16 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone +declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone + +define <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone { + binary2to16(ret, double, @llvm.x86.sse2.min.pd, %0, %1) + ret <16 x double> %ret +} + +define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone { + binary2to16(ret, double, @llvm.x86.sse2.max.pd, %0, %1) + ret <16 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +; FIXME + +declare <16 x float> @__svml_sin(<16 x float>) +declare <16 x float> @__svml_cos(<16 x float>) +declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) +declare <16 x float> @__svml_tan(<16 x float>) +declare <16 x float> @__svml_atan(<16 x float>) +declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) +declare <16 x float> @__svml_exp(<16 x float>) +declare <16 x float> @__svml_log(<16 x float>) +declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; horizontal ops / reductions + +declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone + +define i64 @__movmsk(<16 x i8>) nounwind readnone 
alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %m64 = zext i32 %m to i64 + ret i64 %m64 +} + +define i1 @__any(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %mne = icmp ne i32 %m, 0 + ret i1 %mne +} + +define i1 @__all(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %meq = icmp eq i32 %m, ALL_ON_MASK + ret i1 %meq +} + +define i1 @__none(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %meq = icmp eq i32 %m, 0 + ret i1 %meq +} + +define internal <16 x float> @__add_varying_float(<16 x float>, <16 x float>) { + %r = fadd <16 x float> %0, %1 + ret <16 x float> %r +} + +define internal float @__add_uniform_float(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline { + reduce16(float, @__add_varying_float, @__add_uniform_float) +} + +define float @__reduce_min_float(<16 x float>) nounwind readnone { + reduce16(float, @__min_varying_float, @__min_uniform_float) +} + +define float @__reduce_max_float(<16 x float>) nounwind readnone { + reduce16(float, @__max_varying_float, @__max_uniform_float) +} + +define internal <16 x i32> @__add_varying_int32(<16 x i32>, <16 x i32>) { + %r = add <16 x i32> %0, %1 + ret <16 x i32> %r +} + +define internal i32 @__add_uniform_int32(i32, i32) { + %r = add i32 %0, %1 + ret i32 %r +} + +define i32 @__reduce_add_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__add_varying_int32, @__add_uniform_int32) +} + +define i32 @__reduce_min_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__min_varying_int32, @__min_uniform_int32) +} + +define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__max_varying_int32, @__max_uniform_int32) +} + +define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone { + reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32) +} + +define i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone { + reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32) +} + +define internal <16 x double> @__add_varying_double(<16 x double>, <16 x double>) { + %r = fadd <16 x double> %0, %1 + ret <16 x double> %r +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define double @__reduce_add_double(<16 x double>) nounwind readnone { + reduce16(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double(<16 x double>) nounwind readnone { + reduce16(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double(<16 x double>) nounwind readnone { + reduce16(double, @__max_varying_double, @__max_uniform_double) +} + +define internal <16 x i64> @__add_varying_int64(<16 x i64>, <16 x i64>) { + %r = add <16 x i64> %0, %1 + ret <16 x i64> %r +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define i64 @__reduce_add_int64(<16 x i64>) nounwind readnone { + reduce16(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define i64 @__reduce_min_int64(<16 x i64>) nounwind readnone { + reduce16(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone { + reduce16(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone { + reduce16(i64, 
@__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone { + reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +reduce_equal(16) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +define void @__masked_store_blend_i64(<16 x i64>* nocapture, <16 x i64>, + <16 x i8> %mask) nounwind + alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i64>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i64> %1, <16 x i64> %old + store <16 x i64> %blend, <16 x i64>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i32(<16 x i32>* nocapture, <16 x i32>, + <16 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i32>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i32> %1, <16 x i32> %old + store <16 x i32> %blend, <16 x i32>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, + <16 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i16>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i16> %1, <16 x i16> %old + store <16 x i16> %blend, <16 x i16>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>, + <16 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i8>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i8> %1, <16 x i8> %old + store <16 x i8> %blend, <16 x i8>* %0, align 4 + ret void +} + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) + +masked_store_float_double() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather/scatter + +; define these with the macros from stdlib.m4 + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) diff --git a/builtins/util.m4 b/builtins/util.m4 index 8c379781..ee45ebc7 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -411,6 +411,42 @@ define(`unary2to8', ` ' ) +define(`unary2to16', ` + %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1) + %$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2) + %$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3) + %$1_4 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4) + %$1_5 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5) + %$1_6 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6) + %$1_7 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7) + 
%$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> + %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, + <4 x i32> + %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, + <4 x i32> + %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, + <8 x i32> + + %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, + <16 x i32> +' +) + ;; Maps an 2-wide binary function to two 8-wide vector operands ;; $1: name of variable into which the final result should go ;; $2: scalar type of the vector elements @@ -432,12 +468,58 @@ define(`binary2to8', ` %$1_3b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> +' +) + +define(`binary2to16', ` + %$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b) + %$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b) + %$1_2a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_2b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2a, <2 x $2> %$1_2b) + %$1_3a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_3b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) + %$1_4a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_4b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4a, <2 x $2> %$1_4b) + %$1_5a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_5b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5a, <2 x $2> %$1_5b) + %$1_6a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_6b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6a, <2 x $2> %$1_6b) + %$1_7a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_7b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7a, <2 x $2> %$1_7b) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, <4 x i32> %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, <4 x i32> - %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, <8 x i32> + + %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, + <4 x i32> + %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, + <4 x i32> + %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, + <8 x i32> + + %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, + <16 x i32> ' ) @@ -460,6 +542,26 @@ ret <8 x float> %ret ' ) +define(`round4to16', ` +%v0 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v1 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v2 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v3 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%r0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v0, i32 $2) +%r1 = 
call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v1, i32 $2) +%r2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v2, i32 $2) +%r3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v3, i32 $2) +%ret01 = shufflevector <4 x float> %r0, <4 x float> %r1, + <8 x i32> +%ret23 = shufflevector <4 x float> %r2, <4 x float> %r3, + <8 x i32> +%ret = shufflevector <8 x float> %ret01, <8 x float> %ret23, + <16 x i32> +ret <16 x float> %ret +' +) + define(`round8to16', ` %v0 = shufflevector <16 x float> $1, <16 x float> undef, <8 x i32> diff --git a/expr.cpp b/expr.cpp index 6bde2acb..f81037f6 100644 --- a/expr.cpp +++ b/expr.cpp @@ -3123,6 +3123,10 @@ static llvm::Value * lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test, llvm::Value *expr1, llvm::Value *expr2, const Type *type) { +#if !defined(LLVM_3_1) + test = ctx->TruncInst(test, LLVMTypes::Int1VectorType); + return ctx->SelectInst(test, expr1, expr2, "select"); +#else llvm::Value *resultPtr = ctx->AllocaInst(expr1->getType(), "selectexpr_tmp"); // Don't need to worry about masking here ctx->StoreInst(expr2, resultPtr); @@ -3131,6 +3135,7 @@ lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test, PointerType::GetUniform(type)->LLVMType(g->ctx)); ctx->StoreInst(expr1, resultPtr, test, type, PointerType::GetUniform(type)); return ctx->LoadInst(resultPtr, "selectexpr_final"); +#endif // !LLVM_3_1 } diff --git a/ispc.cpp b/ispc.cpp index 887f6ca3..6ac23781 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -310,6 +310,14 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } + else if (!strcasecmp(isa, "sse4-8")) { + this->m_isa = Target::SSE4; + this->m_nativeVectorWidth = 16; + this->m_vectorWidth = 16; + this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + this->m_maskingIsFree = false; + this->m_maskBitCount = 8; + } else if (!strcasecmp(isa, "generic-4")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 4; diff --git a/opt.cpp b/opt.cpp index ba32c639..4701e7df 100644 --- a/opt.cpp +++ b/opt.cpp @@ -670,14 +670,17 @@ IntrinsicsOpt::IntrinsicsOpt() // All of the mask instructions we may encounter. Note that even if // compiling for AVX, we may still encounter the regular 4-wide SSE // MOVMSK instruction. - llvm::Function *sseMovmsk = + llvm::Function *ssei8Movmsk = + llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse2_pmovmskb_128); + maskInstructions.push_back(ssei8Movmsk); + llvm::Function *sseFloatMovmsk = llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse_movmsk_ps); - maskInstructions.push_back(sseMovmsk); + maskInstructions.push_back(sseFloatMovmsk); maskInstructions.push_back(m->module->getFunction("__movmsk")); - llvm::Function *avxMovmsk = + llvm::Function *avxFloatMovmsk = llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_movmsk_ps_256); - Assert(avxMovmsk != NULL); - maskInstructions.push_back(avxMovmsk); + Assert(avxFloatMovmsk != NULL); + maskInstructions.push_back(avxFloatMovmsk); // And all of the blend instructions blendInstructions.push_back(BlendInstruction( From 04d61afa23a64d9fc5f95648509bd5ec002da53e Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 25 Jul 2013 09:40:48 -0700 Subject: [PATCH 09/34] Fix bug in lEmitVaryingSelect() for targets with i1 mask types. 
Commit 53414f12e6c introduced a bug where lEmitVaryingSelect() would try
to truncate a vector of i1s to a vector of i1s, which in turn made LLVM's
IR analyzer unhappy.
---
 expr.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/expr.cpp b/expr.cpp
index f81037f6..856d363c 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -3124,7 +3124,8 @@ lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test,
                    llvm::Value *expr1, llvm::Value *expr2,
                    const Type *type) {
 #if !defined(LLVM_3_1)
-    test = ctx->TruncInst(test, LLVMTypes::Int1VectorType);
+    if (test->getType() != LLVMTypes::Int1VectorType)
+        test = ctx->TruncInst(test, LLVMTypes::Int1VectorType);
     return ctx->SelectInst(test, expr1, expr2, "select");
 #else
     llvm::Value *resultPtr = ctx->AllocaInst(expr1->getType(), "selectexpr_tmp");

From 780b0dfe47a770785c4fe1f224813e3a518cd135 Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Wed, 24 Jul 2013 09:40:50 -0700
Subject: [PATCH 10/34] Add SSE4-16 target.

Along the lines of sse4-8, this is an 8-wide target for SSE4, using
16-bit elements for the mask. It's thus (in principle) the best target
for SIMD computation with 16-bit datatypes.
---
 Makefile                   |   2 +-
 builtins.cpp               |  16 +-
 builtins/target-sse4-16.ll | 436 +++++++++++++++++++++++++++++++++++++
 ispc.cpp                   |  14 +-
 run_tests.py               |   2 +-
 5 files changed, 463 insertions(+), 7 deletions(-)
 create mode 100644 builtins/target-sse4-16.ll

diff --git a/Makefile b/Makefile
index 054a3da1..fc064dbd 100644
--- a/Makefile
+++ b/Makefile
@@ -123,7 +123,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
 TARGETS=neon avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \
-	sse4-8 generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
+	sse4-8 sse4-16 generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
 # These files need to be compiled in two versions - 32 and 64 bits.
 BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS)))
 # These are files to be compiled in single version.

diff --git a/builtins.cpp b/builtins.cpp
index 6c586595..c4a2f3b5 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -862,10 +862,22 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
         break;
     case 8:
         if (runtime32) {
-            EXPORT_MODULE(builtins_bitcode_sse4_x2_32bit);
+            if (g->target->getMaskBitCount() == 16) {
+                EXPORT_MODULE(builtins_bitcode_sse4_16_32bit);
+            }
+            else {
+                Assert(g->target->getMaskBitCount() == 32);
+                EXPORT_MODULE(builtins_bitcode_sse4_x2_32bit);
+            }
         }
         else {
-            EXPORT_MODULE(builtins_bitcode_sse4_x2_64bit);
+            if (g->target->getMaskBitCount() == 16) {
+                EXPORT_MODULE(builtins_bitcode_sse4_16_64bit);
+            }
+            else {
+                Assert(g->target->getMaskBitCount() == 32);
+                EXPORT_MODULE(builtins_bitcode_sse4_x2_64bit);
+            }
         }
         break;
     case 16:

diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll
new file mode 100644
index 00000000..2044fbee
--- /dev/null
+++ b/builtins/target-sse4-16.ll
@@ -0,0 +1,436 @@
+;; Copyright (c) 2013, Google, Inc.
+;; All rights reserved.
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are
+;; met:
+;;
+;;   * Redistributions of source code must retain the above copyright
+;;     notice, this list of conditions and the following disclaimer.
+;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Google, Inc. nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Define common 4-wide stuff +define(`WIDTH',`8') +define(`MASK',`i16') +include(`util.m4') + +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse4-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare float @__half_to_float_uniform(i16 %v) nounwind readnone +declare @__half_to_float_varying( %v) nounwind readnone +declare i16 @__float_to_half_uniform(float %v) nounwind readnone +declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + +define @__rcp_varying_float() nounwind readonly alwaysinline { + unary4to8(call, float, @llvm.x86.sse.rcp.ps, %0) + ; do one N-R iteration to improve precision + ; float iv = __rcp_v(v); + ; return iv * (2. - v * iv); + %v_iv = fmul <8 x float> %0, %call + %two_minus = fsub <8 x float> , %v_iv + %iv_mul = fmul <8 x float> %call, %two_minus + ret <8 x float> %iv_mul +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; rsqrt + +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v) + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. 
- (v * is) * is); + %v_is = fmul <8 x float> %v, %is + %v_is_is = fmul <8 x float> %v_is, %is + %three_sub = fsub <8 x float> , %v_is_is + %is_mul = fmul <8 x float> %is, %three_sub + %half_scale = fmul <8 x float> , %is_mul + ret <8 x float> %half_scale +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; sqrt + +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + +define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline { + unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0) + ret <8 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone + +define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind +alwaysinline { + unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0) + ret <8 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding floats + +declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone + +define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 + round4to8(%0, 8) +} + +define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + round4to8(%0, 9) +} + +define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + round4to8(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding doubles + +declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone + +define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline { + round2to8double(%0, 8) +} + +define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline { + round2to8double(%0, 9) +} + +define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline { + round2to8double(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float min/max + +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + +define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline { + binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1) + ret <8 x float> %call +} + +define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline { + binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1) + ret <8 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 min/max + +define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pminsd, %0, %1) + ret <8 x i32> %call +} + +define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1) + ret <8 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unsigned int min/max + +define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + 
binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1) + ret <8 x i32> %call +} + +define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1) + ret <8 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone +declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone + +define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone { + binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1) + ret <8 x double> %ret +} + +define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone { + binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1) + ret <8 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +; FIXME + +declare <8 x float> @__svml_sin(<8 x float>) +declare <8 x float> @__svml_cos(<8 x float>) +declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) +declare <8 x float> @__svml_tan(<8 x float>) +declare <8 x float> @__svml_atan(<8 x float>) +declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) +declare <8 x float> @__svml_exp(<8 x float>) +declare <8 x float> @__svml_log(<8 x float>) +declare <8 x float> @__svml_pow(<8 x float>, <8 x float>) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; horizontal ops / reductions + +declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone + +define i64 @__movmsk(<8 x MASK>) nounwind readnone alwaysinline { + %m8 = trunc <8 x MASK> %0 to <8 x i8> + %mask8 = shufflevector <8 x i8> %m8, <8 x i8> zeroinitializer, + <16 x i32> + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %mask8) + %m64 = zext i32 %m to i64 + ret i64 %m64 +} + +define i1 @__any(<8 x MASK>) nounwind readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %mne = icmp ne i64 %m, 0 + ret i1 %mne +} + +define i1 @__all(<8 x MASK>) nounwind readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %meq = icmp eq i64 %m, ALL_ON_MASK + ret i1 %meq +} + +define i1 @__none(<8 x MASK>) nounwind readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %meq = icmp eq i64 %m, 0 + ret i1 %meq +} + +define internal <8 x float> @__add_varying_float(<8 x float>, <8 x float>) { + %r = fadd <8 x float> %0, %1 + ret <8 x float> %r +} + +define internal float @__add_uniform_float(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline { + reduce8(float, @__add_varying_float, @__add_uniform_float) +} + +define float @__reduce_min_float(<8 x float>) nounwind readnone { + reduce8(float, @__min_varying_float, @__min_uniform_float) +} + +define float @__reduce_max_float(<8 x float>) nounwind readnone { + reduce8(float, @__max_varying_float, @__max_uniform_float) +} + +define internal <8 x i32> @__add_varying_int32(<8 x i32>, <8 x i32>) { + %r = add <8 x i32> %0, %1 + ret <8 x i32> %r +} + +define internal i32 @__add_uniform_int32(i32, i32) { + %r = add i32 %0, %1 + ret i32 %r +} + +define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__add_varying_int32, @__add_uniform_int32) +} + +define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__min_varying_int32, @__min_uniform_int32) +} + +define i32 
@__reduce_max_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__max_varying_int32, @__max_uniform_int32) +} + +define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone { + reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32) +} + +define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone { + reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32) +} + +define internal <8 x double> @__add_varying_double(<8 x double>, <8 x double>) { + %r = fadd <8 x double> %0, %1 + ret <8 x double> %r +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define double @__reduce_add_double(<8 x double>) nounwind readnone { + reduce8(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double(<8 x double>) nounwind readnone { + reduce8(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double(<8 x double>) nounwind readnone { + reduce8(double, @__max_varying_double, @__max_uniform_double) +} + +define internal <8 x i64> @__add_varying_int64(<8 x i64>, <8 x i64>) { + %r = add <8 x i64> %0, %1 + ret <8 x i64> %r +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone { + reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone { + reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +reduce_equal(8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +define void @__masked_store_blend_i64(<8 x i64>* nocapture, <8 x i64>, + <8 x MASK> %mask) nounwind + alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i64>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i64> %1, <8 x i64> %old + store <8 x i64> %blend, <8 x i64>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i32>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i32> %1, <8 x i32> %old + store <8 x i32> %blend, <8 x i32>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i16(<8 x i16>* nocapture, <8 x i16>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i16>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i16> %1, <8 x i16> %old + store <8 x i16> %blend, <8 x i16>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i8>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i8> %1, <8 x i8> %old + store <8 x i8> %blend, <8 x i8>* %0, align 4 + ret void +} + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) + +masked_store_float_double() + 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather/scatter + +; define these with the macros from stdlib.m4 + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) diff --git a/ispc.cpp b/ispc.cpp index 6ac23781..a9f5ff5c 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -318,6 +318,14 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 8; } + else if (!strcasecmp(isa, "sse4-16")) { + this->m_isa = Target::SSE4; + this->m_nativeVectorWidth = 8; + this->m_vectorWidth = 8; + this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + this->m_maskingIsFree = false; + this->m_maskBitCount = 16; + } else if (!strcasecmp(isa, "generic-4")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 4; @@ -575,9 +583,9 @@ Target::SupportedTargetArchs() { const char * Target::SupportedTargetISAs() { - return "neon, sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2" - ", avx1.1, avx1.1-x2, avx2, avx2-x2" - ", generic-1, generic-4, generic-8, generic-16, generic-32"; + return "neon, sse2, sse2-x2, sse4, sse4-8, sse4-16, sse4-x2, " + "avx, avx-x2, avx1.1, avx1.1-x2, avx2, avx2-x2," + "generic-1, generic-4, generic-8, generic-16, generic-32"; } diff --git a/run_tests.py b/run_tests.py index 296db867..ea819ea4 100755 --- a/run_tests.py +++ b/run_tests.py @@ -37,7 +37,7 @@ parser.add_option("-g", "--generics-include", dest="include_file", help="Filenam parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", default="") parser.add_option('-t', '--target', dest='target', - help='Set compilation target (neon, sse2, sse2-x2, sse4, sse4-x2, sse4-8, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', + help='Set compilation target (neon, sse2, sse2-x2, sse4, sse4-x2, sse4-8, sse4-16, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', default="sse4") parser.add_option('-a', '--arch', dest='arch', help='Set architecture (arm, x86, x86-64)', From bba84f247c34f67ed28a357d19a4a7414c590c2b Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 24 Jul 2013 15:08:07 -0700 Subject: [PATCH 11/34] Improved optimization of vector select instructions. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Various LLVM optimization passes are turning code like: %cmp = icmp lt <8 x i32> %foo, %bar %cmp32 = sext <8 x i1> %cmp to <8 x i32> . . . %cmp1 = trunc <8 x i32> %cmp32 to <8 x i1> %result = select <8 x i1> %cmp1, . . . Into: %cmp = icmp lt <8 x i32> %foo, %bar %cmp32 = zext <8 x i1> %cmp to <8 x i32> # note: zext . . . %cmp1 = icmp ne <8 x i32> %cmp32, zeroinitializer %result = select <8 x i1> %cmp1, … Which in turn isn't matched well by the LLVM code generators, which in turn leads to fairly inefficient code. (i.e. it doesn't just emit a vector compare and blend instruction.) Also, renamed VSelMovmskOptPass to InstructionSimplifyPass to better describe its functionality. 
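For reference, here is a hand-written sketch of the form we would like the
select to be in by the time it reaches the code generator, with the condition
taken straight from the original i1 comparison (the register names here are
made up for illustration and are not output from the new pass):

    %cmp = icmp slt <8 x i32> %foo, %bar
    %result = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b

With the mask in this form, the backends can emit a single vector compare
followed by a blend. The renamed pass also keeps the earlier simplifications:
selects whose mask is known to be all-on or all-off are replaced with the
corresponding operand, and __movmsk calls on compile-time constant masks are
folded to scalar constants.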
--- opt.cpp | 175 ++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 126 insertions(+), 49 deletions(-) diff --git a/opt.cpp b/opt.cpp index 4701e7df..8efdbc67 100644 --- a/opt.cpp +++ b/opt.cpp @@ -108,7 +108,7 @@ #endif static llvm::Pass *CreateIntrinsicsOptPass(); -static llvm::Pass *CreateVSelMovmskOptPass(); +static llvm::Pass *CreateInstructionSimplifyPass(); static llvm::Pass *CreateImproveMemoryOpsPass(); static llvm::Pass *CreateGatherCoalescePass(); @@ -476,7 +476,7 @@ Optimize(llvm::Module *module, int optLevel) { } if (!g->opt.disableMaskAllOnOptimizations) { optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); } optPM.add(llvm::createDeadInstEliminationPass()); @@ -519,7 +519,7 @@ Optimize(llvm::Module *module, int optLevel) { if (!g->opt.disableMaskAllOnOptimizations) { optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); } if (g->opt.disableGatherScatterOptimizations == false && @@ -539,7 +539,7 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createFunctionInliningPass()); optPM.add(llvm::createConstantPropagationPass()); optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) { @@ -555,18 +555,20 @@ Optimize(llvm::Module *module, int optLevel) { if (g->opt.disableHandlePseudoMemoryOps == false) optPM.add(CreateReplacePseudoMemoryOpsPass()); optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createFunctionInliningPass()); optPM.add(llvm::createArgumentPromotionPass()); optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold, false)); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createReassociatePass()); optPM.add(llvm::createLoopRotatePass()); optPM.add(llvm::createLICMPass()); optPM.add(llvm::createLoopUnswitchPass(false)); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createIndVarSimplifyPass()); optPM.add(llvm::createLoopIdiomPass()); optPM.add(llvm::createLoopDeletionPass()); @@ -576,17 +578,19 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(CreateIsCompileTimeConstantPass(true)); optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createMemCpyOptPass()); optPM.add(llvm::createSCCPPass()); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createJumpThreadingPass()); optPM.add(llvm::createCorrelatedValuePropagationPass()); optPM.add(llvm::createDeadStoreEliminationPass()); optPM.add(llvm::createAggressiveDCEPass()); optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createStripDeadPrototypesPass()); optPM.add(CreateMakeInternalFuncsStaticPass()); optPM.add(llvm::createGlobalDCEPass()); @@ -927,80 +931,153 @@ CreateIntrinsicsOptPass() { @todo The better thing to do would be to submit a patch to LLVM to get these; they're presumably pretty simple patterns to match. 
*/ -class VSelMovmskOpt : public llvm::BasicBlockPass { +class InstructionSimplifyPass : public llvm::BasicBlockPass { public: - VSelMovmskOpt() + InstructionSimplifyPass() : BasicBlockPass(ID) { } const char *getPassName() const { return "Vector Select Optimization"; } bool runOnBasicBlock(llvm::BasicBlock &BB); static char ID; + +private: + static bool simplifySelect(llvm::SelectInst *selectInst, + llvm::BasicBlock::iterator iter); + static llvm::Value *simplifyBoolVec(llvm::Value *value); + static bool simplifyCall(llvm::CallInst *callInst, + llvm::BasicBlock::iterator iter); }; -char VSelMovmskOpt::ID = 0; +char InstructionSimplifyPass::ID = 0; + + +llvm::Value * +InstructionSimplifyPass::simplifyBoolVec(llvm::Value *value) { + llvm::TruncInst *trunc = llvm::dyn_cast(value); + if (trunc != NULL) { + // Convert trunc({sext,zext}(i1 vector)) -> (i1 vector) + llvm::SExtInst *sext = llvm::dyn_cast(value); + if (sext && + sext->getOperand(0)->getType() == LLVMTypes::Int1VectorType) + return sext->getOperand(0); + + llvm::ZExtInst *zext = llvm::dyn_cast(value); + if (zext && + zext->getOperand(0)->getType() == LLVMTypes::Int1VectorType) + return zext->getOperand(0); + } + + llvm::ICmpInst *icmp = llvm::dyn_cast(value); + if (icmp != NULL) { + // icmp(ne, {sext,zext}(foo), zeroinitializer) -> foo + if (icmp->getSignedPredicate() == llvm::CmpInst::ICMP_NE) { + llvm::Value *op1 = icmp->getOperand(1); + if (llvm::isa(op1)) { + llvm::Value *op0 = icmp->getOperand(0); + llvm::SExtInst *sext = llvm::dyn_cast(op0); + if (sext) + return sext->getOperand(0); + llvm::ZExtInst *zext = llvm::dyn_cast(op0); + if (zext) + return zext->getOperand(0); + } + } + } + return NULL; +} bool -VSelMovmskOpt::runOnBasicBlock(llvm::BasicBlock &bb) { - DEBUG_START_PASS("VSelMovmaskOpt"); +InstructionSimplifyPass::simplifySelect(llvm::SelectInst *selectInst, + llvm::BasicBlock::iterator iter) { + if (selectInst->getType()->isVectorTy() == false) + return false; + + llvm::Value *factor = selectInst->getOperand(0); + + // Simplify all-on or all-off mask values + MaskStatus maskStatus = lGetMaskStatus(factor); + llvm::Value *value = NULL; + if (maskStatus == ALL_ON) + // Mask all on -> replace with the first select value + value = selectInst->getOperand(1); + else if (maskStatus == ALL_OFF) + // Mask all off -> replace with the second select value + value = selectInst->getOperand(2); + if (value != NULL) { + llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), + iter, value); + return true; + } + + // Sometimes earlier LLVM optimization passes generate unnecessarily + // complex expressions for the selection vector, which in turn confuses + // the code generators and leads to sub-optimal code (particularly for + // 8 and 16-bit masks). We'll try to simplify them out here so that + // the code generator patterns match.. + if ((factor = simplifyBoolVec(factor)) != NULL) { + llvm::Instruction *newSelect = + llvm::SelectInst::Create(factor, selectInst->getOperand(1), + selectInst->getOperand(2), + selectInst->getName()); + llvm::ReplaceInstWithInst(selectInst, newSelect); + return true; + } + + return false; +} + + +bool +InstructionSimplifyPass::simplifyCall(llvm::CallInst *callInst, + llvm::BasicBlock::iterator iter) { + llvm::Function *calledFunc = callInst->getCalledFunction(); + + // Turn a __movmsk call with a compile-time constant vector into the + // equivalent scalar value. 
+ if (calledFunc == NULL || calledFunc != m->module->getFunction("__movmsk")) + return false; + + uint64_t mask; + if (lGetMask(callInst->getArgOperand(0), &mask) == true) { + llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), + iter, LLVMInt64(mask)); + return true; + } + return false; +} + + +bool +InstructionSimplifyPass::runOnBasicBlock(llvm::BasicBlock &bb) { + DEBUG_START_PASS("InstructionSimplify"); bool modifiedAny = false; restart: for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) { llvm::SelectInst *selectInst = llvm::dyn_cast(&*iter); - if (selectInst != NULL && selectInst->getType()->isVectorTy()) { - llvm::Value *factor = selectInst->getOperand(0); - - MaskStatus maskStatus = lGetMaskStatus(factor); - llvm::Value *value = NULL; - if (maskStatus == ALL_ON) - // Mask all on -> replace with the first select value - value = selectInst->getOperand(1); - else if (maskStatus == ALL_OFF) - // Mask all off -> replace with the second select value - value = selectInst->getOperand(2); - - if (value != NULL) { - llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), - iter, value); - modifiedAny = true; - goto restart; - } + if (selectInst && simplifySelect(selectInst, iter)) { + modifiedAny = true; + goto restart; } - llvm::CallInst *callInst = llvm::dyn_cast(&*iter); - if (callInst == NULL) - continue; - - llvm::Function *calledFunc = callInst->getCalledFunction(); - if (calledFunc == NULL || calledFunc != m->module->getFunction("__movmsk")) - continue; - - uint64_t mask; - if (lGetMask(callInst->getArgOperand(0), &mask) == true) { -#if 0 - fprintf(stderr, "mask %d\n", mask); - callInst->getArgOperand(0)->dump(); - fprintf(stderr, "-----------\n"); -#endif - llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), - iter, LLVMInt64(mask)); + if (callInst && simplifyCall(callInst, iter)) { modifiedAny = true; goto restart; } } - DEBUG_END_PASS("VSelMovMskOpt"); + DEBUG_END_PASS("InstructionSimplify"); return modifiedAny; } static llvm::Pass * -CreateVSelMovmskOptPass() { - return new VSelMovmskOpt; +CreateInstructionSimplifyPass() { + return new InstructionSimplifyPass; } From 2d063925a1d5ab758bcdd22454c201ac7d617dd3 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 24 Jul 2013 15:10:08 -0700 Subject: [PATCH 12/34] Explicitly call the PBLENDVB intrinsic for i8 blending with sse4-8. This is slightly cleaner than trunc-ing the i8 mask to i1 and using a vector select. (And is probably more safe in terms of good code.) 
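As background (a hand-written sketch, not part of the change below): PBLENDVB
selects between its first two operands byte by byte, based on the high bit of
each byte of the third operand, so the all-zeros/all-ones per-lane i8 masks
this target uses can be handed to it directly. Roughly:

    ;; result byte i = (mask byte i has its high bit set) ? %new[i] : %old[i]
    %blend = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %old, <16 x i8> %new, <16 x i8> %mask)

Here %new stands in for the value being stored; the actual names are in the
diff below.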
--- builtins/target-sse4-8.ll | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index c85209ba..cd8fdce2 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -398,11 +398,13 @@ define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, ret void } +declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone + define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>, <16 x MASK> %mask) nounwind alwaysinline { - %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> %old = load <16 x i8>* %0, align 4 - %blend = select <16 x i1> %mask_as_i1, <16 x i8> %1, <16 x i8> %old + %blend = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %old, <16 x i8> %1, + <16 x i8> %mask) store <16 x i8> %blend, <16 x i8>* %0, align 4 ret void } From b6df447b550507ba77dde70758a5bdaf0e079f95 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 25 Jul 2013 09:11:39 -0700 Subject: [PATCH 13/34] Add reduce_add() for int8 and int16 types. This maps to specialized instructions (e.g. PSADBW) when available. --- builtins.cpp | 2 ++ builtins/target-avx-x2.ll | 27 ++++++++++++++++++ builtins/target-avx.ll | 46 ++++++++++++++++++++++++------- builtins/target-generic-1.ll | 9 ++++++ builtins/target-generic-common.ll | 7 +++-- builtins/target-neon.ll | 33 ++++++++++++++++++---- builtins/target-sse2-x2.ll | 30 ++++++++++++++++++++ builtins/target-sse2.ll | 30 ++++++++++++++++++++ builtins/target-sse4-16.ll | 30 ++++++++++++++++++++ builtins/target-sse4-8.ll | 27 ++++++++++++++++++ builtins/target-sse4-x2.ll | 30 ++++++++++++++++++++ builtins/target-sse4.ll | 30 ++++++++++++++++++++ docs/ispc.rst | 39 ++++++++++++++++++-------- examples/intrinsics/generic-16.h | 9 +++--- examples/intrinsics/generic-32.h | 9 +++--- examples/intrinsics/generic-64.h | 9 +++--- examples/intrinsics/knc.h | 16 +++++++++++ examples/intrinsics/knc2x.h | 3 ++ examples/intrinsics/sse4.h | 16 +++++++++++ stdlib.ispc | 25 +++++++++++++++-- tests/reduce-add-int16-1.ispc | 21 ++++++++++++++ tests/reduce-add-int16.ispc | 21 ++++++++++++++ tests/reduce-add-int8-1.ispc | 21 ++++++++++++++ tests/reduce-add-int8.ispc | 18 ++++++++++++ 24 files changed, 464 insertions(+), 44 deletions(-) create mode 100644 tests/reduce-add-int16-1.ispc create mode 100644 tests/reduce-add-int16.ispc create mode 100644 tests/reduce-add-int8-1.ispc create mode 100644 tests/reduce-add-int8.ispc diff --git a/builtins.cpp b/builtins.cpp index c4a2f3b5..08472623 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -501,6 +501,8 @@ lSetInternalFunctions(llvm::Module *module) { "__rdrand_i64", "__reduce_add_double", "__reduce_add_float", + "__reduce_add_int8", + "__reduce_add_int16", "__reduce_add_int32", "__reduce_add_int64", "__reduce_equal_double", diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index 8c6b7753..d9e0322b 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -271,6 +271,33 @@ reduce_equal(16) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal int32 ops +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline { + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} 
+ +define internal <16 x i16> @__add_varying_i16(<16 x i16>, + <16 x i16>) nounwind readnone alwaysinline { + %r = add <16 x i16> %0, %1 + ret <16 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline { + reduce16(i16, @__add_varying_i16, @__add_uniform_i16) +} + define <16 x i32> @__add_varying_int32(<16 x i32>, <16 x i32>) nounwind readnone alwaysinline { %s = add <16 x i32> %0, %1 diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index e6ab3a4b..90e2f3ac 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -217,7 +217,6 @@ define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline { ret float %sum } - define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline { reduce8(float, @__min_varying_float, @__min_uniform_float) } @@ -229,6 +228,42 @@ define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline { reduce_equal(8) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int8 ops + +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int16 ops + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal int32 ops @@ -257,20 +292,14 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__max_varying_int32, @__max_uniform_int32) } - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;; horizontal uint32 ops - define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32) } - define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32) } - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal double ops @@ -329,9 +358,6 @@ define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline { } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;; horizontal uint64 ops - define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline { reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) } diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 9b747e2e..3dec76b0 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -471,6 +471,15 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline { ret i64 %call } +define i8 
@__reduce_add_int8(<1 x i8> %v) nounwind readonly alwaysinline { + %r = extractelement <1 x i8> %v, i32 0 + ret i8 %r +} + +define i16 @__reduce_add_int16(<1 x i16> %v) nounwind readonly alwaysinline { + %r = extractelement <1 x i16> %v, i32 0 + ret i16 %r +} define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline { %r = extractelement <1 x float> %v, i32 0 diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index bbf1b842..76d1faf3 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -226,14 +226,16 @@ declare i1 @__any() nounwind readnone declare i1 @__all() nounwind readnone declare i1 @__none() nounwind readnone +declare i16 @__reduce_add_int8() nounwind readnone +declare i32 @__reduce_add_int16() nounwind readnone + declare float @__reduce_add_float() nounwind readnone declare float @__reduce_min_float() nounwind readnone declare float @__reduce_max_float() nounwind readnone -declare i32 @__reduce_add_int32() nounwind readnone +declare i64 @__reduce_add_int32() nounwind readnone declare i32 @__reduce_min_int32() nounwind readnone declare i32 @__reduce_max_int32() nounwind readnone - declare i32 @__reduce_min_uint32() nounwind readnone declare i32 @__reduce_max_uint32() nounwind readnone @@ -244,7 +246,6 @@ declare double @__reduce_max_double() nounwind readnone declare i64 @__reduce_add_int64() nounwind readnone declare i64 @__reduce_min_int64() nounwind readnone declare i64 @__reduce_max_int64() nounwind readnone - declare i64 @__reduce_min_uint64() nounwind readnone declare i64 @__reduce_max_uint64() nounwind readnone diff --git a/builtins/target-neon.ll b/builtins/target-neon.ll index e70b774b..fbeac352 100644 --- a/builtins/target-neon.ll +++ b/builtins/target-neon.ll @@ -509,15 +509,38 @@ define float @__reduce_max_float(<4 x float>) nounwind readnone { neon_reduce(float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) } -define internal i32 @add_i32(i32, i32) { - %r = add i32 %0, %1 +declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone + +define i16 @__reduce_add_int8() nounwind readnone { + %v8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <8 x i32> + %a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %v8) + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 + %r16 = trunc i32 %r to i16 + ret i16 %r16 +} + +declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone + +define i32 @__reduce_add_int16() nounwind readnone { + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %0) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 ret i32 %r } -declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone -define i32 @__reduce_add_int32() nounwind readnone { - neon_reduce(i32, @llvm.arm.neon.vpadd.v2i32, @add_i32) +define i64 @__reduce_add_int32() nounwind readnone { + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %0) + %a0 = extractelement <2 x i64> %a64, i32 0 + %a1 = extractelement <2 x i64> %a64, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r } declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone diff --git a/builtins/target-sse2-x2.ll 
b/builtins/target-sse2-x2.ll index 73361720..da22a66c 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -367,6 +367,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + define <4 x float> @__vec4_add_float(<4 x float> %v0, <4 x float> %v1) nounwind readnone alwaysinline { %v = fadd <4 x float> %v0, %v1 diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index 2bb06391..a6b206b6 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -267,6 +267,36 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <4 x i16> @__add_varying_i16(<4 x i16>, + <4 x i16>) nounwind readnone alwaysinline { + %r = add <4 x i16> %0, %1 + ret <4 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline { + reduce4(i16, @__add_varying_i16, @__add_uniform_i16) +} + define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline { %v1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index 2044fbee..d1563988 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -253,6 +253,36 @@ define i1 @__none(<8 x MASK>) nounwind readnone alwaysinline { ret i1 %meq } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline 
{ + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + define internal <8 x float> @__add_varying_float(<8 x float>, <8 x float>) { %r = fadd <8 x float> %0, %1 ret <8 x float> %r diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index cd8fdce2..85b7bbe7 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -261,6 +261,33 @@ define i1 @__none(<16 x i8>) nounwind readnone alwaysinline { ret i1 %meq } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline { + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <16 x i16> @__add_varying_i16(<16 x i16>, + <16 x i16>) nounwind readnone alwaysinline { + %r = add <16 x i16> %0, %1 + ret <16 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline { + reduce16(i16, @__add_varying_i16, @__add_uniform_i16) +} + define internal <16 x float> @__add_varying_float(<16 x float>, <16 x float>) { %r = fadd <16 x float> %0, %1 ret <16 x float> %r diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index ccae4d51..e2debbc2 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -309,6 +309,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline { reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float) } diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index f622b839..98a7ef69 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -299,6 +299,36 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 
+  ret i16 %r16
+}
+
+define internal <4 x i16> @__add_varying_i16(<4 x i16>,
+                                             <4 x i16>) nounwind readnone alwaysinline {
+  %r = add <4 x i16> %0, %1
+  ret <4 x i16> %r
+}
+
+define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
+  %r = add i16 %0, %1
+  ret i16 %r
+}
+
+define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
+  reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
+}
+
 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
 
 define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
diff --git a/docs/ispc.rst b/docs/ispc.rst
index c6c63172..39d3a5c8 100755
--- a/docs/ispc.rst
+++ b/docs/ispc.rst
@@ -3711,29 +3711,44 @@ instances are added together by the ``reduce_add()`` function.
 
 ::
 
-    uniform float reduce_add(float x)
-    uniform int reduce_add(int x)
-    uniform unsigned int reduce_add(unsigned int x)
+    uniform int16 reduce_add(int8 x)
+    uniform unsigned int16 reduce_add(unsigned int8 x)
+    uniform int32 reduce_add(int16 x)
+    uniform unsigned int32 reduce_add(unsigned int16 x)
+    uniform int64 reduce_add(int32 x)
+    uniform unsigned int64 reduce_add(unsigned int32 x)
+    uniform int64 reduce_add(int64 x)
+    uniform unsigned int64 reduce_add(unsigned int64 x)
 
-You can also use functions to compute the minimum and maximum value of the
-given value across all of the currently-executing program instances.
+    uniform float reduce_add(float x)
+    uniform double reduce_add(double x)
+
+You can also use functions to compute the minimum value of the given value
+across all of the currently-executing program instances.
 
 ::
 
-    uniform float reduce_min(float a)
     uniform int32 reduce_min(int32 a)
     uniform unsigned int32 reduce_min(unsigned int32 a)
-    uniform double reduce_min(double a)
     uniform int64 reduce_min(int64 a)
     uniform unsigned int64 reduce_min(unsigned int64 a)
-    uniform float reduce_max(float a)
+    uniform float reduce_min(float a)
+    uniform double reduce_min(double a)
+
+Equivalent functions are available to compute the maximum of the given
+varying variable over the active program instances.
+
+::
+
     uniform int32 reduce_max(int32 a)
     uniform unsigned int32 reduce_max(unsigned int32 a)
-    uniform double reduce_max(double a)
     uniform int64 reduce_max(int64 a)
     uniform unsigned int64 reduce_max(unsigned int64 a)
+    uniform float reduce_max(float a)
+    uniform double reduce_max(double a)
+
 Finally, you can check to see if a particular value has the same value in
 all of the currently-running program instances:
@@ -3741,9 +3756,10 @@ all of the currently-running program instances:
 
     uniform bool reduce_equal(int32 v)
     uniform bool reduce_equal(unsigned int32 v)
-    uniform bool reduce_equal(float v)
     uniform bool reduce_equal(int64 v)
     uniform bool reduce_equal(unsigned int64 v)
+
+    uniform bool reduce_equal(float v)
     uniform bool reduce_equal(double)
 
 There are also variants of these functions that return the value as a
@@ -3758,10 +3774,11 @@ performance in the `Performance Guide`_.
uniform bool reduce_equal(int32 v, uniform int32 * uniform sameval) uniform bool reduce_equal(unsigned int32 v, uniform unsigned int32 * uniform sameval) - uniform bool reduce_equal(float v, uniform float * uniform sameval) uniform bool reduce_equal(int64 v, uniform int64 * uniform sameval) uniform bool reduce_equal(unsigned int64 v, uniform unsigned int64 * uniform sameval) + + uniform bool reduce_equal(float v, uniform float * uniform sameval) uniform bool reduce_equal(double, uniform double * uniform sameval) If called when none of the program instances are running, diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index 828c1ab4..6d4fe1f4 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1162,19 +1162,20 @@ REDUCE_ADD(double, __vec16_d, __reduce_add_double) REDUCE_MINMAX(double, __vec16_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec16_d, __reduce_max_double, >) -REDUCE_ADD(uint32_t, __vec16_i32, __reduce_add_int32) +REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec16_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec16_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec16_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >) diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 64b82cb1..12c4f84e 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -1231,19 +1231,20 @@ REDUCE_ADD(double, __vec32_d, __reduce_add_double) REDUCE_MINMAX(double, __vec32_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec32_d, __reduce_max_double, >) -REDUCE_ADD(uint32_t, __vec32_i32, __reduce_add_int32) +REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec32_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec32_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec32_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec32_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, __vec32_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec32_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec32_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec32_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec32_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec32_i64, __reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec32_i64, __reduce_max_uint64, >) diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index 7869faa5..a3648f42 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -1364,19 +1364,20 @@ REDUCE_ADD(double, __vec64_d, __reduce_add_double) REDUCE_MINMAX(double, __vec64_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec64_d, __reduce_max_double, >) 
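+// The reduce_add result type is wider than the element type being summed
+// (int8 -> int16, int16 -> int32, int32 -> int64), so the sum over all of
+// the program instances cannot overflow it.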
-REDUCE_ADD(uint32_t, __vec64_i32, __reduce_add_int32) +REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec64_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec64_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec64_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec64_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec64_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec64_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, __vec64_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec64_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec64_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec64_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec64_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec64_i64, __reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec64_i64, __reduce_max_uint64, >) diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index bf383c88..41c4cbc0 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1511,6 +1511,22 @@ static FORCEINLINE int64_t __count_trailing_zeros_i64(const __vec1_i64 mask) { // reductions /////////////////////////////////////////////////////////////////////////// +static FORCEINLINE int16_t __reduce_add_i8(__vec16_i8 v) { + // TODO: improve this! + int16_t ret = 0; + for (int i = 0; i < 16; ++i) + ret += v.v[i]; + return ret; +} + +static FORCEINLINE int32_t __reduce_add_i16(__vec16_i16 v) { + // TODO: improve this! + int32_t ret = 0; + for (int i = 0; i < 16; ++i) + ret += v.v[i]; + return ret; +} + static FORCEINLINE uint32_t __reduce_add_i32(__vec16_i32 v) { return _mm512_reduce_add_epi32(v); } diff --git a/examples/intrinsics/knc2x.h b/examples/intrinsics/knc2x.h index 0041a6c9..5b6e5295 100644 --- a/examples/intrinsics/knc2x.h +++ b/examples/intrinsics/knc2x.h @@ -1607,6 +1607,9 @@ static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { /////////////////////////////////////////////////////////////////////////// // reductions +REDUCE_ADD(int16_t, __vec32_i8, __reduce_add_int8) +REDUCE_ADD(int32_t, __vec32_i16, __reduce_add_int16) + static FORCEINLINE float __reduce_add_float(__vec32_f v) { return _mm512_reduce_add_ps(v.v1) + _mm512_reduce_add_ps(v.v2); } diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index d4739d61..30f90b31 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -2528,6 +2528,22 @@ static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { /////////////////////////////////////////////////////////////////////////// // reductions +static FORCEINLINE int16_t __reduce_add_int8(__vec4_i8 v) { + // TODO: improve + int16_t ret = 0; + for (int i = 0; i < 4; ++i) + ret += v.v[i]; + return ret; +} + +static FORCEINLINE int32_t __reduce_add_int16(__vec4_i16 v) { + // TODO: improve + int32_t ret = 0; + for (int i = 0; i < 4; ++i) + ret += v.v[i]; + return ret; +} + static FORCEINLINE float __reduce_add_float(__vec4_f v) { float r = bits_as_float(_mm_extract_ps(v.v, 0)); r += bits_as_float(_mm_extract_ps(v.v, 1)); diff --git a/stdlib.ispc b/stdlib.ispc index 7e848481..c9c66252 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -887,13 +887,32 @@ static inline uniform double select(uniform bool c, uniform double a, /////////////////////////////////////////////////////////////////////////// // Horizontal ops / reductions +__declspec(safe) +static inline uniform int16 reduce_add(int8 x) { + return __reduce_add_int8(__mask ? 
x : (int8)0); +} + +__declspec(safe) +static inline uniform unsigned int16 reduce_add(unsigned int8 x) { + return __reduce_add_int8(__mask ? x : (int8)0); +} + +__declspec(safe) +static inline uniform int32 reduce_add(int16 x) { + return __reduce_add_int16(__mask ? x : (int16)0); +} + +__declspec(safe) +static inline uniform unsigned int32 reduce_add(unsigned int16 x) { + return __reduce_add_int16(__mask ? x : (int16)0); +} + __declspec(safe) static inline uniform float reduce_add(float x) { // zero the lanes where the mask is off return __reduce_add_float(__mask ? x : 0.); } - __declspec(safe) static inline uniform float reduce_min(float v) { // For the lanes where the mask is off, replace the given value with @@ -915,7 +934,7 @@ static inline uniform float reduce_max(float v) { } __declspec(safe) -static inline uniform int reduce_add(int x) { +static inline uniform int64 reduce_add(int32 x) { // Zero out the values for lanes that aren't running return __reduce_add_int32(__mask ? x : 0); } @@ -937,7 +956,7 @@ static inline uniform int reduce_max(int v) { } __declspec(safe) -static inline uniform unsigned int reduce_add(unsigned int x) { +static inline uniform unsigned int64 reduce_add(unsigned int32 x) { // Set values for non-running lanes to zero so they don't affect the // result. return __reduce_add_int32(__mask ? x : 0); diff --git a/tests/reduce-add-int16-1.ispc b/tests/reduce-add-int16-1.ispc new file mode 100644 index 00000000..58529ca1 --- /dev/null +++ b/tests/reduce-add-int16-1.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + float v = aFOO[programIndex]; + uniform float m; + int16 iv = (int)v; + if (iv & 1) + m = reduce_add(iv); + RET[programIndex] = m; +} + +export void result(uniform float RET[]) { + uniform int x = 0; + for (uniform int i = 1; i <= programCount; i += 2) + x += i; + RET[programIndex] = x; +} + diff --git a/tests/reduce-add-int16.ispc b/tests/reduce-add-int16.ispc new file mode 100644 index 00000000..8657b201 --- /dev/null +++ b/tests/reduce-add-int16.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + float v = aFOO[programIndex]; + uniform float m; + int16 iv = (int)v; +/*CO if (iv & 1)*/ + m = reduce_add(iv); + RET[programIndex] = m; +} + +export void result(uniform float RET[]) { + uniform int x = 0; + for (uniform int i = 1; i <= programCount; ++i) + x += i; + RET[programIndex] = x; +} + diff --git a/tests/reduce-add-int8-1.ispc b/tests/reduce-add-int8-1.ispc new file mode 100644 index 00000000..e5310aae --- /dev/null +++ b/tests/reduce-add-int8-1.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + float v = aFOO[programIndex]; + uniform float m; + int8 iv = (int)v; + if (iv & 1) + m = reduce_add(iv); + RET[programIndex] = m; +} + +export void result(uniform float RET[]) { + uniform int x = 0; + for (uniform int i = 1; i <= programCount; i += 2) + x += i; + RET[programIndex] = x; +} + diff --git a/tests/reduce-add-int8.ispc b/tests/reduce-add-int8.ispc new file mode 100644 index 00000000..7e0dd027 --- /dev/null +++ b/tests/reduce-add-int8.ispc @@ -0,0 +1,18 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int8 db = b-4; + int8 
iv = programIndex + db; + int m = reduce_add(iv); + RET[programIndex] = m; +} + +export void result(uniform float RET[]) { + uniform int x = 0; + for (uniform int i = 1; i <= programCount; ++i) + x += i; + RET[programIndex] = x; +} + From ab3b633733ec05f3778e46f792a98844e9ee5900 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Mon, 29 Jul 2013 16:14:58 -0700 Subject: [PATCH 14/34] Add 8-bit and 16-bit specialized NEON targets. Like SSE4-8 and SSE4-16, these use 8-bit and 16-bit values for mask elements, respectively, and thus should generate the best code when used for computation with datatypes of those sizes. --- Makefile | 6 +- builtins.cpp | 28 +- builtins/target-neon-16.ll | 458 ++++++++++++++++ .../{target-neon.ll => target-neon-32.ll} | 305 +---------- builtins/target-neon-8.ll | 508 ++++++++++++++++++ builtins/target-neon-common.ll | 351 ++++++++++++ builtins/util.m4 | 120 ++++- ispc.cpp | 41 +- ispc.h | 3 +- ispc.vcxproj | 111 ++-- module.cpp | 1 + run_tests.py | 2 +- 12 files changed, 1561 insertions(+), 373 deletions(-) create mode 100644 builtins/target-neon-16.ll rename builtins/{target-neon.ll => target-neon-32.ll} (62%) create mode 100644 builtins/target-neon-8.ll create mode 100644 builtins/target-neon-common.ll diff --git a/Makefile b/Makefile index fc064dbd..98729bfc 100644 --- a/Makefile +++ b/Makefile @@ -122,8 +122,10 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ type.cpp util.cpp HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h -TARGETS=neon avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \ - sse4-8 sse4-16 generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 +TARGETS=neon-32 neon-16 neon-8 \ + avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ + sse2 sse2-x2 sse4 sse4-x2 sse4-8 sse4-16 \ + generic-1 generic-4 generic-8 generic-16 generic-32 generic-64 # These files need to be compiled in two versions - 32 and 64 bits. BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) # These are files to be compiled in single version. diff --git a/builtins.cpp b/builtins.cpp index 08472623..e671a491 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -657,7 +657,9 @@ AddBitcodeToModule(const unsigned char *bitcode, int length, // the values for an ARM target. This maybe won't cause problems // in the generated code, since bulitins.c doesn't do anything too // complex w.r.t. struct layouts, etc. - if (g->target->getISA() != Target::NEON) + if (g->target->getISA() != Target::NEON32 && + g->target->getISA() != Target::NEON16 && + g->target->getISA() != Target::NEON8) #endif // !__arm__ { Assert(bcTriple.getArch() == llvm::Triple::UnknownArch || @@ -820,12 +822,30 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // Next, add the target's custom implementations of the various needed // builtin functions (e.g. __masked_store_32(), etc). 
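+    // For the NEON targets, the bitcode module suffix encodes the mask
+    // element width (8, 16, or 32 bits) and whether the 32-bit or 64-bit
+    // runtime is being targeted.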
switch (g->target->getISA()) { - case Target::NEON: { + case Target::NEON8: { if (runtime32) { - EXPORT_MODULE(builtins_bitcode_neon_32bit); + EXPORT_MODULE(builtins_bitcode_neon_8_32bit); } else { - EXPORT_MODULE(builtins_bitcode_neon_64bit); + EXPORT_MODULE(builtins_bitcode_neon_8_64bit); + } + break; + } + case Target::NEON16: { + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_neon_16_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_neon_16_64bit); + } + break; + } + case Target::NEON32: { + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_neon_32_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_neon_32_64bit); } break; } diff --git a/builtins/target-neon-16.ll b/builtins/target-neon-16.ll new file mode 100644 index 00000000..fd15eb0b --- /dev/null +++ b/builtins/target-neon-16.ll @@ -0,0 +1,458 @@ +;; +;; target-neon-16.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`8') +define(`MASK',`i16') + +include(`util.m4') +include(`target-neon-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone { + unary4to8conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v) + ret <8 x float> %r +} + +define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone { + unary4to8conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v) + ret <8 x i16> %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? 
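+;; The approach borrowed from SSE2 works like this: clear the sign bit to get
+;; |x|, add and then subtract 2^23 (8.388608e+06) so that the float mantissa
+;; drops its fractional bits and |x| is rounded to the nearest integer (valid
+;; for |x| < 2^23), then xor the saved sign bit back in.  floor() and ceil()
+;; are derived from round() by adding -1.0 or +1.0 in the lanes where the
+;; rounded value overshot or undershot the original value.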
+ +define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast <8 x float> %0 to <8 x i32> + %bitop.i.i = and <8 x i32> %float_to_int_bitcast.i.i.i.i, + + %bitop.i = xor <8 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i40.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %int_to_float_bitcast.i.i40.i, + + %binop21.i = fadd <8 x float> %binop.i, + + %float_to_int_bitcast.i.i.i = bitcast <8 x float> %binop21.i to <8 x i32> + %bitop31.i = xor <8 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop31.i to <8 x float> + ret <8 x float> %int_to_float_bitcast.i.i.i +} + +define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind + %bincmp.i = fcmp ogt <8 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32> + %bitop.i = and <8 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <8 x float> %binop.i +} + +define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind + %bincmp.i = fcmp olt <8 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32> + %bitop.i = and <8 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <8 x float> %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare @__round_varying_double() nounwind readnone +declare @__floor_varying_double() nounwind readnone +declare @__ceil_varying_double() nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__max_varying_float(, + ) nounwind readnone { + binary4to8(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1) + ret %r +} + +define @__min_varying_float(, + ) nounwind readnone { + binary4to8(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1) + ret %r +} + +declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +define @__min_varying_int32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1) + ret %r +} + +define @__max_varying_int32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1) + ret %r +} + +define @__min_varying_uint32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1) + ret %r +} + +define @__max_varying_uint32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1) + ret %r +} + +;; sqrt/rsqrt/rcp + +declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) 
nounwind readnone + +define @__rcp_varying_float( %d) nounwind readnone { + unary4to8(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d) + binary4to8(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0) + %x1 = fmul %x0, %x0_nr + binary4to8(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %d) nounwind readnone { + unary4to8(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d) + %x0_2 = fmul %x0, %x0 + binary4to8(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2) + %x1 = fmul %x0, %x0_nr + %x1_2 = fmul %x1, %x1 + binary4to8(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +define float @__rsqrt_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <8 x i32> + %vr = call <8 x float> @__rsqrt_varying_float(<8 x float> %vs) + %r = extractelement <8 x float> %vr, i32 0 + ret float %r +} + +define float @__rcp_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <8 x i32> + %vr = call <8 x float> @__rcp_varying_float(<8 x float> %vs) + %r = extractelement <8 x float> %vr, i32 0 + ret float %r +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + +define @__sqrt_varying_float() nounwind readnone { + unary4to8(result, float, @llvm.sqrt.v4f32, %0) +;; this returns nan for v=0, which is undesirable.. +;; %rsqrt = call @__rsqrt_varying_float( %0) +;; %result = fmul <4 x float> %rsqrt, %0 + ret <8 x float> %result +} + +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) + +define @__sqrt_varying_double() nounwind readnone { + unary4to8(r, double, @llvm.sqrt.v4f64, %0) + ret %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reductions + +define i64 @__movmsk() nounwind readnone { + %and_mask = and %0, + + %v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %and_mask) + %v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4) + %va = extractelement <2 x i64> %v2, i32 0 + %vb = extractelement <2 x i64> %v2, i32 1 + %v = or i64 %va, %vb + ret i64 %v +} + +define i1 @__any() nounwind readnone alwaysinline { + v8tov4(MASK, %0, %v0123, %v4567) + %vor = or <4 x MASK> %v0123, %v4567 + %v0 = extractelement <4 x MASK> %vor, i32 0 + %v1 = extractelement <4 x MASK> %vor, i32 1 + %v2 = extractelement <4 x MASK> %vor, i32 2 + %v3 = extractelement <4 x MASK> %vor, i32 3 + %v01 = or MASK %v0, %v1 + %v23 = or MASK %v2, %v3 + %v = or MASK %v01, %v23 + %cmp = icmp ne MASK %v, 0 + ret i1 %cmp +} + +define i1 @__all() nounwind readnone alwaysinline { + v8tov4(MASK, %0, %v0123, %v4567) + %vand = and <4 x MASK> %v0123, %v4567 + %v0 = extractelement <4 x MASK> %vand, i32 0 + %v1 = extractelement <4 x MASK> %vand, i32 1 + %v2 = extractelement <4 x MASK> %vand, i32 2 + %v3 = extractelement <4 x MASK> %vand, i32 3 + %v01 = and MASK %v0, %v1 + %v23 = and MASK %v2, %v3 + %v = and MASK %v01, %v23 + %cmp = icmp ne MASK %v, 0 + ret i1 %cmp +} + +define i1 @__none() nounwind readnone alwaysinline { + %any = call i1 @__any( %0) + %none = icmp eq i1 %any, 0 + ret i1 %none +} + +;; $1: scalar type +;; $2: vector/vector reduce function (2 x -> ) +;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>) 
+;; $4: scalar reduce function + +define(`neon_reduce', ` + v8tov4($1, %0, %v0123, %v4567) + %v0123_8 = shufflevector <4 x $1> %v0123, <4 x $1> undef, + <8 x i32> + %v4567_8 = shufflevector <4 x $1> %v4567, <4 x $1> undef, + <8 x i32> + %vfirst = call <8 x $1> $2(<8 x $1> %v0123_8, <8 x $1> %v4567_8) + %vfirst_4 = shufflevector <8 x $1> %vfirst, <8 x $1> undef, + <4 x i32> + v4tov2($1, %vfirst_4, %v0, %v1) + %vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1) + %vh0 = extractelement <2 x $1> %vh, i32 0 + %vh1 = extractelement <2 x $1> %vh, i32 1 + %r = call $1 $4($1 %vh0, $1 %vh1) + ret $1 %r +') + +declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @add_f32(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define internal @__add_varying_float(, ) { + %r = fadd %0, %1 + ret %r +} + +define float @__reduce_add_float() nounwind readnone { + neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @min_f32(float, float) { + %cmp = fcmp olt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_min_float() nounwind readnone { + neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @max_f32(float, float) { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_max_float() nounwind readnone { + neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) +} + +declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone + +define i16 @__reduce_add_int8() nounwind readnone { + %a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %0) + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 + %r16 = trunc i32 %r to i16 + ret i16 %r16 +} + +declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16() + +define i64 @__reduce_add_int16() nounwind readnone { + %a1 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16( %0) + %a2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a1) + %aa = extractelement <2 x i64> %a2, i32 0 + %ab = extractelement <2 x i64> %a2, i32 1 + %r = add i64 %aa, %ab + ret i64 %r +} + +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone + +define i64 @__reduce_add_int32() nounwind readnone { + v8tov4(i32, %0, %va, %vb) + %pa = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va) + %pb = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb) + %psum = add <2 x i64> %pa, %pb + %a0 = extractelement <2 x i64> %psum, i32 0 + %a1 = extractelement <2 x i64> %psum, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_si32(i32, i32) { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_int32() nounwind readnone { + neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32) +} + +declare <2 x i32> 
@llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_si32(i32, i32) { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_int32() nounwind readnone { + neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_ui32(i32, i32) { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_uint32() nounwind readnone { + neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpmins.v2i32, @min_ui32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_ui32(i32, i32) { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_uint32() nounwind readnone { + neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxs.v2i32, @max_ui32) +} + +define double @__reduce_add_double() nounwind readnone { + v8tov2(double, %0, %v0, %v1, %v2, %v3) + %v01 = fadd <2 x double> %v0, %v1 + %v23 = fadd <2 x double> %v2, %v3 + %sum = fadd <2 x double> %v01, %v23 + %e0 = extractelement <2 x double> %sum, i32 0 + %e1 = extractelement <2 x double> %sum, i32 1 + %m = fadd double %e0, %e1 + ret double %m +} + +define double @__reduce_min_double() nounwind readnone { + reduce8(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double() nounwind readnone { + reduce8(double, @__max_varying_double, @__max_uniform_double) +} + +define i64 @__reduce_add_int64() nounwind readnone { + v8tov2(i64, %0, %v0, %v1, %v2, %v3) + %v01 = add <2 x i64> %v0, %v1 + %v23 = add <2 x i64> %v2, %v3 + %sum = add <2 x i64> %v01, %v23 + %e0 = extractelement <2 x i64> %sum, i32 0 + %e1 = extractelement <2 x i64> %sum, i32 1 + %m = add i64 %e0, %e1 + ret i64 %m +} + +define i64 @__reduce_min_int64() nounwind readnone { + reduce8(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64() nounwind readnone { + reduce8(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64() nounwind readnone { + reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64() nounwind readnone { + reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) +} diff --git a/builtins/target-neon.ll b/builtins/target-neon-32.ll similarity index 62% rename from builtins/target-neon.ll rename to builtins/target-neon-32.ll index fbeac352..1f8003d7 100644 --- a/builtins/target-neon.ll +++ b/builtins/target-neon-32.ll @@ -1,5 +1,5 @@ ;; -;; target-neon.ll +;; target-neon-32.ll ;; ;; Copyright(c) 2012-2013 Matt Pharr ;; Copyright(c) 2013 Google, Inc. @@ -34,52 +34,20 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
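+;; The datalayout string and the scalar/uniform helpers that this file used to
+;; define are presumably provided by target-neon-common.ll (included below),
+;; so only the 4-wide varying code remains here.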
-target datalayout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-a0:0:64-n32" - define(`WIDTH',`4') - define(`MASK',`i32') include(`util.m4') - -stdlib_core() -scans() -reduce_equal(WIDTH) -rdrand_decls() -define_shuffles() -aossoa() -ctlztz() +include(`target-neon-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines -declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone -declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone - -define float @__half_to_float_uniform(i16 %v) nounwind readnone { - %v1 = bitcast i16 %v to <1 x i16> - %vec = shufflevector <1 x i16> %v1, <1 x i16> undef, - <4 x i32> - %h = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vec) - %r = extractelement <4 x float> %h, i32 0 - ret float %r -} - define <4 x float> @__half_to_float_varying(<4 x i16> %v) nounwind readnone { %r = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %v) ret <4 x float> %r } -define i16 @__float_to_half_uniform(float %v) nounwind readnone { - %v1 = bitcast float %v to <1 x float> - %vec = shufflevector <1 x float> %v1, <1 x float> undef, - <4 x i32> - %h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %vec) - %r = extractelement <4 x i16> %h, i32 0 - ret i16 %r -} - - define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone { %r = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %v) ret <4 x i16> %r @@ -88,48 +56,11 @@ define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; math -define void @__fastmath() nounwind { - ret void -} - ;; round/floor/ceil ;; FIXME: grabbed these from the sse2 target, which does not have native ;; instructions for these. Is there a better approach for NEON? 
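+;; (Same trick as in the 8- and 16-wide NEON targets: round via the 2^23
+;; add/subtract on the sign-cleared value, then derive floor/ceil by adding
+;; -1.0/+1.0 where the rounded result overshoots/undershoots the input.)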
-define float @__round_uniform_float(float) nounwind readonly alwaysinline { - %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32 - %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648 - %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i - %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06 - %binop21.i = fadd float %binop.i, -8.388608e+06 - %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32 - %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float - ret float %int_to_float_bitcast.i.i.i -} - -define float @__floor_uniform_float(float) nounwind readonly alwaysinline { - %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind - %bincmp.i = fcmp ogt float %calltmp.i, %0 - %selectexpr.i = sext i1 %bincmp.i to i32 - %bitop.i = and i32 %selectexpr.i, -1082130432 - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i - ret float %binop.i -} - -define float @__ceil_uniform_float(float) nounwind readonly alwaysinline { - %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind - %bincmp.i = fcmp olt float %calltmp.i, %0 - %selectexpr.i = sext i1 %bincmp.i to i32 - %bitop.i = and i32 %selectexpr.i, 1065353216 - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i - ret float %binop.i -} - define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline { %float_to_int_bitcast.i.i.i.i = bitcast <4 x float> %0 to <4 x i32> %bitop.i.i = and <4 x i32> %float_to_int_bitcast.i.i.i.i, @@ -164,10 +95,6 @@ define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysin } ;; FIXME: rounding doubles and double vectors needs to be implemented -declare double @__round_uniform_double(double) nounwind readnone -declare double @__floor_uniform_double(double) nounwind readnone -declare double @__ceil_uniform_double(double) nounwind readnone - declare @__round_varying_double() nounwind readnone declare @__floor_varying_double() nounwind readnone declare @__ceil_varying_double() nounwind readnone @@ -175,78 +102,6 @@ declare @__ceil_varying_double() nounwind readn ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; min/max -define float @__max_uniform_float(float, float) nounwind readnone { - %cmp = fcmp ugt float %0, %1 - %r = select i1 %cmp, float %0, float %1 - ret float %r -} - -define float @__min_uniform_float(float, float) nounwind readnone { - %cmp = fcmp ult float %0, %1 - %r = select i1 %cmp, float %0, float %1 - ret float %r -} - -define i32 @__min_uniform_int32(i32, i32) nounwind readnone { - %cmp = icmp slt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__max_uniform_int32(i32, i32) nounwind readnone { - %cmp = icmp sgt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__min_uniform_uint32(i32, i32) nounwind readnone { - %cmp = icmp ult i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__max_uniform_uint32(i32, i32) nounwind readnone { - %cmp = icmp ugt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i64 @__min_uniform_int64(i64, i64) nounwind readnone { - %cmp = icmp slt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__max_uniform_int64(i64, 
i64) nounwind readnone { - %cmp = icmp sgt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__min_uniform_uint64(i64, i64) nounwind readnone { - %cmp = icmp ult i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__max_uniform_uint64(i64, i64) nounwind readnone { - %cmp = icmp ugt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define double @__min_uniform_double(double, double) nounwind readnone { - %cmp = fcmp olt double %0, %1 - %r = select i1 %cmp, double %0, double %1 - ret double %r -} - -define double @__max_uniform_double(double, double) nounwind readnone { - %cmp = fcmp ogt double %0, %1 - %r = select i1 %cmp, double %0, double %1 - ret double %r -} - declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone @@ -287,44 +142,6 @@ define @__max_varying_uint32(, ) nounwin ret <4 x i32> %r } -define @__min_varying_int64(, ) nounwind readnone { - %m = icmp slt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_int64(, ) nounwind readnone { - %m = icmp sgt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__min_varying_uint64(, ) nounwind readnone { - %m = icmp ult %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_uint64(, ) nounwind readnone { - %m = icmp ugt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__min_varying_double(, - ) nounwind readnone { - %m = fcmp olt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_double(, - ) nounwind readnone { - %m = fcmp ogt %0, %1 - %r = select %m, %0, %1 - ret %r -} - ;; sqrt/rsqrt/rcp declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone @@ -371,13 +188,6 @@ define float @__rcp_uniform_float(float) nounwind readnone { ret float %r } -declare float @llvm.sqrt.f32(float) - -define float @__sqrt_uniform_float(float) nounwind readnone { - %r = call float @llvm.sqrt.f32(float %0) - ret float %r -} - declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) define @__sqrt_varying_float() nounwind readnone { @@ -388,13 +198,6 @@ define @__sqrt_varying_float() nounwind readnone ret <4 x float> %result } -declare double @llvm.sqrt.f64(double) - -define double @__sqrt_uniform_double(double) nounwind readnone { - %r = call double @llvm.sqrt.f64(double %0) - ret double %r -} - declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) define @__sqrt_varying_double() nounwind readnone { @@ -402,21 +205,6 @@ define @__sqrt_varying_double() nounwind readno ret <4 x double> %r } -;; bit ops - -declare i32 @llvm.ctpop.i32(i32) nounwind readnone -declare i64 @llvm.ctpop.i64(i64) nounwind readnone - -define i32 @__popcnt_int32(i32) nounwind readnone { - %v = call i32 @llvm.ctpop.i32(i32 %0) - ret i32 %v -} - -define i64 @__popcnt_int64(i64) nounwind readnone { - %v = call i64 @llvm.ctpop.i64(i64 %0) - ret i64 %v -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions @@ -638,92 +426,3 @@ define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone { define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone { reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64) } - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; unaligned loads/loads+broadcasts - -masked_load(i8, 1) -masked_load(i16, 2) -masked_load(i32, 4) -masked_load(float, 4) -masked_load(i64, 8) -masked_load(double, 8) - -gen_masked_store(i8) 
-gen_masked_store(i16) -gen_masked_store(i32) -gen_masked_store(i64) -masked_store_float_double() - -define void @__masked_store_blend_i8(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i8> %new, <4 x i8> %old - store %result, * %ptr - ret void -} - -define void @__masked_store_blend_i16(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i16> %new, <4 x i16> %old - store %result, * %ptr - ret void -} - -define void @__masked_store_blend_i32(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i32> %new, <4 x i32> %old - store %result, * %ptr - ret void -} - -define void @__masked_store_blend_i64(* nocapture %ptr, - %new, %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i64> %new, <4 x i64> %old - store %result, * %ptr - ret void -} - -;; yuck. We need declarations of these, even though we shouldnt ever -;; actually generate calls to them for the NEON target... - -declare @__svml_sin() -declare @__svml_cos() -declare void @__svml_sincos(, *, *) -declare @__svml_tan() -declare @__svml_atan() -declare @__svml_atan2(, ) -declare @__svml_exp() -declare @__svml_log() -declare @__svml_pow(, ) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; gather - -gen_gather_factored(i8) -gen_gather_factored(i16) -gen_gather_factored(i32) -gen_gather_factored(float) -gen_gather_factored(i64) -gen_gather_factored(double) - -gen_scatter(i8) -gen_scatter(i16) -gen_scatter(i32) -gen_scatter(float) -gen_scatter(i64) -gen_scatter(double) - -packed_load_and_store(4) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; prefetch - -define_prefetches() diff --git a/builtins/target-neon-8.ll b/builtins/target-neon-8.ll new file mode 100644 index 00000000..eb65f224 --- /dev/null +++ b/builtins/target-neon-8.ll @@ -0,0 +1,508 @@ +;; +;; target-neon-8.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`16') +define(`MASK',`i8') + +include(`util.m4') +include(`target-neon-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone { + unary4to16conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v) + ret <16 x float> %r +} + +define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone { + unary4to16conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v) + ret <16 x i16> %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast <16 x float> %0 to <16 x i32> + %bitop.i.i = and <16 x i32> %float_to_int_bitcast.i.i.i.i, + + %bitop.i = xor <16 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i40.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %int_to_float_bitcast.i.i40.i, + + %binop21.i = fadd <16 x float> %binop.i, + + %float_to_int_bitcast.i.i.i = bitcast <16 x float> %binop21.i to <16 x i32> + %bitop31.i = xor <16 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop31.i to <16 x float> + ret <16 x float> %int_to_float_bitcast.i.i.i +} + +define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind + %bincmp.i = fcmp ogt <16 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32> + %bitop.i = and <16 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <16 x float> %binop.i +} + +define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind + %bincmp.i = fcmp olt <16 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32> + %bitop.i = and <16 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <16 x float> %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare @__round_varying_double() nounwind readnone +declare @__floor_varying_double() nounwind readnone +declare @__ceil_varying_double() nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind 
readnone +declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__max_varying_float(, + ) nounwind readnone { + binary4to16(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1) + ret %r +} + +define @__min_varying_float(, + ) nounwind readnone { + binary4to16(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1) + ret %r +} + +declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +define @__min_varying_int32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1) + ret %r +} + +define @__max_varying_int32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1) + ret %r +} + +define @__min_varying_uint32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1) + ret %r +} + +define @__max_varying_uint32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1) + ret %r +} + +;; sqrt/rsqrt/rcp + +declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rcp_varying_float( %d) nounwind readnone { + unary4to16(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d) + binary4to16(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0) + %x1 = fmul %x0, %x0_nr + binary4to16(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %d) nounwind readnone { + unary4to16(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d) + %x0_2 = fmul %x0, %x0 + binary4to16(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2) + %x1 = fmul %x0, %x0_nr + %x1_2 = fmul %x1, %x1 + binary4to16(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +define float @__rsqrt_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <16 x i32> + %vr = call <16 x float> @__rsqrt_varying_float(<16 x float> %vs) + %r = extractelement <16 x float> %vr, i32 0 + ret float %r +} + +define float @__rcp_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <16 x i32> + %vr = call <16 x float> @__rcp_varying_float(<16 x float> %vs) + %r = extractelement <16 x float> %vr, i32 0 + ret float %r +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + +define @__sqrt_varying_float() nounwind readnone { + unary4to16(result, float, @llvm.sqrt.v4f32, %0) +;; this returns nan for v=0, which is undesirable.. 
+;; %rsqrt = call @__rsqrt_varying_float( %0) +;; %result = fmul <4 x float> %rsqrt, %0 + ret <16 x float> %result +} + +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) + +define @__sqrt_varying_double() nounwind readnone { + unary4to16(r, double, @llvm.sqrt.v4f64, %0) + ret %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reductions + +define i64 @__movmsk() nounwind readnone { + %and_mask = and %0, + + %v8 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %and_mask) + %v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %v8) + %v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4) + %va = extractelement <2 x i64> %v2, i32 0 + %vb = extractelement <2 x i64> %v2, i32 1 + %vbshift = shl i64 %vb, 8 + %v = or i64 %va, %vbshift + ret i64 %v +} + +define i1 @__any() nounwind readnone alwaysinline { + v16tov8(MASK, %0, %v8a, %v8b) + %vor8 = or <8 x MASK> %v8a, %v8b + %v16 = sext <8 x i8> %vor8 to <8 x i16> + v8tov4(i16, %v16, %v16a, %v16b) + %vor16 = or <4 x i16> %v16a, %v16b + %v32 = sext <4 x i16> %vor16 to <4 x i32> + v4tov2(i32, %v32, %v32a, %v32b) + %vor32 = or <2 x i32> %v32a, %v32b + %v0 = extractelement <2 x i32> %vor32, i32 0 + %v1 = extractelement <2 x i32> %vor32, i32 1 + %v = or i32 %v0, %v1 + %cmp = icmp ne i32 %v, 0 + ret i1 %cmp +} + +define i1 @__all() nounwind readnone alwaysinline { + v16tov8(MASK, %0, %v8a, %v8b) + %vand8 = and <8 x MASK> %v8a, %v8b + %v16 = sext <8 x i8> %vand8 to <8 x i16> + v8tov4(i16, %v16, %v16a, %v16b) + %vand16 = and <4 x i16> %v16a, %v16b + %v32 = sext <4 x i16> %vand16 to <4 x i32> + v4tov2(i32, %v32, %v32a, %v32b) + %vand32 = and <2 x i32> %v32a, %v32b + %v0 = extractelement <2 x i32> %vand32, i32 0 + %v1 = extractelement <2 x i32> %vand32, i32 1 + %v = and i32 %v0, %v1 + %cmp = icmp ne i32 %v, 0 + ret i1 %cmp +} + +define i1 @__none() nounwind readnone alwaysinline { + %any = call i1 @__any( %0) + %none = icmp eq i1 %any, 0 + ret i1 %none +} + +;; $1: scalar type +;; $2: vector/vector reduce function (2 x -> ) +;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>) +;; $4: scalar reduce function + +define(`neon_reduce', ` + v16tov8($1, %0, %va, %vb) + %va_16 = shufflevector <8 x $1> %va, <8 x $1> undef, + <16 x i32> + %vb_16 = shufflevector <8 x $1> %vb, <8 x $1> undef, + <16 x i32> + %v8 = call <16 x $1> $2(<16 x $1> %va_16, <16 x $1> %vb_16) + + %v8a = shufflevector <16 x $1> %v8, <16 x $1> undef, + <16 x i32> + %v8b = shufflevector <16 x $1> %v8, <16 x $1> undef, + <16 x i32> + + %v4 = call <16 x $1> $2(<16 x $1> %v8a, <16 x $1> %v8b) + + %vfirst_4 = shufflevector <16 x $1> %v4, <16 x $1> undef, + <4 x i32> + v4tov2($1, %vfirst_4, %v0, %v1) + %vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1) + %vh0 = extractelement <2 x $1> %vh, i32 0 + %vh1 = extractelement <2 x $1> %vh, i32 1 + %r = call $1 $4($1 %vh0, $1 %vh1) + ret $1 %r +') + +declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @add_f32(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define internal @__add_varying_float(, ) { + %r = fadd %0, %1 + ret %r +} + +define float @__reduce_add_float() nounwind readnone { + neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @min_f32(float, float) { + %cmp = fcmp olt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret 
float %r +} + +define float @__reduce_min_float() nounwind readnone { + neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @max_f32(float, float) { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_max_float() nounwind readnone { + neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) +} + +declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone + +define i64 @__reduce_add_int8() nounwind readnone { + %a16 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %0) + %a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a16) + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32) + %a0 = extractelement <2 x i64> %a64, i32 0 + %a1 = extractelement <2 x i64> %a64, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +define i64 @__reduce_add_int16() nounwind readnone { + v16tov8(i16, %0, %va, %vb) + %a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %va) + %b32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %vb) + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32) + %b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %b32) + %sum = add <2 x i64> %a64, %b64 + %a0 = extractelement <2 x i64> %sum, i32 0 + %a1 = extractelement <2 x i64> %sum, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +define i64 @__reduce_add_int32() nounwind readnone { + v16tov4(i32, %0, %va, %vb, %vc, %vd) + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va) + %b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb) + %c64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vc) + %d64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vd) + %ab = add <2 x i64> %a64, %b64 + %cd = add <2 x i64> %c64, %d64 + %sum = add <2 x i64> %ab, %cd + %a0 = extractelement <2 x i64> %sum, i32 0 + %a1 = extractelement <2 x i64> %sum, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_si32(i32, i32) { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_int32() nounwind readnone { + neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_si32(i32, i32) { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_int32() nounwind readnone { + neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_ui32(i32, i32) { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_uint32() nounwind readnone { + neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpmins.v2i32, @min_ui32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 
@max_ui32(i32, i32) { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_uint32() nounwind readnone { + neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxs.v2i32, @max_ui32) +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define internal @__add_varying_double(, ) { + %r = fadd %0, %1 + ret %r +} + +define double @__reduce_add_double() nounwind readnone { + reduce16(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double() nounwind readnone { + reduce16(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double() nounwind readnone { + reduce16(double, @__max_varying_double, @__max_uniform_double) +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define internal @__add_varying_int64(, ) { + %r = add %0, %1 + ret %r +} + +define i64 @__reduce_add_int64() nounwind readnone { + reduce16(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define i64 @__reduce_min_int64() nounwind readnone { + reduce16(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64() nounwind readnone { + reduce16(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64() nounwind readnone { + reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64() nounwind readnone { + reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64) +} diff --git a/builtins/target-neon-common.ll b/builtins/target-neon-common.ll new file mode 100644 index 00000000..696b0748 --- /dev/null +++ b/builtins/target-neon-common.ll @@ -0,0 +1,351 @@ +;; +;; target-neon-common.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
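;; Illustrative note (not part of the original patch): the 16-wide NEON
;; target above implements operations wider than the 128-bit NEON
;; registers by splitting values into <4 x ...> pieces, applying the
;; 4-wide intrinsic to each piece, and reassembling the result with
;; shufflevector.  For example,
;;   binary4to16(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1)
;; in @__max_varying_float expands to lane-extracting shufflevectors for
;; lanes 0-3, 4-7, 8-11, and 12-15 of each operand, four calls to
;; @llvm.arm.neon.vmaxs.v4f32, and concatenating shufflevectors that
;; rebuild the 16-wide %r; the unary4to16() definition in the
;; builtins/util.m4 hunk later in this series shows the one-operand
;; analogue of this expansion.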
+ +target datalayout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-a0:0:64-n32" + +stdlib_core() +scans() +reduce_equal(WIDTH) +rdrand_decls() +define_shuffles() +aossoa() +ctlztz() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone + +define float @__half_to_float_uniform(i16 %v) nounwind readnone { + %v1 = bitcast i16 %v to <1 x i16> + %vec = shufflevector <1 x i16> %v1, <1 x i16> undef, + <4 x i32> + %h = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vec) + %r = extractelement <4 x float> %h, i32 0 + ret float %r +} + +define i16 @__float_to_half_uniform(float %v) nounwind readnone { + %v1 = bitcast float %v to <1 x float> + %vec = shufflevector <1 x float> %v1, <1 x float> undef, + <4 x i32> + %h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %vec) + %r = extractelement <4 x i16> %h, i32 0 + ret i16 %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +define void @__fastmath() nounwind { + ret void +} + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define float @__round_uniform_float(float) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32 + %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648 + %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i + %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06 + %binop21.i = fadd float %binop.i, -8.388608e+06 + %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32 + %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float + ret float %int_to_float_bitcast.i.i.i +} + +define float @__floor_uniform_float(float) nounwind readonly alwaysinline { + %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind + %bincmp.i = fcmp ogt float %calltmp.i, %0 + %selectexpr.i = sext i1 %bincmp.i to i32 + %bitop.i = and i32 %selectexpr.i, -1082130432 + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i + ret float %binop.i +} + +define float @__ceil_uniform_float(float) nounwind readonly alwaysinline { + %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind + %bincmp.i = fcmp olt float %calltmp.i, %0 + %selectexpr.i = sext i1 %bincmp.i to i32 + %bitop.i = and i32 %selectexpr.i, 1065353216 + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i + ret float %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare double @__round_uniform_double(double) nounwind readnone +declare double @__floor_uniform_double(double) nounwind readnone +declare double @__ceil_uniform_double(double) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +define float @__max_uniform_float(float, float) nounwind readnone { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__min_uniform_float(float, 
float) nounwind readnone { + %cmp = fcmp ult float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define i32 @__min_uniform_int32(i32, i32) nounwind readnone { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__max_uniform_int32(i32, i32) nounwind readnone { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__min_uniform_uint32(i32, i32) nounwind readnone { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__max_uniform_uint32(i32, i32) nounwind readnone { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i64 @__min_uniform_int64(i64, i64) nounwind readnone { + %cmp = icmp slt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__max_uniform_int64(i64, i64) nounwind readnone { + %cmp = icmp sgt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__min_uniform_uint64(i64, i64) nounwind readnone { + %cmp = icmp ult i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__max_uniform_uint64(i64, i64) nounwind readnone { + %cmp = icmp ugt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define double @__min_uniform_double(double, double) nounwind readnone { + %cmp = fcmp olt double %0, %1 + %r = select i1 %cmp, double %0, double %1 + ret double %r +} + +define double @__max_uniform_double(double, double) nounwind readnone { + %cmp = fcmp ogt double %0, %1 + %r = select i1 %cmp, double %0, double %1 + ret double %r +} + +define @__min_varying_int64(, ) nounwind readnone { + %m = icmp slt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_int64(, ) nounwind readnone { + %m = icmp sgt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__min_varying_uint64(, ) nounwind readnone { + %m = icmp ult %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_uint64(, ) nounwind readnone { + %m = icmp ugt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__min_varying_double(, + ) nounwind readnone { + %m = fcmp olt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_double(, + ) nounwind readnone { + %m = fcmp ogt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +;; sqrt/rsqrt/rcp + +declare float @llvm.sqrt.f32(float) + +define float @__sqrt_uniform_float(float) nounwind readnone { + %r = call float @llvm.sqrt.f32(float %0) + ret float %r +} + +declare double @llvm.sqrt.f64(double) + +define double @__sqrt_uniform_double(double) nounwind readnone { + %r = call double @llvm.sqrt.f64(double %0) + ret double %r +} + +;; bit ops + +declare i32 @llvm.ctpop.i32(i32) nounwind readnone +declare i64 @llvm.ctpop.i64(i64) nounwind readnone + +define i32 @__popcnt_int32(i32) nounwind readnone { + %v = call i32 @llvm.ctpop.i32(i32 %0) + ret i32 %v +} + +define i64 @__popcnt_int64(i64) nounwind readnone { + %v = call i64 @llvm.ctpop.i64(i64 %0) + ret i64 %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) +masked_store_float_double() + +define void @__masked_store_blend_i8(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc 
%mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i16(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i32(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i64(* nocapture %ptr, + %new, %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +;; yuck. We need declarations of these, even though we shouldnt ever +;; actually generate calls to them for the NEON target... + +declare @__svml_sin() +declare @__svml_cos() +declare void @__svml_sincos(, *, *) +declare @__svml_tan() +declare @__svml_atan() +declare @__svml_atan2(, ) +declare @__svml_exp() +declare @__svml_log() +declare @__svml_pow(, ) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) + +packed_load_and_store(4) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; prefetch + +define_prefetches() diff --git a/builtins/util.m4 b/builtins/util.m4 index ee45ebc7..1f85e2cc 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -49,6 +49,53 @@ define(`MASK_HIGH_BIT_ON', ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; vector assembly and deconstruction utilities +;; split 8-wide vector into 2 4-wide vectors +;; +;; $1: vector element type +;; $2: 8-wide vector +;; $3: first 4-wide vector +;; $4: second 4-wide vector + +define(`v8tov4', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, + <4 x i32> + $4 = shufflevector <8 x $1> $2, <8 x $1> undef, + <4 x i32> +') + +define(`v16tov8', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> + $4 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> +') + +;; 4-wide into 2 2-wide +;; args as above +;; + +define(`v4tov2', ` + $3 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> + $4 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> +') + +define(`v8tov2', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $4 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $5 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $6 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> +') + +define(`v16tov4', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $4 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $5 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $6 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Helper macro for calling various SSE instructions for scalar values ;; but where the instruction takes a vector parameter. 
;; $1 : name of variable to put the final value in @@ -156,10 +203,7 @@ define(`reduce16', ` ;; the final reduction define(`reduce8by4', ` - %v1 = shufflevector <8 x $1> %0, <8 x $1> undef, - <4 x i32> - %v2 = shufflevector <8 x $1> %0, <8 x $1> undef, - <4 x i32> + v8tov4($1, %0, %v1, %v2) %m1 = call <4 x $1> $2(<4 x $1> %v1, <4 x $1> %v2) %v3 = shufflevector <4 x $1> %m1, <4 x $1> undef, <4 x i32> @@ -266,30 +310,66 @@ define(`binary2to4', ` ;; $4: 8-wide operand value define(`unary4to8', ` - %$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> - %v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0) - %$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> - %v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1) - %$1 = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, + %__$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> + %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0) + %__$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> + %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1) + %$1 = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, + <8 x i32> +' +) + +;; $1: name of variable into which the final result should go +;; $2: scalar type of the input vector elements +;; $3: scalar type of the result vector elements +;; $4: 4-wide unary vector function to apply +;; $5: 8-wide operand value + +define(`unary4to8conv', ` + %$1_0 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> + %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0) + %$1_1 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> + %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1) + %$1 = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, <8 x i32> ' ) define(`unary4to16', ` - %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0) - %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1) - %$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_2 = call <4 x $2> $3(<4 x $2> %$1_2) - %$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_3 = call <4 x $2> $3(<4 x $2> %$1_3) + %__$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0) + %__$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1) + %__$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_2 = call <4 x $2> $3(<4 x $2> %__$1_2) + %__$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_3 = call <4 x $2> $3(<4 x $2> %__$1_3) - %$1a = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, + %__$1a = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, <8 x i32> - %$1b = shufflevector <4 x $2> %v$1_2, <4 x $2> %v$1_3, + %__$1b = shufflevector <4 x $2> %__v$1_2, <4 x $2> %__v$1_3, <8 x i32> - %$1 = shufflevector <8 x $2> %$1a, <8 x $2> %$1b, + %$1 = shufflevector <8 x $2> %__$1a, <8 x $2> %__$1b, + <16 x i32> +' +) + +define(`unary4to16conv', ` + %$1_0 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1) + %$1_2 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_2 = call <4 x $3> $4(<4 x $2> %$1_2) + %$1_3 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_3 = call <4 x $3> $4(<4 x $2> %$1_3) + + %$1a = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, + <8 x i32> + %$1b = shufflevector <4 x $3> %v$1_2, <4 x $3> %v$1_3, + <8 x i32> + %$1 = 
shufflevector <8 x $3> %$1a, <8 x $3> %$1b, <16 x i32> ' diff --git a/ispc.cpp b/ispc.cpp index a9f5ff5c..de8fba4d 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -106,7 +106,7 @@ static void __cpuidex(int info[4], int level, int count) { static const char * lGetSystemISA() { #ifdef __arm__ - return "neon"; + return "neon-32"; #else int info[4]; __cpuid(info, 1); @@ -187,7 +187,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : isa = "avx2"; else if (!strcmp(cpu, "cortex-a9") || !strcmp(cpu, "cortex-a15")) - isa = "neon"; + isa = "neon-32"; else if (!strcmp(cpu, "core-avx-i")) isa = "avx1.1"; else if (!strcmp(cpu, "sandybridge") || @@ -212,7 +212,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } #if !defined(__arm__) - if (cpu == NULL && !strcmp(isa, "neon")) + if (cpu == NULL && !strncmp(isa, "neon", 4)) // If we're compiling NEON on an x86 host and the CPU wasn't // supplied, don't go and set the CPU based on the host... cpu = "cortex-a9"; @@ -246,7 +246,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_cpu = cpu; if (arch == NULL) { - if (!strcmp(isa, "neon")) + if (!strncmp(isa, "neon", 4)) arch = "arm"; else arch = "x86-64"; @@ -461,8 +461,26 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasGather = true; #endif } - else if (!strcasecmp(isa, "neon")) { - this->m_isa = Target::NEON; + else if (!strcasecmp(isa, "neon-8")) { + this->m_isa = Target::NEON8; + this->m_nativeVectorWidth = 16; + this->m_vectorWidth = 16; + this->m_attributes = "+neon,+fp16"; + this->m_hasHalf = true; // ?? + this->m_maskingIsFree = false; + this->m_maskBitCount = 8; + } + else if (!strcasecmp(isa, "neon-16")) { + this->m_isa = Target::NEON16; + this->m_nativeVectorWidth = 8; + this->m_vectorWidth = 8; + this->m_attributes = "+neon,+fp16"; + this->m_hasHalf = true; // ?? + this->m_maskingIsFree = false; + this->m_maskBitCount = 16; + } + else if (!strcasecmp(isa, "neon-32") || !strcasecmp(isa, "neon")) { + this->m_isa = Target::NEON32; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; this->m_attributes = "+neon,+fp16"; @@ -484,7 +502,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : llvm::Reloc::Default; std::string featuresString = m_attributes; llvm::TargetOptions options; - if (m_isa == Target::NEON) + if (m_isa == Target::NEON8 || m_isa == Target::NEON16 || + m_isa == Target::NEON32) options.FloatABIType = llvm::FloatABI::Hard; #if !defined(LLVM_3_1) if (g->opt.disableFMA == false) @@ -618,8 +637,12 @@ Target::GetTripleString() const { const char * Target::ISAToString(ISA isa) { switch (isa) { - case Target::NEON: - return "neon"; + case Target::NEON8: + return "neon-8"; + case Target::NEON16: + return "neon-16"; + case Target::NEON32: + return "neon-32"; case Target::SSE2: return "sse2"; case Target::SSE4: diff --git a/ispc.h b/ispc.h index 7d10b908..bf6d2642 100644 --- a/ispc.h +++ b/ispc.h @@ -175,7 +175,8 @@ public: flexible/performant of them will apear last in the enumerant. Note also that __best_available_isa() needs to be updated if ISAs are added or the enumerant values are reordered. */ - enum ISA { NEON, SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, NUM_ISAS }; + enum ISA { NEON32, NEON16, NEON8, SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, + NUM_ISAS }; /** Initializes the given Target pointer for a target of the given name, if the name is a known target. 
Returns true if the diff --git a/ispc.vcxproj b/ispc.vcxproj index 96682fe3..e9bf9d97 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -45,8 +45,12 @@ - - + + + + + + @@ -187,37 +191,78 @@ Building gen-bitcode-sse2-x2-64bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-neon.ll | python bitcode2cpp.py builtins\target-neon.ll > gen-bitcode-neon.cpp - gen-bitcode-neon.cpp - builtins\util.m4 - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-neon.ll | python bitcode2cpp.py builtins\target-neon.ll > gen-bitcode-neon.cpp - gen-bitcode-neon.cpp - builtins\util.m4 - Building gen-bitcode-neon.cpp - Building gen-bitcode-neon.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp - $(Configuration)/gen-bitcode-avx1-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx1-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > $(Configuration)/gen-bitcode-avx1-64bit.cpp - $(Configuration)/gen-bitcode-avx1-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx1-64bit.cpp - - + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-8.ll | python bitcode2cpp.py builtins\target-neon-8.ll 32bit > $(Configuration)/gen-bitcode-neon-8-32bit.cpp + $(Configuration)/gen-bitcode-neon-8-32bit.cpp + builtins\util.m4;builtins\target-neon-common.ll + Building gen-bitcode-neon-8-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-8.ll | python bitcode2cpp.py builtins\target-neon-8.ll 64bit > $(Configuration)/gen-bitcode-neon-8-64bit.cpp + $(Configuration)/gen-bitcode-neon-8-64bit.cpp + builtins\util.m4;builtins\target-neon-common.ll + Building gen-bitcode-neon-8-64bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-16.ll | python bitcode2cpp.py builtins\target-neon-16.ll 32bit > $(Configuration)/gen-bitcode-neon-16-32bit.cpp + $(Configuration)/gen-bitcode-neon-16-32bit.cpp + builtins\util.m4;builtins\target-neon-common.ll + Building gen-bitcode-neon-16-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-16.ll | python bitcode2cpp.py builtins\target-neon-16.ll 64bit > $(Configuration)/gen-bitcode-neon-16-64bit.cpp + $(Configuration)/gen-bitcode-neon-16-64bit.cpp + builtins\util.m4;builtins\target-neon-common.ll + Building gen-bitcode-neon-16-64bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-32.ll | python bitcode2cpp.py builtins\target-neon-32.ll 32bit > $(Configuration)/gen-bitcode-neon-32-32bit.cpp + $(Configuration)/gen-bitcode-neon-32-32bit.cpp + builtins\util.m4;builtins\target-neon-common.ll + Building gen-bitcode-neon-32-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-32.ll | python bitcode2cpp.py builtins\target-neon-32.ll 64bit > $(Configuration)/gen-bitcode-neon-32-64bit.cpp + $(Configuration)/gen-bitcode-neon-32-64bit.cpp 
+ builtins\util.m4;builtins\target-neon-common.ll + Building gen-bitcode-neon-32-64bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp + $(Configuration)/gen-bitcode-avx1-32bit.cpp + builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + Building gen-bitcode-avx1-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > $(Configuration)/gen-bitcode-avx1-64bit.cpp + $(Configuration)/gen-bitcode-avx1-64bit.cpp + builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + Building gen-bitcode-avx1-64bit.cpp + + Document diff --git a/module.cpp b/module.cpp index 85bf242c..755a5dc4 100644 --- a/module.cpp +++ b/module.cpp @@ -1877,6 +1877,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre char *p = targetMacro; while (*p) { *p = toupper(*p); + if (*p == '-') *p = '_'; ++p; } opts.addMacroDef(targetMacro); diff --git a/run_tests.py b/run_tests.py index ea819ea4..c9dd8b76 100755 --- a/run_tests.py +++ b/run_tests.py @@ -37,7 +37,7 @@ parser.add_option("-g", "--generics-include", dest="include_file", help="Filenam parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", default="") parser.add_option('-t', '--target', dest='target', - help='Set compilation target (neon, sse2, sse2-x2, sse4, sse4-x2, sse4-8, sse4-16, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', + help='Set compilation target (neon8, neon16, neon32, sse2, sse2-x2, sse4, sse4-x2, sse4-8, sse4-16, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', default="sse4") parser.add_option('-a', '--arch', dest='arch', help='Set architecture (arm, x86, x86-64)', From 48ff03112fd30d12a85eaf7cee3636ee6bfbedb4 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Mon, 29 Jul 2013 16:20:46 -0700 Subject: [PATCH 15/34] Remove __pause from stdlib_core() in utils.m4. It wasn't ever being used, and was breaking compilation on ARM. --- builtins.cpp | 1 - builtins/util.m4 | 5 ----- 2 files changed, 6 deletions(-) diff --git a/builtins.cpp b/builtins.cpp index e671a491..b2896388 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -487,7 +487,6 @@ lSetInternalFunctions(llvm::Module *module) { "__num_cores", "__packed_load_active", "__packed_store_active", - "__pause", "__popcnt_int32", "__popcnt_int64", "__prefetch_read_uniform_1", diff --git a/builtins/util.m4 b/builtins/util.m4 index 1f85e2cc..025030d5 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -1795,11 +1795,6 @@ declare i1 @__is_compile_time_constant_mask( %mask) declare i1 @__is_compile_time_constant_uniform_int32(i32) declare i1 @__is_compile_time_constant_varying_int32() -define void @__pause() nounwind readnone { - call void asm sideeffect "pause", "~{dirflag},~{fpsr},~{flags}"() nounwind - ret void -} - ; This function declares placeholder masked store functions for the ; front-end to use. 
; From d3c567503bf64ec9066c09cb8959c31d4aa1be0e Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 31 Jul 2013 06:46:45 -0700 Subject: [PATCH 16/34] Remove support for building with LLVM 3.1 --- builtins.cpp | 2 - builtins/target-avx11-x2.ll | 4 +- builtins/target-avx11.ll | 4 +- builtins/target-avx2-x2.ll | 25 +------- builtins/target-avx2.ll | 25 +------- cbackend.cpp | 115 +++++++++--------------------------- ctx.cpp | 4 +- ctx.h | 11 +--- expr.cpp | 2 +- func.cpp | 10 +--- ispc.cpp | 68 ++++----------------- ispc.h | 18 +----- llvmutil.cpp | 2 +- llvmutil.h | 2 +- main.cpp | 4 +- module.cpp | 47 +++------------ opt.cpp | 22 ++----- stmt.cpp | 2 +- type.cpp | 26 +++----- type.h | 2 +- util.cpp | 9 +-- 21 files changed, 84 insertions(+), 320 deletions(-) diff --git a/builtins.cpp b/builtins.cpp index b2896388..17582d68 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -49,8 +49,6 @@ #include #if defined(LLVM_3_2) #include -#endif -#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #include diff --git a/builtins/target-avx11-x2.ll b/builtins/target-avx11-x2.ll index 1aa6345c..2aee1e1c 100644 --- a/builtins/target-avx11-x2.ll +++ b/builtins/target-avx11-x2.ll @@ -31,9 +31,7 @@ include(`target-avx-x2.ll') -ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', - LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', - `rdrand_definition()') +rdrand_definition() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx11.ll b/builtins/target-avx11.ll index fea0a7c2..44593113 100644 --- a/builtins/target-avx11.ll +++ b/builtins/target-avx11.ll @@ -31,9 +31,7 @@ include(`target-avx.ll') -ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', - LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', - `rdrand_definition()') +rdrand_definition() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx2-x2.ll b/builtins/target-avx2-x2.ll index 053fd078..19f1845d 100644 --- a/builtins/target-avx2-x2.ll +++ b/builtins/target-avx2-x2.ll @@ -29,15 +29,11 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -ifelse(LLVM_VERSION, `LLVM_3_0', `', - LLVM_VERSION, `LLVM_3_1', `', - `define(`HAVE_GATHER', `1')') +define(`HAVE_GATHER', `1') include(`target-avx-x2.ll') -ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', - LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', - `rdrand_definition()') +rdrand_definition() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max @@ -176,21 +172,6 @@ define(`assemble_4s', ` assemble_8s($1, $2, $2_1, $2_2) ') -ifelse(LLVM_VERSION, `LLVM_3_0', ` -gen_gather_factored(i8) -gen_gather_factored(i16) -gen_gather_factored(i32) -gen_gather_factored(float) -gen_gather_factored(i64) -gen_gather_factored(double)', -LLVM_VERSION, `LLVM_3_1', ` -gen_gather_factored(i8) -gen_gather_factored(i16) -gen_gather_factored(i32) -gen_gather_factored(float) -gen_gather_factored(i64) -gen_gather_factored(double)', ` - gen_gather(i8) gen_gather(i16) @@ -557,5 +538,3 @@ define <16 x double> @__gather64_double(<16 x i64> %ptrs, ret <16 x double> %v } - -') diff --git a/builtins/target-avx2.ll b/builtins/target-avx2.ll index f4a0ee07..d3410011 100644 --- a/builtins/target-avx2.ll +++ b/builtins/target-avx2.ll @@ -29,15 +29,11 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
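;; (With LLVM 3.0/3.1 no longer supported, the ifelse(LLVM_VERSION, ...)
;; m4 conditionals in the surrounding target-avx2 hunks always take their
;; final branch, so they are replaced by the unconditional
;; define(`HAVE_GATHER', `1'), rdrand_definition(), and the native
;; gen_gather()/gather definitions.)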
-ifelse(LLVM_VERSION, `LLVM_3_0', `', - LLVM_VERSION, `LLVM_3_1', `', - `define(`HAVE_GATHER', `1')') +define(`HAVE_GATHER', `1') include(`target-avx.ll') -ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', - LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', - `rdrand_definition()') +rdrand_definition() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max @@ -123,21 +119,6 @@ define(`extract_4s', ` %$2_2 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> ') -ifelse(LLVM_VERSION, `LLVM_3_0', ` -gen_gather_factored(i8) -gen_gather_factored(i16) -gen_gather_factored(i32) -gen_gather_factored(float) -gen_gather_factored(i64) -gen_gather_factored(double)', -LLVM_VERSION, `LLVM_3_1', ` -gen_gather_factored(i8) -gen_gather_factored(i16) -gen_gather_factored(i32) -gen_gather_factored(float) -gen_gather_factored(i64) -gen_gather_factored(double)', ` - gen_gather(i8) gen_gather(i16) @@ -429,5 +410,3 @@ define <8 x double> @__gather64_double(<8 x i64> %ptrs, ret <8 x double> %v } - -') diff --git a/cbackend.cpp b/cbackend.cpp index d23bcc20..d54f48fb 100644 --- a/cbackend.cpp +++ b/cbackend.cpp @@ -29,7 +29,7 @@ #include "llvmutil.h" -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/CallingConv.h" @@ -38,6 +38,7 @@ #include "llvm/Intrinsics.h" #include "llvm/IntrinsicInst.h" #include "llvm/InlineAsm.h" + #include "llvm/TypeFinder.h" #else #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -47,16 +48,10 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/InlineAsm.h" + #include "llvm/IR/TypeFinder.h" #endif #include "llvm/Pass.h" #include "llvm/PassManager.h" -#if !defined(LLVM_3_1) - #if defined(LLVM_3_2) - #include "llvm/TypeFinder.h" - #else // LLVM_3_3 + - #include "llvm/IR/TypeFinder.h" - #endif -#endif // LLVM_3_2 + #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/STLExtras.h" @@ -76,9 +71,7 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" -#if defined(LLVM_3_1) - #include "llvm/Target/TargetData.h" -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) #include "llvm/DataLayout.h" #else // LLVM 3.3+ #include "llvm/IR/DataLayout.h" @@ -88,7 +81,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/GetElementPtrTypeIterator.h" -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include "llvm/Support/InstVisitor.h" #else // LLVM 3.3+ #include "llvm/InstVisitor.h" @@ -258,14 +251,10 @@ namespace { const llvm::MCRegisterInfo *MRI; const llvm::MCObjectFileInfo *MOFI; llvm::MCContext *TCtx; -#if defined(LLVM_3_1) - const llvm::TargetData* TD; -#else // FIXME: it's ugly to have the name be "TD" here, but it saves us // lots of ifdefs in the below since the new DataLayout and the old // TargetData have generally similar interfaces... 
const llvm::DataLayout* TD; -#endif std::map FPConstantMap; std::map VectorConstantMap; @@ -352,7 +341,7 @@ namespace { bool isSigned = false, const std::string &VariableName = "", bool IgnoreName = false, -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) const llvm::AttrListPtr &PAL = llvm::AttrListPtr() #else const llvm::AttributeSet &PAL = llvm::AttributeSet() @@ -363,7 +352,7 @@ namespace { const std::string &NameSoFar = ""); void printStructReturnPointerFunctionType(llvm::raw_ostream &Out, -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) const llvm::AttrListPtr &PAL, #else const llvm::AttributeSet &PAL, @@ -586,7 +575,7 @@ std::string CWriter::getArrayName(llvm::ArrayType *AT) { /// return type, except, instead of printing the type as void (*)(Struct*, ...) /// print it as "Struct (*)(...)", for struct return functions. void CWriter::printStructReturnPointerFunctionType(llvm::raw_ostream &Out, -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) const llvm::AttrListPtr &PAL, #else const llvm::AttributeSet &PAL, @@ -605,20 +594,16 @@ void CWriter::printStructReturnPointerFunctionType(llvm::raw_ostream &Out, if (PrintedType) FunctionInnards << ", "; llvm::Type *ArgTy = *I; -#if defined(LLVM_3_1) - if (PAL.paramHasAttr(Idx, llvm::Attribute::ByVal)) { -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) { #else - if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { + if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { #endif assert(ArgTy->isPointerTy()); ArgTy = llvm::cast(ArgTy)->getElementType(); } printType(FunctionInnards, ArgTy, -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(Idx, llvm::Attribute::SExt), -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt), @@ -635,9 +620,7 @@ void CWriter::printStructReturnPointerFunctionType(llvm::raw_ostream &Out, } FunctionInnards << ')'; printType(Out, RetTy, -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(0, llvm::Attribute::SExt), -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(0).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(0).hasAttribute(llvm::AttributeSet::ReturnIndex, llvm::Attribute::SExt), @@ -737,7 +720,7 @@ CWriter::printSimpleType(llvm::raw_ostream &Out, llvm::Type *Ty, bool isSigned, llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty, bool isSigned, const std::string &NameSoFar, bool IgnoreName, -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) const llvm::AttrListPtr &PAL #else const llvm::AttributeSet &PAL @@ -759,9 +742,7 @@ llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty, for (llvm::FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); I != E; ++I) { llvm::Type *ArgTy = *I; -#if defined(LLVM_3_1) - if (PAL.paramHasAttr(Idx, llvm::Attribute::ByVal)) { -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) { #else if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { @@ -772,9 +753,7 @@ llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty, if (I != 
FTy->param_begin()) FunctionInnards << ", "; printType(FunctionInnards, ArgTy, -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(Idx, llvm::Attribute::SExt), -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt), @@ -791,9 +770,7 @@ llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty, } FunctionInnards << ')'; printType(Out, FTy->getReturnType(), -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(0, llvm::Attribute::SExt), -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(0).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(0).hasAttribute(llvm::AttributeSet::ReturnIndex, llvm::Attribute::SExt), @@ -1972,11 +1949,7 @@ void CWriter::writeOperandWithCast(llvm::Value* Operand, const llvm::ICmpInst &C // directives to cater to specific compilers as need be. // static void generateCompilerSpecificCode(llvm::formatted_raw_ostream& Out, -#if defined(LLVM_3_1) - const llvm::TargetData *TD) { -#else const llvm::DataLayout *TD) { -#endif // We output GCC specific attributes to preserve 'linkonce'ness on globals. // If we aren't being compiled with GCC, just drop these attributes. Out << "#ifndef __GNUC__ /* Can only support \"linkonce\" vars with GCC */\n" @@ -2169,11 +2142,7 @@ bool CWriter::doInitialization(llvm::Module &M) { // Initialize TheModule = &M; -#if defined(LLVM_3_1) - TD = new llvm::TargetData(&M); -#else TD = new llvm::DataLayout(&M); -#endif IL = new llvm::IntrinsicLowering(*TD); IL->AddPrototypes(M); @@ -2656,15 +2625,11 @@ void CWriter::printModuleTypes() { // Get all of the struct types used in the module. std::vector StructTypes; -#if defined(LLVM_3_1) - TheModule->findUsedStructTypes(StructTypes); -#else llvm::TypeFinder typeFinder; typeFinder.run(*TheModule, false); for (llvm::TypeFinder::iterator iter = typeFinder.begin(); iter != typeFinder.end(); ++iter) StructTypes.push_back(*iter); -#endif // Get all of the array types used in the module std::vector ArrayTypes; @@ -2785,7 +2750,7 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { // Loop over the arguments, printing them... 
llvm::FunctionType *FT = llvm::cast(F->getFunctionType()); -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) const llvm::AttrListPtr &PAL = F->getAttributes(); #else const llvm::AttributeSet &PAL = F->getAttributes(); @@ -2819,20 +2784,16 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { else ArgName = ""; llvm::Type *ArgTy = I->getType(); -#if defined(LLVM_3_1) - if (PAL.paramHasAttr(Idx, llvm::Attribute::ByVal)) { -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) { #else - if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { + if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { #endif ArgTy = llvm::cast(ArgTy)->getElementType(); ByValParams.insert(I); } printType(FunctionInnards, ArgTy, -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(Idx, llvm::Attribute::SExt), -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt), @@ -2858,9 +2819,7 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { for (; I != E; ++I) { if (PrintedArg) FunctionInnards << ", "; llvm::Type *ArgTy = *I; -#if defined(LLVM_3_1) - if (PAL.paramHasAttr(Idx, llvm::Attribute::ByVal)) { -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) { #else if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { @@ -2869,9 +2828,7 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { ArgTy = llvm::cast(ArgTy)->getElementType(); } printType(FunctionInnards, ArgTy, -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(Idx, llvm::Attribute::SExt) -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt) #else PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt) @@ -2908,9 +2865,7 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { // Print out the return type and the signature built above. printType(Out, RetTy, -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(0, llvm::Attribute::SExt), -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(0).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(0).hasAttribute(llvm::AttributeSet::ReturnIndex, llvm::Attribute::SExt), @@ -3712,7 +3667,7 @@ void CWriter::lowerIntrinsics(llvm::Function &F) { const char *BuiltinName = ""; #define GET_GCC_BUILTIN_NAME #define Intrinsic llvm::Intrinsic -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include "llvm/Intrinsics.gen" #else #include "llvm/IR/Intrinsics.gen" @@ -3777,7 +3732,7 @@ void CWriter::visitCallInst(llvm::CallInst &I) { // If this is a call to a struct-return function, assign to the first // parameter instead of passing it to the call. 
-#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) const llvm::AttrListPtr &PAL = I.getAttributes(); #else const llvm::AttributeSet &PAL = I.getAttributes(); @@ -3865,9 +3820,7 @@ void CWriter::visitCallInst(llvm::CallInst &I) { (*AI)->getType() != FTy->getParamType(ArgNo)) { Out << '('; printType(Out, FTy->getParamType(ArgNo), -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(ArgNo+1, llvm::Attribute::SExt) -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(ArgNo+1).hasAttribute(llvm::Attributes::SExt) #else PAL.getParamAttributes(ArgNo+1).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt) @@ -3905,7 +3858,7 @@ bool CWriter::visitBuiltinCall(llvm::CallInst &I, llvm::Intrinsic::ID ID, const char *BuiltinName = ""; #define GET_GCC_BUILTIN_NAME #define Intrinsic llvm::Intrinsic -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include "llvm/Intrinsics.gen" #else #include "llvm/IR/Intrinsics.gen" @@ -4555,13 +4508,8 @@ SmearCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) { smearType, NULL); smearFunc = llvm::dyn_cast(sf); assert(smearFunc != NULL); -#if defined(LLVM_3_1) - smearFunc->setDoesNotThrow(true); - smearFunc->setDoesNotAccessMemory(true); -#else smearFunc->setDoesNotThrow(); smearFunc->setDoesNotAccessMemory(); -#endif } assert(smearFunc != NULL); @@ -4703,13 +4651,8 @@ AndCmpCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) { LLVMTypes::MaskType, NULL); andCmpFunc = llvm::dyn_cast(acf); Assert(andCmpFunc != NULL); -#if defined(LLVM_3_1) - andCmpFunc->setDoesNotThrow(true); - andCmpFunc->setDoesNotAccessMemory(true); -#else andCmpFunc->setDoesNotThrow(); andCmpFunc->setDoesNotAccessMemory(); -#endif } // Set up the function call to the *_and_mask function; the @@ -4914,7 +4857,7 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth, pm.add(new llvm::TargetData(module)); #endif -#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3) +#if defined(LLVM_3_2) || defined(LLVM_3_3) int flags = 0; #else llvm::sys::fs::OpenFlags flags = llvm::sys::fs::F_None; @@ -4939,7 +4882,7 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth, pm.add(llvm::createDeadCodeEliminationPass()); // clean up after smear pass //CO pm.add(llvm::createPrintModulePass(&fos)); pm.add(new CWriter(fos, includeName, vectorWidth)); -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) // This interface is depricated for 3.3+ pm.add(llvm::createGCInfoDeleter()); #endif diff --git a/ctx.cpp b/ctx.cpp index c50d22f9..32ba0ad9 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -46,7 +46,7 @@ #include "sym.h" #include #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include @@ -3312,7 +3312,7 @@ FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType, // alias analysis. // TODO: what other attributes needs to be copied? // TODO: do the same for varing path. 
-#if !defined (LLVM_3_1) && !defined (LLVM_3_2) // LLVM 3.3+ +#if !defined (LLVM_3_2) // LLVM 3.3+ llvm::CallInst *cc = llvm::dyn_cast(ci); if (cc && cc->getCalledFunction() && diff --git a/ctx.h b/ctx.h index 58f9aae3..4b27e6e5 100644 --- a/ctx.h +++ b/ctx.h @@ -40,20 +40,15 @@ #include "ispc.h" #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #else #include #include #endif -#if defined(LLVM_3_1) - #include - #include -#else - #include - #include -#endif +#include +#include struct CFInfo; diff --git a/expr.cpp b/expr.cpp index 856d363c..eb8c0951 100644 --- a/expr.cpp +++ b/expr.cpp @@ -56,7 +56,7 @@ #include #include #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include diff --git a/func.cpp b/func.cpp index b975049b..3097f86d 100644 --- a/func.cpp +++ b/func.cpp @@ -46,7 +46,7 @@ #include "util.h" #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include @@ -310,9 +310,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, // isn't worth the code bloat / overhead. bool checkMask = (type->isTask == true) || ( -#if defined(LLVM_3_1) - (function->hasFnAttr(llvm::Attribute::AlwaysInline) == false) -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) (function->getFnAttributes().hasAttribute(llvm::Attributes::AlwaysInline) == false) #else // LLVM 3.3+ (function->getAttributes().getFnAttributes().hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::AlwaysInline) == false) @@ -453,11 +451,7 @@ Function::GenerateIR() { functionName += std::string("_") + g->target->GetISAString(); llvm::Function *appFunction = llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module); -#if defined(LLVM_3_1) - appFunction->setDoesNotThrow(true); -#else appFunction->setDoesNotThrow(); -#endif g->target->markFuncWithTargetAttr(appFunction); diff --git a/ispc.cpp b/ispc.cpp index de8fba4d..b25527c4 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -48,7 +48,7 @@ #include #include #endif -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include @@ -57,19 +57,12 @@ #include #include #endif -#if defined(LLVM_3_1) - #include - #include -#else - #include - #include -#endif +#include +#include #include #include #include -#if defined(LLVM_3_1) - #include -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #else // LLVM 3.3+ #include @@ -145,27 +138,20 @@ static const char *supportedCPUs[] = { // cortex-a9 and a15. We should be able to handle any of them that also // have NEON support. 
"cortex-a9", "cortex-a15", - "atom", "penryn", "core2", "corei7", "corei7-avx" -#if !defined(LLVM_3_1) - , "core-avx-i", "core-avx2" -#endif // LLVM 3.2+ + "atom", "penryn", "core2", "corei7", "corei7-avx", "core-avx-i", "core-avx2" }; Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : m_target(NULL), m_targetMachine(NULL), -#if defined(LLVM_3_1) - m_targetData(NULL), -#else m_dataLayout(NULL), -#endif m_valid(false), m_isa(SSE2), m_arch(""), m_is32Bit(true), m_cpu(""), m_attributes(""), -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) +#if !defined(LLVM_3_2) m_tf_attributes(NULL), #endif m_nativeVectorWidth(-1), @@ -407,10 +393,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; -#if !defined(LLVM_3_1) - // LLVM 3.2+ only this->m_hasRand = true; -#endif } else if (!strcasecmp(isa, "avx1.1-x2")) { this->m_isa = Target::AVX11; @@ -420,46 +403,29 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; -#if !defined(LLVM_3_1) - // LLVM 3.2+ only this->m_hasRand = true; -#endif } else if (!strcasecmp(isa, "avx2")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; - this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand" -#ifndef LLVM_3_1 - ",+fma" -#endif // !LLVM_3_1 - ; + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand,+fma"; this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; -#if !defined(LLVM_3_1) - // LLVM 3.2+ only this->m_hasRand = true; this->m_hasGather = true; -#endif } else if (!strcasecmp(isa, "avx2-x2")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; - this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand" -#ifndef LLVM_3_1 - ",+fma" -#endif // !LLVM_3_1 - ; + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand,+fma"; this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; -#if !defined(LLVM_3_1) - // LLVM 3.2+ only this->m_hasRand = true; this->m_hasGather = true; -#endif } else if (!strcasecmp(isa, "neon-8")) { this->m_isa = Target::NEON8; @@ -505,10 +471,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : if (m_isa == Target::NEON8 || m_isa == Target::NEON16 || m_isa == Target::NEON32) options.FloatABIType = llvm::FloatABI::Hard; -#if !defined(LLVM_3_1) if (g->opt.disableFMA == false) options.AllowFPOpFusion = llvm::FPOpFusion::Fast; -#endif // !LLVM_3_1 #ifdef ISPC_IS_WINDOWS if (strcmp("x86", arch) == 0) { @@ -526,12 +490,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // Initialize TargetData/DataLayout in 3 steps. // 1. Get default data layout first - std::string dl_string; -#if defined(LLVM_3_1) - dl_string = m_targetMachine->getTargetData()->getStringRepresentation(); -#else - dl_string = m_targetMachine->getDataLayout()->getStringRepresentation(); -#endif + std::string dl_string = + m_targetMachine->getDataLayout()->getStringRepresentation(); // 2. Adjust for generic if (m_isa == Target::GENERIC) { @@ -546,11 +506,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } // 3. 
Finally set member data -#if defined(LLVM_3_1) - m_targetData = new llvm::TargetData(dl_string); -#else m_dataLayout = new llvm::DataLayout(dl_string); -#endif // Set is32Bit // This indicates if we are compiling for 32 bit platform @@ -558,7 +514,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // FIXME: all generic targets are handled as 64 bit, which is incorrect. this->m_is32Bit = (getDataLayout()->getPointerSize() == 4); -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) +#if !defined(LLVM_3_2) // This is LLVM 3.3+ feature. // Initialize target-specific "target-feature" attribute. if (!m_attributes.empty()) { @@ -772,7 +728,7 @@ Target::StructOffset(llvm::Type *type, int element, } void Target::markFuncWithTargetAttr(llvm::Function* func) { -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) +#if !defined(LLVM_3_2) if (m_tf_attributes) { func->addAttributes(llvm::AttributeSet::FunctionIndex, *m_tf_attributes); } diff --git a/ispc.h b/ispc.h index bf6d2642..d68f9034 100644 --- a/ispc.h +++ b/ispc.h @@ -40,8 +40,8 @@ #define ISPC_VERSION "1.4.5dev" -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) -#error "Only LLVM 3.1, 3.2, 3.3 and the 3.4 development branch are supported" +#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) +#error "Only LLVM 3.2, 3.3 and the 3.4 development branch are supported" #endif #if defined(_WIN32) || defined(_WIN64) @@ -72,11 +72,7 @@ namespace llvm { class BasicBlock; class Constant; class ConstantValue; -#if defined(LLVM_3_1) - class TargetData; -#else class DataLayout; -#endif class DIBuilder; class DIDescriptor; class DIFile; @@ -226,11 +222,7 @@ public: // Note the same name of method for 3.1 and 3.2+, this allows // to reduce number ifdefs on client side. -#if defined(LLVM_3_1) - llvm::TargetData *getDataLayout() const {return m_targetData;} -#else llvm::DataLayout *getDataLayout() const {return m_dataLayout;} -#endif /** Reports if Target object has valid state. */ bool isValid() const {return m_valid;} @@ -278,11 +270,7 @@ private: */ llvm::TargetMachine *m_targetMachine; -#if defined(LLVM_3_1) - llvm::TargetData *m_targetData; -#else llvm::DataLayout *m_dataLayout; -#endif /** flag to report invalid state after construction (due to bad parameters passed to constructor). */ @@ -303,7 +291,7 @@ private: /** Target-specific attribute string to pass along to the LLVM backend */ std::string m_attributes; -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) +#if !defined(LLVM_3_2) /** Target-specific LLVM attribute, which has to be attached to every function to ensure that it is generated for correct target architecture. 
This is requirement was introduced in LLVM 3.3 */ diff --git a/llvmutil.cpp b/llvmutil.cpp index 180c8676..2f54a2fe 100644 --- a/llvmutil.cpp +++ b/llvmutil.cpp @@ -38,7 +38,7 @@ #include "llvmutil.h" #include "ispc.h" #include "type.h" -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #else diff --git a/llvmutil.h b/llvmutil.h index d6c5ede0..d1803f32 100644 --- a/llvmutil.h +++ b/llvmutil.h @@ -38,7 +38,7 @@ #ifndef ISPC_LLVMUTIL_H #define ISPC_LLVMUTIL_H 1 -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include diff --git a/main.cpp b/main.cpp index de2bb620..4c4b4575 100644 --- a/main.cpp +++ b/main.cpp @@ -62,9 +62,7 @@ static void lPrintVersion() { printf("Intel(r) SPMD Program Compiler (ispc), %s (build %s @ %s, LLVM %s)\n", ISPC_VERSION, BUILD_VERSION, BUILD_DATE, -#if defined(LLVM_3_1) - "3.1" -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) "3.2" #elif defined(LLVM_3_3) "3.3" diff --git a/module.cpp b/module.cpp index 755a5dc4..eba5eb3b 100644 --- a/module.cpp +++ b/module.cpp @@ -64,7 +64,7 @@ #define strcasecmp stricmp #endif -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include @@ -86,9 +86,7 @@ #include #include #include -#if defined(LLVM_3_1) - #include -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #else // LLVM 3.3+ @@ -202,7 +200,7 @@ lStripUnusedDebugInfo(llvm::Module *module) { // stuff and remove it later on. Removing it is useful, as it // reduces size of the binary significantly (manyfold for small // programs). -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) llvm::MDNode *nodeSPMD = llvm::dyn_cast(cuNode->getOperand(12)); Assert(nodeSPMD != NULL); @@ -797,11 +795,7 @@ Module::AddFunctionDeclaration(const std::string &name, #endif if (functionType->isTask) // This also applies transitively to members I think? -#if defined(LLVM_3_1) - function->setDoesNotAlias(1, true); -#else // LLVM 3.2+ function->setDoesNotAlias(1); -#endif g->target->markFuncWithTargetAttr(function); @@ -850,12 +844,7 @@ Module::AddFunctionDeclaration(const std::string &name, // NOTE: LLVM indexes function parameters starting from 1. // This is unintuitive. -#if defined(LLVM_3_1) - function->setDoesNotAlias(i+1, true); -#else function->setDoesNotAlias(i+1); -#endif - #if 0 int align = 4 * RoundUpPow2(g->target->nativeVectorWidth); function->addAttribute(i+1, llvm::Attribute::constructAlignmentFromInt(align)); @@ -1067,7 +1056,7 @@ Module::writeObjectFileOrAssembly(llvm::TargetMachine *targetMachine, llvm::TargetMachine::CodeGenFileType fileType = (outputType == Object) ? llvm::TargetMachine::CGFT_ObjectFile : llvm::TargetMachine::CGFT_AssemblyFile; bool binary = (fileType == llvm::TargetMachine::CGFT_ObjectFile); -#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3) +#if defined(LLVM_3_2) || defined(LLVM_3_3) unsigned int flags = binary ? llvm::raw_fd_ostream::F_Binary : 0; #else llvm::sys::fs::OpenFlags flags = binary ? 
llvm::sys::fs::F_Binary : @@ -1082,11 +1071,7 @@ Module::writeObjectFileOrAssembly(llvm::TargetMachine *targetMachine, } llvm::PassManager pm; -#if defined(LLVM_3_1) - pm.add(new llvm::TargetData(*g->target->getDataLayout())); -#else pm.add(new llvm::DataLayout(*g->target->getDataLayout())); -#endif llvm::formatted_raw_ostream fos(of->os()); @@ -1800,22 +1785,12 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre llvm::raw_fd_ostream stderrRaw(2, false); -#if defined(LLVM_3_1) - clang::TextDiagnosticPrinter *diagPrinter = - new clang::TextDiagnosticPrinter(stderrRaw, clang::DiagnosticOptions()); -#else clang::DiagnosticOptions *diagOptions = new clang::DiagnosticOptions(); clang::TextDiagnosticPrinter *diagPrinter = new clang::TextDiagnosticPrinter(stderrRaw, diagOptions); -#endif llvm::IntrusiveRefCntPtr diagIDs(new clang::DiagnosticIDs); -#if defined(LLVM_3_1) - clang::DiagnosticsEngine *diagEngine = - new clang::DiagnosticsEngine(diagIDs, diagPrinter); -#else clang::DiagnosticsEngine *diagEngine = new clang::DiagnosticsEngine(diagIDs, diagOptions, diagPrinter); -#endif inst.setDiagnostics(diagEngine); clang::TargetOptions &options = inst.getTargetOpts(); @@ -1825,7 +1800,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre } options.Triple = triple.getTriple(); -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) clang::TargetInfo *target = clang::TargetInfo::CreateTargetInfo(inst.getDiagnostics(), options); #else // LLVM 3.3+ @@ -1835,18 +1810,14 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre inst.setTarget(target); inst.createSourceManager(inst.getFileManager()); -#if defined(LLVM_3_1) - inst.InitializeSourceManager(infilename); -#else clang::FrontendInputFile inputFile(infilename, clang::IK_None); inst.InitializeSourceManager(inputFile); -#endif // Don't remove comments in the preprocessor, so that we can accurately // track the source file position by handling them ourselves. 
inst.getPreprocessorOutputOpts().ShowComments = 1; -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) // LLVM 3.3+ +#if !defined(LLVM_3_2) // LLVM 3.3+ inst.getPreprocessorOutputOpts().ShowCPP = 1; #endif @@ -1858,7 +1829,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre headerOpts.Verbose = 1; for (int i = 0; i < (int)g->includePath.size(); ++i) { headerOpts.AddPath(g->includePath[i], clang::frontend::Angled, -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) true /* is user supplied */, #endif false /* not a framework */, @@ -1913,11 +1884,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre } } -#if defined(LLVM_3_1) - inst.getLangOpts().BCPLComment = 1; -#else inst.getLangOpts().LineComment = 1; -#endif inst.createPreprocessor(); diagPrinter->BeginSourceFile(inst.getLangOpts(), &inst.getPreprocessor()); diff --git a/opt.cpp b/opt.cpp index 8efdbc67..8c86368e 100644 --- a/opt.cpp +++ b/opt.cpp @@ -48,7 +48,7 @@ #include #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include @@ -73,9 +73,7 @@ #include #include #include -#if defined(LLVM_3_1) - #include -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #else // LLVM 3.3+ #include @@ -85,11 +83,7 @@ #include #include #include -#if defined(LLVM_3_1) - #include -#else - #include -#endif +#include #include #ifdef ISPC_IS_LINUX #include @@ -415,18 +409,14 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(targetLibraryInfo); -#if defined(LLVM_3_1) - optPM.add(new llvm::TargetData(*g->target->getDataLayout())); -#else optPM.add(new llvm::DataLayout(*g->target->getDataLayout())); llvm::TargetMachine *targetMachine = g->target->GetTargetMachine(); - #ifdef LLVM_3_2 +#ifdef LLVM_3_2 optPM.add(new llvm::TargetTransformInfo(targetMachine->getScalarTargetTransformInfo(), targetMachine->getVectorTargetTransformInfo())); - #else // LLVM 3.3+ +#else // LLVM 3.3+ targetMachine->addAnalysisPasses(optPM); - #endif #endif optPM.add(llvm::createIndVarSimplifyPass()); @@ -505,7 +495,7 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createArgumentPromotionPass()); -#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3) +#if defined(LLVM_3_2) || defined(LLVM_3_3) // Starting from 3.4 this functionality was moved to // InstructionCombiningPass. See r184459 for details. 
optPM.add(llvm::createSimplifyLibCallsPass()); diff --git a/stmt.cpp b/stmt.cpp index 4ec63d35..412b0dd9 100644 --- a/stmt.cpp +++ b/stmt.cpp @@ -48,7 +48,7 @@ #include #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include diff --git a/type.cpp b/type.cpp index 5fa1845b..11a165f5 100644 --- a/type.cpp +++ b/type.cpp @@ -43,20 +43,15 @@ #include #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #else #include #include #endif -#if defined(LLVM_3_1) - #include - #include -#else - #include - #include -#endif +#include +#include #include @@ -819,11 +814,8 @@ EnumType::GetDIType(llvm::DIDescriptor scope) const { m->diBuilder->createEnumerationType(scope, name, diFile, pos.first_line, 32 /* size in bits */, 32 /* align in bits */, - elementArray -#if !defined(LLVM_3_1) - , llvm::DIType() -#endif - ); + elementArray, + llvm::DIType()); switch (variability.type) { @@ -2139,7 +2131,7 @@ StructType::GetDIType(llvm::DIDescriptor scope) const { currentSize, // Size in bits align, // Alignment in bits 0, // Flags -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) +#if !defined(LLVM_3_2) llvm::DIType(), // DerivedFrom #endif elements); @@ -2382,7 +2374,7 @@ UndefinedStructType::GetDIType(llvm::DIDescriptor scope) const { 0, // Size 0, // Align 0, // Flags -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) +#if !defined(LLVM_3_2) llvm::DIType(), // DerivedFrom #endif elements); @@ -2645,12 +2637,8 @@ ReferenceType::GetDIType(llvm::DIDescriptor scope) const { } llvm::DIType diTargetType = targetType->GetDIType(scope); -#if defined(LLVM_3_1) - return m->diBuilder->createReferenceType(diTargetType); -#else return m->diBuilder->createReferenceType(llvm::dwarf::DW_TAG_reference_type, diTargetType); -#endif } diff --git a/type.h b/type.h index 880f8574..a6a52e10 100644 --- a/type.h +++ b/type.h @@ -40,7 +40,7 @@ #include "ispc.h" #include "util.h" -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #else diff --git a/util.cpp b/util.cpp index dbea9517..4be863bf 100644 --- a/util.cpp +++ b/util.cpp @@ -65,9 +65,7 @@ #include #include -#if defined(LLVM_3_1) - #include -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #else // LLVM 3.3+ #include @@ -616,13 +614,8 @@ VerifyDataLayoutCompatibility(const std::string &module_dl, // which contradic: f80:128:128 followed by f80:32:32. This is a bug, but // correct thing to do is to interpret this exactly how LLVM would treat it, // so we create a DataLayout class and take its string representation. -#if defined(LLVM_3_1) - llvm::TargetData d1(module_dl); - llvm::TargetData d2(lib_dl); -#else // LLVM 3.2+ llvm::DataLayout d1(module_dl); llvm::DataLayout d2(lib_dl); -#endif std::string module_dl_canonic = d1.getStringRepresentation(); std::string lib_dl_canonic = d2.getStringRepresentation(); From d9c38b5c1f6c1ccb4920465789b9e3d451e302a8 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 18 Jul 2013 09:24:23 -0700 Subject: [PATCH 17/34] Remove support for using SVML for math lib routines. This path was poorly maintained and wasn't actually available on most targets. 
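Anyone who was selecting SVML through the math-library option should switch to one of the remaining choices; for example (the file names here are only illustrative):

    ispc --math-lib=fast   foo.ispc -o foo.o
    ispc --math-lib=system foo.ispc -o foo.o

or omit the flag entirely to keep the default ispc implementations. The option names match the updated list in docs/ispc.rst below.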
--- builtins.cpp | 11 ---- builtins/target-avx-x2.ll | 17 ------ builtins/target-avx.ll | 17 ------ builtins/target-generic-1.ll | 98 ------------------------------- builtins/target-generic-common.ll | 16 ----- builtins/target-neon-common.ll | 13 ---- builtins/target-sse2-x2.ll | 86 --------------------------- builtins/target-sse2.ll | 60 ------------------- builtins/target-sse4-16.ll | 15 ----- builtins/target-sse4-8.ll | 15 ----- builtins/target-sse4-x2.ll | 86 --------------------------- builtins/target-sse4.ll | 60 ------------------- docs/ispc.rst | 3 - ispc.h | 2 +- main.cpp | 3 - stdlib.ispc | 72 ++++++----------------- 16 files changed, 18 insertions(+), 556 deletions(-) diff --git a/builtins.cpp b/builtins.cpp index 17582d68..d75db43e 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -579,15 +579,6 @@ lSetInternalFunctions(llvm::Module *module) { "__stdlib_sinf", "__stdlib_tan", "__stdlib_tanf", - "__svml_sin", - "__svml_cos", - "__svml_sincos", - "__svml_tan", - "__svml_atan", - "__svml_atan2", - "__svml_exp", - "__svml_log", - "__svml_pow", "__undef_uniform", "__undef_varying", "__vec4_add_float", @@ -1054,8 +1045,6 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod symbolTable); lDefineConstantInt("__math_lib_ispc_fast", (int)Globals::Math_ISPCFast, module, symbolTable); - lDefineConstantInt("__math_lib_svml", (int)Globals::Math_SVML, module, - symbolTable); lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module, symbolTable); lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index d9e0322b..8fb2e427 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -134,23 +134,6 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly always ret <16 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; svml - -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones 4x with our 16-wide -; vectors... - -declare <16 x float> @__svml_sin(<16 x float>) -declare <16 x float> @__svml_cos(<16 x float>) -declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) -declare <16 x float> @__svml_tan(<16 x float>) -declare <16 x float> @__svml_atan(<16 x float>) -declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) -declare <16 x float> @__svml_exp(<16 x float>) -declare <16 x float> @__svml_log(<16 x float>) -declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index 90e2f3ac..adaed9ba 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -134,23 +134,6 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; svml - -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones twice with our 8-wide -; vectors... 
- -declare <8 x float> @__svml_sin(<8 x float>) -declare <8 x float> @__svml_cos(<8 x float>) -declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) -declare <8 x float> @__svml_tan(<8 x float>) -declare <8 x float> @__svml_atan(<8 x float>) -declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) -declare <8 x float> @__svml_exp(<8 x float>) -declare <8 x float> @__svml_log(<8 x float>) -declare <8 x float> @__svml_pow(<8 x float>, <8 x float>) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 3dec76b0..238de444 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -647,104 +647,6 @@ define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alw } - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0) - ;ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm.sin.f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - unary1to1(float,@llvm.sin.f32) - -} - -define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0) - ;ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm.cos.f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - unary1to1(float, @llvm.cos.f32) - -} - -define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline { -; %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0) -; store <1 x float> %s, <1 x float> * %1 -; ret void - %sin = call <1 x float> @__svml_sin (<1 x float> %0) - %cos = call <1 x float> @__svml_cos (<1 x float> %0) - store <1 x float> %sin, <1 x float> * %1 - store <1 x float> %cos, <1 x float> * %2 - ret void -} - -define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0) - ;ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm_tan_f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - ;unasry1to1(float, @llvm.tan.f32) - ; UNSUPPORTED! - ret <1 x float > %0 -} - -define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline { -; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0) -; ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm_atan_f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - ;unsary1to1(float,@llvm.atan.f32) - ;UNSUPPORTED! - ret <1 x float > %0 - -} - -define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1) - ;ret <1 x float> %ret - ;%y = extractelement <1 x float> %0, i32 0 - ;%x = extractelement <1 x float> %1, i32 0 - ;%q = fdiv float %y, %x - ;%a = call float @llvm.atan.f32 (float %q) - ;%rv = insertelement <1 x float> undef, float %a, i32 0 - ;ret <1 x float> %rv - ; UNSUPPORTED! 
- ret <1 x float > %0 -} - -define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_expf4(<1 x float> %0) - ;ret <1 x float> %ret - unary1to1(float, @llvm.exp.f32) -} - -define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_logf4(<1 x float> %0) - ;ret <1 x float> %ret - unary1to1(float, @llvm.log.f32) -} - -define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1) - ;ret <1 x float> %ret - %r = extractelement <1 x float> %0, i32 0 - %e = extractelement <1 x float> %1, i32 0 - %s = call float @llvm.pow.f32(float %r,float %e) - %rv = insertelement <1 x float> undef, float %s, i32 0 - ret <1 x float> %rv - -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 76d1faf3..b581e0a7 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -202,22 +202,6 @@ declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone declare i32 @__count_leading_zeros_i32(i32) nounwind readnone declare i64 @__count_leading_zeros_i64(i64) nounwind readnone -;; svml - -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones twice with our 8-wide -; vectors... - -declare @__svml_sin() -declare @__svml_cos() -declare void @__svml_sincos(, *, *) -declare @__svml_tan() -declare @__svml_atan() -declare @__svml_atan2(, ) -declare @__svml_exp() -declare @__svml_log() -declare @__svml_pow(, ) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions diff --git a/builtins/target-neon-common.ll b/builtins/target-neon-common.ll index 696b0748..f892a0a1 100644 --- a/builtins/target-neon-common.ll +++ b/builtins/target-neon-common.ll @@ -313,19 +313,6 @@ define void @__masked_store_blend_i64(* nocapture %ptr, ret void } -;; yuck. We need declarations of these, even though we shouldnt ever -;; actually generate calls to them for the NEON target... 
- -declare @__svml_sin() -declare @__svml_cos() -declare void @__svml_sincos(, *, *) -declare @__svml_tan() -declare @__svml_atan() -declare @__svml_atan2(, ) -declare @__svml_exp() -declare @__svml_log() -declare @__svml_pow(, ) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index da22a66c..057ea98f 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -102,92 +102,6 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_sinf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_cosf4, %0) - ret <8 x float> %ret -} - -define void @__svml_sincos(<8 x float>, <8 x float> *, - <8 x float> *) nounwind readnone alwaysinline { - ; call svml_sincosf4 two times with the two 4-wide sub-vectors - %a = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - %b = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - - %cospa = alloca <4 x float> - %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) - - %cospb = alloca <4 x float> - %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) - - %sin = shufflevector <4 x float> %sa, <4 x float> %sb, - <8 x i32> - store <8 x float> %sin, <8 x float> * %1 - - %cosa = load <4 x float> * %cospa - %cosb = load <4 x float> * %cospb - %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, - <8 x i32> - store <8 x float> %cos, <8 x float> * %2 - - ret void -} - -define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_tanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_atanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan2(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_atan2f4, %0, %1) - ret <8 x float> %ret -} - -define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_expf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_logf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_pow(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_powf4, %0, %1) - ret <8 x float> %ret -} - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git 
a/builtins/target-sse2.ll b/builtins/target-sse2.ll index a6b206b6..e0a5c3d5 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -493,66 +493,6 @@ define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysin ret <4 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) - ret <4 x float> %ret -} - -define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { - %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) - store <4 x float> %s, <4 x float> * %1 - ret void -} - -define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - -define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_expf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_logf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index d1563988..50f0848d 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -205,21 +205,6 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r ret <8 x double> %ret } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; svml - -; FIXME - -declare <8 x float> @__svml_sin(<8 x float>) -declare <8 x float> @__svml_cos(<8 x float>) -declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) -declare <8 x float> @__svml_tan(<8 x float>) -declare <8 x float> @__svml_atan(<8 x float>) -declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) -declare <8 x float> @__svml_exp(<8 x float>) -declare <8 x float> @__svml_log(<8 x float>) -declare <8 x float> @__svml_pow(<8 x float>, <8 x 
float>) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index 85b7bbe7..7fa9075b 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -217,21 +217,6 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin ret <16 x double> %ret } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; svml - -; FIXME - -declare <16 x float> @__svml_sin(<16 x float>) -declare <16 x float> @__svml_cos(<16 x float>) -declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) -declare <16 x float> @__svml_tan(<16 x float>) -declare <16 x float> @__svml_atan(<16 x float>) -declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) -declare <16 x float> @__svml_exp(<16 x float>) -declare <16 x float> @__svml_log(<16 x float>) -declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index e2debbc2..4a447ba6 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -102,92 +102,6 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_sinf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_cosf4, %0) - ret <8 x float> %ret -} - -define void @__svml_sincos(<8 x float>, <8 x float> *, - <8 x float> *) nounwind readnone alwaysinline { - ; call svml_sincosf4 two times with the two 4-wide sub-vectors - %a = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - %b = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - - %cospa = alloca <4 x float> - %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) - - %cospb = alloca <4 x float> - %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) - - %sin = shufflevector <4 x float> %sa, <4 x float> %sb, - <8 x i32> - store <8 x float> %sin, <8 x float> * %1 - - %cosa = load <4 x float> * %cospa - %cosb = load <4 x float> * %cospb - %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, - <8 x i32> - store <8 x float> %cos, <8 x float> * %2 - - ret void -} - -define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_tanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_atanf4, 
%0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan2(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_atan2f4, %0, %1) - ret <8 x float> %ret -} - -define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_expf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_logf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_pow(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_powf4, %0, %1) - ret <8 x float> %ret -} - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 98a7ef69..7f9a9185 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -206,66 +206,6 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r ret <4 x double> %ret } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) - ret <4 x float> %ret -} - -define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { - %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) - store <4 x float> %s, <4 x float> * %1 - ret void -} - -define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - -define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_expf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_logf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/docs/ispc.rst b/docs/ispc.rst index 
39d3a5c8..af59714a 100755 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -3278,9 +3278,6 @@ for this argument. approximately 1.45e-6 over the range -10pi to 10pi.) * ``fast``: more efficient but lower accuracy versions of the default ``ispc`` implementations. -* ``svml``: use Intel "Short Vector Math Library". Use - ``icc`` to link your final executable so that the appropriate libraries - are linked. * ``system``: use the system's math library. On many systems, these functions are more accurate than both of ``ispc``'s implementations. Using these functions may be quite diff --git a/ispc.h b/ispc.h index d68f9034..8653553e 100644 --- a/ispc.h +++ b/ispc.h @@ -468,7 +468,7 @@ struct Globals { /** There are a number of math libraries that can be used for transcendentals and the like during program compilation. */ - enum MathLib { Math_ISPC, Math_ISPCFast, Math_SVML, Math_System }; + enum MathLib { Math_ISPC, Math_ISPCFast, Math_System }; MathLib mathLib; /** Records whether the ispc standard library should be made available diff --git a/main.cpp b/main.cpp index 4c4b4575..c21e7f88 100644 --- a/main.cpp +++ b/main.cpp @@ -107,7 +107,6 @@ usage(int ret) { printf(" [--math-lib= - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp - $(Configuration)/gen-bitcode-avx1-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx1-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > $(Configuration)/gen-bitcode-avx1-64bit.cpp - $(Configuration)/gen-bitcode-avx1-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx1-64bit.cpp - - -======= Document @@ -222,7 +203,6 @@ Building gen-bitcode-avx1-64bit.cpp ->>>>>>> master Document From ed017c42f1933ea1c57242f52cecb45507d9e324 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Sun, 11 Aug 2013 07:47:20 -0700 Subject: [PATCH 28/34] Fix ispc.vcxproj for Windows builds --- ispc.vcxproj | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/ispc.vcxproj b/ispc.vcxproj index 36fbad5d..74186ac0 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -53,8 +53,10 @@ - - + + + + 4146;4800;4996;4355;4624;4005;4003;4018 @@ -97,11 +99,13 @@ Document - %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py x86 > $(Configuration)/gen-stdlib-x86.cpp; -%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DISPC_TARGET_GENERIC=1 -DPI=3.1415926535 | python stdlib2cpp.py generic > $(Configuration)/gen-stdlib-generic.cpp; + %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask1 > $(Configuration)/gen-stdlib-mask1.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask8 > $(Configuration)/gen-stdlib-mask8.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask16 > $(Configuration)/gen-stdlib-mask16.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask32 > $(Configuration)/gen-stdlib-mask32.cpp; - 
$(Configuration)/gen-stdlib-generic.cpp;$(Configuration)/gen-stdlib-x86.cpp - Building gen-stdlib-{generic,x86}.cpp + $(Configuration)/gen-stdlib-mask1.cpp;$(Configuration)/gen-stdlib-mask8.cpp;$(Configuration)/gen-stdlib-mask16.cpp;$(Configuration)/gen-stdlib-mask32.cpp + Building gen-stdlib-{mask1,8,16,32}.cpp From 42f31aed6901f131cf20eb7606db498f43192012 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 14 Aug 2013 11:02:45 -0700 Subject: [PATCH 29/34] Another attempt at fixing the Windows build (added sse4-8/sse4-16 targets). --- ispc.vcxproj | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/ispc.vcxproj b/ispc.vcxproj index 74186ac0..b4a8b764 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -51,6 +51,10 @@ + + + + @@ -135,6 +139,42 @@ Building gen-bitcode-sse4-64bit.cpp + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 32bit > $(Configuration)/gen-bitcode-sse4-8-32bit.cpp + $(Configuration)/gen-bitcode-sse4-8-32bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-8-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 64bit > $(Configuration)/gen-bitcode-sse4-8-64bit.cpp + $(Configuration)/gen-bitcode-sse4-8-64bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-8-64bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 32bit > $(Configuration)/gen-bitcode-sse4-16-32bit.cpp + $(Configuration)/gen-bitcode-sse4-16-32bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-16-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 64bit > $(Configuration)/gen-bitcode-sse4-16-64bit.cpp + $(Configuration)/gen-bitcode-sse4-16-64bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-16-64bit.cpp + + Document From d976da7559089fa9bdc033ad764c73793ad34598 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 20 Aug 2013 08:49:51 -0700 Subject: [PATCH 30/34] Speed up idiv test (dont test int32 as thoroughly) --- tests/idiv.ispc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/idiv.ispc b/tests/idiv.ispc index b7bd78dc..8738740b 100644 --- a/tests/idiv.ispc +++ b/tests/idiv.ispc @@ -44,7 +44,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { // randomly sample int32s... uniform RNGState state; seed_rng(&state, 1234); - for (uniform int i = 0; i < 1M; ++i) { + for (uniform int i = 0; i < 64k; ++i) { unsigned int32 num = random(&state); for (uniform unsigned int32 div = 2; div < 256; ++div) { if (__fast_idiv(num, div) != num/div) { @@ -54,7 +54,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { } } - for (uniform int64 i = 0; i < 1M; ++i) { + for (uniform int64 i = 0; i < 64k; ++i) { int32 num = random(&state); if (num < 0) continue; From e7f067d70cf03415fc350272daf0506b7184fa84 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 20 Aug 2013 09:04:52 -0700 Subject: [PATCH 31/34] Fix handling of __clock() builtin for "generic" targets. 
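For the generic targets, code generation goes through the C++ backend, which did not handle the llvm.readcyclecounter intrinsic. The backend now lowers that intrinsic to a call to __clock(), and each of the intrinsics headers used with these targets (generic-16/32/64.h, knc.h, knc2x.h, sse4.h) gains a matching implementation: __rdtsc() on Windows, and otherwise a small inline-asm routine that issues cpuid first (to serialize the instruction stream so the timestamp read can't be reordered ahead of earlier work) followed by rdtsc. Roughly (the x86-64 variant is shown; the 32-bit build differs only in the cpuid clobber list):

    static FORCEINLINE uint64_t __clock() {
        uint32_t low, high;
        // cpuid is used here purely as a serializing instruction
        __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid"
                              ::: "%rax", "%rbx", "%rcx", "%rdx");
        __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high));
        return (uint64_t)high << 32 | low;
    }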
--- cbackend.cpp | 4 ++++ examples/intrinsics/generic-16.h | 20 ++++++++++++++++++++ examples/intrinsics/generic-32.h | 20 ++++++++++++++++++++ examples/intrinsics/generic-64.h | 20 ++++++++++++++++++++ examples/intrinsics/knc.h | 21 ++++++++++++++++++--- examples/intrinsics/knc2x.h | 19 ++++++++++++++++++- examples/intrinsics/sse4.h | 20 ++++++++++++++++++-- 7 files changed, 118 insertions(+), 6 deletions(-) diff --git a/cbackend.cpp b/cbackend.cpp index d23bcc20..7d4b4cfc 100644 --- a/cbackend.cpp +++ b/cbackend.cpp @@ -3704,6 +3704,7 @@ void CWriter::lowerIntrinsics(llvm::Function &F) { case llvm::Intrinsic::sadd_with_overflow: case llvm::Intrinsic::trap: case llvm::Intrinsic::objectsize: + case llvm::Intrinsic::readcyclecounter: // We directly implement these intrinsics break; default: @@ -4056,6 +4057,9 @@ bool CWriter::visitBuiltinCall(llvm::CallInst &I, llvm::Intrinsic::ID ID, return true; case llvm::Intrinsic::objectsize: return true; + case llvm::Intrinsic::readcyclecounter: + Out << "__clock()"; + return true; } } diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index 6d4fe1f4..d81101f7 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1759,3 +1759,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} + +#endif // !WIN32 + diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 12c4f84e..7e6c69d4 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -1827,3 +1827,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // WIN32 + +#undef FORCEINLINE diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index a3648f42..39124186 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -1960,3 +1960,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif + +#undef FORCEINLINE diff --git 
a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index 41c4cbc0..8baef8cb 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -2121,9 +2121,24 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, #endif } +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE #undef PRE_ALIGN #undef POST_ALIGN - - - diff --git a/examples/intrinsics/knc2x.h b/examples/intrinsics/knc2x.h index 5b6e5295..a1b1fc9d 100644 --- a/examples/intrinsics/knc2x.h +++ b/examples/intrinsics/knc2x.h @@ -2055,7 +2055,24 @@ static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec32_f *out0, __vec32 } */ +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE #undef PRE_ALIGN #undef POST_ALIGN - diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index 44dedf33..ff00d920 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -4000,6 +4000,22 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, #endif } +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE - - From 2b2905b567fec1725beff5064d6b0ffe21d93c38 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 20 Aug 2013 09:05:50 -0700 Subject: [PATCH 32/34] Fix (preexisting) bugs in generic-32/64.h with type of "__any", etc. This should be a bool, not a one-wide vector of bools. The equivalent fix was previously made in generic-16.h, but not made here. (Note that many tests are still failing with these targets, but at least they compile properly now.) 
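For reference, the corrected reductions in generic-32.h now read (generic-64.h is analogous, with the 64-wide mask type and a ull-suffixed all-ones constant):

    static FORCEINLINE bool __any(__vec32_i1 mask)  { return (mask.v != 0); }
    static FORCEINLINE bool __all(__vec32_i1 mask)  { return (mask.v == 0xFFFFFFFFul); }
    static FORCEINLINE bool __none(__vec32_i1 mask) { return (mask.v == 0); }

The stray int8/int16 REDUCE_ADD instantiations in these two headers, which still referenced the 16-wide vector types, are commented out as well.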
--- examples/intrinsics/generic-32.h | 12 ++++++------ examples/intrinsics/generic-64.h | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 7e6c69d4..531ed215 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -408,15 +408,15 @@ static FORCEINLINE uint64_t __movmsk(__vec32_i1 mask) { return (uint64_t)mask.v; } -static FORCEINLINE __vec32_i1 __any(__vec32_i1 mask) { +static FORCEINLINE bool __any(__vec32_i1 mask) { return (mask.v!=0); } -static FORCEINLINE __vec32_i1 __all(__vec32_i1 mask) { - return (mask.v==0xFFFFFFFF); +static FORCEINLINE bool __all(__vec32_i1 mask) { + return (mask.v==0xFFFFFFFFul); } -static FORCEINLINE __vec32_i1 __none(__vec32_i1 mask) { +static FORCEINLINE bool __none(__vec32_i1 mask) { return (mask.v==0); } @@ -1231,8 +1231,8 @@ REDUCE_ADD(double, __vec32_d, __reduce_add_double) REDUCE_MINMAX(double, __vec32_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec32_d, __reduce_max_double, >) -REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) -REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) +//REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +//REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) REDUCE_ADD(int64_t, __vec32_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_min_int32, <) diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index 39124186..bbeb007a 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -533,15 +533,15 @@ static FORCEINLINE uint64_t __movmsk(__vec64_i1 mask) { return (uint64_t)mask.v; } -static FORCEINLINE __vec64_i1 __any(__vec64_i1 mask) { +static FORCEINLINE bool __any(__vec64_i1 mask) { return (mask.v!=0); } -static FORCEINLINE __vec64_i1 __all(__vec64_i1 mask) { - return (mask.v==0xFFFFFFFFFFFFFFFF); +static FORCEINLINE bool __all(__vec64_i1 mask) { + return (mask.v==0xFFFFFFFFFFFFFFFFull); } -static FORCEINLINE __vec64_i1 __none(__vec64_i1 mask) { +static FORCEINLINE bool __none(__vec64_i1 mask) { return (mask.v==0); } @@ -1364,8 +1364,8 @@ REDUCE_ADD(double, __vec64_d, __reduce_add_double) REDUCE_MINMAX(double, __vec64_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec64_d, __reduce_max_double, >) -REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) -REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) +//REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +//REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) REDUCE_ADD(int64_t, __vec64_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec64_i32, __reduce_min_int32, <) From 502f8fd76b9cf88cd260106b546494c1facc28b4 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 20 Aug 2013 09:22:09 -0700 Subject: [PATCH 33/34] Reduce debug spew on failing idiv.ispc tests --- tests/idiv.ispc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/idiv.ispc b/tests/idiv.ispc index 8738740b..bd0766da 100644 --- a/tests/idiv.ispc +++ b/tests/idiv.ispc @@ -4,12 +4,13 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { uniform int errorCount = 0; - + for (unsigned int8 num = 0; num < 255; ++num) { for (uniform unsigned int8 div = 2; div < 255; ++div) { if (__fast_idiv(num, div) != num/div) { ++errorCount; print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 32) break; } } } @@ -19,6 +20,7 @@ export void f_f(uniform float RET[], uniform float 
aFOO[]) { if (__fast_idiv(num, div) != num/div) { ++errorCount; print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 64) break; } } } @@ -28,6 +30,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { if (__fast_idiv(num, div) != num/div) { ++errorCount; print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 96) break; } } } @@ -37,6 +40,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { if (__fast_idiv(num, div) != num/div) { ++errorCount; print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 128) break; } } } @@ -50,6 +54,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { if (__fast_idiv(num, div) != num/div) { ++errorCount; print("ui32 error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 160) break; } } } @@ -62,6 +67,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { if (__fast_idiv(num, div) != num/div) { ++errorCount; print("si32 error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 192) break; } } } From 611477e214f19e89657cd85252bb44e801573240 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 22 Aug 2013 07:50:25 -0700 Subject: [PATCH 34/34] Revert change to lEmitVaryingSelect(). Using vector select versus a store and masked load for varying vector selects seems to give worse code. This may be related to http://llvm.org/bugs/show_bug.cgi?id=16941. --- expr.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/expr.cpp b/expr.cpp index 856d363c..614cb5e5 100644 --- a/expr.cpp +++ b/expr.cpp @@ -3123,7 +3123,10 @@ static llvm::Value * lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test, llvm::Value *expr1, llvm::Value *expr2, const Type *type) { -#if !defined(LLVM_3_1) +#if 0 // !defined(LLVM_3_1) + // Though it should be equivalent, this seems to cause non-trivial + // performance regressions versus the below. This may be related to + // http://llvm.org/bugs/show_bug.cgi?id=16941. if (test->getType() != LLVMTypes::Int1VectorType) test = ctx->TruncInst(test, LLVMTypes::Int1VectorType); return ctx->SelectInst(test, expr1, expr2, "select");