From 0277ba1aaa8a3c2b9441b149942bbc9c0ed3be5d Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Tue, 23 Jul 2013 16:49:07 -0700
Subject: [PATCH 001/124] Improve warnings for right shift by varying amounts.

Fixes:
- Don't issue a warning when the shift is by the same amount in all vector
  lanes.
- Do issue a warning when it's a compile-time constant but the values are
  different in different lanes.

Previously, we warned iff the shift amount wasn't a compile-time constant.
---
 expr.cpp | 39 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

diff --git a/expr.cpp b/expr.cpp
index fc3d295a..894942d2 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -1911,6 +1911,40 @@ lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1,
 }
 
 
+/* Returns true if shifting right by the given amount will lead to
+   inefficient code.  (Assumes x86 target.  May also warn inaccurately if
+   later optimizations simplify the shift amount more than we are able to
+   see at this point.) */
+static bool
+lIsDifficultShiftAmount(Expr *expr) {
+    // Uniform shifts (of uniform values) are no problem.
+    if (expr->GetType()->IsVaryingType() == false)
+        return false;
+
+    ConstExpr *ce = dynamic_cast<ConstExpr *>(expr);
+    if (ce) {
+        // If the shift is by a constant amount, *and* it's the same amount
+        // in all vector lanes, we're in good shape.
+        uint32_t amount[ISPC_MAX_NVEC];
+        int count = ce->GetValues(amount);
+        for (int i = 1; i < count; ++i)
+            if (amount[i] != amount[0])
+                return true;
+        return false;
+    }
+
+    TypeCastExpr *tce = dynamic_cast<TypeCastExpr *>(expr);
+    if (tce && tce->expr) {
+        // Finally, if the shift amount is given by a uniform value that's
+        // been smeared out into a varying, we have the same shift for all
+        // lanes and are also in good shape.
+        return (tce->expr->GetType()->IsUniformType() == false);
+    }
+
+    return true;
+}
+
+
 llvm::Value *
 BinaryExpr::GetValue(FunctionEmitContext *ctx) const {
     if (!arg0 || !arg1) {
@@ -1951,9 +1985,8 @@ BinaryExpr::GetValue(FunctionEmitContext *ctx) const {
     case BitAnd:
     case BitXor:
     case BitOr: {
-        if (op == Shr && arg1->GetType()->IsVaryingType() &&
-            dynamic_cast<ConstExpr *>(arg1) == NULL)
-            PerformanceWarning(pos, "Shift right is extremely inefficient for "
+        if (op == Shr && lIsDifficultShiftAmount(arg1))
+            PerformanceWarning(pos, "Shift right is inefficient for "
                                "varying shift amounts.");
         return lEmitBinaryBitOp(op, value0, value1,
                                 arg0->GetType()->IsUnsignedType(), ctx);

From 83e1630fbcfde4aa67b50245cd96e36cbe033660 Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Tue, 23 Jul 2013 16:49:56 -0700
Subject: [PATCH 002/124] Add support for fast division of varying int values
 by small constants.

For varying int8/16/32 types, divides by small constants can be
implemented efficiently through multiplies and shifts with integer types
of twice the bit-width; this commit adds this optimization.
(Implementation is based on Halide.)
---
 expr.cpp        |  69 +++++
 stdlib.ispc     | 675 ++++++++++++++++++++++++++++++++++++++++++++++++
 tests/idiv.ispc |  75 ++++++
 3 files changed, 819 insertions(+)
 create mode 100644 tests/idiv.ispc

diff --git a/expr.cpp b/expr.cpp
index 894942d2..3baaabaf 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -2240,6 +2240,49 @@ lConstFoldBinaryIntOp(ConstExpr *constArg0, ConstExpr *constArg1,
 }
 
 
+/* Returns true if the given arguments (which are assumed to be the
+   operands of a divide) represent a divide that can be performed by one of
+   the __fast_idiv functions.
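+   (This only checks that the value being divided is a varying int8/16/32
+   and that the divisor is the same compile-time constant in every lane,
+   within the range the lookup tables cover; the multiply-and-shift
+   arithmetic itself is done by the __fast_idiv() overloads defined in
+   stdlib.ispc.)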
+ */ +static bool +lCanImproveVectorDivide(Expr *arg0, Expr *arg1, int *divisor) { + const Type *type = arg0->GetType(); + if (!type) + return false; + + // The value being divided must be an int8/16/32. + if (!(Type::EqualIgnoringConst(type, AtomicType::VaryingInt8) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt8) || + Type::EqualIgnoringConst(type, AtomicType::VaryingInt16) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt16) || + Type::EqualIgnoringConst(type, AtomicType::VaryingInt32) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt32))) + return false; + + // The divisor must be the same compile-time constant value for all of + // the vector lanes. + ConstExpr *ce = dynamic_cast(arg1); + if (!ce) + return false; + int64_t div[ISPC_MAX_NVEC]; + int count = ce->GetValues(div); + for (int i = 1; i < count; ++i) + if (div[i] != div[0]) + return false; + *divisor = div[0]; + + // And finally, the divisor must be >= 2 and <128 (for 8-bit divides), + // and <256 otherwise. + if (*divisor < 2) + return false; + if (Type::EqualIgnoringConst(type, AtomicType::VaryingInt8) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt8)) + return *divisor < 128; + else + return *divisor < 256; +} + + Expr * BinaryExpr::Optimize() { if (arg0 == NULL || arg1 == NULL) @@ -2302,6 +2345,32 @@ BinaryExpr::Optimize() { } } + int divisor; + if (op == Div && lCanImproveVectorDivide(arg0, arg1, &divisor)) { + Debug(pos, "Improving vector divide by constant %d", divisor); + + std::vector idivFuns; + m->symbolTable->LookupFunction("__fast_idiv", &idivFuns); + if (idivFuns.size() == 0) { + Warning(pos, "Couldn't find __fast_idiv to optimize integer divide. " + "Are you compiling with --nostdlib?"); + return this; + } + + Expr *idivSymExpr = new FunctionSymbolExpr("__fast_idiv", idivFuns, pos); + ExprList *args = new ExprList(arg0, pos); + args->exprs.push_back(new ConstExpr(AtomicType::UniformInt32, divisor, arg1->pos)); + Expr *idivCall = new FunctionCallExpr(idivSymExpr, args, pos); + + idivCall = ::TypeCheck(idivCall); + if (idivCall == NULL) + return NULL; + + Assert(Type::EqualIgnoringConst(GetType(), idivCall->GetType())); + idivCall = new TypeCastExpr(GetType(), idivCall, pos); + return ::Optimize(idivCall); + } + // From here on out, we're just doing constant folding, so if both args // aren't constants then we're done... if (constArg0 == NULL || constArg1 == NULL) diff --git a/stdlib.ispc b/stdlib.ispc index 4e06f5da..b8ed2057 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -4264,3 +4264,678 @@ static inline bool rdrand(int64 * ptr) { return success; } } + +/////////////////////////////////////////////////////////////////////////// +// Fast vector integer division + +/* These tables and the algorithms in the __fast_idiv() functions below are + from Halide; the idea is based on the paper "Division by Invariant + Integers using Multiplication" by Granlund and Montgomery. 
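+
+   Each __idiv_table_* entry below is indexed by (divisor - 2) and holds a
+   {method, multiplier, shift} triple that the __fast_idiv() overloads
+   after the tables interpret roughly as follows: method 0 means the
+   divisor is a power of two, so a plain shift (numerator >> shift)
+   suffices; method 1 multiplies the numerator by the magic constant in
+   double-width arithmetic and shifts right by (bit width + shift); and
+   method 2 adds a correction term after the high multiply before the
+   final shift.  For example, the unsigned 32-bit entry for dividing by 3
+   is {1, 2863311531, 1}, i.e. n / 3 == (n * 2863311531) >> 33.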
+ + Copyright (c) 2012 MIT CSAIL + + Developed by: + + The Halide team + MIT CSAIL + http://halide-lang.org + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +static const uniform int64 __idiv_table_u8[][3] = { + {0, 0LL, 1}, {1, 171LL, 1}, {0, 0LL, 2}, + {1, 205LL, 2}, {1, 171LL, 2}, {2, 37LL, 2}, + {0, 0LL, 3}, {1, 57LL, 1}, {1, 205LL, 3}, + {2, 117LL, 3}, {1, 171LL, 3}, {1, 79LL, 2}, + {2, 37LL, 3}, {1, 137LL, 3}, {0, 0LL, 4}, + {1, 241LL, 4}, {1, 57LL, 2}, {1, 27LL, 1}, + {1, 205LL, 4}, {2, 135LL, 4}, {2, 117LL, 4}, + {2, 101LL, 4}, {1, 171LL, 4}, {1, 41LL, 2}, + {1, 79LL, 3}, {1, 19LL, 1}, {2, 37LL, 4}, + {2, 27LL, 4}, {1, 137LL, 4}, {2, 9LL, 4}, + {0, 0LL, 5}, {1, 249LL, 5}, {1, 241LL, 5}, + {1, 235LL, 5}, {1, 57LL, 3}, {1, 111LL, 4}, + {1, 27LL, 2}, {2, 165LL, 5}, {1, 205LL, 5}, + {1, 25LL, 2}, {2, 135LL, 5}, {1, 191LL, 5}, + {1, 187LL, 5}, {2, 109LL, 5}, {2, 101LL, 5}, + {1, 175LL, 5}, {1, 171LL, 5}, {2, 79LL, 5}, + {1, 41LL, 3}, {1, 161LL, 5}, {1, 79LL, 4}, + {1, 155LL, 5}, {1, 19LL, 2}, {1, 149LL, 5}, + {2, 37LL, 5}, {1, 9LL, 1}, {2, 27LL, 5}, + {1, 139LL, 5}, {1, 137LL, 5}, {2, 13LL, 5}, + {2, 9LL, 5}, {2, 5LL, 5}, {0, 0LL, 6}, + {1, 253LL, 6}, {1, 249LL, 6}, {1, 245LL, 6}, + {1, 121LL, 5}, {1, 119LL, 5}, {1, 235LL, 6}, + {1, 231LL, 6}, {1, 57LL, 4}, {1, 225LL, 6}, + {1, 111LL, 5}, {1, 219LL, 6}, {1, 27LL, 3}, + {1, 213LL, 6}, {2, 165LL, 6}, {1, 13LL, 2}, + {1, 205LL, 6}, {1, 203LL, 6}, {1, 25LL, 3}, + {1, 99LL, 5}, {2, 135LL, 6}, {1, 193LL, 6}, + {1, 191LL, 6}, {1, 189LL, 6}, {1, 187LL, 6}, + {1, 185LL, 6}, {1, 183LL, 6}, {1, 181LL, 6}, + {1, 179LL, 6}, {1, 177LL, 6}, {1, 175LL, 6}, + {1, 173LL, 6}, {1, 171LL, 6}, {1, 169LL, 6}, + {1, 21LL, 3}, {1, 83LL, 5}, {1, 41LL, 4}, + {1, 163LL, 6}, {1, 161LL, 6}, {2, 63LL, 6}, + {1, 79LL, 5}, {2, 57LL, 6}, {1, 155LL, 6}, + {2, 51LL, 6}, {1, 19LL, 3}, {1, 151LL, 6}, + {1, 149LL, 6}, {1, 37LL, 4}, {2, 37LL, 6}, + {1, 145LL, 6}, {1, 9LL, 2}, {1, 143LL, 6}, + {2, 27LL, 6}, {2, 25LL, 6}, {1, 139LL, 6}, + {1, 69LL, 5}, {1, 137LL, 6}, {2, 15LL, 6}, + {2, 13LL, 6}, {2, 11LL, 6}, {2, 9LL, 6}, + {2, 7LL, 6}, {2, 5LL, 6}, {2, 3LL, 6}, + {0, 0LL, 7}, {1, 255LL, 7}, {1, 127LL, 6}, + {1, 63LL, 5}, {1, 125LL, 6}, {1, 31LL, 4}, + {1, 123LL, 6}, {1, 61LL, 5}, {1, 121LL, 6}, + {1, 15LL, 3}, {1, 119LL, 6}, {1, 59LL, 5}, + {1, 235LL, 7}, {1, 117LL, 6}, {1, 29LL, 4}, + {1, 115LL, 6}, {1, 57LL, 5}, {1, 113LL, 6}, + {1, 225LL, 7}, {1, 7LL, 2}, {1, 111LL, 6}, + {1, 55LL, 5}, {1, 219LL, 7}, {1, 109LL, 6}, + {1, 27LL, 4}, {1, 215LL, 7}, {1, 107LL, 6}, + {1, 53LL, 5}, 
{1, 211LL, 7}, {1, 105LL, 6}, + {1, 13LL, 3}, {1, 207LL, 7}, {1, 103LL, 6}, + {1, 51LL, 5}, {1, 203LL, 7}, {1, 101LL, 6}, + {1, 25LL, 4}, {1, 199LL, 7}, {1, 99LL, 6}, + {1, 197LL, 7}, {1, 49LL, 5}, {1, 97LL, 6}, + {1, 193LL, 7}, {1, 3LL, 1}, {1, 191LL, 7}, + {1, 95LL, 6}, {1, 189LL, 7}, {1, 47LL, 5}, + {1, 187LL, 7}, {1, 93LL, 6}, {1, 185LL, 7}, + {1, 23LL, 4}, {1, 183LL, 7}, {1, 91LL, 6}, + {1, 181LL, 7}, {1, 45LL, 5}, {1, 179LL, 7}, + {1, 89LL, 6}, {1, 177LL, 7}, {1, 11LL, 3}, + {1, 175LL, 7}, {1, 87LL, 6}, {1, 173LL, 7}, + {1, 43LL, 5}, {1, 171LL, 7}, {1, 85LL, 6}, + {1, 169LL, 7}, {2, 81LL, 7}, {1, 21LL, 4}, + {1, 167LL, 7}, {1, 83LL, 6}, {1, 165LL, 7}, + {1, 41LL, 5}, {2, 71LL, 7}, {1, 163LL, 7}, + {1, 81LL, 6}, {1, 161LL, 7}, {1, 5LL, 2}, + {2, 63LL, 7}, {1, 159LL, 7}, {1, 79LL, 6}, + {1, 157LL, 7}, {2, 57LL, 7}, {1, 39LL, 5}, + {1, 155LL, 7}, {1, 77LL, 6}, {2, 51LL, 7}, + {1, 153LL, 7}, {1, 19LL, 4}, {2, 47LL, 7}, + {1, 151LL, 7}, {1, 75LL, 6}, {1, 149LL, 7}, + {2, 41LL, 7}, {1, 37LL, 5}, {1, 147LL, 7}, + {2, 37LL, 7}, {1, 73LL, 6}, {1, 145LL, 7}, + {2, 33LL, 7}, {1, 9LL, 3}, {2, 31LL, 7}, + {1, 143LL, 7}, {1, 71LL, 6}, {2, 27LL, 7}, + {1, 141LL, 7}, {2, 25LL, 7}, {1, 35LL, 5}, + {1, 139LL, 7}, {2, 21LL, 7}, {1, 69LL, 6}, + {2, 19LL, 7}, {1, 137LL, 7}, {1, 17LL, 4}, + {2, 15LL, 7}, {1, 135LL, 7}, {2, 13LL, 7}, + {1, 67LL, 6}, {2, 11LL, 7}, {1, 133LL, 7}, + {2, 9LL, 7}, {1, 33LL, 5}, {2, 7LL, 7}, + {1, 131LL, 7}, {2, 5LL, 7}, {1, 65LL, 6}, + {2, 3LL, 7}, {1, 129LL, 7}, {0, 0LL, 8}, +}; +static const uniform int64 __idiv_table_s8[][3] = { + {0, 0LL, 1}, {1, 86LL, 0}, {0, 0LL, 2}, + {1, 103LL, 1}, {1, 43LL, 0}, {1, 147LL, 2}, + {0, 0LL, 3}, {1, 57LL, 1}, {1, 103LL, 2}, + {1, 187LL, 3}, {1, 43LL, 1}, {1, 79LL, 2}, + {1, 147LL, 3}, {1, 137LL, 3}, {0, 0LL, 4}, + {1, 121LL, 3}, {1, 57LL, 2}, {1, 27LL, 1}, + {1, 103LL, 3}, {1, 49LL, 2}, {1, 187LL, 4}, + {1, 179LL, 4}, {1, 43LL, 2}, {1, 41LL, 2}, + {1, 79LL, 3}, {1, 19LL, 1}, {1, 147LL, 4}, + {1, 71LL, 3}, {1, 137LL, 4}, {1, 133LL, 4}, + {0, 0LL, 5}, {1, 125LL, 4}, {1, 121LL, 4}, + {1, 59LL, 3}, {1, 57LL, 3}, {1, 111LL, 4}, + {1, 27LL, 2}, {1, 211LL, 5}, {1, 103LL, 4}, + {1, 25LL, 2}, {1, 49LL, 3}, {1, 6LL, 0}, + {1, 47LL, 3}, {1, 23LL, 2}, {1, 45LL, 3}, + {1, 11LL, 1}, {1, 43LL, 3}, {1, 21LL, 2}, + {1, 41LL, 3}, {1, 81LL, 4}, {1, 79LL, 4}, + {1, 39LL, 3}, {1, 19LL, 2}, {1, 75LL, 4}, + {1, 147LL, 5}, {1, 9LL, 1}, {1, 71LL, 4}, + {1, 35LL, 3}, {1, 137LL, 5}, {1, 135LL, 5}, + {1, 133LL, 5}, {1, 131LL, 5}, {0, 0LL, 6}, + {1, 127LL, 5}, {1, 63LL, 4}, {1, 31LL, 3}, + {1, 61LL, 4}, {1, 15LL, 2}, {1, 59LL, 4}, + {1, 29LL, 3}, {1, 57LL, 4}, {1, 113LL, 5}, + {1, 7LL, 1}, {1, 55LL, 4}, {1, 27LL, 3}, + {1, 107LL, 5}, {1, 53LL, 4}, {1, 13LL, 2}, + {1, 103LL, 5}, {1, 51LL, 4}, {1, 25LL, 3}, + {1, 99LL, 5}, {1, 49LL, 4}, {1, 97LL, 5}, + {1, 3LL, 0}, {1, 95LL, 5}, {1, 47LL, 4}, + {1, 93LL, 5}, {1, 23LL, 3}, {1, 91LL, 5}, + {1, 45LL, 4}, {1, 89LL, 5}, {1, 11LL, 2}, + {1, 87LL, 5}, {1, 43LL, 4}, {1, 85LL, 5}, + {1, 21LL, 3}, {1, 83LL, 5}, {1, 41LL, 4}, + {1, 163LL, 6}, {1, 81LL, 5}, {1, 5LL, 1}, + {1, 79LL, 5}, {1, 157LL, 6}, {1, 39LL, 4}, + {1, 77LL, 5}, {1, 19LL, 3}, {1, 151LL, 6}, + {1, 75LL, 5}, {1, 37LL, 4}, {1, 147LL, 6}, + {1, 73LL, 5}, {1, 9LL, 2}, {1, 143LL, 6}, + {1, 71LL, 5}, {1, 141LL, 6}, {1, 35LL, 4}, + {1, 69LL, 5}, {1, 137LL, 6}, {1, 17LL, 3}, + {1, 135LL, 6}, {1, 67LL, 5}, {1, 133LL, 6}, + {1, 33LL, 4}, {1, 131LL, 6}, {1, 65LL, 5}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, 
{0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, +}; +static const uniform int64 __idiv_table_u16[][3] = { + {0, 0LL, 1}, {1, 43691LL, 1}, {0, 0LL, 2}, + {1, 52429LL, 2}, {1, 43691LL, 2}, {2, 9363LL, 2}, + {0, 0LL, 3}, {1, 58255LL, 3}, {1, 52429LL, 3}, + {1, 47663LL, 3}, {1, 43691LL, 3}, {1, 20165LL, 2}, + {2, 9363LL, 3}, {1, 34953LL, 3}, {0, 0LL, 4}, + {1, 61681LL, 4}, {1, 58255LL, 4}, {1, 55189LL, 4}, + {1, 52429LL, 4}, {2, 34329LL, 4}, {1, 47663LL, 4}, + {2, 25645LL, 4}, {1, 43691LL, 4}, {2, 18351LL, 4}, + {1, 20165LL, 3}, {2, 12137LL, 4}, {2, 9363LL, 4}, + {1, 18079LL, 3}, {1, 34953LL, 4}, {2, 2115LL, 4}, + {0, 0LL, 5}, {1, 63551LL, 5}, {1, 61681LL, 5}, + {1, 59919LL, 5}, {1, 58255LL, 5}, {1, 7085LL, 2}, + {1, 55189LL, 5}, {2, 42011LL, 5}, {1, 52429LL, 5}, + {2, 36765LL, 5}, {2, 34329LL, 5}, {1, 48771LL, 5}, + {1, 47663LL, 5}, {1, 11651LL, 3}, {2, 25645LL, 5}, + {2, 23705LL, 5}, {1, 43691LL, 5}, {2, 20063LL, 5}, + {2, 18351LL, 5}, {1, 41121LL, 5}, {1, 20165LL, 4}, + {1, 39569LL, 5}, {2, 12137LL, 5}, {2, 10725LL, 5}, + {2, 9363LL, 5}, {2, 8049LL, 5}, {1, 18079LL, 4}, + {1, 35545LL, 5}, {1, 34953LL, 5}, {1, 8595LL, 3}, + {2, 2115LL, 5}, {2, 1041LL, 5}, {0, 0LL, 6}, + {1, 4033LL, 2}, {1, 63551LL, 6}, {1, 31301LL, 5}, + {1, 61681LL, 6}, {2, 56039LL, 6}, {1, 59919LL, 6}, + {1, 59075LL, 6}, {1, 58255LL, 6}, {1, 57457LL, 6}, + {1, 7085LL, 3}, {2, 46313LL, 6}, {1, 55189LL, 6}, + {1, 6809LL, 3}, {2, 42011LL, 6}, {1, 53093LL, 6}, + {1, 52429LL, 6}, {1, 25891LL, 5}, {2, 36765LL, 6}, + {1, 25267LL, 5}, {2, 34329LL, 6}, {1, 49345LL, 6}, + {1, 48771LL, 6}, {1, 48211LL, 6}, {1, 47663LL, 6}, + {2, 28719LL, 6}, {1, 11651LL, 4}, {2, 26647LL, 6}, + {2, 25645LL, 6}, {2, 24665LL, 6}, {2, 23705LL, 6}, + {1, 44151LL, 6}, {1, 43691LL, 6}, {2, 20945LL, 6}, + {2, 20063LL, 6}, {1, 42367LL, 6}, {2, 18351LL, 6}, + {1, 5191LL, 3}, {1, 41121LL, 6}, {1, 20361LL, 5}, + {1, 20165LL, 5}, {1, 19973LL, 5}, {1, 39569LL, 6}, + {2, 12863LL, 6}, {2, 
12137LL, 6}, {1, 2405LL, 2}, + {2, 10725LL, 6}, {1, 37787LL, 6}, {2, 9363LL, 6}, + {1, 18559LL, 5}, {2, 8049LL, 6}, {2, 7409LL, 6}, + {1, 18079LL, 5}, {1, 35849LL, 6}, {1, 35545LL, 6}, + {2, 4957LL, 6}, {1, 34953LL, 6}, {1, 4333LL, 3}, + {1, 8595LL, 4}, {2, 2665LL, 6}, {2, 2115LL, 6}, + {2, 1573LL, 6}, {2, 1041LL, 6}, {2, 517LL, 6}, + {0, 0LL, 7}, {1, 16257LL, 5}, {1, 4033LL, 3}, + {1, 16009LL, 5}, {1, 63551LL, 7}, {1, 63073LL, 7}, + {1, 31301LL, 6}, {1, 31069LL, 6}, {1, 61681LL, 7}, + {1, 61231LL, 7}, {2, 56039LL, 7}, {1, 30175LL, 6}, + {1, 59919LL, 7}, {1, 29747LL, 6}, {1, 59075LL, 7}, + {1, 29331LL, 6}, {1, 58255LL, 7}, {1, 57853LL, 7}, + {1, 57457LL, 7}, {1, 28533LL, 6}, {1, 7085LL, 4}, + {1, 14075LL, 5}, {2, 46313LL, 7}, {1, 27777LL, 6}, + {1, 55189LL, 7}, {1, 13707LL, 5}, {1, 6809LL, 4}, + {2, 42705LL, 7}, {2, 42011LL, 7}, {1, 53431LL, 7}, + {1, 53093LL, 7}, {1, 52759LL, 7}, {1, 52429LL, 7}, + {2, 38671LL, 7}, {1, 25891LL, 6}, {1, 6433LL, 4}, + {2, 36765LL, 7}, {2, 36145LL, 7}, {1, 25267LL, 6}, + {2, 34927LL, 7}, {2, 34329LL, 7}, {1, 49637LL, 7}, + {1, 49345LL, 7}, {2, 32577LL, 7}, {1, 48771LL, 7}, + {2, 31443LL, 7}, {1, 48211LL, 7}, {1, 47935LL, 7}, + {1, 47663LL, 7}, {2, 29251LL, 7}, {2, 28719LL, 7}, + {1, 2929LL, 3}, {1, 11651LL, 5}, {1, 23173LL, 6}, + {2, 26647LL, 7}, {1, 2865LL, 3}, {2, 25645LL, 7}, + {1, 1417LL, 2}, {2, 24665LL, 7}, {1, 44859LL, 7}, + {2, 23705LL, 7}, {2, 23233LL, 7}, {1, 44151LL, 7}, + {1, 2745LL, 3}, {1, 43691LL, 7}, {2, 21393LL, 7}, + {2, 20945LL, 7}, {1, 43019LL, 7}, {2, 20063LL, 7}, + {1, 21291LL, 6}, {1, 42367LL, 7}, {1, 21077LL, 6}, + {2, 18351LL, 7}, {1, 41735LL, 7}, {1, 5191LL, 4}, + {2, 17111LL, 7}, {1, 41121LL, 7}, {2, 16305LL, 7}, + {1, 20361LL, 6}, {1, 40525LL, 7}, {1, 20165LL, 6}, + {1, 40137LL, 7}, {1, 19973LL, 6}, {1, 39757LL, 7}, + {1, 39569LL, 7}, {2, 13231LL, 7}, {2, 12863LL, 7}, + {1, 39017LL, 7}, {2, 12137LL, 7}, {2, 11779LL, 7}, + {1, 2405LL, 3}, {2, 11073LL, 7}, {2, 10725LL, 7}, + {1, 18979LL, 6}, {1, 37787LL, 7}, {2, 9699LL, 7}, + {2, 9363LL, 7}, {1, 37283LL, 7}, {1, 18559LL, 6}, + {2, 8373LL, 7}, {2, 8049LL, 7}, {1, 4579LL, 4}, + {2, 7409LL, 7}, {2, 7093LL, 7}, {1, 18079LL, 6}, + {1, 36003LL, 7}, {1, 35849LL, 7}, {2, 5857LL, 7}, + {1, 35545LL, 7}, {1, 35395LL, 7}, {2, 4957LL, 7}, + {1, 35099LL, 7}, {1, 34953LL, 7}, {1, 4351LL, 4}, + {1, 4333LL, 4}, {2, 3507LL, 7}, {1, 8595LL, 5}, + {2, 2943LL, 7}, {2, 2665LL, 7}, {1, 16981LL, 6}, + {2, 2115LL, 7}, {2, 1843LL, 7}, {2, 1573LL, 7}, + {1, 33421LL, 7}, {2, 1041LL, 7}, {1, 33157LL, 7}, + {2, 517LL, 7}, {1, 32897LL, 7}, {0, 0LL, 8}, +}; +static const uniform int64 __idiv_table_s16[][3] = { + {0, 0LL, 1}, {1, 21846LL, 0}, {0, 0LL, 2}, + {1, 26215LL, 1}, {1, 10923LL, 0}, {1, 18725LL, 1}, + {0, 0LL, 3}, {1, 7282LL, 0}, {1, 26215LL, 2}, + {1, 5958LL, 0}, {1, 10923LL, 1}, {1, 20165LL, 2}, + {1, 18725LL, 2}, {1, 34953LL, 3}, {0, 0LL, 4}, + {1, 30841LL, 3}, {1, 3641LL, 0}, {1, 55189LL, 4}, + {1, 26215LL, 3}, {1, 49933LL, 4}, {1, 2979LL, 0}, + {1, 45591LL, 4}, {1, 10923LL, 2}, {1, 5243LL, 1}, + {1, 20165LL, 3}, {1, 38837LL, 4}, {1, 18725LL, 3}, + {1, 18079LL, 3}, {1, 34953LL, 4}, {1, 16913LL, 3}, + {0, 0LL, 5}, {1, 1986LL, 0}, {1, 30841LL, 4}, + {1, 3745LL, 1}, {1, 3641LL, 1}, {1, 7085LL, 2}, + {1, 55189LL, 5}, {1, 26887LL, 4}, {1, 26215LL, 4}, + {1, 51151LL, 5}, {1, 49933LL, 5}, {1, 12193LL, 3}, + {1, 2979LL, 1}, {1, 11651LL, 3}, {1, 45591LL, 5}, + {1, 44621LL, 5}, {1, 10923LL, 3}, {1, 2675LL, 1}, + {1, 5243LL, 2}, {1, 41121LL, 5}, {1, 20165LL, 4}, + {1, 19785LL, 4}, {1, 38837LL, 5}, {1, 38131LL, 5}, 
+ {1, 18725LL, 4}, {1, 36793LL, 5}, {1, 18079LL, 4}, + {1, 17773LL, 4}, {1, 34953LL, 5}, {1, 8595LL, 3}, + {1, 16913LL, 4}, {1, 33289LL, 5}, {0, 0LL, 6}, + {1, 4033LL, 2}, {1, 993LL, 0}, {1, 31301LL, 5}, + {1, 30841LL, 5}, {1, 15197LL, 4}, {1, 3745LL, 2}, + {1, 14769LL, 4}, {1, 3641LL, 2}, {1, 57457LL, 6}, + {1, 7085LL, 3}, {1, 55925LL, 6}, {1, 55189LL, 6}, + {1, 6809LL, 3}, {1, 26887LL, 5}, {1, 26547LL, 5}, + {1, 26215LL, 5}, {1, 25891LL, 5}, {1, 51151LL, 6}, + {1, 25267LL, 5}, {1, 49933LL, 6}, {1, 24673LL, 5}, + {1, 12193LL, 4}, {1, 48211LL, 6}, {1, 2979LL, 2}, + {1, 5891LL, 3}, {1, 11651LL, 4}, {1, 11523LL, 4}, + {1, 45591LL, 6}, {1, 45101LL, 6}, {1, 44621LL, 6}, + {1, 44151LL, 6}, {1, 10923LL, 4}, {1, 43241LL, 6}, + {1, 2675LL, 2}, {1, 662LL, 0}, {1, 5243LL, 3}, + {1, 5191LL, 3}, {1, 41121LL, 6}, {1, 20361LL, 5}, + {1, 20165LL, 5}, {1, 19973LL, 5}, {1, 19785LL, 5}, + {1, 1225LL, 1}, {1, 38837LL, 6}, {1, 2405LL, 2}, + {1, 38131LL, 6}, {1, 37787LL, 6}, {1, 18725LL, 5}, + {1, 18559LL, 5}, {1, 36793LL, 6}, {1, 36473LL, 6}, + {1, 18079LL, 5}, {1, 35849LL, 6}, {1, 17773LL, 5}, + {1, 35247LL, 6}, {1, 34953LL, 6}, {1, 4333LL, 3}, + {1, 8595LL, 4}, {1, 34101LL, 6}, {1, 16913LL, 5}, + {1, 33555LL, 6}, {1, 33289LL, 6}, {1, 33027LL, 6}, + {0, 0LL, 7}, {1, 16257LL, 5}, {1, 4033LL, 3}, + {1, 16009LL, 5}, {1, 993LL, 1}, {1, 31537LL, 6}, + {1, 31301LL, 6}, {1, 31069LL, 6}, {1, 30841LL, 6}, + {1, 3827LL, 3}, {1, 15197LL, 5}, {1, 30175LL, 6}, + {1, 3745LL, 3}, {1, 29747LL, 6}, {1, 14769LL, 5}, + {1, 29331LL, 6}, {1, 3641LL, 3}, {1, 28927LL, 6}, + {1, 57457LL, 7}, {1, 28533LL, 6}, {1, 7085LL, 4}, + {1, 14075LL, 5}, {1, 55925LL, 7}, {1, 27777LL, 6}, + {1, 55189LL, 7}, {1, 13707LL, 5}, {1, 6809LL, 4}, + {1, 54121LL, 7}, {1, 26887LL, 6}, {1, 6679LL, 4}, + {1, 26547LL, 6}, {1, 6595LL, 4}, {1, 26215LL, 6}, + {1, 6513LL, 4}, {1, 25891LL, 6}, {1, 6433LL, 4}, + {1, 51151LL, 7}, {1, 50841LL, 7}, {1, 25267LL, 6}, + {1, 6279LL, 4}, {1, 49933LL, 7}, {1, 24819LL, 6}, + {1, 24673LL, 6}, {1, 49057LL, 7}, {1, 12193LL, 5}, + {1, 24245LL, 6}, {1, 48211LL, 7}, {1, 749LL, 1}, + {1, 2979LL, 3}, {1, 23697LL, 6}, {1, 5891LL, 4}, + {1, 2929LL, 3}, {1, 11651LL, 5}, {1, 23173LL, 6}, + {1, 11523LL, 5}, {1, 2865LL, 3}, {1, 45591LL, 7}, + {1, 1417LL, 2}, {1, 45101LL, 7}, {1, 11215LL, 5}, + {1, 44621LL, 7}, {1, 44385LL, 7}, {1, 44151LL, 7}, + {1, 2745LL, 3}, {1, 10923LL, 5}, {1, 43465LL, 7}, + {1, 43241LL, 7}, {1, 43019LL, 7}, {1, 2675LL, 3}, + {1, 21291LL, 6}, {1, 331LL, 0}, {1, 21077LL, 6}, + {1, 5243LL, 4}, {1, 41735LL, 7}, {1, 5191LL, 4}, + {1, 10331LL, 5}, {1, 41121LL, 7}, {1, 40921LL, 7}, + {1, 20361LL, 6}, {1, 40525LL, 7}, {1, 20165LL, 6}, + {1, 20069LL, 6}, {1, 19973LL, 6}, {1, 39757LL, 7}, + {1, 19785LL, 6}, {1, 4923LL, 4}, {1, 1225LL, 2}, + {1, 39017LL, 7}, {1, 38837LL, 7}, {1, 19329LL, 6}, + {1, 2405LL, 3}, {1, 38305LL, 7}, {1, 38131LL, 7}, + {1, 18979LL, 6}, {1, 37787LL, 7}, {1, 18809LL, 6}, + {1, 18725LL, 6}, {1, 37283LL, 7}, {1, 18559LL, 6}, + {1, 36955LL, 7}, {1, 36793LL, 7}, {1, 4579LL, 4}, + {1, 36473LL, 7}, {1, 36315LL, 7}, {1, 18079LL, 6}, + {1, 36003LL, 7}, {1, 35849LL, 7}, {1, 35697LL, 7}, + {1, 17773LL, 6}, {1, 8849LL, 5}, {1, 35247LL, 7}, + {1, 35099LL, 7}, {1, 34953LL, 7}, {1, 4351LL, 4}, + {1, 4333LL, 4}, {1, 17261LL, 6}, {1, 8595LL, 5}, + {1, 535LL, 1}, {1, 34101LL, 7}, {1, 16981LL, 6}, + {1, 16913LL, 6}, {1, 16845LL, 6}, {1, 33555LL, 7}, + {1, 33421LL, 7}, {1, 33289LL, 7}, {1, 33157LL, 7}, + {1, 33027LL, 7}, {1, 32897LL, 7}, {1, 32769LL, 7}, +}; +static const uniform int64 __idiv_table_u32[][3] = { + {0, 
0LL, 1}, {1, 2863311531LL, 1}, {0, 0LL, 2}, + {1, 3435973837LL, 2}, {1, 2863311531LL, 2}, {2, 613566757LL, 2}, + {0, 0LL, 3}, {1, 954437177LL, 1}, {1, 3435973837LL, 3}, + {1, 3123612579LL, 3}, {1, 2863311531LL, 3}, {1, 1321528399LL, 2}, + {2, 613566757LL, 3}, {1, 2290649225LL, 3}, {0, 0LL, 4}, + {1, 4042322161LL, 4}, {1, 954437177LL, 2}, {2, 2938661835LL, 4}, + {1, 3435973837LL, 4}, {2, 2249744775LL, 4}, {1, 3123612579LL, 4}, + {1, 2987803337LL, 4}, {1, 2863311531LL, 4}, {1, 1374389535LL, 3}, + {1, 1321528399LL, 3}, {2, 795364315LL, 4}, {2, 613566757LL, 4}, + {1, 2369637129LL, 4}, {1, 2290649225LL, 4}, {2, 138547333LL, 4}, + {0, 0LL, 5}, {1, 1041204193LL, 3}, {1, 4042322161LL, 5}, + {2, 3558687189LL, 5}, {1, 954437177LL, 3}, {2, 3134165325LL, 5}, + {2, 2938661835LL, 5}, {2, 2753184165LL, 5}, {1, 3435973837LL, 5}, + {1, 3352169597LL, 5}, {2, 2249744775LL, 5}, {1, 799063683LL, 3}, + {1, 3123612579LL, 5}, {2, 1813430637LL, 5}, {1, 2987803337LL, 5}, + {1, 2924233053LL, 5}, {1, 2863311531LL, 5}, {1, 1402438301LL, 4}, + {1, 1374389535LL, 4}, {1, 2694881441LL, 5}, {1, 1321528399LL, 4}, + {2, 891408307LL, 5}, {2, 795364315LL, 5}, {2, 702812831LL, 5}, + {2, 613566757LL, 5}, {2, 527452125LL, 5}, {1, 2369637129LL, 5}, + {1, 582368447LL, 3}, {1, 2290649225LL, 5}, {1, 1126548799LL, 4}, + {2, 138547333LL, 5}, {2, 68174085LL, 5}, {0, 0LL, 6}, + {1, 4228890877LL, 6}, {1, 1041204193LL, 4}, {1, 128207979LL, 1}, + {1, 4042322161LL, 6}, {1, 1991868891LL, 5}, {2, 3558687189LL, 6}, + {1, 3871519817LL, 6}, {1, 954437177LL, 4}, {2, 3235934265LL, 6}, + {2, 3134165325LL, 6}, {1, 458129845LL, 3}, {2, 2938661835LL, 6}, + {1, 892460737LL, 4}, {2, 2753184165LL, 6}, {1, 3479467177LL, 6}, + {1, 3435973837LL, 6}, {1, 3393554407LL, 6}, {1, 3352169597LL, 6}, + {1, 827945503LL, 4}, {2, 2249744775LL, 6}, {1, 3233857729LL, 6}, + {1, 799063683LL, 4}, {1, 789879043LL, 4}, {1, 3123612579LL, 6}, + {1, 3088515809LL, 6}, {2, 1813430637LL, 6}, {2, 1746305385LL, 6}, + {1, 2987803337LL, 6}, {1, 2955676419LL, 6}, {1, 2924233053LL, 6}, + {2, 1491936009LL, 6}, {1, 2863311531LL, 6}, {2, 1372618415LL, 6}, + {1, 1402438301LL, 5}, {1, 2776544515LL, 6}, {1, 1374389535LL, 5}, + {2, 1148159575LL, 6}, {1, 2694881441LL, 6}, {2, 1042467791LL, 6}, + {1, 1321528399LL, 5}, {2, 940802361LL, 6}, {2, 891408307LL, 6}, + {2, 842937507LL, 6}, {2, 795364315LL, 6}, {2, 748664025LL, 6}, + {2, 702812831LL, 6}, {2, 657787785LL, 6}, {2, 613566757LL, 6}, + {2, 570128403LL, 6}, {2, 527452125LL, 6}, {2, 485518043LL, 6}, + {1, 2369637129LL, 6}, {2, 403800345LL, 6}, {1, 582368447LL, 4}, + {1, 1154949189LL, 5}, {1, 2290649225LL, 6}, {2, 248469183LL, 6}, + {1, 1126548799LL, 5}, {2, 174592167LL, 6}, {2, 138547333LL, 6}, + {1, 274877907LL, 3}, {2, 68174085LL, 6}, {2, 33818641LL, 6}, + {0, 0LL, 7}, {1, 266354561LL, 3}, {1, 4228890877LL, 7}, + {1, 4196609267LL, 7}, {1, 1041204193LL, 5}, {1, 4133502361LL, 7}, + {1, 128207979LL, 2}, {1, 4072265289LL, 7}, {1, 4042322161LL, 7}, + {1, 125400505LL, 2}, {1, 1991868891LL, 6}, {1, 1977538899LL, 6}, + {2, 3558687189LL, 7}, {1, 974744351LL, 5}, {1, 3871519817LL, 7}, + {1, 3844446251LL, 7}, {1, 954437177LL, 5}, {1, 3791419407LL, 7}, + {2, 3235934265LL, 7}, {1, 3739835469LL, 7}, {2, 3134165325LL, 7}, + {1, 3689636335LL, 7}, {1, 458129845LL, 4}, {1, 910191745LL, 5}, + {2, 2938661835LL, 7}, {1, 3593175255LL, 7}, {1, 892460737LL, 5}, + {1, 3546811703LL, 7}, {2, 2753184165LL, 7}, {1, 875407347LL, 5}, + {1, 3479467177LL, 7}, {2, 2620200175LL, 7}, {1, 3435973837LL, 7}, + {1, 3414632385LL, 7}, {1, 3393554407LL, 7}, {1, 3372735055LL, 7}, + {1, 
3352169597LL, 7}, {1, 1665926709LL, 6}, {1, 827945503LL, 5}, + {1, 1645975491LL, 6}, {2, 2249744775LL, 7}, {1, 1626496491LL, 6}, + {1, 3233857729LL, 7}, {2, 2134925265LL, 7}, {1, 799063683LL, 5}, + {2, 2060591247LL, 7}, {1, 789879043LL, 5}, {1, 1570730897LL, 6}, + {1, 3123612579LL, 7}, {2, 1916962805LL, 7}, {1, 3088515809LL, 7}, + {2, 1847555765LL, 7}, {2, 1813430637LL, 7}, {1, 3037324939LL, 7}, + {2, 1746305385LL, 7}, {1, 3004130131LL, 7}, {1, 2987803337LL, 7}, + {2, 1648338801LL, 7}, {1, 2955676419LL, 7}, {1, 2939870663LL, 7}, + {1, 2924233053LL, 7}, {2, 1522554545LL, 7}, {2, 1491936009LL, 7}, + {1, 2878302691LL, 7}, {1, 2863311531LL, 7}, {1, 356059465LL, 4}, + {2, 1372618415LL, 7}, {2, 1343553873LL, 7}, {1, 1402438301LL, 6}, + {2, 1286310003LL, 7}, {1, 2776544515LL, 7}, {1, 1381296015LL, 6}, + {1, 1374389535LL, 6}, {1, 42735993LL, 1}, {2, 1148159575LL, 7}, + {1, 2708156719LL, 7}, {1, 2694881441LL, 7}, {1, 1340867839LL, 6}, + {2, 1042467791LL, 7}, {1, 663956297LL, 5}, {1, 1321528399LL, 6}, + {1, 2630410593LL, 7}, {2, 940802361LL, 7}, {1, 2605477791LL, 7}, + {2, 891408307LL, 7}, {1, 2581013211LL, 7}, {2, 842937507LL, 7}, + {1, 1278501893LL, 6}, {2, 795364315LL, 7}, {2, 771906565LL, 7}, + {2, 748664025LL, 7}, {2, 725633745LL, 7}, {2, 702812831LL, 7}, + {2, 680198441LL, 7}, {2, 657787785LL, 7}, {2, 635578121LL, 7}, + {2, 613566757LL, 7}, {1, 2443359173LL, 7}, {2, 570128403LL, 7}, + {2, 548696263LL, 7}, {2, 527452125LL, 7}, {1, 1200340205LL, 6}, + {2, 485518043LL, 7}, {2, 464823301LL, 7}, {1, 2369637129LL, 7}, + {2, 423966729LL, 7}, {2, 403800345LL, 7}, {2, 383805589LL, 7}, + {1, 582368447LL, 5}, {2, 344322273LL, 7}, {1, 1154949189LL, 6}, + {1, 2300233531LL, 7}, {1, 2290649225LL, 7}, {1, 285143057LL, 4}, + {2, 248469183LL, 7}, {1, 2262369605LL, 7}, {1, 1126548799LL, 6}, + {2, 192835267LL, 7}, {2, 174592167LL, 7}, {2, 156496785LL, 7}, + {2, 138547333LL, 7}, {2, 120742053LL, 7}, {1, 274877907LL, 4}, + {1, 2190262207LL, 7}, {2, 68174085LL, 7}, {1, 2172947881LL, 7}, + {2, 33818641LL, 7}, {1, 2155905153LL, 7}, {0, 0LL, 8}, +}; +static const uniform int64 __idiv_table_s32[][3] = { + {0, 0LL, 1}, {1, 1431655766LL, 0}, {0, 0LL, 2}, + {1, 1717986919LL, 1}, {1, 715827883LL, 0}, {1, 2454267027LL, 2}, + {0, 0LL, 3}, {1, 954437177LL, 1}, {1, 1717986919LL, 2}, + {1, 780903145LL, 1}, {1, 715827883LL, 1}, {1, 1321528399LL, 2}, + {1, 2454267027LL, 3}, {1, 2290649225LL, 3}, {0, 0LL, 4}, + {1, 2021161081LL, 3}, {1, 954437177LL, 2}, {1, 1808407283LL, 3}, + {1, 1717986919LL, 3}, {1, 818089009LL, 2}, {1, 780903145LL, 2}, + {1, 2987803337LL, 4}, {1, 715827883LL, 2}, {1, 1374389535LL, 3}, + {1, 1321528399LL, 3}, {1, 1272582903LL, 3}, {1, 2454267027LL, 4}, + {1, 2369637129LL, 4}, {1, 2290649225LL, 4}, {1, 2216757315LL, 4}, + {0, 0LL, 5}, {1, 1041204193LL, 3}, {1, 2021161081LL, 4}, + {1, 3926827243LL, 5}, {1, 954437177LL, 3}, {1, 3714566311LL, 5}, + {1, 1808407283LL, 4}, {1, 3524075731LL, 5}, {1, 1717986919LL, 4}, + {1, 1676084799LL, 4}, {1, 818089009LL, 3}, {1, 799063683LL, 3}, + {1, 780903145LL, 3}, {1, 3054198967LL, 5}, {1, 2987803337LL, 5}, + {1, 2924233053LL, 5}, {1, 715827883LL, 3}, {1, 1402438301LL, 4}, + {1, 1374389535LL, 4}, {1, 2694881441LL, 5}, {1, 1321528399LL, 4}, + {1, 1296593901LL, 4}, {1, 1272582903LL, 4}, {1, 156180629LL, 1}, + {1, 2454267027LL, 5}, {1, 2411209711LL, 5}, {1, 2369637129LL, 5}, + {1, 582368447LL, 3}, {1, 2290649225LL, 5}, {1, 1126548799LL, 4}, + {1, 2216757315LL, 5}, {1, 2181570691LL, 5}, {0, 0LL, 6}, + {1, 2114445439LL, 5}, {1, 1041204193LL, 4}, {1, 128207979LL, 1}, + {1, 
2021161081LL, 5}, {1, 1991868891LL, 5}, {1, 3926827243LL, 6}, + {1, 3871519817LL, 6}, {1, 954437177LL, 4}, {1, 3765450781LL, 6}, + {1, 3714566311LL, 6}, {1, 458129845LL, 3}, {1, 1808407283LL, 5}, + {1, 892460737LL, 4}, {1, 3524075731LL, 6}, {1, 1739733589LL, 5}, + {1, 1717986919LL, 5}, {1, 424194301LL, 3}, {1, 1676084799LL, 5}, + {1, 827945503LL, 4}, {1, 818089009LL, 4}, {1, 1616928865LL, 5}, + {1, 799063683LL, 4}, {1, 789879043LL, 4}, {1, 780903145LL, 4}, + {1, 3088515809LL, 6}, {1, 3054198967LL, 6}, {1, 3020636341LL, 6}, + {1, 2987803337LL, 6}, {1, 738919105LL, 4}, {1, 2924233053LL, 6}, + {1, 2893451653LL, 6}, {1, 715827883LL, 4}, {1, 354224107LL, 3}, + {1, 1402438301LL, 5}, {1, 2776544515LL, 6}, {1, 1374389535LL, 5}, + {1, 680390859LL, 4}, {1, 2694881441LL, 6}, {1, 333589693LL, 3}, + {1, 1321528399LL, 5}, {1, 2617884829LL, 6}, {1, 1296593901LL, 5}, + {1, 1284476201LL, 5}, {1, 1272582903LL, 5}, {1, 2521815661LL, 6}, + {1, 156180629LL, 2}, {1, 2476377541LL, 6}, {1, 2454267027LL, 6}, + {1, 1216273925LL, 5}, {1, 2411209711LL, 6}, {1, 1195121335LL, 5}, + {1, 2369637129LL, 6}, {1, 2349383821LL, 6}, {1, 582368447LL, 4}, + {1, 1154949189LL, 5}, {1, 2290649225LL, 6}, {1, 70991195LL, 1}, + {1, 1126548799LL, 5}, {1, 558694933LL, 4}, {1, 2216757315LL, 6}, + {1, 274877907LL, 3}, {1, 2181570691LL, 6}, {1, 2164392969LL, 6}, + {0, 0LL, 7}, {1, 266354561LL, 3}, {1, 2114445439LL, 6}, + {1, 1049152317LL, 5}, {1, 1041204193LL, 5}, {1, 4133502361LL, 7}, + {1, 128207979LL, 2}, {1, 4072265289LL, 7}, {1, 2021161081LL, 6}, + {1, 125400505LL, 2}, {1, 1991868891LL, 6}, {1, 1977538899LL, 6}, + {1, 3926827243LL, 7}, {1, 974744351LL, 5}, {1, 3871519817LL, 7}, + {1, 961111563LL, 5}, {1, 954437177LL, 5}, {1, 3791419407LL, 7}, + {1, 3765450781LL, 7}, {1, 1869917735LL, 6}, {1, 3714566311LL, 7}, + {1, 230602271LL, 3}, {1, 458129845LL, 4}, {1, 910191745LL, 5}, + {1, 1808407283LL, 6}, {1, 3593175255LL, 7}, {1, 892460737LL, 5}, + {1, 443351463LL, 4}, {1, 3524075731LL, 7}, {1, 875407347LL, 5}, + {1, 1739733589LL, 6}, {1, 432197967LL, 4}, {1, 1717986919LL, 6}, + {1, 3414632385LL, 7}, {1, 424194301LL, 4}, {1, 210795941LL, 3}, + {1, 1676084799LL, 6}, {1, 1665926709LL, 6}, {1, 827945503LL, 5}, + {1, 1645975491LL, 6}, {1, 818089009LL, 5}, {1, 1626496491LL, 6}, + {1, 1616928865LL, 6}, {1, 3214946281LL, 7}, {1, 799063683LL, 5}, + {1, 397222409LL, 4}, {1, 789879043LL, 5}, {1, 1570730897LL, 6}, + {1, 780903145LL, 5}, {1, 3105965051LL, 7}, {1, 3088515809LL, 7}, + {1, 3071261531LL, 7}, {1, 3054198967LL, 7}, {1, 759331235LL, 5}, + {1, 3020636341LL, 7}, {1, 3004130131LL, 7}, {1, 2987803337LL, 7}, + {1, 2971653049LL, 7}, {1, 738919105LL, 5}, {1, 2939870663LL, 7}, + {1, 2924233053LL, 7}, {1, 2908760921LL, 7}, {1, 2893451653LL, 7}, + {1, 2878302691LL, 7}, {1, 715827883LL, 5}, {1, 356059465LL, 4}, + {1, 354224107LL, 4}, {1, 2819260585LL, 7}, {1, 1402438301LL, 6}, + {1, 1395319325LL, 6}, {1, 2776544515LL, 7}, {1, 1381296015LL, 6}, + {1, 1374389535LL, 6}, {1, 42735993LL, 1}, {1, 680390859LL, 5}, + {1, 2708156719LL, 7}, {1, 2694881441LL, 7}, {1, 1340867839LL, 6}, + {1, 333589693LL, 4}, {1, 663956297LL, 5}, {1, 1321528399LL, 6}, + {1, 2630410593LL, 7}, {1, 2617884829LL, 7}, {1, 81421181LL, 2}, + {1, 1296593901LL, 6}, {1, 2581013211LL, 7}, {1, 1284476201LL, 6}, + {1, 1278501893LL, 6}, {1, 1272582903LL, 6}, {1, 2533436931LL, 7}, + {1, 2521815661LL, 7}, {1, 2510300521LL, 7}, {1, 156180629LL, 3}, + {1, 2487582869LL, 7}, {1, 2476377541LL, 7}, {1, 2465272709LL, 7}, + {1, 2454267027LL, 7}, {1, 2443359173LL, 7}, {1, 1216273925LL, 6}, + {1, 605457945LL, 
5}, {1, 2411209711LL, 7}, {1, 1200340205LL, 6}, + {1, 1195121335LL, 6}, {1, 2379895299LL, 7}, {1, 2369637129LL, 7}, + {1, 2359467013LL, 7}, {1, 2349383821LL, 7}, {1, 2339386443LL, 7}, + {1, 582368447LL, 5}, {1, 2319644785LL, 7}, {1, 1154949189LL, 6}, + {1, 2300233531LL, 7}, {1, 2290649225LL, 7}, {1, 285143057LL, 4}, + {1, 70991195LL, 2}, {1, 2262369605LL, 7}, {1, 1126548799LL, 6}, + {1, 1121950641LL, 6}, {1, 558694933LL, 5}, {1, 2225732041LL, 7}, + {1, 2216757315LL, 7}, {1, 2207854675LL, 7}, {1, 274877907LL, 4}, + {1, 2190262207LL, 7}, {1, 2181570691LL, 7}, {1, 2172947881LL, 7}, + {1, 2164392969LL, 7}, {1, 2155905153LL, 7}, {1, 2147483649LL, 7}, +}; + +__declspec(safe) +static unmasked unsigned int8 __fast_idiv(unsigned int8 numerator, + uniform unsigned int8 divisor) { + uniform int64 method = __idiv_table_u8[divisor-2][0]; + uniform int64 multiplier = __idiv_table_u8[divisor-2][1]; + uniform int64 shift = __idiv_table_u8[divisor-2][2]; + + unsigned int16 mult = multiplier; + unsigned int16 val = numerator; + if (method == 0) + return numerator >> shift; + else if (method == 1) + return (val * mult) >> (8 + shift); + else { + val *= mult; + val >>= 8; + val += (numerator-val)>>1; + return (val >> shift); + } +} + +__declspec(safe) +static unmasked int8 __fast_idiv(int8 numerator, uniform int8 divisor) { + uniform int8 method = __idiv_table_s8[divisor-2][0]; + uniform int16 multiplier = __idiv_table_s8[divisor-2][1]; + uniform int8 shift = __idiv_table_s8[divisor-2][2]; + + if (method == 0) + return numerator >> shift; + else { + unsigned int8 sign = numerator >> 7; + numerator ^= sign; + int16 mul = (int16)numerator * (int16)multiplier; + mul >>= 8 + shift; + return (int8)mul ^ sign; + } +} + +__declspec(safe) +static unmasked unsigned int16 __fast_idiv(unsigned int16 numerator, + uniform unsigned int16 divisor) { + uniform int64 method = __idiv_table_u16[divisor-2][0]; + uniform int64 multiplier = __idiv_table_u16[divisor-2][1]; + uniform int64 shift = __idiv_table_u16[divisor-2][2]; + + unsigned int32 mult = multiplier; + unsigned int32 val = numerator; + if (method == 0) + return numerator >> shift; + else if (method == 1) + return (val * mult) >> (16 + shift); + else { + val *= mult; + val >>= 16; + val += (numerator-val)>>1; + return val >> shift; + } +} + +__declspec(safe) +static unmasked int16 __fast_idiv(int16 numerator, uniform int16 divisor) { + uniform int64 method = __idiv_table_s16[divisor-2][0]; + uniform int64 multiplier = __idiv_table_s16[divisor-2][1]; + uniform int64 shift = __idiv_table_s16[divisor-2][2]; + + if (method == 0) + return numerator >> shift; + else { + unsigned int16 sign = numerator >> 15; + numerator ^= sign; + int32 mul = (int32)numerator * (int32)multiplier; + mul >>= 16 + shift; + int16 result = mul; + return result ^ sign; + } +} + +__declspec(safe) +static unmasked inline unsigned int32 __fast_idiv(unsigned int32 numerator, + uniform unsigned int32 divisor) { + uniform int64 method = __idiv_table_u32[divisor-2][0]; + uniform int64 multiplier = __idiv_table_u32[divisor-2][1]; + uniform int64 shift = __idiv_table_u32[divisor-2][2]; + + unsigned int64 mult = multiplier; + unsigned int64 val = numerator; + if (method == 0) + return numerator >> shift; + else if (method == 1) + return (val * mult) >> (32 + shift); + else { + val *= mult; + val >>= 32; + val += (numerator-val)>>1; + return val >> shift; + } +} + +__declspec(safe) +static unmasked int32 __fast_idiv(int32 numerator, uniform int32 divisor) { + uniform int64 method = 
__idiv_table_s32[divisor-2][0]; + uniform int64 multiplier = __idiv_table_s32[divisor-2][1]; + uniform int64 shift = __idiv_table_s32[divisor-2][2]; + + if (method == 0) + return numerator >> shift; + else { + unsigned int32 sign = numerator >> 31; + numerator ^= sign; + int64 mul = (int64)numerator * (int64)multiplier; + mul >>= 32 + shift; + int32 result = mul; + return result ^ sign; + } +} + diff --git a/tests/idiv.ispc b/tests/idiv.ispc new file mode 100644 index 00000000..b7bd78dc --- /dev/null +++ b/tests/idiv.ispc @@ -0,0 +1,75 @@ + +export uniform int width() { return programCount; } + + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int errorCount = 0; + + for (unsigned int8 num = 0; num < 255; ++num) { + for (uniform unsigned int8 div = 2; div < 255; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + } + } + } + + for (int8 num = 0; num < 127; ++num) { + for (uniform int8 div = 2; div < 127; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + } + } + } + + for (int16 num = 0; num < 32767; ++num) { + for (uniform int16 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + } + } + } + + for (unsigned int16 num = 0; num < 0xffff; ++num) { + for (uniform unsigned int16 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + } + } + } + + // randomly sample int32s... + uniform RNGState state; + seed_rng(&state, 1234); + for (uniform int i = 0; i < 1M; ++i) { + unsigned int32 num = random(&state); + for (uniform unsigned int32 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("ui32 error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + } + } + } + + for (uniform int64 i = 0; i < 1M; ++i) { + int32 num = random(&state); + if (num < 0) + continue; + for (uniform int32 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("si32 error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + } + } + } + + RET[programIndex] = errorCount; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} + From e7abf3f2eacd50b0b8cb194fc87e878bdc25ddec Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 23 Jul 2013 16:38:10 -0700 Subject: [PATCH 003/124] Add support for mask vectors of 8 and 16-bit element types. There were a number of places throughout the system that assumed that the execution mask would only have either 32-bit or 1-bit elements. This commit makes it possible to have a target with an 8- or 16-bit mask. 
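
As a rough sketch of what this enables (assuming an 8-wide target built
with an 8-bit mask): the execution mask becomes an <8 x i8> vector whose
lanes are 0 (off) or -1 (on), and code that needs the mask at a different
element width converts it with a sign extension or truncation, e.g.

    %wide   = sext <8 x i8> %mask8 to <8 x i32>    ; widen an i8 mask
    %narrow = trunc <8 x i32> %mask32 to <8 x i8>  ; narrow an i32 mask

as the convertmask_* helpers added to builtins/util.m4 below do.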
--- Makefile | 29 ++++++--- builtins.cpp | 35 +++++++---- builtins/util.m4 | 161 ++++++++++++++++++++++++++++++++--------------- ctx.cpp | 26 +++----- expr.cpp | 36 +++++------ llvmutil.cpp | 73 +++++++++++++++++---- parse.yy | 20 +++++- stdlib.ispc | 37 ++++++----- 8 files changed, 284 insertions(+), 133 deletions(-) diff --git a/Makefile b/Makefile index 835f8e15..043ab4cf 100644 --- a/Makefile +++ b/Makefile @@ -137,7 +137,7 @@ BISON_SRC=parse.yy FLEX_SRC=lex.ll OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \ - stdlib_generic_ispc.o stdlib_x86_ispc.o \ + stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o \ $(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o)) default: ispc @@ -243,12 +243,23 @@ objs/builtins-c-64.cpp: builtins/builtins.c @echo Creating C++ source from builtins definition file $< @$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c 64 > $@ -objs/stdlib_generic_ispc.cpp: stdlib.ispc - @echo Creating C++ source from $< for generic - @$(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \ - python stdlib2cpp.py generic > $@ +objs/stdlib_mask1_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask1 + @$(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + python stdlib2cpp.py mask1 > $@ + +objs/stdlib_mask8_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask8 + @$(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + python stdlib2cpp.py mask8 > $@ + +objs/stdlib_mask16_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask16 + @$(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + python stdlib2cpp.py mask16 > $@ + +objs/stdlib_mask32_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask32 + @$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + python stdlib2cpp.py mask32 > $@ -objs/stdlib_x86_ispc.cpp: stdlib.ispc - @echo Creating C++ source from $< for x86 - @$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \ - python stdlib2cpp.py x86 > $@ diff --git a/builtins.cpp b/builtins.cpp index 3e03de10..d3bbaa6a 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -112,10 +112,7 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) { return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64; // varying - if (LLVMTypes::MaskType != LLVMTypes::Int32VectorType && - t == LLVMTypes::MaskType) - return AtomicType::VaryingBool; - else if (t == LLVMTypes::Int8VectorType) + if (t == LLVMTypes::Int8VectorType) return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8; else if (t == LLVMTypes::Int16VectorType) return intAsUnsigned ? AtomicType::VaryingUInt16 : AtomicType::VaryingInt16; @@ -127,6 +124,8 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) { return AtomicType::VaryingDouble; else if (t == LLVMTypes::Int64VectorType) return intAsUnsigned ? AtomicType::VaryingUInt64 : AtomicType::VaryingInt64; + else if (t == LLVMTypes::MaskType) + return AtomicType::VaryingBool; // pointers to uniform else if (t == LLVMTypes::Int8PointerType) @@ -1038,16 +1037,30 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // If the user wants the standard library to be included, parse the // serialized version of the stdlib.ispc file to get its // definitions added. 
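+    // Pick the serialized stdlib variant whose mask element width matches
+    // the target; the Makefile preprocesses stdlib.ispc once for each
+    // supported ISPC_MASK_BITS value (1, 8, 16, and 32).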
+ extern char stdlib_mask1_code[], stdlib_mask8_code[]; + extern char stdlib_mask16_code[], stdlib_mask32_code[]; if (g->target->getISA() == Target::GENERIC && - g->target->getVectorWidth() != 1) { // 1 wide uses x86 stdlib - extern char stdlib_generic_code[]; - yy_scan_string(stdlib_generic_code); - yyparse(); + g->target->getVectorWidth() == 1) { // 1 wide uses 32 stdlib + yy_scan_string(stdlib_mask32_code); } else { - extern char stdlib_x86_code[]; - yy_scan_string(stdlib_x86_code); - yyparse(); + switch (g->target->getMaskBitCount()) { + case 1: + yy_scan_string(stdlib_mask1_code); + break; + case 8: + yy_scan_string(stdlib_mask8_code); + break; + case 16: + yy_scan_string(stdlib_mask16_code); + break; + case 32: + yy_scan_string(stdlib_mask32_code); + break; + default: + FATAL("Unhandled mask bit size for stdlib.ispc"); + } } + yyparse(); } } diff --git a/builtins/util.m4 b/builtins/util.m4 index c19d4930..d6f3e5c3 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -690,6 +690,75 @@ shuffles(i64, 8) ;; $4: return type of the LLVM atomic type, in ispc naming paralance (e.g. int32) ;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...) +define(`mask_converts', ` +define internal <$1 x i8> @convertmask_i1_i8_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i1_i16_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i1_i32_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i1_i64_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i8_i8_$1(<$1 x i8>) { + ret <$1 x i8> %0 +} +define internal <$1 x i16> @convertmask_i8_i86_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i8_i32_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i8_i64_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i16_i8_$1(<$1 x i16>) { + %r = trunc <$1 x i16> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i16_i16_$1(<$1 x i16>) { + ret <$1 x i16> %0 +} +define internal <$1 x i32> @convertmask_i16_i32_$1(<$1 x i16>) { + %r = sext <$1 x i16> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i16_i64_$1(<$1 x i16>) { + %r = sext <$1 x i16> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i32_i8_$1(<$1 x i32>) { + %r = trunc <$1 x i32> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i32_i16_$1(<$1 x i32>) { + %r = trunc <$1 x i32> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i32_i32_$1(<$1 x i32>) { + ret <$1 x i32> %0 +} +define internal <$1 x i64> @convertmask_i32_i64_$1(<$1 x i32>) { + %r = sext <$1 x i32> %0 to <$1 x i64> + ret <$1 x i64> %r +} +') + +mask_converts(WIDTH) + define(`global_atomic_associative', ` define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val, @@ -697,17 +766,10 @@ define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val, ; first, for any lanes where the mask is off, compute a vector where those lanes ; hold the identity value.. 
- ; for the bit tricks below, we need the mask to be sign extended to be - ; the size of the element type. - ifelse( - MASK,i1,`%mask = sext <$1 x MASK> %m to <$1 x $3>', - $3,i64, `%mask = sext <$1 x MASK> %m to <$1 x i64>', - $3,i32, ` - ; silly workaround to do %mask = %m, which is not possible directly.. - %maskmem = alloca <$1 x i32> - store <$1 x i32> %m, <$1 x i32> * %maskmem - %mask = load <$1 x i32> * %maskmem' - ) + ; for the bit tricks below, we need the mask to have the + ; the same element size as the element type. + %mask = call <$1 x $3> @convertmask_`'MASK`'_$3_$1(<$1 x MASK> %m) + ; zero out any lanes that are off %valoff = and <$1 x $3> %val, %mask @@ -2440,13 +2502,12 @@ define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline { } define @__sext_varying_bool() nounwind readnone alwaysinline { - ifelse(MASK,i1, ` - %se = sext %0 to - ret %se - ', ` - ret %0') + ifelse(MASK,i32, `ret %0', + `%se = sext %0 to + ret %se') } + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; memcpy/memmove/memset @@ -3201,8 +3262,8 @@ return: ;; $1: llvm type of elements (and suffix for function name) define(`gen_masked_store', ` -define void @__masked_store_$1(* nocapture, , ) nounwind alwaysinline { - per_lane(WIDTH, %2, ` +define void @__masked_store_$1(* nocapture, , ) nounwind alwaysinline { + per_lane(WIDTH, %2, ` %ptr_LANE_ID = getelementptr * %0, i32 0, i32 LANE %storeval_LANE_ID = extractelement %1, i32 LANE store $1 %storeval_LANE_ID, $1 * %ptr_LANE_ID') @@ -3378,10 +3439,10 @@ define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, define(`packed_load_and_store', ` define i32 @__packed_load_active(i32 * %startptr, * %val_ptr, - %full_mask) nounwind alwaysinline { + %full_mask) nounwind alwaysinline { entry: - %mask = call i64 @__movmsk( %full_mask) - %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) + %mask = call i64 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: @@ -3432,10 +3493,10 @@ done: } define i32 @__packed_store_active(i32 * %startptr, %vals, - %full_mask) nounwind alwaysinline { + %full_mask) nounwind alwaysinline { entry: - %mask = call i64 @__movmsk( %full_mask) - %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) + %mask = call i64 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: @@ -3544,10 +3605,10 @@ check_neighbors: %castvr = call <$1 x $4> @__rotate_i$6(<$1 x $4> %castvec, i32 1) %vr = bitcast <$1 x $4> %castvr to <$1 x $2> %eq = $5 $7 <$1 x $2> %vec, %vr - ifelse(MASK,i32, ` - %eq32 = sext <$1 x i1> %eq to <$1 x i32> - %eqmm = call i64 @__movmsk(<$1 x i32> %eq32)', ` - %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)') + ifelse(MASK,i1, ` + %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)', + `%eqm = sext <$1 x i1> %eq to <$1 x MASK> + %eqmm = call i64 @__movmsk(<$1 x MASK> %eqm)') %alleq = icmp eq i64 %eqmm, ALL_ON_MASK br i1 %alleq, label %all_equal, label %not_all_equal ', ` @@ -3722,9 +3783,9 @@ pl_done: define(`gen_gather_general', ` ; fully general 32-bit gather, takes array of pointers encoded as vector of i32s define @__gather32_$1( %ptrs, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %ret_ptr = alloca - per_lane(WIDTH, %vecmask, ` + per_lane(WIDTH, %vecmask, ` %iptr_LANE_ID = extractelement %ptrs, i32 
LANE %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * %val_LANE_ID = load $1 * %ptr_LANE_ID @@ -3738,9 +3799,9 @@ define @__gather32_$1( %ptrs, ; fully general 64-bit gather, takes array of pointers encoded as vector of i32s define @__gather64_$1( %ptrs, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %ret_ptr = alloca - per_lane(WIDTH, %vecmask, ` + per_lane(WIDTH, %vecmask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * %val_LANE_ID = load $1 * %ptr_LANE_ID @@ -3804,7 +3865,7 @@ define @__gather_elt64_$1(i8 * %ptr, %offsets, i32 %o define @__gather_factored_base_offsets32_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing ; to require that the 0th element of the array being gathered from is always ; legal to read from (and we do indeed require that, given the benefits!) @@ -3813,13 +3874,13 @@ define @__gather_factored_base_offsets32_$1(i8 * %ptr, store zeroinitializer, * %offsetsPtr call void @__masked_store_blend_i32( * %offsetsPtr, %offsets, - %vecmask) + %vecmask) %newOffsets = load * %offsetsPtr %deltaPtr = alloca store zeroinitializer, * %deltaPtr call void @__masked_store_blend_i32( * %deltaPtr, %offset_delta, - %vecmask) + %vecmask) %newDelta = load * %deltaPtr %ret0 = call @__gather_elt32_$1(i8 * %ptr, %newOffsets, @@ -3835,7 +3896,7 @@ define @__gather_factored_base_offsets32_$1(i8 * %ptr, @__gather_factored_base_offsets64_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing ; to require that the 0th element of the array being gathered from is always ; legal to read from (and we do indeed require that, given the benefits!) 
@@ -3844,13 +3905,13 @@ define @__gather_factored_base_offsets64_$1(i8 * %ptr, store zeroinitializer, * %offsetsPtr call void @__masked_store_blend_i64( * %offsetsPtr, %offsets, - %vecmask) + %vecmask) %newOffsets = load * %offsetsPtr %deltaPtr = alloca store zeroinitializer, * %deltaPtr call void @__masked_store_blend_i64( * %deltaPtr, %offset_delta, - %vecmask) + %vecmask) %newDelta = load * %deltaPtr %ret0 = call @__gather_elt64_$1(i8 * %ptr, %newOffsets, @@ -3876,27 +3937,27 @@ gen_gather_factored($1) define @__gather_base_offsets32_$1(i8 * %ptr, i32 %offset_scale, %offsets, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %scale_vec = bitcast i32 %offset_scale to <1 x i32> %smear_scale = shufflevector <1 x i32> %scale_vec, <1 x i32> undef, < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > %scaled_offsets = mul %smear_scale, %offsets %v = call @__gather_factored_base_offsets32_$1(i8 * %ptr, %scaled_offsets, i32 1, - zeroinitializer, %vecmask) + zeroinitializer, %vecmask) ret %v } define @__gather_base_offsets64_$1(i8 * %ptr, i32 %offset_scale, %offsets, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %scale64 = zext i32 %offset_scale to i64 %scale_vec = bitcast i64 %scale64 to <1 x i64> %smear_scale = shufflevector <1 x i64> %scale_vec, <1 x i64> undef, < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > %scaled_offsets = mul %smear_scale, %offsets %v = call @__gather_factored_base_offsets64_$1(i8 * %ptr, %scaled_offsets, - i32 1, zeroinitializer, %vecmask) + i32 1, zeroinitializer, %vecmask) ret %v } @@ -3955,9 +4016,9 @@ define void @__scatter_elt64_$1(i8 * %ptr, %offsets, i32 %offset_s define void @__scatter_factored_base_offsets32_$1(i8* %base, %offsets, i32 %offset_scale, %offset_delta, %values, - %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... - per_lane(WIDTH, %mask, ` + per_lane(WIDTH, %mask, ` call void @__scatter_elt32_$1(i8 * %base, %offsets, i32 %offset_scale, %offset_delta, %values, i32 LANE)') ret void @@ -3965,9 +4026,9 @@ define void @__scatter_factored_base_offsets32_$1(i8* %base, %offs define void @__scatter_factored_base_offsets64_$1(i8* %base, %offsets, i32 %offset_scale, %offset_delta, %values, - %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... 
- per_lane(WIDTH, %mask, ` + per_lane(WIDTH, %mask, ` call void @__scatter_elt64_$1(i8 * %base, %offsets, i32 %offset_scale, %offset_delta, %values, i32 LANE)') ret void @@ -3975,8 +4036,8 @@ define void @__scatter_factored_base_offsets64_$1(i8* %base, %offs ; fully general 32-bit scatter, takes array of pointers encoded as vector of i32s define void @__scatter32_$1( %ptrs, %values, - %mask) nounwind alwaysinline { - per_lane(WIDTH, %mask, ` + %mask) nounwind alwaysinline { + per_lane(WIDTH, %mask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * %val_LANE_ID = extractelement %values, i32 LANE @@ -3987,8 +4048,8 @@ define void @__scatter32_$1( %ptrs, %values, ; fully general 64-bit scatter, takes array of pointers encoded as vector of i64s define void @__scatter64_$1( %ptrs, %values, - %mask) nounwind alwaysinline { - per_lane(WIDTH, %mask, ` + %mask) nounwind alwaysinline { + per_lane(WIDTH, %mask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * %val_LANE_ID = extractelement %values, i32 LANE diff --git a/ctx.cpp b/ctx.cpp index 1e79c97b..c50d22f9 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -1456,13 +1456,13 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) { for (unsigned int i = 0; i < at->getNumElements(); ++i) { llvm::Value *elt = ExtractInst(b, i); llvm::Value *sext = SExtInst(elt, LLVMTypes::BoolVectorType, - LLVMGetName(elt, "_to_boolvec32")); + LLVMGetName(elt, "_to_boolvec")); ret = InsertInst(ret, sext, i); } return ret; } else - return SExtInst(b, LLVMTypes::BoolVectorType, LLVMGetName(b, "_to_i32")); + return SExtInst(b, LLVMTypes::BoolVectorType, LLVMGetName(b, "_to_boolvec")); } @@ -2781,6 +2781,7 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, // Figure out if we need a 8, 16, 32 or 64-bit masked store. 
llvm::Function *maskedStoreFunc = NULL; + llvm::Type *llvmValueType = value->getType(); const PointerType *pt = CastType(valueType); if (pt != NULL) { @@ -2809,8 +2810,7 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, else maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64"); } - else if (Type::Equal(valueType, AtomicType::VaryingBool) && - g->target->getMaskBitCount() == 1) { + else if (llvmValueType == LLVMTypes::Int1VectorType) { llvm::Value *notMask = BinaryOperator(llvm::Instruction::Xor, mask, LLVMMaskAllOn, "~mask"); llvm::Value *old = LoadInst(ptr); @@ -2823,28 +2823,22 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, StoreInst(final, ptr); return; } - else if (Type::Equal(valueType, AtomicType::VaryingDouble)) { + else if (llvmValueType == LLVMTypes::DoubleVectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_double"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt64) || - Type::Equal(valueType, AtomicType::VaryingUInt64)) { + else if (llvmValueType == LLVMTypes::Int64VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64"); } - else if (Type::Equal(valueType, AtomicType::VaryingFloat)) { + else if (llvmValueType == LLVMTypes::FloatVectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_float"); } - else if (Type::Equal(valueType, AtomicType::VaryingBool) || - Type::Equal(valueType, AtomicType::VaryingInt32) || - Type::Equal(valueType, AtomicType::VaryingUInt32) || - CastType(valueType) != NULL) { + else if (llvmValueType == LLVMTypes::Int32VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i32"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt16) || - Type::Equal(valueType, AtomicType::VaryingUInt16)) { + else if (llvmValueType == LLVMTypes::Int16VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i16"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt8) || - Type::Equal(valueType, AtomicType::VaryingUInt8)) { + else if (llvmValueType == LLVMTypes::Int8VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i8"); } AssertPos(currentPos, maskedStoreFunc != NULL); diff --git a/expr.cpp b/expr.cpp index 3baaabaf..6bde2acb 100644 --- a/expr.cpp +++ b/expr.cpp @@ -6161,9 +6161,9 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) - // If we have a bool vector of i32 elements, first truncate - // down to a single bit + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) + // If we have a bool vector of non-i1 elements, first + // truncate down to a single bit. 
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); // And then do an unisgned int->float cast cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int @@ -6205,8 +6205,8 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) - // truncate i32 bool vector values to i1s + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) + // truncate bool vector values to i1s exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int to double exprVal, targetType, cOpName); @@ -6243,7 +6243,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6279,7 +6279,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6321,7 +6321,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6361,7 +6361,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6407,7 +6407,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6447,7 +6447,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6493,7 +6493,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != 
LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6531,7 +6531,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6625,12 +6625,12 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, if (fromType->IsUniformType()) { if (toType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) { - // extend out to i32 bool values from i1 here. then we'll - // turn into a vector below, the way it does for everyone - // else... + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) { + // extend out to an bool as an i8/i16/i32 from the i1 here. + // Then we'll turn that into a vector below, the way it + // does for everyone else... cast = ctx->SExtInst(cast, LLVMTypes::BoolVectorType->getElementType(), - LLVMGetName(cast, "to_i32bool")); + LLVMGetName(cast, "to_i_bool")); } } else diff --git a/llvmutil.cpp b/llvmutil.cpp index 26c18bf5..180c8676 100644 --- a/llvmutil.cpp +++ b/llvmutil.cpp @@ -115,13 +115,25 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0); LLVMTypes::DoublePointerType = llvm::PointerType::get(LLVMTypes::DoubleType, 0); - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: LLVMTypes::MaskType = LLVMTypes::BoolVectorType = llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.getVectorWidth()); - else { - Assert(target.getMaskBitCount() == 32); + break; + case 8: + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt8Ty(*ctx), target.getVectorWidth()); + break; + case 16: + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt16Ty(*ctx), target.getVectorWidth()); + break; + case 32: LLVMTypes::MaskType = LLVMTypes::BoolVectorType = llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.getVectorWidth()); + break; + default: + FATAL("Unhandled mask width for initializing MaskType"); } LLVMTypes::Int1VectorType = @@ -154,12 +166,26 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { std::vector maskOnes; llvm::Constant *onMask = NULL; - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: onMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 1, false /*unsigned*/); // 0x1 - else + break; + case 8: + onMask = llvm::ConstantInt::get(llvm::Type::getInt8Ty(*ctx), -1, + true /*signed*/); // 0xff + break; + case 16: + onMask = llvm::ConstantInt::get(llvm::Type::getInt16Ty(*ctx), -1, + true /*signed*/); // 0xffff + break; + case 32: onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1, true /*signed*/); // 0xffffffff + break; + default: + FATAL("Unhandled mask width for onMask"); + } for (int i = 0; i < target.getVectorWidth(); ++i) maskOnes.push_back(onMask); @@ -167,13 +193,26 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { std::vector maskZeros; llvm::Constant *offMask = NULL; - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: offMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 0, 
true /*signed*/); - else + break; + case 8: + offMask = llvm::ConstantInt::get(llvm::Type::getInt8Ty(*ctx), 0, + true /*signed*/); + break; + case 16: + offMask = llvm::ConstantInt::get(llvm::Type::getInt16Ty(*ctx), 0, + true /*signed*/); + break; + case 32: offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0, true /*signed*/); - + break; + default: + FATAL("Unhandled mask width for offMask"); + } for (int i = 0; i < target.getVectorWidth(); ++i) maskZeros.push_back(offMask); LLVMMaskAllOff = llvm::ConstantVector::get(maskZeros); @@ -444,9 +483,14 @@ LLVMBoolVector(bool b) { if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) v = llvm::ConstantInt::get(LLVMTypes::Int32Type, b ? 0xffffffff : 0, false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int16Type, b ? 0xffff : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int8VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int8Type, b ? 0xff : 0, + false /*unsigned*/); else { - Assert(LLVMTypes::BoolVectorType->getElementType() == - llvm::Type::getInt1Ty(*g->ctx)); + Assert(LLVMTypes::BoolVectorType == LLVMTypes::Int1VectorType); v = b ? LLVMTrue : LLVMFalse; } @@ -465,9 +509,14 @@ LLVMBoolVector(const bool *bvec) { if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) v = llvm::ConstantInt::get(LLVMTypes::Int32Type, bvec[i] ? 0xffffffff : 0, false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int16Type, bvec[i] ? 0xffff : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int8VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int8Type, bvec[i] ? 0xff : 0, + false /*unsigned*/); else { - Assert(LLVMTypes::BoolVectorType->getElementType() == - llvm::Type::getInt1Ty(*g->ctx)); + Assert(LLVMTypes::BoolVectorType == LLVMTypes::Int1VectorType); v = bvec[i] ? LLVMTrue : LLVMFalse; } diff --git a/parse.yy b/parse.yy index 3ad815cf..488c864a 100644 --- a/parse.yy +++ b/parse.yy @@ -2148,8 +2148,24 @@ lAddFunctionParams(Declarator *decl) { /** Add a symbol for the built-in mask variable to the symbol table */ static void lAddMaskToSymbolTable(SourcePos pos) { - const Type *t = g->target->getMaskBitCount() == 1 ? 
- AtomicType::VaryingBool : AtomicType::VaryingUInt32; + const Type *t; + switch (g->target->getMaskBitCount()) { + case 1: + t = AtomicType::VaryingBool; + break; + case 8: + t = AtomicType::VaryingUInt8; + break; + case 16: + t = AtomicType::VaryingUInt16; + break; + case 32: + t = AtomicType::VaryingUInt32; + break; + default: + FATAL("Unhandled mask bitsize in lAddMaskToSymbolTable"); + } + t = t->GetAsConstType(); Symbol *maskSymbol = new Symbol("__mask", pos, t); m->symbolTable->AddVariable(maskSymbol); diff --git a/stdlib.ispc b/stdlib.ispc index b8ed2057..8ad5aa49 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -38,12 +38,20 @@ ispc code */ -#ifdef ISPC_TARGET_GENERIC -#define IntMaskType bool -#define UIntMaskType bool +#if (ISPC_MASK_BITS == 1) + #define IntMaskType bool + #define UIntMaskType bool +#elif (ISPC_MASK_BITS == 8) + #define IntMaskType int8 + #define UIntMaskType unsigned int8 +#elif (ISPC_MASK_BITS == 16) + #define IntMaskType int16 + #define UIntMaskType unsigned int16 +#elif (ISPC_MASK_BITS == 32) + #define IntMaskType int32 + #define UIntMaskType unsigned int32 #else -#define IntMaskType int32 -#define UIntMaskType unsigned int32 + #error Unknown value of ISPC_MASK_BITS #endif /////////////////////////////////////////////////////////////////////////// @@ -335,14 +343,15 @@ static inline int32 sign_extend(bool v) { return __sext_varying_bool(v); } + __declspec(safe) static inline uniform bool any(bool v) { // We only care about whether "any" is true for the active program instances, // so we have to make v with the current program mask. -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __any(v & __mask); #else - return __any(__sext_varying_bool(v) & __mask); + return __any((UIntMaskType)__sext_varying_bool(v) & __mask); #endif } @@ -350,11 +359,10 @@ __declspec(safe) static inline uniform bool all(bool v) { // As with any(), we need to explicitly mask v with the current program mask // so we're only looking at the current lanes - -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __all(v | !__mask); #else - return __all(__sext_varying_bool(v) | !__mask); + return __all((UIntMaskType)__sext_varying_bool(v) | !__mask); #endif } @@ -362,11 +370,10 @@ __declspec(safe) static inline uniform bool none(bool v) { // As with any(), we need to explicitly mask v with the current program mask // so we're only looking at the current lanes - -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __none(v & __mask); #else - return __none(__sext_varying_bool(v) & __mask); + return __none((UIntMaskType)__sext_varying_bool(v) & __mask); #endif } @@ -399,10 +406,10 @@ static inline int popcnt(int64 v) { __declspec(safe) static inline uniform int popcnt(bool v) { // As with any() and all(), only count across the active lanes -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __popcnt_int64(__movmsk(v & __mask)); #else - return __popcnt_int64(__movmsk(__sext_varying_bool(v) & __mask)); + return __popcnt_int64(__movmsk((UIntMaskType)__sext_varying_bool(v) & __mask)); #endif } From 9ba49eabb21c7971f529fda25bad5fc1e84a6e3e Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 23 Jul 2013 16:52:43 -0700 Subject: [PATCH 004/124] Reduce estimated costs for 8 and 16-bit min() and max() in stdlib. These actually compile to a single instruction. 
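For context, the cost1/cost2 declspec numbers are annotations that feed ispc's internal cost estimates for stdlib routines. A minimal ispc sketch (not part of this patch; the helper name is made up) of the kind of 8-bit code whose estimate this change lowers:

    // Illustration only: each 8-bit min()/max() is expected to map to a
    // single SIMD min/max instruction on SSE4-class targets, which is what
    // the cost1 annotation reflects.
    static inline int8 clamp8(int8 v, uniform int8 lo, uniform int8 hi) {
        return min(max(v, lo), hi);
    }
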
--- stdlib.ispc | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/stdlib.ispc b/stdlib.ispc index 8ad5aa49..9a2b191f 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1332,88 +1332,88 @@ static inline uniform double max(uniform double a, uniform double b) { // int8 -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int8 min(uniform unsigned int8 a, uniform unsigned int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int8 max(uniform unsigned int8 a, uniform unsigned int8 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int8 min(uniform int8 a, uniform int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int8 max(uniform int8 a, uniform int8 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int8 min(unsigned int8 a, unsigned int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int8 max(unsigned int8 a, unsigned int8 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int8 min(int8 a, int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int8 max(int8 a, int8 b) { return (a > b) ? a : b; } // int16 -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int16 min(uniform unsigned int16 a, uniform unsigned int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int16 max(uniform unsigned int16 a, uniform unsigned int16 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int16 min(uniform int16 a, uniform int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int16 max(uniform int16 a, uniform int16 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int16 min(unsigned int16 a, unsigned int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int16 max(unsigned int16 a, unsigned int16 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int16 min(int16 a, int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int16 max(int16 a, int16 b) { return (a > b) ? a : b; } From f7f281a256c38c1986860baec81736fcb4f5b6d1 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 23 Jul 2013 17:01:03 -0700 Subject: [PATCH 005/124] Choose type for integer literals to match the target mask size (if possible). On a target with a 16-bit mask (for example), we would choose the type of an integer literal "1024" to be an int16. Previously, we used an int32, which is a worse fit and leads to less efficient code than an int16 on a 16-bit mask target. (However, we'd still give an integer literal 1000000 the type int32, even in a 16-bit target.) Updated the tests to still pass with 8 and 16-bit targets, given this change. 
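As an illustration (not from the patch; the kernel below is hypothetical), on a target whose mask elements are 16 bits wide, the unsuffixed literals here are now typed int16, so arithmetic on varying int16 values stays in 16-bit lanes instead of widening to int32:

    export void threshold(uniform int16 vals[], uniform int count) {
        foreach (i = 0 ... count) {
            int16 v = vals[i];
            if (v < 1024)          // 1024 fits in 16 bits, so it is an int16 constant here
                vals[i] = v + 100; // 100 is typed int16 as well; no widening to int32
            // A literal like 1000000 would still be typed int32, as before.
        }
    }
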
--- lex.ll | 27 +++++++- parse.yy | 23 ++++++- run_tests.py | 4 +- stdlib.ispc | 74 ++++++++++----------- tests/aossoa-1.ispc | 4 +- tests/aossoa-2.ispc | 4 +- tests/aossoa-5.ispc | 4 +- tests/aossoa-6.ispc | 4 +- tests/atomics-12.ispc | 4 +- tests/atomics-13.ispc | 2 +- tests/atomics-4.ispc | 4 +- tests/coalesce-1.ispc | 4 +- tests/coalesce-2.ispc | 4 +- tests/coalesce-3.ispc | 4 +- tests/coalesce-4.ispc | 4 +- tests/coalesce-5.ispc | 4 +- tests/coalesce-6.ispc | 4 +- tests/coalesce-7.ispc | 4 +- tests/coalesce-8.ispc | 4 +- tests/count-leading-trailing-zeros-1.ispc | 2 +- tests/count-leading-trailing-zeros-4.ispc | 2 +- tests/exclusive-scan-and-2.ispc | 4 +- tests/exclusive-scan-or-1.ispc | 4 +- tests/frexp-double-1.ispc | 2 +- tests/frexp-double.ispc | 2 +- tests/frexp-float-1.ispc | 2 +- tests/frexp-float.ispc | 2 +- tests/kilo-mega-giga-2.ispc | 2 +- tests/ldexp-double.ispc | 4 +- tests/ldexp-float.ispc | 4 +- tests/local-atomics-12.ispc | 4 +- tests/local-atomics-13.ispc | 2 +- tests/local-atomics-14.ispc | 4 +- tests/local-atomics-4.ispc | 4 +- tests/rand-distrib-1.ispc | 2 +- tests/sizeof-9.ispc | 2 +- tests/test-83.ispc | 2 +- tests/test-84.ispc | 2 +- tests/test-85.ispc | 2 +- tests_errors/array-plus-equals.ispc | 2 +- tests_errors/array-pointer-assign.ispc | 2 +- tests_errors/float-logical.ispc | 2 +- tests_errors/fptr-typecheck-2.ispc | 2 +- tests_errors/fptr-typecheck-3.ispc | 2 +- tests_errors/initexpr-2.ispc | 2 +- tests_errors/int-ptr-fail.ispc | 4 +- tests_errors/lvalue-2.ispc | 2 +- tests_errors/lvalue-3.ispc | 2 +- tests_errors/new-delete-3.ispc | 2 +- tests_errors/new-delete-6.ispc | 2 +- tests_errors/ptr-1.ispc | 2 +- tests_errors/ptr-const-1.ispc | 2 +- tests_errors/ptrcast-lose-info.ispc | 2 +- tests_errors/ref-3.ispc | 2 +- tests_errors/soa-11.ispc | 2 +- tests_errors/soa-12.ispc | 2 +- tests_errors/soa-3.ispc | 2 +- tests_errors/soa-4.ispc | 2 +- tests_errors/soa-9.ispc | 2 +- tests_errors/struct_arith.ispc | 2 +- tests_errors/vec-size-compile-constant.ispc | 2 +- 61 files changed, 166 insertions(+), 120 deletions(-) diff --git a/lex.ll b/lex.ll index f6633fce..8baa627a 100644 --- a/lex.ll +++ b/lex.ll @@ -77,6 +77,8 @@ static int allTokens[] = { TOKEN_UNSIGNED, TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE, TOKEN_STRING_C_LITERAL, TOKEN_DOTDOTDOT, TOKEN_FLOAT_CONSTANT, + TOKEN_INT8_CONSTANT, TOKEN_UINT8_CONSTANT, + TOKEN_INT16_CONSTANT, TOKEN_UINT16_CONSTANT, TOKEN_INT32_CONSTANT, TOKEN_UINT32_CONSTANT, TOKEN_INT64_CONSTANT, TOKEN_UINT64_CONSTANT, TOKEN_INC_OP, TOKEN_DEC_OP, TOKEN_LEFT_OP, TOKEN_RIGHT_OP, TOKEN_LE_OP, @@ -150,6 +152,10 @@ void ParserInit() { tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\""; tokenToName[TOKEN_DOTDOTDOT] = "..."; tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT"; + tokenToName[TOKEN_INT8_CONSTANT] = "TOKEN_INT8_CONSTANT"; + tokenToName[TOKEN_UINT8_CONSTANT] = "TOKEN_UINT8_CONSTANT"; + tokenToName[TOKEN_INT16_CONSTANT] = "TOKEN_INT16_CONSTANT"; + tokenToName[TOKEN_UINT16_CONSTANT] = "TOKEN_UINT16_CONSTANT"; tokenToName[TOKEN_INT32_CONSTANT] = "TOKEN_INT32_CONSTANT"; tokenToName[TOKEN_UINT32_CONSTANT] = "TOKEN_UINT32_CONSTANT"; tokenToName[TOKEN_INT64_CONSTANT] = "TOKEN_INT64_CONSTANT"; @@ -260,6 +266,10 @@ void ParserInit() { tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\""; tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'"; tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant"; + tokenNameRemap["TOKEN_INT8_CONSTANT"] = "int8 constant"; + tokenNameRemap["TOKEN_UINT8_CONSTANT"] = "unsigned int8 constant"; + 
tokenNameRemap["TOKEN_INT16_CONSTANT"] = "int16 constant"; + tokenNameRemap["TOKEN_UINT16_CONSTANT"] = "unsigned int16 constant"; tokenNameRemap["TOKEN_INT32_CONSTANT"] = "int32 constant"; tokenNameRemap["TOKEN_UINT32_CONSTANT"] = "unsigned int32 constant"; tokenNameRemap["TOKEN_INT64_CONSTANT"] = "int64 constant"; @@ -599,7 +609,22 @@ lParseInteger(bool dotdotdot) { } else { // No u or l suffix - // First, see if we can fit this into a 32-bit integer... + // If we're compiling to an 8-bit mask target and the constant + // fits into 8 bits, return an 8-bit int. + if (g->target->getMaskBitCount() == 8) { + if (yylval.intVal <= 0x7fULL) + return TOKEN_INT8_CONSTANT; + else if (yylval.intVal <= 0xffULL) + return TOKEN_UINT8_CONSTANT; + } + // And similarly for 16-bit masks and constants + if (g->target->getMaskBitCount() == 16) { + if (yylval.intVal <= 0x7fffULL) + return TOKEN_INT16_CONSTANT; + else if (yylval.intVal <= 0xffffULL) + return TOKEN_UINT16_CONSTANT; + } + // Otherwise, see if we can fit this into a 32-bit integer... if (yylval.intVal <= 0x7fffffffULL) return TOKEN_INT32_CONSTANT; else if (yylval.intVal <= 0xffffffffULL) diff --git a/parse.yy b/parse.yy index 488c864a..6ed2a43d 100644 --- a/parse.yy +++ b/parse.yy @@ -179,6 +179,8 @@ struct ForeachDimension { } +%token TOKEN_INT8_CONSTANT TOKEN_UINT8_CONSTANT +%token TOKEN_INT16_CONSTANT TOKEN_UINT16_CONSTANT %token TOKEN_INT32_CONSTANT TOKEN_UINT32_CONSTANT %token TOKEN_INT64_CONSTANT TOKEN_UINT64_CONSTANT %token TOKEN_INT32DOTDOTDOT_CONSTANT TOKEN_UINT32DOTDOTDOT_CONSTANT @@ -291,6 +293,22 @@ primary_expression Error(@1, "Undeclared symbol \"%s\".%s", name, alts.c_str()); } } + | TOKEN_INT8_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformInt8->GetAsConstType(), + (int8_t)yylval.intVal, @1); + } + | TOKEN_UINT8_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformUInt8->GetAsConstType(), + (uint8_t)yylval.intVal, @1); + } + | TOKEN_INT16_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformInt16->GetAsConstType(), + (int16_t)yylval.intVal, @1); + } + | TOKEN_UINT16_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformUInt16->GetAsConstType(), + (uint16_t)yylval.intVal, @1); + } | TOKEN_INT32_CONSTANT { $$ = new ConstExpr(AtomicType::UniformInt32->GetAsConstType(), (int32_t)yylval.intVal, @1); @@ -1233,7 +1251,10 @@ declarator ; int_constant - : TOKEN_INT32_CONSTANT { $$ = yylval.intVal; } + : TOKEN_INT8_CONSTANT { $$ = yylval.intVal; } + | TOKEN_INT16_CONSTANT { $$ = yylval.intVal; } + | TOKEN_INT32_CONSTANT { $$ = yylval.intVal; } + | TOKEN_INT64_CONSTANT { $$ = yylval.intVal; } ; direct_declarator diff --git a/run_tests.py b/run_tests.py index 7c6b1eb8..296db867 100755 --- a/run_tests.py +++ b/run_tests.py @@ -37,7 +37,7 @@ parser.add_option("-g", "--generics-include", dest="include_file", help="Filenam parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", default="") parser.add_option('-t', '--target', dest='target', - help='Set compilation target (neon, sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', + help='Set compilation target (neon, sse2, sse2-x2, sse4, sse4-x2, sse4-8, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', default="sse4") parser.add_option('-a', '--arch', dest='arch', help='Set architecture (arm, x86, x86-64)', @@ -294,7 +294,7 @@ def run_test(testname): firstline = firstline.rstrip() file.close() - if (output.find(firstline) == -1): + if re.search(firstline, output) == None: sys.stderr.write("Didn't see 
expected error message %s from test %s.\nActual output:\n%s\n" % \ (firstline, testname, output)) return (1, 0) diff --git a/stdlib.ispc b/stdlib.ispc index 9a2b191f..7e848481 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -3126,7 +3126,7 @@ static inline void __range_reduce_log(float input, varying float * uniform reduc static const int nonexponent_mask = 0x807FFFFF; // We want the reduced version to have an exponent of -1 which is -1 + 127 after biasing or 126 - static const int exponent_neg1 = (126 << 23); + static const int exponent_neg1 = (126l << 23); // NOTE(boulos): We don't need to mask anything out since we know // the sign bit has to be 0. If it's 1, we need to return infinity/nan // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN). @@ -3149,7 +3149,7 @@ static inline void __range_reduce_log(uniform float input, uniform float * unifo uniform int int_version = intbits(input); static const uniform int nonexponent_mask = 0x807FFFFF; - static const uniform int exponent_neg1 = (126 << 23); + static const uniform int exponent_neg1 = (126ul << 23); uniform int biased_exponent = int_version >> 23; uniform int offset_exponent = biased_exponent + 1; *exponent = offset_exponent - 127; // get the real value @@ -3647,18 +3647,18 @@ static inline uniform float half_to_float(uniform unsigned int16 h) { else { // https://gist.github.com/2144712 // Fabian "ryg" Giesen. - static const uniform unsigned int32 shifted_exp = 0x7c00 << 13; // exponent mask after shift + static const uniform unsigned int32 shifted_exp = 0x7c00ul << 13; // exponent mask after shift uniform int32 o = ((int32)(h & 0x7fff)) << 13; // exponent/mantissa bits uniform unsigned int32 exp = shifted_exp & o; // just the exponent - o += (127 - 15) << 23; // exponent adjust + o += (uniform int32)(127 - 15) << 23; // exponent adjust // handle exponent special cases if (exp == shifted_exp) // Inf/NaN? - o += (128 - 16) << 23; // extra exp adjust + o += (uniform unsigned int32)(128 - 16) << 23; // extra exp adjust else if (exp == 0) { // Zero/Denormal? - o += 1 << 23; // extra exp adjust - o = intbits(floatbits(o) - floatbits(113 << 23)); // renormalize + o += 1ul << 23; // extra exp adjust + o = intbits(floatbits(o) - floatbits(113ul << 23)); // renormalize } o |= ((int32)(h & 0x8000)) << 16; // sign bit @@ -3675,17 +3675,17 @@ static inline float half_to_float(unsigned int16 h) { // https://gist.github.com/2144712 // Fabian "ryg" Giesen. - const unsigned int32 shifted_exp = 0x7c00 << 13; // exponent mask after shift + const unsigned int32 shifted_exp = 0x7c00ul << 13; // exponent mask after shift - int32 o = ((int32)(h & 0x7fff)) << 13; // exponent/mantissa bits + int32 o = ((int32)(h & 0x7ffful)) << 13; // exponent/mantissa bits unsigned int32 exp = shifted_exp & o; // just the exponent - o += (127 - 15) << 23; // exponent adjust + o += (int32)(127 - 15) << 23; // exponent adjust - int32 infnan_val = o + ((128 - 16) << 23); - int32 zerodenorm_val = intbits(floatbits(o + (1<<23)) - floatbits(113 << 23)); + int32 infnan_val = o + ((int32)(128 - 16) << 23); + int32 zerodenorm_val = intbits(floatbits(o + (1ul<<23)) - floatbits(113ul << 23)); int32 reg_val = (exp == 0) ? zerodenorm_val : o; - int32 sign_bit = ((int32)(h & 0x8000)) << 16; + int32 sign_bit = ((int32)(h & 0x8000ul)) << 16; return floatbits(((exp == shifted_exp) ? 
infnan_val : reg_val) | sign_bit); } } @@ -3715,16 +3715,16 @@ static inline uniform int16 float_to_half(uniform float f) { // NaN->qNaN and Inf->Inf // unconditional assignment here, will override with right value for // the regular case below. - uniform int32 f32infty = 255 << 23; - o = (fint > f32infty) ? 0x7e00 : 0x7c00; + uniform int32 f32infty = 255ul << 23; + o = (fint > f32infty) ? 0x7e00u : 0x7c00u; // (De)normalized number or zero // update fint unconditionally to save the blending; we don't need it // anymore for the Inf/NaN case anyway. - const uniform unsigned int32 round_mask = ~0xfffu; - const uniform int32 magic = 15 << 23; - const uniform int32 f16infty = 31 << 23; + const uniform unsigned int32 round_mask = ~0xffful; + const uniform int32 magic = 15ul << 23; + const uniform int32 f16infty = 31ul << 23; uniform int32 fint2 = intbits(floatbits(fint & round_mask) * floatbits(magic)) - round_mask; fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed @@ -3761,16 +3761,16 @@ static inline int16 float_to_half(float f) { // NaN->qNaN and Inf->Inf // unconditional assignment here, will override with right value for // the regular case below. - int32 f32infty = 255 << 23; - o = (fint > f32infty) ? 0x7e00 : 0x7c00; + int32 f32infty = 255ul << 23; + o = (fint > f32infty) ? 0x7e00u : 0x7c00u; // (De)normalized number or zero // update fint unconditionally to save the blending; we don't need it // anymore for the Inf/NaN case anyway. - const unsigned int32 round_mask = ~0xfffu; - const int32 magic = 15 << 23; - const int32 f16infty = 31 << 23; + const unsigned int32 round_mask = ~0xffful; + const int32 magic = 15ul << 23; + const int32 f16infty = 31ul << 23; // Shift exponent down, denormalize if necessary. // NOTE This represents half-float denormals using single precision denormals. @@ -3789,7 +3789,7 @@ static inline int16 float_to_half(float f) { // FP16 denormals are rare in practice, I don't know. Whatever slow path your HW // may or may not have for denormals, this may well hit it. 
float fscale = floatbits(fint & round_mask) * floatbits(magic); - fscale = min(fscale, floatbits((31 << 23) - 0x1000)); + fscale = min(fscale, floatbits((31ul << 23) - 0x1000ul)); int32 fint2 = intbits(fscale) - round_mask; if (fint < f32infty) @@ -3956,7 +3956,7 @@ float_to_srgb8(float inval) // Do the table lookup and unpack bias, scale unsigned int tab = table[(intbits(inval) - 0x39000000u) >> 20]; unsigned int bias = (tab >> 16) << 9; - unsigned int scale = tab & 0xffff; + unsigned int scale = tab & 0xfffful; // Grab next-highest mantissa bits and perform linear interpolation unsigned int t = (intbits(inval) >> 12) & 0xff; @@ -4006,7 +4006,7 @@ float_to_srgb8(uniform float inval) // Do the table lookup and unpack bias, scale uniform unsigned int tab = table[(intbits(inval) - 0x39000000u) >> 20]; uniform unsigned int bias = (tab >> 16) << 9; - uniform unsigned int scale = tab & 0xffff; + uniform unsigned int scale = tab & 0xfffful; // Grab next-highest mantissa bits and perform linear interpolation uniform unsigned int t = (intbits(inval) >> 12) & 0xff; @@ -4053,14 +4053,14 @@ static inline uniform unsigned int random(uniform RNGState * uniform state) static inline float frandom(varying RNGState * uniform state) { unsigned int irand = random(state); - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; return floatbits(0x3F800000 | irand)-1.0f; } static inline uniform float frandom(uniform RNGState * uniform state) { uniform unsigned int irand = random(state); - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; return floatbits(0x3F800000 | irand)-1.0f; } @@ -4068,18 +4068,18 @@ static inline void seed_rng(varying RNGState * uniform state, unsigned int seed) { state->z1 = seed; state->z2 = seed ^ 0xbeeff00d; - state->z3 = ((seed & 0xffff) << 16) | (seed >> 16); - state->z4 = (((seed & 0xff) << 24) | ((seed & 0xff00) << 8) | - ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24); + state->z3 = ((seed & 0xfffful) << 16) | (seed >> 16); + state->z4 = (((seed & 0xfful) << 24) | ((seed & 0xff00ul) << 8) | + ((seed & 0xff0000ul) >> 8) | (seed & 0xff000000ul) >> 24); } static inline void seed_rng(uniform RNGState * uniform state, uniform unsigned int seed) { state->z1 = seed; state->z2 = seed ^ 0xbeeff00d; - state->z3 = ((seed & 0xffff) << 16) | (seed >> 16); - state->z4 = (((seed & 0xff) << 24) | ((seed & 0xff00) << 8) | - ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24); + state->z3 = ((seed & 0xfffful) << 16) | (seed >> 16); + state->z4 = (((seed & 0xfful) << 24) | ((seed & 0xff00ul) << 8) | + ((seed & 0xff0000ul) >> 8) | (seed & 0xff000000ul) >> 24); } @@ -4097,7 +4097,7 @@ static inline uniform bool rdrand(float * uniform ptr) { uniform int32 irand; uniform bool success = __rdrand_i32(&irand); if (success) { - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; *ptr = floatbits(0x3F800000 | irand)-1.0f; } return success; @@ -4117,7 +4117,7 @@ static inline bool rdrand(varying float * uniform ptr) { // in vector form. However, we need to be careful to not // clobber any existing already-set values in *ptr with // inactive lanes here... 
- irand &= (1<<23)-1; + irand &= (1ul<<23)-1; *ptr = floatbits(0x3F800000 | irand)-1.0f; success = true; } @@ -4137,7 +4137,7 @@ static inline bool rdrand(float * ptr) { foreach_active (index) { uniform int32 irand; if (__rdrand_i32(&irand)) { - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; *ptrs[index] = floatbits(0x3F800000 | irand)-1.0f; success = true; } diff --git a/tests/aossoa-1.ispc b/tests/aossoa-1.ispc index 59964d6d..32d3bcba 100644 --- a/tests/aossoa-1.ispc +++ b/tests/aossoa-1.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 3 -#define maxProgramCount 64 +#define width 3ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 3; diff --git a/tests/aossoa-2.ispc b/tests/aossoa-2.ispc index 9ff82226..df8eae5c 100644 --- a/tests/aossoa-2.ispc +++ b/tests/aossoa-2.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 4 -#define maxProgramCount 64 +#define width 4ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 4; diff --git a/tests/aossoa-5.ispc b/tests/aossoa-5.ispc index eb4fed3a..d6346455 100644 --- a/tests/aossoa-5.ispc +++ b/tests/aossoa-5.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 3 -#define maxProgramCount 64 +#define width 3ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 3; diff --git a/tests/aossoa-6.ispc b/tests/aossoa-6.ispc index b64cd10b..7c177fde 100644 --- a/tests/aossoa-6.ispc +++ b/tests/aossoa-6.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 4 -#define maxProgramCount 64 +#define width 4ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 4; diff --git a/tests/atomics-12.ispc b/tests/atomics-12.ispc index c27ad99c..d6359555 100644 --- a/tests/atomics-12.ispc +++ b/tests/atomics-12.ispc @@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; float b = 0; if (programIndex < 30 && programIndex & 1) - b = atomic_or_global(&s, (1 << programIndex)); + b = atomic_or_global(&s, (1ul << programIndex)); RET[programIndex] = s; } @@ -15,6 +15,6 @@ export void result(uniform float RET[]) { uniform int sum = 0; for (uniform int i = 0; i < min(30, programCount); ++i) if (i & 1) - sum += (1 << i); + sum += (1ul << i); RET[programIndex] = sum; } diff --git a/tests/atomics-13.ispc b/tests/atomics-13.ispc index 86faaddb..dea3bfc3 100644 --- a/tests/atomics-13.ispc +++ b/tests/atomics-13.ispc @@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; int32 b = 0; if (programIndex < 32 && programIndex & 1) - b = atomic_or_global(&s, (1 << programIndex)); + b = atomic_or_global(&s, (1ul << programIndex)); RET[programIndex] = popcnt(reduce_max((int32)b)); } diff --git a/tests/atomics-4.ispc b/tests/atomics-4.ispc index 30b343d1..ac746ad2 100644 --- a/tests/atomics-4.ispc +++ b/tests/atomics-4.ispc @@ -5,10 +5,10 @@ uniform int32 s = 0; export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; - float b = atomic_or_global(&s, (1<> 2) * 16 + (programIndex & 3)]; diff --git a/tests/coalesce-4.ispc b/tests/coalesce-4.ispc index 
1ddd4b89..182a4d4f 100644 --- a/tests/coalesce-4.ispc +++ b/tests/coalesce-4.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[2*programIndex]; diff --git a/tests/coalesce-5.ispc b/tests/coalesce-5.ispc index 2dd8d44e..385e8526 100644 --- a/tests/coalesce-5.ispc +++ b/tests/coalesce-5.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[4*programIndex]; diff --git a/tests/coalesce-6.ispc b/tests/coalesce-6.ispc index 2a54a2db..8c630a45 100644 --- a/tests/coalesce-6.ispc +++ b/tests/coalesce-6.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[4*programIndex]; diff --git a/tests/coalesce-7.ispc b/tests/coalesce-7.ispc index 8ed628bd..29b56b8d 100644 --- a/tests/coalesce-7.ispc +++ b/tests/coalesce-7.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[4*programIndex]; diff --git a/tests/coalesce-8.ispc b/tests/coalesce-8.ispc index dfefaa19..f01ca9c3 100644 --- a/tests/coalesce-8.ispc +++ b/tests/coalesce-8.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; int index = (programIndex < 4) ? 
(programIndex & 1) : diff --git a/tests/count-leading-trailing-zeros-1.ispc b/tests/count-leading-trailing-zeros-1.ispc index 221d066d..3f12c07d 100644 --- a/tests/count-leading-trailing-zeros-1.ispc +++ b/tests/count-leading-trailing-zeros-1.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - RET[programIndex] = count_trailing_zeros(0xf0); + RET[programIndex] = count_trailing_zeros(0xf0ul); } export void result(uniform float RET[]) { diff --git a/tests/count-leading-trailing-zeros-4.ispc b/tests/count-leading-trailing-zeros-4.ispc index 475c18ca..4b849018 100644 --- a/tests/count-leading-trailing-zeros-4.ispc +++ b/tests/count-leading-trailing-zeros-4.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - int32 i = (1 << (programIndex % 28)); + int32 i = (1ul << (programIndex % 28)); RET[programIndex] = count_leading_zeros(i); } diff --git a/tests/exclusive-scan-and-2.ispc b/tests/exclusive-scan-and-2.ispc index 5d2bcd1f..b742a91e 100644 --- a/tests/exclusive-scan-and-2.ispc +++ b/tests/exclusive-scan-and-2.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { RET[programIndex] = -1; - int32 a = ~(1 << programIndex); + int32 a = ~(1ul << programIndex); if ((programIndex < 32) && (programIndex & 1) == 0) { RET[programIndex] = exclusive_scan_and(a); } @@ -15,7 +15,7 @@ export void result(uniform float RET[]) { if ((programIndex & 1) == 0 && programIndex > 0 && programIndex < 32) { int val = 0xffffffff; for (int i = 0; i < programIndex-1; i += 2) - val &= ~(1<>20); } @@ -15,6 +15,6 @@ export void result(uniform float RET[]) { uniform int sum = 0; for (uniform int i = 0; i < min(32, programCount); ++i) if (i & 1) - sum += (1 << i); + sum += (1ul << i); RET[programIndex] = ((unsigned int64)(0xffffffffff000000 | sum)) >> 20; } diff --git a/tests/local-atomics-4.ispc b/tests/local-atomics-4.ispc index f7f6a04a..b3648ab5 100644 --- a/tests/local-atomics-4.ispc +++ b/tests/local-atomics-4.ispc @@ -7,10 +7,10 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; float b = 0; if (programIndex < 29) - atomic_or_local(&s, (1< struct Foo" for assignment operator is not possible +// Type conversion from "const uniform int[0-9]*" to "soa<4> struct Foo" for assignment operator is not possible struct Pt { float x, y, z; }; diff --git a/tests_errors/soa-12.ispc b/tests_errors/soa-12.ispc index e2cd3242..c0420614 100644 --- a/tests_errors/soa-12.ispc +++ b/tests_errors/soa-12.ispc @@ -1,4 +1,4 @@ -// Can't convert between types "const uniform int32" and "soa<4> float" with different SOA widths +// Can't convert between types "const uniform int[0-9]*" and "soa<4> float" with different SOA widths struct Pt { float x, y, z; }; diff --git a/tests_errors/soa-3.ispc b/tests_errors/soa-3.ispc index b2be1b59..04dc84bc 100644 --- a/tests_errors/soa-3.ispc +++ b/tests_errors/soa-3.ispc @@ -1,4 +1,4 @@ -// syntax error, unexpected '-', expecting int32 constant +// syntax error, unexpected '-', expecting int struct F { float a, b, c; }; diff --git a/tests_errors/soa-4.ispc b/tests_errors/soa-4.ispc index b2be1b59..04dc84bc 100644 --- a/tests_errors/soa-4.ispc +++ b/tests_errors/soa-4.ispc @@ -1,4 +1,4 @@ -// syntax error, unexpected '-', expecting int32 constant +// syntax error, unexpected '-', expecting int struct F { float a, b, c; }; 
diff --git a/tests_errors/soa-9.ispc b/tests_errors/soa-9.ispc index 7c6a1df9..e9e7509a 100644 --- a/tests_errors/soa-9.ispc +++ b/tests_errors/soa-9.ispc @@ -1,4 +1,4 @@ -// Can't convert from pointer to SOA type "soa<8> struct A * uniform" to pointer to non-SOA type "void * varying" +// Can't convert from pointer to SOA type "soa<8> struct A \* uniform" to pointer to non-SOA type "void \* varying" struct A { float a, b; }; diff --git a/tests_errors/struct_arith.ispc b/tests_errors/struct_arith.ispc index 9d942880..df729d02 100644 --- a/tests_errors/struct_arith.ispc +++ b/tests_errors/struct_arith.ispc @@ -1,4 +1,4 @@ -// Assignment operator "+=" is illegal with struct type +// Assignment operator "\+=" is illegal with struct type struct Point { float x, y, z; }; diff --git a/tests_errors/vec-size-compile-constant.ispc b/tests_errors/vec-size-compile-constant.ispc index b9e61721..0eb6f90e 100644 --- a/tests_errors/vec-size-compile-constant.ispc +++ b/tests_errors/vec-size-compile-constant.ispc @@ -1,4 +1,4 @@ -// syntax error, unexpected identifier, expecting int32 constant +// syntax error, unexpected identifier, expecting int void foo(uniform int i) { float a; From c14659c6754f4d91a3bec3cbb48c4e67b7421d13 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 23 Jul 2013 17:02:49 -0700 Subject: [PATCH 006/124] Fix bug in lGetConstantInt() in parse.yy. Previously, we weren't handling signed/unsigned constant types correctly. --- parse.yy | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/parse.yy b/parse.yy index 6ed2a43d..4b315776 100644 --- a/parse.yy +++ b/parse.yy @@ -2278,7 +2278,11 @@ lGetConstantInt(Expr *expr, int *value, SourcePos pos, const char *usage) { Error(pos, "%s must be representable with a 32-bit integer.", usage); return false; } - *value = (int)ci->getZExtValue(); + const Type *type = expr->GetType(); + if (type->IsUnsignedType()) + *value = (int)ci->getZExtValue(); + else + *value = (int)ci->getSExtValue(); return true; } } From 15a3ef370a433eedcf6e6650f07ec81775d0322d Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 23 Jul 2013 17:11:01 -0700 Subject: [PATCH 007/124] Use @llvm.readcyclecounter to implement stdlib clock() function. Also added a test for the clock builtin. 
--- builtins/util.m4 | 14 ++++---------- tests/clock.ispc | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 10 deletions(-) create mode 100644 tests/clock.ispc diff --git a/builtins/util.m4 b/builtins/util.m4 index d6f3e5c3..8c379781 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -2891,17 +2891,11 @@ m4exit(`1') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; read hw clock +declare i64 @llvm.readcyclecounter() + define i64 @__clock() nounwind { -entry: - tail call void asm sideeffect "xorl %eax,%eax \0A cpuid", "~{rax},~{rbx},~{rcx},~{rdx},~{dirflag},~{fpsr},~{flags}"() nounwind - %0 = tail call { i32, i32 } asm sideeffect "rdtsc", "={ax},={dx},~{dirflag},~{fpsr},~{flags}"() nounwind - %asmresult = extractvalue { i32, i32 } %0, 0 - %asmresult1 = extractvalue { i32, i32 } %0, 1 - %conv = zext i32 %asmresult1 to i64 - %shl = shl nuw i64 %conv, 32 - %conv2 = zext i32 %asmresult to i64 - %or = or i64 %shl, %conv2 - ret i64 %or + %r = call i64 @llvm.readcyclecounter() + ret i64 %r } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/tests/clock.ispc b/tests/clock.ispc new file mode 100644 index 00000000..0e95379b --- /dev/null +++ b/tests/clock.ispc @@ -0,0 +1,14 @@ + +export uniform int width() { return programCount; } + + +export void f_f(uniform float RET[], uniform float aFOO[]) { + unsigned uniform int64 a = clock(); + float x = pow(sqrt(aFOO[programIndex]), 5.5); + unsigned uniform int64 b = clock(); + RET[programIndex] = (b - a) > 0 ? 1 : 0; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1; +} From 53414f12e6ce7d1615cd650cc7b2152063da6556 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 23 Jul 2013 17:30:32 -0700 Subject: [PATCH 008/124] Add SSE4 target optimized for computation with 8-bit datatypes. This change adds a new 'sse4-8' target, where programCount is 16 and the mask element size is 8-bits. (i.e. the most appropriate sizing of the mask for SIMD computation with 8-bit datatypes.) --- Makefile | 2 +- builtins.cpp | 9 + builtins/target-sse4-8.ll | 444 ++++++++++++++++++++++++++++++++++++++ builtins/util.m4 | 104 ++++++++- expr.cpp | 5 + ispc.cpp | 8 + opt.cpp | 13 +- 7 files changed, 578 insertions(+), 7 deletions(-) create mode 100644 builtins/target-sse4-8.ll diff --git a/Makefile b/Makefile index 043ab4cf..054a3da1 100644 --- a/Makefile +++ b/Makefile @@ -123,7 +123,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h TARGETS=neon avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \ - generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 + sse4-8 generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 # These files need to be compiled in two versions - 32 and 64 bits. BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) # These are files to be compiled in single version. 
diff --git a/builtins.cpp b/builtins.cpp index d3bbaa6a..6c586595 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -868,6 +868,15 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod EXPORT_MODULE(builtins_bitcode_sse4_x2_64bit); } break; + case 16: + Assert(g->target->getMaskBitCount() == 8); + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_sse4_8_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_sse4_8_64bit); + } + break; default: FATAL("logic error in DefineStdlib"); } diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll new file mode 100644 index 00000000..c85209ba --- /dev/null +++ b/builtins/target-sse4-8.ll @@ -0,0 +1,444 @@ +;; Copyright (c) 2013, Google, Inc. +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Google, Inc. nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Define common 4-wide stuff +define(`WIDTH',`16') +define(`MASK',`i8') +include(`util.m4') + +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse4-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare float @__half_to_float_uniform(i16 %v) nounwind readnone +declare @__half_to_float_varying( %v) nounwind readnone +declare i16 @__float_to_half_uniform(float %v) nounwind readnone +declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + +define @__rcp_varying_float() nounwind readonly alwaysinline { + unary4to16(call, float, @llvm.x86.sse.rcp.ps, %0) + ; do one N-R iteration to improve precision + ; float iv = __rcp_v(v); + ; return iv * (2. 
- v * iv); + %v_iv = fmul <16 x float> %0, %call + %two_minus = fsub <16 x float> , %v_iv + %iv_mul = fmul <16 x float> %call, %two_minus + ret <16 x float> %iv_mul +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; rsqrt + +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + unary4to16(is, float, @llvm.x86.sse.rsqrt.ps, %v) + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. - (v * is) * is); + %v_is = fmul <16 x float> %v, %is + %v_is_is = fmul <16 x float> %v_is, %is + %three_sub = fsub <16 x float> , %v_is_is + %is_mul = fmul <16 x float> %is, %three_sub + %half_scale = fmul <16 x float> , %is_mul + ret <16 x float> %half_scale +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; sqrt + +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + +define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline { + unary4to16(call, float, @llvm.x86.sse.sqrt.ps, %0) + ret <16 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone + +define <16 x double> @__sqrt_varying_double(<16 x double>) nounwind +alwaysinline { + unary2to16(ret, double, @llvm.x86.sse2.sqrt.pd, %0) + ret <16 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding floats + +declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone + +define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 + round4to16(%0, 8) +} + +define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + round4to16(%0, 9) +} + +define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + round4to16(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding doubles + +declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone + +define <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline { +; XXXround2to4double(%0, 8) + ; FIXME: need round2to16double in util.m4... 
+ ret <16 x double> undef +} + +define <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline { + ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 +; XXXround2to4double(%0, 9) + ret <16 x double> undef +} + +define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline { + ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10 +; XXXround2to4double(%0, 10) + ret <16 x double> undef +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float min/max + +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + +define <16 x float> @__max_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline { + binary4to16(call, float, @llvm.x86.sse.max.ps, %0, %1) + ret <16 x float> %call +} + +define <16 x float> @__min_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline { + binary4to16(call, float, @llvm.x86.sse.min.ps, %0, %1) + ret <16 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 min/max + +define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pminsd, %0, %1) + ret <16 x i32> %call +} + +define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1) + ret <16 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unsigned int min/max + +define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pminud, %0, %1) + ret <16 x i32> %call +} + +define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pmaxud, %0, %1) + ret <16 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone +declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone + +define <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone { + binary2to16(ret, double, @llvm.x86.sse2.min.pd, %0, %1) + ret <16 x double> %ret +} + +define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone { + binary2to16(ret, double, @llvm.x86.sse2.max.pd, %0, %1) + ret <16 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +; FIXME + +declare <16 x float> @__svml_sin(<16 x float>) +declare <16 x float> @__svml_cos(<16 x float>) +declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) +declare <16 x float> @__svml_tan(<16 x float>) +declare <16 x float> @__svml_atan(<16 x float>) +declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) +declare <16 x float> @__svml_exp(<16 x float>) +declare <16 x float> @__svml_log(<16 x float>) +declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; horizontal ops / reductions + +declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone + +define i64 @__movmsk(<16 x i8>) nounwind readnone 
alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %m64 = zext i32 %m to i64 + ret i64 %m64 +} + +define i1 @__any(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %mne = icmp ne i32 %m, 0 + ret i1 %mne +} + +define i1 @__all(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %meq = icmp eq i32 %m, ALL_ON_MASK + ret i1 %meq +} + +define i1 @__none(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %meq = icmp eq i32 %m, 0 + ret i1 %meq +} + +define internal <16 x float> @__add_varying_float(<16 x float>, <16 x float>) { + %r = fadd <16 x float> %0, %1 + ret <16 x float> %r +} + +define internal float @__add_uniform_float(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline { + reduce16(float, @__add_varying_float, @__add_uniform_float) +} + +define float @__reduce_min_float(<16 x float>) nounwind readnone { + reduce16(float, @__min_varying_float, @__min_uniform_float) +} + +define float @__reduce_max_float(<16 x float>) nounwind readnone { + reduce16(float, @__max_varying_float, @__max_uniform_float) +} + +define internal <16 x i32> @__add_varying_int32(<16 x i32>, <16 x i32>) { + %r = add <16 x i32> %0, %1 + ret <16 x i32> %r +} + +define internal i32 @__add_uniform_int32(i32, i32) { + %r = add i32 %0, %1 + ret i32 %r +} + +define i32 @__reduce_add_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__add_varying_int32, @__add_uniform_int32) +} + +define i32 @__reduce_min_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__min_varying_int32, @__min_uniform_int32) +} + +define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__max_varying_int32, @__max_uniform_int32) +} + +define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone { + reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32) +} + +define i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone { + reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32) +} + +define internal <16 x double> @__add_varying_double(<16 x double>, <16 x double>) { + %r = fadd <16 x double> %0, %1 + ret <16 x double> %r +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define double @__reduce_add_double(<16 x double>) nounwind readnone { + reduce16(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double(<16 x double>) nounwind readnone { + reduce16(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double(<16 x double>) nounwind readnone { + reduce16(double, @__max_varying_double, @__max_uniform_double) +} + +define internal <16 x i64> @__add_varying_int64(<16 x i64>, <16 x i64>) { + %r = add <16 x i64> %0, %1 + ret <16 x i64> %r +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define i64 @__reduce_add_int64(<16 x i64>) nounwind readnone { + reduce16(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define i64 @__reduce_min_int64(<16 x i64>) nounwind readnone { + reduce16(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone { + reduce16(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone { + reduce16(i64, 
@__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone { + reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +reduce_equal(16) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +define void @__masked_store_blend_i64(<16 x i64>* nocapture, <16 x i64>, + <16 x i8> %mask) nounwind + alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i64>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i64> %1, <16 x i64> %old + store <16 x i64> %blend, <16 x i64>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i32(<16 x i32>* nocapture, <16 x i32>, + <16 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i32>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i32> %1, <16 x i32> %old + store <16 x i32> %blend, <16 x i32>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, + <16 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i16>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i16> %1, <16 x i16> %old + store <16 x i16> %blend, <16 x i16>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>, + <16 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i8>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i8> %1, <16 x i8> %old + store <16 x i8> %blend, <16 x i8>* %0, align 4 + ret void +} + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) + +masked_store_float_double() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather/scatter + +; define these with the macros from stdlib.m4 + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) diff --git a/builtins/util.m4 b/builtins/util.m4 index 8c379781..ee45ebc7 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -411,6 +411,42 @@ define(`unary2to8', ` ' ) +define(`unary2to16', ` + %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1) + %$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2) + %$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3) + %$1_4 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4) + %$1_5 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5) + %$1_6 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6) + %$1_7 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7) + 
%$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> + %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, + <4 x i32> + %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, + <4 x i32> + %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, + <8 x i32> + + %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, + <16 x i32> +' +) + ;; Maps an 2-wide binary function to two 8-wide vector operands ;; $1: name of variable into which the final result should go ;; $2: scalar type of the vector elements @@ -432,12 +468,58 @@ define(`binary2to8', ` %$1_3b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> +' +) + +define(`binary2to16', ` + %$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b) + %$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b) + %$1_2a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_2b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2a, <2 x $2> %$1_2b) + %$1_3a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_3b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) + %$1_4a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_4b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4a, <2 x $2> %$1_4b) + %$1_5a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_5b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5a, <2 x $2> %$1_5b) + %$1_6a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_6b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6a, <2 x $2> %$1_6b) + %$1_7a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_7b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7a, <2 x $2> %$1_7b) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, <4 x i32> %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, <4 x i32> - %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, <8 x i32> + + %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, + <4 x i32> + %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, + <4 x i32> + %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, + <8 x i32> + + %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, + <16 x i32> ' ) @@ -460,6 +542,26 @@ ret <8 x float> %ret ' ) +define(`round4to16', ` +%v0 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v1 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v2 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v3 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%r0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v0, i32 $2) +%r1 = 
call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v1, i32 $2) +%r2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v2, i32 $2) +%r3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v3, i32 $2) +%ret01 = shufflevector <4 x float> %r0, <4 x float> %r1, + <8 x i32> +%ret23 = shufflevector <4 x float> %r2, <4 x float> %r3, + <8 x i32> +%ret = shufflevector <8 x float> %ret01, <8 x float> %ret23, + <16 x i32> +ret <16 x float> %ret +' +) + define(`round8to16', ` %v0 = shufflevector <16 x float> $1, <16 x float> undef, <8 x i32> diff --git a/expr.cpp b/expr.cpp index 6bde2acb..f81037f6 100644 --- a/expr.cpp +++ b/expr.cpp @@ -3123,6 +3123,10 @@ static llvm::Value * lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test, llvm::Value *expr1, llvm::Value *expr2, const Type *type) { +#if !defined(LLVM_3_1) + test = ctx->TruncInst(test, LLVMTypes::Int1VectorType); + return ctx->SelectInst(test, expr1, expr2, "select"); +#else llvm::Value *resultPtr = ctx->AllocaInst(expr1->getType(), "selectexpr_tmp"); // Don't need to worry about masking here ctx->StoreInst(expr2, resultPtr); @@ -3131,6 +3135,7 @@ lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test, PointerType::GetUniform(type)->LLVMType(g->ctx)); ctx->StoreInst(expr1, resultPtr, test, type, PointerType::GetUniform(type)); return ctx->LoadInst(resultPtr, "selectexpr_final"); +#endif // !LLVM_3_1 } diff --git a/ispc.cpp b/ispc.cpp index 887f6ca3..6ac23781 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -310,6 +310,14 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } + else if (!strcasecmp(isa, "sse4-8")) { + this->m_isa = Target::SSE4; + this->m_nativeVectorWidth = 16; + this->m_vectorWidth = 16; + this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + this->m_maskingIsFree = false; + this->m_maskBitCount = 8; + } else if (!strcasecmp(isa, "generic-4")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 4; diff --git a/opt.cpp b/opt.cpp index ba32c639..4701e7df 100644 --- a/opt.cpp +++ b/opt.cpp @@ -670,14 +670,17 @@ IntrinsicsOpt::IntrinsicsOpt() // All of the mask instructions we may encounter. Note that even if // compiling for AVX, we may still encounter the regular 4-wide SSE // MOVMSK instruction. - llvm::Function *sseMovmsk = + llvm::Function *ssei8Movmsk = + llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse2_pmovmskb_128); + maskInstructions.push_back(ssei8Movmsk); + llvm::Function *sseFloatMovmsk = llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse_movmsk_ps); - maskInstructions.push_back(sseMovmsk); + maskInstructions.push_back(sseFloatMovmsk); maskInstructions.push_back(m->module->getFunction("__movmsk")); - llvm::Function *avxMovmsk = + llvm::Function *avxFloatMovmsk = llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_movmsk_ps_256); - Assert(avxMovmsk != NULL); - maskInstructions.push_back(avxMovmsk); + Assert(avxFloatMovmsk != NULL); + maskInstructions.push_back(avxFloatMovmsk); // And all of the blend instructions blendInstructions.push_back(BlendInstruction( From 04d61afa23a64d9fc5f95648509bd5ec002da53e Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 25 Jul 2013 09:40:48 -0700 Subject: [PATCH 009/124] Fix bug in lEmitVaryingSelect() for targets with i1 mask types. 
Commit 53414f12e6c introduced a bug where lEmitVaryingSelect() would try
to truncate a vector of i1s to a vector of i1s, which in turn made LLVM's
IR analyzer unhappy.
---
 expr.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/expr.cpp b/expr.cpp
index f81037f6..856d363c 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -3124,7 +3124,8 @@ lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test,
                     llvm::Value *expr1, llvm::Value *expr2,
                     const Type *type) {
 #if !defined(LLVM_3_1)
-    test = ctx->TruncInst(test, LLVMTypes::Int1VectorType);
+    if (test->getType() != LLVMTypes::Int1VectorType)
+        test = ctx->TruncInst(test, LLVMTypes::Int1VectorType);
     return ctx->SelectInst(test, expr1, expr2, "select");
 #else
     llvm::Value *resultPtr = ctx->AllocaInst(expr1->getType(), "selectexpr_tmp");

From 780b0dfe47a770785c4fe1f224813e3a518cd135 Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Wed, 24 Jul 2013 09:40:50 -0700
Subject: [PATCH 010/124] Add SSE4-16 target.

Along the lines of sse4-8, this is an 8-wide target for SSE4, using
16-bit elements for the mask. It's thus (in principle) the best target
for SIMD computation with 16-bit datatypes.
---
 Makefile                   |   2 +-
 builtins.cpp               |  16 +-
 builtins/target-sse4-16.ll | 436 +++++++++++++++++++++++++++++++++++++
 ispc.cpp                   |  14 +-
 run_tests.py               |   2 +-
 5 files changed, 463 insertions(+), 7 deletions(-)
 create mode 100644 builtins/target-sse4-16.ll

diff --git a/Makefile b/Makefile
index 054a3da1..fc064dbd 100644
--- a/Makefile
+++ b/Makefile
@@ -123,7 +123,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
 TARGETS=neon avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \
-	sse4-8 generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
+	sse4-8 sse4-16 generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
 # These files need to be compiled in two versions - 32 and 64 bits.
 BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS)))
 # These are files to be compiled in single version.
diff --git a/builtins.cpp b/builtins.cpp
index 6c586595..c4a2f3b5 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -862,10 +862,22 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
         break;
     case 8:
         if (runtime32) {
-            EXPORT_MODULE(builtins_bitcode_sse4_x2_32bit);
+            if (g->target->getMaskBitCount() == 16) {
+                EXPORT_MODULE(builtins_bitcode_sse4_16_32bit);
+            }
+            else {
+                Assert(g->target->getMaskBitCount() == 32);
+                EXPORT_MODULE(builtins_bitcode_sse4_x2_32bit);
+            }
         }
         else {
-            EXPORT_MODULE(builtins_bitcode_sse4_x2_64bit);
+            if (g->target->getMaskBitCount() == 16) {
+                EXPORT_MODULE(builtins_bitcode_sse4_16_64bit);
+            }
+            else {
+                Assert(g->target->getMaskBitCount() == 32);
+                EXPORT_MODULE(builtins_bitcode_sse4_x2_64bit);
+            }
         }
         break;
     case 16:
diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll
new file mode 100644
index 00000000..2044fbee
--- /dev/null
+++ b/builtins/target-sse4-16.ll
@@ -0,0 +1,436 @@
+;; Copyright (c) 2013, Google, Inc.
+;; All rights reserved.
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are
+;; met:
+;;
+;; * Redistributions of source code must retain the above copyright
+;; notice, this list of conditions and the following disclaimer.
+;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Google, Inc. nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Define common 4-wide stuff +define(`WIDTH',`8') +define(`MASK',`i16') +include(`util.m4') + +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse4-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare float @__half_to_float_uniform(i16 %v) nounwind readnone +declare @__half_to_float_varying( %v) nounwind readnone +declare i16 @__float_to_half_uniform(float %v) nounwind readnone +declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + +define @__rcp_varying_float() nounwind readonly alwaysinline { + unary4to8(call, float, @llvm.x86.sse.rcp.ps, %0) + ; do one N-R iteration to improve precision + ; float iv = __rcp_v(v); + ; return iv * (2. - v * iv); + %v_iv = fmul <8 x float> %0, %call + %two_minus = fsub <8 x float> , %v_iv + %iv_mul = fmul <8 x float> %call, %two_minus + ret <8 x float> %iv_mul +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; rsqrt + +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v) + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. 
- (v * is) * is); + %v_is = fmul <8 x float> %v, %is + %v_is_is = fmul <8 x float> %v_is, %is + %three_sub = fsub <8 x float> , %v_is_is + %is_mul = fmul <8 x float> %is, %three_sub + %half_scale = fmul <8 x float> , %is_mul + ret <8 x float> %half_scale +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; sqrt + +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + +define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline { + unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0) + ret <8 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone + +define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind +alwaysinline { + unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0) + ret <8 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding floats + +declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone + +define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 + round4to8(%0, 8) +} + +define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + round4to8(%0, 9) +} + +define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + round4to8(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding doubles + +declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone + +define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline { + round2to8double(%0, 8) +} + +define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline { + round2to8double(%0, 9) +} + +define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline { + round2to8double(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float min/max + +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + +define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline { + binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1) + ret <8 x float> %call +} + +define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline { + binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1) + ret <8 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 min/max + +define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pminsd, %0, %1) + ret <8 x i32> %call +} + +define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1) + ret <8 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unsigned int min/max + +define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + 
binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1) + ret <8 x i32> %call +} + +define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1) + ret <8 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone +declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone + +define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone { + binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1) + ret <8 x double> %ret +} + +define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone { + binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1) + ret <8 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +; FIXME + +declare <8 x float> @__svml_sin(<8 x float>) +declare <8 x float> @__svml_cos(<8 x float>) +declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) +declare <8 x float> @__svml_tan(<8 x float>) +declare <8 x float> @__svml_atan(<8 x float>) +declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) +declare <8 x float> @__svml_exp(<8 x float>) +declare <8 x float> @__svml_log(<8 x float>) +declare <8 x float> @__svml_pow(<8 x float>, <8 x float>) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; horizontal ops / reductions + +declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone + +define i64 @__movmsk(<8 x MASK>) nounwind readnone alwaysinline { + %m8 = trunc <8 x MASK> %0 to <8 x i8> + %mask8 = shufflevector <8 x i8> %m8, <8 x i8> zeroinitializer, + <16 x i32> + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %mask8) + %m64 = zext i32 %m to i64 + ret i64 %m64 +} + +define i1 @__any(<8 x MASK>) nounwind readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %mne = icmp ne i64 %m, 0 + ret i1 %mne +} + +define i1 @__all(<8 x MASK>) nounwind readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %meq = icmp eq i64 %m, ALL_ON_MASK + ret i1 %meq +} + +define i1 @__none(<8 x MASK>) nounwind readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %meq = icmp eq i64 %m, 0 + ret i1 %meq +} + +define internal <8 x float> @__add_varying_float(<8 x float>, <8 x float>) { + %r = fadd <8 x float> %0, %1 + ret <8 x float> %r +} + +define internal float @__add_uniform_float(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline { + reduce8(float, @__add_varying_float, @__add_uniform_float) +} + +define float @__reduce_min_float(<8 x float>) nounwind readnone { + reduce8(float, @__min_varying_float, @__min_uniform_float) +} + +define float @__reduce_max_float(<8 x float>) nounwind readnone { + reduce8(float, @__max_varying_float, @__max_uniform_float) +} + +define internal <8 x i32> @__add_varying_int32(<8 x i32>, <8 x i32>) { + %r = add <8 x i32> %0, %1 + ret <8 x i32> %r +} + +define internal i32 @__add_uniform_int32(i32, i32) { + %r = add i32 %0, %1 + ret i32 %r +} + +define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__add_varying_int32, @__add_uniform_int32) +} + +define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__min_varying_int32, @__min_uniform_int32) +} + +define i32 
@__reduce_max_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__max_varying_int32, @__max_uniform_int32) +} + +define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone { + reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32) +} + +define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone { + reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32) +} + +define internal <8 x double> @__add_varying_double(<8 x double>, <8 x double>) { + %r = fadd <8 x double> %0, %1 + ret <8 x double> %r +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define double @__reduce_add_double(<8 x double>) nounwind readnone { + reduce8(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double(<8 x double>) nounwind readnone { + reduce8(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double(<8 x double>) nounwind readnone { + reduce8(double, @__max_varying_double, @__max_uniform_double) +} + +define internal <8 x i64> @__add_varying_int64(<8 x i64>, <8 x i64>) { + %r = add <8 x i64> %0, %1 + ret <8 x i64> %r +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone { + reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone { + reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +reduce_equal(8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +define void @__masked_store_blend_i64(<8 x i64>* nocapture, <8 x i64>, + <8 x MASK> %mask) nounwind + alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i64>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i64> %1, <8 x i64> %old + store <8 x i64> %blend, <8 x i64>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i32>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i32> %1, <8 x i32> %old + store <8 x i32> %blend, <8 x i32>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i16(<8 x i16>* nocapture, <8 x i16>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i16>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i16> %1, <8 x i16> %old + store <8 x i16> %blend, <8 x i16>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i8>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i8> %1, <8 x i8> %old + store <8 x i8> %blend, <8 x i8>* %0, align 4 + ret void +} + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) + +masked_store_float_double() + 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather/scatter + +; define these with the macros from stdlib.m4 + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) diff --git a/ispc.cpp b/ispc.cpp index 6ac23781..a9f5ff5c 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -318,6 +318,14 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 8; } + else if (!strcasecmp(isa, "sse4-16")) { + this->m_isa = Target::SSE4; + this->m_nativeVectorWidth = 8; + this->m_vectorWidth = 8; + this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + this->m_maskingIsFree = false; + this->m_maskBitCount = 16; + } else if (!strcasecmp(isa, "generic-4")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 4; @@ -575,9 +583,9 @@ Target::SupportedTargetArchs() { const char * Target::SupportedTargetISAs() { - return "neon, sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2" - ", avx1.1, avx1.1-x2, avx2, avx2-x2" - ", generic-1, generic-4, generic-8, generic-16, generic-32"; + return "neon, sse2, sse2-x2, sse4, sse4-8, sse4-16, sse4-x2, " + "avx, avx-x2, avx1.1, avx1.1-x2, avx2, avx2-x2," + "generic-1, generic-4, generic-8, generic-16, generic-32"; } diff --git a/run_tests.py b/run_tests.py index 296db867..ea819ea4 100755 --- a/run_tests.py +++ b/run_tests.py @@ -37,7 +37,7 @@ parser.add_option("-g", "--generics-include", dest="include_file", help="Filenam parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", default="") parser.add_option('-t', '--target', dest='target', - help='Set compilation target (neon, sse2, sse2-x2, sse4, sse4-x2, sse4-8, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', + help='Set compilation target (neon, sse2, sse2-x2, sse4, sse4-x2, sse4-8, sse4-16, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', default="sse4") parser.add_option('-a', '--arch', dest='arch', help='Set architecture (arm, x86, x86-64)', From bba84f247c34f67ed28a357d19a4a7414c590c2b Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 24 Jul 2013 15:08:07 -0700 Subject: [PATCH 011/124] Improved optimization of vector select instructions. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Various LLVM optimization passes are turning code like: %cmp = icmp lt <8 x i32> %foo, %bar %cmp32 = sext <8 x i1> %cmp to <8 x i32> . . . %cmp1 = trunc <8 x i32> %cmp32 to <8 x i1> %result = select <8 x i1> %cmp1, . . . Into: %cmp = icmp lt <8 x i32> %foo, %bar %cmp32 = zext <8 x i1> %cmp to <8 x i32> # note: zext . . . %cmp1 = icmp ne <8 x i32> %cmp32, zeroinitializer %result = select <8 x i1> %cmp1, … Which in turn isn't matched well by the LLVM code generators, which in turn leads to fairly inefficient code. (i.e. it doesn't just emit a vector compare and blend instruction.) Also, renamed VSelMovmskOptPass to InstructionSimplifyPass to better describe its functionality. 
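For illustration only (this sketch is not part of the diff below, and the
select operands %a and %b are placeholder names), the pattern the new pass
tries to leave behind is simply the original compare feeding the select:

; after InstructionSimplifyPass: the zext/icmp-ne round-trip is folded away,
; so the select consumes the i1 result of the original vector compare directly
; and the backend can match a single compare + blend. (LLVM IR spells the
; signed comparison "slt"; "lt" above is shorthand.)
%cmp = icmp slt <8 x i32> %foo, %bar
%result = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b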
--- opt.cpp | 175 ++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 126 insertions(+), 49 deletions(-) diff --git a/opt.cpp b/opt.cpp index 4701e7df..8efdbc67 100644 --- a/opt.cpp +++ b/opt.cpp @@ -108,7 +108,7 @@ #endif static llvm::Pass *CreateIntrinsicsOptPass(); -static llvm::Pass *CreateVSelMovmskOptPass(); +static llvm::Pass *CreateInstructionSimplifyPass(); static llvm::Pass *CreateImproveMemoryOpsPass(); static llvm::Pass *CreateGatherCoalescePass(); @@ -476,7 +476,7 @@ Optimize(llvm::Module *module, int optLevel) { } if (!g->opt.disableMaskAllOnOptimizations) { optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); } optPM.add(llvm::createDeadInstEliminationPass()); @@ -519,7 +519,7 @@ Optimize(llvm::Module *module, int optLevel) { if (!g->opt.disableMaskAllOnOptimizations) { optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); } if (g->opt.disableGatherScatterOptimizations == false && @@ -539,7 +539,7 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createFunctionInliningPass()); optPM.add(llvm::createConstantPropagationPass()); optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) { @@ -555,18 +555,20 @@ Optimize(llvm::Module *module, int optLevel) { if (g->opt.disableHandlePseudoMemoryOps == false) optPM.add(CreateReplacePseudoMemoryOpsPass()); optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createFunctionInliningPass()); optPM.add(llvm::createArgumentPromotionPass()); optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold, false)); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createReassociatePass()); optPM.add(llvm::createLoopRotatePass()); optPM.add(llvm::createLICMPass()); optPM.add(llvm::createLoopUnswitchPass(false)); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createIndVarSimplifyPass()); optPM.add(llvm::createLoopIdiomPass()); optPM.add(llvm::createLoopDeletionPass()); @@ -576,17 +578,19 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(CreateIsCompileTimeConstantPass(true)); optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createMemCpyOptPass()); optPM.add(llvm::createSCCPPass()); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createJumpThreadingPass()); optPM.add(llvm::createCorrelatedValuePropagationPass()); optPM.add(llvm::createDeadStoreEliminationPass()); optPM.add(llvm::createAggressiveDCEPass()); optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createStripDeadPrototypesPass()); optPM.add(CreateMakeInternalFuncsStaticPass()); optPM.add(llvm::createGlobalDCEPass()); @@ -927,80 +931,153 @@ CreateIntrinsicsOptPass() { @todo The better thing to do would be to submit a patch to LLVM to get these; they're presumably pretty simple patterns to match. 
*/ -class VSelMovmskOpt : public llvm::BasicBlockPass { +class InstructionSimplifyPass : public llvm::BasicBlockPass { public: - VSelMovmskOpt() + InstructionSimplifyPass() : BasicBlockPass(ID) { } const char *getPassName() const { return "Vector Select Optimization"; } bool runOnBasicBlock(llvm::BasicBlock &BB); static char ID; + +private: + static bool simplifySelect(llvm::SelectInst *selectInst, + llvm::BasicBlock::iterator iter); + static llvm::Value *simplifyBoolVec(llvm::Value *value); + static bool simplifyCall(llvm::CallInst *callInst, + llvm::BasicBlock::iterator iter); }; -char VSelMovmskOpt::ID = 0; +char InstructionSimplifyPass::ID = 0; + + +llvm::Value * +InstructionSimplifyPass::simplifyBoolVec(llvm::Value *value) { + llvm::TruncInst *trunc = llvm::dyn_cast(value); + if (trunc != NULL) { + // Convert trunc({sext,zext}(i1 vector)) -> (i1 vector) + llvm::SExtInst *sext = llvm::dyn_cast(value); + if (sext && + sext->getOperand(0)->getType() == LLVMTypes::Int1VectorType) + return sext->getOperand(0); + + llvm::ZExtInst *zext = llvm::dyn_cast(value); + if (zext && + zext->getOperand(0)->getType() == LLVMTypes::Int1VectorType) + return zext->getOperand(0); + } + + llvm::ICmpInst *icmp = llvm::dyn_cast(value); + if (icmp != NULL) { + // icmp(ne, {sext,zext}(foo), zeroinitializer) -> foo + if (icmp->getSignedPredicate() == llvm::CmpInst::ICMP_NE) { + llvm::Value *op1 = icmp->getOperand(1); + if (llvm::isa(op1)) { + llvm::Value *op0 = icmp->getOperand(0); + llvm::SExtInst *sext = llvm::dyn_cast(op0); + if (sext) + return sext->getOperand(0); + llvm::ZExtInst *zext = llvm::dyn_cast(op0); + if (zext) + return zext->getOperand(0); + } + } + } + return NULL; +} bool -VSelMovmskOpt::runOnBasicBlock(llvm::BasicBlock &bb) { - DEBUG_START_PASS("VSelMovmaskOpt"); +InstructionSimplifyPass::simplifySelect(llvm::SelectInst *selectInst, + llvm::BasicBlock::iterator iter) { + if (selectInst->getType()->isVectorTy() == false) + return false; + + llvm::Value *factor = selectInst->getOperand(0); + + // Simplify all-on or all-off mask values + MaskStatus maskStatus = lGetMaskStatus(factor); + llvm::Value *value = NULL; + if (maskStatus == ALL_ON) + // Mask all on -> replace with the first select value + value = selectInst->getOperand(1); + else if (maskStatus == ALL_OFF) + // Mask all off -> replace with the second select value + value = selectInst->getOperand(2); + if (value != NULL) { + llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), + iter, value); + return true; + } + + // Sometimes earlier LLVM optimization passes generate unnecessarily + // complex expressions for the selection vector, which in turn confuses + // the code generators and leads to sub-optimal code (particularly for + // 8 and 16-bit masks). We'll try to simplify them out here so that + // the code generator patterns match.. + if ((factor = simplifyBoolVec(factor)) != NULL) { + llvm::Instruction *newSelect = + llvm::SelectInst::Create(factor, selectInst->getOperand(1), + selectInst->getOperand(2), + selectInst->getName()); + llvm::ReplaceInstWithInst(selectInst, newSelect); + return true; + } + + return false; +} + + +bool +InstructionSimplifyPass::simplifyCall(llvm::CallInst *callInst, + llvm::BasicBlock::iterator iter) { + llvm::Function *calledFunc = callInst->getCalledFunction(); + + // Turn a __movmsk call with a compile-time constant vector into the + // equivalent scalar value. 
+ if (calledFunc == NULL || calledFunc != m->module->getFunction("__movmsk")) + return false; + + uint64_t mask; + if (lGetMask(callInst->getArgOperand(0), &mask) == true) { + llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), + iter, LLVMInt64(mask)); + return true; + } + return false; +} + + +bool +InstructionSimplifyPass::runOnBasicBlock(llvm::BasicBlock &bb) { + DEBUG_START_PASS("InstructionSimplify"); bool modifiedAny = false; restart: for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) { llvm::SelectInst *selectInst = llvm::dyn_cast(&*iter); - if (selectInst != NULL && selectInst->getType()->isVectorTy()) { - llvm::Value *factor = selectInst->getOperand(0); - - MaskStatus maskStatus = lGetMaskStatus(factor); - llvm::Value *value = NULL; - if (maskStatus == ALL_ON) - // Mask all on -> replace with the first select value - value = selectInst->getOperand(1); - else if (maskStatus == ALL_OFF) - // Mask all off -> replace with the second select value - value = selectInst->getOperand(2); - - if (value != NULL) { - llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), - iter, value); - modifiedAny = true; - goto restart; - } + if (selectInst && simplifySelect(selectInst, iter)) { + modifiedAny = true; + goto restart; } - llvm::CallInst *callInst = llvm::dyn_cast(&*iter); - if (callInst == NULL) - continue; - - llvm::Function *calledFunc = callInst->getCalledFunction(); - if (calledFunc == NULL || calledFunc != m->module->getFunction("__movmsk")) - continue; - - uint64_t mask; - if (lGetMask(callInst->getArgOperand(0), &mask) == true) { -#if 0 - fprintf(stderr, "mask %d\n", mask); - callInst->getArgOperand(0)->dump(); - fprintf(stderr, "-----------\n"); -#endif - llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), - iter, LLVMInt64(mask)); + if (callInst && simplifyCall(callInst, iter)) { modifiedAny = true; goto restart; } } - DEBUG_END_PASS("VSelMovMskOpt"); + DEBUG_END_PASS("InstructionSimplify"); return modifiedAny; } static llvm::Pass * -CreateVSelMovmskOptPass() { - return new VSelMovmskOpt; +CreateInstructionSimplifyPass() { + return new InstructionSimplifyPass; } From 2d063925a1d5ab758bcdd22454c201ac7d617dd3 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 24 Jul 2013 15:10:08 -0700 Subject: [PATCH 012/124] Explicitly call the PBLENDVB intrinsic for i8 blending with sse4-8. This is slightly cleaner than trunc-ing the i8 mask to i1 and using a vector select. (And is probably more safe in terms of good code.) 
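For reference, a hand-written sketch of the two forms (not taken from the
diff below; %old is the previously loaded value and %new stands in for the
value being stored, and this assumes the usual PBLENDVB semantics, where
each result byte is taken from the second source when the high bit of the
corresponding mask byte is set):

; previous emulation: narrow the i8 mask to i1 and use a generic vector select
%m1 = trunc <16 x i8> %mask to <16 x i1>
%blend_select = select <16 x i1> %m1, <16 x i8> %new, <16 x i8> %old

; with this change: hand the all-ones/all-zeros i8 mask straight to the
; intrinsic; pblendvb keys off the high bit of each mask byte, so no trunc
; is needed and the call maps directly to a single PBLENDVB.
%blend_pblendvb = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %old,
                                                          <16 x i8> %new,
                                                          <16 x i8> %mask)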
--- builtins/target-sse4-8.ll | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index c85209ba..cd8fdce2 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -398,11 +398,13 @@ define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, ret void } +declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone + define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>, <16 x MASK> %mask) nounwind alwaysinline { - %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> %old = load <16 x i8>* %0, align 4 - %blend = select <16 x i1> %mask_as_i1, <16 x i8> %1, <16 x i8> %old + %blend = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %old, <16 x i8> %1, + <16 x i8> %mask) store <16 x i8> %blend, <16 x i8>* %0, align 4 ret void } From b6df447b550507ba77dde70758a5bdaf0e079f95 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 25 Jul 2013 09:11:39 -0700 Subject: [PATCH 013/124] Add reduce_add() for int8 and int16 types. This maps to specialized instructions (e.g. PSADBW) when available. --- builtins.cpp | 2 ++ builtins/target-avx-x2.ll | 27 ++++++++++++++++++ builtins/target-avx.ll | 46 ++++++++++++++++++++++++------- builtins/target-generic-1.ll | 9 ++++++ builtins/target-generic-common.ll | 7 +++-- builtins/target-neon.ll | 33 ++++++++++++++++++---- builtins/target-sse2-x2.ll | 30 ++++++++++++++++++++ builtins/target-sse2.ll | 30 ++++++++++++++++++++ builtins/target-sse4-16.ll | 30 ++++++++++++++++++++ builtins/target-sse4-8.ll | 27 ++++++++++++++++++ builtins/target-sse4-x2.ll | 30 ++++++++++++++++++++ builtins/target-sse4.ll | 30 ++++++++++++++++++++ docs/ispc.rst | 39 ++++++++++++++++++-------- examples/intrinsics/generic-16.h | 9 +++--- examples/intrinsics/generic-32.h | 9 +++--- examples/intrinsics/generic-64.h | 9 +++--- examples/intrinsics/knc.h | 16 +++++++++++ examples/intrinsics/knc2x.h | 3 ++ examples/intrinsics/sse4.h | 16 +++++++++++ stdlib.ispc | 25 +++++++++++++++-- tests/reduce-add-int16-1.ispc | 21 ++++++++++++++ tests/reduce-add-int16.ispc | 21 ++++++++++++++ tests/reduce-add-int8-1.ispc | 21 ++++++++++++++ tests/reduce-add-int8.ispc | 18 ++++++++++++ 24 files changed, 464 insertions(+), 44 deletions(-) create mode 100644 tests/reduce-add-int16-1.ispc create mode 100644 tests/reduce-add-int16.ispc create mode 100644 tests/reduce-add-int8-1.ispc create mode 100644 tests/reduce-add-int8.ispc diff --git a/builtins.cpp b/builtins.cpp index c4a2f3b5..08472623 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -501,6 +501,8 @@ lSetInternalFunctions(llvm::Module *module) { "__rdrand_i64", "__reduce_add_double", "__reduce_add_float", + "__reduce_add_int8", + "__reduce_add_int16", "__reduce_add_int32", "__reduce_add_int64", "__reduce_equal_double", diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index 8c6b7753..d9e0322b 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -271,6 +271,33 @@ reduce_equal(16) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal int32 ops +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline { + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 
+} + +define internal <16 x i16> @__add_varying_i16(<16 x i16>, + <16 x i16>) nounwind readnone alwaysinline { + %r = add <16 x i16> %0, %1 + ret <16 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline { + reduce16(i16, @__add_varying_i16, @__add_uniform_i16) +} + define <16 x i32> @__add_varying_int32(<16 x i32>, <16 x i32>) nounwind readnone alwaysinline { %s = add <16 x i32> %0, %1 diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index e6ab3a4b..90e2f3ac 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -217,7 +217,6 @@ define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline { ret float %sum } - define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline { reduce8(float, @__min_varying_float, @__min_uniform_float) } @@ -229,6 +228,42 @@ define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline { reduce_equal(8) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int8 ops + +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int16 ops + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal int32 ops @@ -257,20 +292,14 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__max_varying_int32, @__max_uniform_int32) } - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;; horizontal uint32 ops - define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32) } - define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32) } - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal double ops @@ -329,9 +358,6 @@ define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline { } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;; horizontal uint64 ops - define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline { reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) } diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 9b747e2e..3dec76b0 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -471,6 +471,15 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline { ret i64 %call } +define i8 
@__reduce_add_int8(<1 x i8> %v) nounwind readonly alwaysinline { + %r = extractelement <1 x i8> %v, i32 0 + ret i8 %r +} + +define i16 @__reduce_add_int16(<1 x i16> %v) nounwind readonly alwaysinline { + %r = extractelement <1 x i16> %v, i32 0 + ret i16 %r +} define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline { %r = extractelement <1 x float> %v, i32 0 diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index bbf1b842..76d1faf3 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -226,14 +226,16 @@ declare i1 @__any() nounwind readnone declare i1 @__all() nounwind readnone declare i1 @__none() nounwind readnone +declare i16 @__reduce_add_int8() nounwind readnone +declare i32 @__reduce_add_int16() nounwind readnone + declare float @__reduce_add_float() nounwind readnone declare float @__reduce_min_float() nounwind readnone declare float @__reduce_max_float() nounwind readnone -declare i32 @__reduce_add_int32() nounwind readnone +declare i64 @__reduce_add_int32() nounwind readnone declare i32 @__reduce_min_int32() nounwind readnone declare i32 @__reduce_max_int32() nounwind readnone - declare i32 @__reduce_min_uint32() nounwind readnone declare i32 @__reduce_max_uint32() nounwind readnone @@ -244,7 +246,6 @@ declare double @__reduce_max_double() nounwind readnone declare i64 @__reduce_add_int64() nounwind readnone declare i64 @__reduce_min_int64() nounwind readnone declare i64 @__reduce_max_int64() nounwind readnone - declare i64 @__reduce_min_uint64() nounwind readnone declare i64 @__reduce_max_uint64() nounwind readnone diff --git a/builtins/target-neon.ll b/builtins/target-neon.ll index e70b774b..fbeac352 100644 --- a/builtins/target-neon.ll +++ b/builtins/target-neon.ll @@ -509,15 +509,38 @@ define float @__reduce_max_float(<4 x float>) nounwind readnone { neon_reduce(float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) } -define internal i32 @add_i32(i32, i32) { - %r = add i32 %0, %1 +declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone + +define i16 @__reduce_add_int8() nounwind readnone { + %v8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <8 x i32> + %a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %v8) + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 + %r16 = trunc i32 %r to i16 + ret i16 %r16 +} + +declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone + +define i32 @__reduce_add_int16() nounwind readnone { + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %0) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 ret i32 %r } -declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone -define i32 @__reduce_add_int32() nounwind readnone { - neon_reduce(i32, @llvm.arm.neon.vpadd.v2i32, @add_i32) +define i64 @__reduce_add_int32() nounwind readnone { + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %0) + %a0 = extractelement <2 x i64> %a64, i32 0 + %a1 = extractelement <2 x i64> %a64, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r } declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone diff --git a/builtins/target-sse2-x2.ll 
b/builtins/target-sse2-x2.ll index 73361720..da22a66c 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -367,6 +367,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + define <4 x float> @__vec4_add_float(<4 x float> %v0, <4 x float> %v1) nounwind readnone alwaysinline { %v = fadd <4 x float> %v0, %v1 diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index 2bb06391..a6b206b6 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -267,6 +267,36 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <4 x i16> @__add_varying_i16(<4 x i16>, + <4 x i16>) nounwind readnone alwaysinline { + %r = add <4 x i16> %0, %1 + ret <4 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline { + reduce4(i16, @__add_varying_i16, @__add_uniform_i16) +} + define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline { %v1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index 2044fbee..d1563988 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -253,6 +253,36 @@ define i1 @__none(<8 x MASK>) nounwind readnone alwaysinline { ret i1 %meq } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline 
{ + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + define internal <8 x float> @__add_varying_float(<8 x float>, <8 x float>) { %r = fadd <8 x float> %0, %1 ret <8 x float> %r diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index cd8fdce2..85b7bbe7 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -261,6 +261,33 @@ define i1 @__none(<16 x i8>) nounwind readnone alwaysinline { ret i1 %meq } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline { + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <16 x i16> @__add_varying_i16(<16 x i16>, + <16 x i16>) nounwind readnone alwaysinline { + %r = add <16 x i16> %0, %1 + ret <16 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline { + reduce16(i16, @__add_varying_i16, @__add_uniform_i16) +} + define internal <16 x float> @__add_varying_float(<16 x float>, <16 x float>) { %r = fadd <16 x float> %0, %1 ret <16 x float> %r diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index ccae4d51..e2debbc2 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -309,6 +309,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline { reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float) } diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index f622b839..98a7ef69 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -299,6 +299,36 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 
+  ret i16 %r16
+}
+
+define internal <4 x i16> @__add_varying_i16(<4 x i16>,
+                                             <4 x i16>) nounwind readnone alwaysinline {
+  %r = add <4 x i16> %0, %1
+  ret <4 x i16> %r
+}
+
+define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
+  %r = add i16 %0, %1
+  ret i16 %r
+}
+
+define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
+  reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
+}
+
 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone

 define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
diff --git a/docs/ispc.rst b/docs/ispc.rst
index c6c63172..39d3a5c8 100755
--- a/docs/ispc.rst
+++ b/docs/ispc.rst
@@ -3711,29 +3711,44 @@ instances are added together by the ``reduce_add()`` function.

 ::

-    uniform float reduce_add(float x)
-    uniform int reduce_add(int x)
-    uniform unsigned int reduce_add(unsigned int x)
+    uniform int16 reduce_add(int8 x)
+    uniform unsigned int16 reduce_add(unsigned int8 x)
+    uniform int32 reduce_add(int16 x)
+    uniform unsigned int32 reduce_add(unsigned int16 x)
+    uniform int64 reduce_add(int32 x)
+    uniform unsigned int64 reduce_add(unsigned int32 x)
+    uniform int64 reduce_add(int64 x)
+    uniform unsigned int64 reduce_add(unsigned int64 x)

-You can also use functions to compute the minimum and maximum value of the
-given value across all of the currently-executing program instances.
+    uniform float reduce_add(float x)
+    uniform double reduce_add(double x)
+
+You can also use functions to compute the minimum value of the given value
+across all of the currently-executing program instances.

 ::

-    uniform float reduce_min(float a)
     uniform int32 reduce_min(int32 a)
     uniform unsigned int32 reduce_min(unsigned int32 a)
-    uniform double reduce_min(double a)
     uniform int64 reduce_min(int64 a)
     uniform unsigned int64 reduce_min(unsigned int64 a)
-    uniform float reduce_max(float a)
+    uniform float reduce_min(float a)
+    uniform double reduce_min(double a)
+
+Equivalent functions are available to compute the maximum of the given
+varying variable over the active program instances.
+
+::
+
     uniform int32 reduce_max(int32 a)
     uniform unsigned int32 reduce_max(unsigned int32 a)
-    uniform double reduce_max(double a)
     uniform int64 reduce_max(int64 a)
     uniform unsigned int64 reduce_max(unsigned int64 a)
+    uniform float reduce_max(float a)
+    uniform double reduce_max(double a)
+
 Finally, you can check to see if a particular value has the same value in
 all of the currently-running program instances:

@@ -3741,9 +3756,10 @@ all of the currently-running program instances:

     uniform bool reduce_equal(int32 v)
     uniform bool reduce_equal(unsigned int32 v)
-    uniform bool reduce_equal(float v)
     uniform bool reduce_equal(int64 v)
     uniform bool reduce_equal(unsigned int64 v)
+
+    uniform bool reduce_equal(float v)
     uniform bool reduce_equal(double)

 There are also variants of these functions that return the value as a
@@ -3758,10 +3774,11 @@ performance in the `Performance Guide`_.
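For illustration, here is a small usage sketch of the widened ``reduce_add()`` return types documented above (an assumed example, not text from the patch; the kernel and its name are hypothetical):

::

    // Hypothetical kernel: sum byte-sized samples across the gang.
    // reduce_add(int8) returns a uniform int16 partial sum, so the
    // per-gang reduction cannot overflow the 8-bit element type.
    export uniform int64 sum_bytes(uniform int8 vals[], uniform int count) {
        uniform int64 total = 0;
        foreach (i = 0 ... count) {
            int8 v = vals[i];
            total += reduce_add(v);   // uniform int16, accumulated into int64
        }
        return total;
    }

Lanes that are masked off (for example, in a partial final ``foreach`` iteration) contribute zero to the sum, as the stdlib implementations added in this patch arrange explicitly.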
uniform bool reduce_equal(int32 v, uniform int32 * uniform sameval) uniform bool reduce_equal(unsigned int32 v, uniform unsigned int32 * uniform sameval) - uniform bool reduce_equal(float v, uniform float * uniform sameval) uniform bool reduce_equal(int64 v, uniform int64 * uniform sameval) uniform bool reduce_equal(unsigned int64 v, uniform unsigned int64 * uniform sameval) + + uniform bool reduce_equal(float v, uniform float * uniform sameval) uniform bool reduce_equal(double, uniform double * uniform sameval) If called when none of the program instances are running, diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index 828c1ab4..6d4fe1f4 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1162,19 +1162,20 @@ REDUCE_ADD(double, __vec16_d, __reduce_add_double) REDUCE_MINMAX(double, __vec16_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec16_d, __reduce_max_double, >) -REDUCE_ADD(uint32_t, __vec16_i32, __reduce_add_int32) +REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec16_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec16_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec16_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >) diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 64b82cb1..12c4f84e 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -1231,19 +1231,20 @@ REDUCE_ADD(double, __vec32_d, __reduce_add_double) REDUCE_MINMAX(double, __vec32_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec32_d, __reduce_max_double, >) -REDUCE_ADD(uint32_t, __vec32_i32, __reduce_add_int32) +REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec32_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec32_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec32_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec32_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, __vec32_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec32_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec32_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec32_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec32_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec32_i64, __reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec32_i64, __reduce_max_uint64, >) diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index 7869faa5..a3648f42 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -1364,19 +1364,20 @@ REDUCE_ADD(double, __vec64_d, __reduce_add_double) REDUCE_MINMAX(double, __vec64_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec64_d, __reduce_max_double, >) 
-REDUCE_ADD(uint32_t, __vec64_i32, __reduce_add_int32) +REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec64_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec64_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec64_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec64_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec64_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec64_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, __vec64_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec64_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec64_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec64_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec64_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec64_i64, __reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec64_i64, __reduce_max_uint64, >) diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index bf383c88..41c4cbc0 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1511,6 +1511,22 @@ static FORCEINLINE int64_t __count_trailing_zeros_i64(const __vec1_i64 mask) { // reductions /////////////////////////////////////////////////////////////////////////// +static FORCEINLINE int16_t __reduce_add_i8(__vec16_i8 v) { + // TODO: improve this! + int16_t ret = 0; + for (int i = 0; i < 16; ++i) + ret += v.v[i]; + return ret; +} + +static FORCEINLINE int32_t __reduce_add_i16(__vec16_i16 v) { + // TODO: improve this! + int32_t ret = 0; + for (int i = 0; i < 16; ++i) + ret += v.v[i]; + return ret; +} + static FORCEINLINE uint32_t __reduce_add_i32(__vec16_i32 v) { return _mm512_reduce_add_epi32(v); } diff --git a/examples/intrinsics/knc2x.h b/examples/intrinsics/knc2x.h index 0041a6c9..5b6e5295 100644 --- a/examples/intrinsics/knc2x.h +++ b/examples/intrinsics/knc2x.h @@ -1607,6 +1607,9 @@ static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { /////////////////////////////////////////////////////////////////////////// // reductions +REDUCE_ADD(int16_t, __vec32_i8, __reduce_add_int8) +REDUCE_ADD(int32_t, __vec32_i16, __reduce_add_int16) + static FORCEINLINE float __reduce_add_float(__vec32_f v) { return _mm512_reduce_add_ps(v.v1) + _mm512_reduce_add_ps(v.v2); } diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index d4739d61..30f90b31 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -2528,6 +2528,22 @@ static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { /////////////////////////////////////////////////////////////////////////// // reductions +static FORCEINLINE int16_t __reduce_add_int8(__vec4_i8 v) { + // TODO: improve + int16_t ret = 0; + for (int i = 0; i < 4; ++i) + ret += v.v[i]; + return ret; +} + +static FORCEINLINE int32_t __reduce_add_int16(__vec4_i16 v) { + // TODO: improve + int32_t ret = 0; + for (int i = 0; i < 4; ++i) + ret += v.v[i]; + return ret; +} + static FORCEINLINE float __reduce_add_float(__vec4_f v) { float r = bits_as_float(_mm_extract_ps(v.v, 0)); r += bits_as_float(_mm_extract_ps(v.v, 1)); diff --git a/stdlib.ispc b/stdlib.ispc index 7e848481..c9c66252 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -887,13 +887,32 @@ static inline uniform double select(uniform bool c, uniform double a, /////////////////////////////////////////////////////////////////////////// // Horizontal ops / reductions +__declspec(safe) +static inline uniform int16 reduce_add(int8 x) { + return __reduce_add_int8(__mask ? 
x : (int8)0); +} + +__declspec(safe) +static inline uniform unsigned int16 reduce_add(unsigned int8 x) { + return __reduce_add_int8(__mask ? x : (int8)0); +} + +__declspec(safe) +static inline uniform int32 reduce_add(int16 x) { + return __reduce_add_int16(__mask ? x : (int16)0); +} + +__declspec(safe) +static inline uniform unsigned int32 reduce_add(unsigned int16 x) { + return __reduce_add_int16(__mask ? x : (int16)0); +} + __declspec(safe) static inline uniform float reduce_add(float x) { // zero the lanes where the mask is off return __reduce_add_float(__mask ? x : 0.); } - __declspec(safe) static inline uniform float reduce_min(float v) { // For the lanes where the mask is off, replace the given value with @@ -915,7 +934,7 @@ static inline uniform float reduce_max(float v) { } __declspec(safe) -static inline uniform int reduce_add(int x) { +static inline uniform int64 reduce_add(int32 x) { // Zero out the values for lanes that aren't running return __reduce_add_int32(__mask ? x : 0); } @@ -937,7 +956,7 @@ static inline uniform int reduce_max(int v) { } __declspec(safe) -static inline uniform unsigned int reduce_add(unsigned int x) { +static inline uniform unsigned int64 reduce_add(unsigned int32 x) { // Set values for non-running lanes to zero so they don't affect the // result. return __reduce_add_int32(__mask ? x : 0); diff --git a/tests/reduce-add-int16-1.ispc b/tests/reduce-add-int16-1.ispc new file mode 100644 index 00000000..58529ca1 --- /dev/null +++ b/tests/reduce-add-int16-1.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + float v = aFOO[programIndex]; + uniform float m; + int16 iv = (int)v; + if (iv & 1) + m = reduce_add(iv); + RET[programIndex] = m; +} + +export void result(uniform float RET[]) { + uniform int x = 0; + for (uniform int i = 1; i <= programCount; i += 2) + x += i; + RET[programIndex] = x; +} + diff --git a/tests/reduce-add-int16.ispc b/tests/reduce-add-int16.ispc new file mode 100644 index 00000000..8657b201 --- /dev/null +++ b/tests/reduce-add-int16.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + float v = aFOO[programIndex]; + uniform float m; + int16 iv = (int)v; +/*CO if (iv & 1)*/ + m = reduce_add(iv); + RET[programIndex] = m; +} + +export void result(uniform float RET[]) { + uniform int x = 0; + for (uniform int i = 1; i <= programCount; ++i) + x += i; + RET[programIndex] = x; +} + diff --git a/tests/reduce-add-int8-1.ispc b/tests/reduce-add-int8-1.ispc new file mode 100644 index 00000000..e5310aae --- /dev/null +++ b/tests/reduce-add-int8-1.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + float v = aFOO[programIndex]; + uniform float m; + int8 iv = (int)v; + if (iv & 1) + m = reduce_add(iv); + RET[programIndex] = m; +} + +export void result(uniform float RET[]) { + uniform int x = 0; + for (uniform int i = 1; i <= programCount; i += 2) + x += i; + RET[programIndex] = x; +} + diff --git a/tests/reduce-add-int8.ispc b/tests/reduce-add-int8.ispc new file mode 100644 index 00000000..7e0dd027 --- /dev/null +++ b/tests/reduce-add-int8.ispc @@ -0,0 +1,18 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int8 db = b-4; + int8 
iv = programIndex + db; + int m = reduce_add(iv); + RET[programIndex] = m; +} + +export void result(uniform float RET[]) { + uniform int x = 0; + for (uniform int i = 1; i <= programCount; ++i) + x += i; + RET[programIndex] = x; +} + From fa93cb7d0ba3bcd587ca5dd6bfaa0a6f083cb2b7 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Mon, 29 Jul 2013 22:46:36 -0700 Subject: [PATCH 014/124] InterlockedAdd -> InterlockedExchangeAdd for better portability (InterlockedAdd is not always supported) --- examples/tasksys.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tasksys.cpp b/examples/tasksys.cpp index b4ced5c7..c9c2fa7b 100644 --- a/examples/tasksys.cpp +++ b/examples/tasksys.cpp @@ -365,7 +365,7 @@ lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) static inline int32_t lAtomicAdd(volatile int32_t *v, int32_t delta) { #ifdef ISPC_IS_WINDOWS - return InterlockedAdd((volatile LONG *)v, delta); + return InterlockedExchangeAdd((volatile LONG *)v, delta)+delta; #else return __sync_fetch_and_add(v, delta); #endif From ab3b633733ec05f3778e46f792a98844e9ee5900 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Mon, 29 Jul 2013 16:14:58 -0700 Subject: [PATCH 015/124] Add 8-bit and 16-bit specialized NEON targets. Like SSE4-8 and SSE4-16, these use 8-bit and 16-bit values for mask elements, respectively, and thus should generate the best code when used for computation with datatypes of those sizes. --- Makefile | 6 +- builtins.cpp | 28 +- builtins/target-neon-16.ll | 458 ++++++++++++++++ .../{target-neon.ll => target-neon-32.ll} | 305 +---------- builtins/target-neon-8.ll | 508 ++++++++++++++++++ builtins/target-neon-common.ll | 351 ++++++++++++ builtins/util.m4 | 120 ++++- ispc.cpp | 41 +- ispc.h | 3 +- ispc.vcxproj | 111 ++-- module.cpp | 1 + run_tests.py | 2 +- 12 files changed, 1561 insertions(+), 373 deletions(-) create mode 100644 builtins/target-neon-16.ll rename builtins/{target-neon.ll => target-neon-32.ll} (62%) create mode 100644 builtins/target-neon-8.ll create mode 100644 builtins/target-neon-common.ll diff --git a/Makefile b/Makefile index fc064dbd..98729bfc 100644 --- a/Makefile +++ b/Makefile @@ -122,8 +122,10 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ type.cpp util.cpp HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h -TARGETS=neon avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \ - sse4-8 sse4-16 generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 +TARGETS=neon-32 neon-16 neon-8 \ + avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ + sse2 sse2-x2 sse4 sse4-x2 sse4-8 sse4-16 \ + generic-1 generic-4 generic-8 generic-16 generic-32 generic-64 # These files need to be compiled in two versions - 32 and 64 bits. BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) # These are files to be compiled in single version. diff --git a/builtins.cpp b/builtins.cpp index 08472623..e671a491 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -657,7 +657,9 @@ AddBitcodeToModule(const unsigned char *bitcode, int length, // the values for an ARM target. This maybe won't cause problems // in the generated code, since bulitins.c doesn't do anything too // complex w.r.t. struct layouts, etc. 
- if (g->target->getISA() != Target::NEON) + if (g->target->getISA() != Target::NEON32 && + g->target->getISA() != Target::NEON16 && + g->target->getISA() != Target::NEON8) #endif // !__arm__ { Assert(bcTriple.getArch() == llvm::Triple::UnknownArch || @@ -820,12 +822,30 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // Next, add the target's custom implementations of the various needed // builtin functions (e.g. __masked_store_32(), etc). switch (g->target->getISA()) { - case Target::NEON: { + case Target::NEON8: { if (runtime32) { - EXPORT_MODULE(builtins_bitcode_neon_32bit); + EXPORT_MODULE(builtins_bitcode_neon_8_32bit); } else { - EXPORT_MODULE(builtins_bitcode_neon_64bit); + EXPORT_MODULE(builtins_bitcode_neon_8_64bit); + } + break; + } + case Target::NEON16: { + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_neon_16_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_neon_16_64bit); + } + break; + } + case Target::NEON32: { + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_neon_32_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_neon_32_64bit); } break; } diff --git a/builtins/target-neon-16.ll b/builtins/target-neon-16.ll new file mode 100644 index 00000000..fd15eb0b --- /dev/null +++ b/builtins/target-neon-16.ll @@ -0,0 +1,458 @@ +;; +;; target-neon-16.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
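The defines that follow make this an 8-wide target whose mask elements are 16 bits, matching the data size the target is specialized for. As a rough illustration of the intended use (an assumed example, not part of the patch; the kernel name and the use of the standard library's clamp() are mine), the neon-16 variant is aimed at kernels that work mostly on int16 data:

    // Hypothetical int16 kernel: with WIDTH = 8 and MASK = i16, the per-lane
    // execution mask has the same width as the values being processed.
    export void clamp_add16(uniform int16 a[], uniform int16 b[],
                            uniform int16 out[], uniform int count) {
        foreach (i = 0 ... count) {
            int32 s = (int32)a[i] + (int32)b[i];       // widen to avoid overflow
            out[i] = (int16)clamp(s, -32768, 32767);   // saturate back to int16
        }
    }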
+ +define(`WIDTH',`8') +define(`MASK',`i16') + +include(`util.m4') +include(`target-neon-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone { + unary4to8conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v) + ret <8 x float> %r +} + +define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone { + unary4to8conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v) + ret <8 x i16> %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast <8 x float> %0 to <8 x i32> + %bitop.i.i = and <8 x i32> %float_to_int_bitcast.i.i.i.i, + + %bitop.i = xor <8 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i40.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %int_to_float_bitcast.i.i40.i, + + %binop21.i = fadd <8 x float> %binop.i, + + %float_to_int_bitcast.i.i.i = bitcast <8 x float> %binop21.i to <8 x i32> + %bitop31.i = xor <8 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop31.i to <8 x float> + ret <8 x float> %int_to_float_bitcast.i.i.i +} + +define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind + %bincmp.i = fcmp ogt <8 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32> + %bitop.i = and <8 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <8 x float> %binop.i +} + +define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind + %bincmp.i = fcmp olt <8 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32> + %bitop.i = and <8 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <8 x float> %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare @__round_varying_double() nounwind readnone +declare @__floor_varying_double() nounwind readnone +declare @__ceil_varying_double() nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__max_varying_float(, + ) nounwind readnone { + binary4to8(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1) + ret %r +} + +define @__min_varying_float(, + ) nounwind readnone { + binary4to8(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1) + ret %r +} + +declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 
x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +define @__min_varying_int32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1) + ret %r +} + +define @__max_varying_int32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1) + ret %r +} + +define @__min_varying_uint32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1) + ret %r +} + +define @__max_varying_uint32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1) + ret %r +} + +;; sqrt/rsqrt/rcp + +declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rcp_varying_float( %d) nounwind readnone { + unary4to8(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d) + binary4to8(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0) + %x1 = fmul %x0, %x0_nr + binary4to8(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %d) nounwind readnone { + unary4to8(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d) + %x0_2 = fmul %x0, %x0 + binary4to8(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2) + %x1 = fmul %x0, %x0_nr + %x1_2 = fmul %x1, %x1 + binary4to8(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +define float @__rsqrt_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <8 x i32> + %vr = call <8 x float> @__rsqrt_varying_float(<8 x float> %vs) + %r = extractelement <8 x float> %vr, i32 0 + ret float %r +} + +define float @__rcp_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <8 x i32> + %vr = call <8 x float> @__rcp_varying_float(<8 x float> %vs) + %r = extractelement <8 x float> %vr, i32 0 + ret float %r +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + +define @__sqrt_varying_float() nounwind readnone { + unary4to8(result, float, @llvm.sqrt.v4f32, %0) +;; this returns nan for v=0, which is undesirable.. 
+;; %rsqrt = call @__rsqrt_varying_float( %0) +;; %result = fmul <4 x float> %rsqrt, %0 + ret <8 x float> %result +} + +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) + +define @__sqrt_varying_double() nounwind readnone { + unary4to8(r, double, @llvm.sqrt.v4f64, %0) + ret %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reductions + +define i64 @__movmsk() nounwind readnone { + %and_mask = and %0, + + %v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %and_mask) + %v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4) + %va = extractelement <2 x i64> %v2, i32 0 + %vb = extractelement <2 x i64> %v2, i32 1 + %v = or i64 %va, %vb + ret i64 %v +} + +define i1 @__any() nounwind readnone alwaysinline { + v8tov4(MASK, %0, %v0123, %v4567) + %vor = or <4 x MASK> %v0123, %v4567 + %v0 = extractelement <4 x MASK> %vor, i32 0 + %v1 = extractelement <4 x MASK> %vor, i32 1 + %v2 = extractelement <4 x MASK> %vor, i32 2 + %v3 = extractelement <4 x MASK> %vor, i32 3 + %v01 = or MASK %v0, %v1 + %v23 = or MASK %v2, %v3 + %v = or MASK %v01, %v23 + %cmp = icmp ne MASK %v, 0 + ret i1 %cmp +} + +define i1 @__all() nounwind readnone alwaysinline { + v8tov4(MASK, %0, %v0123, %v4567) + %vand = and <4 x MASK> %v0123, %v4567 + %v0 = extractelement <4 x MASK> %vand, i32 0 + %v1 = extractelement <4 x MASK> %vand, i32 1 + %v2 = extractelement <4 x MASK> %vand, i32 2 + %v3 = extractelement <4 x MASK> %vand, i32 3 + %v01 = and MASK %v0, %v1 + %v23 = and MASK %v2, %v3 + %v = and MASK %v01, %v23 + %cmp = icmp ne MASK %v, 0 + ret i1 %cmp +} + +define i1 @__none() nounwind readnone alwaysinline { + %any = call i1 @__any( %0) + %none = icmp eq i1 %any, 0 + ret i1 %none +} + +;; $1: scalar type +;; $2: vector/vector reduce function (2 x -> ) +;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>) +;; $4: scalar reduce function + +define(`neon_reduce', ` + v8tov4($1, %0, %v0123, %v4567) + %v0123_8 = shufflevector <4 x $1> %v0123, <4 x $1> undef, + <8 x i32> + %v4567_8 = shufflevector <4 x $1> %v4567, <4 x $1> undef, + <8 x i32> + %vfirst = call <8 x $1> $2(<8 x $1> %v0123_8, <8 x $1> %v4567_8) + %vfirst_4 = shufflevector <8 x $1> %vfirst, <8 x $1> undef, + <4 x i32> + v4tov2($1, %vfirst_4, %v0, %v1) + %vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1) + %vh0 = extractelement <2 x $1> %vh, i32 0 + %vh1 = extractelement <2 x $1> %vh, i32 1 + %r = call $1 $4($1 %vh0, $1 %vh1) + ret $1 %r +') + +declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @add_f32(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define internal @__add_varying_float(, ) { + %r = fadd %0, %1 + ret %r +} + +define float @__reduce_add_float() nounwind readnone { + neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @min_f32(float, float) { + %cmp = fcmp olt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_min_float() nounwind readnone { + neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @max_f32(float, float) { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_max_float() nounwind 
readnone { + neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) +} + +declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone + +define i16 @__reduce_add_int8() nounwind readnone { + %a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %0) + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 + %r16 = trunc i32 %r to i16 + ret i16 %r16 +} + +declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16() + +define i64 @__reduce_add_int16() nounwind readnone { + %a1 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16( %0) + %a2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a1) + %aa = extractelement <2 x i64> %a2, i32 0 + %ab = extractelement <2 x i64> %a2, i32 1 + %r = add i64 %aa, %ab + ret i64 %r +} + +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone + +define i64 @__reduce_add_int32() nounwind readnone { + v8tov4(i32, %0, %va, %vb) + %pa = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va) + %pb = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb) + %psum = add <2 x i64> %pa, %pb + %a0 = extractelement <2 x i64> %psum, i32 0 + %a1 = extractelement <2 x i64> %psum, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_si32(i32, i32) { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_int32() nounwind readnone { + neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_si32(i32, i32) { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_int32() nounwind readnone { + neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_ui32(i32, i32) { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_uint32() nounwind readnone { + neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpmins.v2i32, @min_ui32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_ui32(i32, i32) { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_uint32() nounwind readnone { + neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxs.v2i32, @max_ui32) +} + +define double @__reduce_add_double() nounwind readnone { + v8tov2(double, %0, %v0, %v1, %v2, %v3) + %v01 = fadd <2 x double> %v0, %v1 + %v23 = fadd <2 x double> %v2, %v3 + %sum = fadd <2 x double> %v01, %v23 + %e0 = extractelement <2 x double> %sum, i32 0 + %e1 = extractelement <2 x double> %sum, i32 1 + %m = fadd double %e0, %e1 + ret double %m +} + +define double @__reduce_min_double() nounwind readnone { + reduce8(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double() nounwind readnone { + reduce8(double, @__max_varying_double, @__max_uniform_double) +} + +define i64 
@__reduce_add_int64() nounwind readnone { + v8tov2(i64, %0, %v0, %v1, %v2, %v3) + %v01 = add <2 x i64> %v0, %v1 + %v23 = add <2 x i64> %v2, %v3 + %sum = add <2 x i64> %v01, %v23 + %e0 = extractelement <2 x i64> %sum, i32 0 + %e1 = extractelement <2 x i64> %sum, i32 1 + %m = add i64 %e0, %e1 + ret i64 %m +} + +define i64 @__reduce_min_int64() nounwind readnone { + reduce8(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64() nounwind readnone { + reduce8(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64() nounwind readnone { + reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64() nounwind readnone { + reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) +} diff --git a/builtins/target-neon.ll b/builtins/target-neon-32.ll similarity index 62% rename from builtins/target-neon.ll rename to builtins/target-neon-32.ll index fbeac352..1f8003d7 100644 --- a/builtins/target-neon.ll +++ b/builtins/target-neon-32.ll @@ -1,5 +1,5 @@ ;; -;; target-neon.ll +;; target-neon-32.ll ;; ;; Copyright(c) 2012-2013 Matt Pharr ;; Copyright(c) 2013 Google, Inc. @@ -34,52 +34,20 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -target datalayout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-a0:0:64-n32" - define(`WIDTH',`4') - define(`MASK',`i32') include(`util.m4') - -stdlib_core() -scans() -reduce_equal(WIDTH) -rdrand_decls() -define_shuffles() -aossoa() -ctlztz() +include(`target-neon-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines -declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone -declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone - -define float @__half_to_float_uniform(i16 %v) nounwind readnone { - %v1 = bitcast i16 %v to <1 x i16> - %vec = shufflevector <1 x i16> %v1, <1 x i16> undef, - <4 x i32> - %h = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vec) - %r = extractelement <4 x float> %h, i32 0 - ret float %r -} - define <4 x float> @__half_to_float_varying(<4 x i16> %v) nounwind readnone { %r = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %v) ret <4 x float> %r } -define i16 @__float_to_half_uniform(float %v) nounwind readnone { - %v1 = bitcast float %v to <1 x float> - %vec = shufflevector <1 x float> %v1, <1 x float> undef, - <4 x i32> - %h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %vec) - %r = extractelement <4 x i16> %h, i32 0 - ret i16 %r -} - - define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone { %r = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %v) ret <4 x i16> %r @@ -88,48 +56,11 @@ define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; math -define void @__fastmath() nounwind { - ret void -} - ;; round/floor/ceil ;; FIXME: grabbed these from the sse2 target, which does not have native ;; instructions for these. Is there a better approach for NEON? 
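A note on the magic constants used by the round/floor/ceil routines in these NEON targets (both the varying versions that remain in this file and the uniform versions removed just below): since no native rounding instruction is used, the code adds and then subtracts 2^23 (8.388608e+06) so that, under round-to-nearest, the float addition itself discards the fraction bits, while the and/xor with 0x80000000 (-2147483648) strips and later restores the sign bit so the same constant also works for negative inputs. A minimal sketch of the core trick (an assumed illustration, not part of the patch; it ignores the sign handling and requires x in [0, 2^23)):

    // Round to nearest by letting float addition drop the fraction bits.
    // Example: (3.7 + 8388608.0) rounds to 8388612.0, so subtracting
    // 8388608.0 leaves 4.0. Valid only for 0 <= x < 2^23.
    static inline float round_nearest(float x) {
        const uniform float big = 8388608.;   // 2^23
        return (x + big) - big;
    }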
-define float @__round_uniform_float(float) nounwind readonly alwaysinline { - %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32 - %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648 - %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i - %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06 - %binop21.i = fadd float %binop.i, -8.388608e+06 - %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32 - %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float - ret float %int_to_float_bitcast.i.i.i -} - -define float @__floor_uniform_float(float) nounwind readonly alwaysinline { - %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind - %bincmp.i = fcmp ogt float %calltmp.i, %0 - %selectexpr.i = sext i1 %bincmp.i to i32 - %bitop.i = and i32 %selectexpr.i, -1082130432 - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i - ret float %binop.i -} - -define float @__ceil_uniform_float(float) nounwind readonly alwaysinline { - %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind - %bincmp.i = fcmp olt float %calltmp.i, %0 - %selectexpr.i = sext i1 %bincmp.i to i32 - %bitop.i = and i32 %selectexpr.i, 1065353216 - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i - ret float %binop.i -} - define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline { %float_to_int_bitcast.i.i.i.i = bitcast <4 x float> %0 to <4 x i32> %bitop.i.i = and <4 x i32> %float_to_int_bitcast.i.i.i.i, @@ -164,10 +95,6 @@ define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysin } ;; FIXME: rounding doubles and double vectors needs to be implemented -declare double @__round_uniform_double(double) nounwind readnone -declare double @__floor_uniform_double(double) nounwind readnone -declare double @__ceil_uniform_double(double) nounwind readnone - declare @__round_varying_double() nounwind readnone declare @__floor_varying_double() nounwind readnone declare @__ceil_varying_double() nounwind readnone @@ -175,78 +102,6 @@ declare @__ceil_varying_double() nounwind readn ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; min/max -define float @__max_uniform_float(float, float) nounwind readnone { - %cmp = fcmp ugt float %0, %1 - %r = select i1 %cmp, float %0, float %1 - ret float %r -} - -define float @__min_uniform_float(float, float) nounwind readnone { - %cmp = fcmp ult float %0, %1 - %r = select i1 %cmp, float %0, float %1 - ret float %r -} - -define i32 @__min_uniform_int32(i32, i32) nounwind readnone { - %cmp = icmp slt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__max_uniform_int32(i32, i32) nounwind readnone { - %cmp = icmp sgt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__min_uniform_uint32(i32, i32) nounwind readnone { - %cmp = icmp ult i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__max_uniform_uint32(i32, i32) nounwind readnone { - %cmp = icmp ugt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i64 @__min_uniform_int64(i64, i64) nounwind readnone { - %cmp = icmp slt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__max_uniform_int64(i64, 
i64) nounwind readnone { - %cmp = icmp sgt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__min_uniform_uint64(i64, i64) nounwind readnone { - %cmp = icmp ult i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__max_uniform_uint64(i64, i64) nounwind readnone { - %cmp = icmp ugt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define double @__min_uniform_double(double, double) nounwind readnone { - %cmp = fcmp olt double %0, %1 - %r = select i1 %cmp, double %0, double %1 - ret double %r -} - -define double @__max_uniform_double(double, double) nounwind readnone { - %cmp = fcmp ogt double %0, %1 - %r = select i1 %cmp, double %0, double %1 - ret double %r -} - declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone @@ -287,44 +142,6 @@ define @__max_varying_uint32(, ) nounwin ret <4 x i32> %r } -define @__min_varying_int64(, ) nounwind readnone { - %m = icmp slt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_int64(, ) nounwind readnone { - %m = icmp sgt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__min_varying_uint64(, ) nounwind readnone { - %m = icmp ult %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_uint64(, ) nounwind readnone { - %m = icmp ugt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__min_varying_double(, - ) nounwind readnone { - %m = fcmp olt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_double(, - ) nounwind readnone { - %m = fcmp ogt %0, %1 - %r = select %m, %0, %1 - ret %r -} - ;; sqrt/rsqrt/rcp declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone @@ -371,13 +188,6 @@ define float @__rcp_uniform_float(float) nounwind readnone { ret float %r } -declare float @llvm.sqrt.f32(float) - -define float @__sqrt_uniform_float(float) nounwind readnone { - %r = call float @llvm.sqrt.f32(float %0) - ret float %r -} - declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) define @__sqrt_varying_float() nounwind readnone { @@ -388,13 +198,6 @@ define @__sqrt_varying_float() nounwind readnone ret <4 x float> %result } -declare double @llvm.sqrt.f64(double) - -define double @__sqrt_uniform_double(double) nounwind readnone { - %r = call double @llvm.sqrt.f64(double %0) - ret double %r -} - declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) define @__sqrt_varying_double() nounwind readnone { @@ -402,21 +205,6 @@ define @__sqrt_varying_double() nounwind readno ret <4 x double> %r } -;; bit ops - -declare i32 @llvm.ctpop.i32(i32) nounwind readnone -declare i64 @llvm.ctpop.i64(i64) nounwind readnone - -define i32 @__popcnt_int32(i32) nounwind readnone { - %v = call i32 @llvm.ctpop.i32(i32 %0) - ret i32 %v -} - -define i64 @__popcnt_int64(i64) nounwind readnone { - %v = call i64 @llvm.ctpop.i64(i64 %0) - ret i64 %v -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions @@ -638,92 +426,3 @@ define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone { define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone { reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64) } - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; unaligned loads/loads+broadcasts - -masked_load(i8, 1) -masked_load(i16, 2) -masked_load(i32, 4) -masked_load(float, 4) -masked_load(i64, 8) -masked_load(double, 8) - -gen_masked_store(i8) 
-gen_masked_store(i16) -gen_masked_store(i32) -gen_masked_store(i64) -masked_store_float_double() - -define void @__masked_store_blend_i8(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i8> %new, <4 x i8> %old - store %result, * %ptr - ret void -} - -define void @__masked_store_blend_i16(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i16> %new, <4 x i16> %old - store %result, * %ptr - ret void -} - -define void @__masked_store_blend_i32(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i32> %new, <4 x i32> %old - store %result, * %ptr - ret void -} - -define void @__masked_store_blend_i64(* nocapture %ptr, - %new, %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i64> %new, <4 x i64> %old - store %result, * %ptr - ret void -} - -;; yuck. We need declarations of these, even though we shouldnt ever -;; actually generate calls to them for the NEON target... - -declare @__svml_sin() -declare @__svml_cos() -declare void @__svml_sincos(, *, *) -declare @__svml_tan() -declare @__svml_atan() -declare @__svml_atan2(, ) -declare @__svml_exp() -declare @__svml_log() -declare @__svml_pow(, ) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; gather - -gen_gather_factored(i8) -gen_gather_factored(i16) -gen_gather_factored(i32) -gen_gather_factored(float) -gen_gather_factored(i64) -gen_gather_factored(double) - -gen_scatter(i8) -gen_scatter(i16) -gen_scatter(i32) -gen_scatter(float) -gen_scatter(i64) -gen_scatter(double) - -packed_load_and_store(4) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; prefetch - -define_prefetches() diff --git a/builtins/target-neon-8.ll b/builtins/target-neon-8.ll new file mode 100644 index 00000000..eb65f224 --- /dev/null +++ b/builtins/target-neon-8.ll @@ -0,0 +1,508 @@ +;; +;; target-neon-8.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`16') +define(`MASK',`i8') + +include(`util.m4') +include(`target-neon-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone { + unary4to16conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v) + ret <16 x float> %r +} + +define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone { + unary4to16conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v) + ret <16 x i16> %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast <16 x float> %0 to <16 x i32> + %bitop.i.i = and <16 x i32> %float_to_int_bitcast.i.i.i.i, + + %bitop.i = xor <16 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i40.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %int_to_float_bitcast.i.i40.i, + + %binop21.i = fadd <16 x float> %binop.i, + + %float_to_int_bitcast.i.i.i = bitcast <16 x float> %binop21.i to <16 x i32> + %bitop31.i = xor <16 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop31.i to <16 x float> + ret <16 x float> %int_to_float_bitcast.i.i.i +} + +define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind + %bincmp.i = fcmp ogt <16 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32> + %bitop.i = and <16 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <16 x float> %binop.i +} + +define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind + %bincmp.i = fcmp olt <16 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32> + %bitop.i = and <16 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <16 x float> %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare @__round_varying_double() nounwind readnone +declare @__floor_varying_double() nounwind readnone +declare @__ceil_varying_double() nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind 
readnone +declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__max_varying_float(, + ) nounwind readnone { + binary4to16(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1) + ret %r +} + +define @__min_varying_float(, + ) nounwind readnone { + binary4to16(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1) + ret %r +} + +declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +define @__min_varying_int32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1) + ret %r +} + +define @__max_varying_int32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1) + ret %r +} + +define @__min_varying_uint32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1) + ret %r +} + +define @__max_varying_uint32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1) + ret %r +} + +;; sqrt/rsqrt/rcp + +declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rcp_varying_float( %d) nounwind readnone { + unary4to16(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d) + binary4to16(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0) + %x1 = fmul %x0, %x0_nr + binary4to16(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %d) nounwind readnone { + unary4to16(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d) + %x0_2 = fmul %x0, %x0 + binary4to16(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2) + %x1 = fmul %x0, %x0_nr + %x1_2 = fmul %x1, %x1 + binary4to16(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +define float @__rsqrt_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <16 x i32> + %vr = call <16 x float> @__rsqrt_varying_float(<16 x float> %vs) + %r = extractelement <16 x float> %vr, i32 0 + ret float %r +} + +define float @__rcp_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <16 x i32> + %vr = call <16 x float> @__rcp_varying_float(<16 x float> %vs) + %r = extractelement <16 x float> %vr, i32 0 + ret float %r +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + +define @__sqrt_varying_float() nounwind readnone { + unary4to16(result, float, @llvm.sqrt.v4f32, %0) +;; this returns nan for v=0, which is undesirable.. 
+;; %rsqrt = call @__rsqrt_varying_float( %0) +;; %result = fmul <4 x float> %rsqrt, %0 + ret <16 x float> %result +} + +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) + +define @__sqrt_varying_double() nounwind readnone { + unary4to16(r, double, @llvm.sqrt.v4f64, %0) + ret %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reductions + +define i64 @__movmsk() nounwind readnone { + %and_mask = and %0, + + %v8 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %and_mask) + %v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %v8) + %v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4) + %va = extractelement <2 x i64> %v2, i32 0 + %vb = extractelement <2 x i64> %v2, i32 1 + %vbshift = shl i64 %vb, 8 + %v = or i64 %va, %vbshift + ret i64 %v +} + +define i1 @__any() nounwind readnone alwaysinline { + v16tov8(MASK, %0, %v8a, %v8b) + %vor8 = or <8 x MASK> %v8a, %v8b + %v16 = sext <8 x i8> %vor8 to <8 x i16> + v8tov4(i16, %v16, %v16a, %v16b) + %vor16 = or <4 x i16> %v16a, %v16b + %v32 = sext <4 x i16> %vor16 to <4 x i32> + v4tov2(i32, %v32, %v32a, %v32b) + %vor32 = or <2 x i32> %v32a, %v32b + %v0 = extractelement <2 x i32> %vor32, i32 0 + %v1 = extractelement <2 x i32> %vor32, i32 1 + %v = or i32 %v0, %v1 + %cmp = icmp ne i32 %v, 0 + ret i1 %cmp +} + +define i1 @__all() nounwind readnone alwaysinline { + v16tov8(MASK, %0, %v8a, %v8b) + %vand8 = and <8 x MASK> %v8a, %v8b + %v16 = sext <8 x i8> %vand8 to <8 x i16> + v8tov4(i16, %v16, %v16a, %v16b) + %vand16 = and <4 x i16> %v16a, %v16b + %v32 = sext <4 x i16> %vand16 to <4 x i32> + v4tov2(i32, %v32, %v32a, %v32b) + %vand32 = and <2 x i32> %v32a, %v32b + %v0 = extractelement <2 x i32> %vand32, i32 0 + %v1 = extractelement <2 x i32> %vand32, i32 1 + %v = and i32 %v0, %v1 + %cmp = icmp ne i32 %v, 0 + ret i1 %cmp +} + +define i1 @__none() nounwind readnone alwaysinline { + %any = call i1 @__any( %0) + %none = icmp eq i1 %any, 0 + ret i1 %none +} + +;; $1: scalar type +;; $2: vector/vector reduce function (2 x -> ) +;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>) +;; $4: scalar reduce function + +define(`neon_reduce', ` + v16tov8($1, %0, %va, %vb) + %va_16 = shufflevector <8 x $1> %va, <8 x $1> undef, + <16 x i32> + %vb_16 = shufflevector <8 x $1> %vb, <8 x $1> undef, + <16 x i32> + %v8 = call <16 x $1> $2(<16 x $1> %va_16, <16 x $1> %vb_16) + + %v8a = shufflevector <16 x $1> %v8, <16 x $1> undef, + <16 x i32> + %v8b = shufflevector <16 x $1> %v8, <16 x $1> undef, + <16 x i32> + + %v4 = call <16 x $1> $2(<16 x $1> %v8a, <16 x $1> %v8b) + + %vfirst_4 = shufflevector <16 x $1> %v4, <16 x $1> undef, + <4 x i32> + v4tov2($1, %vfirst_4, %v0, %v1) + %vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1) + %vh0 = extractelement <2 x $1> %vh, i32 0 + %vh1 = extractelement <2 x $1> %vh, i32 1 + %r = call $1 $4($1 %vh0, $1 %vh1) + ret $1 %r +') + +declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @add_f32(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define internal @__add_varying_float(, ) { + %r = fadd %0, %1 + ret %r +} + +define float @__reduce_add_float() nounwind readnone { + neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @min_f32(float, float) { + %cmp = fcmp olt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret 
float %r +} + +define float @__reduce_min_float() nounwind readnone { + neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @max_f32(float, float) { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_max_float() nounwind readnone { + neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) +} + +declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone + +define i64 @__reduce_add_int8() nounwind readnone { + %a16 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %0) + %a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a16) + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32) + %a0 = extractelement <2 x i64> %a64, i32 0 + %a1 = extractelement <2 x i64> %a64, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +define i64 @__reduce_add_int16() nounwind readnone { + v16tov8(i16, %0, %va, %vb) + %a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %va) + %b32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %vb) + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32) + %b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %b32) + %sum = add <2 x i64> %a64, %b64 + %a0 = extractelement <2 x i64> %sum, i32 0 + %a1 = extractelement <2 x i64> %sum, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +define i64 @__reduce_add_int32() nounwind readnone { + v16tov4(i32, %0, %va, %vb, %vc, %vd) + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va) + %b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb) + %c64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vc) + %d64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vd) + %ab = add <2 x i64> %a64, %b64 + %cd = add <2 x i64> %c64, %d64 + %sum = add <2 x i64> %ab, %cd + %a0 = extractelement <2 x i64> %sum, i32 0 + %a1 = extractelement <2 x i64> %sum, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_si32(i32, i32) { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_int32() nounwind readnone { + neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_si32(i32, i32) { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_int32() nounwind readnone { + neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_ui32(i32, i32) { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_uint32() nounwind readnone { + neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpmins.v2i32, @min_ui32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 
@max_ui32(i32, i32) { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_uint32() nounwind readnone { + neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxs.v2i32, @max_ui32) +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define internal @__add_varying_double(, ) { + %r = fadd %0, %1 + ret %r +} + +define double @__reduce_add_double() nounwind readnone { + reduce16(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double() nounwind readnone { + reduce16(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double() nounwind readnone { + reduce16(double, @__max_varying_double, @__max_uniform_double) +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define internal @__add_varying_int64(, ) { + %r = add %0, %1 + ret %r +} + +define i64 @__reduce_add_int64() nounwind readnone { + reduce16(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define i64 @__reduce_min_int64() nounwind readnone { + reduce16(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64() nounwind readnone { + reduce16(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64() nounwind readnone { + reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64() nounwind readnone { + reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64) +} diff --git a/builtins/target-neon-common.ll b/builtins/target-neon-common.ll new file mode 100644 index 00000000..696b0748 --- /dev/null +++ b/builtins/target-neon-common.ll @@ -0,0 +1,351 @@ +;; +;; target-neon-common.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
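+;; This file collects definitions shared by the ARM NEON targets
+;; (target-neon-8/16/32): half<->float conversion, scalar math helpers,
+;; uniform min/max, masked memory operations, and gathers/scatters.
+;;
+;; The datalayout string below describes a standard 32-bit ARM layout,
+;; roughly: "e" = little-endian, "p:32:32:32" = 32-bit pointers,
+;; "i64:32:64" = i64 may be aligned to 32 bits, "n32" = 32-bit native
+;; integer width.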
+ +target datalayout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-a0:0:64-n32" + +stdlib_core() +scans() +reduce_equal(WIDTH) +rdrand_decls() +define_shuffles() +aossoa() +ctlztz() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone + +define float @__half_to_float_uniform(i16 %v) nounwind readnone { + %v1 = bitcast i16 %v to <1 x i16> + %vec = shufflevector <1 x i16> %v1, <1 x i16> undef, + <4 x i32> + %h = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vec) + %r = extractelement <4 x float> %h, i32 0 + ret float %r +} + +define i16 @__float_to_half_uniform(float %v) nounwind readnone { + %v1 = bitcast float %v to <1 x float> + %vec = shufflevector <1 x float> %v1, <1 x float> undef, + <4 x i32> + %h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %vec) + %r = extractelement <4 x i16> %h, i32 0 + ret i16 %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +define void @__fastmath() nounwind { + ret void +} + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define float @__round_uniform_float(float) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32 + %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648 + %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i + %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06 + %binop21.i = fadd float %binop.i, -8.388608e+06 + %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32 + %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float + ret float %int_to_float_bitcast.i.i.i +} + +define float @__floor_uniform_float(float) nounwind readonly alwaysinline { + %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind + %bincmp.i = fcmp ogt float %calltmp.i, %0 + %selectexpr.i = sext i1 %bincmp.i to i32 + %bitop.i = and i32 %selectexpr.i, -1082130432 + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i + ret float %binop.i +} + +define float @__ceil_uniform_float(float) nounwind readonly alwaysinline { + %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind + %bincmp.i = fcmp olt float %calltmp.i, %0 + %selectexpr.i = sext i1 %bincmp.i to i32 + %bitop.i = and i32 %selectexpr.i, 1065353216 + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i + ret float %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare double @__round_uniform_double(double) nounwind readnone +declare double @__floor_uniform_double(double) nounwind readnone +declare double @__ceil_uniform_double(double) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +define float @__max_uniform_float(float, float) nounwind readnone { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__min_uniform_float(float, 
float) nounwind readnone { + %cmp = fcmp ult float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define i32 @__min_uniform_int32(i32, i32) nounwind readnone { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__max_uniform_int32(i32, i32) nounwind readnone { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__min_uniform_uint32(i32, i32) nounwind readnone { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__max_uniform_uint32(i32, i32) nounwind readnone { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i64 @__min_uniform_int64(i64, i64) nounwind readnone { + %cmp = icmp slt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__max_uniform_int64(i64, i64) nounwind readnone { + %cmp = icmp sgt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__min_uniform_uint64(i64, i64) nounwind readnone { + %cmp = icmp ult i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__max_uniform_uint64(i64, i64) nounwind readnone { + %cmp = icmp ugt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define double @__min_uniform_double(double, double) nounwind readnone { + %cmp = fcmp olt double %0, %1 + %r = select i1 %cmp, double %0, double %1 + ret double %r +} + +define double @__max_uniform_double(double, double) nounwind readnone { + %cmp = fcmp ogt double %0, %1 + %r = select i1 %cmp, double %0, double %1 + ret double %r +} + +define @__min_varying_int64(, ) nounwind readnone { + %m = icmp slt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_int64(, ) nounwind readnone { + %m = icmp sgt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__min_varying_uint64(, ) nounwind readnone { + %m = icmp ult %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_uint64(, ) nounwind readnone { + %m = icmp ugt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__min_varying_double(, + ) nounwind readnone { + %m = fcmp olt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_double(, + ) nounwind readnone { + %m = fcmp ogt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +;; sqrt/rsqrt/rcp + +declare float @llvm.sqrt.f32(float) + +define float @__sqrt_uniform_float(float) nounwind readnone { + %r = call float @llvm.sqrt.f32(float %0) + ret float %r +} + +declare double @llvm.sqrt.f64(double) + +define double @__sqrt_uniform_double(double) nounwind readnone { + %r = call double @llvm.sqrt.f64(double %0) + ret double %r +} + +;; bit ops + +declare i32 @llvm.ctpop.i32(i32) nounwind readnone +declare i64 @llvm.ctpop.i64(i64) nounwind readnone + +define i32 @__popcnt_int32(i32) nounwind readnone { + %v = call i32 @llvm.ctpop.i32(i32 %0) + ret i32 %v +} + +define i64 @__popcnt_int64(i64) nounwind readnone { + %v = call i64 @llvm.ctpop.i64(i64 %0) + ret i64 %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) +masked_store_float_double() + +define void @__masked_store_blend_i8(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc 
%mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i16(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i32(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i64(* nocapture %ptr, + %new, %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +;; yuck. We need declarations of these, even though we shouldnt ever +;; actually generate calls to them for the NEON target... + +declare @__svml_sin() +declare @__svml_cos() +declare void @__svml_sincos(, *, *) +declare @__svml_tan() +declare @__svml_atan() +declare @__svml_atan2(, ) +declare @__svml_exp() +declare @__svml_log() +declare @__svml_pow(, ) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) + +packed_load_and_store(4) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; prefetch + +define_prefetches() diff --git a/builtins/util.m4 b/builtins/util.m4 index ee45ebc7..1f85e2cc 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -49,6 +49,53 @@ define(`MASK_HIGH_BIT_ON', ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; vector assembly and deconstruction utilities +;; split 8-wide vector into 2 4-wide vectors +;; +;; $1: vector element type +;; $2: 8-wide vector +;; $3: first 4-wide vector +;; $4: second 4-wide vector + +define(`v8tov4', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, + <4 x i32> + $4 = shufflevector <8 x $1> $2, <8 x $1> undef, + <4 x i32> +') + +define(`v16tov8', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> + $4 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> +') + +;; 4-wide into 2 2-wide +;; args as above +;; + +define(`v4tov2', ` + $3 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> + $4 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> +') + +define(`v8tov2', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $4 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $5 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $6 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> +') + +define(`v16tov4', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $4 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $5 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $6 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Helper macro for calling various SSE instructions for scalar values ;; but where the instruction takes a vector parameter. 
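;; (i.e. the scalar is put into lane 0 of a 4-wide vector, the vector
;; instruction is applied, and lane 0 of the result is extracted.)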
;; $1 : name of variable to put the final value in @@ -156,10 +203,7 @@ define(`reduce16', ` ;; the final reduction define(`reduce8by4', ` - %v1 = shufflevector <8 x $1> %0, <8 x $1> undef, - <4 x i32> - %v2 = shufflevector <8 x $1> %0, <8 x $1> undef, - <4 x i32> + v8tov4($1, %0, %v1, %v2) %m1 = call <4 x $1> $2(<4 x $1> %v1, <4 x $1> %v2) %v3 = shufflevector <4 x $1> %m1, <4 x $1> undef, <4 x i32> @@ -266,30 +310,66 @@ define(`binary2to4', ` ;; $4: 8-wide operand value define(`unary4to8', ` - %$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> - %v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0) - %$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> - %v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1) - %$1 = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, + %__$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> + %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0) + %__$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> + %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1) + %$1 = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, + <8 x i32> +' +) + +;; $1: name of variable into which the final result should go +;; $2: scalar type of the input vector elements +;; $3: scalar type of the result vector elements +;; $4: 4-wide unary vector function to apply +;; $5: 8-wide operand value + +define(`unary4to8conv', ` + %$1_0 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> + %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0) + %$1_1 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> + %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1) + %$1 = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, <8 x i32> ' ) define(`unary4to16', ` - %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0) - %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1) - %$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_2 = call <4 x $2> $3(<4 x $2> %$1_2) - %$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_3 = call <4 x $2> $3(<4 x $2> %$1_3) + %__$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0) + %__$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1) + %__$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_2 = call <4 x $2> $3(<4 x $2> %__$1_2) + %__$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_3 = call <4 x $2> $3(<4 x $2> %__$1_3) - %$1a = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, + %__$1a = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, <8 x i32> - %$1b = shufflevector <4 x $2> %v$1_2, <4 x $2> %v$1_3, + %__$1b = shufflevector <4 x $2> %__v$1_2, <4 x $2> %__v$1_3, <8 x i32> - %$1 = shufflevector <8 x $2> %$1a, <8 x $2> %$1b, + %$1 = shufflevector <8 x $2> %__$1a, <8 x $2> %__$1b, + <16 x i32> +' +) + +define(`unary4to16conv', ` + %$1_0 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1) + %$1_2 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_2 = call <4 x $3> $4(<4 x $2> %$1_2) + %$1_3 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_3 = call <4 x $3> $4(<4 x $2> %$1_3) + + %$1a = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, + <8 x i32> + %$1b = shufflevector <4 x $3> %v$1_2, <4 x $3> %v$1_3, + <8 x i32> + %$1 = 
shufflevector <8 x $3> %$1a, <8 x $3> %$1b, <16 x i32> ' diff --git a/ispc.cpp b/ispc.cpp index a9f5ff5c..de8fba4d 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -106,7 +106,7 @@ static void __cpuidex(int info[4], int level, int count) { static const char * lGetSystemISA() { #ifdef __arm__ - return "neon"; + return "neon-32"; #else int info[4]; __cpuid(info, 1); @@ -187,7 +187,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : isa = "avx2"; else if (!strcmp(cpu, "cortex-a9") || !strcmp(cpu, "cortex-a15")) - isa = "neon"; + isa = "neon-32"; else if (!strcmp(cpu, "core-avx-i")) isa = "avx1.1"; else if (!strcmp(cpu, "sandybridge") || @@ -212,7 +212,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } #if !defined(__arm__) - if (cpu == NULL && !strcmp(isa, "neon")) + if (cpu == NULL && !strncmp(isa, "neon", 4)) // If we're compiling NEON on an x86 host and the CPU wasn't // supplied, don't go and set the CPU based on the host... cpu = "cortex-a9"; @@ -246,7 +246,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_cpu = cpu; if (arch == NULL) { - if (!strcmp(isa, "neon")) + if (!strncmp(isa, "neon", 4)) arch = "arm"; else arch = "x86-64"; @@ -461,8 +461,26 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasGather = true; #endif } - else if (!strcasecmp(isa, "neon")) { - this->m_isa = Target::NEON; + else if (!strcasecmp(isa, "neon-8")) { + this->m_isa = Target::NEON8; + this->m_nativeVectorWidth = 16; + this->m_vectorWidth = 16; + this->m_attributes = "+neon,+fp16"; + this->m_hasHalf = true; // ?? + this->m_maskingIsFree = false; + this->m_maskBitCount = 8; + } + else if (!strcasecmp(isa, "neon-16")) { + this->m_isa = Target::NEON16; + this->m_nativeVectorWidth = 8; + this->m_vectorWidth = 8; + this->m_attributes = "+neon,+fp16"; + this->m_hasHalf = true; // ?? + this->m_maskingIsFree = false; + this->m_maskBitCount = 16; + } + else if (!strcasecmp(isa, "neon-32") || !strcasecmp(isa, "neon")) { + this->m_isa = Target::NEON32; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; this->m_attributes = "+neon,+fp16"; @@ -484,7 +502,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : llvm::Reloc::Default; std::string featuresString = m_attributes; llvm::TargetOptions options; - if (m_isa == Target::NEON) + if (m_isa == Target::NEON8 || m_isa == Target::NEON16 || + m_isa == Target::NEON32) options.FloatABIType = llvm::FloatABI::Hard; #if !defined(LLVM_3_1) if (g->opt.disableFMA == false) @@ -618,8 +637,12 @@ Target::GetTripleString() const { const char * Target::ISAToString(ISA isa) { switch (isa) { - case Target::NEON: - return "neon"; + case Target::NEON8: + return "neon-8"; + case Target::NEON16: + return "neon-16"; + case Target::NEON32: + return "neon-32"; case Target::SSE2: return "sse2"; case Target::SSE4: diff --git a/ispc.h b/ispc.h index 7d10b908..bf6d2642 100644 --- a/ispc.h +++ b/ispc.h @@ -175,7 +175,8 @@ public: flexible/performant of them will apear last in the enumerant. Note also that __best_available_isa() needs to be updated if ISAs are added or the enumerant values are reordered. */ - enum ISA { NEON, SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, NUM_ISAS }; + enum ISA { NEON32, NEON16, NEON8, SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, + NUM_ISAS }; /** Initializes the given Target pointer for a target of the given name, if the name is a known target. 
Returns true if the diff --git a/ispc.vcxproj b/ispc.vcxproj index 96682fe3..e9bf9d97 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -45,8 +45,12 @@ - - + + + + + + @@ -187,37 +191,78 @@ Building gen-bitcode-sse2-x2-64bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-neon.ll | python bitcode2cpp.py builtins\target-neon.ll > gen-bitcode-neon.cpp - gen-bitcode-neon.cpp - builtins\util.m4 - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-neon.ll | python bitcode2cpp.py builtins\target-neon.ll > gen-bitcode-neon.cpp - gen-bitcode-neon.cpp - builtins\util.m4 - Building gen-bitcode-neon.cpp - Building gen-bitcode-neon.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp - $(Configuration)/gen-bitcode-avx1-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx1-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > $(Configuration)/gen-bitcode-avx1-64bit.cpp - $(Configuration)/gen-bitcode-avx1-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx1-64bit.cpp - - + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-8.ll | python bitcode2cpp.py builtins\target-neon-8.ll 32bit > $(Configuration)/gen-bitcode-neon-8-32bit.cpp + $(Configuration)/gen-bitcode-neon-8-32bit.cpp + builtins\util.m4;builtins\target-neon-common.ll + Building gen-bitcode-neon-8-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-8.ll | python bitcode2cpp.py builtins\target-neon-8.ll 64bit > $(Configuration)/gen-bitcode-neon-8-64bit.cpp + $(Configuration)/gen-bitcode-neon-8-64bit.cpp + builtins\util.m4;builtins\target-neon-common.ll + Building gen-bitcode-neon-8-64bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-16.ll | python bitcode2cpp.py builtins\target-neon-16.ll 32bit > $(Configuration)/gen-bitcode-neon-16-32bit.cpp + $(Configuration)/gen-bitcode-neon-16-32bit.cpp + builtins\util.m4;builtins\target-neon-common.ll + Building gen-bitcode-neon-16-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-16.ll | python bitcode2cpp.py builtins\target-neon-16.ll 64bit > $(Configuration)/gen-bitcode-neon-16-64bit.cpp + $(Configuration)/gen-bitcode-neon-16-64bit.cpp + builtins\util.m4;builtins\target-neon-common.ll + Building gen-bitcode-neon-16-64bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-32.ll | python bitcode2cpp.py builtins\target-neon-32.ll 32bit > $(Configuration)/gen-bitcode-neon-32-32bit.cpp + $(Configuration)/gen-bitcode-neon-32-32bit.cpp + builtins\util.m4;builtins\target-neon-common.ll + Building gen-bitcode-neon-32-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-32.ll | python bitcode2cpp.py builtins\target-neon-32.ll 64bit > $(Configuration)/gen-bitcode-neon-32-64bit.cpp + $(Configuration)/gen-bitcode-neon-32-64bit.cpp 
+ builtins\util.m4;builtins\target-neon-common.ll + Building gen-bitcode-neon-32-64bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp + $(Configuration)/gen-bitcode-avx1-32bit.cpp + builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + Building gen-bitcode-avx1-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > $(Configuration)/gen-bitcode-avx1-64bit.cpp + $(Configuration)/gen-bitcode-avx1-64bit.cpp + builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + Building gen-bitcode-avx1-64bit.cpp + + Document diff --git a/module.cpp b/module.cpp index 85bf242c..755a5dc4 100644 --- a/module.cpp +++ b/module.cpp @@ -1877,6 +1877,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre char *p = targetMacro; while (*p) { *p = toupper(*p); + if (*p == '-') *p = '_'; ++p; } opts.addMacroDef(targetMacro); diff --git a/run_tests.py b/run_tests.py index ea819ea4..c9dd8b76 100755 --- a/run_tests.py +++ b/run_tests.py @@ -37,7 +37,7 @@ parser.add_option("-g", "--generics-include", dest="include_file", help="Filenam parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", default="") parser.add_option('-t', '--target', dest='target', - help='Set compilation target (neon, sse2, sse2-x2, sse4, sse4-x2, sse4-8, sse4-16, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', + help='Set compilation target (neon8, neon16, neon32, sse2, sse2-x2, sse4, sse4-x2, sse4-8, sse4-16, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', default="sse4") parser.add_option('-a', '--arch', dest='arch', help='Set architecture (arm, x86, x86-64)', From 48ff03112fd30d12a85eaf7cee3636ee6bfbedb4 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Mon, 29 Jul 2013 16:20:46 -0700 Subject: [PATCH 016/124] Remove __pause from stdlib_core() in utils.m4. It wasn't ever being used, and was breaking compilation on ARM. --- builtins.cpp | 1 - builtins/util.m4 | 5 ----- 2 files changed, 6 deletions(-) diff --git a/builtins.cpp b/builtins.cpp index e671a491..b2896388 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -487,7 +487,6 @@ lSetInternalFunctions(llvm::Module *module) { "__num_cores", "__packed_load_active", "__packed_store_active", - "__pause", "__popcnt_int32", "__popcnt_int64", "__prefetch_read_uniform_1", diff --git a/builtins/util.m4 b/builtins/util.m4 index 1f85e2cc..025030d5 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -1795,11 +1795,6 @@ declare i1 @__is_compile_time_constant_mask( %mask) declare i1 @__is_compile_time_constant_uniform_int32(i32) declare i1 @__is_compile_time_constant_varying_int32() -define void @__pause() nounwind readnone { - call void asm sideeffect "pause", "~{dirflag},~{fpsr},~{flags}"() nounwind - ret void -} - ; This function declares placeholder masked store functions for the ; front-end to use. 
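; (These are declarations only; later optimization passes are expected
; to replace calls to them with the target's actual masked store code.)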
; From 220f0b0b4037f8c9124e6e2f666b053b39d71152 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 30 Jul 2013 19:53:12 -0700 Subject: [PATCH 017/124] Renaming mandelbrot_tasks files to be different from mandelbrot --- examples/mandelbrot_tasks/Makefile | 6 +++--- .../{mandelbrot.cpp => mandelbrot_tasks.cpp} | 2 +- .../{mandelbrot.ispc => mandelbrot_tasks.ispc} | 0 .../mandelbrot_tasks/mandelbrot_tasks.vcxproj | 16 ++++++++-------- ...ot_serial.cpp => mandelbrot_tasks_serial.cpp} | 0 examples/perf.ini | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) rename examples/mandelbrot_tasks/{mandelbrot.cpp => mandelbrot_tasks.cpp} (99%) rename examples/mandelbrot_tasks/{mandelbrot.ispc => mandelbrot_tasks.ispc} (100%) rename examples/mandelbrot_tasks/{mandelbrot_serial.cpp => mandelbrot_tasks_serial.cpp} (100%) diff --git a/examples/mandelbrot_tasks/Makefile b/examples/mandelbrot_tasks/Makefile index 7e83e618..1a565ffd 100644 --- a/examples/mandelbrot_tasks/Makefile +++ b/examples/mandelbrot_tasks/Makefile @@ -1,7 +1,7 @@ -EXAMPLE=mandelbrot -CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp -ISPC_SRC=mandelbrot.ispc +EXAMPLE=mandelbrot_tasks +CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp +ISPC_SRC=mandelbrot_tasks.ispc ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 ISPC_ARM_TARGETS=neon diff --git a/examples/mandelbrot_tasks/mandelbrot.cpp b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp similarity index 99% rename from examples/mandelbrot_tasks/mandelbrot.cpp rename to examples/mandelbrot_tasks/mandelbrot_tasks.cpp index a01cfe43..dae22736 100644 --- a/examples/mandelbrot_tasks/mandelbrot.cpp +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp @@ -42,7 +42,7 @@ #include #include #include "../timing.h" -#include "mandelbrot_ispc.h" +#include "mandelbrot_tasks_ispc.h" using namespace ispc; extern void mandelbrot_serial(float x0, float y0, float x1, float y1, diff --git a/examples/mandelbrot_tasks/mandelbrot.ispc b/examples/mandelbrot_tasks/mandelbrot_tasks.ispc similarity index 100% rename from examples/mandelbrot_tasks/mandelbrot.ispc rename to examples/mandelbrot_tasks/mandelbrot_tasks.ispc diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj index b92de72f..3a8fca79 100644 --- a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj @@ -21,7 +21,7 @@ {E80DA7D4-AB22-4648-A068-327307156BE6} Win32Proj - mandelbrot + mandelbrot_tasks @@ -65,22 +65,22 @@ true $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot + mandelbrot_tasks true $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot + mandelbrot_tasks false $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot + mandelbrot_tasks false $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot + mandelbrot_tasks @@ -153,12 +153,12 @@ - - + + - + Document ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 diff --git a/examples/mandelbrot_tasks/mandelbrot_serial.cpp b/examples/mandelbrot_tasks/mandelbrot_tasks_serial.cpp similarity index 100% rename from examples/mandelbrot_tasks/mandelbrot_serial.cpp rename to examples/mandelbrot_tasks/mandelbrot_tasks_serial.cpp diff --git a/examples/perf.ini b/examples/perf.ini index 3814bf16..d2a5c73e 100755 --- a/examples/perf.ini +++ b/examples/perf.ini @@ -22,7 +22,7 @@ mandelbrot #*** Mandelbrot Set mandelbrot_tasks -mandelbrot +mandelbrot_tasks ^ #*** Perlin Noise Function From d3c567503bf64ec9066c09cb8959c31d4aa1be0e Mon 
Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 31 Jul 2013 06:46:45 -0700 Subject: [PATCH 018/124] Remove support for building with LLVM 3.1 --- builtins.cpp | 2 - builtins/target-avx11-x2.ll | 4 +- builtins/target-avx11.ll | 4 +- builtins/target-avx2-x2.ll | 25 +------- builtins/target-avx2.ll | 25 +------- cbackend.cpp | 115 +++++++++--------------------------- ctx.cpp | 4 +- ctx.h | 11 +--- expr.cpp | 2 +- func.cpp | 10 +--- ispc.cpp | 68 ++++----------------- ispc.h | 18 +----- llvmutil.cpp | 2 +- llvmutil.h | 2 +- main.cpp | 4 +- module.cpp | 47 +++------------ opt.cpp | 22 ++----- stmt.cpp | 2 +- type.cpp | 26 +++----- type.h | 2 +- util.cpp | 9 +-- 21 files changed, 84 insertions(+), 320 deletions(-) diff --git a/builtins.cpp b/builtins.cpp index b2896388..17582d68 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -49,8 +49,6 @@ #include #if defined(LLVM_3_2) #include -#endif -#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #include diff --git a/builtins/target-avx11-x2.ll b/builtins/target-avx11-x2.ll index 1aa6345c..2aee1e1c 100644 --- a/builtins/target-avx11-x2.ll +++ b/builtins/target-avx11-x2.ll @@ -31,9 +31,7 @@ include(`target-avx-x2.ll') -ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', - LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', - `rdrand_definition()') +rdrand_definition() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx11.ll b/builtins/target-avx11.ll index fea0a7c2..44593113 100644 --- a/builtins/target-avx11.ll +++ b/builtins/target-avx11.ll @@ -31,9 +31,7 @@ include(`target-avx.ll') -ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', - LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', - `rdrand_definition()') +rdrand_definition() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx2-x2.ll b/builtins/target-avx2-x2.ll index 053fd078..19f1845d 100644 --- a/builtins/target-avx2-x2.ll +++ b/builtins/target-avx2-x2.ll @@ -29,15 +29,11 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -ifelse(LLVM_VERSION, `LLVM_3_0', `', - LLVM_VERSION, `LLVM_3_1', `', - `define(`HAVE_GATHER', `1')') +define(`HAVE_GATHER', `1') include(`target-avx-x2.ll') -ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', - LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', - `rdrand_definition()') +rdrand_definition() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max @@ -176,21 +172,6 @@ define(`assemble_4s', ` assemble_8s($1, $2, $2_1, $2_2) ') -ifelse(LLVM_VERSION, `LLVM_3_0', ` -gen_gather_factored(i8) -gen_gather_factored(i16) -gen_gather_factored(i32) -gen_gather_factored(float) -gen_gather_factored(i64) -gen_gather_factored(double)', -LLVM_VERSION, `LLVM_3_1', ` -gen_gather_factored(i8) -gen_gather_factored(i16) -gen_gather_factored(i32) -gen_gather_factored(float) -gen_gather_factored(i64) -gen_gather_factored(double)', ` - gen_gather(i8) gen_gather(i16) @@ -557,5 +538,3 @@ define <16 x double> @__gather64_double(<16 x i64> %ptrs, ret <16 x double> %v } - -') diff --git a/builtins/target-avx2.ll b/builtins/target-avx2.ll index f4a0ee07..d3410011 100644 --- a/builtins/target-avx2.ll +++ b/builtins/target-avx2.ll @@ -29,15 +29,11 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-ifelse(LLVM_VERSION, `LLVM_3_0', `', - LLVM_VERSION, `LLVM_3_1', `', - `define(`HAVE_GATHER', `1')') +define(`HAVE_GATHER', `1') include(`target-avx.ll') -ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', - LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', - `rdrand_definition()') +rdrand_definition() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max @@ -123,21 +119,6 @@ define(`extract_4s', ` %$2_2 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> ') -ifelse(LLVM_VERSION, `LLVM_3_0', ` -gen_gather_factored(i8) -gen_gather_factored(i16) -gen_gather_factored(i32) -gen_gather_factored(float) -gen_gather_factored(i64) -gen_gather_factored(double)', -LLVM_VERSION, `LLVM_3_1', ` -gen_gather_factored(i8) -gen_gather_factored(i16) -gen_gather_factored(i32) -gen_gather_factored(float) -gen_gather_factored(i64) -gen_gather_factored(double)', ` - gen_gather(i8) gen_gather(i16) @@ -429,5 +410,3 @@ define <8 x double> @__gather64_double(<8 x i64> %ptrs, ret <8 x double> %v } - -') diff --git a/cbackend.cpp b/cbackend.cpp index d23bcc20..d54f48fb 100644 --- a/cbackend.cpp +++ b/cbackend.cpp @@ -29,7 +29,7 @@ #include "llvmutil.h" -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/CallingConv.h" @@ -38,6 +38,7 @@ #include "llvm/Intrinsics.h" #include "llvm/IntrinsicInst.h" #include "llvm/InlineAsm.h" + #include "llvm/TypeFinder.h" #else #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -47,16 +48,10 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/InlineAsm.h" + #include "llvm/IR/TypeFinder.h" #endif #include "llvm/Pass.h" #include "llvm/PassManager.h" -#if !defined(LLVM_3_1) - #if defined(LLVM_3_2) - #include "llvm/TypeFinder.h" - #else // LLVM_3_3 + - #include "llvm/IR/TypeFinder.h" - #endif -#endif // LLVM_3_2 + #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/STLExtras.h" @@ -76,9 +71,7 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" -#if defined(LLVM_3_1) - #include "llvm/Target/TargetData.h" -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) #include "llvm/DataLayout.h" #else // LLVM 3.3+ #include "llvm/IR/DataLayout.h" @@ -88,7 +81,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/GetElementPtrTypeIterator.h" -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include "llvm/Support/InstVisitor.h" #else // LLVM 3.3+ #include "llvm/InstVisitor.h" @@ -258,14 +251,10 @@ namespace { const llvm::MCRegisterInfo *MRI; const llvm::MCObjectFileInfo *MOFI; llvm::MCContext *TCtx; -#if defined(LLVM_3_1) - const llvm::TargetData* TD; -#else // FIXME: it's ugly to have the name be "TD" here, but it saves us // lots of ifdefs in the below since the new DataLayout and the old // TargetData have generally similar interfaces... 
const llvm::DataLayout* TD; -#endif std::map FPConstantMap; std::map VectorConstantMap; @@ -352,7 +341,7 @@ namespace { bool isSigned = false, const std::string &VariableName = "", bool IgnoreName = false, -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) const llvm::AttrListPtr &PAL = llvm::AttrListPtr() #else const llvm::AttributeSet &PAL = llvm::AttributeSet() @@ -363,7 +352,7 @@ namespace { const std::string &NameSoFar = ""); void printStructReturnPointerFunctionType(llvm::raw_ostream &Out, -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) const llvm::AttrListPtr &PAL, #else const llvm::AttributeSet &PAL, @@ -586,7 +575,7 @@ std::string CWriter::getArrayName(llvm::ArrayType *AT) { /// return type, except, instead of printing the type as void (*)(Struct*, ...) /// print it as "Struct (*)(...)", for struct return functions. void CWriter::printStructReturnPointerFunctionType(llvm::raw_ostream &Out, -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) const llvm::AttrListPtr &PAL, #else const llvm::AttributeSet &PAL, @@ -605,20 +594,16 @@ void CWriter::printStructReturnPointerFunctionType(llvm::raw_ostream &Out, if (PrintedType) FunctionInnards << ", "; llvm::Type *ArgTy = *I; -#if defined(LLVM_3_1) - if (PAL.paramHasAttr(Idx, llvm::Attribute::ByVal)) { -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) { #else - if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { + if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { #endif assert(ArgTy->isPointerTy()); ArgTy = llvm::cast(ArgTy)->getElementType(); } printType(FunctionInnards, ArgTy, -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(Idx, llvm::Attribute::SExt), -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt), @@ -635,9 +620,7 @@ void CWriter::printStructReturnPointerFunctionType(llvm::raw_ostream &Out, } FunctionInnards << ')'; printType(Out, RetTy, -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(0, llvm::Attribute::SExt), -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(0).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(0).hasAttribute(llvm::AttributeSet::ReturnIndex, llvm::Attribute::SExt), @@ -737,7 +720,7 @@ CWriter::printSimpleType(llvm::raw_ostream &Out, llvm::Type *Ty, bool isSigned, llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty, bool isSigned, const std::string &NameSoFar, bool IgnoreName, -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) const llvm::AttrListPtr &PAL #else const llvm::AttributeSet &PAL @@ -759,9 +742,7 @@ llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty, for (llvm::FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); I != E; ++I) { llvm::Type *ArgTy = *I; -#if defined(LLVM_3_1) - if (PAL.paramHasAttr(Idx, llvm::Attribute::ByVal)) { -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) { #else if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { @@ -772,9 +753,7 @@ llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty, if (I != 
FTy->param_begin()) FunctionInnards << ", "; printType(FunctionInnards, ArgTy, -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(Idx, llvm::Attribute::SExt), -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt), @@ -791,9 +770,7 @@ llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty, } FunctionInnards << ')'; printType(Out, FTy->getReturnType(), -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(0, llvm::Attribute::SExt), -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(0).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(0).hasAttribute(llvm::AttributeSet::ReturnIndex, llvm::Attribute::SExt), @@ -1972,11 +1949,7 @@ void CWriter::writeOperandWithCast(llvm::Value* Operand, const llvm::ICmpInst &C // directives to cater to specific compilers as need be. // static void generateCompilerSpecificCode(llvm::formatted_raw_ostream& Out, -#if defined(LLVM_3_1) - const llvm::TargetData *TD) { -#else const llvm::DataLayout *TD) { -#endif // We output GCC specific attributes to preserve 'linkonce'ness on globals. // If we aren't being compiled with GCC, just drop these attributes. Out << "#ifndef __GNUC__ /* Can only support \"linkonce\" vars with GCC */\n" @@ -2169,11 +2142,7 @@ bool CWriter::doInitialization(llvm::Module &M) { // Initialize TheModule = &M; -#if defined(LLVM_3_1) - TD = new llvm::TargetData(&M); -#else TD = new llvm::DataLayout(&M); -#endif IL = new llvm::IntrinsicLowering(*TD); IL->AddPrototypes(M); @@ -2656,15 +2625,11 @@ void CWriter::printModuleTypes() { // Get all of the struct types used in the module. std::vector StructTypes; -#if defined(LLVM_3_1) - TheModule->findUsedStructTypes(StructTypes); -#else llvm::TypeFinder typeFinder; typeFinder.run(*TheModule, false); for (llvm::TypeFinder::iterator iter = typeFinder.begin(); iter != typeFinder.end(); ++iter) StructTypes.push_back(*iter); -#endif // Get all of the array types used in the module std::vector ArrayTypes; @@ -2785,7 +2750,7 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { // Loop over the arguments, printing them... 
llvm::FunctionType *FT = llvm::cast(F->getFunctionType()); -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) const llvm::AttrListPtr &PAL = F->getAttributes(); #else const llvm::AttributeSet &PAL = F->getAttributes(); @@ -2819,20 +2784,16 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { else ArgName = ""; llvm::Type *ArgTy = I->getType(); -#if defined(LLVM_3_1) - if (PAL.paramHasAttr(Idx, llvm::Attribute::ByVal)) { -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) { #else - if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { + if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { #endif ArgTy = llvm::cast(ArgTy)->getElementType(); ByValParams.insert(I); } printType(FunctionInnards, ArgTy, -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(Idx, llvm::Attribute::SExt), -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt), @@ -2858,9 +2819,7 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { for (; I != E; ++I) { if (PrintedArg) FunctionInnards << ", "; llvm::Type *ArgTy = *I; -#if defined(LLVM_3_1) - if (PAL.paramHasAttr(Idx, llvm::Attribute::ByVal)) { -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) { #else if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { @@ -2869,9 +2828,7 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { ArgTy = llvm::cast(ArgTy)->getElementType(); } printType(FunctionInnards, ArgTy, -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(Idx, llvm::Attribute::SExt) -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt) #else PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt) @@ -2908,9 +2865,7 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { // Print out the return type and the signature built above. printType(Out, RetTy, -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(0, llvm::Attribute::SExt), -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(0).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(0).hasAttribute(llvm::AttributeSet::ReturnIndex, llvm::Attribute::SExt), @@ -3712,7 +3667,7 @@ void CWriter::lowerIntrinsics(llvm::Function &F) { const char *BuiltinName = ""; #define GET_GCC_BUILTIN_NAME #define Intrinsic llvm::Intrinsic -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include "llvm/Intrinsics.gen" #else #include "llvm/IR/Intrinsics.gen" @@ -3777,7 +3732,7 @@ void CWriter::visitCallInst(llvm::CallInst &I) { // If this is a call to a struct-return function, assign to the first // parameter instead of passing it to the call. 
-#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) const llvm::AttrListPtr &PAL = I.getAttributes(); #else const llvm::AttributeSet &PAL = I.getAttributes(); @@ -3865,9 +3820,7 @@ void CWriter::visitCallInst(llvm::CallInst &I) { (*AI)->getType() != FTy->getParamType(ArgNo)) { Out << '('; printType(Out, FTy->getParamType(ArgNo), -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(ArgNo+1, llvm::Attribute::SExt) -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(ArgNo+1).hasAttribute(llvm::Attributes::SExt) #else PAL.getParamAttributes(ArgNo+1).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt) @@ -3905,7 +3858,7 @@ bool CWriter::visitBuiltinCall(llvm::CallInst &I, llvm::Intrinsic::ID ID, const char *BuiltinName = ""; #define GET_GCC_BUILTIN_NAME #define Intrinsic llvm::Intrinsic -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include "llvm/Intrinsics.gen" #else #include "llvm/IR/Intrinsics.gen" @@ -4555,13 +4508,8 @@ SmearCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) { smearType, NULL); smearFunc = llvm::dyn_cast(sf); assert(smearFunc != NULL); -#if defined(LLVM_3_1) - smearFunc->setDoesNotThrow(true); - smearFunc->setDoesNotAccessMemory(true); -#else smearFunc->setDoesNotThrow(); smearFunc->setDoesNotAccessMemory(); -#endif } assert(smearFunc != NULL); @@ -4703,13 +4651,8 @@ AndCmpCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) { LLVMTypes::MaskType, NULL); andCmpFunc = llvm::dyn_cast(acf); Assert(andCmpFunc != NULL); -#if defined(LLVM_3_1) - andCmpFunc->setDoesNotThrow(true); - andCmpFunc->setDoesNotAccessMemory(true); -#else andCmpFunc->setDoesNotThrow(); andCmpFunc->setDoesNotAccessMemory(); -#endif } // Set up the function call to the *_and_mask function; the @@ -4914,7 +4857,7 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth, pm.add(new llvm::TargetData(module)); #endif -#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3) +#if defined(LLVM_3_2) || defined(LLVM_3_3) int flags = 0; #else llvm::sys::fs::OpenFlags flags = llvm::sys::fs::F_None; @@ -4939,7 +4882,7 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth, pm.add(llvm::createDeadCodeEliminationPass()); // clean up after smear pass //CO pm.add(llvm::createPrintModulePass(&fos)); pm.add(new CWriter(fos, includeName, vectorWidth)); -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) // This interface is depricated for 3.3+ pm.add(llvm::createGCInfoDeleter()); #endif diff --git a/ctx.cpp b/ctx.cpp index c50d22f9..32ba0ad9 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -46,7 +46,7 @@ #include "sym.h" #include #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include @@ -3312,7 +3312,7 @@ FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType, // alias analysis. // TODO: what other attributes needs to be copied? // TODO: do the same for varing path. 
-#if !defined (LLVM_3_1) && !defined (LLVM_3_2) // LLVM 3.3+ +#if !defined (LLVM_3_2) // LLVM 3.3+ llvm::CallInst *cc = llvm::dyn_cast(ci); if (cc && cc->getCalledFunction() && diff --git a/ctx.h b/ctx.h index 58f9aae3..4b27e6e5 100644 --- a/ctx.h +++ b/ctx.h @@ -40,20 +40,15 @@ #include "ispc.h" #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #else #include #include #endif -#if defined(LLVM_3_1) - #include - #include -#else - #include - #include -#endif +#include +#include struct CFInfo; diff --git a/expr.cpp b/expr.cpp index 856d363c..eb8c0951 100644 --- a/expr.cpp +++ b/expr.cpp @@ -56,7 +56,7 @@ #include #include #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include diff --git a/func.cpp b/func.cpp index b975049b..3097f86d 100644 --- a/func.cpp +++ b/func.cpp @@ -46,7 +46,7 @@ #include "util.h" #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include @@ -310,9 +310,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, // isn't worth the code bloat / overhead. bool checkMask = (type->isTask == true) || ( -#if defined(LLVM_3_1) - (function->hasFnAttr(llvm::Attribute::AlwaysInline) == false) -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) (function->getFnAttributes().hasAttribute(llvm::Attributes::AlwaysInline) == false) #else // LLVM 3.3+ (function->getAttributes().getFnAttributes().hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::AlwaysInline) == false) @@ -453,11 +451,7 @@ Function::GenerateIR() { functionName += std::string("_") + g->target->GetISAString(); llvm::Function *appFunction = llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module); -#if defined(LLVM_3_1) - appFunction->setDoesNotThrow(true); -#else appFunction->setDoesNotThrow(); -#endif g->target->markFuncWithTargetAttr(appFunction); diff --git a/ispc.cpp b/ispc.cpp index de8fba4d..b25527c4 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -48,7 +48,7 @@ #include #include #endif -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include @@ -57,19 +57,12 @@ #include #include #endif -#if defined(LLVM_3_1) - #include - #include -#else - #include - #include -#endif +#include +#include #include #include #include -#if defined(LLVM_3_1) - #include -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #else // LLVM 3.3+ #include @@ -145,27 +138,20 @@ static const char *supportedCPUs[] = { // cortex-a9 and a15. We should be able to handle any of them that also // have NEON support. 
"cortex-a9", "cortex-a15", - "atom", "penryn", "core2", "corei7", "corei7-avx" -#if !defined(LLVM_3_1) - , "core-avx-i", "core-avx2" -#endif // LLVM 3.2+ + "atom", "penryn", "core2", "corei7", "corei7-avx", "core-avx-i", "core-avx2" }; Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : m_target(NULL), m_targetMachine(NULL), -#if defined(LLVM_3_1) - m_targetData(NULL), -#else m_dataLayout(NULL), -#endif m_valid(false), m_isa(SSE2), m_arch(""), m_is32Bit(true), m_cpu(""), m_attributes(""), -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) +#if !defined(LLVM_3_2) m_tf_attributes(NULL), #endif m_nativeVectorWidth(-1), @@ -407,10 +393,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; -#if !defined(LLVM_3_1) - // LLVM 3.2+ only this->m_hasRand = true; -#endif } else if (!strcasecmp(isa, "avx1.1-x2")) { this->m_isa = Target::AVX11; @@ -420,46 +403,29 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; -#if !defined(LLVM_3_1) - // LLVM 3.2+ only this->m_hasRand = true; -#endif } else if (!strcasecmp(isa, "avx2")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; - this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand" -#ifndef LLVM_3_1 - ",+fma" -#endif // !LLVM_3_1 - ; + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand,+fma"; this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; -#if !defined(LLVM_3_1) - // LLVM 3.2+ only this->m_hasRand = true; this->m_hasGather = true; -#endif } else if (!strcasecmp(isa, "avx2-x2")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; - this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand" -#ifndef LLVM_3_1 - ",+fma" -#endif // !LLVM_3_1 - ; + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand,+fma"; this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; -#if !defined(LLVM_3_1) - // LLVM 3.2+ only this->m_hasRand = true; this->m_hasGather = true; -#endif } else if (!strcasecmp(isa, "neon-8")) { this->m_isa = Target::NEON8; @@ -505,10 +471,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : if (m_isa == Target::NEON8 || m_isa == Target::NEON16 || m_isa == Target::NEON32) options.FloatABIType = llvm::FloatABI::Hard; -#if !defined(LLVM_3_1) if (g->opt.disableFMA == false) options.AllowFPOpFusion = llvm::FPOpFusion::Fast; -#endif // !LLVM_3_1 #ifdef ISPC_IS_WINDOWS if (strcmp("x86", arch) == 0) { @@ -526,12 +490,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // Initialize TargetData/DataLayout in 3 steps. // 1. Get default data layout first - std::string dl_string; -#if defined(LLVM_3_1) - dl_string = m_targetMachine->getTargetData()->getStringRepresentation(); -#else - dl_string = m_targetMachine->getDataLayout()->getStringRepresentation(); -#endif + std::string dl_string = + m_targetMachine->getDataLayout()->getStringRepresentation(); // 2. Adjust for generic if (m_isa == Target::GENERIC) { @@ -546,11 +506,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } // 3. 
Finally set member data -#if defined(LLVM_3_1) - m_targetData = new llvm::TargetData(dl_string); -#else m_dataLayout = new llvm::DataLayout(dl_string); -#endif // Set is32Bit // This indicates if we are compiling for 32 bit platform @@ -558,7 +514,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // FIXME: all generic targets are handled as 64 bit, which is incorrect. this->m_is32Bit = (getDataLayout()->getPointerSize() == 4); -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) +#if !defined(LLVM_3_2) // This is LLVM 3.3+ feature. // Initialize target-specific "target-feature" attribute. if (!m_attributes.empty()) { @@ -772,7 +728,7 @@ Target::StructOffset(llvm::Type *type, int element, } void Target::markFuncWithTargetAttr(llvm::Function* func) { -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) +#if !defined(LLVM_3_2) if (m_tf_attributes) { func->addAttributes(llvm::AttributeSet::FunctionIndex, *m_tf_attributes); } diff --git a/ispc.h b/ispc.h index bf6d2642..d68f9034 100644 --- a/ispc.h +++ b/ispc.h @@ -40,8 +40,8 @@ #define ISPC_VERSION "1.4.5dev" -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) -#error "Only LLVM 3.1, 3.2, 3.3 and the 3.4 development branch are supported" +#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) +#error "Only LLVM 3.2, 3.3 and the 3.4 development branch are supported" #endif #if defined(_WIN32) || defined(_WIN64) @@ -72,11 +72,7 @@ namespace llvm { class BasicBlock; class Constant; class ConstantValue; -#if defined(LLVM_3_1) - class TargetData; -#else class DataLayout; -#endif class DIBuilder; class DIDescriptor; class DIFile; @@ -226,11 +222,7 @@ public: // Note the same name of method for 3.1 and 3.2+, this allows // to reduce number ifdefs on client side. -#if defined(LLVM_3_1) - llvm::TargetData *getDataLayout() const {return m_targetData;} -#else llvm::DataLayout *getDataLayout() const {return m_dataLayout;} -#endif /** Reports if Target object has valid state. */ bool isValid() const {return m_valid;} @@ -278,11 +270,7 @@ private: */ llvm::TargetMachine *m_targetMachine; -#if defined(LLVM_3_1) - llvm::TargetData *m_targetData; -#else llvm::DataLayout *m_dataLayout; -#endif /** flag to report invalid state after construction (due to bad parameters passed to constructor). */ @@ -303,7 +291,7 @@ private: /** Target-specific attribute string to pass along to the LLVM backend */ std::string m_attributes; -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) +#if !defined(LLVM_3_2) /** Target-specific LLVM attribute, which has to be attached to every function to ensure that it is generated for correct target architecture. 
This is requirement was introduced in LLVM 3.3 */ diff --git a/llvmutil.cpp b/llvmutil.cpp index 180c8676..2f54a2fe 100644 --- a/llvmutil.cpp +++ b/llvmutil.cpp @@ -38,7 +38,7 @@ #include "llvmutil.h" #include "ispc.h" #include "type.h" -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #else diff --git a/llvmutil.h b/llvmutil.h index d6c5ede0..d1803f32 100644 --- a/llvmutil.h +++ b/llvmutil.h @@ -38,7 +38,7 @@ #ifndef ISPC_LLVMUTIL_H #define ISPC_LLVMUTIL_H 1 -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include diff --git a/main.cpp b/main.cpp index de2bb620..4c4b4575 100644 --- a/main.cpp +++ b/main.cpp @@ -62,9 +62,7 @@ static void lPrintVersion() { printf("Intel(r) SPMD Program Compiler (ispc), %s (build %s @ %s, LLVM %s)\n", ISPC_VERSION, BUILD_VERSION, BUILD_DATE, -#if defined(LLVM_3_1) - "3.1" -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) "3.2" #elif defined(LLVM_3_3) "3.3" diff --git a/module.cpp b/module.cpp index 755a5dc4..eba5eb3b 100644 --- a/module.cpp +++ b/module.cpp @@ -64,7 +64,7 @@ #define strcasecmp stricmp #endif -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include @@ -86,9 +86,7 @@ #include #include #include -#if defined(LLVM_3_1) - #include -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #else // LLVM 3.3+ @@ -202,7 +200,7 @@ lStripUnusedDebugInfo(llvm::Module *module) { // stuff and remove it later on. Removing it is useful, as it // reduces size of the binary significantly (manyfold for small // programs). -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) llvm::MDNode *nodeSPMD = llvm::dyn_cast(cuNode->getOperand(12)); Assert(nodeSPMD != NULL); @@ -797,11 +795,7 @@ Module::AddFunctionDeclaration(const std::string &name, #endif if (functionType->isTask) // This also applies transitively to members I think? -#if defined(LLVM_3_1) - function->setDoesNotAlias(1, true); -#else // LLVM 3.2+ function->setDoesNotAlias(1); -#endif g->target->markFuncWithTargetAttr(function); @@ -850,12 +844,7 @@ Module::AddFunctionDeclaration(const std::string &name, // NOTE: LLVM indexes function parameters starting from 1. // This is unintuitive. -#if defined(LLVM_3_1) - function->setDoesNotAlias(i+1, true); -#else function->setDoesNotAlias(i+1); -#endif - #if 0 int align = 4 * RoundUpPow2(g->target->nativeVectorWidth); function->addAttribute(i+1, llvm::Attribute::constructAlignmentFromInt(align)); @@ -1067,7 +1056,7 @@ Module::writeObjectFileOrAssembly(llvm::TargetMachine *targetMachine, llvm::TargetMachine::CodeGenFileType fileType = (outputType == Object) ? llvm::TargetMachine::CGFT_ObjectFile : llvm::TargetMachine::CGFT_AssemblyFile; bool binary = (fileType == llvm::TargetMachine::CGFT_ObjectFile); -#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3) +#if defined(LLVM_3_2) || defined(LLVM_3_3) unsigned int flags = binary ? llvm::raw_fd_ostream::F_Binary : 0; #else llvm::sys::fs::OpenFlags flags = binary ? 
llvm::sys::fs::F_Binary : @@ -1082,11 +1071,7 @@ Module::writeObjectFileOrAssembly(llvm::TargetMachine *targetMachine, } llvm::PassManager pm; -#if defined(LLVM_3_1) - pm.add(new llvm::TargetData(*g->target->getDataLayout())); -#else pm.add(new llvm::DataLayout(*g->target->getDataLayout())); -#endif llvm::formatted_raw_ostream fos(of->os()); @@ -1800,22 +1785,12 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre llvm::raw_fd_ostream stderrRaw(2, false); -#if defined(LLVM_3_1) - clang::TextDiagnosticPrinter *diagPrinter = - new clang::TextDiagnosticPrinter(stderrRaw, clang::DiagnosticOptions()); -#else clang::DiagnosticOptions *diagOptions = new clang::DiagnosticOptions(); clang::TextDiagnosticPrinter *diagPrinter = new clang::TextDiagnosticPrinter(stderrRaw, diagOptions); -#endif llvm::IntrusiveRefCntPtr diagIDs(new clang::DiagnosticIDs); -#if defined(LLVM_3_1) - clang::DiagnosticsEngine *diagEngine = - new clang::DiagnosticsEngine(diagIDs, diagPrinter); -#else clang::DiagnosticsEngine *diagEngine = new clang::DiagnosticsEngine(diagIDs, diagOptions, diagPrinter); -#endif inst.setDiagnostics(diagEngine); clang::TargetOptions &options = inst.getTargetOpts(); @@ -1825,7 +1800,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre } options.Triple = triple.getTriple(); -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) clang::TargetInfo *target = clang::TargetInfo::CreateTargetInfo(inst.getDiagnostics(), options); #else // LLVM 3.3+ @@ -1835,18 +1810,14 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre inst.setTarget(target); inst.createSourceManager(inst.getFileManager()); -#if defined(LLVM_3_1) - inst.InitializeSourceManager(infilename); -#else clang::FrontendInputFile inputFile(infilename, clang::IK_None); inst.InitializeSourceManager(inputFile); -#endif // Don't remove comments in the preprocessor, so that we can accurately // track the source file position by handling them ourselves. 
inst.getPreprocessorOutputOpts().ShowComments = 1; -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) // LLVM 3.3+ +#if !defined(LLVM_3_2) // LLVM 3.3+ inst.getPreprocessorOutputOpts().ShowCPP = 1; #endif @@ -1858,7 +1829,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre headerOpts.Verbose = 1; for (int i = 0; i < (int)g->includePath.size(); ++i) { headerOpts.AddPath(g->includePath[i], clang::frontend::Angled, -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) true /* is user supplied */, #endif false /* not a framework */, @@ -1913,11 +1884,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre } } -#if defined(LLVM_3_1) - inst.getLangOpts().BCPLComment = 1; -#else inst.getLangOpts().LineComment = 1; -#endif inst.createPreprocessor(); diagPrinter->BeginSourceFile(inst.getLangOpts(), &inst.getPreprocessor()); diff --git a/opt.cpp b/opt.cpp index 8efdbc67..8c86368e 100644 --- a/opt.cpp +++ b/opt.cpp @@ -48,7 +48,7 @@ #include #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include @@ -73,9 +73,7 @@ #include #include #include -#if defined(LLVM_3_1) - #include -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #else // LLVM 3.3+ #include @@ -85,11 +83,7 @@ #include #include #include -#if defined(LLVM_3_1) - #include -#else - #include -#endif +#include #include #ifdef ISPC_IS_LINUX #include @@ -415,18 +409,14 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(targetLibraryInfo); -#if defined(LLVM_3_1) - optPM.add(new llvm::TargetData(*g->target->getDataLayout())); -#else optPM.add(new llvm::DataLayout(*g->target->getDataLayout())); llvm::TargetMachine *targetMachine = g->target->GetTargetMachine(); - #ifdef LLVM_3_2 +#ifdef LLVM_3_2 optPM.add(new llvm::TargetTransformInfo(targetMachine->getScalarTargetTransformInfo(), targetMachine->getVectorTargetTransformInfo())); - #else // LLVM 3.3+ +#else // LLVM 3.3+ targetMachine->addAnalysisPasses(optPM); - #endif #endif optPM.add(llvm::createIndVarSimplifyPass()); @@ -505,7 +495,7 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createArgumentPromotionPass()); -#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3) +#if defined(LLVM_3_2) || defined(LLVM_3_3) // Starting from 3.4 this functionality was moved to // InstructionCombiningPass. See r184459 for details. 
optPM.add(llvm::createSimplifyLibCallsPass()); diff --git a/stmt.cpp b/stmt.cpp index 4ec63d35..412b0dd9 100644 --- a/stmt.cpp +++ b/stmt.cpp @@ -48,7 +48,7 @@ #include #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include diff --git a/type.cpp b/type.cpp index 5fa1845b..11a165f5 100644 --- a/type.cpp +++ b/type.cpp @@ -43,20 +43,15 @@ #include #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #else #include #include #endif -#if defined(LLVM_3_1) - #include - #include -#else - #include - #include -#endif +#include +#include #include @@ -819,11 +814,8 @@ EnumType::GetDIType(llvm::DIDescriptor scope) const { m->diBuilder->createEnumerationType(scope, name, diFile, pos.first_line, 32 /* size in bits */, 32 /* align in bits */, - elementArray -#if !defined(LLVM_3_1) - , llvm::DIType() -#endif - ); + elementArray, + llvm::DIType()); switch (variability.type) { @@ -2139,7 +2131,7 @@ StructType::GetDIType(llvm::DIDescriptor scope) const { currentSize, // Size in bits align, // Alignment in bits 0, // Flags -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) +#if !defined(LLVM_3_2) llvm::DIType(), // DerivedFrom #endif elements); @@ -2382,7 +2374,7 @@ UndefinedStructType::GetDIType(llvm::DIDescriptor scope) const { 0, // Size 0, // Align 0, // Flags -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) +#if !defined(LLVM_3_2) llvm::DIType(), // DerivedFrom #endif elements); @@ -2645,12 +2637,8 @@ ReferenceType::GetDIType(llvm::DIDescriptor scope) const { } llvm::DIType diTargetType = targetType->GetDIType(scope); -#if defined(LLVM_3_1) - return m->diBuilder->createReferenceType(diTargetType); -#else return m->diBuilder->createReferenceType(llvm::dwarf::DW_TAG_reference_type, diTargetType); -#endif } diff --git a/type.h b/type.h index 880f8574..a6a52e10 100644 --- a/type.h +++ b/type.h @@ -40,7 +40,7 @@ #include "ispc.h" #include "util.h" -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #else diff --git a/util.cpp b/util.cpp index dbea9517..4be863bf 100644 --- a/util.cpp +++ b/util.cpp @@ -65,9 +65,7 @@ #include #include -#if defined(LLVM_3_1) - #include -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #else // LLVM 3.3+ #include @@ -616,13 +614,8 @@ VerifyDataLayoutCompatibility(const std::string &module_dl, // which contradic: f80:128:128 followed by f80:32:32. This is a bug, but // correct thing to do is to interpret this exactly how LLVM would treat it, // so we create a DataLayout class and take its string representation. -#if defined(LLVM_3_1) - llvm::TargetData d1(module_dl); - llvm::TargetData d2(lib_dl); -#else // LLVM 3.2+ llvm::DataLayout d1(module_dl); llvm::DataLayout d2(lib_dl); -#endif std::string module_dl_canonic = d1.getStringRepresentation(); std::string lib_dl_canonic = d2.getStringRepresentation(); From d9c38b5c1f6c1ccb4920465789b9e3d451e302a8 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 18 Jul 2013 09:24:23 -0700 Subject: [PATCH 019/124] Remove support for using SVML for math lib routines. This path was poorly maintained and wasn't actually available on most targets. 
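(Aside, not part of any patch in this series: the util.cpp hunk just above keeps comparing datalayout strings only after round-tripping them through llvm::DataLayout, so that redundant or contradictory entries are resolved exactly the way LLVM itself would resolve them. A minimal C++ sketch of that idea follows, assuming the LLVM 3.2/3.3-era API the patched code uses; the helper name is invented for illustration.)

    // Compare two datalayout strings by their canonical llvm::DataLayout form
    // rather than textually.  The header lives at <llvm/DataLayout.h> on
    // LLVM 3.2 and <llvm/IR/DataLayout.h> on LLVM 3.3+.
    #include <string>
    #include <llvm/IR/DataLayout.h>

    static bool lDataLayoutsEquivalent(const std::string &module_dl,
                                       const std::string &lib_dl) {
        llvm::DataLayout d1(module_dl);
        llvm::DataLayout d2(lib_dl);
        // Compare the canonicalized string forms, not the raw inputs.
        return d1.getStringRepresentation() == d2.getStringRepresentation();
    }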
--- builtins.cpp | 11 ---- builtins/target-avx-x2.ll | 17 ------ builtins/target-avx.ll | 17 ------ builtins/target-generic-1.ll | 98 ------------------------------- builtins/target-generic-common.ll | 16 ----- builtins/target-neon-common.ll | 13 ---- builtins/target-sse2-x2.ll | 86 --------------------------- builtins/target-sse2.ll | 60 ------------------- builtins/target-sse4-16.ll | 15 ----- builtins/target-sse4-8.ll | 15 ----- builtins/target-sse4-x2.ll | 86 --------------------------- builtins/target-sse4.ll | 60 ------------------- docs/ispc.rst | 3 - ispc.h | 2 +- main.cpp | 3 - stdlib.ispc | 72 ++++++----------------- 16 files changed, 18 insertions(+), 556 deletions(-) diff --git a/builtins.cpp b/builtins.cpp index 17582d68..d75db43e 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -579,15 +579,6 @@ lSetInternalFunctions(llvm::Module *module) { "__stdlib_sinf", "__stdlib_tan", "__stdlib_tanf", - "__svml_sin", - "__svml_cos", - "__svml_sincos", - "__svml_tan", - "__svml_atan", - "__svml_atan2", - "__svml_exp", - "__svml_log", - "__svml_pow", "__undef_uniform", "__undef_varying", "__vec4_add_float", @@ -1054,8 +1045,6 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod symbolTable); lDefineConstantInt("__math_lib_ispc_fast", (int)Globals::Math_ISPCFast, module, symbolTable); - lDefineConstantInt("__math_lib_svml", (int)Globals::Math_SVML, module, - symbolTable); lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module, symbolTable); lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index d9e0322b..8fb2e427 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -134,23 +134,6 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly always ret <16 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; svml - -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones 4x with our 16-wide -; vectors... - -declare <16 x float> @__svml_sin(<16 x float>) -declare <16 x float> @__svml_cos(<16 x float>) -declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) -declare <16 x float> @__svml_tan(<16 x float>) -declare <16 x float> @__svml_atan(<16 x float>) -declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) -declare <16 x float> @__svml_exp(<16 x float>) -declare <16 x float> @__svml_log(<16 x float>) -declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index 90e2f3ac..adaed9ba 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -134,23 +134,6 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; svml - -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones twice with our 8-wide -; vectors... 
- -declare <8 x float> @__svml_sin(<8 x float>) -declare <8 x float> @__svml_cos(<8 x float>) -declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) -declare <8 x float> @__svml_tan(<8 x float>) -declare <8 x float> @__svml_atan(<8 x float>) -declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) -declare <8 x float> @__svml_exp(<8 x float>) -declare <8 x float> @__svml_log(<8 x float>) -declare <8 x float> @__svml_pow(<8 x float>, <8 x float>) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 3dec76b0..238de444 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -647,104 +647,6 @@ define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alw } - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0) - ;ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm.sin.f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - unary1to1(float,@llvm.sin.f32) - -} - -define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0) - ;ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm.cos.f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - unary1to1(float, @llvm.cos.f32) - -} - -define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline { -; %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0) -; store <1 x float> %s, <1 x float> * %1 -; ret void - %sin = call <1 x float> @__svml_sin (<1 x float> %0) - %cos = call <1 x float> @__svml_cos (<1 x float> %0) - store <1 x float> %sin, <1 x float> * %1 - store <1 x float> %cos, <1 x float> * %2 - ret void -} - -define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0) - ;ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm_tan_f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - ;unasry1to1(float, @llvm.tan.f32) - ; UNSUPPORTED! - ret <1 x float > %0 -} - -define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline { -; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0) -; ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm_atan_f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - ;unsary1to1(float,@llvm.atan.f32) - ;UNSUPPORTED! - ret <1 x float > %0 - -} - -define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1) - ;ret <1 x float> %ret - ;%y = extractelement <1 x float> %0, i32 0 - ;%x = extractelement <1 x float> %1, i32 0 - ;%q = fdiv float %y, %x - ;%a = call float @llvm.atan.f32 (float %q) - ;%rv = insertelement <1 x float> undef, float %a, i32 0 - ;ret <1 x float> %rv - ; UNSUPPORTED! 
- ret <1 x float > %0 -} - -define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_expf4(<1 x float> %0) - ;ret <1 x float> %ret - unary1to1(float, @llvm.exp.f32) -} - -define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_logf4(<1 x float> %0) - ;ret <1 x float> %ret - unary1to1(float, @llvm.log.f32) -} - -define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1) - ;ret <1 x float> %ret - %r = extractelement <1 x float> %0, i32 0 - %e = extractelement <1 x float> %1, i32 0 - %s = call float @llvm.pow.f32(float %r,float %e) - %rv = insertelement <1 x float> undef, float %s, i32 0 - ret <1 x float> %rv - -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 76d1faf3..b581e0a7 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -202,22 +202,6 @@ declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone declare i32 @__count_leading_zeros_i32(i32) nounwind readnone declare i64 @__count_leading_zeros_i64(i64) nounwind readnone -;; svml - -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones twice with our 8-wide -; vectors... - -declare @__svml_sin() -declare @__svml_cos() -declare void @__svml_sincos(, *, *) -declare @__svml_tan() -declare @__svml_atan() -declare @__svml_atan2(, ) -declare @__svml_exp() -declare @__svml_log() -declare @__svml_pow(, ) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions diff --git a/builtins/target-neon-common.ll b/builtins/target-neon-common.ll index 696b0748..f892a0a1 100644 --- a/builtins/target-neon-common.ll +++ b/builtins/target-neon-common.ll @@ -313,19 +313,6 @@ define void @__masked_store_blend_i64(* nocapture %ptr, ret void } -;; yuck. We need declarations of these, even though we shouldnt ever -;; actually generate calls to them for the NEON target... 
- -declare @__svml_sin() -declare @__svml_cos() -declare void @__svml_sincos(, *, *) -declare @__svml_tan() -declare @__svml_atan() -declare @__svml_atan2(, ) -declare @__svml_exp() -declare @__svml_log() -declare @__svml_pow(, ) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index da22a66c..057ea98f 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -102,92 +102,6 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_sinf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_cosf4, %0) - ret <8 x float> %ret -} - -define void @__svml_sincos(<8 x float>, <8 x float> *, - <8 x float> *) nounwind readnone alwaysinline { - ; call svml_sincosf4 two times with the two 4-wide sub-vectors - %a = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - %b = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - - %cospa = alloca <4 x float> - %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) - - %cospb = alloca <4 x float> - %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) - - %sin = shufflevector <4 x float> %sa, <4 x float> %sb, - <8 x i32> - store <8 x float> %sin, <8 x float> * %1 - - %cosa = load <4 x float> * %cospa - %cosb = load <4 x float> * %cospb - %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, - <8 x i32> - store <8 x float> %cos, <8 x float> * %2 - - ret void -} - -define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_tanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_atanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan2(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_atan2f4, %0, %1) - ret <8 x float> %ret -} - -define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_expf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_logf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_pow(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_powf4, %0, %1) - ret <8 x float> %ret -} - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git 
a/builtins/target-sse2.ll b/builtins/target-sse2.ll index a6b206b6..e0a5c3d5 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -493,66 +493,6 @@ define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysin ret <4 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) - ret <4 x float> %ret -} - -define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { - %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) - store <4 x float> %s, <4 x float> * %1 - ret void -} - -define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - -define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_expf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_logf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index d1563988..50f0848d 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -205,21 +205,6 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r ret <8 x double> %ret } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; svml - -; FIXME - -declare <8 x float> @__svml_sin(<8 x float>) -declare <8 x float> @__svml_cos(<8 x float>) -declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) -declare <8 x float> @__svml_tan(<8 x float>) -declare <8 x float> @__svml_atan(<8 x float>) -declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) -declare <8 x float> @__svml_exp(<8 x float>) -declare <8 x float> @__svml_log(<8 x float>) -declare <8 x float> @__svml_pow(<8 x float>, <8 x 
float>) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index 85b7bbe7..7fa9075b 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -217,21 +217,6 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin ret <16 x double> %ret } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; svml - -; FIXME - -declare <16 x float> @__svml_sin(<16 x float>) -declare <16 x float> @__svml_cos(<16 x float>) -declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) -declare <16 x float> @__svml_tan(<16 x float>) -declare <16 x float> @__svml_atan(<16 x float>) -declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) -declare <16 x float> @__svml_exp(<16 x float>) -declare <16 x float> @__svml_log(<16 x float>) -declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index e2debbc2..4a447ba6 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -102,92 +102,6 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_sinf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_cosf4, %0) - ret <8 x float> %ret -} - -define void @__svml_sincos(<8 x float>, <8 x float> *, - <8 x float> *) nounwind readnone alwaysinline { - ; call svml_sincosf4 two times with the two 4-wide sub-vectors - %a = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - %b = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - - %cospa = alloca <4 x float> - %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) - - %cospb = alloca <4 x float> - %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) - - %sin = shufflevector <4 x float> %sa, <4 x float> %sb, - <8 x i32> - store <8 x float> %sin, <8 x float> * %1 - - %cosa = load <4 x float> * %cospa - %cosb = load <4 x float> * %cospb - %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, - <8 x i32> - store <8 x float> %cos, <8 x float> * %2 - - ret void -} - -define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_tanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_atanf4, 
%0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan2(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_atan2f4, %0, %1) - ret <8 x float> %ret -} - -define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_expf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_logf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_pow(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_powf4, %0, %1) - ret <8 x float> %ret -} - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 98a7ef69..7f9a9185 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -206,66 +206,6 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r ret <4 x double> %ret } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) - ret <4 x float> %ret -} - -define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { - %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) - store <4 x float> %s, <4 x float> * %1 - ret void -} - -define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - -define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_expf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_logf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/docs/ispc.rst b/docs/ispc.rst index 
39d3a5c8..af59714a 100755 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -3278,9 +3278,6 @@ for this argument. approximately 1.45e-6 over the range -10pi to 10pi.) * ``fast``: more efficient but lower accuracy versions of the default ``ispc`` implementations. -* ``svml``: use Intel "Short Vector Math Library". Use - ``icc`` to link your final executable so that the appropriate libraries - are linked. * ``system``: use the system's math library. On many systems, these functions are more accurate than both of ``ispc``'s implementations. Using these functions may be quite
diff --git a/ispc.h b/ispc.h index d68f9034..8653553e 100644 --- a/ispc.h +++ b/ispc.h @@ -468,7 +468,7 @@ struct Globals { /** There are a number of math libraries that can be used for transcendentals and the like during program compilation. */ - enum MathLib { Math_ISPC, Math_ISPCFast, Math_SVML, Math_System }; + enum MathLib { Math_ISPC, Math_ISPCFast, Math_System }; MathLib mathLib; /** Records whether the ispc standard library should be made available
diff --git a/main.cpp b/main.cpp index 4c4b4575..c21e7f88 100644 --- a/main.cpp +++ b/main.cpp @@ -107,7 +107,6 @@ usage(int ret) { printf(" [--math-lib= [rest of this hunk garbled in extraction; also garbled here are the header of a following patch and a Visual Studio project hunk, whose recoverable content is a custom build rule running "m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-neon.ll | python bitcode2cpp.py builtins\target-neon.ll > gen-bitcode-neon.cpp" (output gen-bitcode-neon.cpp, additional dependency builtins\util.m4, build message "Building gen-bitcode-neon.cpp")]
diff --git a/main.cpp b/main.cpp index b107075c..c6786c39 100644 --- a/main.cpp +++ b/main.cpp @@ -300,6 +300,8 @@ int main(int Argc, char *Argv[]) { LLVMInitializeX86Disassembler(); LLVMInitializeX86TargetMC(); #endif // !__ARM__ + +#ifdef ISPC_ARM_ENABLED // Generating ARM from x86 is more likely to be useful, though. LLVMInitializeARMTargetInfo(); LLVMInitializeARMTarget(); @@ -307,6 +309,7 @@ LLVMInitializeARMAsmParser(); LLVMInitializeARMDisassembler(); LLVMInitializeARMTargetMC(); +#endif char *file = NULL; const char *headerFileName = NULL;
From 5b20b06bd9c75d84e78749b752716d6f2088b8d1 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Sat, 3 Aug 2013 20:44:25 -0700 Subject: [PATCH 024/124] Add avg_{up,down}_int{8,16} routines to stdlib These compute the average of two given values, rounding up and down, respectively, if the result isn't exact. When possible, these are mapped to target-specific intrinsics (PAVG[BW] on IA and VH[R]ADD[US] on NEON.) (A subsequent commit will add pattern-matching to generate calls to these intrinsics when the corresponding patterns are detected in the IR.)
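(For reference only, not part of the patch: the rounding behavior described in the commit message is easiest to see in scalar form. The C++ sketch below, with invented helper names, mirrors what the unsigned 8-bit variants compute: widen to 16 bits so the intermediate sum cannot overflow, add, shift, and truncate, which is the same shape as the m4-generated fallback IR added to util.m4 further down.)

    #include <cstdint>

    // avg_up: (a + b + 1) / 2 -- halfway cases round up.
    static inline uint8_t refAvgUpUint8(uint8_t a, uint8_t b) {
        return (uint8_t)((uint16_t(a) + uint16_t(b) + 1) >> 1);
    }

    // avg_down: (a + b) / 2 -- halfway cases round down.
    static inline uint8_t refAvgDownUint8(uint8_t a, uint8_t b) {
        return (uint8_t)((uint16_t(a) + uint16_t(b)) >> 1);
    }

    // e.g. refAvgUpUint8(3, 4) == 4 and refAvgDownUint8(3, 4) == 3;
    // when a + b is even, the two agree.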
--- builtins/target-avx-common.ll | 6 ++ builtins/target-generic-1.ll | 6 ++ builtins/target-generic-common.ll | 5 ++ builtins/target-neon-16.ll | 59 ++++++++++++++ builtins/target-neon-32.ll | 59 ++++++++++++++ builtins/target-neon-8.ll | 75 +++++++++++++++++ builtins/target-sse2-common.ll | 4 + builtins/target-sse4-16.ll | 31 ++++++++ builtins/target-sse4-8.ll | 25 ++++++ builtins/target-sse4-x2.ll | 6 ++ builtins/target-sse4.ll | 6 ++ builtins/util.m4 | 128 ++++++++++++++++++++++++++++-- docs/ispc.rst | 25 ++++++ opt.cpp | 8 ++ stdlib.ispc | 60 +++++++++++--- tests/avg-down-int16.ispc | 13 +++ tests/avg-down-int8.ispc | 13 +++ tests/avg-down-uint16.ispc | 13 +++ tests/avg-down-uint8.ispc | 13 +++ tests/avg-up-int16.ispc | 13 +++ tests/avg-up-int8.ispc | 13 +++ tests/avg-up-uint16.ispc | 13 +++ tests/avg-up-uint8.ispc | 13 +++ 23 files changed, 592 insertions(+), 15 deletions(-) create mode 100644 tests/avg-down-int16.ispc create mode 100644 tests/avg-down-int8.ispc create mode 100644 tests/avg-down-uint16.ispc create mode 100644 tests/avg-down-uint8.ispc create mode 100644 tests/avg-up-int16.ispc create mode 100644 tests/avg-up-int8.ispc create mode 100644 tests/avg-up-uint16.ispc create mode 100644 tests/avg-up-uint8.ispc diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll index dcbe0a66..1d317713 100644 --- a/builtins/target-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -277,3 +277,9 @@ define double @__max_uniform_double(double, double) nounwind readnone alwaysinli sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1) ret double %ret } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 238de444..3472c207 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -864,3 +864,9 @@ declare float @__half_to_float_uniform(i16 %v) nounwind readnone declare @__half_to_float_varying( %v) nounwind readnone declare i16 @__float_to_half_uniform(float %v) nounwind readnone declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index b581e0a7..c683ff45 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -364,3 +364,8 @@ declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-neon-16.ll b/builtins/target-neon-16.ll index fd15eb0b..a0575927 100644 --- a/builtins/target-neon-16.ll +++ b/builtins/target-neon-16.ll @@ -456,3 +456,62 @@ define i64 @__reduce_min_uint64() nounwind readnone { define i64 @__reduce_max_uint64() nounwind readnone { reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 + +declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i8> 
@llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_up_int8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_down_uint8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_down_int8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_up_int16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_down_uint16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_down_int16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} diff --git a/builtins/target-neon-32.ll b/builtins/target-neon-32.ll index 1f8003d7..30b062c9 100644 --- a/builtins/target-neon-32.ll +++ b/builtins/target-neon-32.ll @@ -426,3 +426,62 @@ define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone { define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone { reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64) } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 + +declare <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone + +define <4 x i8> @__avg_up_uint8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r +} + +declare <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8>, <4 x i8>) nounwind readnone + +define <4 x i8> @__avg_up_int8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r +} + +declare <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone + +define <4 x i8> @__avg_down_uint8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r +} + +declare <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8>, <4 x i8>) nounwind readnone + +define <4 x i8> @__avg_down_int8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r +} + +declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone + +define <4 x i16> @__avg_up_uint16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> 
%0, <4 x i16> %1) + ret <4 x i16> %r +} + +declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone + +define <4 x i16> @__avg_up_int16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} + +declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone + +define <4 x i16> @__avg_down_uint16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} + +declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone + +define <4 x i16> @__avg_down_int16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} diff --git a/builtins/target-neon-8.ll b/builtins/target-neon-8.ll index eb65f224..2accfe53 100644 --- a/builtins/target-neon-8.ll +++ b/builtins/target-neon-8.ll @@ -506,3 +506,78 @@ define i64 @__reduce_min_uint64() nounwind readnone { define i64 @__reduce_max_uint64() nounwind readnone { reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64) } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_up_int8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_down_uint8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_down_int8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_up_int16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_down_uint16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 
x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_down_int16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index c6a3afe2..ad1d88bc 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -269,4 +269,8 @@ define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline { ret i64 %val } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index 50f0848d..b4772552 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -449,3 +449,34 @@ gen_scatter(i32) gen_scatter(float) gen_scatter(i64) gen_scatter(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret <8 x i8> %r +} + +declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) { + %r = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +define_avg_up_int8() +define_avg_up_int16() +define_down_avgs() diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index 7fa9075b..a75d8e3a 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -456,3 +456,28 @@ gen_scatter(i32) gen_scatter(float) gen_scatter(i64) gen_scatter(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +define_avg_up_int8() +define_avg_up_int16() +define_down_avgs() diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 4a447ba6..897a09eb 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -573,3 +573,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1) ret <8 x double> 
%ret } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 7f9a9185..5429b461 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -473,3 +473,9 @@ gen_scatter(i32) gen_scatter(float) gen_scatter(i64) gen_scatter(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/util.m4 b/builtins/util.m4 index 025030d5..95e3844d 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -49,9 +49,9 @@ define(`MASK_HIGH_BIT_ON', ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; vector assembly and deconstruction utilities +;; vector deconstruction utilities ;; split 8-wide vector into 2 4-wide vectors -;; +;; ;; $1: vector element type ;; $2: 8-wide vector ;; $3: first 4-wide vector @@ -71,10 +71,6 @@ define(`v16tov8', ` <8 x i32> ') -;; 4-wide into 2 2-wide -;; args as above -;; - define(`v4tov2', ` $3 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> $4 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> @@ -96,6 +92,20 @@ define(`v16tov4', ` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; vector assembly: wider vector from two narrower vectors +;; +;; $1: vector element type +;; $2: first n-wide vector +;; $3: second n-wide vector +;; $4: result 2*n-wide vector +define(`v8tov16', ` + $4 = shufflevector <8 x $1> $2, <8 x $1> $3, + <16 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Helper macro for calling various SSE instructions for scalar values ;; but where the instruction takes a vector parameter. 
;; $1 : name of variable to put the final value in @@ -4276,3 +4286,109 @@ define i1 @__rdrand_i64(i64 * %ptr) { ret i1 %good } ') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define(`define_avg_up_uint8', ` +define @__avg_up_uint8(, ) { + %a16 = zext %0 to + %b16 = zext %1 to + %sum1 = add %a16, %b16 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_int8', ` +define @__avg_up_int8(, ) { + %a16 = sext %0 to + %b16 = sext %1 to + %sum1 = add %a16, %b16 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_uint16', ` +define @__avg_up_uint16(, ) { + %a32 = zext %0 to + %b32 = zext %1 to + %sum1 = add %a32, %b32 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_int16', ` +define @__avg_up_int16(, ) { + %a32 = sext %0 to + %b32 = sext %1 to + %sum1 = add %a32, %b32 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_uint8', ` +define @__avg_down_uint8(, ) { + %a16 = zext %0 to + %b16 = zext %1 to + %sum = add %a16, %b16 + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_int8', ` +define @__avg_down_int8(, ) { + %a16 = sext %0 to + %b16 = sext %1 to + %sum = add %a16, %b16 + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_uint16', ` +define @__avg_down_uint16(, ) { + %a32 = zext %0 to + %b32 = zext %1 to + %sum = add %a32, %b32 + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_int16', ` +define @__avg_down_int16(, ) { + %a32 = sext %0 to + %b32 = sext %1 to + %sum = add %a32, %b32 + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_up_avgs', ` +define_avg_up_uint8() +define_avg_up_int8() +define_avg_up_uint16() +define_avg_up_int16() +') + +define(`define_down_avgs', ` +define_avg_down_uint8() +define_avg_down_int8() +define_avg_down_uint16() +define_avg_down_int16() +') + +define(`define_avgs', ` +define_up_avgs() +define_down_avgs() +') diff --git a/docs/ispc.rst b/docs/ispc.rst index 8456f126..eb8333de 100755 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -3399,6 +3399,31 @@ The ``isnan()`` functions test whether the given value is a floating-point uniform bool isnan(uniform double v) +A number of functions are also available for performing operations on 8- and +16-bit quantities; these map to specialized instructions that perform these +operations on targets that support them. ``avg_up()`` computes the average +of the two values, rounding up if their average is halfway between two +integers (i.e., it computes ``(a+b+1)/2``). 
+ +:: + + int8 avg_up(int8 a, int8 b) + unsigned int8 avg_up(unsigned int8 a, unsigned int8 b) + int16 avg_up(int16 a, int16 b) + unsigned int16 avg_up(unsigned int16 a, unsigned int16 b) + + +``avg_down()`` computes the average of the two values, rounding down (i.e., +it computes ``(a+b)/2``). + +:: + + int8 avg_down(int8 a, int8 b) + unsigned int8 avg_down(unsigned int8 a, unsigned int8 b) + int16 avg_down(int16 a, int16 b) + unsigned int16 avg_down(unsigned int16 a, unsigned int16 b) + + Transcendental Functions ------------------------ diff --git a/opt.cpp b/opt.cpp index 8c86368e..b363f0e1 100644 --- a/opt.cpp +++ b/opt.cpp @@ -4343,6 +4343,14 @@ char MakeInternalFuncsStaticPass::ID = 0; bool MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { const char *names[] = { + "__avg_up_uint8", + "__avg_up_int8", + "__avg_up_uint16", + "__avg_up_int16", + "__avg_down_uint8", + "__avg_down_int8", + "__avg_down_uint16", + "__avg_down_int16", "__fast_masked_vload", "__gather_factored_base_offsets32_i8", "__gather_factored_base_offsets32_i16", "__gather_factored_base_offsets32_i32", "__gather_factored_base_offsets32_i64", diff --git a/stdlib.ispc b/stdlib.ispc index affa7fef..dc94d7e3 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -4812,8 +4812,8 @@ static const uniform int64 __idiv_table_s32[][3] = { }; __declspec(safe) -static unmasked unsigned int8 __fast_idiv(unsigned int8 numerator, - uniform unsigned int8 divisor) { +static unmasked inline unsigned int8 +__fast_idiv(unsigned int8 numerator, uniform unsigned int8 divisor) { uniform int64 method = __idiv_table_u8[divisor-2][0]; uniform int64 multiplier = __idiv_table_u8[divisor-2][1]; uniform int64 shift = __idiv_table_u8[divisor-2][2]; @@ -4833,7 +4833,7 @@ static unmasked unsigned int8 __fast_idiv(unsigned int8 numerator, } __declspec(safe) -static unmasked int8 __fast_idiv(int8 numerator, uniform int8 divisor) { +static unmasked inline int8 __fast_idiv(int8 numerator, uniform int8 divisor) { uniform int8 method = __idiv_table_s8[divisor-2][0]; uniform int16 multiplier = __idiv_table_s8[divisor-2][1]; uniform int8 shift = __idiv_table_s8[divisor-2][2]; @@ -4850,8 +4850,8 @@ static unmasked int8 __fast_idiv(int8 numerator, uniform int8 divisor) { } __declspec(safe) -static unmasked unsigned int16 __fast_idiv(unsigned int16 numerator, - uniform unsigned int16 divisor) { +static unmasked inline unsigned int16 __fast_idiv(unsigned int16 numerator, + uniform unsigned int16 divisor) { uniform int64 method = __idiv_table_u16[divisor-2][0]; uniform int64 multiplier = __idiv_table_u16[divisor-2][1]; uniform int64 shift = __idiv_table_u16[divisor-2][2]; @@ -4871,7 +4871,7 @@ static unmasked unsigned int16 __fast_idiv(unsigned int16 numerator, } __declspec(safe) -static unmasked int16 __fast_idiv(int16 numerator, uniform int16 divisor) { +static unmasked inline int16 __fast_idiv(int16 numerator, uniform int16 divisor) { uniform int64 method = __idiv_table_s16[divisor-2][0]; uniform int64 multiplier = __idiv_table_s16[divisor-2][1]; uniform int64 shift = __idiv_table_s16[divisor-2][2]; @@ -4889,8 +4889,8 @@ static unmasked int16 __fast_idiv(int16 numerator, uniform int16 divisor) { } __declspec(safe) -static unmasked inline unsigned int32 __fast_idiv(unsigned int32 numerator, - uniform unsigned int32 divisor) { +static unmasked inline inline unsigned int32 __fast_idiv(unsigned int32 numerator, + uniform unsigned int32 divisor) { uniform int64 method = __idiv_table_u32[divisor-2][0]; uniform int64 multiplier = __idiv_table_u32[divisor-2][1]; 
uniform int64 shift = __idiv_table_u32[divisor-2][2]; @@ -4910,7 +4910,7 @@ static unmasked inline unsigned int32 __fast_idiv(unsigned int32 numerator, } __declspec(safe) -static unmasked int32 __fast_idiv(int32 numerator, uniform int32 divisor) { +static unmasked inline int32 __fast_idiv(int32 numerator, uniform int32 divisor) { uniform int64 method = __idiv_table_s32[divisor-2][0]; uniform int64 multiplier = __idiv_table_s32[divisor-2][1]; uniform int64 shift = __idiv_table_s32[divisor-2][2]; @@ -4927,3 +4927,45 @@ static unmasked int32 __fast_idiv(int32 numerator, uniform int32 divisor) { } } +/////////////////////////////////////////////////////////////////////////// +// Saturating int8/int16 ops + +__declspec(safe) +static unmasked inline unsigned int8 avg_up(unsigned int8 a, unsigned int8 b) { + return __avg_up_uint8(a, b); +} + +__declspec(safe) +static unmasked inline int8 avg_up(int8 a, int8 b) { + return __avg_up_int8(a, b); +} + +__declspec(safe) +static unmasked inline unsigned int16 avg_up(unsigned int16 a, unsigned int16 b) { + return __avg_up_uint16(a, b); +} + +__declspec(safe) +static unmasked inline int16 avg_up(int16 a, int16 b) { + return __avg_up_int16(a, b); +} + +__declspec(safe) +static unmasked inline unsigned int8 avg_down(unsigned int8 a, unsigned int8 b) { + return __avg_down_uint8(a, b); +} + +__declspec(safe) +static unmasked inline int8 avg_down(int8 a, int8 b) { + return __avg_down_int8(a, b); +} + +__declspec(safe) +static unmasked inline unsigned int16 avg_down(unsigned int16 a, unsigned int16 b) { + return __avg_down_uint16(a, b); +} + +__declspec(safe) +static unmasked inline int16 avg_down(int16 a, int16 b) { + return __avg_down_int16(a, b); +} diff --git a/tests/avg-down-int16.ispc b/tests/avg-down-int16.ispc new file mode 100644 index 00000000..10a3c2a2 --- /dev/null +++ b/tests/avg-down-int16.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float bf) { + int16 a = aFOO[programIndex]; + int16 b = bf; + RET[programIndex] = avg_down(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = ((int)programIndex + 1 + 5) / 2; +} diff --git a/tests/avg-down-int8.ispc b/tests/avg-down-int8.ispc new file mode 100644 index 00000000..67638934 --- /dev/null +++ b/tests/avg-down-int8.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float bf) { + int8 a = aFOO[programIndex]; + int8 b = bf; + RET[programIndex] = avg_down(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = ((int)programIndex + 1 + 5) / 2; +} diff --git a/tests/avg-down-uint16.ispc b/tests/avg-down-uint16.ispc new file mode 100644 index 00000000..70f9185e --- /dev/null +++ b/tests/avg-down-uint16.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float bf) { + unsigned int16 a = aFOO[programIndex]; + unsigned int16 b = bf; + RET[programIndex] = avg_down(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = ((int)programIndex + 1 + 5) / 2; +} diff --git a/tests/avg-down-uint8.ispc b/tests/avg-down-uint8.ispc new file mode 100644 index 00000000..75fbf116 --- /dev/null +++ b/tests/avg-down-uint8.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform 
float bf) { + unsigned int8 a = aFOO[programIndex]; + unsigned int8 b = bf; + RET[programIndex] = avg_down(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = ((int)programIndex + 1 + 5) / 2; +} diff --git a/tests/avg-up-int16.ispc b/tests/avg-up-int16.ispc new file mode 100644 index 00000000..8f557a5b --- /dev/null +++ b/tests/avg-up-int16.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float bf) { + int16 a = aFOO[programIndex]; + int16 b = bf; + RET[programIndex] = avg_up(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = ((int)programIndex + 1 + 5 + 1) / 2; +} diff --git a/tests/avg-up-int8.ispc b/tests/avg-up-int8.ispc new file mode 100644 index 00000000..d0a3b444 --- /dev/null +++ b/tests/avg-up-int8.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float bf) { + int8 a = aFOO[programIndex]; + int8 b = bf; + RET[programIndex] = avg_up(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = ((int)programIndex + 1 + 5 + 1) / 2; +} diff --git a/tests/avg-up-uint16.ispc b/tests/avg-up-uint16.ispc new file mode 100644 index 00000000..273f9f3b --- /dev/null +++ b/tests/avg-up-uint16.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float bf) { + unsigned int16 a = aFOO[programIndex]; + unsigned int16 b = bf; + RET[programIndex] = avg_up(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = ((int)programIndex + 1 + 5 + 1) / 2; +} diff --git a/tests/avg-up-uint8.ispc b/tests/avg-up-uint8.ispc new file mode 100644 index 00000000..d5d02491 --- /dev/null +++ b/tests/avg-up-uint8.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float bf) { + unsigned int8 a = aFOO[programIndex]; + unsigned int8 b = bf; + RET[programIndex] = avg_up(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = ((int)programIndex + 1 + 5 + 1) / 2; +} From ccdbddd388bf494bf3cb4aaf6a90cbb684cd18f0 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 6 Aug 2013 08:59:46 -0700 Subject: [PATCH 025/124] Add peephole optimization to match int8/int16 averages. Match the following patterns in IR, turning them into target-specific intrinsics (e.g. PAVGB on x86) when possible. 
(unsigned int8)(((unsigned int16)a + (unsigned int16)b + 1)/2) (unsigned int8)(((unsigned int16)a + (unsigned int16)b)/2) (unsigned int16)(((unsigned int32)a + (unsigned int32)b + 1)/2) (unsigned int16)(((unsigned int32)a + (unsigned int32)b)/2) (int8)(((int16)a + (int16)b + 1)/2) (int8)(((int16)a + (int16)b)/2) (int16)(((int32)a + (int32)b + 1)/2) (int16)(((int32)a + (int32)b)/2) --- opt.cpp | 393 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 393 insertions(+) diff --git a/opt.cpp b/opt.cpp index b363f0e1..8899c64d 100644 --- a/opt.cpp +++ b/opt.cpp @@ -84,6 +84,7 @@ #include #include #include +#include #include #ifdef ISPC_IS_LINUX #include @@ -103,6 +104,7 @@ static llvm::Pass *CreateIntrinsicsOptPass(); static llvm::Pass *CreateInstructionSimplifyPass(); +static llvm::Pass *CreatePeepholePass(); static llvm::Pass *CreateImproveMemoryOpsPass(); static llvm::Pass *CreateGatherCoalescePass(); @@ -459,6 +461,9 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createDeadInstEliminationPass()); optPM.add(llvm::createCFGSimplificationPass()); + optPM.add(llvm::createPromoteMemoryToRegisterPass()); + optPM.add(llvm::createAggressiveDCEPass()); + if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) { optPM.add(llvm::createInstructionCombiningPass()); @@ -500,6 +505,7 @@ Optimize(llvm::Module *module, int optLevel) { // InstructionCombiningPass. See r184459 for details. optPM.add(llvm::createSimplifyLibCallsPass()); #endif + optPM.add(llvm::createAggressiveDCEPass()); optPM.add(llvm::createInstructionCombiningPass()); optPM.add(llvm::createJumpThreadingPass()); optPM.add(llvm::createCFGSimplificationPass()); @@ -539,6 +545,7 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createIPSCCPPass()); optPM.add(llvm::createDeadArgEliminationPass()); + optPM.add(llvm::createAggressiveDCEPass()); optPM.add(llvm::createInstructionCombiningPass()); optPM.add(llvm::createCFGSimplificationPass()); @@ -581,6 +588,9 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createInstructionCombiningPass()); optPM.add(CreateInstructionSimplifyPass()); + optPM.add(CreatePeepholePass()); + optPM.add(llvm::createFunctionInliningPass()); + optPM.add(llvm::createAggressiveDCEPass()); optPM.add(llvm::createStripDeadPrototypesPass()); optPM.add(CreateMakeInternalFuncsStaticPass()); optPM.add(llvm::createGlobalDCEPass()); @@ -4430,3 +4440,386 @@ static llvm::Pass * CreateMakeInternalFuncsStaticPass() { return new MakeInternalFuncsStaticPass; } + + +/////////////////////////////////////////////////////////////////////////// +// PeepholePass + +class PeepholePass : public llvm::BasicBlockPass { +public: + PeepholePass(); + + const char *getPassName() const { return "Peephole Optimizations"; } + bool runOnBasicBlock(llvm::BasicBlock &BB); + + static char ID; +}; + +char PeepholePass::ID = 0; + +PeepholePass::PeepholePass() + : BasicBlockPass(ID) { +} + +using namespace llvm::PatternMatch; + +template +struct CastClassTypes_match { + Op_t Op; + const llvm::Type *fromType, *toType; + + CastClassTypes_match(const Op_t &OpMatch, const llvm::Type *f, + const llvm::Type *t) + : Op(OpMatch), fromType(f), toType(t) {} + + template + bool match(OpTy *V) { + if (llvm::Operator *O = llvm::dyn_cast(V)) + return (O->getOpcode() == Opcode && Op.match(O->getOperand(0)) && + O->getType() == toType && + O->getOperand(0)->getType() == fromType); + return false; + } +}; + +template +inline 
CastClassTypes_match +m_SExt8To16(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int8VectorType, + LLVMTypes::Int16VectorType); +} + +template +inline CastClassTypes_match +m_ZExt8To16(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int8VectorType, + LLVMTypes::Int16VectorType); +} + + +template +inline CastClassTypes_match +m_Trunc16To8(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int16VectorType, + LLVMTypes::Int8VectorType); +} + +template +inline CastClassTypes_match +m_SExt16To32(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int16VectorType, + LLVMTypes::Int32VectorType); +} + +template +inline CastClassTypes_match +m_ZExt16To32(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int16VectorType, + LLVMTypes::Int32VectorType); +} + + +template +inline CastClassTypes_match +m_Trunc32To16(const OpTy &Op) { + return CastClassTypes_match( + Op, + LLVMTypes::Int32VectorType, + LLVMTypes::Int16VectorType); +} + +template +struct UDiv2_match { + Op_t Op; + + UDiv2_match(const Op_t &OpMatch) + : Op(OpMatch) {} + + template + bool match(OpTy *V) { + llvm::BinaryOperator *bop; + llvm::ConstantDataVector *cdv; + if ((bop = llvm::dyn_cast(V)) && + (cdv = llvm::dyn_cast(bop->getOperand(1))) && + cdv->getSplatValue() != NULL) { + const llvm::APInt &apInt = cdv->getUniqueInteger(); + + switch (bop->getOpcode()) { + case llvm::Instruction::UDiv: + // divide by 2 + return (apInt.isIntN(2) && Op.match(bop->getOperand(0))); + case llvm::Instruction::LShr: + // shift left by 1 + return (apInt.isIntN(1) && Op.match(bop->getOperand(0))); + default: + return false; + } + } + return false; + } +}; + +template +inline UDiv2_match +m_UDiv2(const V &v) { + return UDiv2_match(v); +} + +template +struct SDiv2_match { + Op_t Op; + + SDiv2_match(const Op_t &OpMatch) + : Op(OpMatch) {} + + template + bool match(OpTy *V) { + llvm::BinaryOperator *bop; + llvm::ConstantDataVector *cdv; + if ((bop = llvm::dyn_cast(V)) && + (cdv = llvm::dyn_cast(bop->getOperand(1))) && + cdv->getSplatValue() != NULL) { + const llvm::APInt &apInt = cdv->getUniqueInteger(); + + switch (bop->getOpcode()) { + case llvm::Instruction::SDiv: + // divide by 2 + return (apInt.isIntN(2) && Op.match(bop->getOperand(0))); + case llvm::Instruction::AShr: + // shift left by 1 + return (apInt.isIntN(1) && Op.match(bop->getOperand(0))); + default: + return false; + } + } + return false; + } +}; + +template +inline SDiv2_match +m_SDiv2(const V &v) { + return SDiv2_match(v); +} +// Returns true if the given function has a call to an intrinsic function +// in its definition. +static bool +lHasIntrinsicInDefinition(llvm::Function *func) { + llvm::Function::iterator bbiter = func->begin(); + for (; bbiter != func->end(); ++bbiter) { + for (llvm::BasicBlock::iterator institer = bbiter->begin(); + institer != bbiter->end(); ++institer) { + if (llvm::isa(institer)) + return true; + } + } + return false; +} + +static llvm::Instruction * +lGetBinaryIntrinsic(const char *name, llvm::Value *opa, llvm::Value *opb) { + llvm::Function *func = m->module->getFunction(name); + Assert(func != NULL); + + // Make sure that the definition of the llvm::Function has a call to an + // intrinsic function in its instructions; otherwise we will generate + // infinite loops where we "helpfully" turn the default implementations + // of target builtins like __avg_up_uint8 that are implemented with plain + // arithmetic ops into recursive calls to themselves. 
+ if (lHasIntrinsicInDefinition(func)) + return lCallInst(func, opa, opb, name); + else + return NULL; +} + +////////////////////////////////////////////////// + +static llvm::Instruction * +lMatchAvgUpUInt8(llvm::Value *inst) { + // (unsigned int8)(((unsigned int16)a + (unsigned int16)b + 1)/2) + llvm::Value *opa, *opb; + const llvm::APInt *delta; + if (match(inst, m_Trunc16To8(m_UDiv2(m_CombineOr( + m_CombineOr( + m_Add(m_ZExt8To16(m_Value(opa)), + m_Add(m_ZExt8To16(m_Value(opb)), m_APInt(delta))), + m_Add(m_Add(m_ZExt8To16(m_Value(opa)), m_APInt(delta)), + m_ZExt8To16(m_Value(opb)))), + m_Add(m_Add(m_ZExt8To16(m_Value(opa)), m_ZExt8To16(m_Value(opb))), + m_APInt(delta))))))) { + if (delta->isIntN(1) == false) + return false; + + return lGetBinaryIntrinsic("__avg_up_uint8", opa, opb); + } + return NULL; +} + + +static llvm::Instruction * +lMatchAvgDownUInt8(llvm::Value *inst) { + // (unsigned int8)(((unsigned int16)a + (unsigned int16)b)/2) + llvm::Value *opa, *opb; + if (match(inst, m_Trunc16To8(m_UDiv2( + m_Add(m_ZExt8To16(m_Value(opa)), + m_ZExt8To16(m_Value(opb))))))) { + return lGetBinaryIntrinsic("__avg_down_uint8", opa, opb); + } + return NULL; +} + +static llvm::Instruction * +lMatchAvgUpUInt16(llvm::Value *inst) { + // (unsigned int16)(((unsigned int32)a + (unsigned int32)b + 1)/2) + llvm::Value *opa, *opb; + const llvm::APInt *delta; + if (match(inst, m_Trunc32To16(m_UDiv2(m_CombineOr( + m_CombineOr( + m_Add(m_ZExt16To32(m_Value(opa)), + m_Add(m_ZExt16To32(m_Value(opb)), m_APInt(delta))), + m_Add(m_Add(m_ZExt16To32(m_Value(opa)), m_APInt(delta)), + m_ZExt16To32(m_Value(opb)))), + m_Add(m_Add(m_ZExt16To32(m_Value(opa)), m_ZExt16To32(m_Value(opb))), + m_APInt(delta))))))) { + if (delta->isIntN(1) == false) + return false; + + return lGetBinaryIntrinsic("__avg_up_uint16", opa, opb); + } + return NULL; +} + + +static llvm::Instruction * +lMatchAvgDownUInt16(llvm::Value *inst) { + // (unsigned int16)(((unsigned int32)a + (unsigned int32)b)/2) + llvm::Value *opa, *opb; + if (match(inst, m_Trunc32To16(m_UDiv2( + m_Add(m_ZExt16To32(m_Value(opa)), + m_ZExt16To32(m_Value(opb))))))) { + return lGetBinaryIntrinsic("__avg_down_uint16", opa, opb); + } + return NULL; +} + + +static llvm::Instruction * +lMatchAvgUpInt8(llvm::Value *inst) { + // (int8)(((int16)a + (int16)b + 1)/2) + llvm::Value *opa, *opb; + const llvm::APInt *delta; + if (match(inst, m_Trunc16To8(m_SDiv2(m_CombineOr( + m_CombineOr( + m_Add(m_SExt8To16(m_Value(opa)), + m_Add(m_SExt8To16(m_Value(opb)), m_APInt(delta))), + m_Add(m_Add(m_SExt8To16(m_Value(opa)), m_APInt(delta)), + m_SExt8To16(m_Value(opb)))), + m_Add(m_Add(m_SExt8To16(m_Value(opa)), m_SExt8To16(m_Value(opb))), + m_APInt(delta))))))) { + if (delta->isIntN(1) == false) + return false; + + return lGetBinaryIntrinsic("__avg_up_int8", opa, opb); + } + return NULL; +} + + +static llvm::Instruction * +lMatchAvgDownInt8(llvm::Value *inst) { + // (int8)(((int16)a + (int16)b)/2) + llvm::Value *opa, *opb; + if (match(inst, m_Trunc16To8(m_SDiv2( + m_Add(m_SExt8To16(m_Value(opa)), + m_SExt8To16(m_Value(opb))))))) { + return lGetBinaryIntrinsic("__avg_down_int8", opa, opb); + } + return NULL; +} + +static llvm::Instruction * +lMatchAvgUpInt16(llvm::Value *inst) { + // (int16)(((int32)a + (int32)b + 1)/2) + llvm::Value *opa, *opb; + const llvm::APInt *delta; + if (match(inst, m_Trunc32To16(m_SDiv2(m_CombineOr( + m_CombineOr( + m_Add(m_SExt16To32(m_Value(opa)), + m_Add(m_SExt16To32(m_Value(opb)), m_APInt(delta))), + m_Add(m_Add(m_SExt16To32(m_Value(opa)), m_APInt(delta)), + 
m_SExt16To32(m_Value(opb)))), + m_Add(m_Add(m_SExt16To32(m_Value(opa)), m_SExt16To32(m_Value(opb))), + m_APInt(delta))))))) { + if (delta->isIntN(1) == false) + return false; + + return lGetBinaryIntrinsic("__avg_up_int16", opa, opb); + } + return NULL; +} + +static llvm::Instruction * +lMatchAvgDownInt16(llvm::Value *inst) { + // (int16)(((int32)a + (int32)b)/2) + llvm::Value *opa, *opb; + if (match(inst, m_Trunc32To16(m_SDiv2( + m_Add(m_SExt16To32(m_Value(opa)), + m_SExt16To32(m_Value(opb))))))) { + return lGetBinaryIntrinsic("__avg_down_int16", opa, opb); + } + return NULL; +} + +bool +PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) { + DEBUG_START_PASS("PeepholePass"); + + bool modifiedAny = false; + restart: + for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) { + llvm::Instruction *inst = &*iter; + + llvm::Instruction *builtinCall = NULL; + if (!builtinCall) + builtinCall = lMatchAvgUpUInt8(inst); + if (!builtinCall) + builtinCall = lMatchAvgUpUInt16(inst); + if (!builtinCall) + builtinCall = lMatchAvgDownUInt8(inst); + if (!builtinCall) + builtinCall = lMatchAvgDownUInt16(inst); + if (!builtinCall) + builtinCall = lMatchAvgUpInt8(inst); + if (!builtinCall) + builtinCall = lMatchAvgUpInt16(inst); + if (!builtinCall) + builtinCall = lMatchAvgDownInt8(inst); + if (!builtinCall) + builtinCall = lMatchAvgDownInt16(inst); + + if (builtinCall != NULL) { + llvm::ReplaceInstWithInst(inst, builtinCall); + modifiedAny = true; + goto restart; + } + } + + DEBUG_END_PASS("PeepholePass"); + + return modifiedAny; +} + +static llvm::Pass * +CreatePeepholePass() { + return new PeepholePass; +} From 1276ea98440fc95bdb1388c27217c618cdac3cba Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 6 Aug 2013 17:00:35 -0700 Subject: [PATCH 026/124] Revert "Remove support for building with LLVM 3.1" This reverts commit d3c567503bf64ec9066c09cb8959c31d4aa1be0e. 
Conflicts: opt.cpp --- builtins.cpp | 2 + builtins/target-avx11-x2.ll | 4 +- builtins/target-avx11.ll | 4 +- builtins/target-avx2-x2.ll | 25 +++++++- builtins/target-avx2.ll | 25 +++++++- cbackend.cpp | 115 +++++++++++++++++++++++++++--------- ctx.cpp | 4 +- ctx.h | 11 +++- expr.cpp | 2 +- func.cpp | 10 +++- ispc.cpp | 68 +++++++++++++++++---- ispc.h | 18 +++++- llvmutil.cpp | 2 +- llvmutil.h | 2 +- main.cpp | 4 +- module.cpp | 47 ++++++++++++--- opt.cpp | 22 +++++-- stmt.cpp | 2 +- type.cpp | 26 +++++--- type.h | 2 +- util.cpp | 9 ++- 21 files changed, 320 insertions(+), 84 deletions(-) diff --git a/builtins.cpp b/builtins.cpp index d75db43e..82c45b02 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -49,6 +49,8 @@ #include #if defined(LLVM_3_2) #include +#endif +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #include diff --git a/builtins/target-avx11-x2.ll b/builtins/target-avx11-x2.ll index 2aee1e1c..1aa6345c 100644 --- a/builtins/target-avx11-x2.ll +++ b/builtins/target-avx11-x2.ll @@ -31,7 +31,9 @@ include(`target-avx-x2.ll') -rdrand_definition() +ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', + LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', + `rdrand_definition()') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx11.ll b/builtins/target-avx11.ll index 44593113..fea0a7c2 100644 --- a/builtins/target-avx11.ll +++ b/builtins/target-avx11.ll @@ -31,7 +31,9 @@ include(`target-avx.ll') -rdrand_definition() +ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', + LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', + `rdrand_definition()') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx2-x2.ll b/builtins/target-avx2-x2.ll index 19f1845d..053fd078 100644 --- a/builtins/target-avx2-x2.ll +++ b/builtins/target-avx2-x2.ll @@ -29,11 +29,15 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -define(`HAVE_GATHER', `1') +ifelse(LLVM_VERSION, `LLVM_3_0', `', + LLVM_VERSION, `LLVM_3_1', `', + `define(`HAVE_GATHER', `1')') include(`target-avx-x2.ll') -rdrand_definition() +ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', + LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', + `rdrand_definition()') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max @@ -172,6 +176,21 @@ define(`assemble_4s', ` assemble_8s($1, $2, $2_1, $2_2) ') +ifelse(LLVM_VERSION, `LLVM_3_0', ` +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double)', +LLVM_VERSION, `LLVM_3_1', ` +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double)', ` + gen_gather(i8) gen_gather(i16) @@ -538,3 +557,5 @@ define <16 x double> @__gather64_double(<16 x i64> %ptrs, ret <16 x double> %v } + +') diff --git a/builtins/target-avx2.ll b/builtins/target-avx2.ll index d3410011..f4a0ee07 100644 --- a/builtins/target-avx2.ll +++ b/builtins/target-avx2.ll @@ -29,11 +29,15 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-define(`HAVE_GATHER', `1') +ifelse(LLVM_VERSION, `LLVM_3_0', `', + LLVM_VERSION, `LLVM_3_1', `', + `define(`HAVE_GATHER', `1')') include(`target-avx.ll') -rdrand_definition() +ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', + LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', + `rdrand_definition()') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max @@ -119,6 +123,21 @@ define(`extract_4s', ` %$2_2 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> ') +ifelse(LLVM_VERSION, `LLVM_3_0', ` +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double)', +LLVM_VERSION, `LLVM_3_1', ` +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double)', ` + gen_gather(i8) gen_gather(i16) @@ -410,3 +429,5 @@ define <8 x double> @__gather64_double(<8 x i64> %ptrs, ret <8 x double> %v } + +') diff --git a/cbackend.cpp b/cbackend.cpp index d54f48fb..d23bcc20 100644 --- a/cbackend.cpp +++ b/cbackend.cpp @@ -29,7 +29,7 @@ #include "llvmutil.h" -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/CallingConv.h" @@ -38,7 +38,6 @@ #include "llvm/Intrinsics.h" #include "llvm/IntrinsicInst.h" #include "llvm/InlineAsm.h" - #include "llvm/TypeFinder.h" #else #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -48,10 +47,16 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/InlineAsm.h" - #include "llvm/IR/TypeFinder.h" #endif #include "llvm/Pass.h" #include "llvm/PassManager.h" +#if !defined(LLVM_3_1) + #if defined(LLVM_3_2) + #include "llvm/TypeFinder.h" + #else // LLVM_3_3 + + #include "llvm/IR/TypeFinder.h" + #endif +#endif // LLVM_3_2 + #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/STLExtras.h" @@ -71,7 +76,9 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + #include "llvm/Target/TargetData.h" +#elif defined(LLVM_3_2) #include "llvm/DataLayout.h" #else // LLVM 3.3+ #include "llvm/IR/DataLayout.h" @@ -81,7 +88,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/GetElementPtrTypeIterator.h" -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include "llvm/Support/InstVisitor.h" #else // LLVM 3.3+ #include "llvm/InstVisitor.h" @@ -251,10 +258,14 @@ namespace { const llvm::MCRegisterInfo *MRI; const llvm::MCObjectFileInfo *MOFI; llvm::MCContext *TCtx; +#if defined(LLVM_3_1) + const llvm::TargetData* TD; +#else // FIXME: it's ugly to have the name be "TD" here, but it saves us // lots of ifdefs in the below since the new DataLayout and the old // TargetData have generally similar interfaces... 
const llvm::DataLayout* TD; +#endif std::map FPConstantMap; std::map VectorConstantMap; @@ -341,7 +352,7 @@ namespace { bool isSigned = false, const std::string &VariableName = "", bool IgnoreName = false, -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) const llvm::AttrListPtr &PAL = llvm::AttrListPtr() #else const llvm::AttributeSet &PAL = llvm::AttributeSet() @@ -352,7 +363,7 @@ namespace { const std::string &NameSoFar = ""); void printStructReturnPointerFunctionType(llvm::raw_ostream &Out, -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) const llvm::AttrListPtr &PAL, #else const llvm::AttributeSet &PAL, @@ -575,7 +586,7 @@ std::string CWriter::getArrayName(llvm::ArrayType *AT) { /// return type, except, instead of printing the type as void (*)(Struct*, ...) /// print it as "Struct (*)(...)", for struct return functions. void CWriter::printStructReturnPointerFunctionType(llvm::raw_ostream &Out, -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) const llvm::AttrListPtr &PAL, #else const llvm::AttributeSet &PAL, @@ -594,16 +605,20 @@ void CWriter::printStructReturnPointerFunctionType(llvm::raw_ostream &Out, if (PrintedType) FunctionInnards << ", "; llvm::Type *ArgTy = *I; -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + if (PAL.paramHasAttr(Idx, llvm::Attribute::ByVal)) { +#elif defined(LLVM_3_2) if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) { #else - if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { + if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { #endif assert(ArgTy->isPointerTy()); ArgTy = llvm::cast(ArgTy)->getElementType(); } printType(FunctionInnards, ArgTy, -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + /*isSigned=*/PAL.paramHasAttr(Idx, llvm::Attribute::SExt), +#elif defined(LLVM_3_2) PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt), @@ -620,7 +635,9 @@ void CWriter::printStructReturnPointerFunctionType(llvm::raw_ostream &Out, } FunctionInnards << ')'; printType(Out, RetTy, -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + /*isSigned=*/PAL.paramHasAttr(0, llvm::Attribute::SExt), +#elif defined(LLVM_3_2) PAL.getParamAttributes(0).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(0).hasAttribute(llvm::AttributeSet::ReturnIndex, llvm::Attribute::SExt), @@ -720,7 +737,7 @@ CWriter::printSimpleType(llvm::raw_ostream &Out, llvm::Type *Ty, bool isSigned, llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty, bool isSigned, const std::string &NameSoFar, bool IgnoreName, -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) const llvm::AttrListPtr &PAL #else const llvm::AttributeSet &PAL @@ -742,7 +759,9 @@ llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty, for (llvm::FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); I != E; ++I) { llvm::Type *ArgTy = *I; -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + if (PAL.paramHasAttr(Idx, llvm::Attribute::ByVal)) { +#elif defined(LLVM_3_2) if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) { #else if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { @@ -753,7 +772,9 @@ llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty, if (I != 
FTy->param_begin()) FunctionInnards << ", "; printType(FunctionInnards, ArgTy, -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + /*isSigned=*/PAL.paramHasAttr(Idx, llvm::Attribute::SExt), +#elif defined(LLVM_3_2) PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt), @@ -770,7 +791,9 @@ llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty, } FunctionInnards << ')'; printType(Out, FTy->getReturnType(), -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + /*isSigned=*/PAL.paramHasAttr(0, llvm::Attribute::SExt), +#elif defined(LLVM_3_2) PAL.getParamAttributes(0).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(0).hasAttribute(llvm::AttributeSet::ReturnIndex, llvm::Attribute::SExt), @@ -1949,7 +1972,11 @@ void CWriter::writeOperandWithCast(llvm::Value* Operand, const llvm::ICmpInst &C // directives to cater to specific compilers as need be. // static void generateCompilerSpecificCode(llvm::formatted_raw_ostream& Out, +#if defined(LLVM_3_1) + const llvm::TargetData *TD) { +#else const llvm::DataLayout *TD) { +#endif // We output GCC specific attributes to preserve 'linkonce'ness on globals. // If we aren't being compiled with GCC, just drop these attributes. Out << "#ifndef __GNUC__ /* Can only support \"linkonce\" vars with GCC */\n" @@ -2142,7 +2169,11 @@ bool CWriter::doInitialization(llvm::Module &M) { // Initialize TheModule = &M; +#if defined(LLVM_3_1) + TD = new llvm::TargetData(&M); +#else TD = new llvm::DataLayout(&M); +#endif IL = new llvm::IntrinsicLowering(*TD); IL->AddPrototypes(M); @@ -2625,11 +2656,15 @@ void CWriter::printModuleTypes() { // Get all of the struct types used in the module. std::vector StructTypes; +#if defined(LLVM_3_1) + TheModule->findUsedStructTypes(StructTypes); +#else llvm::TypeFinder typeFinder; typeFinder.run(*TheModule, false); for (llvm::TypeFinder::iterator iter = typeFinder.begin(); iter != typeFinder.end(); ++iter) StructTypes.push_back(*iter); +#endif // Get all of the array types used in the module std::vector ArrayTypes; @@ -2750,7 +2785,7 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { // Loop over the arguments, printing them... 
llvm::FunctionType *FT = llvm::cast(F->getFunctionType()); -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) const llvm::AttrListPtr &PAL = F->getAttributes(); #else const llvm::AttributeSet &PAL = F->getAttributes(); @@ -2784,16 +2819,20 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { else ArgName = ""; llvm::Type *ArgTy = I->getType(); -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + if (PAL.paramHasAttr(Idx, llvm::Attribute::ByVal)) { +#elif defined(LLVM_3_2) if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) { #else - if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { + if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { #endif ArgTy = llvm::cast(ArgTy)->getElementType(); ByValParams.insert(I); } printType(FunctionInnards, ArgTy, -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + /*isSigned=*/PAL.paramHasAttr(Idx, llvm::Attribute::SExt), +#elif defined(LLVM_3_2) PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt), @@ -2819,7 +2858,9 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { for (; I != E; ++I) { if (PrintedArg) FunctionInnards << ", "; llvm::Type *ArgTy = *I; -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + if (PAL.paramHasAttr(Idx, llvm::Attribute::ByVal)) { +#elif defined(LLVM_3_2) if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) { #else if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { @@ -2828,7 +2869,9 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { ArgTy = llvm::cast(ArgTy)->getElementType(); } printType(FunctionInnards, ArgTy, -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + /*isSigned=*/PAL.paramHasAttr(Idx, llvm::Attribute::SExt) +#elif defined(LLVM_3_2) PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt) #else PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt) @@ -2865,7 +2908,9 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { // Print out the return type and the signature built above. printType(Out, RetTy, -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + /*isSigned=*/PAL.paramHasAttr(0, llvm::Attribute::SExt), +#elif defined(LLVM_3_2) PAL.getParamAttributes(0).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(0).hasAttribute(llvm::AttributeSet::ReturnIndex, llvm::Attribute::SExt), @@ -3667,7 +3712,7 @@ void CWriter::lowerIntrinsics(llvm::Function &F) { const char *BuiltinName = ""; #define GET_GCC_BUILTIN_NAME #define Intrinsic llvm::Intrinsic -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include "llvm/Intrinsics.gen" #else #include "llvm/IR/Intrinsics.gen" @@ -3732,7 +3777,7 @@ void CWriter::visitCallInst(llvm::CallInst &I) { // If this is a call to a struct-return function, assign to the first // parameter instead of passing it to the call. 
-#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) const llvm::AttrListPtr &PAL = I.getAttributes(); #else const llvm::AttributeSet &PAL = I.getAttributes(); @@ -3820,7 +3865,9 @@ void CWriter::visitCallInst(llvm::CallInst &I) { (*AI)->getType() != FTy->getParamType(ArgNo)) { Out << '('; printType(Out, FTy->getParamType(ArgNo), -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + /*isSigned=*/PAL.paramHasAttr(ArgNo+1, llvm::Attribute::SExt) +#elif defined(LLVM_3_2) PAL.getParamAttributes(ArgNo+1).hasAttribute(llvm::Attributes::SExt) #else PAL.getParamAttributes(ArgNo+1).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt) @@ -3858,7 +3905,7 @@ bool CWriter::visitBuiltinCall(llvm::CallInst &I, llvm::Intrinsic::ID ID, const char *BuiltinName = ""; #define GET_GCC_BUILTIN_NAME #define Intrinsic llvm::Intrinsic -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include "llvm/Intrinsics.gen" #else #include "llvm/IR/Intrinsics.gen" @@ -4508,8 +4555,13 @@ SmearCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) { smearType, NULL); smearFunc = llvm::dyn_cast(sf); assert(smearFunc != NULL); +#if defined(LLVM_3_1) + smearFunc->setDoesNotThrow(true); + smearFunc->setDoesNotAccessMemory(true); +#else smearFunc->setDoesNotThrow(); smearFunc->setDoesNotAccessMemory(); +#endif } assert(smearFunc != NULL); @@ -4651,8 +4703,13 @@ AndCmpCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) { LLVMTypes::MaskType, NULL); andCmpFunc = llvm::dyn_cast(acf); Assert(andCmpFunc != NULL); +#if defined(LLVM_3_1) + andCmpFunc->setDoesNotThrow(true); + andCmpFunc->setDoesNotAccessMemory(true); +#else andCmpFunc->setDoesNotThrow(); andCmpFunc->setDoesNotAccessMemory(); +#endif } // Set up the function call to the *_and_mask function; the @@ -4857,7 +4914,7 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth, pm.add(new llvm::TargetData(module)); #endif -#if defined(LLVM_3_2) || defined(LLVM_3_3) +#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3) int flags = 0; #else llvm::sys::fs::OpenFlags flags = llvm::sys::fs::F_None; @@ -4882,7 +4939,7 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth, pm.add(llvm::createDeadCodeEliminationPass()); // clean up after smear pass //CO pm.add(llvm::createPrintModulePass(&fos)); pm.add(new CWriter(fos, includeName, vectorWidth)); -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) // This interface is depricated for 3.3+ pm.add(llvm::createGCInfoDeleter()); #endif diff --git a/ctx.cpp b/ctx.cpp index 32ba0ad9..c50d22f9 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -46,7 +46,7 @@ #include "sym.h" #include #include -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #include @@ -3312,7 +3312,7 @@ FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType, // alias analysis. // TODO: what other attributes needs to be copied? // TODO: do the same for varing path. 
-#if !defined (LLVM_3_2) // LLVM 3.3+ +#if !defined (LLVM_3_1) && !defined (LLVM_3_2) // LLVM 3.3+ llvm::CallInst *cc = llvm::dyn_cast(ci); if (cc && cc->getCalledFunction() && diff --git a/ctx.h b/ctx.h index 4b27e6e5..58f9aae3 100644 --- a/ctx.h +++ b/ctx.h @@ -40,15 +40,20 @@ #include "ispc.h" #include -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #else #include #include #endif -#include -#include +#if defined(LLVM_3_1) + #include + #include +#else + #include + #include +#endif struct CFInfo; diff --git a/expr.cpp b/expr.cpp index eb8c0951..856d363c 100644 --- a/expr.cpp +++ b/expr.cpp @@ -56,7 +56,7 @@ #include #include #include -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #include diff --git a/func.cpp b/func.cpp index 3097f86d..b975049b 100644 --- a/func.cpp +++ b/func.cpp @@ -46,7 +46,7 @@ #include "util.h" #include -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #include @@ -310,7 +310,9 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, // isn't worth the code bloat / overhead. bool checkMask = (type->isTask == true) || ( -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + (function->hasFnAttr(llvm::Attribute::AlwaysInline) == false) +#elif defined(LLVM_3_2) (function->getFnAttributes().hasAttribute(llvm::Attributes::AlwaysInline) == false) #else // LLVM 3.3+ (function->getAttributes().getFnAttributes().hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::AlwaysInline) == false) @@ -451,7 +453,11 @@ Function::GenerateIR() { functionName += std::string("_") + g->target->GetISAString(); llvm::Function *appFunction = llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module); +#if defined(LLVM_3_1) + appFunction->setDoesNotThrow(true); +#else appFunction->setDoesNotThrow(); +#endif g->target->markFuncWithTargetAttr(appFunction); diff --git a/ispc.cpp b/ispc.cpp index 03d1aaff..0f07895f 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -48,7 +48,7 @@ #include #include #endif -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #include @@ -57,12 +57,19 @@ #include #include #endif -#include -#include +#if defined(LLVM_3_1) + #include + #include +#else + #include + #include +#endif #include #include #include -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + #include +#elif defined(LLVM_3_2) #include #else // LLVM 3.3+ #include @@ -138,20 +145,27 @@ static const char *supportedCPUs[] = { // cortex-a9 and a15. We should be able to handle any of them that also // have NEON support. 
"cortex-a9", "cortex-a15", - "atom", "penryn", "core2", "corei7", "corei7-avx", "core-avx-i", "core-avx2" + "atom", "penryn", "core2", "corei7", "corei7-avx" +#if !defined(LLVM_3_1) + , "core-avx-i", "core-avx2" +#endif // LLVM 3.2+ }; Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : m_target(NULL), m_targetMachine(NULL), +#if defined(LLVM_3_1) + m_targetData(NULL), +#else m_dataLayout(NULL), +#endif m_valid(false), m_isa(SSE2), m_arch(""), m_is32Bit(true), m_cpu(""), m_attributes(""), -#if !defined(LLVM_3_2) +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) m_tf_attributes(NULL), #endif m_nativeVectorWidth(-1), @@ -393,7 +407,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; +#if !defined(LLVM_3_1) + // LLVM 3.2+ only this->m_hasRand = true; +#endif } else if (!strcasecmp(isa, "avx1.1-x2")) { this->m_isa = Target::AVX11; @@ -403,29 +420,46 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; +#if !defined(LLVM_3_1) + // LLVM 3.2+ only this->m_hasRand = true; +#endif } else if (!strcasecmp(isa, "avx2")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; - this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand,+fma"; + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand" +#ifndef LLVM_3_1 + ",+fma" +#endif // !LLVM_3_1 + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; +#if !defined(LLVM_3_1) + // LLVM 3.2+ only this->m_hasRand = true; this->m_hasGather = true; +#endif } else if (!strcasecmp(isa, "avx2-x2")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; - this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand,+fma"; + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand" +#ifndef LLVM_3_1 + ",+fma" +#endif // !LLVM_3_1 + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; +#if !defined(LLVM_3_1) + // LLVM 3.2+ only this->m_hasRand = true; this->m_hasGather = true; +#endif } else if (!strcasecmp(isa, "neon-8")) { this->m_isa = Target::NEON8; @@ -471,8 +505,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : if (m_isa == Target::NEON8 || m_isa == Target::NEON16 || m_isa == Target::NEON32) options.FloatABIType = llvm::FloatABI::Hard; +#if !defined(LLVM_3_1) if (g->opt.disableFMA == false) options.AllowFPOpFusion = llvm::FPOpFusion::Fast; +#endif // !LLVM_3_1 #ifdef ISPC_IS_WINDOWS if (strcmp("x86", arch) == 0) { @@ -490,8 +526,12 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // Initialize TargetData/DataLayout in 3 steps. // 1. Get default data layout first - std::string dl_string = - m_targetMachine->getDataLayout()->getStringRepresentation(); + std::string dl_string; +#if defined(LLVM_3_1) + dl_string = m_targetMachine->getTargetData()->getStringRepresentation(); +#else + dl_string = m_targetMachine->getDataLayout()->getStringRepresentation(); +#endif // 2. Adjust for generic if (m_isa == Target::GENERIC) { @@ -506,7 +546,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } // 3. 
Finally set member data +#if defined(LLVM_3_1) + m_targetData = new llvm::TargetData(dl_string); +#else m_dataLayout = new llvm::DataLayout(dl_string); +#endif // Set is32Bit // This indicates if we are compiling for 32 bit platform @@ -514,7 +558,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // FIXME: all generic targets are handled as 64 bit, which is incorrect. this->m_is32Bit = (getDataLayout()->getPointerSize() == 4); -#if !defined(LLVM_3_2) +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) // This is LLVM 3.3+ feature. // Initialize target-specific "target-feature" attribute. if (!m_attributes.empty()) { @@ -728,7 +772,7 @@ Target::StructOffset(llvm::Type *type, int element, } void Target::markFuncWithTargetAttr(llvm::Function* func) { -#if !defined(LLVM_3_2) +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) if (m_tf_attributes) { func->addAttributes(llvm::AttributeSet::FunctionIndex, *m_tf_attributes); } diff --git a/ispc.h b/ispc.h index 8653553e..98fcd199 100644 --- a/ispc.h +++ b/ispc.h @@ -40,8 +40,8 @@ #define ISPC_VERSION "1.4.5dev" -#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) -#error "Only LLVM 3.2, 3.3 and the 3.4 development branch are supported" +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) +#error "Only LLVM 3.1, 3.2, 3.3 and the 3.4 development branch are supported" #endif #if defined(_WIN32) || defined(_WIN64) @@ -72,7 +72,11 @@ namespace llvm { class BasicBlock; class Constant; class ConstantValue; +#if defined(LLVM_3_1) + class TargetData; +#else class DataLayout; +#endif class DIBuilder; class DIDescriptor; class DIFile; @@ -222,7 +226,11 @@ public: // Note the same name of method for 3.1 and 3.2+, this allows // to reduce number ifdefs on client side. +#if defined(LLVM_3_1) + llvm::TargetData *getDataLayout() const {return m_targetData;} +#else llvm::DataLayout *getDataLayout() const {return m_dataLayout;} +#endif /** Reports if Target object has valid state. */ bool isValid() const {return m_valid;} @@ -270,7 +278,11 @@ private: */ llvm::TargetMachine *m_targetMachine; +#if defined(LLVM_3_1) + llvm::TargetData *m_targetData; +#else llvm::DataLayout *m_dataLayout; +#endif /** flag to report invalid state after construction (due to bad parameters passed to constructor). */ @@ -291,7 +303,7 @@ private: /** Target-specific attribute string to pass along to the LLVM backend */ std::string m_attributes; -#if !defined(LLVM_3_2) +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) /** Target-specific LLVM attribute, which has to be attached to every function to ensure that it is generated for correct target architecture. 
This is requirement was introduced in LLVM 3.3 */ diff --git a/llvmutil.cpp b/llvmutil.cpp index 2f54a2fe..180c8676 100644 --- a/llvmutil.cpp +++ b/llvmutil.cpp @@ -38,7 +38,7 @@ #include "llvmutil.h" #include "ispc.h" #include "type.h" -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #else diff --git a/llvmutil.h b/llvmutil.h index d1803f32..d6c5ede0 100644 --- a/llvmutil.h +++ b/llvmutil.h @@ -38,7 +38,7 @@ #ifndef ISPC_LLVMUTIL_H #define ISPC_LLVMUTIL_H 1 -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #include diff --git a/main.cpp b/main.cpp index c21e7f88..8d7282f5 100644 --- a/main.cpp +++ b/main.cpp @@ -62,7 +62,9 @@ static void lPrintVersion() { printf("Intel(r) SPMD Program Compiler (ispc), %s (build %s @ %s, LLVM %s)\n", ISPC_VERSION, BUILD_VERSION, BUILD_DATE, -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + "3.1" +#elif defined(LLVM_3_2) "3.2" #elif defined(LLVM_3_3) "3.3" diff --git a/module.cpp b/module.cpp index eba5eb3b..755a5dc4 100644 --- a/module.cpp +++ b/module.cpp @@ -64,7 +64,7 @@ #define strcasecmp stricmp #endif -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #include @@ -86,7 +86,9 @@ #include #include #include -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + #include +#elif defined(LLVM_3_2) #include #include #else // LLVM 3.3+ @@ -200,7 +202,7 @@ lStripUnusedDebugInfo(llvm::Module *module) { // stuff and remove it later on. Removing it is useful, as it // reduces size of the binary significantly (manyfold for small // programs). -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) llvm::MDNode *nodeSPMD = llvm::dyn_cast(cuNode->getOperand(12)); Assert(nodeSPMD != NULL); @@ -795,7 +797,11 @@ Module::AddFunctionDeclaration(const std::string &name, #endif if (functionType->isTask) // This also applies transitively to members I think? +#if defined(LLVM_3_1) + function->setDoesNotAlias(1, true); +#else // LLVM 3.2+ function->setDoesNotAlias(1); +#endif g->target->markFuncWithTargetAttr(function); @@ -844,7 +850,12 @@ Module::AddFunctionDeclaration(const std::string &name, // NOTE: LLVM indexes function parameters starting from 1. // This is unintuitive. +#if defined(LLVM_3_1) + function->setDoesNotAlias(i+1, true); +#else function->setDoesNotAlias(i+1); +#endif + #if 0 int align = 4 * RoundUpPow2(g->target->nativeVectorWidth); function->addAttribute(i+1, llvm::Attribute::constructAlignmentFromInt(align)); @@ -1056,7 +1067,7 @@ Module::writeObjectFileOrAssembly(llvm::TargetMachine *targetMachine, llvm::TargetMachine::CodeGenFileType fileType = (outputType == Object) ? llvm::TargetMachine::CGFT_ObjectFile : llvm::TargetMachine::CGFT_AssemblyFile; bool binary = (fileType == llvm::TargetMachine::CGFT_ObjectFile); -#if defined(LLVM_3_2) || defined(LLVM_3_3) +#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3) unsigned int flags = binary ? llvm::raw_fd_ostream::F_Binary : 0; #else llvm::sys::fs::OpenFlags flags = binary ? 
llvm::sys::fs::F_Binary : @@ -1071,7 +1082,11 @@ Module::writeObjectFileOrAssembly(llvm::TargetMachine *targetMachine, } llvm::PassManager pm; +#if defined(LLVM_3_1) + pm.add(new llvm::TargetData(*g->target->getDataLayout())); +#else pm.add(new llvm::DataLayout(*g->target->getDataLayout())); +#endif llvm::formatted_raw_ostream fos(of->os()); @@ -1785,12 +1800,22 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre llvm::raw_fd_ostream stderrRaw(2, false); +#if defined(LLVM_3_1) + clang::TextDiagnosticPrinter *diagPrinter = + new clang::TextDiagnosticPrinter(stderrRaw, clang::DiagnosticOptions()); +#else clang::DiagnosticOptions *diagOptions = new clang::DiagnosticOptions(); clang::TextDiagnosticPrinter *diagPrinter = new clang::TextDiagnosticPrinter(stderrRaw, diagOptions); +#endif llvm::IntrusiveRefCntPtr diagIDs(new clang::DiagnosticIDs); +#if defined(LLVM_3_1) + clang::DiagnosticsEngine *diagEngine = + new clang::DiagnosticsEngine(diagIDs, diagPrinter); +#else clang::DiagnosticsEngine *diagEngine = new clang::DiagnosticsEngine(diagIDs, diagOptions, diagPrinter); +#endif inst.setDiagnostics(diagEngine); clang::TargetOptions &options = inst.getTargetOpts(); @@ -1800,7 +1825,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre } options.Triple = triple.getTriple(); -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) clang::TargetInfo *target = clang::TargetInfo::CreateTargetInfo(inst.getDiagnostics(), options); #else // LLVM 3.3+ @@ -1810,14 +1835,18 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre inst.setTarget(target); inst.createSourceManager(inst.getFileManager()); +#if defined(LLVM_3_1) + inst.InitializeSourceManager(infilename); +#else clang::FrontendInputFile inputFile(infilename, clang::IK_None); inst.InitializeSourceManager(inputFile); +#endif // Don't remove comments in the preprocessor, so that we can accurately // track the source file position by handling them ourselves. 
inst.getPreprocessorOutputOpts().ShowComments = 1; -#if !defined(LLVM_3_2) // LLVM 3.3+ +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) // LLVM 3.3+ inst.getPreprocessorOutputOpts().ShowCPP = 1; #endif @@ -1829,7 +1858,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre headerOpts.Verbose = 1; for (int i = 0; i < (int)g->includePath.size(); ++i) { headerOpts.AddPath(g->includePath[i], clang::frontend::Angled, -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) true /* is user supplied */, #endif false /* not a framework */, @@ -1884,7 +1913,11 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre } } +#if defined(LLVM_3_1) + inst.getLangOpts().BCPLComment = 1; +#else inst.getLangOpts().LineComment = 1; +#endif inst.createPreprocessor(); diagPrinter->BeginSourceFile(inst.getLangOpts(), &inst.getPreprocessor()); diff --git a/opt.cpp b/opt.cpp index 8899c64d..077320d5 100644 --- a/opt.cpp +++ b/opt.cpp @@ -48,7 +48,7 @@ #include #include -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #include @@ -73,7 +73,9 @@ #include #include #include -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + #include +#elif defined(LLVM_3_2) #include #else // LLVM 3.3+ #include @@ -83,8 +85,12 @@ #include #include #include -#include #include +#if defined(LLVM_3_1) + #include +#else + #include +#endif #include #ifdef ISPC_IS_LINUX #include @@ -411,14 +417,18 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(targetLibraryInfo); +#if defined(LLVM_3_1) + optPM.add(new llvm::TargetData(*g->target->getDataLayout())); +#else optPM.add(new llvm::DataLayout(*g->target->getDataLayout())); llvm::TargetMachine *targetMachine = g->target->GetTargetMachine(); -#ifdef LLVM_3_2 + #ifdef LLVM_3_2 optPM.add(new llvm::TargetTransformInfo(targetMachine->getScalarTargetTransformInfo(), targetMachine->getVectorTargetTransformInfo())); -#else // LLVM 3.3+ + #else // LLVM 3.3+ targetMachine->addAnalysisPasses(optPM); + #endif #endif optPM.add(llvm::createIndVarSimplifyPass()); @@ -500,7 +510,7 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createArgumentPromotionPass()); -#if defined(LLVM_3_2) || defined(LLVM_3_3) +#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3) // Starting from 3.4 this functionality was moved to // InstructionCombiningPass. See r184459 for details. 
optPM.add(llvm::createSimplifyLibCallsPass()); diff --git a/stmt.cpp b/stmt.cpp index 412b0dd9..4ec63d35 100644 --- a/stmt.cpp +++ b/stmt.cpp @@ -48,7 +48,7 @@ #include #include -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #include diff --git a/type.cpp b/type.cpp index 11a165f5..5fa1845b 100644 --- a/type.cpp +++ b/type.cpp @@ -43,15 +43,20 @@ #include #include -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #else #include #include #endif -#include -#include +#if defined(LLVM_3_1) + #include + #include +#else + #include + #include +#endif #include @@ -814,8 +819,11 @@ EnumType::GetDIType(llvm::DIDescriptor scope) const { m->diBuilder->createEnumerationType(scope, name, diFile, pos.first_line, 32 /* size in bits */, 32 /* align in bits */, - elementArray, - llvm::DIType()); + elementArray +#if !defined(LLVM_3_1) + , llvm::DIType() +#endif + ); switch (variability.type) { @@ -2131,7 +2139,7 @@ StructType::GetDIType(llvm::DIDescriptor scope) const { currentSize, // Size in bits align, // Alignment in bits 0, // Flags -#if !defined(LLVM_3_2) +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) llvm::DIType(), // DerivedFrom #endif elements); @@ -2374,7 +2382,7 @@ UndefinedStructType::GetDIType(llvm::DIDescriptor scope) const { 0, // Size 0, // Align 0, // Flags -#if !defined(LLVM_3_2) +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) llvm::DIType(), // DerivedFrom #endif elements); @@ -2637,8 +2645,12 @@ ReferenceType::GetDIType(llvm::DIDescriptor scope) const { } llvm::DIType diTargetType = targetType->GetDIType(scope); +#if defined(LLVM_3_1) + return m->diBuilder->createReferenceType(diTargetType); +#else return m->diBuilder->createReferenceType(llvm::dwarf::DW_TAG_reference_type, diTargetType); +#endif } diff --git a/type.h b/type.h index a6a52e10..880f8574 100644 --- a/type.h +++ b/type.h @@ -40,7 +40,7 @@ #include "ispc.h" #include "util.h" -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #else diff --git a/util.cpp b/util.cpp index 4be863bf..dbea9517 100644 --- a/util.cpp +++ b/util.cpp @@ -65,7 +65,9 @@ #include #include -#if defined(LLVM_3_2) +#if defined(LLVM_3_1) + #include +#elif defined(LLVM_3_2) #include #else // LLVM 3.3+ #include @@ -614,8 +616,13 @@ VerifyDataLayoutCompatibility(const std::string &module_dl, // which contradic: f80:128:128 followed by f80:32:32. This is a bug, but // correct thing to do is to interpret this exactly how LLVM would treat it, // so we create a DataLayout class and take its string representation. +#if defined(LLVM_3_1) + llvm::TargetData d1(module_dl); + llvm::TargetData d2(lib_dl); +#else // LLVM 3.2+ llvm::DataLayout d1(module_dl); llvm::DataLayout d2(lib_dl); +#endif std::string module_dl_canonic = d1.getStringRepresentation(); std::string lib_dl_canonic = d2.getStringRepresentation(); From 5e5d42b918852a7aeb12bbc98cf4a5b46e5f9842 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 6 Aug 2013 17:55:37 -0700 Subject: [PATCH 027/124] Fix build with LLVM 3.1 --- opt.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/opt.cpp b/opt.cpp index 3e2efcd8..e1618b7a 100644 --- a/opt.cpp +++ b/opt.cpp @@ -4556,6 +4556,8 @@ PeepholePass::PeepholePass() : BasicBlockPass(ID) { } +#ifndef LLVM_3_1 + using namespace llvm::PatternMatch; template @@ -4706,6 +4708,7 @@ inline SDiv2_match m_SDiv2(const V &v) { return SDiv2_match(v); } + // Returns true if the given function has a call to an intrinsic function // in its definition. 
static bool @@ -4874,6 +4877,7 @@ lMatchAvgDownInt16(llvm::Value *inst) { } return NULL; } +#endif // !LLVM_3_1 bool PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) { @@ -4885,6 +4889,7 @@ PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) { llvm::Instruction *inst = &*iter; llvm::Instruction *builtinCall = NULL; +#ifndef LLVM_3_1 if (!builtinCall) builtinCall = lMatchAvgUpUInt8(inst); if (!builtinCall) @@ -4901,7 +4906,7 @@ PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) { builtinCall = lMatchAvgDownInt8(inst); if (!builtinCall) builtinCall = lMatchAvgDownInt16(inst); - +#endif // LLVM_3_1 if (builtinCall != NULL) { llvm::ReplaceInstWithInst(inst, builtinCall); modifiedAny = true; From 1d76f74b165ee79840a739490fddedbb532a275f Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 7 Aug 2013 12:53:39 -0700 Subject: [PATCH 028/124] Fix compiler warnings --- opt.cpp | 8 ++++---- parse.yy | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/opt.cpp b/opt.cpp index e1618b7a..522e601b 100644 --- a/opt.cpp +++ b/opt.cpp @@ -4756,7 +4756,7 @@ lMatchAvgUpUInt8(llvm::Value *inst) { m_Add(m_Add(m_ZExt8To16(m_Value(opa)), m_ZExt8To16(m_Value(opb))), m_APInt(delta))))))) { if (delta->isIntN(1) == false) - return false; + return NULL; return lGetBinaryIntrinsic("__avg_up_uint8", opa, opb); } @@ -4790,7 +4790,7 @@ lMatchAvgUpUInt16(llvm::Value *inst) { m_Add(m_Add(m_ZExt16To32(m_Value(opa)), m_ZExt16To32(m_Value(opb))), m_APInt(delta))))))) { if (delta->isIntN(1) == false) - return false; + return NULL; return lGetBinaryIntrinsic("__avg_up_uint16", opa, opb); } @@ -4825,7 +4825,7 @@ lMatchAvgUpInt8(llvm::Value *inst) { m_Add(m_Add(m_SExt8To16(m_Value(opa)), m_SExt8To16(m_Value(opb))), m_APInt(delta))))))) { if (delta->isIntN(1) == false) - return false; + return NULL; return lGetBinaryIntrinsic("__avg_up_int8", opa, opb); } @@ -4859,7 +4859,7 @@ lMatchAvgUpInt16(llvm::Value *inst) { m_Add(m_Add(m_SExt16To32(m_Value(opa)), m_SExt16To32(m_Value(opb))), m_APInt(delta))))))) { if (delta->isIntN(1) == false) - return false; + return NULL; return lGetBinaryIntrinsic("__avg_up_int16", opa, opb); } diff --git a/parse.yy b/parse.yy index 4b315776..5fc01cb0 100644 --- a/parse.yy +++ b/parse.yy @@ -2169,7 +2169,7 @@ lAddFunctionParams(Declarator *decl) { /** Add a symbol for the built-in mask variable to the symbol table */ static void lAddMaskToSymbolTable(SourcePos pos) { - const Type *t; + const Type *t = NULL; switch (g->target->getMaskBitCount()) { case 1: t = AtomicType::VaryingBool; From 0c5742b6f88a7b880f27352f652e282d817b92a0 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 8 Aug 2013 19:23:44 -0700 Subject: [PATCH 029/124] Implement new naming scheme for --target. Now targets are named like "-ix", e.g. "sse4-i8x16", or "avx2-i32x16". The old target names are still supported. --- docs/ispc.rst | 94 +++++++++++++++++++++++----------------- ispc.cpp | 116 ++++++++++++++++++++++++++++++-------------------- ispc.h | 12 +++--- main.cpp | 15 +++++-- run_tests.py | 2 +- util.cpp | 10 ++--- util.h | 14 ++++++ 7 files changed, 163 insertions(+), 100 deletions(-) diff --git a/docs/ispc.rst b/docs/ispc.rst index eb8333de..26cf6be3 100755 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -489,54 +489,72 @@ on which you're running ``ispc`` is used to determine the target CPU. ispc foo.ispc -o foo.obj --cpu=corei7-avx -Finally, ``--target`` selects the target instruction set. The following -targets are currently supported: +Finally, ``--target`` selects the target instruction set. 
The target +string is of the form ``[ISA]-i[mask size]x[gang size]``. For example, +``--target=avx2-i32x16`` specifies a target with the AVX2 instruction set, +a mask size of 32 bits, and a gang size of 16. -=========== ========= ======================================= -Target Gang Size Description ------------ --------- --------------------------------------- -avx 8 AVX (2010-2011 era Intel CPUs) -avx-x2 16 "Double-pumped" AVX target, running - twice as many program instances as the - native vector width. -avx1.1 8 AVX 1.1 target (2012 era "Ivybridge" - Intel CPUs). -avx1.1-x2 16 Double-pumped AVX 1.1 target. -avx2 8 AVX 2 target (2013- Intel "Haswell" - CPUs.) -avx2-x2 16 Double-pumped AVX 2 target. -neon-8 16 ARM NEON target, targeting computation - on 8-bit data types. -neon-16 8 ARM NEON target, targeting computation - on 16-bit data types. -neon-32 4 ARM NEON target, targeting computation - on 32-bit data types. -sse2 4 SSE2 (early 2000s era x86 CPUs). -sse2-x2 8 Double-pumped SSE2. -sse4 4 SSE4 (generally 2008-2010 Intel CPUs). -sse4-x2 8 Double-pumped SSE4. -sse4-8 16 SSE4 target targeting computation on - 8-bit data types. -sse4-16 8 SSE4 target targeting computation on - 16-bit data types. -=========== ========= ======================================= +The following target ISAs are supported: + +============ ========================================== +Target Description +------------ ------------------------------------------ +avx, avx1 AVX (2010-2011 era Intel CPUs) +avx1.1 AVX 1.1 (2012 era "Ivybridge" Intel CPUs) +avx2 AVX 2 target (2013- Intel "Haswell" CPUs) +neon ARM NEON +sse2 SSE2 (early 2000s era x86 CPUs) +sse4 SSE4 (generally 2008-2010 Intel CPUs) +============ ========================================== + +Consult your CPU's manual for specifics on which vector instruction set it +supports. + +The mask size may be 8, 16, or 32 bits, though not all combinations of ISAs +and mask sizes are supported. For best performance, the best general +approach is to choose a mask size equal to the size of the most common +datatype in your programs. For example, if most of your computation is on +32-bit floating-point values, an ``i32`` target is appropriate. However, +if you're mostly doing computation on 8-bit images, ``i8`` is a better choice. See `Basic Concepts: Program Instances and Gangs of Program Instances`_ for more discussion of the "gang size" and its implications for program execution. -instruction sets. (As general context, SSE2 was first introduced in -processors that shipped in 2001, SSE4 was introduced in 2007, and -processors with AVX were introduced in 2010, and AVX2 arrived in 2013. -Consult your CPU's -manual for specifics on which vector instruction set it supports.) +Running ``ispc --help`` and looking at the output for the ``--target`` +option gives the most up-to-date documentation about which targets your +compiler binary supports. 
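+For example (an illustrative invocation only; the input and output file
+names are arbitrary), a kernel that mostly processes 8-bit image data
+could be compiled for an SSE4 system with::
+
+   ispc foo.ispc -o foo.obj --target=sse4-i8x16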
+ +The naming scheme for compilation targets changed in August 2013; the +following table shows the relationship between names in the old scheme and +in the new scheme: + +============= =========== +Target Former Name +------------- ----------- +avx1-i32x8 avx, avx1 +avx1-i32x16 avx-x2 +avx1.1-i32x8 avx1.1 +avx1.1-i32x16 avx1.1-x2 +avx2-i32x8 avx2 +avx2-i32x16 avx2-x2 +neon-8 n/a +neon-16 n/a +neon-32 n/a +sse2-i32x4 sse2 +sse2-i32x8 sse2-x2 +sse4-i32x4 sse4 +sse4-i32x8 sse4-x2 +sse4-i8x16 n/a +sse4-i16x8 n/a +============= =========== By default, the target instruction set is chosen based on the most capable one supported by the system on which you're running ``ispc``. You can override this choice with the ``--target`` flag; for example, to select -Intel® SSE2, use ``--target=sse2``. (As with the other options in this -section, see the output of ``ispc --help`` for a full list of supported -targets.) +Intel® SSE2 with a 32-bit mask and 4 program instances in a gang, use +``--target=sse2-i32x4``. (As with the other options in this section, see +the output of ``ispc --help`` for a full list of supported targets.) Generating Generic C++ Output ----------------------------- diff --git a/ispc.cpp b/ispc.cpp index a012b08d..8a0f16c6 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -106,7 +106,7 @@ static void __cpuidex(int info[4], int level, int count) { static const char * lGetSystemISA() { #ifdef __arm__ - return "neon-32"; + return "neon-i32x4"; #else int info[4]; __cpuid(info, 1); @@ -121,19 +121,19 @@ lGetSystemISA() { int info2[4]; __cpuidex(info2, 7, 0); if ((info2[1] & (1 << 5)) != 0) - return "avx2"; + return "avx2-i32x8"; else - return "avx1.1"; + return "avx1.1-i32x8"; } // Regular AVX - return "avx"; + return "avx-i32x8"; } else if ((info[2] & (1 << 19)) != 0) - return "sse4"; + return "sse4-i32x4"; else if ((info[3] & (1 << 26)) != 0) - return "sse2"; + return "sse2-i32x4"; else { - fprintf(stderr, "Unable to detect supported SSE/AVX ISA. Exiting.\n"); + Error(SourcePos(), "Unable to detect supported SSE/AVX ISA. Exiting."); exit(1); } #endif @@ -186,22 +186,22 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // If a CPU was specified explicitly, try to pick the best // possible ISA based on that. if (!strcmp(cpu, "core-avx2")) - isa = "avx2"; + isa = "avx2-i32x8"; #ifdef ISPC_ARM_ENABLED else if (!strcmp(cpu, "cortex-a9") || !strcmp(cpu, "cortex-a15")) - isa = "neon-32"; + isa = "neon-i32x4"; #endif else if (!strcmp(cpu, "core-avx-i")) - isa = "avx1.1"; + isa = "avx1.1-i32x8"; else if (!strcmp(cpu, "sandybridge") || !strcmp(cpu, "corei7-avx")) - isa = "avx"; + isa = "avx-i32x8"; else if (!strcmp(cpu, "corei7") || !strcmp(cpu, "penryn")) - isa = "sse4"; + isa = "sse4-i32x4"; else - isa = "sse2"; + isa = "sse2-i32x4"; Warning(SourcePos(), "No --target specified on command-line. " "Using ISA \"%s\" based on specified CPU \"%s\".", isa, cpu); @@ -211,7 +211,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // supports. isa = lGetSystemISA(); Warning(SourcePos(), "No --target specified on command-line. " - "Using system ISA \"%s\".", isa); + "Using default system target \"%s\".", isa); } } @@ -241,8 +241,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } } if (foundCPU == false) { - fprintf(stderr, "Error: CPU type \"%s\" unknown. Supported CPUs: " - "%s.\n", cpu, SupportedTargetCPUs().c_str()); + Error(SourcePos(), "Error: CPU type \"%s\" unknown. 
Supported CPUs: " + "%s.", cpu, SupportedCPUs().c_str()); return; } } @@ -283,7 +283,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } // Check default LLVM generated targets - if (!strcasecmp(isa, "sse2")) { + if (!strcasecmp(isa, "sse2") || + !strcasecmp(isa, "sse2-i32x4")) { this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; @@ -291,7 +292,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "sse2-x2")) { + else if (!strcasecmp(isa, "sse2-x2") || + !strcasecmp(isa, "sse2-i32x8")) { this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 8; @@ -299,7 +301,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "sse4")) { + else if (!strcasecmp(isa, "sse4") || + !strcasecmp(isa, "sse4-i32x4")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; @@ -308,7 +311,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "sse4x2") || !strcasecmp(isa, "sse4-x2")) { + else if (!strcasecmp(isa, "sse4x2") || + !strcasecmp(isa, "sse4-x2") || + !strcasecmp(isa, "sse4-i32x8")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 8; @@ -316,7 +321,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "sse4-8")) { + else if (!strcasecmp(isa, "sse4-i8x16")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; @@ -324,7 +329,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 8; } - else if (!strcasecmp(isa, "sse4-16")) { + else if (!strcasecmp(isa, "sse4-i16x8")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; @@ -332,7 +337,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 16; } - else if (!strcasecmp(isa, "generic-4")) { + else if (!strcasecmp(isa, "generic-4") || + !strcasecmp(isa, "generic-x4")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; @@ -342,7 +348,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-8")) { + else if (!strcasecmp(isa, "generic-8") || + !strcasecmp(isa, "generic-x8")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; @@ -352,7 +359,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-16")) { + else if (!strcasecmp(isa, "generic-16") || + !strcasecmp(isa, "generic-x16")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; @@ -362,7 +370,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - 
else if (!strcasecmp(isa, "generic-32")) { + else if (!strcasecmp(isa, "generic-32") || + !strcasecmp(isa, "generic-x32")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 32; this->m_vectorWidth = 32; @@ -372,7 +381,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-64")) { + else if (!strcasecmp(isa, "generic-64") || + !strcasecmp(isa, "generic-x64")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 64; this->m_vectorWidth = 64; @@ -382,14 +392,17 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-1")) { + else if (!strcasecmp(isa, "generic-1") || + !strcasecmp(isa, "generic-x1")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 1; this->m_vectorWidth = 1; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "avx") || !strcasecmp(isa, "avx1")) { + else if (!strcasecmp(isa, "avx") || + !strcasecmp(isa, "avx1") || + !strcasecmp(isa, "avx1-i32x8")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; @@ -397,7 +410,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "avx-x2") || !strcasecmp(isa, "avx1-x2")) { + else if (!strcasecmp(isa, "avx-x2") || + !strcasecmp(isa, "avx1-x2") || + !strcasecmp(isa, "avx1-i32x16")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 16; @@ -405,7 +420,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "avx1.1")) { + else if (!strcasecmp(isa, "avx1.1") || + !strcasecmp(isa, "avx1.1-i32x8")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; @@ -418,7 +434,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasRand = true; #endif } - else if (!strcasecmp(isa, "avx1.1-x2")) { + else if (!strcasecmp(isa, "avx1.1-x2") || + !strcasecmp(isa, "avx1.1-i32x16")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 16; @@ -431,7 +448,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasRand = true; #endif } - else if (!strcasecmp(isa, "avx2")) { + else if (!strcasecmp(isa, "avx2") || + !strcasecmp(isa, "avx2-i32x8")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; @@ -449,7 +467,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasGather = true; #endif } - else if (!strcasecmp(isa, "avx2-x2")) { + else if (!strcasecmp(isa, "avx2-x2") || + !strcasecmp(isa, "avx2-i32x16")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; @@ -468,7 +487,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : #endif } #ifdef ISPC_ARM_ENABLED - else if (!strcasecmp(isa, "neon-8")) { + else if (!strcasecmp(isa, "neon-i8x16")) { this->m_isa = Target::NEON8; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; @@ -477,7 +496,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree 
= false; this->m_maskBitCount = 8; } - else if (!strcasecmp(isa, "neon-16")) { + else if (!strcasecmp(isa, "neon-i16x8")) { this->m_isa = Target::NEON16; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; @@ -486,7 +505,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 16; } - else if (!strcasecmp(isa, "neon-32") || !strcasecmp(isa, "neon")) { + else if (!strcasecmp(isa, "neon") || + !strcasecmp(isa, "neon-i32x4")) { this->m_isa = Target::NEON32; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; @@ -497,8 +517,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } #endif else { - fprintf(stderr, "Target ISA \"%s\" is unknown. Choices are: %s\n", - isa, SupportedTargetISAs()); + Error(SourcePos(), "Target \"%s\" is unknown. Choices are: %s.", + isa, SupportedTargets()); error = true; } @@ -592,7 +612,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : std::string -Target::SupportedTargetCPUs() { +Target::SupportedCPUs() { std::string ret; int count = sizeof(supportedCPUs) / sizeof(supportedCPUs[0]); for (int i = 0; i < count; ++i) { @@ -605,7 +625,7 @@ Target::SupportedTargetCPUs() { const char * -Target::SupportedTargetArchs() { +Target::SupportedArchs() { return #ifdef ISPC_ARM_ENABLED "arm, " @@ -615,14 +635,18 @@ Target::SupportedTargetArchs() { const char * -Target::SupportedTargetISAs() { +Target::SupportedTargets() { return #ifdef ISPC_ARM_ENABLED - "neon-8, neon-16, neon-32, " + "neon-i8x16, neon-16x8, neon-32x4, " #endif - "sse2, sse2-x2, sse4, sse4-8, sse4-16, sse4-x2, " - "avx, avx-x2, avx1.1, avx1.1-x2, avx2, avx2-x2, " - "generic-1, generic-4, generic-8, generic-16, generic-32"; + "sse2-i32x4, sse2-i32x8, " + "sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, " + "avx1-i32x8, avx1-i32x16, " + "avx1.1-i32x8, avx1.1-i32x16, " + "avx2-i32x8, avx2-i32x16, " + "generic-x1, generic-x4, generic-x8, generic-x16, " + "generic-x32, generic-x64"; } diff --git a/ispc.h b/ispc.h index 25a03e1d..fc78e415 100644 --- a/ispc.h +++ b/ispc.h @@ -192,16 +192,16 @@ public: Target(const char *arch, const char *cpu, const char *isa, bool pic); /** Returns a comma-delimited string giving the names of the currently - supported target ISAs. */ - static const char *SupportedTargetISAs(); + supported compilation targets. */ + static const char *SupportedTargets(); /** Returns a comma-delimited string giving the names of the currently - supported target CPUs. */ - static std::string SupportedTargetCPUs(); + supported CPUs. */ + static std::string SupportedCPUs(); /** Returns a comma-delimited string giving the names of the currently - supported target architectures. */ - static const char *SupportedTargetArchs(); + supported architectures. */ + static const char *SupportedArchs(); /** Returns a triple string specifying the target architecture, vendor, and environment. 
*/ diff --git a/main.cpp b/main.cpp index 7290d3c8..94edb73f 100644 --- a/main.cpp +++ b/main.cpp @@ -85,13 +85,16 @@ usage(int ret) { printf(" \t\taddressing calculations are done by default, even\n"); printf(" \t\ton 64-bit target architectures.)\n"); printf(" [--arch={%s}]\t\tSelect target architecture\n", - Target::SupportedTargetArchs()); + Target::SupportedArchs()); printf(" [--c++-include-file=]\t\tSpecify name of file to emit in #include statement in generated C++ code.\n"); #ifndef ISPC_IS_WINDOWS printf(" [--colored-output]\t\tAlways use terminal colors in error/warning messages.\n"); #endif - printf(" [--cpu=]\t\t\tSelect target CPU type\n"); - printf(" ={%s}\n", Target::SupportedTargetCPUs().c_str()); + printf(" "); + char cpuHelp[2048]; + sprintf(cpuHelp, "[--cpu=]\t\t\tSelect target CPU type\n={%s}\n", + Target::SupportedCPUs().c_str()); + PrintWithWordBreaks(cpuHelp, 16, TerminalWidth(), stdout); printf(" [-D]\t\t\t\t#define given value when running preprocessor\n"); printf(" [--dev-stub ]\t\tEmit device-side offload stub functions to file\n"); printf(" [--emit-asm]\t\t\tGenerate assembly language file as output\n"); @@ -126,7 +129,11 @@ usage(int ret) { printf(" [--pic]\t\t\t\tGenerate position-independent code\n"); #endif // !ISPC_IS_WINDOWS printf(" [--quiet]\t\t\t\tSuppress all output\n"); - printf(" [--target=]\t\t\tSelect target ISA. ={%s}\n", Target::SupportedTargetISAs()); + printf(" "); + char targetHelp[2048]; + sprintf(targetHelp, "[--target=]\t\t\tSelect target ISA and width.\n" + "={%s}", Target::SupportedTargets()); + PrintWithWordBreaks(targetHelp, 24, TerminalWidth(), stdout); printf(" [--version]\t\t\t\tPrint ispc version\n"); printf(" [--werror]\t\t\t\tTreat warnings as errors\n"); printf(" [--woff]\t\t\t\tDisable warnings\n"); diff --git a/run_tests.py b/run_tests.py index c9dd8b76..3225c7fd 100755 --- a/run_tests.py +++ b/run_tests.py @@ -37,7 +37,7 @@ parser.add_option("-g", "--generics-include", dest="include_file", help="Filenam parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", default="") parser.add_option('-t', '--target', dest='target', - help='Set compilation target (neon8, neon16, neon32, sse2, sse2-x2, sse4, sse4-x2, sse4-8, sse4-16, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', + help='Set compilation target (sse2-i32x4, sse2-i32x8, sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, avx1-i32x8, avx1-i32x16, avx1.1-i32x8, avx1.1-i32x16, avx2-i32x8, avx2-i32x16, generic-x1, generic-x4, generic-x8, generic-x16, generic-x32, generic-x64)', default="sse4") parser.add_option('-a', '--arch', dest='arch', help='Set architecture (arm, x86, x86-64)', diff --git a/util.cpp b/util.cpp index dbea9517..6b121988 100644 --- a/util.cpp +++ b/util.cpp @@ -79,8 +79,8 @@ compiler under a debuffer; in this case, just return a reasonable default. */ -static int -lTerminalWidth() { +int +TerminalWidth() { if (g->disableLineWrap) return 1<<30; @@ -228,8 +228,8 @@ lFindIndent(int numColons, const char *buf) { /** Print the given string to the given FILE, assuming the given output column width. Break words as needed to avoid words spilling past the last column. 
*/ -static void -lPrintWithWordBreaks(const char *buf, int indent, int columnWidth, FILE *out) { +void +PrintWithWordBreaks(const char *buf, int indent, int columnWidth, FILE *out) { #ifdef ISPC_IS_WINDOWS fputs(buf, out); fputs("\n", out); @@ -375,7 +375,7 @@ lPrint(const char *type, bool isError, SourcePos p, const char *fmt, return; printed.insert(formattedBuf); - lPrintWithWordBreaks(formattedBuf, indent, lTerminalWidth(), stderr); + PrintWithWordBreaks(formattedBuf, indent, TerminalWidth(), stderr); lPrintFileLineContext(p); free(errorBuf); diff --git a/util.h b/util.h index b247b8bd..7edf71f7 100644 --- a/util.h +++ b/util.h @@ -156,4 +156,18 @@ void GetDirectoryAndFileName(const std::string ¤tDir, bool VerifyDataLayoutCompatibility(const std::string &module_dl, const std::string &lib_dl); +/** Print the given string to the given FILE, assuming the given output + column width. Break words as needed to avoid words spilling past the + last column. */ +void PrintWithWordBreaks(const char *buf, int indent, int columnWidth, + FILE *out); + +/** Returns the width of the terminal where the compiler is running. + Finding this out may fail in a variety of reasonable situations (piping + compiler output to 'less', redirecting output to a file, running the + compiler under a debuffer; in this case, just return a reasonable + default. + */ +int TerminalWidth(); + #endif // ISPC_UTIL_H From 7ab4c5391cf5c00eae9e557e579402d2a76644fd Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Fri, 9 Aug 2013 19:56:43 -0700 Subject: [PATCH 030/124] Fix build with LLVM 3.2 and generic-4 / examples/sse4.h target. --- examples/intrinsics/sse4.h | 4 ++-- opt.cpp | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index 30f90b31..44dedf33 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -2532,7 +2532,7 @@ static FORCEINLINE int16_t __reduce_add_int8(__vec4_i8 v) { // TODO: improve int16_t ret = 0; for (int i = 0; i < 4; ++i) - ret += v.v[i]; + ret += __extract_element(v, i); return ret; } @@ -2540,7 +2540,7 @@ static FORCEINLINE int32_t __reduce_add_int16(__vec4_i16 v) { // TODO: improve int32_t ret = 0; for (int i = 0; i < 4; ++i) - ret += v.v[i]; + ret += __extract_element(v, i); return ret; } diff --git a/opt.cpp b/opt.cpp index 522e601b..75eae20c 100644 --- a/opt.cpp +++ b/opt.cpp @@ -4556,7 +4556,7 @@ PeepholePass::PeepholePass() : BasicBlockPass(ID) { } -#ifndef LLVM_3_1 +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) using namespace llvm::PatternMatch; @@ -4877,7 +4877,7 @@ lMatchAvgDownInt16(llvm::Value *inst) { } return NULL; } -#endif // !LLVM_3_1 +#endif // !LLVM_3_1 && !LLVM_3_2 bool PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) { @@ -4889,7 +4889,7 @@ PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) { llvm::Instruction *inst = &*iter; llvm::Instruction *builtinCall = NULL; -#ifndef LLVM_3_1 +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) if (!builtinCall) builtinCall = lMatchAvgUpUInt8(inst); if (!builtinCall) @@ -4906,7 +4906,7 @@ PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) { builtinCall = lMatchAvgDownInt8(inst); if (!builtinCall) builtinCall = lMatchAvgDownInt16(inst); -#endif // LLVM_3_1 +#endif // !LLVM_3_1 && !LLVM_3_2 if (builtinCall != NULL) { llvm::ReplaceInstWithInst(inst, builtinCall); modifiedAny = true; From ea8591a85a6ac494ce3395cfbeca17e196a3d463 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Sat, 10 Aug 2013 11:22:43 -0700 Subject: [PATCH 031/124] Fix build with LLVM 
top-of-tree (link libcurses) --- Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile b/Makefile index 5bac4a6e..69e24d41 100644 --- a/Makefile +++ b/Makefile @@ -79,6 +79,10 @@ CLANG_LIBS = -lclangFrontend -lclangDriver \ ISPC_LIBS=$(shell $(LLVM_CONFIG) --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \ -lpthread +ifeq ($(LLVM_VERSION),LLVM_3_4) + ISPC_LIBS += -lcurses +endif + ifeq ($(ARCH_OS),Linux) ISPC_LIBS += -ldl endif From 4766467271a9e6c0702eec04ebd6d8b9725db5f1 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Sat, 10 Aug 2013 11:23:39 -0700 Subject: [PATCH 032/124] Revert ispc.vcxproj to version from top-of-tree. --- ispc.vcxproj | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/ispc.vcxproj b/ispc.vcxproj index 53386c4c..36fbad5d 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -1,4 +1,4 @@ - + @@ -185,25 +185,6 @@ Building gen-bitcode-sse2-x2-64bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp - $(Configuration)/gen-bitcode-avx1-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx1-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > $(Configuration)/gen-bitcode-avx1-64bit.cpp - $(Configuration)/gen-bitcode-avx1-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx1-64bit.cpp - - -======= Document @@ -222,7 +203,6 @@ Building gen-bitcode-avx1-64bit.cpp ->>>>>>> master Document From ed017c42f1933ea1c57242f52cecb45507d9e324 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Sun, 11 Aug 2013 07:47:20 -0700 Subject: [PATCH 033/124] Fix ispc.vcxproj for Windows builds --- ispc.vcxproj | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/ispc.vcxproj b/ispc.vcxproj index 36fbad5d..74186ac0 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -53,8 +53,10 @@ - - + + + + 4146;4800;4996;4355;4624;4005;4003;4018 @@ -97,11 +99,13 @@ Document - %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py x86 > $(Configuration)/gen-stdlib-x86.cpp; -%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DISPC_TARGET_GENERIC=1 -DPI=3.1415926535 | python stdlib2cpp.py generic > $(Configuration)/gen-stdlib-generic.cpp; + %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask1 > $(Configuration)/gen-stdlib-mask1.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask8 > $(Configuration)/gen-stdlib-mask8.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask16 > $(Configuration)/gen-stdlib-mask16.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask32 > $(Configuration)/gen-stdlib-mask32.cpp; - $(Configuration)/gen-stdlib-generic.cpp;$(Configuration)/gen-stdlib-x86.cpp - Building gen-stdlib-{generic,x86}.cpp + 
$(Configuration)/gen-stdlib-mask1.cpp;$(Configuration)/gen-stdlib-mask8.cpp;$(Configuration)/gen-stdlib-mask16.cpp;$(Configuration)/gen-stdlib-mask32.cpp + Building gen-stdlib-{mask1,8,16,32}.cpp From 42f31aed6901f131cf20eb7606db498f43192012 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 14 Aug 2013 11:02:45 -0700 Subject: [PATCH 034/124] Another attempt at fixing the Windows build (added sse4-8/sse4-16 targets). --- ispc.vcxproj | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/ispc.vcxproj b/ispc.vcxproj index 74186ac0..b4a8b764 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -51,6 +51,10 @@ + + + + @@ -135,6 +139,42 @@ Building gen-bitcode-sse4-64bit.cpp + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 32bit > $(Configuration)/gen-bitcode-sse4-8-32bit.cpp + $(Configuration)/gen-bitcode-sse4-8-32bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-8-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 64bit > $(Configuration)/gen-bitcode-sse4-8-64bit.cpp + $(Configuration)/gen-bitcode-sse4-8-64bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-8-64bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 32bit > $(Configuration)/gen-bitcode-sse4-16-32bit.cpp + $(Configuration)/gen-bitcode-sse4-16-32bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-16-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 64bit > $(Configuration)/gen-bitcode-sse4-16-64bit.cpp + $(Configuration)/gen-bitcode-sse4-16-64bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-16-64bit.cpp + + Document From 6be3c24ee5a6accc8157eb20f00d72da060d8644 Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Thu, 15 Aug 2013 15:24:46 -0400 Subject: [PATCH 035/124] Separate -O and -g --- main.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/main.cpp b/main.cpp index c6786c39..b4a02b9f 100644 --- a/main.cpp +++ b/main.cpp @@ -571,12 +571,6 @@ int main(int Argc, char *Argv[]) { } } - // If the user specified -g, then the default optimization level is 0. - // If -g wasn't specified, the default optimization level is 1 (full - // optimization). - if (debugSet && !optSet) - g->opt.level = 0; - if (g->enableFuzzTest) { if (g->fuzzTestSeed == -1) { #ifdef ISPC_IS_WINDOWS From d976da7559089fa9bdc033ad764c73793ad34598 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 20 Aug 2013 08:49:51 -0700 Subject: [PATCH 036/124] Speed up idiv test (dont test int32 as thoroughly) --- tests/idiv.ispc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/idiv.ispc b/tests/idiv.ispc index b7bd78dc..8738740b 100644 --- a/tests/idiv.ispc +++ b/tests/idiv.ispc @@ -44,7 +44,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { // randomly sample int32s... 
uniform RNGState state; seed_rng(&state, 1234); - for (uniform int i = 0; i < 1M; ++i) { + for (uniform int i = 0; i < 64k; ++i) { unsigned int32 num = random(&state); for (uniform unsigned int32 div = 2; div < 256; ++div) { if (__fast_idiv(num, div) != num/div) { @@ -54,7 +54,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { } } - for (uniform int64 i = 0; i < 1M; ++i) { + for (uniform int64 i = 0; i < 64k; ++i) { int32 num = random(&state); if (num < 0) continue; From e7f067d70cf03415fc350272daf0506b7184fa84 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 20 Aug 2013 09:04:52 -0700 Subject: [PATCH 037/124] Fix handling of __clock() builtin for "generic" targets. --- cbackend.cpp | 4 ++++ examples/intrinsics/generic-16.h | 20 ++++++++++++++++++++ examples/intrinsics/generic-32.h | 20 ++++++++++++++++++++ examples/intrinsics/generic-64.h | 20 ++++++++++++++++++++ examples/intrinsics/knc.h | 21 ++++++++++++++++++--- examples/intrinsics/knc2x.h | 19 ++++++++++++++++++- examples/intrinsics/sse4.h | 20 ++++++++++++++++++-- 7 files changed, 118 insertions(+), 6 deletions(-) diff --git a/cbackend.cpp b/cbackend.cpp index d23bcc20..7d4b4cfc 100644 --- a/cbackend.cpp +++ b/cbackend.cpp @@ -3704,6 +3704,7 @@ void CWriter::lowerIntrinsics(llvm::Function &F) { case llvm::Intrinsic::sadd_with_overflow: case llvm::Intrinsic::trap: case llvm::Intrinsic::objectsize: + case llvm::Intrinsic::readcyclecounter: // We directly implement these intrinsics break; default: @@ -4056,6 +4057,9 @@ bool CWriter::visitBuiltinCall(llvm::CallInst &I, llvm::Intrinsic::ID ID, return true; case llvm::Intrinsic::objectsize: return true; + case llvm::Intrinsic::readcyclecounter: + Out << "__clock()"; + return true; } } diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index 6d4fe1f4..d81101f7 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1759,3 +1759,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} + +#endif // !WIN32 + diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 12c4f84e..7e6c69d4 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -1827,3 +1827,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // WIN32 + +#undef FORCEINLINE diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index a3648f42..39124186 100644 --- a/examples/intrinsics/generic-64.h +++ 
b/examples/intrinsics/generic-64.h @@ -1960,3 +1960,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif + +#undef FORCEINLINE diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index 41c4cbc0..8baef8cb 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -2121,9 +2121,24 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, #endif } +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE #undef PRE_ALIGN #undef POST_ALIGN - - - diff --git a/examples/intrinsics/knc2x.h b/examples/intrinsics/knc2x.h index 5b6e5295..a1b1fc9d 100644 --- a/examples/intrinsics/knc2x.h +++ b/examples/intrinsics/knc2x.h @@ -2055,7 +2055,24 @@ static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec32_f *out0, __vec32 } */ +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE #undef PRE_ALIGN #undef POST_ALIGN - diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index 44dedf33..ff00d920 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -4000,6 +4000,22 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, #endif } +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE - - From 2b2905b567fec1725beff5064d6b0ffe21d93c38 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 20 Aug 2013 09:05:50 -0700 Subject: [PATCH 038/124] Fix (preexisting) bugs in generic-32/64.h with type of "__any", etc. This should be a bool, not a one-wide vector of bools. The equivalent fix was previously made in generic-16.h, but not made here. (Note that many tests are still failing with these targets, but at least they compile properly now.) 
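For context, these mask-reduction helpers are part of the interface the generic target headers expose to the C++ code ispc generates. Below is a minimal sketch of the corrected interface plus an illustrative call site; the real generic-32.h defines __vec32_i1 with more machinery, and the usage line is only an example, not taken from the headers:

#include <stdint.h>

// Sketch only: one mask bit per program instance, 32 lanes total.
struct __vec32_i1 { uint32_t v; };

static inline bool __any (__vec32_i1 mask) { return mask.v != 0; }            // some lane active
static inline bool __all (__vec32_i1 mask) { return mask.v == 0xFFFFFFFFul; } // every lane active
static inline bool __none(__vec32_i1 mask) { return mask.v == 0; }            // no lane active

// Returning plain bool (rather than a one-wide vector of bools) lets the
// generated code branch on the result directly, e.g. (illustrative):
//     if (__any(execMask)) { /* masked body */ }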
--- examples/intrinsics/generic-32.h | 12 ++++++------ examples/intrinsics/generic-64.h | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 7e6c69d4..531ed215 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -408,15 +408,15 @@ static FORCEINLINE uint64_t __movmsk(__vec32_i1 mask) { return (uint64_t)mask.v; } -static FORCEINLINE __vec32_i1 __any(__vec32_i1 mask) { +static FORCEINLINE bool __any(__vec32_i1 mask) { return (mask.v!=0); } -static FORCEINLINE __vec32_i1 __all(__vec32_i1 mask) { - return (mask.v==0xFFFFFFFF); +static FORCEINLINE bool __all(__vec32_i1 mask) { + return (mask.v==0xFFFFFFFFul); } -static FORCEINLINE __vec32_i1 __none(__vec32_i1 mask) { +static FORCEINLINE bool __none(__vec32_i1 mask) { return (mask.v==0); } @@ -1231,8 +1231,8 @@ REDUCE_ADD(double, __vec32_d, __reduce_add_double) REDUCE_MINMAX(double, __vec32_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec32_d, __reduce_max_double, >) -REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) -REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) +//REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +//REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) REDUCE_ADD(int64_t, __vec32_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_min_int32, <) diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index 39124186..bbeb007a 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -533,15 +533,15 @@ static FORCEINLINE uint64_t __movmsk(__vec64_i1 mask) { return (uint64_t)mask.v; } -static FORCEINLINE __vec64_i1 __any(__vec64_i1 mask) { +static FORCEINLINE bool __any(__vec64_i1 mask) { return (mask.v!=0); } -static FORCEINLINE __vec64_i1 __all(__vec64_i1 mask) { - return (mask.v==0xFFFFFFFFFFFFFFFF); +static FORCEINLINE bool __all(__vec64_i1 mask) { + return (mask.v==0xFFFFFFFFFFFFFFFFull); } -static FORCEINLINE __vec64_i1 __none(__vec64_i1 mask) { +static FORCEINLINE bool __none(__vec64_i1 mask) { return (mask.v==0); } @@ -1364,8 +1364,8 @@ REDUCE_ADD(double, __vec64_d, __reduce_add_double) REDUCE_MINMAX(double, __vec64_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec64_d, __reduce_max_double, >) -REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) -REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) +//REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +//REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) REDUCE_ADD(int64_t, __vec64_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec64_i32, __reduce_min_int32, <) From 502f8fd76b9cf88cd260106b546494c1facc28b4 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 20 Aug 2013 09:22:09 -0700 Subject: [PATCH 039/124] Reduce debug spew on failing idiv.ispc tests --- tests/idiv.ispc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/idiv.ispc b/tests/idiv.ispc index 8738740b..bd0766da 100644 --- a/tests/idiv.ispc +++ b/tests/idiv.ispc @@ -4,12 +4,13 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { uniform int errorCount = 0; - + for (unsigned int8 num = 0; num < 255; ++num) { for (uniform unsigned int8 div = 2; div < 255; ++div) { if (__fast_idiv(num, div) != num/div) { ++errorCount; print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 32) break; } } } @@ -19,6 +20,7 @@ export void f_f(uniform float RET[], uniform float 
aFOO[]) { if (__fast_idiv(num, div) != num/div) { ++errorCount; print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 64) break; } } } @@ -28,6 +30,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { if (__fast_idiv(num, div) != num/div) { ++errorCount; print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 96) break; } } } @@ -37,6 +40,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { if (__fast_idiv(num, div) != num/div) { ++errorCount; print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 128) break; } } } @@ -50,6 +54,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { if (__fast_idiv(num, div) != num/div) { ++errorCount; print("ui32 error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 160) break; } } } @@ -62,6 +67,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { if (__fast_idiv(num, div) != num/div) { ++errorCount; print("si32 error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 192) break; } } } From 60b413a9cb9b30dc2c6e1f9c345bdf19286f9114 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Wed, 21 Aug 2013 19:25:30 +0400 Subject: [PATCH 040/124] Adding --non-interactive switch to run_tests.py --- run_tests.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/run_tests.py b/run_tests.py index 7c6b1eb8..74407ce4 100755 --- a/run_tests.py +++ b/run_tests.py @@ -55,6 +55,8 @@ parser.add_option('--wrap-exe', dest='wrapexe', default="") parser.add_option('--time', dest='time', help='Enable time output', default=False, action="store_true") +parser.add_option('--non-interactive', dest='non_interactive', help='Disable interactive status updates', + default=False, action="store_true") (options, args) = parser.parse_args() @@ -162,14 +164,15 @@ total_tests = 0 # finished. Should be called with the lock held.. def update_progress(fn, total_tests_arg, counter, max_test_length_arg): counter.value += 1 - progress_str = " Done %d / %d [%s]" % (counter.value, total_tests_arg, fn) - # spaces to clear out detrius from previous printing... - spaces_needed = max_test_length_arg - len(fn) - for x in range(spaces_needed): - progress_str += ' ' - progress_str += '\r' - sys.stdout.write(progress_str) - sys.stdout.flush() + if options.non_interactive == False: + progress_str = " Done %d / %d [%s]" % (counter.value, total_tests_arg, fn) + # spaces to clear out detrius from previous printing... + spaces_needed = max_test_length_arg - len(fn) + for x in range(spaces_needed): + progress_str += ' ' + progress_str += '\r' + sys.stdout.write(progress_str) + sys.stdout.flush() def run_command(cmd): if options.verbose: @@ -489,11 +492,8 @@ if __name__ == '__main__': # (i.e. 
return 0 if all is ok) for t in task_threads: t.join() - sys.stdout.write("\n") - - elapsed_time = time.time() - start_time - if options.time: - sys.stdout.write("Elapsed time: %d s\n" % elapsed_time) + if options.non_interactive == False: + sys.stdout.write("\n") while not qret.empty(): (c, r, s) = qret.get() @@ -501,6 +501,8 @@ if __name__ == '__main__': run_error_files += r skip_files += s + if options.non_interactive: + sys.stdout.write(" Done %d / %d\n" % (finished_tests_counter.value, total_tests)) if len(skip_files) > 0: skip_files.sort() sys.stdout.write("%d / %d tests SKIPPED:\n" % (len(skip_files), total_tests)) @@ -517,4 +519,8 @@ if __name__ == '__main__': for f in run_error_files: sys.stdout.write("\t%s\n" % f) + elapsed_time = time.time() - start_time + if options.time: + sys.stdout.write("Elapsed time: %d s\n" % elapsed_time) + sys.exit(len(compile_error_files) + len(run_error_files)) From 5fb30939be6b4a7949c039c7b1db9b42eb478a22 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Wed, 21 Aug 2013 19:46:18 +0400 Subject: [PATCH 041/124] Fix for #564, using wrong ispc in run_tests.py --- run_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_tests.py b/run_tests.py index 74407ce4..e029b9a6 100755 --- a/run_tests.py +++ b/run_tests.py @@ -234,7 +234,7 @@ def add_prefix(path): else: input_prefix = "" path = input_prefix + path - path = os.path.normpath(path) + path = os.path.abspath(path) return path From f31a31478b7329cbaf6d8b7d50f30c0cc90996dc Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 22 Aug 2013 12:41:57 +0400 Subject: [PATCH 042/124] Moving time calculation earlier --- run_tests.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/run_tests.py b/run_tests.py index e029b9a6..710bd274 100755 --- a/run_tests.py +++ b/run_tests.py @@ -495,6 +495,8 @@ if __name__ == '__main__': if options.non_interactive == False: sys.stdout.write("\n") + elapsed_time = time.time() - start_time + while not qret.empty(): (c, r, s) = qret.get() compile_error_files += c @@ -519,7 +521,6 @@ if __name__ == '__main__': for f in run_error_files: sys.stdout.write("\t%s\n" % f) - elapsed_time = time.time() - start_time if options.time: sys.stdout.write("Elapsed time: %d s\n" % elapsed_time) From 611477e214f19e89657cd85252bb44e801573240 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 22 Aug 2013 07:50:25 -0700 Subject: [PATCH 043/124] Revert change to lEmitVaryingSelect(). Using vector select versus a store and masked load for varying vector selects seems to give worse code. This may be related to http://llvm.org/bugs/show_bug.cgi?id=16941. --- expr.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/expr.cpp b/expr.cpp index 856d363c..614cb5e5 100644 --- a/expr.cpp +++ b/expr.cpp @@ -3123,7 +3123,10 @@ static llvm::Value * lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test, llvm::Value *expr1, llvm::Value *expr2, const Type *type) { -#if !defined(LLVM_3_1) +#if 0 // !defined(LLVM_3_1) + // Though it should be equivalent, this seems to cause non-trivial + // performance regressions versus the below. This may be related to + // http://llvm.org/bugs/show_bug.cgi?id=16941. 
if (test->getType() != LLVMTypes::Int1VectorType) test = ctx->TruncInst(test, LLVMTypes::Int1VectorType); return ctx->SelectInst(test, expr1, expr2, "select"); From f620cdbaa1f6cfdad15218a28d7da025e2493c01 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Mon, 26 Aug 2013 14:04:59 +0400 Subject: [PATCH 044/124] Changes in perf.py functionality, unification of examples, correction build warnings --- Makefile | 6 +- examples/aobench/ao.cpp | 6 +- examples/deferred/main.cpp | 2 +- examples/mandelbrot/mandelbrot.cpp | 2 +- .../mandelbrot_tasks/mandelbrot_tasks.cpp | 2 +- examples/noise/noise.cpp | 2 +- examples/perf.py | 153 +++++++++++++++--- examples/stencil/stencil.cpp | 2 +- examples/volume_rendering/volume.cpp | 2 +- main.cpp | 4 - 10 files changed, 140 insertions(+), 41 deletions(-) diff --git a/Makefile b/Makefile index 69e24d41..8d27cc80 100644 --- a/Makefile +++ b/Makefile @@ -113,8 +113,10 @@ CXX=g++ CPP=cpp OPT=-O2 CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -I$(CLANG_INCLUDE) \ - -Wall $(LLVM_VERSION_DEF) \ - -DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\"" + $(LLVM_VERSION_DEF) \ + -Wall \ + -DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\"" \ + -Werror -Wno-sign-compare ifneq ($(ARM_ENABLED), 0) CXXFLAGS+=-DISPC_ARM_ENABLED endif diff --git a/examples/aobench/ao.cpp b/examples/aobench/ao.cpp index cbe75a0b..2286316d 100644 --- a/examples/aobench/ao.cpp +++ b/examples/aobench/ao.cpp @@ -138,7 +138,7 @@ int main(int argc, char **argv) } // Report results and save image - printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n", + printf("[aobench ispc]:\t\t\t[%.3f] million cycles (%d x %d image)\n", minTimeISPC, width, height); savePPM("ao-ispc.ppm", width, height); @@ -158,7 +158,7 @@ int main(int argc, char **argv) } // Report results and save image - printf("[aobench ispc + tasks]:\t\t[%.3f] M cycles (%d x %d image)\n", + printf("[aobench ispc + tasks]:\t\t[%.3f] million cycles (%d x %d image)\n", minTimeISPCTasks, width, height); savePPM("ao-ispc-tasks.ppm", width, height); @@ -176,7 +176,7 @@ int main(int argc, char **argv) } // Report more results, save another image... 
- printf("[aobench serial]:\t\t[%.3f] M cycles (%d x %d image)\n", minTimeSerial, + printf("[aobench serial]:\t\t[%.3f] million cycles (%d x %d image)\n", minTimeSerial, width, height); printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks); diff --git a/examples/deferred/main.cpp b/examples/deferred/main.cpp index 17bd3f42..4f2be879 100644 --- a/examples/deferred/main.cpp +++ b/examples/deferred/main.cpp @@ -130,7 +130,7 @@ int main(int argc, char** argv) { printf("\t\t\t\t(%.2fx speedup from static ISPC, %.2fx from Cilk+ISPC)\n", serialCycles/ispcCycles, serialCycles/dynamicCilkCycles); #else - printf("\t\t\t\t(%.2fx speedup from ISPC)\n", serialCycles/ispcCycles); + printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", serialCycles/ispcCycles); #endif // __cilk DeleteInputData(input); diff --git a/examples/mandelbrot/mandelbrot.cpp b/examples/mandelbrot/mandelbrot.cpp index 7e73768f..d2bebb96 100644 --- a/examples/mandelbrot/mandelbrot.cpp +++ b/examples/mandelbrot/mandelbrot.cpp @@ -109,7 +109,7 @@ int main() { minSerial = std::min(minSerial, dt); } - printf("[mandelbrot serial]:\t\t[%.3f] millon cycles\n", minSerial); + printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial); writePPM(buf, width, height, "mandelbrot-serial.ppm"); printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC); diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.cpp b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp index dae22736..698daf0f 100644 --- a/examples/mandelbrot_tasks/mandelbrot_tasks.cpp +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp @@ -137,7 +137,7 @@ int main(int argc, char *argv[]) { minSerial = std::min(minSerial, dt); } - printf("[mandelbrot serial]:\t\t[%.3f] millon cycles\n", minSerial); + printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial); writePPM(buf, width, height, "mandelbrot-serial.ppm"); printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC); diff --git a/examples/noise/noise.cpp b/examples/noise/noise.cpp index 58552ce3..123f98c7 100644 --- a/examples/noise/noise.cpp +++ b/examples/noise/noise.cpp @@ -106,7 +106,7 @@ int main() { minSerial = std::min(minSerial, dt); } - printf("[noise serial]:\t\t\t[%.3f] millon cycles\n", minSerial); + printf("[noise serial]:\t\t\t[%.3f] million cycles\n", minSerial); writePPM(buf, width, height, "noise-serial.ppm"); printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC); diff --git a/examples/perf.py b/examples/perf.py index f96ef9ec..4b661b39 100755 --- a/examples/perf.py +++ b/examples/perf.py @@ -10,12 +10,22 @@ import glob import string import platform +def print_debug(line): + if options.silent == False: + sys.stdout.write(line) + +def print_file(line): + if options.output != "": + output = open(options.output, 'w') + output.writelines(line) + output.close() + def build_test(): global build_log global is_windows if is_windows == False: os.system("make clean >> "+build_log) - return os.system("make >> "+build_log+" 2>> "+build_log) + return os.system("make CXX="+ref_compiler+" CC="+refc_compiler+" >> "+build_log+" 2>> "+build_log) else: os.system("msbuild /t:clean >> " + build_log) return os.system("msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /t:rebuild >> " + build_log) @@ -30,7 +40,7 @@ def execute_test(command): return r #gathers all tests results and made an item test from answer structure -def run_test(command, c1, c2, test): +def run_test(command, c1, 
c2, test, b_serial): global perf_temp if build_test() != 0: sys.stdout.write("ERROR: Compilation fails\n") @@ -40,11 +50,13 @@ def run_test(command, c1, c2, test): return tasks = [] #list of results with tasks, it will be test[2] ispc = [] #list of results without tasks, it will be test[1] + absolute_tasks = [] #list of absolute results with tasks, it will be test[4] + absolute_ispc = [] #list of absolute results without tasks, ut will be test[3] + serial = [] #list serial times, it will be test[5] j = 1 for line in open(perf_temp): # we take test output if "speedup" in line: # we are interested only in lines with speedup if j == c1: # we are interested only in lines with c1 numbers - sys.stdout.write(line) line = line.expandtabs(0) line = line.replace("("," ") line = line.split(",") @@ -57,9 +69,42 @@ def run_test(command, c1, c2, test): ispc.append(number) c1 = c1 + c2 j+=1 + if "million cycles" in line: + if j == c1: + line = line.replace("]","[") + line = line.split("[") + number = float(line[3]) + if "tasks" in line[1]: + absolute_tasks.append(number) + else: + if "ispc" in line[1]: + absolute_ispc.append(number) + if "serial" in line[1]: + serial.append(number) + + if len(ispc) != 0: + if len(tasks) != 0: + print_debug("ISPC speedup / ISPC + tasks speedup / ISPC time / ISPC + tasks time / serial time\n") + for i in range(0,len(serial)): + print_debug("%10s /\t%10s\t /%9s / %10s\t /%10s\n" % + (ispc[i], tasks[i], absolute_ispc[i], absolute_tasks[i], serial[i])) + else: + print_debug("ISPC speedup / ISPC time / serial time\n") + for i in range(0,len(serial)): + print_debug("%10s /%9s /%10s\n" % (ispc[i], absolute_ispc[i], serial[i])) + else: + if len(tasks) != 0: + print_debug("ISPC + tasks speedup / ISPC + tasks time / serial time\n") + for i in range(0,len(serial)): + print_debug("%10s\t / %10s\t /%10s\n" % (tasks[i], absolute_tasks[i], serial[i])) + test[1] = test[1] + ispc test[2] = test[2] + tasks - + test[3] = test[3] + absolute_ispc + test[4] = test[4] + absolute_tasks + if b_serial == True: + #if we concatenate outputs we should use only the first serial answer. 
+ test[5] = test[5] + serial def cpu_get(): p = open("/proc/stat", 'r') @@ -113,30 +158,57 @@ def geomean(par): #test[0] - name of test #test[1] - list of results without tasks #test[2] - list of results with tasks -#test[1] or test[2] may be empty +#test[3] - list of absolute results without tasks +#test[4] - list of absolute results with tasks +#test[5] - list of absolute time without ISPC (serial) +#test[1..4] may be empty def print_answer(answer): - sys.stdout.write("Name of test:\t\tISPC:\tISPC + tasks:\n") - max_t = [0,0] - diff_t = [0,0] - geomean_t = [0,0] - list_of_max = [[],[]] + filelist = [] + print_debug("--------------------------------------------------------------------------\n") + print_debug("test name:\t ISPC speedup: ISPC + tasks speedup: | " + + "ISPC time: ISPC + tasks time: serial:\n") + filelist.append("test name,ISPC speedup,diff," + + "ISPC + tasks speedup,diff,ISPC time,diff,ISPC + tasks time,diff,serial,diff\n") + max_t = [0,0,0,0,0] + diff_t = [0,0,0,0,0] + geomean_t = [0,0,0,0,0] + list_of_max = [[],[],[],[],[]] for i in range(len(answer)): - for t in range(1,3): + for t in range(1,6): if len(answer[i][t]) == 0: max_t[t-1] = "n/a" diff_t[t-1] = "n/a" else: - list_of_max[t-1].append(max(answer[i][t])) - max_t[t-1] = str(max(answer[i][t])) - diff_t[t-1] = str(max(answer[i][t]) - min(answer[i][t])) - sys.stdout.write("%s:\n" % answer[i][0]) - sys.stdout.write("\t\tmax:\t%s\t%s\n" % (max_t[0], max_t[1])) - sys.stdout.write("\t\tdiff:\t%s\t%s\n" % (diff_t[0], diff_t[1])) + if t < 3: + mm = max(answer[i][t]) + else: + mm = min(answer[i][t]) + max_t[t-1] = '%.2f' % mm + list_of_max[t-1].append(mm) + diff_t[t-1] = '%.2f' % (max(answer[i][t]) - min(answer[i][t])) + print_debug("%s:\n" % answer[i][0]) + print_debug("\t\tmax:\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % + (max_t[0], max_t[1], max_t[2], max_t[3], max_t[4])) + print_debug("\t\tdiff:\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % + (diff_t[0], diff_t[1], diff_t[2], diff_t[3], diff_t[4])) + for t in range(0,5): + if max_t[t] == "n/a": + max_t[t] = "" + if diff_t[t] == "n/a": + diff_t[t] = "" + filelist.append(answer[i][0] + "," + + max_t[0] + "," + diff_t[0] + "," + max_t[1] + "," + diff_t[1] + "," + + max_t[2] + "," + diff_t[2] + "," + max_t[3] + "," + diff_t[3] + "," + + max_t[4] + "," + diff_t[4] + "\n") + for i in range(0,5): + geomean_t[i] = geomean(list_of_max[i]) + print_debug("---------------------------------------------------------------------------------\n") + print_debug("Geomean:\t\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % + (geomean_t[0], geomean_t[1], geomean_t[2], geomean_t[3], geomean_t[4])) + filelist.append("Geomean," + str(geomean_t[0]) + ",," + str(geomean_t[1]) + + ",," + str(geomean_t[2]) + ",," + str(geomean_t[3]) + ",," + str(geomean_t[4]) + "\n") + print_file(filelist) - geomean_t[0] = geomean(list_of_max[0]) - geomean_t[1] = geomean(list_of_max[1]) - sys.stdout.write("---------------------------------------------\n") - sys.stdout.write("Geomean:\t\t%s\t%s\n" % (geomean_t[0], geomean_t[1])) ###Main### # parsing options @@ -147,6 +219,12 @@ parser.add_option('-c', '--config', dest='config', help='config file of tests', default="./perf.ini") parser.add_option('-p', '--path', dest='path', help='path to examples directory', default="./") +parser.add_option('-s', '--silent', dest='silent', + help='silent mode, only table output', default=False, action="store_true") +parser.add_option('-o', '--output', dest='output', + help='output file for script reading', default="") +parser.add_option('--compiler', 
dest='compiler', + help='reference compiler', default="") (options, args) = parser.parse_args() global is_windows @@ -174,6 +252,14 @@ ref_compiler_exists = False if is_windows == False: compiler = "ispc" ref_compiler = "g++" + refc_compiler = "gcc" + if options.compiler != "": + if options.compiler == "clang" or options.compiler == "clang++": + ref_compiler = "clang++" + refc_compiler = "clang" + if options.compiler == "icc" or options.compiler == "icpc": + ref_compiler = "icpc" + refc_compiler = "icc" else: compiler = "ispc.exe" ref_compiler = "cl.exe" @@ -222,12 +308,27 @@ perf_temp = pwd + "perf_temp" i = 0 answer = [] -sys.stdout.write("Okey go go go!\n\n") +print_debug("Okey go go go!\n\n") +os.system(compiler + " --version >" + build_log) +version = open(build_log) +print_debug("Using test compiler: " + version.readline()) +version.close() + +if is_windows == False: + os.system(ref_compiler + " --version >" + build_log) +else: + os.system(ref_compiler + " 2>" + build_log + " 1>&2") + +version = open(build_log) +print_debug("Using reference compiler: " + version.readline()) +version.close() + + # loop for all tests while i < length-2: # we read name of test - sys.stdout.write("%s" % lines[i]) - test = [lines[i][:-1],[],[]] + print_debug("%s" % lines[i]) + test = [lines[i][:-1],[],[],[],[],[]] # read location of test folder = lines[i+1] folder = folder[:-1] @@ -257,10 +358,10 @@ while i < length-2: c2 = 1 next_line = lines[i+3] if next_line[0] == "^": #we should concatenate result of this test with previous one - run_test(command, c1, c2, answer[len(answer)-1]) + run_test(command, c1, c2, answer[len(answer)-1], False) i = i+1 else: #we run this test and append it's result to answer structure - run_test(command, c1, c2, test) + run_test(command, c1, c2, test, True) answer.append(test) # preparing next loop iteration os.chdir(pwd) diff --git a/examples/stencil/stencil.cpp b/examples/stencil/stencil.cpp index 9d5b3ee6..593d901f 100644 --- a/examples/stencil/stencil.cpp +++ b/examples/stencil/stencil.cpp @@ -130,7 +130,7 @@ int main() { minTimeSerial = std::min(minTimeSerial, dt); } - printf("[stencil serial]:\t\t[%.3f] millon cycles\n", minTimeSerial); + printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial); printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks); diff --git a/examples/volume_rendering/volume.cpp b/examples/volume_rendering/volume.cpp index 7d8b8e99..458cd407 100644 --- a/examples/volume_rendering/volume.cpp +++ b/examples/volume_rendering/volume.cpp @@ -204,7 +204,7 @@ int main(int argc, char *argv[]) { minSerial = std::min(minSerial, dt); } - printf("[volume serial]:\t\t[%.3f] millon cycles\n", minSerial); + printf("[volume serial]:\t\t[%.3f] million cycles\n", minSerial); writePPM(image, width, height, "volume-serial.ppm"); printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", diff --git a/main.cpp b/main.cpp index 58daa2d3..61c62042 100644 --- a/main.cpp +++ b/main.cpp @@ -328,7 +328,6 @@ int main(int Argc, char *Argv[]) { // as we're parsing below g = new Globals; - bool debugSet = false, optSet = false; Module::OutputType ot = Module::Object; bool generatePIC = false; const char *arch = NULL, *cpu = NULL, *target = NULL; @@ -371,7 +370,6 @@ int main(int Argc, char *Argv[]) { g->emitInstrumentation = true; else if (!strcmp(argv[i], "-g")) { g->generateDebuggingSymbols = true; - debugSet = true; } else if (!strcmp(argv[i], "--emit-asm")) ot = 
Module::Asm; @@ -496,12 +494,10 @@ int main(int Argc, char *Argv[]) { } else if (!strcmp(argv[i], "-O0")) { g->opt.level = 0; - optSet = true; } else if (!strcmp(argv[i], "-O") || !strcmp(argv[i], "-O1") || !strcmp(argv[i], "-O2") || !strcmp(argv[i], "-O3")) { g->opt.level = 1; - optSet = true; } else if (!strcmp(argv[i], "-")) ; From 443987f536a15adc384fe98284106208b2049eed Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 27 Aug 2013 15:33:44 +0400 Subject: [PATCH 045/124] fixing ispc.rst file properties (should not be executable) --- docs/ispc.rst | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 docs/ispc.rst diff --git a/docs/ispc.rst b/docs/ispc.rst old mode 100755 new mode 100644 From 5d8ebf3ca17ed18d21b89d4cacf6599220e9c293 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 27 Aug 2013 18:27:06 +0400 Subject: [PATCH 046/124] Fixing r183327-AVX2-GATHER.patch file permissions --- llvm_patches/r183327-AVX2-GATHER.patch | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 llvm_patches/r183327-AVX2-GATHER.patch diff --git a/llvm_patches/r183327-AVX2-GATHER.patch b/llvm_patches/r183327-AVX2-GATHER.patch old mode 100755 new mode 100644 From be3a40e70b84a4615e36d5067e939d33f4da702e Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Tue, 27 Aug 2013 15:15:16 -0400 Subject: [PATCH 047/124] Fix for 3.4 --- ispc.cpp | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/ispc.cpp b/ispc.cpp index 8a0f16c6..2dd1a87d 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -288,7 +288,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; +#if defined(LLVM_3_4) + this->m_attributes = "+sse,+sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-popcnt"; +#else this->m_attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt"; +#endif this->m_maskingIsFree = false; this->m_maskBitCount = 32; } @@ -297,7 +301,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 8; +#if defined(LLVM_3_4) + this->m_attributes = "+sse,+sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-popcnt"; +#else this->m_attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt"; +#endif this->m_maskingIsFree = false; this->m_maskBitCount = 32; } @@ -307,7 +315,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; // TODO: why not sse42 and popcnt? 
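// Editor's note -- illustrative, not part of the patch. The #if defined(LLVM_3_4) blocks added
// here exist because the x86 feature strings ispc hands to LLVM are spelled differently for
// LLVM 3.4 ("+sse4.1", "+sse4.2", "+rdrnd") than for earlier releases ("+sse41", "+sse42",
// "+rdrand"). The follow-up "Fix build against 3.4" commit below tidies this up with adjacent
// string-literal concatenation, keeping only the version-dependent fragment under the #if,
// roughly:
//   this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
//   #if defined(LLVM_3_4)
//       ",+sse4.1,-sse4.2"
//   #else
//       ",+sse41,-sse42"
//   #endif
//       ;   // concatenated into a single literal at compile time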
+#if defined(LLVM_3_4) + this->m_attributes = "+sse,+sse2,+sse3,+sse4.1,-sse4.2,-sse4a,+ssse3,-popcnt,+cmov"; +#else this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; +#endif this->m_maskingIsFree = false; this->m_maskBitCount = 32; } @@ -317,7 +329,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 8; +#if defined(LLVM_3_4) + this->m_attributes = "+sse,+sse2,+sse3,+sse4.1,-sse4.2,-sse4a,+ssse3,-popcnt,+cmov"; +#else this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; +#endif this->m_maskingIsFree = false; this->m_maskBitCount = 32; } @@ -325,7 +341,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; +#if defined(LLVM_3_4) + this->m_attributes = "+sse,+sse2,+sse3,+sse4.1,-sse4.2,-sse4a,+ssse3,-popcnt,+cmov"; +#else this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; +#endif this->m_maskingIsFree = false; this->m_maskBitCount = 8; } @@ -333,7 +353,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; +#if defined(LLVM_3_4) + this->m_attributes = "+sse,+sse2,+sse3,+sse4.1,-sse4.2,-sse4a,+ssse3,-popcnt,+cmov"; +#else this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; +#endif this->m_maskingIsFree = false; this->m_maskBitCount = 16; } @@ -425,7 +449,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; +#if defined(LLVM_3_4) + this->m_attributes = "+avx,+popcnt,+cmov,+f16c,+rdrnd"; +#else this->m_attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand"; +#endif this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; @@ -453,7 +481,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; +#if defined(LLVM_3_4) + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrnd" +#else this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand" +#endif #ifndef LLVM_3_1 ",+fma" #endif // !LLVM_3_1 @@ -472,7 +504,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; +#if defined(LLVM_3_4) + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrnd" +#else this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand" +#endif #ifndef LLVM_3_1 ",+fma" #endif // !LLVM_3_1 From 28080b0c22bd84d4b1d5cf29759c9e1423739f7e Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Tue, 27 Aug 2013 16:56:00 -0400 Subject: [PATCH 048/124] Fix build against 3.4 --- ispc.cpp | 60 ++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 19 deletions(-) diff --git a/ispc.cpp b/ispc.cpp index 2dd1a87d..6d4b063d 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -288,11 +288,13 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; + this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt" #if defined(LLVM_3_4) - this->m_attributes = 
"+sse,+sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-popcnt"; + ",-sse4.1,-sse4.2" #else - this->m_attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt"; + ",-sse41,-sse42" #endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } @@ -301,11 +303,13 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 8; + this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt" #if defined(LLVM_3_4) - this->m_attributes = "+sse,+sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-popcnt"; + ",-sse4.1,-sse4.2" #else - this->m_attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt"; + ",-sse41,-sse42" #endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } @@ -315,11 +319,13 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; // TODO: why not sse42 and popcnt? + this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" #if defined(LLVM_3_4) - this->m_attributes = "+sse,+sse2,+sse3,+sse4.1,-sse4.2,-sse4a,+ssse3,-popcnt,+cmov"; + ",+sse4.1,-sse4.2" #else - this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + ",+sse41,-sse42" #endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } @@ -329,11 +335,13 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 8; + this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" #if defined(LLVM_3_4) - this->m_attributes = "+sse,+sse2,+sse3,+sse4.1,-sse4.2,-sse4a,+ssse3,-popcnt,+cmov"; + ",+sse4.1,-sse4.2" #else - this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + ",+sse41,-sse42" #endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } @@ -341,11 +349,13 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; + this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" #if defined(LLVM_3_4) - this->m_attributes = "+sse,+sse2,+sse3,+sse4.1,-sse4.2,-sse4a,+ssse3,-popcnt,+cmov"; + ",+sse4.1,-sse4.2" #else - this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + ",+sse41,-sse42" #endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 8; } @@ -353,11 +363,13 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; + this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" #if defined(LLVM_3_4) - this->m_attributes = "+sse,+sse2,+sse3,+sse4.1,-sse4.2,-sse4a,+ssse3,-popcnt,+cmov"; + ",+sse4.1,-sse4.2" #else - this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + ",+sse41,-sse42" #endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 16; } @@ -449,11 +461,13 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; + this->m_attributes = "+avx,+popcnt,+cmov,+f16c" #if defined(LLVM_3_4) - this->m_attributes = "+avx,+popcnt,+cmov,+f16c,+rdrnd"; + ",+rdrnd" #else - this->m_attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand"; + ",+rdrand" #endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; 
this->m_hasHalf = true; @@ -467,7 +481,13 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 16; - this->m_attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand"; + this->m_attributes = "+avx,+popcnt,+cmov,+f16c" +#if defined(LLVM_3_4) + ",+rdrnd" +#else + ",+rdrand" +#endif + ; this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; @@ -481,10 +501,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" #if defined(LLVM_3_4) - this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrnd" + ",+rdrnd" #else - this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand" + ",+rdrand" #endif #ifndef LLVM_3_1 ",+fma" @@ -504,10 +525,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" #if defined(LLVM_3_4) - this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrnd" + ",+rdrnd" #else - this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand" + ",+rdrand" #endif #ifndef LLVM_3_1 ",+fma" From 501a23ad208c027c208c00a44f12c65824d6f7f3 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 29 Aug 2013 14:48:09 +0400 Subject: [PATCH 049/124] Typos fixes in docs --- docs/ispc.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/ispc.rst b/docs/ispc.rst index 26cf6be3..476046e8 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -3659,7 +3659,7 @@ command-line argument. Cross-Program Instance Operations --------------------------------- -``ispc`` programs are often used to expresses independently-executing +``ispc`` programs are often used to express independently-executing programs performing computation on separate data elements. (i.e. pure data-parallelism). However, it's often the case where it's useful for the program instances to be able to cooperate in computing results. The @@ -3690,7 +3690,7 @@ the running program instances. The ``rotate()`` function allows each program instance to find the value of the given value that their neighbor ``offset`` steps away has. For -example, on an 8-wide target, if ``offset`` has the value (1, 2, 3, 4, 5, +example, on an 8-wide target, if ``value`` has the value (1, 2, 3, 4, 5, 6, 7, 8) across the gang of running program instances, then ``rotate(value, -1)`` causes the first program instance to get the value 8, the second program instance to get the value 1, the third 2, and so forth. The @@ -3769,7 +3769,7 @@ where the ``i`` th element of ``x`` has been replaced with the value ``v`` Reductions ---------- -A number routines are available to evaluate conditions across the +A number of routines are available to evaluate conditions across the running program instances. 
For example, ``any()`` returns ``true`` if the given value ``v`` is ``true`` for any of the SPMD program instances currently running, ``all()`` returns ``true`` if it true From e06267ef1bab233a955c5182c4071969520ac7b8 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 29 Aug 2013 16:16:02 +0400 Subject: [PATCH 050/124] Fix for incorrect implementation of reduce_[min|max]_[float|double], it showed up as -O0 --- stdlib.ispc | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/stdlib.ispc b/stdlib.ispc index dc94d7e3..f7d135dd 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -918,9 +918,14 @@ static inline uniform float reduce_min(float v) { // For the lanes where the mask is off, replace the given value with // infinity, so that it doesn't affect the result. int iflt_max = 0x7f800000; // infinity - // Must use __floatbits_varying_int32, not floatbits(), since with the - // latter the current mask enters into the returned result... - return __reduce_min_float(__mask ? v : __floatbits_varying_int32(iflt_max)); + // unmasked block is needed to make sure that argument for unmasked + // function __reduce_min_float() are calculated without a mask. + bool test = __mask; + uniform float result; + unmasked { + result = __reduce_min_float(test ? v : floatbits(iflt_max)); + } + return result; } __declspec(safe) @@ -928,9 +933,14 @@ static inline uniform float reduce_max(float v) { // For the lanes where the mask is off, replace the given value with // negative infinity, so that it doesn't affect the result. const int iflt_neg_max = 0xff800000; // -infinity - // Must use __floatbits_varying_int32, not floatbits(), since with the - // latter the current mask enters into the returned result... - return __reduce_max_float(__mask ? v : __floatbits_varying_int32(iflt_neg_max)); + // unmasked block is needed to make sure that argument for unmasked + // function __reduce_max_float() are calculated without a mask. + bool test = __mask; + uniform float result; + unmasked { + result = __reduce_max_float(test ? v : floatbits(iflt_neg_max)); + } + return result; } __declspec(safe) @@ -986,17 +996,27 @@ static inline uniform double reduce_add(double x) { __declspec(safe) static inline uniform double reduce_min(double v) { int64 iflt_max = 0x7ff0000000000000; // infinity - // Must use __doublebits_varying_int64, not doublebits(), since with the - // latter the current mask enters into the returned result... - return __reduce_min_double(__mask ? v : __doublebits_varying_int64(iflt_max)); + // unmasked block is needed to make sure that argument for unmasked + // function __reduce_min_double() are calculated without a mask. + bool test = __mask; + uniform double result; + unmasked { + result = __reduce_min_double(test ? v : doublebits(iflt_max)); + } + return result; } __declspec(safe) static inline uniform double reduce_max(double v) { const int64 iflt_neg_max = 0xfff0000000000000; // -infinity - // Must use __doublebits_varying_int64, not doublebits(), since with the - // latter the current mask enters into the returned result... - return __reduce_max_double(__mask ? v : __doublebits_varying_int64(iflt_neg_max)); + // unmasked block is needed to make sure that argument for unmasked + // function __reduce_max_double() are calculated without a mask. + bool test = __mask; + uniform double result; + unmasked { + result = __reduce_max_double(test ? 
v : doublebits(iflt_neg_max)); + } + return result; } __declspec(safe) From 320b1700ff2c6f791d8477223f3d799a875089b5 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Fri, 30 Aug 2013 16:01:01 +0400 Subject: [PATCH 051/124] correction of adding -Werror option --- Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 8d27cc80..09ec302d 100644 --- a/Makefile +++ b/Makefile @@ -116,7 +116,10 @@ CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -I$(CLANG_INCLUDE) \ $(LLVM_VERSION_DEF) \ -Wall \ -DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\"" \ - -Werror -Wno-sign-compare + -Wno-sign-compare +ifneq ($(LLVM_VERSION),LLVM_3_1) + CXXFLAGS+=-Werror +endif ifneq ($(ARM_ENABLED), 0) CXXFLAGS+=-DISPC_ARM_ENABLED endif From 97d430d5cd87630a1074888476aceb110ebf4772 Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Fri, 30 Aug 2013 14:13:08 -0400 Subject: [PATCH 052/124] Fix to respect uniform/varying qualifiers inside of typedefs. --- decl.cpp | 25 +++++++++++++++++++++++-- sym.cpp | 11 +++++++++++ sym.h | 6 ++++++ 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/decl.cpp b/decl.cpp index e7b3cdef..8a10543b 100644 --- a/decl.cpp +++ b/decl.cpp @@ -69,8 +69,15 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) { if (type == NULL) return NULL; - if ((typeQualifiers & TYPEQUAL_CONST) != 0) + if ((typeQualifiers & TYPEQUAL_CONST) != 0) { type = type->GetAsConstType(); + } + + if ( ((typeQualifiers & TYPEQUAL_UNIFORM) != 0) + && ((typeQualifiers & TYPEQUAL_VARYING) != 0) ) { + Error(pos, "Type \"%s\" cannot be qualified with both uniform and varying.", + type->GetString().c_str()); + } if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0) { if (Type::Equal(type, AtomicType::Void)) @@ -84,9 +91,10 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) { else type = type->GetAsVaryingType(); } - else + else { if (Type::Equal(type, AtomicType::Void) == false) type = type->GetAsUnboundVariabilityType(); + } if ((typeQualifiers & TYPEQUAL_UNSIGNED) != 0) { if ((typeQualifiers & TYPEQUAL_SIGNED) != 0) @@ -124,6 +132,17 @@ DeclSpecs::DeclSpecs(const Type *t, StorageClass sc, int tq) { typeQualifiers = tq; soaWidth = 0; vectorSize = 0; + if (t != NULL) { + if (m->symbolTable->ContainsType(t)) { + // Typedefs might have uniform/varying qualifiers inside. 
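// Editor's note -- illustrative, not part of the patch. The check below means a declaration
// that picks up its variability through a typedef, e.g. (hypothetical ispc source)
//   typedef varying float vfloat;
//   uniform vfloat x;
// now contributes TYPEQUAL_VARYING from the typedef in addition to TYPEQUAL_UNIFORM from the
// declaration itself, so the "cannot be qualified with both uniform and varying" diagnostic
// added to lApplyTypeQualifiers() above can catch the conflict.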
+ if (t->IsVaryingType()) { + typeQualifiers |= TYPEQUAL_VARYING; + } + else if (t->IsUniformType()) { + typeQualifiers |= TYPEQUAL_UNIFORM; + } + } + } } @@ -229,6 +248,7 @@ Declarator::Declarator(DeclaratorKind dk, SourcePos p) void Declarator::InitFromDeclSpecs(DeclSpecs *ds) { const Type *baseType = ds->GetBaseType(pos); + InitFromType(baseType, ds); if (type == NULL) { @@ -591,6 +611,7 @@ Declaration::Declaration(DeclSpecs *ds, Declarator *d) { } + std::vector Declaration::GetVariableDeclarations() const { Assert(declSpecs->storageClass != SC_TYPEDEF); diff --git a/sym.cpp b/sym.cpp index f16f5e11..05f9996a 100644 --- a/sym.cpp +++ b/sym.cpp @@ -214,6 +214,17 @@ SymbolTable::LookupType(const char *name) const { return NULL; } +bool +SymbolTable::ContainsType(const Type *type) const { + TypeMapType::const_iterator iter = types.begin(); + while (iter != types.end()) { + if (iter->second == type) { + return true; + } + iter++; + } + return false; +} std::vector SymbolTable::ClosestVariableOrFunctionMatch(const char *str) const { diff --git a/sym.h b/sym.h index efb532a3..761c3612 100644 --- a/sym.h +++ b/sym.h @@ -219,6 +219,12 @@ public: @return Pointer to the Type, if found; otherwise NULL is returned. */ const Type *LookupType(const char *name) const; + + /** Look for a type given a pointer. + + @return True if found, False otherwise. + */ + bool ContainsType(const Type * type) const; /** This method returns zero or more strings with the names of symbols in the symbol table that nearly (but not exactly) match the given From 8db378b26565e6263f523faa335f10651078551f Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Wed, 4 Sep 2013 16:01:58 -0400 Subject: [PATCH 053/124] Revert "Remove support for using SVML for math lib routines." This reverts commit d9c38b5c1f6c1ccb4920465789b9e3d451e302a8. 
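Editor's note, not part of the patch: restoring SVML support means ispc's transcendental math
calls can again be routed to Intel's Short Vector Math Library. Going by the docs/ispc.rst and
main.cpp changes later in this commit, usage would look roughly like the hypothetical commands
below -- pick the library when compiling the ispc code, then let icc do the final link so the
SVML runtime is pulled in (file names are placeholders):

    ispc --math-lib=svml kernel.ispc -o kernel.o
    icc main.cpp kernel.o -o prog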
--- builtins.cpp | 11 ++++ builtins/target-avx-x2.ll | 17 ++++++ builtins/target-avx.ll | 17 ++++++ builtins/target-generic-1.ll | 98 +++++++++++++++++++++++++++++++ builtins/target-generic-common.ll | 16 +++++ builtins/target-neon-common.ll | 13 ++++ builtins/target-sse2-x2.ll | 86 +++++++++++++++++++++++++++ builtins/target-sse2.ll | 60 +++++++++++++++++++ builtins/target-sse4-16.ll | 15 +++++ builtins/target-sse4-8.ll | 15 +++++ builtins/target-sse4-x2.ll | 86 +++++++++++++++++++++++++++ builtins/target-sse4.ll | 60 +++++++++++++++++++ docs/ispc.rst | 3 + ispc.h | 2 +- main.cpp | 3 + stdlib.ispc | 72 +++++++++++++++++------ 16 files changed, 556 insertions(+), 18 deletions(-) diff --git a/builtins.cpp b/builtins.cpp index f3a0cf59..886eec15 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -581,6 +581,15 @@ lSetInternalFunctions(llvm::Module *module) { "__stdlib_sinf", "__stdlib_tan", "__stdlib_tanf", + "__svml_sin", + "__svml_cos", + "__svml_sincos", + "__svml_tan", + "__svml_atan", + "__svml_atan2", + "__svml_exp", + "__svml_log", + "__svml_pow", "__undef_uniform", "__undef_varying", "__vec4_add_float", @@ -1050,6 +1059,8 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod symbolTable); lDefineConstantInt("__math_lib_ispc_fast", (int)Globals::Math_ISPCFast, module, symbolTable); + lDefineConstantInt("__math_lib_svml", (int)Globals::Math_SVML, module, + symbolTable); lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module, symbolTable); lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index 8fb2e427..d9e0322b 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -134,6 +134,23 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly always ret <16 x float> %call } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +; FIXME: need either to wire these up to the 8-wide SVML entrypoints, +; or, use the macro to call the 4-wide ones 4x with our 16-wide +; vectors... + +declare <16 x float> @__svml_sin(<16 x float>) +declare <16 x float> @__svml_cos(<16 x float>) +declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) +declare <16 x float> @__svml_tan(<16 x float>) +declare <16 x float> @__svml_atan(<16 x float>) +declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) +declare <16 x float> @__svml_exp(<16 x float>) +declare <16 x float> @__svml_log(<16 x float>) +declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index adaed9ba..90e2f3ac 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -134,6 +134,23 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +; FIXME: need either to wire these up to the 8-wide SVML entrypoints, +; or, use the macro to call the 4-wide ones twice with our 8-wide +; vectors... 
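;; Editor's note (not part of the patch): of the two options in the FIXME above, the second is
;; what the 8-wide SSE targets later in this commit already do -- target-sse2-x2.ll and
;; target-sse4-x2.ll wrap the 4-wide SVML entry points with the unary4to8()/binary4to8()
;; macros, e.g.
;;   unary4to8(ret, float, @__svml_sinf4, %0)
;; which calls @__svml_sinf4 on each 4-wide half and reassembles the 8-wide result; the AVX
;; targets keep bare declarations for now.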
+ +declare <8 x float> @__svml_sin(<8 x float>) +declare <8 x float> @__svml_cos(<8 x float>) +declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) +declare <8 x float> @__svml_tan(<8 x float>) +declare <8 x float> @__svml_atan(<8 x float>) +declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) +declare <8 x float> @__svml_exp(<8 x float>) +declare <8 x float> @__svml_log(<8 x float>) +declare <8 x float> @__svml_pow(<8 x float>, <8 x float>) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 3472c207..31ebcdd5 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -647,6 +647,104 @@ define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alw } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; svml stuff + +define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0) + ;ret <1 x float> %ret + ;%r = extractelement <1 x float> %0, i32 0 + ;%s = call float @llvm.sin.f32(float %r) + ;%rv = insertelement <1 x float> undef, float %r, i32 0 + ;ret <1 x float> %rv + unary1to1(float,@llvm.sin.f32) + +} + +define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0) + ;ret <1 x float> %ret + ;%r = extractelement <1 x float> %0, i32 0 + ;%s = call float @llvm.cos.f32(float %r) + ;%rv = insertelement <1 x float> undef, float %r, i32 0 + ;ret <1 x float> %rv + unary1to1(float, @llvm.cos.f32) + +} + +define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline { +; %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0) +; store <1 x float> %s, <1 x float> * %1 +; ret void + %sin = call <1 x float> @__svml_sin (<1 x float> %0) + %cos = call <1 x float> @__svml_cos (<1 x float> %0) + store <1 x float> %sin, <1 x float> * %1 + store <1 x float> %cos, <1 x float> * %2 + ret void +} + +define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0) + ;ret <1 x float> %ret + ;%r = extractelement <1 x float> %0, i32 0 + ;%s = call float @llvm_tan_f32(float %r) + ;%rv = insertelement <1 x float> undef, float %r, i32 0 + ;ret <1 x float> %rv + ;unasry1to1(float, @llvm.tan.f32) + ; UNSUPPORTED! + ret <1 x float > %0 +} + +define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline { +; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0) +; ret <1 x float> %ret + ;%r = extractelement <1 x float> %0, i32 0 + ;%s = call float @llvm_atan_f32(float %r) + ;%rv = insertelement <1 x float> undef, float %r, i32 0 + ;ret <1 x float> %rv + ;unsary1to1(float,@llvm.atan.f32) + ;UNSUPPORTED! + ret <1 x float > %0 + +} + +define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1) + ;ret <1 x float> %ret + ;%y = extractelement <1 x float> %0, i32 0 + ;%x = extractelement <1 x float> %1, i32 0 + ;%q = fdiv float %y, %x + ;%a = call float @llvm.atan.f32 (float %q) + ;%rv = insertelement <1 x float> undef, float %a, i32 0 + ;ret <1 x float> %rv + ; UNSUPPORTED! 
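;; Editor's note (not part of the patch): __svml_tan, __svml_atan and __svml_atan2 in this
;; generic-1 target are stubs that simply return their first argument -- the commented-out
;; bodies above suggest there is no llvm.tan/llvm.atan intrinsic to lower onto, unlike
;; sin/cos/exp/log, which go through unary1to1() onto the llvm.sin.f32-style intrinsics
;; (and pow, which calls llvm.pow.f32 on the extracted scalar directly).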
+ ret <1 x float > %0 +} + +define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_expf4(<1 x float> %0) + ;ret <1 x float> %ret + unary1to1(float, @llvm.exp.f32) +} + +define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_logf4(<1 x float> %0) + ;ret <1 x float> %ret + unary1to1(float, @llvm.log.f32) +} + +define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1) + ;ret <1 x float> %ret + %r = extractelement <1 x float> %0, i32 0 + %e = extractelement <1 x float> %1, i32 0 + %s = call float @llvm.pow.f32(float %r,float %e) + %rv = insertelement <1 x float> undef, float %s, i32 0 + ret <1 x float> %rv + +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index c683ff45..2896c6b1 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -202,6 +202,22 @@ declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone declare i32 @__count_leading_zeros_i32(i32) nounwind readnone declare i64 @__count_leading_zeros_i64(i64) nounwind readnone +;; svml + +; FIXME: need either to wire these up to the 8-wide SVML entrypoints, +; or, use the macro to call the 4-wide ones twice with our 8-wide +; vectors... + +declare @__svml_sin() +declare @__svml_cos() +declare void @__svml_sincos(, *, *) +declare @__svml_tan() +declare @__svml_atan() +declare @__svml_atan2(, ) +declare @__svml_exp() +declare @__svml_log() +declare @__svml_pow(, ) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions diff --git a/builtins/target-neon-common.ll b/builtins/target-neon-common.ll index f892a0a1..696b0748 100644 --- a/builtins/target-neon-common.ll +++ b/builtins/target-neon-common.ll @@ -313,6 +313,19 @@ define void @__masked_store_blend_i64(* nocapture %ptr, ret void } +;; yuck. We need declarations of these, even though we shouldnt ever +;; actually generate calls to them for the NEON target... 
+ +declare @__svml_sin() +declare @__svml_cos() +declare void @__svml_sincos(, *, *) +declare @__svml_tan() +declare @__svml_atan() +declare @__svml_atan2(, ) +declare @__svml_exp() +declare @__svml_log() +declare @__svml_pow(, ) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 057ea98f..da22a66c 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -102,6 +102,92 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; svml stuff + +declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone +declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone + + +define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_sinf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_cosf4, %0) + ret <8 x float> %ret +} + +define void @__svml_sincos(<8 x float>, <8 x float> *, + <8 x float> *) nounwind readnone alwaysinline { + ; call svml_sincosf4 two times with the two 4-wide sub-vectors + %a = shufflevector <8 x float> %0, <8 x float> undef, + <4 x i32> + %b = shufflevector <8 x float> %0, <8 x float> undef, + <4 x i32> + + %cospa = alloca <4 x float> + %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) + + %cospb = alloca <4 x float> + %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) + + %sin = shufflevector <4 x float> %sa, <4 x float> %sb, + <8 x i32> + store <8 x float> %sin, <8 x float> * %1 + + %cosa = load <4 x float> * %cospa + %cosb = load <4 x float> * %cospb + %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, + <8 x i32> + store <8 x float> %cos, <8 x float> * %2 + + ret void +} + +define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_tanf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_atanf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_atan2(<8 x float>, + <8 x float>) nounwind readnone alwaysinline { + binary4to8(ret, float, @__svml_atan2f4, %0, %1) + ret <8 x float> %ret +} + +define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_expf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_logf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_pow(<8 x float>, + <8 x float>) nounwind readnone alwaysinline { + binary4to8(ret, float, @__svml_powf4, %0, %1) + ret <8 x float> %ret +} + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git 
a/builtins/target-sse2.ll b/builtins/target-sse2.ll index e0a5c3d5..a6b206b6 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -493,6 +493,66 @@ define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysin ret <4 x float> %call } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; svml stuff + +declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone +declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone + + +define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) + ret <4 x float> %ret +} + +define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { + %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) + store <4 x float> %s, <4 x float> * %1 + ret void +} + +define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) + ret <4 x float> %ret +} + +define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_expf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_logf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) + ret <4 x float> %ret +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index b4772552..d7f3833d 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -205,6 +205,21 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r ret <8 x double> %ret } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +; FIXME + +declare <8 x float> @__svml_sin(<8 x float>) +declare <8 x float> @__svml_cos(<8 x float>) +declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) +declare <8 x float> @__svml_tan(<8 x float>) +declare <8 x float> @__svml_atan(<8 x float>) +declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) +declare <8 x float> @__svml_exp(<8 x float>) +declare <8 x float> @__svml_log(<8 x float>) +declare <8 x float> @__svml_pow(<8 x float>, <8 x 
float>) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index a75d8e3a..fd4b74d7 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -217,6 +217,21 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin ret <16 x double> %ret } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +; FIXME + +declare <16 x float> @__svml_sin(<16 x float>) +declare <16 x float> @__svml_cos(<16 x float>) +declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) +declare <16 x float> @__svml_tan(<16 x float>) +declare <16 x float> @__svml_atan(<16 x float>) +declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) +declare <16 x float> @__svml_exp(<16 x float>) +declare <16 x float> @__svml_log(<16 x float>) +declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 897a09eb..a7faddb3 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -102,6 +102,92 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; svml stuff + +declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone +declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone + + +define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_sinf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_cosf4, %0) + ret <8 x float> %ret +} + +define void @__svml_sincos(<8 x float>, <8 x float> *, + <8 x float> *) nounwind readnone alwaysinline { + ; call svml_sincosf4 two times with the two 4-wide sub-vectors + %a = shufflevector <8 x float> %0, <8 x float> undef, + <4 x i32> + %b = shufflevector <8 x float> %0, <8 x float> undef, + <4 x i32> + + %cospa = alloca <4 x float> + %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) + + %cospb = alloca <4 x float> + %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) + + %sin = shufflevector <4 x float> %sa, <4 x float> %sb, + <8 x i32> + store <8 x float> %sin, <8 x float> * %1 + + %cosa = load <4 x float> * %cospa + %cosb = load <4 x float> * %cospb + %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, + <8 x i32> + store <8 x float> %cos, <8 x float> * %2 + + ret void +} + +define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_tanf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_atanf4, 
%0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_atan2(<8 x float>, + <8 x float>) nounwind readnone alwaysinline { + binary4to8(ret, float, @__svml_atan2f4, %0, %1) + ret <8 x float> %ret +} + +define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_expf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_logf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_pow(<8 x float>, + <8 x float>) nounwind readnone alwaysinline { + binary4to8(ret, float, @__svml_powf4, %0, %1) + ret <8 x float> %ret +} + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 5429b461..e05b865f 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -206,6 +206,66 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r ret <4 x double> %ret } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; svml stuff + +declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone +declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone + + +define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) + ret <4 x float> %ret +} + +define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { + %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) + store <4 x float> %s, <4 x float> * %1 + ret void +} + +define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) + ret <4 x float> %ret +} + +define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_expf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_logf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) + ret <4 x float> %ret +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/docs/ispc.rst b/docs/ispc.rst index 
476046e8..ff07f6d8 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -3333,6 +3333,9 @@ for this argument. approximately 1.45e-6 over the range -10pi to 10pi.) * ``fast``: more efficient but lower accuracy versions of the default ``ispc`` implementations. +* ``svml``: use Intel "Short Vector Math Library". Use + ``icc`` to link your final executable so that the appropriate libraries + are linked. * ``system``: use the system's math library. On many systems, these functions are more accurate than both of ``ispc``'s implementations. Using these functions may be quite diff --git a/ispc.h b/ispc.h index fc78e415..4804832f 100644 --- a/ispc.h +++ b/ispc.h @@ -488,7 +488,7 @@ struct Globals { /** There are a number of math libraries that can be used for transcendentals and the like during program compilation. */ - enum MathLib { Math_ISPC, Math_ISPCFast, Math_System }; + enum MathLib { Math_ISPC, Math_ISPCFast, Math_SVML, Math_System }; MathLib mathLib; /** Records whether the ispc standard library should be made available diff --git a/main.cpp b/main.cpp index 61c62042..21a47de8 100644 --- a/main.cpp +++ b/main.cpp @@ -112,6 +112,7 @@ usage(int ret) { printf(" [--math-lib= @@ -126,7 +130,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 32bit > $(Configuration)/gen-bitcode-sse4-32bit.cpp $(Configuration)/gen-bitcode-sse4-32bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-32bit.cpp @@ -135,7 +139,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll 64bit > $(Configuration)/gen-bitcode-sse4-64bit.cpp $(Configuration)/gen-bitcode-sse4-64bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-64bit.cpp @@ -144,7 +148,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 32bit > $(Configuration)/gen-bitcode-sse4-8-32bit.cpp $(Configuration)/gen-bitcode-sse4-8-32bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-8-32bit.cpp @@ -153,7 +157,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 64bit > $(Configuration)/gen-bitcode-sse4-8-64bit.cpp $(Configuration)/gen-bitcode-sse4-8-64bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-8-64bit.cpp @@ -162,7 +166,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 32bit > $(Configuration)/gen-bitcode-sse4-16-32bit.cpp $(Configuration)/gen-bitcode-sse4-16-32bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-16-32bit.cpp @@ -171,7 +175,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 64bit > 
$(Configuration)/gen-bitcode-sse4-16-64bit.cpp $(Configuration)/gen-bitcode-sse4-16-64bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-16-64bit.cpp @@ -180,7 +184,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 32bit > $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp $(Configuration)/gen-bitcode-sse4-x2-32bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-x2-32bit.cpp @@ -189,7 +193,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll 64bit > $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp $(Configuration)/gen-bitcode-sse4-x2-64bit.cpp - builtins\util.m4;builtins\target-sse4-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-x2-64bit.cpp @@ -198,7 +202,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 32bit > $(Configuration)/gen-bitcode-sse2-32bit.cpp $(Configuration)/gen-bitcode-sse2-32bit.cpp - builtins\util.m4;builtins\target-sse2-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2-32bit.cpp @@ -207,7 +211,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll 64bit > $(Configuration)/gen-bitcode-sse2-64bit.cpp $(Configuration)/gen-bitcode-sse2-64bit.cpp - builtins\util.m4;builtins\target-sse2-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2-64bit.cpp @@ -216,7 +220,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 32bit > $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp $(Configuration)/gen-bitcode-sse2-x2-32bit.cpp - builtins\util.m4;builtins\target-sse2-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2-x2-32bit.cpp @@ -225,7 +229,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll 64bit > $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp $(Configuration)/gen-bitcode-sse2-x2-64bit.cpp - builtins\util.m4;builtins\target-sse2-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2-x2-64bit.cpp @@ -234,7 +238,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp $(Configuration)/gen-bitcode-avx1-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx1-32bit.cpp @@ -243,7 +247,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > 
$(Configuration)/gen-bitcode-avx1-64bit.cpp $(Configuration)/gen-bitcode-avx1-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx1-64bit.cpp @@ -252,7 +256,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 32bit > $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp $(Configuration)/gen-bitcode-avx1-x2-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx1-x2-32bit.cpp @@ -261,16 +265,34 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-x2.ll | python bitcode2cpp.py builtins\target-avx1-x2.ll 64bit > $(Configuration)/gen-bitcode-avx1-x2-64bit.cpp $(Configuration)/gen-bitcode-avx1-x2-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx1-x2-64bit.cpp + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 32bit > $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp + $(Configuration)/gen-bitcode-avx1-i64x4-32bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll + Building gen-bitcode-avx1-i64x4-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1-i64x4.ll | python bitcode2cpp.py builtins\target-avx1-i64x4.ll 64bit > $(Configuration)/gen-bitcode-avx1-i64x4-64bit.cpp + $(Configuration)/gen-bitcode-avx1-i64x4-64bit.cpp + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll;builtins\target-avx1-i64x4base.ll + Building gen-bitcode-avx1-i64x4-64bit.cpp + + Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 32bit > $(Configuration)/gen-bitcode-avx11-32bit.cpp $(Configuration)/gen-bitcode-avx11-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx11-32bit.cpp @@ -279,7 +301,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11.ll | python bitcode2cpp.py builtins\target-avx11.ll 64bit > $(Configuration)/gen-bitcode-avx11-64bit.cpp $(Configuration)/gen-bitcode-avx11-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx11-64bit.cpp @@ -288,7 +310,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 32bit > $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp $(Configuration)/gen-bitcode-avx11-x2-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + 
builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx11-x2-32bit.cpp @@ -297,7 +319,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx11-x2.ll | python bitcode2cpp.py builtins\target-avx11-x2.ll 64bit > $(Configuration)/gen-bitcode-avx11-x2-64bit.cpp $(Configuration)/gen-bitcode-avx11-x2-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx11-x2-64bit.cpp @@ -306,7 +328,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 32bit > $(Configuration)/gen-bitcode-avx2-32bit.cpp $(Configuration)/gen-bitcode-avx2-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx2-32bit.cpp @@ -315,7 +337,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2.ll | python bitcode2cpp.py builtins\target-avx2.ll 64bit > $(Configuration)/gen-bitcode-avx2-64bit.cpp $(Configuration)/gen-bitcode-avx2-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx.ll Building gen-bitcode-avx2-64bit.cpp @@ -324,7 +346,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 32bit > $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp $(Configuration)/gen-bitcode-avx2-x2-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx2-x2-32bit.cpp @@ -333,7 +355,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx2-x2.ll | python bitcode2cpp.py builtins\target-avx2-x2.ll 64bit > $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp $(Configuration)/gen-bitcode-avx2-x2-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll + builtins\util.m4;builtins\svml.m4;builtins\target-avx-common.ll;builtins\target-avx-x2.ll Building gen-bitcode-avx2-x2-64bit.cpp @@ -342,7 +364,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 32bit > $(Configuration)/gen-bitcode-generic-1-32bit.cpp $(Configuration)/gen-bitcode-generic-1-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-1-32bit.cpp @@ -351,7 +373,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll 64bit > $(Configuration)/gen-bitcode-generic-1-64bit.cpp $(Configuration)/gen-bitcode-generic-1-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-1-64bit.cpp @@ -360,7 +382,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% 
-DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 32bit > $(Configuration)/gen-bitcode-generic-4-32bit.cpp $(Configuration)/gen-bitcode-generic-4-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-4-32bit.cpp @@ -369,7 +391,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll 64bit > $(Configuration)/gen-bitcode-generic-4-64bit.cpp $(Configuration)/gen-bitcode-generic-4-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-4-64bit.cpp @@ -378,7 +400,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 32bit > $(Configuration)/gen-bitcode-generic-8-32bit.cpp $(Configuration)/gen-bitcode-generic-8-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-8-32bit.cpp @@ -387,7 +409,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll 64bit > $(Configuration)/gen-bitcode-generic-8-64bit.cpp $(Configuration)/gen-bitcode-generic-8-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-8-64bit.cpp @@ -396,7 +418,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 32bit > $(Configuration)/gen-bitcode-generic-16-32bit.cpp $(Configuration)/gen-bitcode-generic-16-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-16-32bit.cpp @@ -405,7 +427,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll 64bit > $(Configuration)/gen-bitcode-generic-16-64bit.cpp $(Configuration)/gen-bitcode-generic-16-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-16-64bit.cpp @@ -414,7 +436,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 32bit > $(Configuration)/gen-bitcode-generic-32-32bit.cpp $(Configuration)/gen-bitcode-generic-32-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-32-32bit.cpp @@ -423,7 +445,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll 64bit > $(Configuration)/gen-bitcode-generic-32-64bit.cpp $(Configuration)/gen-bitcode-generic-32-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + 
builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-32-64bit.cpp @@ -432,7 +454,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 32bit > $(Configuration)/gen-bitcode-generic-64-32bit.cpp $(Configuration)/gen-bitcode-generic-64-32bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-64-32bit.cpp @@ -441,7 +463,7 @@ Document m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-generic-64.ll | python bitcode2cpp.py builtins\target-generic-64.ll 64bit > $(Configuration)/gen-bitcode-generic-64-64bit.cpp $(Configuration)/gen-bitcode-generic-64-64bit.cpp - builtins\util.m4;builtins\target-generic-common.ll + builtins\util.m4;builtins\svml.m4;builtins\target-generic-common.ll Building gen-bitcode-generic-64-64bit.cpp From 97068765e884599afc4cc4b7187a4de4dd509b46 Mon Sep 17 00:00:00 2001 From: Tomasz Koziara Date: Sat, 14 Sep 2013 18:09:04 +0100 Subject: [PATCH 070/124] Copyright reversed. --- examples/sort/sort.cpp | 4 ++-- examples/sort/sort.ispc | 4 ++-- examples/sort/sort_serial.cpp | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/sort/sort.cpp b/examples/sort/sort.cpp index 1d05b247..4f402c75 100644 --- a/examples/sort/sort.cpp +++ b/examples/sort/sort.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2013, Intel Corporation + Copyright (c) 2013, Durham University All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,7 +13,7 @@ notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its + * Neither the name of Durham University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/examples/sort/sort.ispc b/examples/sort/sort.ispc index 5fc89d91..25ea90f4 100644 --- a/examples/sort/sort.ispc +++ b/examples/sort/sort.ispc @@ -1,5 +1,5 @@ /* - Copyright (c) 2013, Intel Corporation + Copyright (c) 2013, Durham University All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,7 +13,7 @@ notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its + * Neither the name of Durham University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/examples/sort/sort_serial.cpp b/examples/sort/sort_serial.cpp index ba955c77..38bbdda6 100644 --- a/examples/sort/sort_serial.cpp +++ b/examples/sort/sort_serial.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2013, Intel Corporation + Copyright (c) 2013, Durham University All rights reserved. Redistribution and use in source and binary forms, with or without @@ -13,7 +13,7 @@ notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- * Neither the name of Intel Corporation nor the names of its + * Neither the name of Durham University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. From e2a91e6de5fdcd370b903b2670e76be14c60dc09 Mon Sep 17 00:00:00 2001 From: egaburov Date: Mon, 16 Sep 2013 15:54:32 +0200 Subject: [PATCH 071/124] added support for "d"-suffix --- lex.ll | 20 +++++++++++++++++++- parse.yy | 11 ++++++++--- stdlib.ispc | 12 ++++++++++++ 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/lex.ll b/lex.ll index 8baa627a..c2990ccc 100644 --- a/lex.ll +++ b/lex.ll @@ -76,7 +76,7 @@ static int allTokens[] = { TOKEN_TASK, TOKEN_TRUE, TOKEN_TYPEDEF, TOKEN_UNIFORM, TOKEN_UNMASKED, TOKEN_UNSIGNED, TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE, TOKEN_STRING_C_LITERAL, TOKEN_DOTDOTDOT, - TOKEN_FLOAT_CONSTANT, + TOKEN_FLOAT_CONSTANT, TOKEN_DOUBLE_CONSTANT, TOKEN_INT8_CONSTANT, TOKEN_UINT8_CONSTANT, TOKEN_INT16_CONSTANT, TOKEN_UINT16_CONSTANT, TOKEN_INT32_CONSTANT, TOKEN_UINT32_CONSTANT, @@ -152,6 +152,7 @@ void ParserInit() { tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\""; tokenToName[TOKEN_DOTDOTDOT] = "..."; tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT"; + tokenToName[TOKEN_DOUBLE_CONSTANT] = "TOKEN_DOUBLE_CONSTANT"; tokenToName[TOKEN_INT8_CONSTANT] = "TOKEN_INT8_CONSTANT"; tokenToName[TOKEN_UINT8_CONSTANT] = "TOKEN_UINT8_CONSTANT"; tokenToName[TOKEN_INT16_CONSTANT] = "TOKEN_INT16_CONSTANT"; @@ -266,6 +267,7 @@ void ParserInit() { tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\""; tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'"; tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant"; + tokenNameRemap["TOKEN_DOUBLE_CONSTANT"] = "double constant"; tokenNameRemap["TOKEN_INT8_CONSTANT"] = "int8 constant"; tokenNameRemap["TOKEN_UINT8_CONSTANT"] = "unsigned int8 constant"; tokenNameRemap["TOKEN_INT16_CONSTANT"] = "int16 constant"; @@ -343,6 +345,8 @@ INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]* INT_NUMBER_DOTDOTDOT (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*\.\.\. FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?) HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?) +DOUBLE_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[dD]?)|(\.[0-9]+)))([dD][-+]?[0-9]+)?[dD]?) +HEX_DOUBLE_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[dD]?) 
IDENT [a-zA-Z_][a-zA-Z_0-9]* ZO_SWIZZLE ([01]+[w-z]+)+|([01]+[rgba]+)+|([01]+[uv]+)+ @@ -438,6 +442,7 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA } + {FLOAT_NUMBER} { RT; yylval.floatVal = (float)atof(yytext); @@ -450,6 +455,19 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA return TOKEN_FLOAT_CONSTANT; } +{DOUBLE_NUMBER} { + RT; + yylval.doubleVal = atof(yytext); + return TOKEN_DOUBLE_CONSTANT; +} + +{HEX_DOUBLE_NUMBER} { + RT; + yylval.doubleVal = lParseHexFloat(yytext); + return TOKEN_DOUBLE_CONSTANT; +} + + "++" { RT; return TOKEN_INC_OP; } "--" { RT; return TOKEN_DEC_OP; } "<<" { RT; return TOKEN_LEFT_OP; } diff --git a/parse.yy b/parse.yy index 9a2b4fc3..933a3455 100644 --- a/parse.yy +++ b/parse.yy @@ -149,7 +149,8 @@ struct ForeachDimension { %union { uint64_t intVal; - float floatVal; + float floatVal; + double doubleVal; std::string *stringVal; const char *constCharPtr; @@ -185,7 +186,7 @@ struct ForeachDimension { %token TOKEN_INT64_CONSTANT TOKEN_UINT64_CONSTANT %token TOKEN_INT32DOTDOTDOT_CONSTANT TOKEN_UINT32DOTDOTDOT_CONSTANT %token TOKEN_INT64DOTDOTDOT_CONSTANT TOKEN_UINT64DOTDOTDOT_CONSTANT -%token TOKEN_FLOAT_CONSTANT TOKEN_STRING_C_LITERAL +%token TOKEN_FLOAT_CONSTANT TOKEN_DOUBLE_CONSTANT TOKEN_STRING_C_LITERAL %token TOKEN_IDENTIFIER TOKEN_STRING_LITERAL TOKEN_TYPE_NAME TOKEN_NULL %token TOKEN_PTR_OP TOKEN_INC_OP TOKEN_DEC_OP TOKEN_LEFT_OP TOKEN_RIGHT_OP %token TOKEN_LE_OP TOKEN_GE_OP TOKEN_EQ_OP TOKEN_NE_OP @@ -327,7 +328,11 @@ primary_expression } | TOKEN_FLOAT_CONSTANT { $$ = new ConstExpr(AtomicType::UniformFloat->GetAsConstType(), - (float)yylval.floatVal, @1); + yylval.floatVal, @1); + } + | TOKEN_DOUBLE_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformDouble->GetAsConstType(), + yylval.doubleVal, @1); } | TOKEN_TRUE { $$ = new ConstExpr(AtomicType::UniformBool->GetAsConstType(), true, @1); diff --git a/stdlib.ispc b/stdlib.ispc index 6d7ee051..0d5c4efd 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1559,6 +1559,18 @@ static inline uniform float clamp(uniform float v, uniform float low, uniform fl return min(max(v, low), high); } +// double + +__declspec(safe,cost2) +static inline double clamp(double v, double low, double high) { + return min(max(v, low), high); +} + +__declspec(safe,cost2) +static inline uniform double clamp(uniform double v, uniform double low, uniform double high) { + return min(max(v, low), high); +} + // int8 __declspec(safe,cost2) From 233249048127b94cdb073e694f18987b643741d2 Mon Sep 17 00:00:00 2001 From: egaburov Date: Mon, 16 Sep 2013 16:31:41 +0200 Subject: [PATCH 072/124] added fortran_double_constant --- lex.ll | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/lex.ll b/lex.ll index c2990ccc..3d88a23a 100644 --- a/lex.ll +++ b/lex.ll @@ -345,8 +345,7 @@ INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]* INT_NUMBER_DOTDOTDOT (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*\.\.\. FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?) HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?) -DOUBLE_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[dD]?)|(\.[0-9]+)))([dD][-+]?[0-9]+)?[dD]?) -HEX_DOUBLE_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[dD]?) 
+FORTRAN_DOUBLE_NUMBER (([0-9]+\.[0-9]*[dD])|([0-9]+\.[0-9]*[dD][-+]?[0-9])|([0-9]+[dD][-+]?[0-9])) IDENT [a-zA-Z_][a-zA-Z_0-9]* ZO_SWIZZLE ([01]+[w-z]+)+|([01]+[rgba]+)+|([01]+[uv]+)+ @@ -455,18 +454,19 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA return TOKEN_FLOAT_CONSTANT; } -{DOUBLE_NUMBER} { +{FORTRAN_DOUBLE_NUMBER} { RT; + { + int i = 0; + while (yytext[i] != 'd') i++; + if ((yytext[i+1] >= '0' && yytext[i+1] <= '9') + || yytext[i+1] == '+' || yytext[i+1] == '-') + yytext[i] = 'E'; + } yylval.doubleVal = atof(yytext); return TOKEN_DOUBLE_CONSTANT; } -{HEX_DOUBLE_NUMBER} { - RT; - yylval.doubleVal = lParseHexFloat(yytext); - return TOKEN_DOUBLE_CONSTANT; -} - "++" { RT; return TOKEN_INC_OP; } "--" { RT; return TOKEN_DEC_OP; } From 6fd21d988d999b62aa0e2832cd93ccdb4ca78f77 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Mon, 16 Sep 2013 17:15:02 +0200 Subject: [PATCH 073/124] fixed lexer to properly read fortran-notation double constants --- lex.ll | 26 +++++++++++++------------- stdlib.ispc | 4 ++-- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/lex.ll b/lex.ll index 3d88a23a..ca318dbb 100644 --- a/lex.ll +++ b/lex.ll @@ -345,7 +345,9 @@ INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]* INT_NUMBER_DOTDOTDOT (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*\.\.\. FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?) HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?) -FORTRAN_DOUBLE_NUMBER (([0-9]+\.[0-9]*[dD])|([0-9]+\.[0-9]*[dD][-+]?[0-9])|([0-9]+[dD][-+]?[0-9])) +FORTRAN_DOUBLE_NUMBER (([0-9]+\.[0-9]*[dD])|([0-9]+\.[0-9]*[dD][-+]?[0-9]+)|([0-9]+[dD][-+]?[0-9]+)) + + IDENT [a-zA-Z_][a-zA-Z_0-9]* ZO_SWIZZLE ([01]+[w-z]+)+|([01]+[rgba]+)+|([01]+[uv]+)+ @@ -440,6 +442,16 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA return lParseInteger(true); } +{FORTRAN_DOUBLE_NUMBER} { + RT; + { + int i = 0; + while (yytext[i] != 'd') i++; + yytext[i] = 'E'; + } + yylval.doubleVal = atof(yytext); + return TOKEN_DOUBLE_CONSTANT; +} {FLOAT_NUMBER} { @@ -454,18 +466,6 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA return TOKEN_FLOAT_CONSTANT; } -{FORTRAN_DOUBLE_NUMBER} { - RT; - { - int i = 0; - while (yytext[i] != 'd') i++; - if ((yytext[i+1] >= '0' && yytext[i+1] <= '9') - || yytext[i+1] == '+' || yytext[i+1] == '-') - yytext[i] = 'E'; - } - yylval.doubleVal = atof(yytext); - return TOKEN_DOUBLE_CONSTANT; -} "++" { RT; return TOKEN_INC_OP; } diff --git a/stdlib.ispc b/stdlib.ispc index 0d5c4efd..9b02d0ba 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -2564,7 +2564,7 @@ static inline float acos(float v) { __declspec(safe) static inline double acos(const double v) { - return 1.57079637050628662109375 - asin(v); + return 1.57079637050628662109375d0 - asin(v); } @@ -2575,7 +2575,7 @@ static inline uniform float acos(uniform float v) { __declspec(safe) static inline uniform double acos(const uniform double v) { - return 1.57079637050628662109375 - asin(v); + return 1.57079637050628662109375d0 - asin(v); } From eef4e11768222914ffb93ccc1ab698e1cfbd7922 Mon Sep 17 00:00:00 2001 From: egaburov Date: Mon, 16 Sep 2013 17:25:13 +0200 Subject: [PATCH 074/124] now it is also case nonsensitive --- lex.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lex.ll b/lex.ll index ca318dbb..f1dcaa6f 100644 --- a/lex.ll +++ b/lex.ll @@ -446,7 +446,7 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return 
TOKEN_STRING_LITERA RT; { int i = 0; - while (yytext[i] != 'd') i++; + while (yytext[i] != 'd' && yytext[i] != 'D') i++; yytext[i] = 'E'; } yylval.doubleVal = atof(yytext); From 6e0b9ddc74a4480e97d9b19c66e4ad8de5d5198a Mon Sep 17 00:00:00 2001 From: Tomasz Koziara Date: Mon, 16 Sep 2013 18:02:07 +0100 Subject: [PATCH 075/124] Sort description. --- examples/README.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/README.txt b/examples/README.txt index 5b47df44..b67529c1 100644 --- a/examples/README.txt +++ b/examples/README.txt @@ -146,6 +146,11 @@ This is a simple "hello world" type program that shows a ~10 line application program calling out to a ~5 line ispc program to do a simple computation. +Sort +==== +This is a bucket sort of 32-bit unsigned integers. +By default, 1000000 random elements are sorted. +Run ./sort N to sort N elements instead. Volume ====== From fa78d548ccc17c4a844762bd5660e49d941f9383 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 17 Sep 2013 23:36:16 +0400 Subject: [PATCH 076/124] Test, documentation and vim support for double precision constants --- contrib/ispc.vim | 5 +++++ docs/ispc.rst | 11 ++++++++++- tests/double-consts.ispc | 23 +++++++++++++++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 tests/double-consts.ispc diff --git a/contrib/ispc.vim b/contrib/ispc.vim index cc8493f0..4d870dcd 100644 --- a/contrib/ispc.vim +++ b/contrib/ispc.vim @@ -19,6 +19,11 @@ syn keyword ispcRepeat cdo cfor cwhile syn keyword ispcBuiltin programCount programIndex syn keyword ispcType export uniform varying int8 int16 int32 int64 +"double precision floating point number, with dot, optional exponent +syn match cFloat display contained "\d\+\.\d*d[-+]\=\d*\>" +"double precision floating point number, without dot, with exponent +syn match cFloat display contained "\d\+d[-+]\=\d\+\>" + " Default highlighting command -nargs=+ HiLink hi def link HiLink ispcStatement Statement diff --git a/docs/ispc.rst b/docs/ispc.rst index ff07f6d8..224faaa9 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -270,6 +270,14 @@ new reserved words: ``unmasked``, ``foreach_unique``, ``foreach_active``, and ``in``. Any program that happens to have a variable or function with one of these names must be modified to rename that symbol. +Updating ISPC Programs For Changes In ISPC 1.4.5 +------------------------------------------------ + +This release adds support for double precision floating point constants. +Double precision floating point constants are floating point numbers with a +``d`` suffix and an optional exponent part. Here are some examples: 3.14d, +31.4d-1, 1.d, 1.0d, 1d-2. Note that a floating point number without a suffix +is treated as a single precision constant. Getting Started with ISPC ========================= @@ -1349,7 +1357,8 @@ but are likely to be supported in future releases: * Bitfield members of ``struct`` types * Variable numbers of arguments to functions * Literal floating-point constants (even without a ``f`` suffix) are - currently treated as being ``float`` type, not ``double`` + currently treated as being ``float`` type, not ``double``. To get a double + precision floating point constant, use the ``d`` suffix. * The ``volatile`` qualifier * The ``register`` storage class for variables. (Will be ignored). 
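As a cross-check of the lexer changes above, the following is a minimal standalone C++ sketch of what the new FORTRAN_DOUBLE_NUMBER action boils down to; it is not part of the patch, and parse_fortran_double() is a hypothetical helper. The first 'd' or 'D' exponent marker is rewritten to 'E' and the text is handed to atof(), so constants such as 31.4d-1 or 1.57079637050628662109375d0 parse as doubles.

#include <cstdio>
#include <cstdlib>
#include <cstring>

// Hypothetical helper mirroring the FORTRAN_DOUBLE_NUMBER lexer action:
// rewrite the first 'd'/'D' to 'E' so that atof() accepts the text.
static double parse_fortran_double(const char *text) {
    char buf[64];
    strncpy(buf, text, sizeof(buf) - 1);
    buf[sizeof(buf) - 1] = '\0';
    for (char *p = buf; *p; ++p) {
        if (*p == 'd' || *p == 'D') {
            *p = 'E';
            break;
        }
    }
    return atof(buf);
}

int main() {
    printf("%.17g\n", parse_fortran_double("31.4d-1"));                     // 3.14
    printf("%.17g\n", parse_fortran_double("1.57079637050628662109375d0")); // ~pi/2
    return 0;
}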
diff --git a/tests/double-consts.ispc b/tests/double-consts.ispc new file mode 100644 index 00000000..3259156a --- /dev/null +++ b/tests/double-consts.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + double a = aFOO[programIndex]; + // Test parsing of double constants. + double d1 = 1.0d40; + double d2 = 1.d40; + double d3 = 1d40; + double d4 = 10000000000000000000000000000000000000000.d; + double d5 = 10000000000000000000000000000000000000000.0d; + + // All the constants should be equal and if it's evaluated as "float", + // then sqrt will evaluate to +inf. + if (d1 == d2 && d1 == d3 && d1 == d4 && d1 == d5 && + ((float)sqrt(d1)) < 2e20) { + RET[programIndex] = a; + } +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} From 922edb11281ae432bc1647445dfa556de8fd663f Mon Sep 17 00:00:00 2001 From: evghenii Date: Wed, 18 Sep 2013 18:14:07 +0300 Subject: [PATCH 077/124] completed knc-i1x16.h and added knc-i1x8.h with knc-i1x8unsafe_fast.h that doesnt pass several tests.. --- examples/intrinsics/knc-i1x16.h | 3092 +++++++++++++++++++++ examples/intrinsics/knc-i1x8.h | 2862 +++++++++++++++++++ examples/intrinsics/knc-i1x8unsafe_fast.h | 2 + run_tests.py | 7 +- 4 files changed, 5961 insertions(+), 2 deletions(-) create mode 100644 examples/intrinsics/knc-i1x16.h create mode 100644 examples/intrinsics/knc-i1x8.h create mode 100644 examples/intrinsics/knc-i1x8unsafe_fast.h diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h new file mode 100644 index 00000000..8b1a2bb9 --- /dev/null +++ b/examples/intrinsics/knc-i1x16.h @@ -0,0 +1,3092 @@ +/** + Copyright (c) 2010-2012, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +#define FORCEINLINE __forceinline +#define PRE_ALIGN(x) /*__declspec(align(x))*/ +#define POST_ALIGN(x) +#define roundf(x) (floorf(x + .5f)) +#define round(x) (floor(x + .5)) +#else +#define FORCEINLINE __attribute__((always_inline)) +#define PRE_ALIGN(x) +#define POST_ALIGN(x) __attribute__ ((aligned(x))) +#endif + +#if 0 +#define KNC 1 +extern "C" +{ + int printf(const unsigned char *, ...); + int puts(unsigned char *); + unsigned int putchar(unsigned int); + int fflush(void *); + uint8_t *memcpy(uint8_t *, uint8_t *, uint64_t); + uint8_t *memset(uint8_t *, uint8_t, uint64_t); + void memset_pattern16(void *, const void *, uint64_t); +} +#endif + +typedef float __vec1_f; +typedef double __vec1_d; +typedef int8_t __vec1_i8; +typedef int16_t __vec1_i16; +typedef int32_t __vec1_i32; +typedef int64_t __vec1_i64; + +struct __vec16_i1 { + __vec16_i1() { } + __vec16_i1(const __mmask16 &vv) : v(vv) { } + __vec16_i1(bool v0, bool v1, bool v2, bool v3, + bool v4, bool v5, bool v6, bool v7, + bool v8, bool v9, bool v10, bool v11, + bool v12, bool v13, bool v14, bool v15) { + v = ((v0 & 1) | + ((v1 & 1) << 1) | + ((v2 & 1) << 2) | + ((v3 & 1) << 3) | + ((v4 & 1) << 4) | + ((v5 & 1) << 5) | + ((v6 & 1) << 6) | + ((v7 & 1) << 7) | + ((v8 & 1) << 8) | + ((v9 & 1) << 9) | + ((v10 & 1) << 10) | + ((v11 & 1) << 11) | + ((v12 & 1) << 12) | + ((v13 & 1) << 13) | + ((v14 & 1) << 14) | + ((v15 & 1) << 15)); + } + + __mmask16 v; + FORCEINLINE operator __mmask16() const { return v; } +}; + + +template +struct vec16 { + vec16() { } + vec16(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) { + data[0] = v0; data[1] = v1; data[2] = v2; data[3] = v3; + data[4] = v4; data[5] = v5; data[6] = v6; data[7] = v7; + data[8] = v8; data[9] = v9; data[10] = v10; data[11] = v11; + data[12] = v12; data[13] = v13; data[14] = v14; data[15] = v15; + } + T data[16]; + FORCEINLINE const T& operator[](const int i) const { return data[i]; } + FORCEINLINE T& operator[](const int i) { return data[i]; } +}; + +#if 0 /* evghenii:i32 */ +struct PRE_ALIGN(64) __vec16_i32 : public vec16 { + __vec16_i32() { } + __vec16_i32(int32_t v0, int32_t v1, int32_t v2, int32_t v3, + int32_t v4, int32_t v5, int32_t v6, int32_t v7, + int32_t v8, int32_t v9, int32_t v10, int32_t v11, + int32_t v12, int32_t v13, int32_t v14, int32_t v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } +} POST_ALIGN(64); +#else /* evghenii:i32 */ +struct PRE_ALIGN(64) __vec16_i32 +{ + __m512i v; + FORCEINLINE operator __m512i() const { return v; } + FORCEINLINE __vec16_i32() : v(_mm512_undefined_epi32()) {} + FORCEINLINE __vec16_i32(const int32_t &in) : v(_mm512_set1_epi32(in)) {} + FORCEINLINE __vec16_i32(const __m512i &in) : v(in) {} + FORCEINLINE __vec16_i32(const __vec16_i32 &o) : v(o.v) {} + FORCEINLINE __vec16_i32& operator =(const __vec16_i32 &o) { v=o.v; return *this; } + FORCEINLINE __vec16_i32(int32_t v00, int32_t v01, int32_t v02, int32_t v03, + int32_t v04, int32_t v05, int32_t v06, int32_t v07, + int32_t v08, int32_t v09, int32_t v10, int32_t v11, + int32_t v12, int32_t v13, int32_t v14, int32_t v15) : + v ( _mm512_set_16to16_pi(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00) ) {} + FORCEINLINE const int32_t& operator[](const int i) const { return ((int32_t*)this)[i]; } + FORCEINLINE int32_t& operator[](const int i) { return ((int32_t*)this)[i]; } +} 
POST_ALIGN(64); +#endif /* evghenii:i32 */ + +#if 0 /* evghenii::f */ +PRE_ALIGN(64) struct __vec16_f : public vec16 { + __vec16_f() { } + __vec16_f(float v0, float v1, float v2, float v3, + float v4, float v5, float v6, float v7, + float v8, float v9, float v10, float v11, + float v12, float v13, float v14, float v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } + +} POST_ALIGN(64); +#else /* evghenii::f */ +PRE_ALIGN(64) struct __vec16_f +{ + __m512 v; + FORCEINLINE operator __m512() const { return v; } + FORCEINLINE __vec16_f() : v(_mm512_undefined_ps()) { } + FORCEINLINE __vec16_f(const __m512 &in) : v(in) {} + FORCEINLINE __vec16_f(const __vec16_f &o) : v(o.v) {} + FORCEINLINE __vec16_f& operator =(const __vec16_f &o) { v=o.v; return *this; } + FORCEINLINE __vec16_f(float v00, float v01, float v02, float v03, + float v04, float v05, float v06, float v07, + float v08, float v09, float v10, float v11, + float v12, float v13, float v14, float v15) : + v ( _mm512_set_16to16_ps(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00) ) {} + FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; } + FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } +} POST_ALIGN(64); +#endif /* evghenii::f */ + +#if 0 /* evghenii::d */ +PRE_ALIGN(128) struct __vec16_d : public vec16 { + __vec16_d() { } + __vec16_d(double v0, double v1, double v2, double v3, + double v4, double v5, double v6, double v7, + double v8, double v9, double v10, double v11, + double v12, double v13, double v14, double v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } + +} POST_ALIGN(128); +#else /* evghenii::d */ +struct PRE_ALIGN(128) __vec16_d +{ + __m512d v1; + __m512d v2; + FORCEINLINE __vec16_d() : v1(_mm512_undefined_pd()), v2(_mm512_undefined_pd()) {} + FORCEINLINE __vec16_d(const __m512d _v1, const __m512d _v2) : v1(_v1), v2(_v2) {} + FORCEINLINE __vec16_d(const __vec16_d &o) : v1(o.v1), v2(o.v2) {} + FORCEINLINE __vec16_d& operator =(const __vec16_d &o) { v1=o.v1; v2=o.v2; return *this; } + FORCEINLINE __vec16_d(double v00, double v01, double v02, double v03, + double v04, double v05, double v06, double v07, + double v08, double v09, double v10, double v11, + double v12, double v13, double v14, double v15) { + v1 = _mm512_set_8to8_pd(v15, v14, v13, v12, v11, v10, v09, v08); + v2 = _mm512_set_8to8_pd(v07, v06, v05, v04, v03, v02, v01, v00); + } + FORCEINLINE const double& operator[](const int i) const { return ((double*)this)[i]; } + FORCEINLINE double& operator[](const int i) { return ((double*)this)[i]; } +} POST_ALIGN(128); +#endif /* evghenii::d */ + +#if 1 /* evghenii::i64 */ +PRE_ALIGN(128) struct __vec16_i64 : public vec16 { + __vec16_i64() { } + __vec16_i64(int64_t v0, int64_t v1, int64_t v2, int64_t v3, + int64_t v4, int64_t v5, int64_t v6, int64_t v7, + int64_t v8, int64_t v9, int64_t v10, int64_t v11, + int64_t v12, int64_t v13, int64_t v14, int64_t v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } +} POST_ALIGN(128); +#else /* evghenii::i64 */ +struct PRE_ALIGN(64) __vec16_i64 { + FORCEINLINE __vec16_i64() : v_lo(_mm512_undefined_epi32()), v_hi(_mm512_undefined_epi32()) {} + FORCEINLINE __vec16_i64(const __vec16_i64 &o) : v_lo(o.v_lo), v_hi(o.v_hi) {} + FORCEINLINE __vec16_i64(__m512i l, __m512i h) : v_lo(l), v_hi(h) {} + FORCEINLINE __vec16_i64& operator =(const __vec16_i64 &o) { v_lo=o.v_lo; v_hi=o.v_hi; return 
*this; } + FORCEINLINE __vec16_i64(int64_t v00, int64_t v01, int64_t v02, int64_t v03, + int64_t v04, int64_t v05, int64_t v06, int64_t v07, + int64_t v08, int64_t v09, int64_t v10, int64_t v11, + int64_t v12, int64_t v13, int64_t v14, int64_t v15) { + __m512i v1 = _mm512_set_8to8_epi64(v15, v14, v13, v12, v11, v10, v09, v08); + __m512i v2 = _mm512_set_8to8_epi64(v07, v06, v05, v04, v03, v02, v01, v00); + v_hi = _mm512_mask_permutevar_epi32(v_hi, 0xFF00, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v1); + v_hi = _mm512_mask_permutevar_epi32(v_hi, 0x00FF, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v2); + v_lo = _mm512_mask_permutevar_epi32(v_lo, 0xFF00, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v1); + v_lo = _mm512_mask_permutevar_epi32(v_lo, 0x00FF, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v2); + } + __m512i v_hi; + __m512i v_lo; +} POST_ALIGN(64); + +#endif /* evghenii::i64 */ + +PRE_ALIGN(16) struct __vec16_i8 : public vec16 { + __vec16_i8() { } + __vec16_i8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, + int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } +} POST_ALIGN(16); + +PRE_ALIGN(32) struct __vec16_i16 : public vec16 { + __vec16_i16() { } + __vec16_i16(int16_t v0, int16_t v1, int16_t v2, int16_t v3, + int16_t v4, int16_t v5, int16_t v6, int16_t v7, + int16_t v8, int16_t v9, int16_t v10, int16_t v11, + int16_t v12, int16_t v13, int16_t v14, int16_t v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } +} POST_ALIGN(32); + +static inline int32_t __extract_element(__vec16_i32, int); + + +/////////////////////////////////////////////////////////////////////////// +// macros... 
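/* Note (illustrative comment, not from the committed header): the macros
   below stamp out straightforward scalar-loop implementations for the
   vector types that have no native KNC intrinsic path in this header
   (int8, int16, int64).  As a rough illustration of how they expand,
   BINARY_OP(__vec16_i8, __add, +) produces approximately:

       static FORCEINLINE __vec16_i8 __add(__vec16_i8 a, __vec16_i8 b) {
           __vec16_i8 ret;
           for (int i = 0; i < 16; ++i)
               ret[i] = a[i] + b[i];   // element-wise op over all 16 lanes
           return ret;
       }

   The __m512-based paths further below replace these loops for the
   int32 and float cases. */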
+ +#define UNARY_OP(TYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE v) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = OP(v[i]); \ + return ret; \ +} + +#define BINARY_OP(TYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = a[i] OP b[i]; \ + return ret; \ +} + +#define BINARY_OP_CAST(TYPE, CAST, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = (CAST)(a[i]) OP (CAST)(b[i]); \ + return ret; \ +} + +#define BINARY_OP_FUNC(TYPE, NAME, FUNC) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = FUNC(a[i], b[i]); \ + return ret; \ +} + +#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP) \ +static FORCEINLINE __vec16_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \ + __vec16_i1 ret; \ + ret.v = 0; \ + for (int i = 0; i < 16; ++i) \ + ret.v |= ((CAST)(a[i]) OP (CAST)(b[i])) << i; \ + return ret; \ +} \ +static FORCEINLINE __vec16_i1 NAME##_##SUFFIX##_and_mask(TYPE a, TYPE b, \ + __vec16_i1 mask) { \ + __vec16_i1 ret; \ + ret.v = 0; \ + for (int i = 0; i < 16; ++i) \ + ret.v |= ((CAST)(a[i]) OP (CAST)(b[i])) << i; \ + ret.v &= mask.v; \ + return ret; \ +} + +#define INSERT_EXTRACT(VTYPE, STYPE) \ +static FORCEINLINE STYPE __extract_element(VTYPE v, int index) { \ + return ((STYPE *)&v)[index]; \ +} \ +static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \ + ((STYPE *)v)[index] = val; \ +} + +#define LOAD_STORE(VTYPE, STYPE) \ +template \ +static FORCEINLINE VTYPE __load(const VTYPE *p) { \ + STYPE *ptr = (STYPE *)p; \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = ptr[i]; \ + return ret; \ +} \ +template \ +static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \ + STYPE *ptr = (STYPE *)p; \ + for (int i = 0; i < 16; ++i) \ + ptr[i] = v[i]; \ +} + +#define LOADS(VTYPE, STYPE) \ +template \ +static FORCEINLINE VTYPE __load(const VTYPE *p) { \ + STYPE *ptr = (STYPE *)p; \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = ptr[i]; \ + return ret; \ +} \ + +#define STORES(VTYPE, STYPE) \ +template \ +static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \ + STYPE *ptr = (STYPE *)p; \ + for (int i = 0; i < 16; ++i) \ + ptr[i] = v[i]; \ +} + +#define REDUCE_ADD(TYPE, VTYPE, NAME) \ +static FORCEINLINE TYPE NAME(VTYPE v) { \ + TYPE ret = v[0]; \ + for (int i = 1; i < 16; ++i) \ + ret = ret + v[i]; \ + return ret; \ +} + +#define REDUCE_MINMAX(TYPE, VTYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(VTYPE v) { \ + TYPE ret = v[0]; \ + for (int i = 1; i < 16; ++i) \ + ret = (ret OP (TYPE)v[i]) ? 
ret : (TYPE)v[i]; \ + return ret; \ +} + +#define SELECT(TYPE) \ +static FORCEINLINE TYPE __select(__vec16_i1 mask, TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = (mask.v & (1< VTYPE __smear_##NAME(STYPE); \ +template <> FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = v; \ + return ret; \ +} + +#define SETZERO(VTYPE, NAME) \ +template VTYPE __setzero_##NAME(); \ +template <> FORCEINLINE VTYPE __setzero_##NAME() { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = 0; \ + return ret; \ +} + +#define UNDEF(VTYPE, NAME) \ +template VTYPE __undef_##NAME(); \ +template <> FORCEINLINE VTYPE __undef_##NAME() { \ + return VTYPE(); \ +} + +#define BROADCAST(VTYPE, NAME, STYPE) \ +static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = v[index & 0xf]; \ + return ret; \ +} \ + +#define ROTATE(VTYPE, NAME, STYPE) \ +static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = v[(i+index) & 0xf]; \ + return ret; \ +} \ + +#define SHUFFLES(VTYPE, NAME, STYPE) \ +static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec16_i32 index) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = v[__extract_element(index, i) & 0xf]; \ + return ret; \ +} \ +static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec16_i32 index) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) { \ + int ii = __extract_element(index, i) & 0x1f; \ + ret[i] = (ii < 16) ? v0[ii] : v1[ii-16]; \ + } \ + return ret; \ +} + +#define SHUFFLE2(VTYPE, NAME, STYPE) \ +static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec16_i32 index) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) { \ + int ii = __extract_element(index, i) & 0x1f; \ + ret[i] = (ii < 16) ? 
v0[ii] : v1[ii-16]; \ + } \ + return ret; \ +} + +/////////////////////////////////////////////////////////////////////////// + +INSERT_EXTRACT(__vec1_i8, int8_t) +INSERT_EXTRACT(__vec1_i16, int16_t) +INSERT_EXTRACT(__vec1_i32, int32_t) +INSERT_EXTRACT(__vec1_i64, int64_t) +INSERT_EXTRACT(__vec1_f, float) +INSERT_EXTRACT(__vec1_d, double) + +/////////////////////////////////////////////////////////////////////////// +// mask ops + +static FORCEINLINE uint64_t __movmsk(__vec16_i1 mask) { + return (uint64_t)mask.v; +} + +static FORCEINLINE bool __any(__vec16_i1 mask) { + return (mask.v!=0); +} + +static FORCEINLINE bool __all(__vec16_i1 mask) { + return (mask.v==0xFFFF); +} + +static FORCEINLINE bool __none(__vec16_i1 mask) { + return (mask.v==0); +} + +static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { + __vec16_i1 r; + r.v = (a.v & b.v) | (~a.v & ~b.v); + return r; +} + +static FORCEINLINE __vec16_i1 __and(__vec16_i1 a, __vec16_i1 b) { + __vec16_i1 r; + r.v = a.v & b.v; + return r; +} + +static FORCEINLINE __vec16_i1 __xor(__vec16_i1 a, __vec16_i1 b) { + __vec16_i1 r; + r.v = a.v ^ b.v; + return r; +} + +static FORCEINLINE __vec16_i1 __or(__vec16_i1 a, __vec16_i1 b) { + __vec16_i1 r; + r.v = a.v | b.v; + return r; +} + +static FORCEINLINE __vec16_i1 __not(__vec16_i1 v) { + __vec16_i1 r; + r.v = ~v.v; + return r; +} + +static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) { + __vec16_i1 r; + r.v = ~a.v & b.v; + return r; +} + +static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { + __vec16_i1 r; + r.v = a.v & ~b.v; + return r; +} + +static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a, + __vec16_i1 b) { + __vec16_i1 r; + r.v = (a.v & mask.v) | (b.v & ~mask.v); + return r; +} + +static FORCEINLINE __vec16_i1 __select(bool cond, __vec16_i1 a, __vec16_i1 b) { + return cond ? a : b; +} + +static FORCEINLINE bool __extract_element(__vec16_i1 vec, int index) { + return (vec.v & (1 << index)) ? 
true : false; +} + +static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index, + bool val) { + if (val == false) + vec->v &= ~(1 << index); + else + vec->v |= (1 << index); +} + +template static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) { + uint16_t *ptr = (uint16_t *)p; + __vec16_i1 r; + r.v = *ptr; + return r; +} + +template static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v) { + uint16_t *ptr = (uint16_t *)p; + *ptr = v.v; +} + +template RetVecType __smear_i1(int i); +template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { + return i?0xFFFF:0x0; +} + +template RetVecType __setzero_i1(); +template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { + return 0; +} + +template __vec16_i1 __undef_i1(); +template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { + return __vec16_i1(); +} + + +/////////////////////////////////////////////////////////////////////////// +// int8 + +BINARY_OP(__vec16_i8, __add, +) +BINARY_OP(__vec16_i8, __sub, -) +BINARY_OP(__vec16_i8, __mul, *) + +BINARY_OP(__vec16_i8, __or, |) +BINARY_OP(__vec16_i8, __and, &) +BINARY_OP(__vec16_i8, __xor, ^) +BINARY_OP(__vec16_i8, __shl, <<) + +BINARY_OP_CAST(__vec16_i8, uint8_t, __udiv, /) +BINARY_OP_CAST(__vec16_i8, int8_t, __sdiv, /) + +BINARY_OP_CAST(__vec16_i8, uint8_t, __urem, %) +BINARY_OP_CAST(__vec16_i8, int8_t, __srem, %) +BINARY_OP_CAST(__vec16_i8, uint8_t, __lshr, >>) +BINARY_OP_CAST(__vec16_i8, int8_t, __ashr, >>) + +SHIFT_UNIFORM(__vec16_i8, uint8_t, __lshr, >>) +SHIFT_UNIFORM(__vec16_i8, int8_t, __ashr, >>) +SHIFT_UNIFORM(__vec16_i8, int8_t, __shl, <<) + +CMP_OP(__vec16_i8, i8, int8_t, __equal, ==) +CMP_OP(__vec16_i8, i8, int8_t, __not_equal, !=) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i8, i8, int8_t, __signed_less_equal, <=) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_than, <) +CMP_OP(__vec16_i8, i8, int8_t, __signed_less_than, <) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_than, >) + +SELECT(__vec16_i8) +INSERT_EXTRACT(__vec16_i8, int8_t) +SMEAR(__vec16_i8, i8, int8_t) +SETZERO(__vec16_i8, i8) +UNDEF(__vec16_i8, i8) +BROADCAST(__vec16_i8, i8, int8_t) +ROTATE(__vec16_i8, i8, int8_t) +SHUFFLES(__vec16_i8, i8, int8_t) +LOAD_STORE(__vec16_i8, int8_t) + +/////////////////////////////////////////////////////////////////////////// +// int16 + +BINARY_OP(__vec16_i16, __add, +) +BINARY_OP(__vec16_i16, __sub, -) +BINARY_OP(__vec16_i16, __mul, *) + +BINARY_OP(__vec16_i16, __or, |) +BINARY_OP(__vec16_i16, __and, &) +BINARY_OP(__vec16_i16, __xor, ^) +BINARY_OP(__vec16_i16, __shl, <<) + +BINARY_OP_CAST(__vec16_i16, uint16_t, __udiv, /) +BINARY_OP_CAST(__vec16_i16, int16_t, __sdiv, /) + +BINARY_OP_CAST(__vec16_i16, uint16_t, __urem, %) +BINARY_OP_CAST(__vec16_i16, int16_t, __srem, %) +BINARY_OP_CAST(__vec16_i16, uint16_t, __lshr, >>) +BINARY_OP_CAST(__vec16_i16, int16_t, __ashr, >>) + +SHIFT_UNIFORM(__vec16_i16, uint16_t, __lshr, >>) +SHIFT_UNIFORM(__vec16_i16, int16_t, __ashr, >>) +SHIFT_UNIFORM(__vec16_i16, int16_t, __shl, <<) + +CMP_OP(__vec16_i16, i16, int16_t, __equal, ==) +CMP_OP(__vec16_i16, i16, int16_t, __not_equal, !=) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i16, i16, int16_t, __signed_less_equal, <=) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i16, i16, 
int16_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_than, <) +CMP_OP(__vec16_i16, i16, int16_t, __signed_less_than, <) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_than, >) + +SELECT(__vec16_i16) +INSERT_EXTRACT(__vec16_i16, int16_t) +SMEAR(__vec16_i16, i16, int16_t) +SETZERO(__vec16_i16, i16) +UNDEF(__vec16_i16, i16) +BROADCAST(__vec16_i16, i16, int16_t) +ROTATE(__vec16_i16, i16, int16_t) +SHUFFLES(__vec16_i16, i16, int16_t) +LOAD_STORE(__vec16_i16, int16_t) + +#if 0 /* evghenii::int32 */ +/////////////////////////////////////////////////////////////////////////// +// int32 + +BINARY_OP(__vec16_i32, __add, +) +BINARY_OP(__vec16_i32, __sub, -) +BINARY_OP(__vec16_i32, __mul, *) + +BINARY_OP(__vec16_i32, __or, |) +BINARY_OP(__vec16_i32, __and, &) +BINARY_OP(__vec16_i32, __xor, ^) +BINARY_OP(__vec16_i32, __shl, <<) + +BINARY_OP_CAST(__vec16_i32, uint32_t, __udiv, /) +BINARY_OP_CAST(__vec16_i32, int32_t, __sdiv, /) + +BINARY_OP_CAST(__vec16_i32, uint32_t, __urem, %) +BINARY_OP_CAST(__vec16_i32, int32_t, __srem, %) +BINARY_OP_CAST(__vec16_i32, uint32_t, __lshr, >>) +BINARY_OP_CAST(__vec16_i32, int32_t, __ashr, >>) + +SHIFT_UNIFORM(__vec16_i32, uint32_t, __lshr, >>) +SHIFT_UNIFORM(__vec16_i32, int32_t, __ashr, >>) +SHIFT_UNIFORM(__vec16_i32, int32_t, __shl, <<) + +CMP_OP(__vec16_i32, i32, int32_t, __equal, ==) +CMP_OP(__vec16_i32, i32, int32_t, __not_equal, !=) +CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i32, i32, int32_t, __signed_less_equal, <=) +CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_than, <) +CMP_OP(__vec16_i32, i32, int32_t, __signed_less_than, <) +CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_than, >) + +SELECT(__vec16_i32) +INSERT_EXTRACT(__vec16_i32, int32_t) +SMEAR(__vec16_i32, i32, int32_t) +SETZERO(__vec16_i32, i32) +UNDEF(__vec16_i32, i32) +BROADCAST(__vec16_i32, i32, int32_t) +ROTATE(__vec16_i32, i32, int32_t) +SHUFFLES(__vec16_i32, i32, int32_t) +LOAD_STORE(__vec16_i32, int32_t) + +#else /* evghenii::int32 */ +/////////////////////////////////////////////////////////////////////////// +// int32 +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_i32 __add(__vec16_i32 a, __vec16_i32 b) { + return _mm512_add_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __sub(__vec16_i32 a, __vec16_i32 b) { + return _mm512_sub_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __mul(__vec16_i32 a, __vec16_i32 b) { + return _mm512_mullo_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __udiv(__vec16_i32 a, __vec16_i32 b) { + return _mm512_div_epu32(a, b); +} + +static FORCEINLINE __vec16_i32 __sdiv(__vec16_i32 a, __vec16_i32 b) { + return _mm512_div_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __urem(__vec16_i32 a, __vec16_i32 b) { + return _mm512_rem_epu32(a, b); +} + +static FORCEINLINE __vec16_i32 __srem(__vec16_i32 a, __vec16_i32 b) { + return _mm512_rem_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __or(__vec16_i32 a, __vec16_i32 b) { + return _mm512_or_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __and(__vec16_i32 a, __vec16_i32 b) { + return _mm512_and_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __xor(__vec16_i32 a, __vec16_i32 b) { + return _mm512_xor_epi32(a, b); +} + 
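/* Note (illustrative comment, not from the committed header): the shift
   helpers that follow come in two forms, a per-lane variable shift whose
   count is a __vec16_i32 (mapped to _mm512_sllv/srlv/srav_epi32) and a
   uniform shift whose count is a single int32_t (mapped to
   _mm512_slli/srli/srai_epi32).  A rough usage sketch, assuming v already
   holds 16 lane values:

       __vec16_i32 halved = __ashr(v, 1);                 // same shift in every lane
       __vec16_i32 counts = __smear_i32<__vec16_i32>(3);  // helper defined later in this header
       __vec16_i32 scaled = __shl(v, counts);             // per-lane shift counts
*/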
+static FORCEINLINE __vec16_i32 __shl(__vec16_i32 a, __vec16_i32 b) { + return _mm512_sllv_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, __vec16_i32 b) { + return _mm512_srlv_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, __vec16_i32 b) { + return _mm512_srav_epi32(a, b); +} + +static FORCEINLINE __vec16_i32 __shl(__vec16_i32 a, int32_t n) { + return _mm512_slli_epi32(a, n); +} + +static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, int32_t n) { + return _mm512_srli_epi32(a, n); +} + +static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, int32_t n) { + return _mm512_srai_epi32(a, n); +} + +static FORCEINLINE __vec16_i1 __equal_i32(const __vec16_i32 &a, const __vec16_i32 &b) { + return _mm512_cmpeq_epi32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __equal_i32_and_mask(const __vec16_i32 &a, const __vec16_i32 &b, + __vec16_i1 m) { + return _mm512_mask_cmpeq_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __not_equal_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmpneq_epi32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __not_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmpneq_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmple_epu32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmple_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __signed_less_equal_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmple_epi32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __signed_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmple_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmpge_epu32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmpge_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __signed_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmpge_epi32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __signed_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmpge_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __unsigned_less_than_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmplt_epu32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __unsigned_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmplt_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __signed_less_than_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmplt_epi32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __signed_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmplt_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmpgt_epu32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmpgt_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __signed_greater_than_i32(__vec16_i32 a, __vec16_i32 b) { + return _mm512_cmpgt_epi32_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __signed_greater_than_i32_and_mask(__vec16_i32 a, 
__vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmpgt_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec16_i32 __select(__vec16_i1 mask, + __vec16_i32 a, __vec16_i32 b) { + return _mm512_mask_mov_epi32(b.v, mask, a.v); +} + +static FORCEINLINE __vec16_i32 __select(bool cond, __vec16_i32 a, __vec16_i32 b) { + return cond ? a : b; +} + +static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int index) { //uint32_t index) { + return ((int32_t *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec16_i32 *v, uint32_t index, int32_t val) { + ((int32_t *)v)[index] = val; +} + +template RetVecType __smear_i32(int32_t i); +template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { + return _mm512_set1_epi32(i); +} + +static const __vec16_i32 __ispc_one = __smear_i32<__vec16_i32>(1); +static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32); +static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1); +static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + +template RetVecType __setzero_i32(); +template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { + return _mm512_setzero_epi32(); +} + +template RetVecType __undef_i32(); +template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { + return __vec16_i32(); +} + +static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) { + int32_t val = __extract_element(v, index & 0xf); + return _mm512_set1_epi32(val); +} + +#if 0 /* evghenii::doesn't work */ +static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index) { + __vec16_i32 idx = __smear_i32<__vec16_i32>(index); + __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0x7)); + return _mm512_mask_permutevar_epi32(v, 0xffff, shuffle, v); +} +#else +ROTATE(__vec16_i32, i32, int32_t) +#endif + +static FORCEINLINE __vec16_i32 __shuffle_i32(__vec16_i32 v, __vec16_i32 index) { + return _mm512_mask_permutevar_epi32(v, 0xffff, index, v); +} +SHUFFLE2(__vec16_i32, i32, int32_t) /* evghenii::to implement */ + +template static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return _mm512_load_epi32(p); +#else + __vec16_i32 v; + v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_extloadunpackhi_epi32(v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + return v; +#endif +} + + +template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 v) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_store_epi32(p, v); +#else + _mm512_extpackstorelo_epi32(p, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +#endif +} + +#if 0 +template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) { + return _mm512_load_epi32(p); +} +template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) { + _mm512_store_epi32(p, v); +} +#endif +#endif /* evghenii::int32 */ + +/////////////////////////////////////////////////////////////////////////// +// int64 + +BINARY_OP(__vec16_i64, __add, +) +BINARY_OP(__vec16_i64, __sub, -) +BINARY_OP(__vec16_i64, __mul, *) + +BINARY_OP(__vec16_i64, __or, |) +BINARY_OP(__vec16_i64, __and, &) +BINARY_OP(__vec16_i64, __xor, ^) +BINARY_OP(__vec16_i64, __shl, <<) + +BINARY_OP_CAST(__vec16_i64, uint64_t, __udiv, /) +BINARY_OP_CAST(__vec16_i64, int64_t, __sdiv, /) + 
+BINARY_OP_CAST(__vec16_i64, uint64_t, __urem, %) +BINARY_OP_CAST(__vec16_i64, int64_t, __srem, %) +BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>) +BINARY_OP_CAST(__vec16_i64, int64_t, __ashr, >>) + +SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>) +SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>) +SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<) + +CMP_OP(__vec16_i64, i64, int64_t, __equal, ==) +CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i64, i64, int64_t, __signed_less_equal, <=) +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_than, <) +CMP_OP(__vec16_i64, i64, int64_t, __signed_less_than, <) +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >) + +SELECT(__vec16_i64) +INSERT_EXTRACT(__vec16_i64, int64_t) +SMEAR(__vec16_i64, i64, int64_t) +SETZERO(__vec16_i64, i64) +UNDEF(__vec16_i64, i64) +BROADCAST(__vec16_i64, i64, int64_t) +ROTATE(__vec16_i64, i64, int64_t) +SHUFFLES(__vec16_i64, i64, int64_t) +LOAD_STORE(__vec16_i64, int64_t) + + +#if 0 /* evghenii::float */ +/////////////////////////////////////////////////////////////////////////// +// float + +BINARY_OP(__vec16_f, __add, +) +BINARY_OP(__vec16_f, __sub, -) +BINARY_OP(__vec16_f, __mul, *) +BINARY_OP(__vec16_f, __div, /) + +CMP_OP(__vec16_f, float, float, __equal, ==) +CMP_OP(__vec16_f, float, float, __not_equal, !=) +CMP_OP(__vec16_f, float, float, __less_than, <) +CMP_OP(__vec16_f, float, float, __less_equal, <=) +CMP_OP(__vec16_f, float, float, __greater_than, >) +CMP_OP(__vec16_f, float, float, __greater_equal, >=) + +static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) { + __vec16_i1 ret; + ret.v = 0; + for (int i = 0; i < 16; ++i) + ret.v |= ((a[i] == a[i]) && (b[i] == b[i])) ? (1 << i) : 0; + return ret; +} + +static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) { + __vec16_i1 ret; + ret.v = 0; + for (int i = 0; i < 16; ++i) + ret.v |= ((a[i] != a[i]) || (b[i] != b[i])) ? 
(1 << i) : 0; + return ret; +} + +#if 0 + case Instruction::FRem: intrinsic = "__frem"; break; +#endif + +SELECT(__vec16_f) +INSERT_EXTRACT(__vec16_f, float) +SMEAR(__vec16_f, float, float) +SETZERO(__vec16_f, float) +UNDEF(__vec16_f, float) +BROADCAST(__vec16_f, float, float) +ROTATE(__vec16_f, float, float) +SHUFFLES(__vec16_f, float, float) +LOAD_STORE(__vec16_f, float) +#else /* evghenii::float */ + +/////////////////////////////////////////////////////////////////////////// +// float +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_f __add(__vec16_f a, __vec16_f b) { + return _mm512_add_ps(a, b); +} + +static FORCEINLINE __vec16_f __sub(__vec16_f a, __vec16_f b) { + return _mm512_sub_ps(a, b); +} + +#if 1 /* evghenii::this two fails assert-3.ispc test */ +static FORCEINLINE __vec16_f __mul(__vec16_f a, __vec16_f b) { + return _mm512_mul_ps(a, b); +} + +static FORCEINLINE __vec16_f __div(__vec16_f a, __vec16_f b) { + return _mm512_div_ps(a, b); +} +#else +BINARY_OP(__vec16_f, __mul, *) +BINARY_OP(__vec16_f, __div, /) +#endif + + +static FORCEINLINE __vec16_i1 __equal_float(__vec16_f a, __vec16_f b) { + return _mm512_cmpeq_ps_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __equal_float_and_mask(__vec16_f a, __vec16_f b, + __vec16_i1 m) { + return _mm512_mask_cmpeq_ps_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __not_equal_float(__vec16_f a, __vec16_f b) { + return _mm512_cmpneq_ps_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __not_equal_float_and_mask(__vec16_f a, __vec16_f b, + __vec16_i1 m) { + return _mm512_mask_cmpneq_ps_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __less_than_float(__vec16_f a, __vec16_f b) { + return _mm512_cmplt_ps_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __less_than_float_and_mask(__vec16_f a, __vec16_f b, + __vec16_i1 m) { + return _mm512_mask_cmplt_ps_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __less_equal_float(__vec16_f a, __vec16_f b) { + return _mm512_cmple_ps_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __less_equal_float_and_mask(__vec16_f a, __vec16_f b, + __vec16_i1 m) { + return _mm512_mask_cmple_ps_mask(m, a, b); +} + +static FORCEINLINE __vec16_i1 __greater_than_float(__vec16_f a, __vec16_f b) { +// return _mm512_cmpnle_ps_mask(a, b); + return _mm512_cmp_ps_mask(a, b,_CMP_GT_OS); +} + +static FORCEINLINE __vec16_i1 __greater_than_float_and_mask(__vec16_f a, __vec16_f b, + __vec16_i1 m) { +// return _mm512_mask_cmpnle_ps_mask(m, a, b); + return _mm512_mask_cmp_ps_mask(m,a, b,_CMP_GT_OS); +} + +static FORCEINLINE __vec16_i1 __greater_equal_float(__vec16_f a, __vec16_f b) { +// return _mm512_cmpnlt_ps_mask(a, b); + return _mm512_cmp_ps_mask(a, b,_CMP_GE_OS); +} + +static FORCEINLINE __vec16_i1 __greater_equal_float_and_mask(__vec16_f a, __vec16_f b, + __vec16_i1 m) { +// return _mm512_mask_cmpnlt_ps_mask(m, a, b); + return _mm512_mask_cmp_ps_mask(m,a, b,_CMP_GE_OS); +} + +static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) { + return _mm512_cmpord_ps_mask(a, b); +} + +static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) { + return _mm512_cmpunord_ps_mask(a, b); +} + +static FORCEINLINE __vec16_f __select(__vec16_i1 mask, __vec16_f a, __vec16_f b) { + return _mm512_mask_mov_ps(b, mask, a); +} + +static FORCEINLINE __vec16_f __select(bool cond, __vec16_f a, __vec16_f b) { + return cond ? 
a : b; +} + +static FORCEINLINE float __extract_element(__vec16_f v, uint32_t index) { + return v[index]; + // return ((float *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec16_f *v, uint32_t index, float val) { + (*v)[index] = val; +// ((float *)v)[index] = val; +} + +template RetVecType __smear_float(float f); +template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { + return _mm512_set_1to16_ps(f); +} + +template RetVecType __setzero_float(); +template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { + return _mm512_setzero_ps(); +} + +template RetVecType __undef_float(); +template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() { + return __vec16_f(); +} + +static FORCEINLINE __vec16_f __broadcast_float(__vec16_f v, int index) { + float val = __extract_element(v, index & 0xf); + return _mm512_set1_ps(val); +} + +#if 1 +static FORCEINLINE __vec16_f __shuffle_float(__vec16_f v, __vec16_i32 index) { + return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v))); +} +#endif +ROTATE(__vec16_f, float, float) +SHUFFLE2(__vec16_f, float, float) + +template static FORCEINLINE __vec16_f __load(const __vec16_f *p) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return _mm512_load_ps(p); +#else + __vec16_f v; + v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + v = _mm512_extloadunpackhi_ps(v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + return v; +#endif +} + +template static FORCEINLINE void __store(__vec16_f *p, __vec16_f v) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_store_ps(p, v); +#else + _mm512_extpackstorelo_ps( p, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_ps((uint8_t*)p+64, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); +#endif +} + +#if 0 +template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) { + _mm512_store_ps(p, v); +} +template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) { + return _mm512_load_ps(p); +} +#endif + +#endif /* evghenii::float */ + +static FORCEINLINE float __exp_uniform_float(float v) { return expf(v);} +static FORCEINLINE __vec16_f __exp_varying_float(__vec16_f v) { return _mm512_exp_ps(v); } + +static FORCEINLINE float __log_uniform_float(float v) { return logf(v);} +static FORCEINLINE __vec16_f __log_varying_float(__vec16_f v) { return _mm512_log_ps(v); } + +static FORCEINLINE float __pow_uniform_float(float a, float b) { return powf(a, b);} +static FORCEINLINE __vec16_f __pow_varying_float(__vec16_f a, __vec16_f b) { return _mm512_pow_ps(a,b); } + +static FORCEINLINE int __intbits(float v) { + union { + float f; + int i; + } u; + u.f = v; + return u.i; +} + +static FORCEINLINE float __floatbits(int v) { + union { + float f; + int i; + } u; + u.i = v; + return u.f; +} + +static FORCEINLINE float __half_to_float_uniform(int16_t h) { + static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift + + int32_t o = ((int32_t)(h & 0x7fff)) << 13; // exponent/mantissa bits + uint32_t exp = shifted_exp & o; // just the exponent + o += (127 - 15) << 23; // exponent adjust + + // handle exponent special cases + if (exp == shifted_exp) // Inf/NaN? + o += (128 - 16) << 23; // extra exp adjust + else if (exp == 0) { // Zero/Denormal? 
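+        // denormal halves have no implicit leading 1; bumping the exponent by one and
+        // subtracting 2^-14 (whose float bit pattern is 113 << 23) renormalizes the value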
+ o += 1 << 23; // extra exp adjust + o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize + } + + o |= ((int32_t)(h & 0x8000)) << 16; // sign bit + return __floatbits(o); +} + + +static FORCEINLINE __vec16_f __half_to_float_varying(__vec16_i16 v) { + __vec16_f ret; + for (int i = 0; i < 16; ++i) + ret[i] = __half_to_float_uniform(v[i]); + return ret; +} + + +static FORCEINLINE int16_t __float_to_half_uniform(float f) { + uint32_t sign_mask = 0x80000000u; + int32_t o; + + int32_t fint = __intbits(f); + int32_t sign = fint & sign_mask; + fint ^= sign; + + int32_t f32infty = 255 << 23; + o = (fint > f32infty) ? 0x7e00 : 0x7c00; + + // (De)normalized number or zero + // update fint unconditionally to save the blending; we don't need it + // anymore for the Inf/NaN case anyway. + const uint32_t round_mask = ~0xfffu; + const int32_t magic = 15 << 23; + const int32_t f16infty = 31 << 23; + + int32_t fint2 = __intbits(__floatbits(fint & round_mask) * __floatbits(magic)) - round_mask; + fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed + + if (fint < f32infty) + o = fint2 >> 13; // Take the bits! + + return (o | (sign >> 16)); +} + + +static FORCEINLINE __vec16_i16 __float_to_half_varying(__vec16_f v) { + __vec16_i16 ret; + for (int i = 0; i < 16; ++i) + ret[i] = __float_to_half_uniform(v[i]); + return ret; +} + + +#if 0 /* evghenii::double */ +/////////////////////////////////////////////////////////////////////////// +// double + +BINARY_OP(__vec16_d, __add, +) +BINARY_OP(__vec16_d, __sub, -) +BINARY_OP(__vec16_d, __mul, *) +BINARY_OP(__vec16_d, __div, /) + +CMP_OP(__vec16_d, double, double, __equal, ==) +CMP_OP(__vec16_d, double, double, __not_equal, !=) +CMP_OP(__vec16_d, double, double, __less_than, <) +CMP_OP(__vec16_d, double, double, __less_equal, <=) +CMP_OP(__vec16_d, double, double, __greater_than, >) +CMP_OP(__vec16_d, double, double, __greater_equal, >=) + +static FORCEINLINE __vec16_i1 __ordered_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret; + ret.v = 0; + for (int i = 0; i < 16; ++i) + ret.v |= ((a[i] == a[i]) && (b[i] == b[i])) ? (1 << i) : 0; + return ret; +} + +static FORCEINLINE __vec16_i1 __unordered_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret; + ret.v = 0; + for (int i = 0; i < 16; ++i) + ret.v |= ((a[i] != a[i]) || (b[i] != b[i])) ? 
(1 << i) : 0; + return ret; +} + +#if 0 + case Instruction::FRem: intrinsic = "__frem"; break; +#endif + +SELECT(__vec16_d) +INSERT_EXTRACT(__vec16_d, double) +SMEAR(__vec16_d, double, double) +SETZERO(__vec16_d, double) +UNDEF(__vec16_d, double) +BROADCAST(__vec16_d, double, double) +ROTATE(__vec16_d, double, double) +SHUFFLES(__vec16_d, double, double) +LOAD_STORE(__vec16_d, double) +#else /* evghenii::double */ +/////////////////////////////////////////////////////////////////////////// +// double +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_d __add(__vec16_d a, __vec16_d b) { + __vec16_d ret; + ret.v1 = _mm512_add_pd(a.v1, b.v1); + ret.v2 = _mm512_add_pd(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec16_d __sub(__vec16_d a, __vec16_d b) { + __vec16_d ret; + ret.v1 = _mm512_sub_pd(a.v1, b.v1); + ret.v2 = _mm512_sub_pd(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec16_d __mul(__vec16_d a, __vec16_d b) { + __vec16_d ret; + ret.v1 = _mm512_mul_pd(a.v1, b.v1); + ret.v2 = _mm512_mul_pd(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec16_d __div(__vec16_d a, __vec16_d b) { + __vec16_d ret; + ret.v1 = _mm512_div_pd(a.v1, b.v1); + ret.v2 = _mm512_div_pd(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec16_i1 __equal_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpeq_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpeq_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __equal_double_and_mask(__vec16_d a, __vec16_d b, + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_mask_cmpeq_pd_mask(m, a.v1, b.v1); + __vec16_i1 tmp_m = m; + ret2 = _mm512_mask_cmpeq_pd_mask(_mm512_kswapb(tmp_m,tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __not_equal_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpneq_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpneq_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __not_equal_double_and_mask(__vec16_d a, __vec16_d b, + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + __vec16_i1 tmp_m = m; + ret1 = _mm512_mask_cmpneq_pd_mask(m, a.v1, b.v1); + ret2 = _mm512_mask_cmpneq_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __less_than_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmplt_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmplt_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __less_than_double_and_mask(__vec16_d a, __vec16_d b, + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + __vec16_i1 tmp_m = m; + ret1 = _mm512_mask_cmplt_pd_mask(m, a.v1, b.v1); + ret2 = _mm512_mask_cmplt_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __less_equal_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmple_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmple_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __less_equal_double_and_mask(__vec16_d a, __vec16_d b, + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + __vec16_i1 tmp_m = m; + ret1 = _mm512_mask_cmple_pd_mask(m, a.v1, b.v1); + ret2 = _mm512_mask_cmple_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, 
b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __greater_than_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpnle_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpnle_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __greater_than_double_and_mask(__vec16_d a, __vec16_d b, + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + __vec16_i1 tmp_m = m; + ret1 = _mm512_mask_cmpnle_pd_mask(m, a.v1, b.v1); + ret2 = _mm512_mask_cmpnle_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __greater_equal_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpnlt_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpnlt_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __greater_equal_double_and_mask(__vec16_d a, __vec16_d b, + __vec16_i1 m) { + __vec16_i1 ret1; + __vec16_i1 ret2; + __vec16_i1 tmp_m = m; + ret1 = _mm512_mask_cmpnlt_pd_mask(m, a.v1, b.v1); + ret2 = _mm512_mask_cmpnlt_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __ordered_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpord_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpord_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_i1 __unordered_double(__vec16_d a, __vec16_d b) { + __vec16_i1 ret1; + __vec16_i1 ret2; + ret1 = _mm512_cmpunord_pd_mask(a.v1, b.v1); + ret2 = _mm512_cmpunord_pd_mask(a.v2, b.v2); + return _mm512_kmovlhb(ret1, ret2); +} + +static FORCEINLINE __vec16_d __select(__vec16_i1 mask, __vec16_d a, __vec16_d b) { + __vec16_d ret; + __vec16_i1 tmp_m = mask; + ret.v1 = _mm512_mask_mov_pd(b.v1, mask, a.v1); + ret.v2 = _mm512_mask_mov_pd(b.v2, _mm512_kswapb(tmp_m, tmp_m), a.v2); + return ret; +} + + +static FORCEINLINE __vec16_d __select(bool cond, __vec16_d a, __vec16_d b) { + return cond ? 
a : b; +} + +static FORCEINLINE double __extract_element(__vec16_d v, uint32_t index) { + return ((double *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec16_d *v, uint32_t index, double val) { + ((double *)v)[index] = val; +} + +template RetVecType __smear_double(double d); +template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { + __vec16_d ret; + ret.v1 = _mm512_set1_pd(d); + ret.v2 = _mm512_set1_pd(d); + return ret; +} + +template RetVecType __setzero_double(); +template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { + __vec16_d ret; + ret.v1 = _mm512_setzero_pd(); + ret.v2 = _mm512_setzero_pd(); + return ret; +} + +template RetVecType __undef_double(); +template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() { + return __vec16_d(); +} + +static FORCEINLINE __vec16_d __broadcast_double(__vec16_d v, int index) { + __vec16_d ret; + double val = __extract_element(v, index & 0xf); + ret.v1 = _mm512_set1_pd(val); + ret.v2 = _mm512_set1_pd(val); + return ret; +} + +ROTATE(__vec16_d, double, double) +SHUFFLES(__vec16_d, double, double) + +template static FORCEINLINE __vec16_d __load(const __vec16_d *p) { + __vec16_d ret; + ret.v1 = _mm512_extloadunpacklo_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v1 = _mm512_extloadunpackhi_pd(ret.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v2 = _mm512_extloadunpacklo_pd(ret.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v2 = _mm512_extloadunpackhi_pd(ret.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + return ret; +} + +template static FORCEINLINE void __store(__vec16_d *p, __vec16_d v) { + _mm512_extpackstorelo_pd(p, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_pd((uint8_t*)p+64, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+128, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); +} + + +#if 0 +template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) { + __vec16_d ret; + ret.v1 = _mm512_load_pd(p); + ret.v2 = _mm512_load_pd(((uint8_t*)p)+64); + return ret; +} +template <> static FORCEINLINE __vec16_d __load<128>(const __vec16_d *p) { + return __load<64>(p); +} +template <> static FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) { + _mm512_store_pd(p, v.v1); + _mm512_store_pd(((uint8_t*)p)+64, v.v2); +} +template <> static FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { + __store<64>(p, v); +} +#endif +#endif /* evghenii::double */ + +/////////////////////////////////////////////////////////////////////////// +// casts + + +#define CAST(TO, STO, FROM, SFROM, FUNC) \ +static FORCEINLINE TO FUNC(TO, FROM val) { \ + TO ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = (STO)((SFROM)(val[i])); \ + return ret; \ +} + +// sign extension conversions +CAST(__vec16_i64, int64_t, __vec16_i32, int32_t, __cast_sext) +CAST(__vec16_i64, int64_t, __vec16_i16, int16_t, __cast_sext) +CAST(__vec16_i64, int64_t, __vec16_i8, int8_t, __cast_sext) +CAST(__vec16_i32, int32_t, __vec16_i16, int16_t, __cast_sext) +CAST(__vec16_i32, int32_t, __vec16_i8, int8_t, __cast_sext) +CAST(__vec16_i16, int16_t, __vec16_i8, int8_t, __cast_sext) + +#define CAST_SEXT_I1(TYPE) \ +static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) { \ + ret[i] = 0; \ + if (v.v & (1 << i)) \ + ret[i] = ~ret[i]; \ + } \ + return ret; \ +} + +CAST_SEXT_I1(__vec16_i8) 
+CAST_SEXT_I1(__vec16_i16) +#if 0 +CAST_SEXT_I1(__vec16_i32) +#else +static FORCEINLINE __vec16_i32 __cast_sext(const __vec16_i32 &, const __vec16_i1 &val) +{ + __vec16_i32 ret = _mm512_setzero_epi32(); + __vec16_i32 one = _mm512_set1_epi32(-1); + return _mm512_mask_mov_epi32(ret, val, one); +} +#endif +CAST_SEXT_I1(__vec16_i64) + +// zero extension +CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext) +CAST(__vec16_i64, uint64_t, __vec16_i16, uint16_t, __cast_zext) +CAST(__vec16_i64, uint64_t, __vec16_i8, uint8_t, __cast_zext) +CAST(__vec16_i32, uint32_t, __vec16_i16, uint16_t, __cast_zext) +CAST(__vec16_i32, uint32_t, __vec16_i8, uint8_t, __cast_zext) +CAST(__vec16_i16, uint16_t, __vec16_i8, uint8_t, __cast_zext) + +#define CAST_ZEXT_I1(TYPE) \ +static FORCEINLINE TYPE __cast_zext(TYPE, __vec16_i1 v) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = (v.v & (1 << i)) ? 1 : 0; \ + return ret; \ +} + +CAST_ZEXT_I1(__vec16_i8) +CAST_ZEXT_I1(__vec16_i16) +#if 0 +CAST_ZEXT_I1(__vec16_i32) +#else +static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1 &val) +{ + __vec16_i32 ret = _mm512_setzero_epi32(); + __vec16_i32 one = _mm512_set1_epi32(1); + return _mm512_mask_mov_epi32(ret, val, one); +} +#endif +CAST_ZEXT_I1(__vec16_i64) + +// truncations +CAST(__vec16_i32, int32_t, __vec16_i64, int64_t, __cast_trunc) +CAST(__vec16_i16, int16_t, __vec16_i64, int64_t, __cast_trunc) +CAST(__vec16_i8, int8_t, __vec16_i64, int64_t, __cast_trunc) +CAST(__vec16_i16, int16_t, __vec16_i32, int32_t, __cast_trunc) +CAST(__vec16_i8, int8_t, __vec16_i32, int32_t, __cast_trunc) +CAST(__vec16_i8, int8_t, __vec16_i16, int16_t, __cast_trunc) + +// signed int to float/double +#if 0 +CAST(__vec16_f, float, __vec16_i8, int8_t, __cast_sitofp) +CAST(__vec16_f, float, __vec16_i16, int16_t, __cast_sitofp) +CAST(__vec16_f, float, __vec16_i32, int32_t, __cast_sitofp) +#else +static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i8 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i16 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i32 val) {return _mm512_cvtfxpnt_round_adjustepi32_ps(val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} +#endif +CAST(__vec16_f, float, __vec16_i64, int64_t, __cast_sitofp) +#if 0 +CAST(__vec16_d, double, __vec16_i8, int8_t, __cast_sitofp) +CAST(__vec16_d, double, __vec16_i16, int16_t, __cast_sitofp) +CAST(__vec16_d, double, __vec16_i32, int32_t, __cast_sitofp) +#else +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i8 val) { + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; +} + +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i16 val) { + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; +} + +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i32 val) { + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(val); + __vec16_i32 other8 = 
_mm512_permute4f128_epi32(val, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; +} +#endif +CAST(__vec16_d, double, __vec16_i64, int64_t, __cast_sitofp) + +// unsigned int to float/double +#if 0 +CAST(__vec16_f, float, __vec16_i8, uint8_t, __cast_uitofp) +CAST(__vec16_f, float, __vec16_i16, uint16_t, __cast_uitofp) +CAST(__vec16_f, float, __vec16_i32, uint32_t, __cast_uitofp) +#else +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i8 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i16 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i32 val) {return _mm512_cvtfxpnt_round_adjustepu32_ps(val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} +#endif +CAST(__vec16_f, float, __vec16_i64, uint64_t, __cast_uitofp) +#if 0 +CAST(__vec16_d, double, __vec16_i8, uint8_t, __cast_uitofp) +CAST(__vec16_d, double, __vec16_i16, uint16_t, __cast_uitofp) +CAST(__vec16_d, double, __vec16_i32, uint32_t, __cast_uitofp) +#else +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i8 val) { + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; +} + +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i16 val) { + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; +} + +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i32 val) { + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(val); + __vec16_i32 other8 = _mm512_permute4f128_epi32(val, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; +} +#endif +CAST(__vec16_d, double, __vec16_i64, uint64_t, __cast_uitofp) + +#if 0 +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i1 v) { + __vec16_f ret; + for (int i = 0; i < 16; ++i) + ret[i] = (v.v & (1 << i)) ? 1. 
: 0.; + return ret; +} +#else +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i1 v) +{ + const __m512 ret = _mm512_setzero_ps(); + const __m512 one = _mm512_set1_ps(1.0); + return _mm512_mask_mov_ps(ret, v, one); +} +#endif + +// float/double to signed int +CAST(__vec16_i8, int8_t, __vec16_f, float, __cast_fptosi) +CAST(__vec16_i16, int16_t, __vec16_f, float, __cast_fptosi) +#if 0 +CAST(__vec16_i32, int32_t, __vec16_f, float, __cast_fptosi) +#else +static FORCEINLINE __vec16_i32 __cast_fptosi(__vec16_i32, __vec16_f val) { + return _mm512_cvtfxpnt_round_adjustps_epi32(val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); +} +#endif +CAST(__vec16_i64, int64_t, __vec16_f, float, __cast_fptosi) +CAST(__vec16_i8, int8_t, __vec16_d, double, __cast_fptosi) +CAST(__vec16_i16, int16_t, __vec16_d, double, __cast_fptosi) +#if 1 +CAST(__vec16_i32, int32_t, __vec16_d, double, __cast_fptosi) +#else +#endif +CAST(__vec16_i64, int64_t, __vec16_d, double, __cast_fptosi) + +// float/double to unsigned int +CAST(__vec16_i8, uint8_t, __vec16_f, float, __cast_fptoui) +CAST(__vec16_i16, uint16_t, __vec16_f, float, __cast_fptoui) +#if 0 +CAST(__vec16_i32, uint32_t, __vec16_f, float, __cast_fptoui) +#else +static FORCEINLINE __vec16_i32 __cast_fptoui(__vec16_i32, __vec16_f val) { + return _mm512_cvtfxpnt_round_adjustps_epu32(val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); +} +#endif +CAST(__vec16_i64, uint64_t, __vec16_f, float, __cast_fptoui) +CAST(__vec16_i8, uint8_t, __vec16_d, double, __cast_fptoui) +CAST(__vec16_i16, uint16_t, __vec16_d, double, __cast_fptoui) +#if 1 +CAST(__vec16_i32, uint32_t, __vec16_d, double, __cast_fptoui) +#else +#endif +CAST(__vec16_i64, uint64_t, __vec16_d, double, __cast_fptoui) + +// float/double conversions +#if 1 +CAST(__vec16_f, float, __vec16_d, double, __cast_fptrunc) +CAST(__vec16_d, double, __vec16_f, float, __cast_fpext) +#else +static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) { + __m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1)); + __m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2)); + + return _mm512_mask_permute4f128_epi32(r0i, 0xFF00, r1i, _MM_PERM_BABA); +} +static FORCEINLINE __vec16_d __cast_fpext(__vec16_d, __vec16_f val) { + __vec16_d ret; + ret.v2 = _mm512_cvtpslo_pd(val.v); + __vec16_f other8 = _mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC); + ret.v1 = _mm512_cvtpslo_pd(other8); + return ret; +} +#endif + +typedef union { + int32_t i32; + float f; + int64_t i64; + double d; +} BitcastUnion; + +#define CAST_BITS(TO, TO_ELT, FROM, FROM_ELT) \ +static FORCEINLINE TO __cast_bits(TO, FROM val) { \ + TO r; \ + for (int i = 0; i < 16; ++i) { \ + BitcastUnion u; \ + u.FROM_ELT = val[i]; \ + r[i] = u.TO_ELT; \ + } \ + return r; \ +} + +#if 0 +CAST_BITS(__vec16_f, f, __vec16_i32, i32) +CAST_BITS(__vec16_i32, i32, __vec16_f, f) +#else +static FORCEINLINE __vec16_f __cast_bits(__vec16_f, __vec16_i32 val) { + return _mm512_castsi512_ps(val); +} +static FORCEINLINE __vec16_i32 __cast_bits(__vec16_i32, __vec16_f val) { + return _mm512_castps_si512(val); +} +#endif + +#if 0 +CAST_BITS(__vec16_d, d, __vec16_i64, i64) +CAST_BITS(__vec16_i64, i64, __vec16_d, d) +#else +static FORCEINLINE __vec16_i64 __cast_bits(__vec16_i64, __vec16_d val) { + return *(__vec16_i64*)&val; +} +static FORCEINLINE __vec16_d __cast_bits(__vec16_d, __vec16_i64 val) { + return *(__vec16_d*)&val; +} +#endif + +#define CAST_BITS_SCALAR(TO, FROM) \ +static FORCEINLINE TO __cast_bits(TO, FROM v) { \ + union { \ + TO to; \ + FROM 
from;                             \
+    } u;                                    \
+    u.from = v;                             \
+    return u.to;                            \
+}
+
+CAST_BITS_SCALAR(uint32_t, float)
+CAST_BITS_SCALAR(int32_t, float)
+CAST_BITS_SCALAR(float, uint32_t)
+CAST_BITS_SCALAR(float, int32_t)
+CAST_BITS_SCALAR(uint64_t, double)
+CAST_BITS_SCALAR(int64_t, double)
+CAST_BITS_SCALAR(double, uint64_t)
+CAST_BITS_SCALAR(double, int64_t)
+
+///////////////////////////////////////////////////////////////////////////
+// various math functions
+
+static FORCEINLINE void __fastmath() {
+}
+
+static FORCEINLINE float __round_uniform_float(float v) {
+    return roundf(v);
+}
+
+static FORCEINLINE float __floor_uniform_float(float v) {
+    return floorf(v);
+}
+
+static FORCEINLINE float __ceil_uniform_float(float v) {
+    return ceilf(v);
+}
+
+static FORCEINLINE double __round_uniform_double(double v) {
+    return round(v);
+}
+
+static FORCEINLINE double __floor_uniform_double(double v) {
+    return floor(v);
+}
+
+static FORCEINLINE double __ceil_uniform_double(double v) {
+    return ceil(v);
+}
+
+#if 0
+UNARY_OP(__vec16_f, __round_varying_float, roundf)
+UNARY_OP(__vec16_f, __floor_varying_float, floorf)
+UNARY_OP(__vec16_f, __ceil_varying_float, ceilf)
+#else
+static FORCEINLINE __vec16_f __round_varying_float(__vec16_f v) {
+    return _mm512_round_ps(v, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);
+}
+
+static FORCEINLINE __vec16_f __floor_varying_float(__vec16_f v) {
+    return _mm512_floor_ps(v);
+}
+
+static FORCEINLINE __vec16_f __ceil_varying_float(__vec16_f v) {
+    return _mm512_ceil_ps(v);
+}
+#endif
+
+#if 0
+UNARY_OP(__vec16_d, __round_varying_double, round)
+UNARY_OP(__vec16_d, __floor_varying_double, floor)
+UNARY_OP(__vec16_d, __ceil_varying_double, ceil)
+#else
+static FORCEINLINE __vec16_d __round_varying_double(__vec16_d v) {
+    __vec16_d ret;
+    ret.v1 = _mm512_svml_round_pd(v.v1);
+    ret.v2 = _mm512_svml_round_pd(v.v2);
+    return ret;
+}
+
+static FORCEINLINE __vec16_d __floor_varying_double(__vec16_d v) {
+    __vec16_d ret;
+    ret.v1 = _mm512_floor_pd(v.v1);
+    ret.v2 = _mm512_floor_pd(v.v2);
+    return ret;
+}
+
+static FORCEINLINE __vec16_d __ceil_varying_double(__vec16_d v) {
+    __vec16_d ret;
+    ret.v1 = _mm512_ceil_pd(v.v1);
+    ret.v2 = _mm512_ceil_pd(v.v2);
+    return ret;
+}
+#endif
+
+
+// min/max
+
+static FORCEINLINE float __min_uniform_float(float a, float b) { return (a<b) ? a : b; }
+static FORCEINLINE float __max_uniform_float(float a, float b) { return (a>b) ? a : b; }
+static FORCEINLINE double __min_uniform_double(double a, double b) { return (a<b) ? a : b; }
+static FORCEINLINE double __max_uniform_double(double a, double b) { return (a>b) ? a : b; }
+
+static FORCEINLINE int32_t __min_uniform_int32(int32_t a, int32_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int32_t __max_uniform_int32(int32_t a, int32_t b) { return (a>b) ? a : b; }
+static FORCEINLINE int32_t __min_uniform_uint32(uint32_t a, uint32_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int32_t __max_uniform_uint32(uint32_t a, uint32_t b) { return (a>b) ? a : b; }
+
+static FORCEINLINE int64_t __min_uniform_int64(int64_t a, int64_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int64_t __max_uniform_int64(int64_t a, int64_t b) { return (a>b) ? a : b; }
+static FORCEINLINE int64_t __min_uniform_uint64(uint64_t a, uint64_t b) { return (a<b) ? a : b; }
+static FORCEINLINE int64_t __max_uniform_uint64(uint64_t a, uint64_t b) { return (a>b) ?
a : b; } + + +#if 0 +BINARY_OP_FUNC(__vec16_f, __max_varying_float, __max_uniform_float) +BINARY_OP_FUNC(__vec16_f, __min_varying_float, __min_uniform_float) +BINARY_OP_FUNC(__vec16_d, __max_varying_double, __max_uniform_double) +BINARY_OP_FUNC(__vec16_d, __min_varying_double, __min_uniform_double) +#else +static FORCEINLINE __vec16_f __max_varying_float (__vec16_f v1, __vec16_f v2) { return _mm512_gmax_ps(v1, v2);} +static FORCEINLINE __vec16_f __min_varying_float (__vec16_f v1, __vec16_f v2) { return _mm512_gmin_ps(v1, v2);} +static FORCEINLINE __vec16_d __max_varying_double(__vec16_d v1, __vec16_d v2) { return __vec16_d(_mm512_gmax_pd(v1.v1, v2.v1),_mm512_gmax_pd(v1.v2,v2.v2));} +static FORCEINLINE __vec16_d __min_varying_double(__vec16_d v1, __vec16_d v2) { return __vec16_d(_mm512_gmin_pd(v1.v1, v2.v1),_mm512_gmin_pd(v1.v2,v2.v2));} +#endif + +#if 0 +BINARY_OP_FUNC(__vec16_i32, __max_varying_int32, __max_uniform_int32) +BINARY_OP_FUNC(__vec16_i32, __min_varying_int32, __min_uniform_int32) +BINARY_OP_FUNC(__vec16_i32, __max_varying_uint32, __max_uniform_uint32) +BINARY_OP_FUNC(__vec16_i32, __min_varying_uint32, __min_uniform_uint32) +#else +static FORCEINLINE __vec16_i32 __max_varying_int32 (__vec16_i32 v1, __vec16_i32 v2) { return _mm512_max_epi32(v1, v2);} +static FORCEINLINE __vec16_i32 __min_varying_int32 (__vec16_i32 v1, __vec16_i32 v2) { return _mm512_min_epi32(v1, v2);} +static FORCEINLINE __vec16_i32 __max_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) { return _mm512_max_epu32(v1, v2);} +static FORCEINLINE __vec16_i32 __min_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) { return _mm512_min_epu32(v1, v2);} +#endif + +BINARY_OP_FUNC(__vec16_i64, __max_varying_int64, __max_uniform_int64) +BINARY_OP_FUNC(__vec16_i64, __min_varying_int64, __min_uniform_int64) +BINARY_OP_FUNC(__vec16_i64, __max_varying_uint64, __max_uniform_uint64) +BINARY_OP_FUNC(__vec16_i64, __min_varying_uint64, __min_uniform_uint64) + +// sqrt/rsqrt/rcp + +static FORCEINLINE float __rsqrt_uniform_float(float v) { + return 1.f / sqrtf(v); +} + +static FORCEINLINE float __rcp_uniform_float(float v) { + return 1.f / v; +} + +static FORCEINLINE float __sqrt_uniform_float(float v) { + return sqrtf(v); +} + +static FORCEINLINE double __sqrt_uniform_double(double v) { + return sqrt(v); +} + +#if 0 +UNARY_OP(__vec16_f, __rcp_varying_float, __rcp_uniform_float) +UNARY_OP(__vec16_f, __rsqrt_varying_float, __rsqrt_uniform_float) +UNARY_OP(__vec16_f, __sqrt_varying_float, __sqrt_uniform_float) +UNARY_OP(__vec16_d, __sqrt_varying_double, __sqrt_uniform_double) +#else +static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) { +#ifdef ISPC_FAST_MATH + return _mm512_rcp23_ps(v); // Approximation with 23 bits of accuracy. 
+#else + return _mm512_recip_ps(v); +#endif +} + +static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) { +#ifdef ISPC_FAST_MATH + return _mm512_rsqrt23_ps(v); // Approximation with 0.775ULP accuracy +#else + return _mm512_invsqrt_ps(v); +#endif +} +static FORCEINLINE __vec16_f __sqrt_varying_float (__vec16_f v) { return _mm512_sqrt_ps(v);} +static FORCEINLINE __vec16_d __sqrt_varying_double(__vec16_d v) { return __vec16_d(_mm512_sqrt_pd(v.v1),_mm512_sqrt_pd(v.v2));} +#endif + +/////////////////////////////////////////////////////////////////////////// +// svml +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec16_f __svml_logf(__vec16_f v) { return _mm512_log_ps(v); } +static FORCEINLINE __vec16_f __svml_expf(__vec16_f v) { return _mm512_exp_ps(v); } +static FORCEINLINE __vec16_f __svml_cosf(__vec16_f v) { return _mm512_cos_ps(v); } +static FORCEINLINE __vec16_f __svml_powf(__vec16_f a, __vec16_f b) { return _mm512_pow_ps(a,b); } + +static FORCEINLINE __vec16_d __svml_logd(__vec16_d v) { return __vec16_d(_mm512_log_pd(v.v1), _mm512_log_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_expd(__vec16_d v) { return __vec16_d(_mm512_exp_pd(v.v1), _mm512_exp_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_cosd(__vec16_d v) { return __vec16_d(_mm512_cos_pd(v.v1), _mm512_cos_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_powd(__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_pow_pd(a.v1,b.v1), _mm512_pow_pd(a.v2,b.v2)); } + +/////////////////////////////////////////////////////////////////////////// +// bit ops + +static FORCEINLINE int32_t __popcnt_int32(uint32_t v) { + int count = 0; + for (; v != 0; v >>= 1) + count += (v & 1); + return count; +} + +static FORCEINLINE int32_t __popcnt_int64(uint64_t v) { + int count = 0; + for (; v != 0; v >>= 1) + count += (v & 1); + return count; +} + +static FORCEINLINE int32_t __count_trailing_zeros_i32(uint32_t v) { + if (v == 0) + return 32; + + int count = 0; + while ((v & 1) == 0) { + ++count; + v >>= 1; + } + return count; +} + +static FORCEINLINE int64_t __count_trailing_zeros_i64(uint64_t v) { + if (v == 0) + return 64; + + int count = 0; + while ((v & 1) == 0) { + ++count; + v >>= 1; + } + return count; +} + +static FORCEINLINE int32_t __count_leading_zeros_i32(uint32_t v) { + if (v == 0) + return 32; + + int count = 0; + while ((v & (1<<31)) == 0) { + ++count; + v <<= 1; + } + return count; +} + +static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { + if (v == 0) + return 64; + + int count = 0; + while ((v & (1ull<<63)) == 0) { + ++count; + v <<= 1; + } + return count; +} + +/////////////////////////////////////////////////////////////////////////// +// reductions + +#if 0 +REDUCE_ADD(float, __vec16_f, __reduce_add_float) +REDUCE_MINMAX(float, __vec16_f, __reduce_min_float, <) +REDUCE_MINMAX(float, __vec16_f, __reduce_max_float, >) +#else +static FORCEINLINE float __reduce_add_float(__vec16_f v) { return _mm512_reduce_add_ps(v); } +static FORCEINLINE float __reduce_min_float(__vec16_f v) { return _mm512_reduce_min_ps(v); } +static FORCEINLINE float __reduce_max_float(__vec16_f v) { return _mm512_reduce_max_ps(v); } +#endif + +#if 0 +REDUCE_ADD(double, __vec16_d, __reduce_add_double) +REDUCE_MINMAX(double, __vec16_d, __reduce_min_double, <) +REDUCE_MINMAX(double, __vec16_d, __reduce_max_double, >) +#else +static FORCEINLINE float __reduce_add_double(__vec16_d v) { return _mm512_reduce_add_pd(v.v1) + _mm512_reduce_add_pd(v.v2); } +static FORCEINLINE float 
__reduce_min_double(__vec16_d v) { return std::min(_mm512_reduce_min_pd(v.v1), _mm512_reduce_min_pd(v.v2)); } +static FORCEINLINE float __reduce_max_double(__vec16_d v) { return std::max(_mm512_reduce_max_pd(v.v1), _mm512_reduce_max_pd(v.v2)); } +#endif + + + +#if 0 +REDUCE_ADD (int64_t, __vec16_i32, __reduce_add_int32) +REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_min_int32, <) +REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_max_int32, >) +REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_min_uint32, <) +REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_max_uint32, >) +#else +static FORCEINLINE int64_t __reduce_add_int32 (__vec16_i32 v) { return _mm512_reduce_add_epi32(v);} +static FORCEINLINE int32_t __reduce_min_int32 (__vec16_i32 v) { return _mm512_reduce_min_epi32(v);} +static FORCEINLINE int32_t __reduce_max_int32 (__vec16_i32 v) { return _mm512_reduce_max_epi32(v);} +static FORCEINLINE uint32_t __reduce_min_uint32 (__vec16_i32 v) { return _mm512_reduce_min_epu32(v);} +static FORCEINLINE uint32_t __reduce_max_uint32 (__vec16_i32 v) { return _mm512_reduce_max_epu32(v);} +#endif + +REDUCE_ADD ( int16_t, __vec16_i8, __reduce_add_int8) +REDUCE_ADD ( int32_t, __vec16_i16, __reduce_add_int16) +REDUCE_ADD ( int64_t, __vec16_i64, __reduce_add_int64) +REDUCE_MINMAX( int64_t, __vec16_i64, __reduce_min_int64, <) +REDUCE_MINMAX( int64_t, __vec16_i64, __reduce_max_int64, >) +REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_min_uint64, <) +REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >) + +/////////////////////////////////////////////////////////////////////////// +// masked load/store + +static FORCEINLINE __vec16_i8 __masked_load_i8(void *p, + __vec16_i1 mask) { + __vec16_i8 ret; + int8_t *ptr = (int8_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +static FORCEINLINE __vec16_i16 __masked_load_i16(void *p, + __vec16_i1 mask) { + __vec16_i16 ret; + int16_t *ptr = (int16_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +#if 0 +static FORCEINLINE __vec16_i32 __masked_load_i32(void *p, + __vec16_i1 mask) { + __vec16_i32 ret; + int32_t *ptr = (int32_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} +#else +static FORCEINLINE __vec16_i32 __masked_load_i32(void *p, __vec16_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return _mm512_mask_load_epi32(__vec16_i32(), mask, p); +#else + __vec16_i32 tmp; + tmp.v = _mm512_mask_extloadunpacklo_epi32(tmp.v, 0xFFFF, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_extloadunpackhi_epi32(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __vec16_i32 ret; + return _mm512_mask_mov_epi32(ret.v, mask, tmp.v); +#endif +} +#endif + +#if 0 +static FORCEINLINE __vec16_f __masked_load_float(void *p, + __vec16_i1 mask) { + __vec16_f ret; + float *ptr = (float *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} +#else +static FORCEINLINE __vec16_f __masked_load_float(void *p, __vec16_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return _mm512_mask_load_ps(_mm512_undefined_ps(), mask,p); +#else + __vec16_f tmp; + tmp.v = _mm512_mask_extloadunpacklo_ps(tmp.v, 0xFFFF, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_extloadunpackhi_ps(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + __vec16_f ret; + return _mm512_mask_mov_ps(ret.v, mask, tmp.v); +#endif +} +#endif + +static FORCEINLINE 
__vec16_i64 __masked_load_i64(void *p, + __vec16_i1 mask) { + __vec16_i64 ret; + int64_t *ptr = (int64_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +#if 0 +static FORCEINLINE __vec16_d __masked_load_double(void *p, + __vec16_i1 mask) { + __vec16_d ret; + double *ptr = (double *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} +#else +static FORCEINLINE __vec16_d __masked_load_double(void *p, __vec16_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + __vec16_d ret; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + ret.v1 = _mm512_mask_load_pd(ret.v1, mask, p); + ret.v2 = _mm512_mask_load_pd(ret.v2, tmp_m, (uint8_t*)p+64); + return ret; +#else + __vec16_d tmp; + tmp.v1 = _mm512_mask_extloadunpacklo_pd(tmp.v1, 0xFF, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_mask_extloadunpackhi_pd(tmp.v1, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_mask_extloadunpacklo_pd(tmp.v2, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_mask_extloadunpackhi_pd(tmp.v2, 0xFF, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + __vec16_d ret; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + ret.v1 = _mm512_mask_mov_pd(ret.v1, mask, tmp.v1); + ret.v2 = _mm512_mask_mov_pd(ret.v2, tmp_m, tmp.v2); + return ret; +#endif +} +#endif + + +static FORCEINLINE void __masked_store_i8(void *p, __vec16_i8 val, + __vec16_i1 mask) { + int8_t *ptr = (int8_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +static FORCEINLINE void __masked_store_i16(void *p, __vec16_i16 val, + __vec16_i1 mask) { + int16_t *ptr = (int16_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +#if 0 +static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, + __vec16_i1 mask) { + int32_t *ptr = (int32_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} +#else +static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_mask_store_epi32(p, mask, val.v); +#else + __vec16_i32 tmp; + tmp.v = _mm512_extloadunpacklo_epi32(tmp.v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_extloadunpackhi_epi32(tmp.v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_mov_epi32(tmp.v, mask, val.v); + _mm512_extpackstorelo_epi32(p, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +#endif +} +#endif + +#if 0 +static FORCEINLINE void __masked_store_float(void *p, __vec16_f val, + __vec16_i1 mask) { + float *ptr = (float *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} +#else +static FORCEINLINE void __masked_store_float(void *p, __vec16_f val, + __vec16_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_mask_store_ps(p, mask, val.v); +#else + __vec16_f tmp; + tmp.v = _mm512_extloadunpacklo_ps(tmp.v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_extloadunpackhi_ps(tmp.v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_mov_ps(tmp.v, mask, val.v); + _mm512_extpackstorelo_ps(p, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_ps((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); +#endif +} +#endif + +static FORCEINLINE void 
__masked_store_i64(void *p, __vec16_i64 val, + __vec16_i1 mask) { + int64_t *ptr = (int64_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +#if 0 +static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, + __vec16_i1 mask) { + double *ptr = (double *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} +#else +static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, + __vec16_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + _mm512_mask_store_pd(p, mask, val.v1); + _mm512_mask_store_pd((uint8_t*)p+64, tmp_m, val.v2); +#else + __vec16_d tmp; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + tmp.v1 = _mm512_extloadunpacklo_pd(tmp.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_extloadunpackhi_pd(tmp.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_extloadunpacklo_pd(tmp.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_extloadunpackhi_pd(tmp.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_mask_mov_pd(tmp.v1, mask, val.v1); + tmp.v2 = _mm512_mask_mov_pd(tmp.v2, tmp_m, val.v2); + _mm512_extpackstorelo_pd(p, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_pd((uint8_t*)p+64, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+128, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); +#endif +} +#endif + +static FORCEINLINE void __masked_store_blend_i8(void *p, __vec16_i8 val, + __vec16_i1 mask) { + __masked_store_i8(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i16(void *p, __vec16_i16 val, + __vec16_i1 mask) { + __masked_store_i16(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i32(void *p, __vec16_i32 val, + __vec16_i1 mask) { + __masked_store_i32(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val, + __vec16_i1 mask) { + __masked_store_float(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i64(void *p, __vec16_i64 val, + __vec16_i1 mask) { + __masked_store_i64(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val, + __vec16_i1 mask) { + __masked_store_double(p, val, mask); +} + +/////////////////////////////////////////////////////////////////////////// +// gather/scatter + +// offsets * offsetScale is in bytes (for all of these) + +#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, __vec16_i1 mask) { \ + VTYPE ret; \ + int8_t *base = (int8_t *)b; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset[i]); \ + ret[i] = *ptr; \ + } \ + return ret; \ +} + + +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) +#else +static FORCEINLINE __vec16_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) +{ + // (iw): need to temporarily store as int because gathers can only return ints. 
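+  // _MM_UPCONV_EPI32_SINT8 tells the gather to read one byte per lane and
+  // sign-extend it to 32 bits; the extstore below packs the lanes back to int8.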
+ __vec16_i32 tmp = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, base, + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + // now, downconverting to chars into temporary char vector + __vec16_i8 ret; + _mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; +} +#endif +GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) +/****************/ +GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) +GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32) +#else +static FORCEINLINE __vec16_i32 __gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) +{ + return _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, + base, _MM_UPCONV_EPI32_NONE, scale, + _MM_HINT_NONE); +} +#endif +GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float) +#else +static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) +{ + return _mm512_mask_i32extgather_ps(_mm512_undefined_ps(), mask, offsets, + base, _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); +} +#endif +GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float) +/****************/ +GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) +GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_base_offsets32_double) +#else +static FORCEINLINE __vec16_d __gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) +{ + __vec16_d ret; + ret.v1 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); + __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); + const __mmask8 mask8 = 0x00FF & (mask >> 8); /* evghenii::testme */ + ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask8, shuffled_offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); + return ret; +} +#endif +GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_base_offsets64_double) + +#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)ptrs[i]; \ + ret[i] = *ptr; \ + } \ + return ret; \ +} +#define GATHER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ +static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ + return FUNC1(0, 1, ptrs, mask); \ +} + + +#if 1 +/***********/ +GATHER_GENERALF(__vec16_i8, int8_t, __vec16_i32, __gather32_i8, __gather_base_offsets32_i8) +GATHER_GENERALF(__vec16_i16, int16_t, __vec16_i32, __gather32_i16, __gather_base_offsets32_i16) +GATHER_GENERALF(__vec16_i32, int32_t, __vec16_i32, __gather32_i32, __gather_base_offsets32_i32) +GATHER_GENERALF(__vec16_i64, int64_t, __vec16_i32, __gather32_i64, __gather_base_offsets32_i64) +GATHER_GENERALF(__vec16_f, float, __vec16_i32, __gather32_float, __gather_base_offsets32_float) +GATHER_GENERALF(__vec16_d, double, 
__vec16_i32, __gather32_double, __gather_base_offsets32_double) +/***********/ +GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __gather64_i8); +GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __gather64_i16); +GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __gather64_i32); +GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64); +GATHER_GENERAL(__vec16_f, float, __vec16_i64, __gather64_float); +GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double); +/***********/ +#endif + +// scatter + +#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, VTYPE val, \ + __vec16_i1 mask) { \ + int8_t *base = (int8_t *)b; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset[i]); \ + *ptr = val[i]; \ + } \ +} + + +/*****************/ +SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8) +SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8) +/*****************/ +SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16) +SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16) +/*****************/ +#if 0 +SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32) +#else +static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec16_i32 offsets, __vec16_i32 val, __vec16_i1 mask) +{ + _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); +} +#endif +SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) +/*****************/ +#if 0 +SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float) +#else +static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets, + __vec16_f val, __vec16_i1 mask) +{ + _mm512_mask_i32extscatter_ps(base, mask, offsets, val, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); +} +#endif +SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float) +/*****************/ +SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) +SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) +/*****************/ +#if 0 /* evghenii::to implement */ +SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_base_offsets32_double) +#else /* evghenii:testme */ +static FORCEINLINE void __scatter_base_offsets32_double(void *base, uint32_t scale, __vec16_i32 offsets, + __vec16_d val, __vec16_i1 mask) +{ + _mm512_mask_i32loextscatter_pd(base, mask, offsets, val.v1, + _MM_DOWNCONV_PD_NONE, scale, + _MM_HINT_NONE); + __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); + const __mmask8 mask8 = 0x00FF & (mask >> 8); /* evghenii::testme */ + _mm512_mask_i32loextscatter_pd(base, mask8, shuffled_offsets, val.v2, + _MM_DOWNCONV_PD_NONE, scale, + _MM_HINT_NONE); +} +#endif +SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_base_offsets64_double) + +#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ +static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)ptrs[i]; \ + *ptr = val[i]; \ + } \ +} +#define SCATTER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ +static 
FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ + return FUNC1(0, 1, ptrs, val, mask); \ +} + +#if 1 +/***********/ +SCATTER_GENERALF(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8, __scatter_base_offsets32_i8) +SCATTER_GENERALF(__vec16_i16, int16_t, __vec16_i32, __scatter32_i16, __scatter_base_offsets32_i16) +SCATTER_GENERALF(__vec16_i32, int32_t, __vec16_i32, __scatter32_i32, __scatter_base_offsets32_i32) +SCATTER_GENERALF(__vec16_i64, int64_t, __vec16_i32, __scatter32_i64, __scatter_base_offsets32_i64) +SCATTER_GENERALF(__vec16_f, float, __vec16_i32, __scatter32_float, __scatter_base_offsets32_float) +SCATTER_GENERALF(__vec16_d, double, __vec16_i32, __scatter32_double, __scatter_base_offsets32_double) +/***********/ +SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __scatter64_i8) +SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __scatter64_i16) +SCATTER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __scatter64_i32) +SCATTER_GENERAL(__vec16_f, float, __vec16_i64, __scatter64_float) +SCATTER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __scatter64_i64) +SCATTER_GENERAL(__vec16_d, double, __vec16_i64, __scatter64_double) +/***********/ +#endif + +/////////////////////////////////////////////////////////////////////////// +// packed load/store + +#if 0 +static FORCEINLINE int32_t __packed_load_active(int32_t *ptr, __vec16_i32 *val, + __vec16_i1 mask) { + int count = 0; + for (int i = 0; i < 16; ++i) { + if ((mask.v & (1 << i)) != 0) { + val->operator[](i) = *ptr++; + ++count; + } + } + return count; +} +#endif + +#if 0 +static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, + __vec16_i32 val, + __vec16_i1 mask) { + int count = 0; + for (int i = 0; i < 16; ++i) { + if ((mask.v & (1 << i)) != 0) { + *ptr++ = val[i]; + ++count; + } + } + return count; +} +#endif + +#if 0 +static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, + __vec16_i32 *val, + __vec16_i1 mask) { + int count = 0; + for (int i = 0; i < 16; ++i) { + if ((mask.v & (1 << i)) != 0) { + val->operator[](i) = *ptr++; + ++count; + } + } + return count; +} +static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, + __vec16_i32 val, + __vec16_i1 mask) { + int count = 0; + for (int i = 0; i < 16; ++i) { + if ((mask.v & (1 << i)) != 0) { + *ptr++ = val[i]; + ++count; + } + } + return count; +} +#endif + +#if 1 +static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec16_i32 *val, + __vec16_i1 mask) { + __vec16_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(mask)); +} +#endif + +#if 1 +static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, + __vec16_i1 mask) { + _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(mask)); +} +#endif + +#if 1 +static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec16_i32 *val, + __vec16_i1 mask) { + __vec16_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(mask)); +} +#endif + +#if 1 +static 
FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec16_i32 val, + __vec16_i1 mask) { + _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(mask)); +} +#endif + +/////////////////////////////////////////////////////////////////////////// +// aos/soa + +static FORCEINLINE void __soa_to_aos3_float(__vec16_f v0, __vec16_f v1, __vec16_f v2, + float *ptr) { + for (int i = 0; i < 16; ++i) { + *ptr++ = __extract_element(v0, i); + *ptr++ = __extract_element(v1, i); + *ptr++ = __extract_element(v2, i); + } +} + +static FORCEINLINE void __aos_to_soa3_float(float *ptr, __vec16_f *out0, __vec16_f *out1, + __vec16_f *out2) { + for (int i = 0; i < 16; ++i) { + __insert_element(out0, i, *ptr++); + __insert_element(out1, i, *ptr++); + __insert_element(out2, i, *ptr++); + } +} + +static FORCEINLINE void __soa_to_aos4_float(__vec16_f v0, __vec16_f v1, __vec16_f v2, + __vec16_f v3, float *ptr) { + for (int i = 0; i < 16; ++i) { + *ptr++ = __extract_element(v0, i); + *ptr++ = __extract_element(v1, i); + *ptr++ = __extract_element(v2, i); + *ptr++ = __extract_element(v3, i); + } +} + +static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec16_f *out0, __vec16_f *out1, + __vec16_f *out2, __vec16_f *out3) { + for (int i = 0; i < 16; ++i) { + __insert_element(out0, i, *ptr++); + __insert_element(out1, i, *ptr++); + __insert_element(out2, i, *ptr++); + __insert_element(out3, i, *ptr++); + } +} + +/////////////////////////////////////////////////////////////////////////// +// prefetch + +static FORCEINLINE void __prefetch_read_uniform_1(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T0); // prefetch into L1$ +} + +static FORCEINLINE void __prefetch_read_uniform_2(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T1); // prefetch into L2$ +} + +static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *p) { + // There is no L3$ on KNC, don't want to pollute L2$ unecessarily +} + +static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T2); // prefetch into L2$ with non-temporal hint + // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint +} + +/////////////////////////////////////////////////////////////////////////// +// atomics + +static FORCEINLINE uint32_t __atomic_add(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAdd((LONG volatile *)p, v) - v; +#else + return __sync_fetch_and_add(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_sub(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAdd((LONG volatile *)p, -v) + v; +#else + return __sync_fetch_and_sub(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_and(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAnd((LONG volatile *)p, v); +#else + return __sync_fetch_and_and(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_or(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedOr((LONG volatile *)p, v); +#else + return __sync_fetch_and_or(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_xor(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedXor((LONG volatile *)p, v); +#else + return __sync_fetch_and_xor(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_min(uint32_t *p, uint32_t v) { + int32_t old, min; + do { + old = *((volatile int32_t *)p); + min = (old < (int32_t)v) ? 
old : (int32_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_max(uint32_t *p, uint32_t v) { + int32_t old, max; + do { + old = *((volatile int32_t *)p); + max = (old > (int32_t)v) ? old : (int32_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_umin(uint32_t *p, uint32_t v) { + uint32_t old, min; + do { + old = *((volatile uint32_t *)p); + min = (old < v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_umax(uint32_t *p, uint32_t v) { + uint32_t old, max; + do { + old = *((volatile uint32_t *)p); + max = (old > v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_xchg(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedExchange((LONG volatile *)p, v); +#else + return __sync_lock_test_and_set(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_cmpxchg(uint32_t *p, uint32_t cmpval, + uint32_t newval) { +#ifdef _MSC_VER + return InterlockedCompareExchange((LONG volatile *)p, newval, cmpval); +#else + return __sync_val_compare_and_swap(p, cmpval, newval); +#endif +} + +static FORCEINLINE uint64_t __atomic_add(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAdd64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_add(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_sub(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAdd64((LONGLONG volatile *)p, -v) + v; +#else + return __sync_fetch_and_sub(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_and(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAnd64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_and(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_or(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedOr64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_or(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_xor(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedXor64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_xor(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_min(uint64_t *p, uint64_t v) { + int64_t old, min; + do { + old = *((volatile int64_t *)p); + min = (old < (int64_t)v) ? old : (int64_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_max(uint64_t *p, uint64_t v) { + int64_t old, max; + do { + old = *((volatile int64_t *)p); + max = (old > (int64_t)v) ? 
old : (int64_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_umin(uint64_t *p, uint64_t v) { + uint64_t old, min; + do { + old = *((volatile uint64_t *)p); + min = (old < v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_umax(uint64_t *p, uint64_t v) { + uint64_t old, max; + do { + old = *((volatile uint64_t *)p); + max = (old > v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_xchg(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedExchange64((LONGLONG volatile *)p, v); +#else + return __sync_lock_test_and_set(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, + uint64_t newval) { +#ifdef _MSC_VER + return InterlockedCompareExchange64((LONGLONG volatile *)p, newval, cmpval); +#else + return __sync_val_compare_and_swap(p, cmpval, newval); +#endif +} + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} + +#endif // !WIN32 + +#undef FORCEINLINE +#undef PRE_ALIGN +#undef POST_ALIGN diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h new file mode 100644 index 00000000..de9bddcc --- /dev/null +++ b/examples/intrinsics/knc-i1x8.h @@ -0,0 +1,2862 @@ +/** + Copyright (c) 2010-2012, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include +#include +#include +#include + +#if 0 +#define __ZMM32BIT__ +#endif + + +#ifdef _MSC_VER +#define FORCEINLINE __forceinline +#define PRE_ALIGN(x) /*__declspec(align(x))*/ +#define POST_ALIGN(x) +#define roundf(x) (floorf(x + .5f)) +#define round(x) (floor(x + .5)) +#else +#define FORCEINLINE __attribute__((always_inline)) +#define PRE_ALIGN(x) +#define POST_ALIGN(x) __attribute__ ((aligned(x))) +#endif + +#if 0 +#define KNC 1 +extern "C" +{ + int printf(const unsigned char *, ...); + int puts(unsigned char *); + unsigned int putchar(unsigned int); + int fflush(void *); + uint8_t *memcpy(uint8_t *, uint8_t *, uint64_t); + uint8_t *memset(uint8_t *, uint8_t, uint64_t); + void memset_pattern16(void *, const void *, uint64_t); +} +#endif + +typedef float __vec1_f; +typedef double __vec1_d; +typedef int8_t __vec1_i8; +typedef int16_t __vec1_i16; +typedef int32_t __vec1_i32; +typedef int64_t __vec1_i64; + +struct __vec8_i1 { + __vec8_i1() { } + __vec8_i1(const __mmask16 &vv) : v(vv) { } + __vec8_i1(bool v0, bool v1, bool v2, bool v3, + bool v4, bool v5, bool v6, bool v7) { + v = ((v0 & 1) | + ((v1 & 1) << 1) | + ((v2 & 1) << 2) | + ((v3 & 1) << 3) | + ((v4 & 1) << 4) | + ((v5 & 1) << 5) | + ((v6 & 1) << 6) | + ((v7 & 1) << 7) ); + } + + __mmask8 v; + FORCEINLINE operator __mmask8() const { return v; } +}; + + +template +struct vec8 { + vec8() { } + vec8(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) { + data[0] = v0; data[1] = v1; data[2] = v2; data[3] = v3; + data[4] = v4; data[5] = v5; data[6] = v6; data[7] = v7; + } + T data[8]; + FORCEINLINE const T& operator[](const int i) const { return data[i]; } + FORCEINLINE T& operator[](const int i) { return data[i]; } +}; + +/****************/ + +#ifndef __ZMM32BIT__ +struct PRE_ALIGN(32) __vec8_i32 : public vec8 { + __vec8_i32() { } + FORCEINLINE __vec8_i32(int32_t v0, int32_t v1, int32_t v2, int32_t v3, + int32_t v4, int32_t v5, int32_t v6, int32_t v7) + : vec8(v0, v1, v2, v3, v4, v5, v6, v7) { } + FORCEINLINE __vec8_i32(__m512i v) + { + union { __m512i v; int32_t s[8]; } val = {v}; + data[0] = val.s[0]; + data[1] = val.s[1]; + data[2] = val.s[2]; + data[3] = val.s[3]; + data[4] = val.s[4]; + data[5] = val.s[5]; + data[6] = val.s[6]; + data[7] = val.s[7]; + } + FORCEINLINE operator __m512i() const + { + return _mm512_set_16to16_pi( + 0,0,0,0, 0,0,0,0, + data[7],data[6],data[5],data[4],data[3],data[2],data[1],data[0]); + } +} POST_ALIGN(32); +#else /* __ZMM32BIT__ */ +struct PRE_ALIGN(32) __vec8_i32 +{ + __m512i v; + FORCEINLINE operator __m512i() const { return v; } + FORCEINLINE __vec8_i32() : v(_mm512_undefined_epi32()) {} + FORCEINLINE __vec8_i32(const __m512i &in) : v(in) {} + FORCEINLINE __vec8_i32(const __vec8_i32 &o) : v(o.v) {} + FORCEINLINE __vec8_i32& operator =(const __vec8_i32 &o) { v=o.v; return *this; } + FORCEINLINE __vec8_i32(int32_t v00, int32_t v01, int32_t v02, int32_t v03, + int32_t v04, int32_t v05, int32_t v06, int32_t v07) : + v ( _mm512_set_16to16_pi(0,0,0,0,0,0,0,0, 
v07, v06, v05, v04, v03, v02, v01, v00) ) {} + FORCEINLINE const int32_t& operator[](const int i) const { return ((int32_t*)this)[i]; } + FORCEINLINE int32_t& operator[](const int i) { return ((int32_t*)this)[i]; } +} POST_ALIGN(32); +#endif /* __ZMM32BIT__ */ + +#ifndef __ZMM32BIT__ /* __ZMM32BIT__ */ +PRE_ALIGN(32) struct __vec8_f : public vec8 { + __vec8_f() { } + FORCEINLINE __vec8_f(float v0, float v1, float v2, float v3, + float v4, float v5, float v6, float v7) + : vec8(v0, v1, v2, v3, v4, v5, v6, v7) { } + FORCEINLINE operator __m512() const + { + return _mm512_set_16to16_ps( + 0,0,0,0,0,0,0,0, + data[7],data[6],data[5],data[4],data[3],data[2],data[1],data[0]); + } + FORCEINLINE __vec8_f(__m512 v) + { + union { __m512 v; float s[8]; } val = {v}; + data[0] = val.s[0]; + data[1] = val.s[1]; + data[2] = val.s[2]; + data[3] = val.s[3]; + data[4] = val.s[4]; + data[5] = val.s[5]; + data[6] = val.s[6]; + data[7] = val.s[7]; + } +} POST_ALIGN(32); +#else /* __ZMM32BIT__ */ +PRE_ALIGN(32) struct __vec8_f +{ + __m512 v; + FORCEINLINE operator __m512() const { return v; } + FORCEINLINE __vec8_f() : v(_mm512_undefined_ps()) { } + FORCEINLINE __vec8_f(const __m512 &in) : v(in) {} + FORCEINLINE __vec8_f(const __vec8_f &o) : v(o.v) {} + FORCEINLINE __vec8_f& operator =(const __vec8_f &o) { v=o.v; return *this; } + FORCEINLINE __vec8_f(float v00, float v01, float v02, float v03, + float v04, float v05, float v06, float v07) : + v ( _mm512_set_16to16_ps(0,0,0,0,0,0,0,0, v07, v06, v05, v04, v03, v02, v01, v00) ) {} + FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; } + FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } +} POST_ALIGN(32); +#endif /* __ZMM32BIT__ */ + +struct PRE_ALIGN(64) __vec8_d +{ + __m512d v; + FORCEINLINE __vec8_d() : v(_mm512_undefined_pd()) {} + FORCEINLINE __vec8_d(const __m512d _v) : v(_v) {} + FORCEINLINE __vec8_d(const __vec8_d &o) : v(o.v) {} + FORCEINLINE __vec8_d& operator =(const __vec8_d &o) { v=o.v; return *this; } + FORCEINLINE operator __m512d() const { return v; } + FORCEINLINE __vec8_d(double v00, double v01, double v02, double v03, + double v04, double v05, double v06, double v07) : + v ( _mm512_set_8to8_pd(v07, v06, v05, v04, v03, v02, v01, v00) ) {} + FORCEINLINE const double& operator[](const int i) const { return ((double*)this)[i]; } + FORCEINLINE double& operator[](const int i) { return ((double*)this)[i]; } +} POST_ALIGN(64); + +/****************/ + +PRE_ALIGN(64) struct __vec8_i64 : public vec8 { + __vec8_i64() { } + __vec8_i64(int64_t v0, int64_t v1, int64_t v2, int64_t v3, + int64_t v4, int64_t v5, int64_t v6, int64_t v7) + : vec8(v0, v1, v2, v3, v4, v5, v6, v7) { } +} POST_ALIGN(64); + +PRE_ALIGN(16) struct __vec8_i8 : public vec8 { + __vec8_i8() { } + __vec8_i8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, + int8_t v4, int8_t v5, int8_t v6, int8_t v7) + : vec8(v0, v1, v2, v3, v4, v5, v6, v7) { } +} POST_ALIGN(16); + +PRE_ALIGN(32) struct __vec8_i16 : public vec8 { + __vec8_i16() { } + __vec8_i16(int16_t v0, int16_t v1, int16_t v2, int16_t v3, + int16_t v4, int16_t v5, int16_t v6, int16_t v7) + : vec8(v0, v1, v2, v3, v4, v5, v6, v7) { } +} POST_ALIGN(32); + +static inline int32_t __extract_element(__vec8_i32, int); + + +/////////////////////////////////////////////////////////////////////////// +// macros... 
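+// The helper macros below generate scalar fallback implementations that
+// simply loop over the eight lanes; the int32/float/double sections later
+// in this header replace many of these with native 512-bit intrinsics.
+// As a rough sketch (whitespace aside), BINARY_OP(__vec8_i8, __add, +)
+// expands to:
+//
+//   static FORCEINLINE __vec8_i8 __add(__vec8_i8 a, __vec8_i8 b) {
+//       __vec8_i8 ret;
+//       for (int i = 0; i < 8; ++i)
+//           ret[i] = a[i] + b[i];   // per-lane scalar add
+//       return ret;
+//   }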
+ +#define UNARY_OP(TYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE v) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = OP(v[i]); \ + return ret; \ +} + +#define BINARY_OP(TYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = a[i] OP b[i]; \ + return ret; \ +} + +#define BINARY_OP_CAST(TYPE, CAST, NAME, OP) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = (CAST)(a[i]) OP (CAST)(b[i]); \ + return ret; \ +} + +#define BINARY_OP_FUNC(TYPE, NAME, FUNC) \ +static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = FUNC(a[i], b[i]); \ + return ret; \ +} + +#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP) \ +static FORCEINLINE __vec8_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \ + __vec8_i1 ret; \ + ret.v = 0; \ + for (int i = 0; i < 8; ++i) \ + ret.v |= ((CAST)(a[i]) OP (CAST)(b[i])) << i; \ + return ret; \ +} \ +static FORCEINLINE __vec8_i1 NAME##_##SUFFIX##_and_mask(TYPE a, TYPE b, \ + __vec8_i1 mask) { \ + __vec8_i1 ret; \ + ret.v = 0; \ + for (int i = 0; i < 8; ++i) \ + ret.v |= ((CAST)(a[i]) OP (CAST)(b[i])) << i; \ + ret.v &= mask.v; \ + return ret; \ +} + +#define INSERT_EXTRACT(VTYPE, STYPE) \ +static FORCEINLINE STYPE __extract_element(VTYPE v, int index) { \ + return ((STYPE *)&v)[index]; \ +} \ +static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \ + ((STYPE *)v)[index] = val; \ +} + +#define LOAD_STORE(VTYPE, STYPE) \ +template \ +static FORCEINLINE VTYPE __load(const VTYPE *p) { \ + STYPE *ptr = (STYPE *)p; \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = ptr[i]; \ + return ret; \ +} \ +template \ +static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \ + STYPE *ptr = (STYPE *)p; \ + for (int i = 0; i < 8; ++i) \ + ptr[i] = v[i]; \ +} + +#define LOADS(VTYPE, STYPE) \ +template \ +static FORCEINLINE VTYPE __load(const VTYPE *p) { \ + STYPE *ptr = (STYPE *)p; \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = ptr[i]; \ + return ret; \ +} \ + +#define STORES(VTYPE, STYPE) \ +template \ +static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \ + STYPE *ptr = (STYPE *)p; \ + for (int i = 0; i < 8; ++i) \ + ptr[i] = v[i]; \ +} + +#define REDUCE_ADD(TYPE, VTYPE, NAME) \ +static FORCEINLINE TYPE NAME(VTYPE v) { \ + TYPE ret = v[0]; \ + for (int i = 1; i < 8; ++i) \ + ret = ret + v[i]; \ + return ret; \ +} + +#define REDUCE_MINMAX(TYPE, VTYPE, NAME, OP) \ +static FORCEINLINE TYPE NAME(VTYPE v) { \ + TYPE ret = v[0]; \ + for (int i = 1; i < 8; ++i) \ + ret = (ret OP (TYPE)v[i]) ? 
ret : (TYPE)v[i]; \ + return ret; \ +} + +#define SELECT(TYPE) \ +static FORCEINLINE TYPE __select(__vec8_i1 mask, TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = (mask.v & (1< VTYPE __smear_##NAME(STYPE); \ +template <> FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = v; \ + return ret; \ +} + +#define SETZERO(VTYPE, NAME) \ +template VTYPE __setzero_##NAME(); \ +template <> FORCEINLINE VTYPE __setzero_##NAME() { \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = 0; \ + return ret; \ +} + +#define UNDEF(VTYPE, NAME) \ +template VTYPE __undef_##NAME(); \ +template <> FORCEINLINE VTYPE __undef_##NAME() { \ + return VTYPE(); \ +} + +#define BROADCAST(VTYPE, NAME, STYPE) \ +static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = v[index & 0x7]; \ + return ret; \ +} \ + +#define ROTATE(VTYPE, NAME, STYPE) \ +static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) { \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = v[(i+index) & 0x7]; \ + return ret; \ +} \ + +#define SHUFFLES(VTYPE, NAME, STYPE) \ +static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec8_i32 index) { \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = v[__extract_element(index, i) & 0x7]; \ + return ret; \ +} \ +static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec8_i32 index) { \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) { \ + int ii = __extract_element(index, i) & 0xf; \ + ret[i] = (ii < 8) ? v0[ii] : v1[ii-8]; \ + } \ + return ret; \ +} + +#define SHUFFLE2(VTYPE, NAME, STYPE) \ +static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec8_i32 index) { \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) { \ + int ii = __extract_element(index, i) & 0xf; \ + ret[i] = (ii < 8) ? 
v0[ii] : v1[ii-8]; \ + } \ + return ret; \ +} + +/////////////////////////////////////////////////////////////////////////// + +INSERT_EXTRACT(__vec1_i8, int8_t) +INSERT_EXTRACT(__vec1_i16, int16_t) +INSERT_EXTRACT(__vec1_i32, int32_t) +INSERT_EXTRACT(__vec1_i64, int64_t) +INSERT_EXTRACT(__vec1_f, float) +INSERT_EXTRACT(__vec1_d, double) + +/////////////////////////////////////////////////////////////////////////// +// mask ops + +static FORCEINLINE uint64_t __movmsk(__vec8_i1 mask) { + return (uint64_t)mask.v; +} + +static FORCEINLINE bool __any(__vec8_i1 mask) { + return (mask.v!=0); +} + +static FORCEINLINE bool __all(__vec8_i1 mask) { + return (mask.v==0xFF); +} + +static FORCEINLINE bool __none(__vec8_i1 mask) { + return (mask.v==0); +} + +static FORCEINLINE __vec8_i1 __equal_i1(__vec8_i1 a, __vec8_i1 b) { + __vec8_i1 r; + r.v = (a.v & b.v) | (~a.v & ~b.v); + return r; +} + +static FORCEINLINE __vec8_i1 __and(__vec8_i1 a, __vec8_i1 b) { + __vec8_i1 r; + r.v = a.v & b.v; + return r; +} + +static FORCEINLINE __vec8_i1 __xor(__vec8_i1 a, __vec8_i1 b) { + __vec8_i1 r; + r.v = a.v ^ b.v; + return r; +} + +static FORCEINLINE __vec8_i1 __or(__vec8_i1 a, __vec8_i1 b) { + __vec8_i1 r; + r.v = a.v | b.v; + return r; +} + +static FORCEINLINE __vec8_i1 __not(__vec8_i1 v) { + __vec8_i1 r; + r.v = ~v.v; + return r; +} + +static FORCEINLINE __vec8_i1 __and_not1(__vec8_i1 a, __vec8_i1 b) { + __vec8_i1 r; + r.v = ~a.v & b.v; + return r; +} + +static FORCEINLINE __vec8_i1 __and_not2(__vec8_i1 a, __vec8_i1 b) { + __vec8_i1 r; + r.v = a.v & ~b.v; + return r; +} + +static FORCEINLINE __vec8_i1 __select(__vec8_i1 mask, __vec8_i1 a, + __vec8_i1 b) { + __vec8_i1 r; + r.v = (a.v & mask.v) | (b.v & ~mask.v); + return r; +} + +static FORCEINLINE __vec8_i1 __select(bool cond, __vec8_i1 a, __vec8_i1 b) { + return cond ? a : b; +} + +static FORCEINLINE bool __extract_element(__vec8_i1 vec, int index) { + return (vec.v & (1 << index)) ? 
true : false; +} + +static FORCEINLINE void __insert_element(__vec8_i1 *vec, int index, + bool val) { + if (val == false) + vec->v &= ~(1 << index); + else + vec->v |= (1 << index); +} + +template static FORCEINLINE __vec8_i1 __load(const __vec8_i1 *p) { + uint8_t *ptr = (uint8_t *)p; + __vec8_i1 r; + r.v = *ptr; + return r; +} + +template static FORCEINLINE void __store(__vec8_i1 *p, __vec8_i1 v) { + uint8_t *ptr = (uint8_t *)p; + *ptr = v.v; +} + +template RetVecType __smear_i1(int i); +template <> static FORCEINLINE __vec8_i1 __smear_i1<__vec8_i1>(int i) { + return i?0xFF:0x0; +} + +template RetVecType __setzero_i1(); +template <> static FORCEINLINE __vec8_i1 __setzero_i1<__vec8_i1>() { + return 0; +} + +template __vec8_i1 __undef_i1(); +template <> FORCEINLINE __vec8_i1 __undef_i1<__vec8_i1>() { + return __vec8_i1(); +} + + +/////////////////////////////////////////////////////////////////////////// +// int8 + +BINARY_OP(__vec8_i8, __add, +) +BINARY_OP(__vec8_i8, __sub, -) +BINARY_OP(__vec8_i8, __mul, *) + +BINARY_OP(__vec8_i8, __or, |) +BINARY_OP(__vec8_i8, __and, &) +BINARY_OP(__vec8_i8, __xor, ^) +BINARY_OP(__vec8_i8, __shl, <<) + +BINARY_OP_CAST(__vec8_i8, uint8_t, __udiv, /) +BINARY_OP_CAST(__vec8_i8, int8_t, __sdiv, /) + +BINARY_OP_CAST(__vec8_i8, uint8_t, __urem, %) +BINARY_OP_CAST(__vec8_i8, int8_t, __srem, %) +BINARY_OP_CAST(__vec8_i8, uint8_t, __lshr, >>) +BINARY_OP_CAST(__vec8_i8, int8_t, __ashr, >>) + +SHIFT_UNIFORM(__vec8_i8, uint8_t, __lshr, >>) +SHIFT_UNIFORM(__vec8_i8, int8_t, __ashr, >>) +SHIFT_UNIFORM(__vec8_i8, int8_t, __shl, <<) + +CMP_OP(__vec8_i8, i8, int8_t, __equal, ==) +CMP_OP(__vec8_i8, i8, int8_t, __not_equal, !=) +CMP_OP(__vec8_i8, i8, uint8_t, __unsigned_less_equal, <=) +CMP_OP(__vec8_i8, i8, int8_t, __signed_less_equal, <=) +CMP_OP(__vec8_i8, i8, uint8_t, __unsigned_greater_equal, >=) +CMP_OP(__vec8_i8, i8, int8_t, __signed_greater_equal, >=) +CMP_OP(__vec8_i8, i8, uint8_t, __unsigned_less_than, <) +CMP_OP(__vec8_i8, i8, int8_t, __signed_less_than, <) +CMP_OP(__vec8_i8, i8, uint8_t, __unsigned_greater_than, >) +CMP_OP(__vec8_i8, i8, int8_t, __signed_greater_than, >) + +SELECT(__vec8_i8) +INSERT_EXTRACT(__vec8_i8, int8_t) +SMEAR(__vec8_i8, i8, int8_t) +SETZERO(__vec8_i8, i8) +UNDEF(__vec8_i8, i8) +BROADCAST(__vec8_i8, i8, int8_t) +ROTATE(__vec8_i8, i8, int8_t) +SHUFFLES(__vec8_i8, i8, int8_t) +LOAD_STORE(__vec8_i8, int8_t) + +/////////////////////////////////////////////////////////////////////////// +// int16 + +BINARY_OP(__vec8_i16, __add, +) +BINARY_OP(__vec8_i16, __sub, -) +BINARY_OP(__vec8_i16, __mul, *) + +BINARY_OP(__vec8_i16, __or, |) +BINARY_OP(__vec8_i16, __and, &) +BINARY_OP(__vec8_i16, __xor, ^) +BINARY_OP(__vec8_i16, __shl, <<) + +BINARY_OP_CAST(__vec8_i16, uint16_t, __udiv, /) +BINARY_OP_CAST(__vec8_i16, int16_t, __sdiv, /) + +BINARY_OP_CAST(__vec8_i16, uint16_t, __urem, %) +BINARY_OP_CAST(__vec8_i16, int16_t, __srem, %) +BINARY_OP_CAST(__vec8_i16, uint16_t, __lshr, >>) +BINARY_OP_CAST(__vec8_i16, int16_t, __ashr, >>) + +SHIFT_UNIFORM(__vec8_i16, uint16_t, __lshr, >>) +SHIFT_UNIFORM(__vec8_i16, int16_t, __ashr, >>) +SHIFT_UNIFORM(__vec8_i16, int16_t, __shl, <<) + +CMP_OP(__vec8_i16, i16, int16_t, __equal, ==) +CMP_OP(__vec8_i16, i16, int16_t, __not_equal, !=) +CMP_OP(__vec8_i16, i16, uint16_t, __unsigned_less_equal, <=) +CMP_OP(__vec8_i16, i16, int16_t, __signed_less_equal, <=) +CMP_OP(__vec8_i16, i16, uint16_t, __unsigned_greater_equal, >=) +CMP_OP(__vec8_i16, i16, int16_t, __signed_greater_equal, >=) +CMP_OP(__vec8_i16, i16, uint16_t, 
__unsigned_less_than, <) +CMP_OP(__vec8_i16, i16, int16_t, __signed_less_than, <) +CMP_OP(__vec8_i16, i16, uint16_t, __unsigned_greater_than, >) +CMP_OP(__vec8_i16, i16, int16_t, __signed_greater_than, >) + +SELECT(__vec8_i16) +INSERT_EXTRACT(__vec8_i16, int16_t) +SMEAR(__vec8_i16, i16, int16_t) +SETZERO(__vec8_i16, i16) +UNDEF(__vec8_i16, i16) +BROADCAST(__vec8_i16, i16, int16_t) +ROTATE(__vec8_i16, i16, int16_t) +SHUFFLES(__vec8_i16, i16, int16_t) +LOAD_STORE(__vec8_i16, int16_t) + +#if 0 /* evghenii::int32 */ +/////////////////////////////////////////////////////////////////////////// +// int32 + +BINARY_OP(__vec8_i32, __add, +) +BINARY_OP(__vec8_i32, __sub, -) +BINARY_OP(__vec8_i32, __mul, *) + +BINARY_OP(__vec8_i32, __or, |) +BINARY_OP(__vec8_i32, __and, &) +BINARY_OP(__vec8_i32, __xor, ^) +BINARY_OP(__vec8_i32, __shl, <<) + +BINARY_OP_CAST(__vec8_i32, uint32_t, __udiv, /) +BINARY_OP_CAST(__vec8_i32, int32_t, __sdiv, /) + +BINARY_OP_CAST(__vec8_i32, uint32_t, __urem, %) +BINARY_OP_CAST(__vec8_i32, int32_t, __srem, %) +BINARY_OP_CAST(__vec8_i32, uint32_t, __lshr, >>) +BINARY_OP_CAST(__vec8_i32, int32_t, __ashr, >>) + +SHIFT_UNIFORM(__vec8_i32, uint32_t, __lshr, >>) +SHIFT_UNIFORM(__vec8_i32, int32_t, __ashr, >>) +SHIFT_UNIFORM(__vec8_i32, int32_t, __shl, <<) + +CMP_OP(__vec8_i32, i32, int32_t, __equal, ==) +CMP_OP(__vec8_i32, i32, int32_t, __not_equal, !=) +CMP_OP(__vec8_i32, i32, uint32_t, __unsigned_less_equal, <=) +CMP_OP(__vec8_i32, i32, int32_t, __signed_less_equal, <=) +CMP_OP(__vec8_i32, i32, uint32_t, __unsigned_greater_equal, >=) +CMP_OP(__vec8_i32, i32, int32_t, __signed_greater_equal, >=) +CMP_OP(__vec8_i32, i32, uint32_t, __unsigned_less_than, <) +CMP_OP(__vec8_i32, i32, int32_t, __signed_less_than, <) +CMP_OP(__vec8_i32, i32, uint32_t, __unsigned_greater_than, >) +CMP_OP(__vec8_i32, i32, int32_t, __signed_greater_than, >) + +SELECT(__vec8_i32) +INSERT_EXTRACT(__vec8_i32, int32_t) +SMEAR(__vec8_i32, i32, int32_t) +SETZERO(__vec8_i32, i32) +UNDEF(__vec8_i32, i32) +BROADCAST(__vec8_i32, i32, int32_t) +ROTATE(__vec8_i32, i32, int32_t) +SHUFFLES(__vec8_i32, i32, int32_t) +LOAD_STORE(__vec8_i32, int32_t) + +#else /* evghenii::int32 */ +/////////////////////////////////////////////////////////////////////////// +// int32 +/////////////////////////////////////////////////////////////////////////// + +#define IZERO _mm512_setzero_epi32() +static FORCEINLINE __vec8_i32 __add(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_add_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __sub(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_sub_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __mul(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_mullo_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __udiv(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_div_epu32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __sdiv(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_div_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __urem(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_rem_epu32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __srem(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_rem_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __or(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_or_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __and(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_and_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __xor(__vec8_i32 a, __vec8_i32 b) 
{ + return _mm512_mask_xor_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __shl(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_sllv_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __lshr(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_srlv_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __ashr(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_srav_epi32(IZERO,0xFF, a, b); +} + +static FORCEINLINE __vec8_i32 __shl(__vec8_i32 a, int32_t n) { + return _mm512_mask_slli_epi32(IZERO,0xFF, a, n); +} + +static FORCEINLINE __vec8_i32 __lshr(__vec8_i32 a, int32_t n) { + return _mm512_mask_srli_epi32(IZERO,0xFF, a, n); +} + +static FORCEINLINE __vec8_i32 __ashr(__vec8_i32 a, int32_t n) { + return _mm512_mask_srai_epi32(IZERO,0xFF, a, n); +} + +static FORCEINLINE __vec8_i1 __equal_i32(const __vec8_i32 &a, const __vec8_i32 &b) { + return _mm512_mask_cmpeq_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __equal_i32_and_mask(const __vec8_i32 &a, const __vec8_i32 &b, + __vec8_i1 m) { + return _mm512_mask_cmpeq_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpneq_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmpneq_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_less_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmple_epu32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_less_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmple_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __signed_less_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmple_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __signed_less_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmple_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_greater_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpge_epu32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_greater_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmpge_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __signed_greater_equal_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpge_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __signed_greater_equal_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmpge_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_less_than_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmplt_epu32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_less_than_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmplt_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __signed_less_than_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmplt_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __signed_less_than_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmplt_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_greater_than_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpgt_epu32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __unsigned_greater_than_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmpgt_epu32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 
__signed_greater_than_i32(__vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_cmpgt_epi32_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_i1 __signed_greater_than_i32_and_mask(__vec8_i32 a, __vec8_i32 b, + __vec8_i1 m) { + return _mm512_mask_cmpgt_epi32_mask(m, a, b); +} + +static FORCEINLINE __vec8_i32 __select(__vec8_i1 mask, + __vec8_i32 a, __vec8_i32 b) { + return _mm512_mask_mov_epi32(b, mask, a); +} + +static FORCEINLINE __vec8_i32 __select(bool cond, __vec8_i32 a, __vec8_i32 b) { + return cond ? a : b; +} + +static FORCEINLINE int32_t __extract_element(__vec8_i32 v, int index) { //uint32_t index) { + return ((int32_t *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec8_i32 *v, uint32_t index, int32_t val) { + ((int32_t *)v)[index] = val; +} + +template RetVecType __smear_i32(int32_t i); +template <> static FORCEINLINE __vec8_i32 __smear_i32<__vec8_i32>(int32_t i) { + return _mm512_set_16to16_epi32(0,0,0,0,0,0,0,0, i,i,i,i,i,i,i,i); +} + +static const __vec8_i32 __ispc_one = __smear_i32<__vec8_i32>(1); +static const __vec8_i32 __ispc_thirty_two = __smear_i32<__vec8_i32>(32); +static const __vec8_i32 __ispc_ffffffff = __smear_i32<__vec8_i32>(-1); +static const __vec8_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7); + +template RetVecType __setzero_i32(); +template <> static FORCEINLINE __vec8_i32 __setzero_i32<__vec8_i32>() { + return _mm512_setzero_epi32(); +} + +template RetVecType __undef_i32(); +template <> static FORCEINLINE __vec8_i32 __undef_i32<__vec8_i32>() { + return __vec8_i32(); +} + +static FORCEINLINE __vec8_i32 __broadcast_i32(__vec8_i32 v, int index) { + int32_t val = __extract_element(v, index & 0xf); + return _mm512_set1_epi32(val); +} + +#if 0 /* evghenii::doesn't work */ +static FORCEINLINE __vec8_i32 __rotate_i32(__vec8_i32 v, int index) { + __vec8_i32 idx = __smear_i32<__vec8_i32>(index); + __vec8_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec8_i32>(0x7)); + return _mm512_mask_permutevar_epi32(v, 0xffff, shuffle, v); +} +#else +ROTATE(__vec8_i32, i32, int32_t) +#endif + +static FORCEINLINE __vec8_i32 __shuffle_i32(__vec8_i32 v, __vec8_i32 index) { + return _mm512_mask_permutevar_epi32(v, 0xffff, index, v); +} +SHUFFLE2(__vec8_i32, i32, int32_t) /* evghenii::to implement */ + +template static FORCEINLINE __vec8_i32 __load(const __vec8_i32 *p) { + __vec8_i32 v; + v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_extloadunpackhi_epi32(v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + return __select(0xFF,v,IZERO); +} + + +template static FORCEINLINE void __store(__vec8_i32 *p, __vec8_i32 v) { + _mm512_mask_extpackstorelo_epi32( p, 0xFF, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +} + +#if 0 +template <> static FORCEINLINE __vec8_i32 __load<64>(const __vec8_i32 *p) { + return _mm512_load_epi32(p); +} +template <> static FORCEINLINE void __store<64>(__vec8_i32 *p, __vec8_i32 v) { + _mm512_store_epi32(p, v); +} +#endif +#endif /* evghenii::int32 */ + +/////////////////////////////////////////////////////////////////////////// +// int64 + +BINARY_OP(__vec8_i64, __add, +) +BINARY_OP(__vec8_i64, __sub, -) +BINARY_OP(__vec8_i64, __mul, *) + +BINARY_OP(__vec8_i64, __or, |) +BINARY_OP(__vec8_i64, __and, &) +BINARY_OP(__vec8_i64, __xor, ^) +BINARY_OP(__vec8_i64, __shl, <<) + +BINARY_OP_CAST(__vec8_i64, uint64_t, __udiv, /) +BINARY_OP_CAST(__vec8_i64, int64_t, __sdiv, /) + 
+BINARY_OP_CAST(__vec8_i64, uint64_t, __urem, %) +BINARY_OP_CAST(__vec8_i64, int64_t, __srem, %) +BINARY_OP_CAST(__vec8_i64, uint64_t, __lshr, >>) +BINARY_OP_CAST(__vec8_i64, int64_t, __ashr, >>) + +SHIFT_UNIFORM(__vec8_i64, uint64_t, __lshr, >>) +SHIFT_UNIFORM(__vec8_i64, int64_t, __ashr, >>) +SHIFT_UNIFORM(__vec8_i64, int64_t, __shl, <<) + +CMP_OP(__vec8_i64, i64, int64_t, __equal, ==) +CMP_OP(__vec8_i64, i64, int64_t, __not_equal, !=) +CMP_OP(__vec8_i64, i64, uint64_t, __unsigned_less_equal, <=) +CMP_OP(__vec8_i64, i64, int64_t, __signed_less_equal, <=) +CMP_OP(__vec8_i64, i64, uint64_t, __unsigned_greater_equal, >=) +CMP_OP(__vec8_i64, i64, int64_t, __signed_greater_equal, >=) +CMP_OP(__vec8_i64, i64, uint64_t, __unsigned_less_than, <) +CMP_OP(__vec8_i64, i64, int64_t, __signed_less_than, <) +CMP_OP(__vec8_i64, i64, uint64_t, __unsigned_greater_than, >) +CMP_OP(__vec8_i64, i64, int64_t, __signed_greater_than, >) + +SELECT(__vec8_i64) +INSERT_EXTRACT(__vec8_i64, int64_t) +SMEAR(__vec8_i64, i64, int64_t) +SETZERO(__vec8_i64, i64) +UNDEF(__vec8_i64, i64) +BROADCAST(__vec8_i64, i64, int64_t) +ROTATE(__vec8_i64, i64, int64_t) +SHUFFLES(__vec8_i64, i64, int64_t) +LOAD_STORE(__vec8_i64, int64_t) + + +#if 0 /* evghenii::float */ +/////////////////////////////////////////////////////////////////////////// +// float + +BINARY_OP(__vec8_f, __add, +) +BINARY_OP(__vec8_f, __sub, -) +BINARY_OP(__vec8_f, __mul, *) +BINARY_OP(__vec8_f, __div, /) + +CMP_OP(__vec8_f, float, float, __equal, ==) +CMP_OP(__vec8_f, float, float, __not_equal, !=) +CMP_OP(__vec8_f, float, float, __less_than, <) +CMP_OP(__vec8_f, float, float, __less_equal, <=) +CMP_OP(__vec8_f, float, float, __greater_than, >) +CMP_OP(__vec8_f, float, float, __greater_equal, >=) + +static FORCEINLINE __vec8_i1 __ordered_float(__vec8_f a, __vec8_f b) { + __vec8_i1 ret; + ret.v = 0; + for (int i = 0; i < 8; ++i) + ret.v |= ((a[i] == a[i]) && (b[i] == b[i])) ? (1 << i) : 0; + return ret; +} + +static FORCEINLINE __vec8_i1 __unordered_float(__vec8_f a, __vec8_f b) { + __vec8_i1 ret; + ret.v = 0; + for (int i = 0; i < 8; ++i) + ret.v |= ((a[i] != a[i]) || (b[i] != b[i])) ? 
(1 << i) : 0; + return ret; +} + +#if 0 + case Instruction::FRem: intrinsic = "__frem"; break; +#endif + +SELECT(__vec8_f) +INSERT_EXTRACT(__vec8_f, float) +SMEAR(__vec8_f, float, float) +SETZERO(__vec8_f, float) +UNDEF(__vec8_f, float) +BROADCAST(__vec8_f, float, float) +ROTATE(__vec8_f, float, float) +SHUFFLES(__vec8_f, float, float) +LOAD_STORE(__vec8_f, float) +#else /* evghenii::float */ + +/////////////////////////////////////////////////////////////////////////// +// float +/////////////////////////////////////////////////////////////////////////// + +#define FZERO _mm512_setzero_ps() +static FORCEINLINE __vec8_f __add(__vec8_f a, __vec8_f b) { + return _mm512_mask_add_ps(FZERO, 0xFF, a, b); +} + +static FORCEINLINE __vec8_f __sub(__vec8_f a, __vec8_f b) { + return _mm512_mask_sub_ps(FZERO, 0xFF, a, b); +} + +static FORCEINLINE __vec8_f __mul(__vec8_f a, __vec8_f b) { + return _mm512_mask_mul_ps(FZERO, 0xFF, a, b); +} + +static FORCEINLINE __vec8_f __div(__vec8_f a, __vec8_f b) { + return _mm512_mask_div_ps(FZERO, 0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __equal_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmpeq_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __equal_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmpeq_ps_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmpneq_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmpneq_ps_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __less_than_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmplt_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __less_than_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmplt_ps_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __less_equal_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmple_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __less_equal_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmple_ps_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __greater_than_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmp_ps_mask(0xFF, a, b,_CMP_GT_OS); +} + +static FORCEINLINE __vec8_i1 __greater_than_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmp_ps_mask(m,a, b,_CMP_GT_OS); +} + +static FORCEINLINE __vec8_i1 __greater_equal_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmp_ps_mask(0xFF, a, b,_CMP_GE_OS); +} + +static FORCEINLINE __vec8_i1 __greater_equal_float_and_mask(__vec8_f a, __vec8_f b, + __vec8_i1 m) { + return _mm512_mask_cmp_ps_mask(m,a, b,_CMP_GE_OS); +} + +static FORCEINLINE __vec8_i1 __ordered_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmpord_ps_mask(0xFF, a, b); +} + +static FORCEINLINE __vec8_i1 __unordered_float(__vec8_f a, __vec8_f b) { + return _mm512_mask_cmpunord_ps_mask(0xFF,a, b); +} + +static FORCEINLINE __vec8_f __select(__vec8_i1 mask, __vec8_f a, __vec8_f b) { + return _mm512_mask_mov_ps(b, mask & 0xFF, a); +} + +static FORCEINLINE __vec8_f __select(bool cond, __vec8_f a, __vec8_f b) { + return cond ? 
a : b; +} + +static FORCEINLINE float __extract_element(__vec8_f v, uint32_t index) { + return v[index]; + // return ((float *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec8_f *v, uint32_t index, float val) { + (*v)[index] = val; +// ((float *)v)[index] = val; +} + +template RetVecType __smear_float(float f); +template <> static FORCEINLINE __vec8_f __smear_float<__vec8_f>(float f) { + return _mm512_set_16to16_ps(0,0,0,0,0,0,0,0, f,f,f,f,f,f,f,f); +} + +template RetVecType __setzero_float(); +template <> static FORCEINLINE __vec8_f __setzero_float<__vec8_f>() { + return _mm512_setzero_ps(); +} + +template RetVecType __undef_float(); +template <> static FORCEINLINE __vec8_f __undef_float<__vec8_f>() { + return __vec8_f(); +} + +static FORCEINLINE __vec8_f __broadcast_float(__vec8_f v, int index) { + float val = __extract_element(v, index & 0x7); + return _mm512_set_16to16_ps(0,0,0,0,0,0,0,0, val,val,val,val,val,val,val,val); +} + +#if 1 +static FORCEINLINE __vec8_f __shuffle_float(__vec8_f v, __vec8_i32 index) { + return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v))); +} +#endif +ROTATE(__vec8_f, float, float) +SHUFFLE2(__vec8_f, float, float) + +#if 0 +LOADS(__vec8_f, float) +#else +template static FORCEINLINE __vec8_f __load(const __vec8_f *p) { + __vec8_f v; + v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + v = _mm512_extloadunpackhi_ps(v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + return __select(0xFF,v,FZERO); +} +#endif + +#if 0 +STORES(__vec8_f, float) +#else +template static FORCEINLINE void __store(__vec8_f *p, __vec8_f v) +{ + _mm512_mask_extpackstorelo_ps( p, 0xFF, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_ps((uint8_t*)p+64, 0xFF, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); +} +#endif + +#endif /* evghenii::float */ + +static FORCEINLINE float __exp_uniform_float(float v) { return expf(v);} +static FORCEINLINE __vec8_f __exp_varying_float(__vec8_f v) { return _mm512_mask_exp_ps(FZERO, 0xFF, v); } + + +static FORCEINLINE float __log_uniform_float(float v) { return logf(v);} +static FORCEINLINE __vec8_f __log_varying_float(__vec8_f v) { return _mm512_mask_log_ps(FZERO, 0xFF, v); } + +static FORCEINLINE float __pow_uniform_float(float a, float b) { return powf(a, b);} +static FORCEINLINE __vec8_f __pow_varying_float(__vec8_f a, __vec8_f b) { return _mm512_mask_pow_ps(FZERO, 0xFF, a,b); } + + +static FORCEINLINE int __intbits(float v) { + union { + float f; + int i; + } u; + u.f = v; + return u.i; +} + +static FORCEINLINE float __floatbits(int v) { + union { + float f; + int i; + } u; + u.i = v; + return u.f; +} + +static FORCEINLINE float __half_to_float_uniform(int16_t h) { + static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift + + int32_t o = ((int32_t)(h & 0x7fff)) << 13; // exponent/mantissa bits + uint32_t exp = shifted_exp & o; // just the exponent + o += (127 - 15) << 23; // exponent adjust + + // handle exponent special cases + if (exp == shifted_exp) // Inf/NaN? + o += (128 - 16) << 23; // extra exp adjust + else if (exp == 0) { // Zero/Denormal? 
+ o += 1 << 23; // extra exp adjust + o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize + } + + o |= ((int32_t)(h & 0x8000)) << 16; // sign bit + return __floatbits(o); +} + + +static FORCEINLINE __vec8_f __half_to_float_varying(__vec8_i16 v) { + __vec8_f ret; + for (int i = 0; i < 8; ++i) + ret[i] = __half_to_float_uniform(v[i]); + return ret; +} + + +static FORCEINLINE int16_t __float_to_half_uniform(float f) { + uint32_t sign_mask = 0x80000000u; + int32_t o; + + int32_t fint = __intbits(f); + int32_t sign = fint & sign_mask; + fint ^= sign; + + int32_t f32infty = 255 << 23; + o = (fint > f32infty) ? 0x7e00 : 0x7c00; + + // (De)normalized number or zero + // update fint unconditionally to save the blending; we don't need it + // anymore for the Inf/NaN case anyway. + const uint32_t round_mask = ~0xfffu; + const int32_t magic = 15 << 23; + const int32_t f16infty = 31 << 23; + + int32_t fint2 = __intbits(__floatbits(fint & round_mask) * __floatbits(magic)) - round_mask; + fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed + + if (fint < f32infty) + o = fint2 >> 13; // Take the bits! + + return (o | (sign >> 16)); +} + + +static FORCEINLINE __vec8_i16 __float_to_half_varying(__vec8_f v) { + __vec8_i16 ret; + for (int i = 0; i < 8; ++i) + ret[i] = __float_to_half_uniform(v[i]); + return ret; +} + + +#if 0 /* evghenii::double */ +/////////////////////////////////////////////////////////////////////////// +// double + +BINARY_OP(__vec8_d, __add, +) +BINARY_OP(__vec8_d, __sub, -) +BINARY_OP(__vec8_d, __mul, *) +BINARY_OP(__vec8_d, __div, /) + +CMP_OP(__vec8_d, double, double, __equal, ==) +CMP_OP(__vec8_d, double, double, __not_equal, !=) +CMP_OP(__vec8_d, double, double, __less_than, <) +CMP_OP(__vec8_d, double, double, __less_equal, <=) +CMP_OP(__vec8_d, double, double, __greater_than, >) +CMP_OP(__vec8_d, double, double, __greater_equal, >=) + +static FORCEINLINE __vec8_i1 __ordered_double(__vec8_d a, __vec8_d b) { + __vec8_i1 ret; + ret.v = 0; + for (int i = 0; i < 8; ++i) + ret.v |= ((a[i] == a[i]) && (b[i] == b[i])) ? (1 << i) : 0; + return ret; +} + +static FORCEINLINE __vec8_i1 __unordered_double(__vec8_d a, __vec8_d b) { + __vec8_i1 ret; + ret.v = 0; + for (int i = 0; i < 8; ++i) + ret.v |= ((a[i] != a[i]) || (b[i] != b[i])) ? 
(1 << i) : 0; + return ret; +} + +#if 0 + case Instruction::FRem: intrinsic = "__frem"; break; +#endif + +SELECT(__vec8_d) +INSERT_EXTRACT(__vec8_d, double) +SMEAR(__vec8_d, double, double) +SETZERO(__vec8_d, double) +UNDEF(__vec8_d, double) +BROADCAST(__vec8_d, double, double) +ROTATE(__vec8_d, double, double) +SHUFFLES(__vec8_d, double, double) +LOAD_STORE(__vec8_d, double) +#else /* evghenii::double */ +/////////////////////////////////////////////////////////////////////////// +// double +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec8_d __add(__vec8_d a, __vec8_d b) { + return _mm512_add_pd(a, b); +} +static FORCEINLINE __vec8_d __sub(__vec8_d a, __vec8_d b) { + return _mm512_sub_pd(a, b); +} +static FORCEINLINE __vec8_d __mul(__vec8_d a, __vec8_d b) { + return _mm512_mul_pd(a, b); +} + +static FORCEINLINE __vec8_d __div(__vec8_d a, __vec8_d b) { + return _mm512_div_pd(a, b); +} + +static FORCEINLINE __vec8_i1 __equal_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpeq_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __equal_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmpeq_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpneq_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __not_equal_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmpneq_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __less_than_double(__vec8_d a, __vec8_d b) { + return _mm512_cmplt_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __less_than_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmplt_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __less_equal_double(__vec8_d a, __vec8_d b) { + return _mm512_cmple_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __less_equal_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmple_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __greater_than_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpnle_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __greater_than_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmpnle_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __greater_equal_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpnlt_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __greater_equal_double_and_mask(__vec8_d a, __vec8_d b, + __vec8_i1 m) { + return _mm512_mask_cmpnlt_pd_mask(m, a, b); +} + +static FORCEINLINE __vec8_i1 __ordered_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpord_pd_mask(a, b); +} + +static FORCEINLINE __vec8_i1 __unordered_double(__vec8_d a, __vec8_d b) { + return _mm512_cmpunord_pd_mask(a, b); +} + +static FORCEINLINE __vec8_d __select(__vec8_i1 mask, __vec8_d a, __vec8_d b) { + return _mm512_mask_mov_pd(b, mask, a); +} + + +static FORCEINLINE __vec8_d __select(bool cond, __vec8_d a, __vec8_d b) { + return cond ? 
a : b; +} + +static FORCEINLINE double __extract_element(__vec8_d v, uint32_t index) { + return ((double *)&v)[index]; +} + +static FORCEINLINE void __insert_element(__vec8_d *v, uint32_t index, double val) { + ((double *)v)[index] = val; +} + +template RetVecType __smear_double(double d); +template <> static FORCEINLINE __vec8_d __smear_double<__vec8_d>(double d) { return _mm512_set1_pd(d); } + +template RetVecType __setzero_double(); +template <> static FORCEINLINE __vec8_d __setzero_double<__vec8_d>() { return _mm512_setzero_pd(); } + +template RetVecType __undef_double(); +template <> static FORCEINLINE __vec8_d __undef_double<__vec8_d>() { return __vec8_d();} + +static FORCEINLINE __vec8_d __broadcast_double(__vec8_d v, int index) { + double val = __extract_element(v, index & 0xf); + return _mm512_set1_pd(val); +} + +ROTATE(__vec8_d, double, double) +SHUFFLES(__vec8_d, double, double) + +template static FORCEINLINE __vec8_d __load(const __vec8_d *p) { + __vec8_d ret; + ret.v = _mm512_extloadunpacklo_pd(ret.v, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v = _mm512_extloadunpackhi_pd(ret.v, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + return ret; +} + +template static FORCEINLINE void __store(__vec8_d *p, __vec8_d v) { + _mm512_extpackstorelo_pd(p, v.v, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, v.v, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); +} + + +#if 0 +template <> static FORCEINLINE __vec8_d __load<64>(const __vec8_d *p) { + return _mm512_load_pd(p); +} +template <> static FORCEINLINE __vec8_d __load<128>(const __vec8_d *p) { + return __load<64>(p); +} +template <> static FORCEINLINE void __store<64>(__vec8_d *p, __vec8_d v) { + _mm512_store_pd(p, v.v); +} +template <> static FORCEINLINE void __store<128>(__vec8_d *p, __vec8_d v) { + __store<64>(p, v); +} +#endif +#endif /* evghenii::double */ + +/////////////////////////////////////////////////////////////////////////// +// casts + + +#define CAST(TO, STO, FROM, SFROM, FUNC) \ +static FORCEINLINE TO FUNC(TO, FROM val) { \ + TO ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = (STO)((SFROM)(val[i])); \ + return ret; \ +} + +// sign extension conversions +CAST(__vec8_i64, int64_t, __vec8_i32, int32_t, __cast_sext) +CAST(__vec8_i64, int64_t, __vec8_i16, int16_t, __cast_sext) +CAST(__vec8_i64, int64_t, __vec8_i8, int8_t, __cast_sext) +CAST(__vec8_i32, int32_t, __vec8_i16, int16_t, __cast_sext) +CAST(__vec8_i32, int32_t, __vec8_i8, int8_t, __cast_sext) +CAST(__vec8_i16, int16_t, __vec8_i8, int8_t, __cast_sext) + +#define CAST_SEXT_I1(TYPE) \ +static FORCEINLINE TYPE __cast_sext(TYPE, __vec8_i1 v) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) { \ + ret[i] = 0; \ + if (v.v & (1 << i)) \ + ret[i] = ~ret[i]; \ + } \ + return ret; \ +} + +CAST_SEXT_I1(__vec8_i8) +CAST_SEXT_I1(__vec8_i16) +#if 0 +CAST_SEXT_I1(__vec8_i32) +#else +static FORCEINLINE __vec8_i32 __cast_sext(const __vec8_i32 &, const __vec8_i1 &val) +{ + __vec8_i32 ret = _mm512_setzero_epi32(); + __vec8_i32 one = _mm512_set1_epi32(-1); + return _mm512_mask_mov_epi32(ret, 0xFF & val, one); +} +#endif +CAST_SEXT_I1(__vec8_i64) + +// zero extension +CAST(__vec8_i64, uint64_t, __vec8_i32, uint32_t, __cast_zext) +CAST(__vec8_i64, uint64_t, __vec8_i16, uint16_t, __cast_zext) +CAST(__vec8_i64, uint64_t, __vec8_i8, uint8_t, __cast_zext) +CAST(__vec8_i32, uint32_t, __vec8_i16, uint16_t, __cast_zext) +CAST(__vec8_i32, uint32_t, __vec8_i8, uint8_t, __cast_zext) +CAST(__vec8_i16, uint16_t, __vec8_i8, uint8_t, __cast_zext) + +#define 
CAST_ZEXT_I1(TYPE) \ +static FORCEINLINE TYPE __cast_zext(TYPE, __vec8_i1 v) { \ + TYPE ret; \ + for (int i = 0; i < 8; ++i) \ + ret[i] = (v.v & (1 << i)) ? 1 : 0; \ + return ret; \ +} + +CAST_ZEXT_I1(__vec8_i8) +CAST_ZEXT_I1(__vec8_i16) +#if 0 +CAST_ZEXT_I1(__vec8_i32) +#else +static FORCEINLINE __vec8_i32 __cast_zext(const __vec8_i32 &, const __vec8_i1 &val) +{ + __vec8_i32 ret = _mm512_setzero_epi32(); + __vec8_i32 one = _mm512_set1_epi32(1); + return _mm512_mask_mov_epi32(ret, 0xFF & val, one); +} +#endif +CAST_ZEXT_I1(__vec8_i64) + +// truncations +CAST(__vec8_i32, int32_t, __vec8_i64, int64_t, __cast_trunc) +CAST(__vec8_i16, int16_t, __vec8_i64, int64_t, __cast_trunc) +CAST(__vec8_i8, int8_t, __vec8_i64, int64_t, __cast_trunc) +CAST(__vec8_i16, int16_t, __vec8_i32, int32_t, __cast_trunc) +CAST(__vec8_i8, int8_t, __vec8_i32, int32_t, __cast_trunc) +CAST(__vec8_i8, int8_t, __vec8_i16, int16_t, __cast_trunc) + +// signed int to float/double +#if 0 +CAST(__vec8_f, float, __vec8_i8, int8_t, __cast_sitofp) +CAST(__vec8_f, float, __vec8_i16, int16_t, __cast_sitofp) +CAST(__vec8_f, float, __vec8_i32, int32_t, __cast_sitofp) +#else +static FORCEINLINE __vec8_f __cast_sitofp(__vec8_f, __vec8_i8 val) {return _mm512_mask_extload_ps(FZERO, 0xFF, &val, _MM_UPCONV_PS_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec8_f __cast_sitofp(__vec8_f, __vec8_i16 val) {return _mm512_mask_extload_ps(FZERO, 0xFF, &val, _MM_UPCONV_PS_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec8_f __cast_sitofp(__vec8_f, __vec8_i32 val) {return _mm512_mask_cvtfxpnt_round_adjustepi32_ps(FZERO, 0xFF, val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} +#endif +CAST(__vec8_f, float, __vec8_i64, int64_t, __cast_sitofp) +#if 0 +CAST(__vec8_d, double, __vec8_i8, int8_t, __cast_sitofp) +CAST(__vec8_d, double, __vec8_i16, int16_t, __cast_sitofp) +CAST(__vec8_d, double, __vec8_i32, int32_t, __cast_sitofp) +#else +static FORCEINLINE __vec8_d __cast_sitofp(__vec8_d, __vec8_i8 val) { + __vec8_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + return _mm512_cvtepi32lo_pd(vi); +} + +static FORCEINLINE __vec8_d __cast_sitofp(__vec8_d, __vec8_i16 val) { + __vec8_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + return _mm512_cvtepi32lo_pd(vi); +} + +static FORCEINLINE __vec8_d __cast_sitofp(__vec8_d, __vec8_i32 val) { + __vec8_d ret; + return _mm512_cvtepi32lo_pd(val); +} +#endif +CAST(__vec8_d, double, __vec8_i64, int64_t, __cast_sitofp) + +// unsigned int to float/double +#if 0 +CAST(__vec8_f, float, __vec8_i8, uint8_t, __cast_uitofp) +CAST(__vec8_f, float, __vec8_i16, uint16_t, __cast_uitofp) +CAST(__vec8_f, float, __vec8_i32, uint32_t, __cast_uitofp) +#else +static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i8 val) {return _mm512_mask_extload_ps(FZERO, 0xFF, &val, _MM_UPCONV_PS_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i16 val) {return _mm512_mask_extload_ps(FZERO, 0xFF, &val, _MM_UPCONV_PS_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i32 val) {return _mm512_mask_cvtfxpnt_round_adjustepu32_ps(FZERO, 0xFF, val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} +#endif +CAST(__vec8_f, float, __vec8_i64, uint64_t, __cast_uitofp) +#if 0 +CAST(__vec8_d, double, __vec8_i8, uint8_t, __cast_uitofp) +CAST(__vec8_d, double, __vec8_i16, uint16_t, __cast_uitofp) +CAST(__vec8_d, double, 
__vec8_i32, uint32_t, __cast_uitofp) +#else +static FORCEINLINE __vec8_d __cast_uitofp(__vec8_d, __vec8_i8 val) { + __vec8_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + return _mm512_cvtepu32lo_pd(vi); +} + +static FORCEINLINE __vec8_d __cast_uitofp(__vec8_d, __vec8_i16 val) { + __vec8_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + return _mm512_cvtepu32lo_pd(vi); +} + +static FORCEINLINE __vec8_d __cast_uitofp(__vec8_d, __vec8_i32 val) { + __vec8_d ret; + return _mm512_cvtepu32lo_pd(val); +} +#endif +CAST(__vec8_d, double, __vec8_i64, uint64_t, __cast_uitofp) + +#if 0 +static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i1 v) { + __vec8_f ret; + for (int i = 0; i < 8; ++i) + ret[i] = (v.v & (1 << i)) ? 1. : 0.; + return ret; +} +#else +static FORCEINLINE __vec8_f __cast_uitofp(__vec8_f, __vec8_i1 v) +{ + const __m512 ret = _mm512_setzero_ps(); + const __m512 one = _mm512_set1_ps(1.0); + return _mm512_mask_mov_ps(ret, v & 0xFF, one); +} +#endif + +// float/double to signed int +CAST(__vec8_i8, int8_t, __vec8_f, float, __cast_fptosi) +CAST(__vec8_i16, int16_t, __vec8_f, float, __cast_fptosi) +#if 0 +CAST(__vec8_i32, int32_t, __vec8_f, float, __cast_fptosi) +#else +static FORCEINLINE __vec8_i32 __cast_fptosi(__vec8_i32, __vec8_f val) { + return _mm512_mask_cvtfxpnt_round_adjustps_epi32(IZERO, 0xFF, val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); +} +#endif +CAST(__vec8_i64, int64_t, __vec8_f, float, __cast_fptosi) +CAST(__vec8_i8, int8_t, __vec8_d, double, __cast_fptosi) +CAST(__vec8_i16, int16_t, __vec8_d, double, __cast_fptosi) +#if 1 +CAST(__vec8_i32, int32_t, __vec8_d, double, __cast_fptosi) +#else +#endif +CAST(__vec8_i64, int64_t, __vec8_d, double, __cast_fptosi) + +// float/double to unsigned int +CAST(__vec8_i8, uint8_t, __vec8_f, float, __cast_fptoui) +CAST(__vec8_i16, uint16_t, __vec8_f, float, __cast_fptoui) +#if 0 +CAST(__vec8_i32, uint32_t, __vec8_f, float, __cast_fptoui) +#else +static FORCEINLINE __vec8_i32 __cast_fptoui(__vec8_i32, __vec8_f val) { + return _mm512_mask_cvtfxpnt_round_adjustps_epu32(IZERO, 0xFF, val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); +} +#endif +CAST(__vec8_i64, uint64_t, __vec8_f, float, __cast_fptoui) +CAST(__vec8_i8, uint8_t, __vec8_d, double, __cast_fptoui) +CAST(__vec8_i16, uint16_t, __vec8_d, double, __cast_fptoui) +#if 1 +CAST(__vec8_i32, uint32_t, __vec8_d, double, __cast_fptoui) +#else +#endif +CAST(__vec8_i64, uint64_t, __vec8_d, double, __cast_fptoui) + +// float/double conversions +#if 0 +CAST(__vec8_f, float, __vec8_d, double, __cast_fptrunc) +CAST(__vec8_d, double, __vec8_f, float, __cast_fpext) +#else +static FORCEINLINE __vec8_f __cast_fptrunc(__vec8_f, __vec8_d val) { + return _mm512_mask_cvtpd_pslo(FZERO, 0xFF, val); +} +static FORCEINLINE __vec8_d __cast_fpext(__vec8_d, __vec8_f val) { + return _mm512_cvtpslo_pd(val); +} +#endif + +typedef union { + int32_t i32; + float f; + int64_t i64; + double d; +} BitcastUnion; + +#define CAST_BITS(TO, TO_ELT, FROM, FROM_ELT) \ +static FORCEINLINE TO __cast_bits(TO, FROM val) { \ + TO r; \ + for (int i = 0; i < 8; ++i) { \ + BitcastUnion u; \ + u.FROM_ELT = val[i]; \ + r[i] = u.TO_ELT; \ + } \ + return r; \ +} + +#if 0 +CAST_BITS(__vec8_f, f, __vec8_i32, i32) +CAST_BITS(__vec8_i32, i32, __vec8_f, f) +#else +static FORCEINLINE __vec8_f __cast_bits(__vec8_f, __vec8_i32 val) { + return _mm512_castsi512_ps(val); +} +static FORCEINLINE __vec8_i32 __cast_bits(__vec8_i32, __vec8_f val) 
{ + return _mm512_castps_si512(val); +} +#endif + +#if 0 +CAST_BITS(__vec8_d, d, __vec8_i64, i64) +CAST_BITS(__vec8_i64, i64, __vec8_d, d) +#else +static FORCEINLINE __vec8_i64 __cast_bits(__vec8_i64, __vec8_d val) { + return *(__vec8_i64*)&val; +} +static FORCEINLINE __vec8_d __cast_bits(__vec8_d, __vec8_i64 val) { + return *(__vec8_d*)&val; +} +#endif + +#define CAST_BITS_SCALAR(TO, FROM) \ +static FORCEINLINE TO __cast_bits(TO, FROM v) { \ + union { \ + TO to; \ + FROM from; \ + } u; \ + u.from = v; \ + return u.to; \ +} + +CAST_BITS_SCALAR(uint32_t, float) +CAST_BITS_SCALAR(int32_t, float) +CAST_BITS_SCALAR(float, uint32_t) +CAST_BITS_SCALAR(float, int32_t) +CAST_BITS_SCALAR(uint64_t, double) +CAST_BITS_SCALAR(int64_t, double) +CAST_BITS_SCALAR(double, uint64_t) +CAST_BITS_SCALAR(double, int64_t) + +/////////////////////////////////////////////////////////////////////////// +// various math functions + +static FORCEINLINE void __fastmath() { +} + +static FORCEINLINE float __round_uniform_float(float v) { + return roundf(v); +} + +static FORCEINLINE float __floor_uniform_float(float v) { + return floorf(v); +} + +static FORCEINLINE float __ceil_uniform_float(float v) { + return ceilf(v); +} + +static FORCEINLINE double __round_uniform_double(double v) { + return round(v); +} + +static FORCEINLINE double __floor_uniform_double(double v) { + return floor(v); +} + +static FORCEINLINE double __ceil_uniform_double(double v) { + return ceil(v); +} + +#if 0 +UNARY_OP(__vec8_f, __round_varying_float, roundf) +UNARY_OP(__vec8_f, __floor_varying_float, floorf) +UNARY_OP(__vec8_f, __ceil_varying_float, ceilf) +#else +static FORCEINLINE __vec8_f __round_varying_float(__vec8_f v) { + return _mm512_mask_round_ps(FZERO, 0xFF, v, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE); +} + +static FORCEINLINE __vec8_f __floor_varying_float(__vec8_f v) { + return _mm512_mask_floor_ps(FZERO, 0xFF, v); +} + +static FORCEINLINE __vec8_f __ceil_varying_float(__vec8_f v) { + return _mm512_mask_ceil_ps(FZERO, 0xFF, v); +} +#endif + +#if 0 +UNARY_OP(__vec8_d, __round_varying_double, round) +UNARY_OP(__vec8_d, __floor_varying_double, floor) +UNARY_OP(__vec8_d, __ceil_varying_double, ceil) +#else +static FORCEINLINE __vec8_d __round_varying_float(__vec8_d v) { + return _mm512_svml_round_pd(v); +} + +static FORCEINLINE __vec8_d __floor_varying_float(__vec8_d v) { + return _mm512_floor_pd(v); +} + +static FORCEINLINE __vec8_d __ceil_varying_float(__vec8_d v) { + return _mm512_ceil_pd(v); +} +#endif + + +// min/max + +static FORCEINLINE float __min_uniform_float(float a, float b) { return (ab) ? a : b; } +static FORCEINLINE double __min_uniform_double(double a, double b) { return (ab) ? a : b; } + +static FORCEINLINE int32_t __min_uniform_int32(int32_t a, int32_t b) { return (ab) ? a : b; } +static FORCEINLINE int32_t __min_uniform_uint32(uint32_t a, uint32_t b) { return (ab) ? a : b; } + +static FORCEINLINE int64_t __min_uniform_int64(int64_t a, int64_t b) { return (ab) ? a : b; } +static FORCEINLINE int64_t __min_uniform_uint64(uint64_t a, uint64_t b) { return (ab) ? 
a : b; } + + +#if 0 +BINARY_OP_FUNC(__vec8_f, __max_varying_float, __max_uniform_float) +BINARY_OP_FUNC(__vec8_f, __min_varying_float, __min_uniform_float) +#else +static FORCEINLINE __vec8_f __max_varying_float (__vec8_f v1, __vec8_f v2) { return _mm512_mask_gmax_ps(FZERO, 0xFF, v1, v2);} +static FORCEINLINE __vec8_f __min_varying_float (__vec8_f v1, __vec8_f v2) { return _mm512_mask_gmin_ps(FZERO, 0xFF, v1, v2);} +#endif + +#if 0 +BINARY_OP_FUNC(__vec8_d, __max_varying_double, __max_uniform_double) +BINARY_OP_FUNC(__vec8_d, __min_varying_double, __min_uniform_double) +#else +static FORCEINLINE __vec8_d __max_varying_double(__vec8_d v1, __vec8_d v2) { return _mm512_gmax_pd(v1,v2); } +static FORCEINLINE __vec8_d __min_varying_double(__vec8_d v1, __vec8_d v2) { return _mm512_gmin_pd(v1,v2); } +#endif + +#if 0 +BINARY_OP_FUNC(__vec8_i32, __max_varying_int32, __max_uniform_int32) +BINARY_OP_FUNC(__vec8_i32, __min_varying_int32, __min_uniform_int32) +BINARY_OP_FUNC(__vec8_i32, __max_varying_uint32, __max_uniform_uint32) +BINARY_OP_FUNC(__vec8_i32, __min_varying_uint32, __min_uniform_uint32) +#else +static FORCEINLINE __vec8_i32 __max_varying_int32 (__vec8_i32 v1, __vec8_i32 v2) { return _mm512_mask_max_epi32(IZERO,0xFF, v1, v2);} +static FORCEINLINE __vec8_i32 __min_varying_int32 (__vec8_i32 v1, __vec8_i32 v2) { return _mm512_mask_min_epi32(IZERO,0xFF, v1, v2);} +static FORCEINLINE __vec8_i32 __max_varying_uint32(__vec8_i32 v1, __vec8_i32 v2) { return _mm512_mask_max_epu32(IZERO,0xFF, v1, v2);} +static FORCEINLINE __vec8_i32 __min_varying_uint32(__vec8_i32 v1, __vec8_i32 v2) { return _mm512_mask_min_epu32(IZERO,0xFF, v1, v2);} +#endif + +BINARY_OP_FUNC(__vec8_i64, __max_varying_int64, __max_uniform_int64) +BINARY_OP_FUNC(__vec8_i64, __min_varying_int64, __min_uniform_int64) +BINARY_OP_FUNC(__vec8_i64, __max_varying_uint64, __max_uniform_uint64) +BINARY_OP_FUNC(__vec8_i64, __min_varying_uint64, __min_uniform_uint64) + +// sqrt/rsqrt/rcp + +static FORCEINLINE float __rsqrt_uniform_float(float v) { + return 1.f / sqrtf(v); +} + +static FORCEINLINE float __rcp_uniform_float(float v) { + return 1.f / v; +} + +static FORCEINLINE float __sqrt_uniform_float(float v) { + return sqrtf(v); +} + +static FORCEINLINE double __sqrt_uniform_double(double v) { + return sqrt(v); +} + +#if 0 +UNARY_OP(__vec8_f, __rcp_varying_float, __rcp_uniform_float) +UNARY_OP(__vec8_f, __rsqrt_varying_float, __rsqrt_uniform_float) +UNARY_OP(__vec8_f, __sqrt_varying_float, __sqrt_uniform_float) +#else +static FORCEINLINE __vec8_f __rcp_varying_float(__vec8_f v) { +#ifdef ISPC_FAST_MATH + return _mm512_mask_rcp23_ps(FZERO, 0xFF, v); // Approximation with 23 bits of accuracy. 
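+  // Without ISPC_FAST_MATH, the #else branch below uses _mm512_mask_recip_ps()
+  // instead, favoring precision over the faster 23-bit approximation above.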
+#else + return _mm512_mask_recip_ps(FZERO, 0xFF, v); +#endif +} + +static FORCEINLINE __vec8_f __rsqrt_varying_float(__vec8_f v) { +#ifdef ISPC_FAST_MATH + return _mm512_mask_rsqrt23_ps(FZERO,0xFF,v); // Approximation with 0.775ULP accuracy +#else + return _mm512_mask_invsqrt_ps(FZERO,0xFF,v); +#endif +} +static FORCEINLINE __vec8_f __sqrt_varying_float (__vec8_f v) { return _mm512_mask_sqrt_ps(FZERO,0xFF,v);} +#endif + +#if 0 +UNARY_OP(__vec8_d, __sqrt_varying_double, __sqrt_uniform_double) +#else +static FORCEINLINE __vec8_d __sqrt_varying_double(__vec8_d v) { return _mm512_sqrt_pd(v); } +#endif + +/////////////////////////////////////////////////////////////////////////// +// svml +/////////////////////////////////////////////////////////////////////////// + +static FORCEINLINE __vec8_f __svml_logf(__vec8_f v) { return _mm512_mask_log_ps(FZERO,0xFF,v); } +static FORCEINLINE __vec8_f __svml_expf(__vec8_f v) { return _mm512_mask_exp_ps(FZERO,0xFF,v); } +static FORCEINLINE __vec8_f __svml_cosf(__vec8_f v) { return _mm512_mask_cos_ps(FZERO,0xFF,v); } +static FORCEINLINE __vec8_f __svml_powf(__vec8_f a, __vec8_f b) { return _mm512_mask_pow_ps(FZERO,0xFF,a,b); } + +static FORCEINLINE __vec8_d __svml_logd(__vec8_d v) { return _mm512_log_pd(v); } +static FORCEINLINE __vec8_d __svml_expd(__vec8_d v) { return _mm512_exp_pd(v); } +static FORCEINLINE __vec8_d __svml_cosd(__vec8_d v) { return _mm512_cos_pd(v); } +static FORCEINLINE __vec8_d __svml_powd(__vec8_d a, __vec8_d b) { return _mm512_pow_pd(a,b); } + +/////////////////////////////////////////////////////////////////////////// +// bit ops + +static FORCEINLINE int32_t __popcnt_int32(uint32_t v) { + int count = 0; + for (; v != 0; v >>= 1) + count += (v & 1); + return count; +} + +static FORCEINLINE int32_t __popcnt_int64(uint64_t v) { + int count = 0; + for (; v != 0; v >>= 1) + count += (v & 1); + return count; +} + +static FORCEINLINE int32_t __count_trailing_zeros_i32(uint32_t v) { + if (v == 0) + return 32; + + int count = 0; + while ((v & 1) == 0) { + ++count; + v >>= 1; + } + return count; +} + +static FORCEINLINE int64_t __count_trailing_zeros_i64(uint64_t v) { + if (v == 0) + return 64; + + int count = 0; + while ((v & 1) == 0) { + ++count; + v >>= 1; + } + return count; +} + +static FORCEINLINE int32_t __count_leading_zeros_i32(uint32_t v) { + if (v == 0) + return 32; + + int count = 0; + while ((v & (1<<31)) == 0) { + ++count; + v <<= 1; + } + return count; +} + +static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { + if (v == 0) + return 64; + + int count = 0; + while ((v & (1ull<<63)) == 0) { + ++count; + v <<= 1; + } + return count; +} + +/////////////////////////////////////////////////////////////////////////// +// reductions + +#if 0 +REDUCE_ADD(float, __vec8_f, __reduce_add_float) +REDUCE_MINMAX(float, __vec8_f, __reduce_min_float, <) +REDUCE_MINMAX(float, __vec8_f, __reduce_max_float, >) +#else +static FORCEINLINE float __reduce_add_float(__vec8_f v) { return _mm512_mask_reduce_add_ps(0xFF,v); } +static FORCEINLINE float __reduce_min_float(__vec8_f v) { return _mm512_mask_reduce_min_ps(0xFF,v); } +static FORCEINLINE float __reduce_max_float(__vec8_f v) { return _mm512_mask_reduce_max_ps(0xFF,v); } +#endif + +#if 0 +REDUCE_ADD(double, __vec8_d, __reduce_add_double) +REDUCE_MINMAX(double, __vec8_d, __reduce_min_double, <) +REDUCE_MINMAX(double, __vec8_d, __reduce_max_double, >) +#else +static FORCEINLINE float __reduce_add_double(__vec8_d v) { return _mm512_reduce_add_pd(v); } +static FORCEINLINE float 
__reduce_min_double(__vec8_d v) { return _mm512_reduce_min_pd(v); } +static FORCEINLINE float __reduce_max_double(__vec8_d v) { return _mm512_reduce_max_pd(v); } +#endif + + + +#if 0 +REDUCE_ADD (int64_t, __vec8_i32, __reduce_add_int32) +REDUCE_MINMAX(int32_t, __vec8_i32, __reduce_min_int32, <) +REDUCE_MINMAX(int32_t, __vec8_i32, __reduce_max_int32, >) +REDUCE_MINMAX(uint32_t, __vec8_i32, __reduce_min_uint32, <) +REDUCE_MINMAX(uint32_t, __vec8_i32, __reduce_max_uint32, >) +#else +static FORCEINLINE int64_t __reduce_add_int32 (__vec8_i32 v) { return _mm512_mask_reduce_add_epi32(0xFF, v);} +static FORCEINLINE int32_t __reduce_min_int32 (__vec8_i32 v) { return _mm512_mask_reduce_min_epi32(0xFF, v);} +static FORCEINLINE int32_t __reduce_max_int32 (__vec8_i32 v) { return _mm512_mask_reduce_max_epi32(0xFF, v);} +static FORCEINLINE uint32_t __reduce_min_uint32 (__vec8_i32 v) { return _mm512_mask_reduce_min_epu32(0xFF, v);} +static FORCEINLINE uint32_t __reduce_max_uint32 (__vec8_i32 v) { return _mm512_mask_reduce_max_epu32(0xFF, v);} +#endif + +REDUCE_ADD ( int16_t, __vec8_i8, __reduce_add_int8) +REDUCE_ADD ( int32_t, __vec8_i16, __reduce_add_int16) +REDUCE_ADD ( int64_t, __vec8_i64, __reduce_add_int64) +REDUCE_MINMAX( int64_t, __vec8_i64, __reduce_min_int64, <) +REDUCE_MINMAX( int64_t, __vec8_i64, __reduce_max_int64, >) +REDUCE_MINMAX(uint64_t, __vec8_i64, __reduce_min_uint64, <) +REDUCE_MINMAX(uint64_t, __vec8_i64, __reduce_max_uint64, >) + +/////////////////////////////////////////////////////////////////////////// +// masked load/store + +static FORCEINLINE __vec8_i8 __masked_load_i8(void *p, + __vec8_i1 mask) { + __vec8_i8 ret; + int8_t *ptr = (int8_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +static FORCEINLINE __vec8_i16 __masked_load_i16(void *p, + __vec8_i1 mask) { + __vec8_i16 ret; + int16_t *ptr = (int16_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +#if 0 +static FORCEINLINE __vec8_i32 __masked_load_i32(void *p, + __vec8_i1 mask) { + __vec8_i32 ret; + int32_t *ptr = (int32_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} +#else +static FORCEINLINE __vec8_i32 __masked_load_i32(void *p, __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return _mm512_mask_load_epi32(__vec8_i32(), mask, p); +#else + __vec8_i32 tmp; + tmp = _mm512_mask_extloadunpacklo_epi32(tmp, 0xFF, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp = _mm512_mask_extloadunpackhi_epi32(tmp, 0xFF, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __vec8_i32 ret; + return _mm512_mask_mov_epi32(ret, 0xFF & mask, tmp); +#endif +} +#endif + +#if 0 +static FORCEINLINE __vec8_f __masked_load_float(void *p, + __vec8_i1 mask) { + __vec8_f ret; + float *ptr = (float *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} +#else +static FORCEINLINE __vec8_f __masked_load_float(void *p, __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return _mm512_mask_load_ps(_mm512_undefined_ps(), mask,p); +#else + __vec8_f tmp; + tmp = _mm512_mask_extloadunpacklo_ps(tmp, 0xFF, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp = _mm512_mask_extloadunpackhi_ps(tmp, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + __vec8_f ret; + return _mm512_mask_mov_ps(ret, 0xFF & mask, tmp); +#endif +} +#endif + +static FORCEINLINE __vec8_i64 __masked_load_i64(void *p, + __vec8_i1 mask) { + __vec8_i64 ret; + int64_t *ptr 
= (int64_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} + +#if 0 +static FORCEINLINE __vec8_d __masked_load_double(void *p, + __vec8_i1 mask) { + __vec8_d ret; + double *ptr = (double *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ret[i] = ptr[i]; + return ret; +} +#else +static FORCEINLINE __vec8_d __masked_load_double(void *p, __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + __vec8_d ret = FZERO; + ret = _mm512_mask_load_pd(ret, 0xFF & mask, p); + return ret; +#else + __vec8_d tmp = FZERO; + tmp.v = _mm512_mask_extloadunpacklo_pd(tmp.v, 0xFF, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_extloadunpackhi_pd(tmp.v, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + __vec8_d ret = FZERO; + ret.v = _mm512_mask_mov_pd(ret.v, mask, tmp.v); + return ret; +#endif +} +#endif + + +static FORCEINLINE void __masked_store_i8(void *p, __vec8_i8 val, + __vec8_i1 mask) { + int8_t *ptr = (int8_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +static FORCEINLINE void __masked_store_i16(void *p, __vec8_i16 val, + __vec8_i1 mask) { + int16_t *ptr = (int16_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +#if 0 +static FORCEINLINE void __masked_store_i32(void *p, __vec8_i32 val, + __vec8_i1 mask) { + int32_t *ptr = (int32_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} +#else +static FORCEINLINE void __masked_store_i32(void *p, __vec8_i32 val, __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_mask_store_epi32(p, mask, val.v); +#else + __vec8_i32 tmp; + tmp = _mm512_extloadunpacklo_epi32(tmp, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp = _mm512_extloadunpackhi_epi32(tmp, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp = _mm512_mask_mov_epi32(tmp, 0xFF & mask, val); + _mm512_mask_extpackstorelo_epi32( p, 0xFF, tmp, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF, tmp, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +#endif +} +#endif + +#if 0 +static FORCEINLINE void __masked_store_float(void *p, __vec8_f val, + __vec8_i1 mask) { + float *ptr = (float *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} +#else +static FORCEINLINE void __masked_store_float(void *p, __vec8_f val, + __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + _mm512_mask_store_ps(p, 0xFF & mask, val.v); +#else + __vec8_f tmp = FZERO; + tmp = _mm512_extloadunpacklo_ps(tmp, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp = _mm512_extloadunpackhi_ps(tmp, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp = _mm512_mask_mov_ps(tmp, 0xFF & mask, val); + _mm512_mask_extpackstorelo_ps( p, 0xFF, tmp, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_ps((uint8_t*)p+64, 0xFF, tmp, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); +#endif +} +#endif + +static FORCEINLINE void __masked_store_i64(void *p, __vec8_i64 val, + __vec8_i1 mask) { + int64_t *ptr = (int64_t *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} + +#if 0 +static FORCEINLINE void __masked_store_double(void *p, __vec8_d val, + __vec8_i1 mask) { + double *ptr = (double *)p; + for (int i = 0; i < 8; ++i) + if ((mask.v & (1 << i)) != 0) + ptr[i] = val[i]; +} +#else +static FORCEINLINE void __masked_store_double(void *p, __vec8_d val, + __vec8_i1 mask) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + 
_mm512_mask_store_pd(p, mask, val.v); +#else + __vec8_d tmp; + tmp.v = _mm512_extloadunpacklo_pd(tmp.v, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v = _mm512_extloadunpackhi_pd(tmp.v, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_mov_pd(tmp.v, mask, val.v); + _mm512_extpackstorelo_pd(p, tmp.v, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); +#endif +} +#endif + +static FORCEINLINE void __masked_store_blend_i8(void *p, __vec8_i8 val, + __vec8_i1 mask) { + __masked_store_i8(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i16(void *p, __vec8_i16 val, + __vec8_i1 mask) { + __masked_store_i16(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i32(void *p, __vec8_i32 val, + __vec8_i1 mask) { + __masked_store_i32(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_float(void *p, __vec8_f val, + __vec8_i1 mask) { + __masked_store_float(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_i64(void *p, __vec8_i64 val, + __vec8_i1 mask) { + __masked_store_i64(p, val, mask); +} + +static FORCEINLINE void __masked_store_blend_double(void *p, __vec8_d val, + __vec8_i1 mask) { + __masked_store_double(p, val, mask); +} + +/////////////////////////////////////////////////////////////////////////// +// gather/scatter + +// offsets * offsetScale is in bytes (for all of these) + +#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, __vec8_i1 mask) { \ + VTYPE ret; \ + int8_t *base = (int8_t *)b; \ + for (int i = 0; i < 8; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset[i]); \ + ret[i] = *ptr; \ + } \ + return ret; \ +} + + +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec8_i8, int8_t, __vec8_i32, __gather_base_offsets32_i8) +#else +static FORCEINLINE __vec8_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t scale, __vec8_i32 offsets, __vec8_i1 mask) +{ + // (iw): need to temporarily store as int because gathers can only return ints. 
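+  // The gather up-converts each byte to a sign-extended 32-bit lane
+  // (_MM_UPCONV_EPI32_SINT8); lanes that are off in the mask keep the
+  // undefined pass-through contents.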
+ __vec8_i32 tmp = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), 0xFF & mask, offsets, base, + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + // now, downconverting to chars into temporary char vector + __vec8_i8 ret; + _mm512_mask_extstore_epi32(ret.data,0xFF,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; +} +#endif +GATHER_BASE_OFFSETS(__vec8_i8, int8_t, __vec8_i64, __gather_base_offsets64_i8) +/****************/ +GATHER_BASE_OFFSETS(__vec8_i16, int16_t, __vec8_i32, __gather_base_offsets32_i16) +GATHER_BASE_OFFSETS(__vec8_i16, int16_t, __vec8_i64, __gather_base_offsets64_i16) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec8_i32, int32_t, __vec8_i32, __gather_base_offsets32_i32) +#else +static FORCEINLINE __vec8_i32 __gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec8_i32 offsets, __vec8_i1 mask) +{ + return _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), 0xFF & mask, offsets, + base, _MM_UPCONV_EPI32_NONE, scale, + _MM_HINT_NONE); +} +#endif +GATHER_BASE_OFFSETS(__vec8_i32, int32_t, __vec8_i64, __gather_base_offsets64_i32) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec8_f, float, __vec8_i32, __gather_base_offsets32_float) +#else +static FORCEINLINE __vec8_f __gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec8_i32 offsets, __vec8_i1 mask) +{ + return _mm512_mask_i32extgather_ps(_mm512_undefined_ps(), 0xFF & mask, offsets, + base, _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); +} +#endif +GATHER_BASE_OFFSETS(__vec8_f, float, __vec8_i64, __gather_base_offsets64_float) +/****************/ +GATHER_BASE_OFFSETS(__vec8_i64, int64_t, __vec8_i32, __gather_base_offsets32_i64) +GATHER_BASE_OFFSETS(__vec8_i64, int64_t, __vec8_i64, __gather_base_offsets64_i64) +/****************/ +#if 0 +GATHER_BASE_OFFSETS(__vec8_d, double, __vec8_i32, __gather_base_offsets32_double) +#else +static FORCEINLINE __vec8_d __gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec8_i32 offsets, __vec8_i1 mask) +{ + __vec8_d ret; + ret.v = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); +#if 0 + __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); + const __mmask8 mask8 = 0x00FF & (mask >> 8); /* evghenii::testme */ + ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask8, shuffled_offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); +#endif + return ret; +} +#endif +GATHER_BASE_OFFSETS(__vec8_d, double, __vec8_i64, __gather_base_offsets64_double) + +#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec8_i1 mask) { \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)ptrs[i]; \ + ret[i] = *ptr; \ + } \ + return ret; \ +} +#define GATHER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ +static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec8_i1 mask) { \ + return FUNC1(0, 1, ptrs, mask); \ +} + + +#if 1 +/***********/ +GATHER_GENERALF(__vec8_i8, int8_t, __vec8_i32, __gather32_i8, __gather_base_offsets32_i8) +GATHER_GENERALF(__vec8_i16, int16_t, __vec8_i32, __gather32_i16, __gather_base_offsets32_i16) +GATHER_GENERALF(__vec8_i32, int32_t, __vec8_i32, __gather32_i32, __gather_base_offsets32_i32) +GATHER_GENERALF(__vec8_i64, int64_t, __vec8_i32, __gather32_i64, __gather_base_offsets32_i64) +GATHER_GENERALF(__vec8_f, float, __vec8_i32, __gather32_float, __gather_base_offsets32_float) +GATHER_GENERALF(__vec8_d, double, 
__vec8_i32, __gather32_double, __gather_base_offsets32_double) +/***********/ +GATHER_GENERAL(__vec8_i8, int8_t, __vec8_i64, __gather64_i8); +GATHER_GENERAL(__vec8_i16, int16_t, __vec8_i64, __gather64_i16); +GATHER_GENERAL(__vec8_i32, int32_t, __vec8_i64, __gather64_i32); +GATHER_GENERAL(__vec8_i64, int64_t, __vec8_i64, __gather64_i64); +GATHER_GENERAL(__vec8_f, float, __vec8_i64, __gather64_float); +GATHER_GENERAL(__vec8_d, double, __vec8_i64, __gather64_double); +/***********/ +#endif + +// scatter + +#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, VTYPE val, \ + __vec8_i1 mask) { \ + int8_t *base = (int8_t *)b; \ + for (int i = 0; i < 8; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset[i]); \ + *ptr = val[i]; \ + } \ +} + + +/*****************/ +SCATTER_BASE_OFFSETS(__vec8_i8, int8_t, __vec8_i32, __scatter_base_offsets32_i8) +SCATTER_BASE_OFFSETS(__vec8_i8, int8_t, __vec8_i64, __scatter_base_offsets64_i8) +/*****************/ +SCATTER_BASE_OFFSETS(__vec8_i16, int16_t, __vec8_i32, __scatter_base_offsets32_i16) +SCATTER_BASE_OFFSETS(__vec8_i16, int16_t, __vec8_i64, __scatter_base_offsets64_i16) +/*****************/ +#if 0 +SCATTER_BASE_OFFSETS(__vec8_i32, int32_t, __vec8_i32, __scatter_base_offsets32_i32) +#else +static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec8_i32 offsets, __vec8_i32 val, __vec8_i1 mask) +{ + _mm512_mask_i32extscatter_epi32(b, 0xFF & mask, offsets, val, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); +} +#endif +SCATTER_BASE_OFFSETS(__vec8_i32, int32_t, __vec8_i64, __scatter_base_offsets64_i32) +/*****************/ +#if 0 +SCATTER_BASE_OFFSETS(__vec8_f, float, __vec8_i32, __scatter_base_offsets32_float) +#else +static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scale, __vec8_i32 offsets, + __vec8_f val, __vec8_i1 mask) +{ + _mm512_mask_i32extscatter_ps(base, 0xFF & mask, offsets, val, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); +} +#endif +SCATTER_BASE_OFFSETS(__vec8_f, float, __vec8_i64, __scatter_base_offsets64_float) +/*****************/ +SCATTER_BASE_OFFSETS(__vec8_i64, int64_t, __vec8_i32, __scatter_base_offsets32_i64) +SCATTER_BASE_OFFSETS(__vec8_i64, int64_t, __vec8_i64, __scatter_base_offsets64_i64) +/*****************/ +#if 0 /* evghenii::to implement */ +SCATTER_BASE_OFFSETS(__vec8_d, double, __vec8_i32, __scatter_base_offsets32_double) +#else /* evghenii:testme */ +static FORCEINLINE void __scatter_base_offsets32_double(void *base, uint32_t scale, __vec8_i32 offsets, + __vec8_d val, __vec8_i1 mask) +{ + _mm512_mask_i32loextscatter_pd(base, mask, offsets, val.v, + _MM_DOWNCONV_PD_NONE, scale, + _MM_HINT_NONE); +} +#endif +SCATTER_BASE_OFFSETS(__vec8_d, double, __vec8_i64, __scatter_base_offsets64_double) + +#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ +static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec8_i1 mask) { \ + VTYPE ret; \ + for (int i = 0; i < 8; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)ptrs[i]; \ + *ptr = val[i]; \ + } \ +} +#define SCATTER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ +static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec8_i1 mask) { \ + return FUNC1(0, 1, ptrs, val, mask); \ +} + +#if 1 +/***********/ +SCATTER_GENERALF(__vec8_i8, int8_t, __vec8_i32, __scatter32_i8, __scatter_base_offsets32_i8) +SCATTER_GENERALF(__vec8_i16, int16_t, __vec8_i32, __scatter32_i16, 
__scatter_base_offsets32_i16) +SCATTER_GENERALF(__vec8_i32, int32_t, __vec8_i32, __scatter32_i32, __scatter_base_offsets32_i32) +SCATTER_GENERALF(__vec8_i64, int64_t, __vec8_i32, __scatter32_i64, __scatter_base_offsets32_i64) +SCATTER_GENERALF(__vec8_f, float, __vec8_i32, __scatter32_float, __scatter_base_offsets32_float) +SCATTER_GENERALF(__vec8_d, double, __vec8_i32, __scatter32_double, __scatter_base_offsets32_double) +/***********/ +SCATTER_GENERAL(__vec8_i8, int8_t, __vec8_i64, __scatter64_i8) +SCATTER_GENERAL(__vec8_i16, int16_t, __vec8_i64, __scatter64_i16) +SCATTER_GENERAL(__vec8_i32, int32_t, __vec8_i64, __scatter64_i32) +SCATTER_GENERAL(__vec8_f, float, __vec8_i64, __scatter64_float) +SCATTER_GENERAL(__vec8_i64, int64_t, __vec8_i64, __scatter64_i64) +SCATTER_GENERAL(__vec8_d, double, __vec8_i64, __scatter64_double) +/***********/ +#endif + +/////////////////////////////////////////////////////////////////////////// +// packed load/store + +#if 0 +static FORCEINLINE int32_t __packed_load_active(int32_t *ptr, __vec8_i32 *val, + __vec8_i1 mask) { + int count = 0; + for (int i = 0; i < 8; ++i) { + if ((mask.v & (1 << i)) != 0) { + val->operator[](i) = *ptr++; + ++count; + } + } + return count; +} +static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, + __vec8_i32 val, + __vec8_i1 mask) { + int count = 0; + for (int i = 0; i < 8; ++i) { + if ((mask.v & (1 << i)) != 0) { + *ptr++ = val[i]; + ++count; + } + } + return count; +} +static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, + __vec8_i32 *val, + __vec8_i1 mask) { + int count = 0; + for (int i = 0; i < 8; ++i) { + if ((mask.v & (1 << i)) != 0) { + val->operator[](i) = *ptr++; + ++count; + } + } + return count; +} +static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, + __vec8_i32 val, + __vec8_i1 mask) { + int count = 0; + for (int i = 0; i < 8; ++i) { + if ((mask.v & (1 << i)) != 0) { + *ptr++ = val[i]; + ++count; + } + } + return count; +} +#else +static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec8_i32 *val, + __vec8_i1 mask) { + __vec8_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, 0xFF & mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, 0xFF & mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(0xFF & mask)); +} +static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec8_i32 val, + __vec8_i1 mask) { + _mm512_mask_extpackstorelo_epi32(p, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(0xFF & mask)); +} +static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec8_i32 *val, + __vec8_i1 mask) { + __vec8_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, 0xFF & mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, 0xFF & mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(0xFF & mask)); +} +static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec8_i32 val, + __vec8_i1 mask) { + _mm512_mask_extpackstorelo_epi32(p, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(0xFF & mask)); +} +#endif + 
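+// Minimal usage sketch for the packed stores above (illustrative only; it
+// assumes the __extract_element() overload for __vec8_i32 and the .v bitmask
+// member of __vec8_i1 used throughout this header): compact the positive
+// lanes of an i32 vector into consecutive memory.
+#if 0
+static FORCEINLINE int32_t __example_compact_positive(int32_t *dst, __vec8_i32 val) {
+    __vec8_i1 mask;
+    mask.v = 0;
+    // Build an execution mask with a bit set for each positive lane.
+    for (int i = 0; i < 8; ++i)
+        if (__extract_element(val, i) > 0)
+            mask.v |= (1 << i);
+    // Only the active lanes are written, contiguously; the return value is
+    // the number of lanes stored.
+    return __packed_store_active(dst, val, mask);
+}
+#endif
+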
+/////////////////////////////////////////////////////////////////////////// +// aos/soa + +static FORCEINLINE void __soa_to_aos3_float(__vec8_f v0, __vec8_f v1, __vec8_f v2, + float *ptr) { + for (int i = 0; i < 8; ++i) { + *ptr++ = __extract_element(v0, i); + *ptr++ = __extract_element(v1, i); + *ptr++ = __extract_element(v2, i); + } +} + +static FORCEINLINE void __aos_to_soa3_float(float *ptr, __vec8_f *out0, __vec8_f *out1, + __vec8_f *out2) { + for (int i = 0; i < 8; ++i) { + __insert_element(out0, i, *ptr++); + __insert_element(out1, i, *ptr++); + __insert_element(out2, i, *ptr++); + } +} + +static FORCEINLINE void __soa_to_aos4_float(__vec8_f v0, __vec8_f v1, __vec8_f v2, + __vec8_f v3, float *ptr) { + for (int i = 0; i < 8; ++i) { + *ptr++ = __extract_element(v0, i); + *ptr++ = __extract_element(v1, i); + *ptr++ = __extract_element(v2, i); + *ptr++ = __extract_element(v3, i); + } +} + +static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec8_f *out0, __vec8_f *out1, + __vec8_f *out2, __vec8_f *out3) { + for (int i = 0; i < 8; ++i) { + __insert_element(out0, i, *ptr++); + __insert_element(out1, i, *ptr++); + __insert_element(out2, i, *ptr++); + __insert_element(out3, i, *ptr++); + } +} + +/////////////////////////////////////////////////////////////////////////// +// prefetch + +static FORCEINLINE void __prefetch_read_uniform_1(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T0); // prefetch into L1$ +} + +static FORCEINLINE void __prefetch_read_uniform_2(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T1); // prefetch into L2$ +} + +static FORCEINLINE void __prefetch_read_uniform_3(unsigned char *p) { + // There is no L3$ on KNC, don't want to pollute L2$ unecessarily +} + +static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) { + _mm_prefetch((char *)p, _MM_HINT_T2); // prefetch into L2$ with non-temporal hint + // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint +} + +/////////////////////////////////////////////////////////////////////////// +// atomics + +static FORCEINLINE uint32_t __atomic_add(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAdd((LONG volatile *)p, v) - v; +#else + return __sync_fetch_and_add(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_sub(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAdd((LONG volatile *)p, -v) + v; +#else + return __sync_fetch_and_sub(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_and(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedAnd((LONG volatile *)p, v); +#else + return __sync_fetch_and_and(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_or(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedOr((LONG volatile *)p, v); +#else + return __sync_fetch_and_or(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_xor(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedXor((LONG volatile *)p, v); +#else + return __sync_fetch_and_xor(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_min(uint32_t *p, uint32_t v) { + int32_t old, min; + do { + old = *((volatile int32_t *)p); + min = (old < (int32_t)v) ? old : (int32_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_max(uint32_t *p, uint32_t v) { + int32_t old, max; + do { + old = *((volatile int32_t *)p); + max = (old > (int32_t)v) ? 
old : (int32_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_umin(uint32_t *p, uint32_t v) { + uint32_t old, min; + do { + old = *((volatile uint32_t *)p); + min = (old < v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_umax(uint32_t *p, uint32_t v) { + uint32_t old, max; + do { + old = *((volatile uint32_t *)p); + max = (old > v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint32_t __atomic_xchg(uint32_t *p, uint32_t v) { +#ifdef _MSC_VER + return InterlockedExchange((LONG volatile *)p, v); +#else + return __sync_lock_test_and_set(p, v); +#endif +} + +static FORCEINLINE uint32_t __atomic_cmpxchg(uint32_t *p, uint32_t cmpval, + uint32_t newval) { +#ifdef _MSC_VER + return InterlockedCompareExchange((LONG volatile *)p, newval, cmpval); +#else + return __sync_val_compare_and_swap(p, cmpval, newval); +#endif +} + +static FORCEINLINE uint64_t __atomic_add(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAdd64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_add(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_sub(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAdd64((LONGLONG volatile *)p, -v) + v; +#else + return __sync_fetch_and_sub(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_and(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedAnd64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_and(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_or(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedOr64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_or(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_xor(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedXor64((LONGLONG volatile *)p, v) - v; +#else + return __sync_fetch_and_xor(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_min(uint64_t *p, uint64_t v) { + int64_t old, min; + do { + old = *((volatile int64_t *)p); + min = (old < (int64_t)v) ? old : (int64_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_max(uint64_t *p, uint64_t v) { + int64_t old, max; + do { + old = *((volatile int64_t *)p); + max = (old > (int64_t)v) ? old : (int64_t)v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_umin(uint64_t *p, uint64_t v) { + uint64_t old, min; + do { + old = *((volatile uint64_t *)p); + min = (old < v) ? 
old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, min) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_umax(uint64_t *p, uint64_t v) { + uint64_t old, max; + do { + old = *((volatile uint64_t *)p); + max = (old > v) ? old : v; +#ifdef _MSC_VER + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); +#else + } while (__sync_bool_compare_and_swap(p, old, max) == false); +#endif + return old; +} + +static FORCEINLINE uint64_t __atomic_xchg(uint64_t *p, uint64_t v) { +#ifdef _MSC_VER + return InterlockedExchange64((LONGLONG volatile *)p, v); +#else + return __sync_lock_test_and_set(p, v); +#endif +} + +static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, + uint64_t newval) { +#ifdef _MSC_VER + return InterlockedCompareExchange64((LONGLONG volatile *)p, newval, cmpval); +#else + return __sync_val_compare_and_swap(p, cmpval, newval); +#endif +} + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} + +#endif // !WIN32 + +#undef FORCEINLINE +#undef PRE_ALIGN +#undef POST_ALIGN diff --git a/examples/intrinsics/knc-i1x8unsafe_fast.h b/examples/intrinsics/knc-i1x8unsafe_fast.h new file mode 100644 index 00000000..55d97566 --- /dev/null +++ b/examples/intrinsics/knc-i1x8unsafe_fast.h @@ -0,0 +1,2 @@ +#define __ZMM32BIT__ +#include "knc-i1x8.h" diff --git a/run_tests.py b/run_tests.py index 9729930f..2cca983e 100755 --- a/run_tests.py +++ b/run_tests.py @@ -362,10 +362,13 @@ def run_test(testname): gcc_isa="" if options.target == 'generic-4': gcc_isa = '-msse4.2' - if options.target == 'generic-8': + if (options.target == 'generic-8'): + if (options.include_file.find("knc-i1x8.h")!=-1 or options.include_file.find("knc-i1x8unsafe_fast.h")!=-1): + gcc_isa = '-mmic' + else: gcc_isa = '-mavx' if (options.target == 'generic-16' or options.target == 'generic-32' or options.target == 'generic-64') \ - and (options.include_file.find("knc.h")!=-1 or options.include_file.find("knc2x.h")!=-1): + and (options.include_file.find("knc-i1x16.h")!=-1 or options.include_file.find("knc.h")!=-1 or options.include_file.find("knc2x.h")!=-1): gcc_isa = '-mmic' cc_cmd = "%s -O2 -I. %s %s test_static.cpp -DTEST_SIG=%d %s -o %s" % \ From 4b1a0b4bc46f6a4503c1ebec8cbfa7b74ffc78a3 Mon Sep 17 00:00:00 2001 From: evghenii Date: Wed, 18 Sep 2013 18:41:22 +0300 Subject: [PATCH 078/124] added fails --- examples/intrinsics/knc-i1x8unsafe_fast.h | 67 +++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/examples/intrinsics/knc-i1x8unsafe_fast.h b/examples/intrinsics/knc-i1x8unsafe_fast.h index 55d97566..ce66ea11 100644 --- a/examples/intrinsics/knc-i1x8unsafe_fast.h +++ b/examples/intrinsics/knc-i1x8unsafe_fast.h @@ -1,2 +1,69 @@ #define __ZMM32BIT__ #include "knc-i1x8.h" + +/* the following tests fails because vec8_i32 and vec8_float are 512 and not 256 bit in size. + * not sure how it is possible to fix this, any suggestions? 
+33 / 1206 tests FAILED execution: + ./tests/array-gather-simple.ispc + ./tests/array-gather-vary.ispc + ./tests/array-multidim-gather-scatter.ispc + ./tests/array-scatter-vary.ispc + ./tests/atomics-5.ispc + ./tests/atomics-swap.ispc + ./tests/cfor-array-gather-vary.ispc + ./tests/cfor-gs-improve-varying-1.ispc + ./tests/cfor-struct-gather-2.ispc + ./tests/cfor-struct-gather-3.ispc + ./tests/cfor-struct-gather.ispc + ./tests/gather-struct-vector.ispc + ./tests/global-array-4.ispc + ./tests/gs-improve-varying-1.ispc + ./tests/half-1.ispc + ./tests/half-3.ispc + ./tests/half.ispc + ./tests/launch-3.ispc + ./tests/launch-4.ispc + ./tests/masked-scatter-vector.ispc + ./tests/masked-struct-scatter-varying.ispc + ./tests/new-delete-6.ispc + ./tests/ptr-24.ispc + ./tests/ptr-25.ispc + ./tests/short-vec-15.ispc + ./tests/struct-gather-2.ispc + ./tests/struct-gather-3.ispc + ./tests/struct-gather.ispc + ./tests/struct-ref-lvalue.ispc + ./tests/struct-test-118.ispc + ./tests/struct-vary-index-expr.ispc + ./tests/typedef-2.ispc + ./tests/vector-varying-scatter.ispc +*/ + +/* knc-i1x8.h has the following fails: +3 / 1206 tests FAILED execution: + ./tests/half-1.ispc + ./tests/half-3.ispc + ./tests/half.ispc +*/ + +/* knc-i1x16.h has the following fails: +5 / 1206 tests FAILED execution: + ./tests/assert-3.ispc + ./tests/half-1.ispc + ./tests/half-3.ispc + ./tests/half.ispc + ./tests/test-141.ispc +*/ + +/* generics-16, from which these knc-i1x*.h are derived, has the following fails: +6 / 1206 tests FAILED execution: + ./tests/func-overload-max.ispc + ./tests/half-1.ispc + ./tests/half-3.ispc + ./tests/half.ispc + ./tests/test-141.ispc + ./tests/test-143.ispc +*/ + + + From e4b1f585952d4748818d01995f24c04d35c4c0b0 Mon Sep 17 00:00:00 2001 From: evghenii Date: Wed, 18 Sep 2013 19:14:41 +0300 Subject: [PATCH 079/124] performance fix.. 
still some issues left with equal_i1 for __vec8_i1 --- examples/intrinsics/knc-i1x16.h | 52 ++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 8b1a2bb9..ebffa4d6 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -480,46 +480,63 @@ INSERT_EXTRACT(__vec1_d, double) /////////////////////////////////////////////////////////////////////////// // mask ops -static FORCEINLINE uint64_t __movmsk(__vec16_i1 mask) { - return (uint64_t)mask.v; +static FORCEINLINE __vec16_i1 __movmsk(__vec16_i1 mask) { + return _mm512_kmov(mask); } static FORCEINLINE bool __any(__vec16_i1 mask) { - return (mask.v!=0); + return !_mm512_kortestz(mask, mask); } static FORCEINLINE bool __all(__vec16_i1 mask) { - return (mask.v==0xFFFF); + return _mm512_kortestc(mask, mask); } static FORCEINLINE bool __none(__vec16_i1 mask) { - return (mask.v==0); + return _mm512_kortestz(mask, mask); } +#if 0 +static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { +#if 0 + return _mm512_kand(a,b); /* this fails some short circut tests */ +#else + return _mm512_knot( _mm512_kandn(a, b)); /* this fails some asin test */ +#endif +} +#else /* passes all the tests */ static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { __vec16_i1 r; r.v = (a.v & b.v) | (~a.v & ~b.v); return r; } +#endif static FORCEINLINE __vec16_i1 __and(__vec16_i1 a, __vec16_i1 b) { - __vec16_i1 r; - r.v = a.v & b.v; - return r; + return _mm512_kand(a, b); } static FORCEINLINE __vec16_i1 __xor(__vec16_i1 a, __vec16_i1 b) { - __vec16_i1 r; - r.v = a.v ^ b.v; - return r; + return _mm512_kxor(a, b); } static FORCEINLINE __vec16_i1 __or(__vec16_i1 a, __vec16_i1 b) { - __vec16_i1 r; - r.v = a.v | b.v; - return r; + return _mm512_kor(a, b); } +#if 0 +static FORCEINLINE __vec16_i1 __not(__vec16_i1 a) { + return _mm512_knot(a); +} + +static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) { + return _mm512_kandn(a, b); +} + +static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { + return _mm512_kandnr(a, b); +} +#else static FORCEINLINE __vec16_i1 __not(__vec16_i1 v) { __vec16_i1 r; r.v = ~v.v; @@ -537,18 +554,19 @@ static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { r.v = a.v & ~b.v; return r; } +#endif static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a, __vec16_i1 b) { - __vec16_i1 r; - r.v = (a.v & mask.v) | (b.v & ~mask.v); - return r; + return ((a & mask) | (b & ~mask)); + //return __or(__and(a, mask), __andnr(b, mask)); } static FORCEINLINE __vec16_i1 __select(bool cond, __vec16_i1 a, __vec16_i1 b) { return cond ? a : b; } + static FORCEINLINE bool __extract_element(__vec16_i1 vec, int index) { return (vec.v & (1 << index)) ? 
true : false; } From 3cf63362a4885056bf72e6daaad7ffc67d7a93dc Mon Sep 17 00:00:00 2001 From: evghenii Date: Wed, 18 Sep 2013 20:03:08 +0300 Subject: [PATCH 080/124] small tuning --- examples/intrinsics/knc-i1x16.h | 39 +++------------------------------ 1 file changed, 3 insertions(+), 36 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index ebffa4d6..b7d3a7f1 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -496,22 +496,9 @@ static FORCEINLINE bool __none(__vec16_i1 mask) { return _mm512_kortestz(mask, mask); } -#if 0 static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { -#if 0 - return _mm512_kand(a,b); /* this fails some short circut tests */ -#else - return _mm512_knot( _mm512_kandn(a, b)); /* this fails some asin test */ -#endif + return _mm512_kxnor(a,b); } -#else /* passes all the tests */ -static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { - __vec16_i1 r; - r.v = (a.v & b.v) | (~a.v & ~b.v); - return r; -} -#endif - static FORCEINLINE __vec16_i1 __and(__vec16_i1 a, __vec16_i1 b) { return _mm512_kand(a, b); } @@ -524,7 +511,6 @@ static FORCEINLINE __vec16_i1 __or(__vec16_i1 a, __vec16_i1 b) { return _mm512_kor(a, b); } -#if 0 static FORCEINLINE __vec16_i1 __not(__vec16_i1 a) { return _mm512_knot(a); } @@ -536,30 +522,11 @@ static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) { static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { return _mm512_kandnr(a, b); } -#else -static FORCEINLINE __vec16_i1 __not(__vec16_i1 v) { - __vec16_i1 r; - r.v = ~v.v; - return r; -} - -static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) { - __vec16_i1 r; - r.v = ~a.v & b.v; - return r; -} - -static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { - __vec16_i1 r; - r.v = a.v & ~b.v; - return r; -} -#endif static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a, __vec16_i1 b) { - return ((a & mask) | (b & ~mask)); - //return __or(__and(a, mask), __andnr(b, mask)); +// return ((a & mask) | (b & ~mask)); + return __or(__and(a, mask), __and_not2(b, mask)); } static FORCEINLINE __vec16_i1 __select(bool cond, __vec16_i1 a, __vec16_i1 b) { From 406e2eb8d0e9eaac0c1923c8a91837882b8f4610 Mon Sep 17 00:00:00 2001 From: egaburov Date: Thu, 19 Sep 2013 09:16:37 +0200 Subject: [PATCH 081/124] fix double precision input to support .123d321 type of input --- lex.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lex.ll b/lex.ll index f1dcaa6f..3655220f 100644 --- a/lex.ll +++ b/lex.ll @@ -345,7 +345,7 @@ INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]* INT_NUMBER_DOTDOTDOT (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*\.\.\. FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?) HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?) 
-FORTRAN_DOUBLE_NUMBER (([0-9]+\.[0-9]*[dD])|([0-9]+\.[0-9]*[dD][-+]?[0-9]+)|([0-9]+[dD][-+]?[0-9]+)) +FORTRAN_DOUBLE_NUMBER (([0-9]+\.[0-9]*[dD])|([0-9]+\.[0-9]*[dD][-+]?[0-9]+)|([0-9]+[dD][-+]?[0-9]+)|(\.[0-9]*[dD][-+]?[0-9]+)) From 00cd90c6b0c31a0d709c368db8b0dc42501577cc Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Tue, 17 Sep 2013 17:30:34 +0400 Subject: [PATCH 082/124] test system --- alloy.py | 600 ++++++++++++++++++ check_env.py | 102 +++ common.py | 120 ++++ examples/noise/Makefile | 2 +- examples/perf.py | 374 ----------- fail_db.txt | 1 + ...ER.patch => 3_3_r183327-AVX2-GATHER.patch} | 0 ...hift.patch => 3_3_r184575-x86-shift.patch} | 0 examples/perf.ini => perf.ini | 24 +- perf.py | 489 ++++++++++++++ run_tests.py | 570 +++++++++++------ 11 files changed, 1711 insertions(+), 571 deletions(-) create mode 100755 alloy.py create mode 100755 check_env.py create mode 100644 common.py delete mode 100755 examples/perf.py create mode 100644 fail_db.txt rename llvm_patches/{r183327-AVX2-GATHER.patch => 3_3_r183327-AVX2-GATHER.patch} (100%) rename llvm_patches/{r184575-x86-shift.patch => 3_3_r184575-x86-shift.patch} (100%) rename examples/perf.ini => perf.ini (84%) create mode 100755 perf.py diff --git a/alloy.py b/alloy.py new file mode 100755 index 00000000..67f534ca --- /dev/null +++ b/alloy.py @@ -0,0 +1,600 @@ +#!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
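+
+# alloy.py: driver for the ispc test system.  It checks out (or untars) and
+# builds the requested LLVM/clang versions, optionally via a two-stage
+# self-build, builds ispc against them, detects which targets the host CPU
+# (or SDE) can run, and then performs the stability/performance validation
+# runs implemented below.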
+ +# // Author: Filippov Ilia + +def attach_mail_file(msg, filename, name): + if os.path.exists(filename): + fp = open(filename, "rb") + to_attach = MIMEBase("application", "octet-stream") + to_attach.set_payload(fp.read()) + encode_base64(to_attach) + to_attach.add_header("Content-Disposition", "attachment", filename=name) + fp.close() + msg.attach(to_attach) + +def setting_paths(llvm, ispc, sde): + if llvm != "": + os.environ["LLVM_HOME"]=llvm + if ispc != "": + os.environ["ISPC_HOME"]=ispc + if sde != "": + os.environ["SDE_HOME"]=sde + +def check_LLVM(which_LLVM): + answer = [] + if which_LLVM[0] == " ": + return answer + p = os.environ["LLVM_HOME"] + for i in range(0,len(which_LLVM)): + if not os.path.exists(p + os.sep + "bin-" + which_LLVM[i] + os.sep + "bin"): + answer.append(which_LLVM[i]) + return answer + +def try_do_LLVM(text, command, from_validation): + if from_validation == True: + text = text + "\n" + print_debug("Trying to " + text, from_validation, alloy_build) + if os.system(command + " >> " + alloy_build + " 2>> " + alloy_build) != 0: + print_debug("ERROR.\n", from_validation, alloy_build) + error("can't " + text, 1) + print_debug("DONE.\n", from_validation, alloy_build) + +def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_validation, force): + print_debug("Building LLVM. Version: " + version_LLVM + ". ", from_validation, alloy_build) + if revision != "": + print_debug("Revision: " + revision + ".\n", from_validation, alloy_build) + else: + print_debug("\n", from_validation, alloy_build) + # Here we understand what and where do we want to build + current_path = os.getcwd() + llvm_home = os.environ["LLVM_HOME"] + os.chdir(llvm_home) + FOLDER_NAME=version_LLVM + if version_LLVM == "head": + SVN_PATH="trunk" + if version_LLVM == "3.3": + SVN_PATH="tags/RELEASE_33/final" + version_LLVM = "3_3" + if version_LLVM == "3.2": + SVN_PATH="tags/RELEASE_32/final" + version_LLVM = "3_2" + if version_LLVM == "3.1": + SVN_PATH="tags/RELEASE_31/final" + version_LLVM = "3_1" + if revision != "": + FOLDER_NAME = FOLDER_NAME + "_" + revision + revision = "-" + revision + if folder == "": + folder = FOLDER_NAME + LLVM_SRC="llvm-" + folder + LLVM_BUILD="build-" + folder + LLVM_BIN="bin-" + folder + if os.path.exists(LLVM_BIN) and not force: + print_debug("You have folder " + LLVM_BIN + ". 
If you want to rebuild use --force\n", False, "") + exit(0) + LLVM_BUILD_selfbuild = LLVM_BUILD + "_temp" + LLVM_BIN_selfbuild = LLVM_BIN + "_temp" + common.remove_if_exists(LLVM_SRC) + common.remove_if_exists(LLVM_BUILD) + common.remove_if_exists(LLVM_BIN) + if selfbuild: + common.remove_if_exists(LLVM_BUILD_selfbuild) + common.remove_if_exists(LLVM_BIN_selfbuild) + MAKE = "gmake" + print_debug("Using folders: " + LLVM_SRC + " " + LLVM_BUILD + " " + LLVM_BIN + " in " + + llvm_home + "\n", from_validation, alloy_build) + # load llvm + if tarball == "": + try_do_LLVM("load LLVM from http://llvm.org/svn/llvm-project/llvm/" + SVN_PATH + " ", + "svn co " + revision + " http://llvm.org/svn/llvm-project/llvm/" + SVN_PATH + " " + LLVM_SRC, + from_validation) + os.chdir(LLVM_SRC + "/tools") + try_do_LLVM("load clang from http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " ", + "svn co " + revision + " http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " clang", + from_validation) + os.chdir("../") + else: + tar = tarball.split(" ") + os.makedirs(LLVM_SRC) + os.chdir(LLVM_SRC) + try_do_LLVM("untar LLVM from " + tar[0] + " ", + "tar -xvzf " + tar[0] + " --strip-components 1", from_validation) + os.chdir("./tools") + os.makedirs("clang") + os.chdir("./clang") + try_do_LLVM("untar clang from " + tar[1] + " ", + "tar -xvzf " + tar[1] + " --strip-components 1", from_validation) + os.chdir("../../") + # paching llvm + patches = glob.glob(os.environ["ISPC_HOME"] + "/llvm_patches/*.*") + for patch in patches: + if version_LLVM in os.path.basename(patch): + try_do_LLVM("patch LLVM with patch" + patch + " ", "patch -p0 < " + patch, from_validation) + os.chdir("../") + # configuring llvm, build first part of selfbuild + os.makedirs(LLVM_BUILD) + os.makedirs(LLVM_BIN) + selfbuild_compiler = "" + if selfbuild: + print_debug("Making selfbuild and use folders " + LLVM_BUILD_selfbuild + " and " + + LLVM_BIN_selfbuild + "\n", from_validation, alloy_build) + os.makedirs(LLVM_BUILD_selfbuild) + os.makedirs(LLVM_BIN_selfbuild) + os.chdir(LLVM_BUILD_selfbuild) + try_do_LLVM("configure release version for selfbuild ", + "../" + LLVM_SRC + "/configure --prefix=" + llvm_home + "/" + + LLVM_BIN_selfbuild + " --enable-optimized", + from_validation) + try_do_LLVM("build release version for selfbuild ", + MAKE + " -j32", from_validation) + try_do_LLVM("install release version for selfbuild ", + MAKE + " install", + from_validation) + os.chdir("../") + selfbuild_compiler = " CC="+llvm_home+ "/" + LLVM_BIN_selfbuild + "/bin/clang" + print_debug("Now we have compiler for selfbuild: " + selfbuild_compiler + "\n", from_validation, alloy_build) + os.chdir(LLVM_BUILD) + if debug == False: + try_do_LLVM("configure release version ", + "../" + LLVM_SRC + "/configure --prefix=" + llvm_home + "/" + + LLVM_BIN + " --enable-optimized" + selfbuild_compiler, + from_validation) + else: + try_do_LLVM("configure debug version ", + "../" + LLVM_SRC + "/configure --prefix=" + llvm_home + "/" + LLVM_BIN + + " --enable-debug-runtime --enable-debug-symbols --enable-keep-symbols" + selfbuild_compiler, + from_validation) + # building llvm + try_do_LLVM("build LLVM ", MAKE + " -j32", from_validation) + try_do_LLVM("install LLVM ", MAKE + " install", from_validation) + os.chdir(current_path) + +def check_targets(): + answer = [] + answer_sde = [] + SSE2 = False; + SSE4 = False; + AVX = False; + AVX11 = False; + AVX2 = False; + cpu = open("/proc/cpuinfo") + f_lines = cpu.readlines() + cpu.close() + # check what native targets do we have + for i in 
range(0,len(f_lines)): + if SSE2 == False and "sse2" in f_lines[i]: + SSE2 = True; + answer = answer + ["sse2-i32x4", "sse2-i32x8"] + if SSE4 == False and "sse4_1" in f_lines[i]: + SSE4 = True; + answer = answer + ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"] + if AVX == False and "avx" in f_lines[i]: + AVX = True; + answer = answer + ["avx1-i32x8", "avx1-i32x16"] + if AVX11 == False and "rdrand" in f_lines[i]: + AVX11 = True; + answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"] + if AVX2 == False and "avx2" in f_lines[i]: + AVX2 = True; + answer = answer + ["avx2-i32x8", "avx2-i32x16"] + answer = answer + ["generic-4", "generic-16", "generic-8", "generic-1", "generic-32", "generic-64"] + # now check what targets we have with the help of SDE + sde_exists = "" + PATH_dir = string.split(os.getenv("PATH"), os.pathsep) + for counter in PATH_dir: + if os.path.exists(counter + os.sep + "sde") and sde_exists == "": + sde_exists = counter + os.sep + "sde" + if os.environ.get("SDE_HOME") != None: + if os.path.exists(os.environ.get("SDE_HOME") + os.sep + "sde"): + sde_exists = os.environ.get("SDE_HOME") + os.sep + "sde" + if sde_exists == "": + error("you haven't got sde neither in SDE_HOME nor in your PATH.\n" + + "To test all platforms please set SDE_HOME to path containing SDE.\n" + + "Please refer to http://www.intel.com/software/sde for SDE download information.", 2) + return [answer, answer_sde] + # here we have SDE + os.system(sde_exists + " -help > " + temp_alloy_file) + cpu = open(temp_alloy_file) + f_lines = cpu.readlines() + cpu.close() + for i in range(0,len(f_lines)): + if SSE4 == False and "wsm" in f_lines[i]: + answer_sde = answer_sde + [["-wsm", "sse4-i32x4"], ["-wsm", "sse4-i32x8"], ["-wsm", "sse4-i16x8"], ["-wsm", "sse4-i8x16"]] + if AVX == False and "snb" in f_lines[i]: + answer_sde = answer_sde + [["-snb", "avx1-i32x8"], ["-snb", "avx1-i32x16"]] + if AVX11 == False and "ivb" in f_lines[i]: + answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["ivb", "avx1.1-i32x16"]] + if AVX2 == False and "hsw" in f_lines[i]: + answer_sde = answer_sde + [["-hsw", "avx2-i32x8"], ["-hsw", "avx2-i32x16"]] + return [answer, answer_sde] + +def build_ispc(version_LLVM): + current_path = os.getcwd() + os.chdir(os.environ["ISPC_HOME"]) + p_temp = os.getenv("PATH") + os.environ["PATH"] = os.environ["LLVM_HOME"] + "/bin-" + version_LLVM + "/bin:" + os.environ["PATH"] + os.system("make clean >> " + alloy_build) + try_do_LLVM("build ISPC with LLVM version " + version_LLVM + " ", "make -j32", True) + os.environ["PATH"] = p_temp + os.chdir(current_path) + +def execute_stability(stability, R, print_version): + stability1 = copy.deepcopy(stability) + temp = run_tests.run_tests(stability1, [], print_version) + for j in range(0,4): + R[j][0] = R[j][0] + temp[j] + for i in range(0,len(temp[j])): + R[j][1].append(temp[4]) + number_of_fails = temp[5] + number_of_new_fails = len(temp[0]) + len(temp[1]) + if number_of_fails == 0: + str_fails = ". No fails" + else: + str_fails = ". 
Fails: " + str(number_of_fails) + if number_of_new_fails == 0: + str_new_fails = ", No new fails.\n" + else: + str_new_fails = ", New fails: " + str(number_of_new_fails) + ".\n" + print_debug(temp[4][1:-3] + str_fails + str_new_fails, False, stability_log) + +def run_special_tests(): + i = 5 + +def validation_run(only, only_targets, reference_branch, notify, update): + current_path = os.getcwd() + os.chdir(os.environ["ISPC_HOME"]) + os.environ["PATH"] = os.environ["ISPC_HOME"] + ":" + os.environ["PATH"] + if options.notify != "": + if os.environ.get("SMTP_ISPC") == None: + error("you have no SMTP_ISPC in your environment for option notify", 1) + common.remove_if_exists(os.environ["ISPC_HOME"] + os.sep + "all_answer.txt") + smtp_server = os.environ["SMTP_ISPC"] + msg = MIMEMultipart() + msg['Subject'] = 'ISPC test system results' + msg['From'] = 'ISPC_test_system' + msg['To'] = options.notify + print_debug("Command: " + ' '.join(sys.argv) + "\n", False, "") + print_debug("Folder: " + os.environ["ISPC_HOME"] + "\n", False, "") + date = datetime.datetime.now() + print_debug("Date: " + date.strftime('%H:%M %d/%m/%Y') + "\n", False, "") + class options_for_drivers: + pass +# *** *** *** +# Stability validation run +# *** *** *** + if ((("stability" in only) == True) or ("performance" in only) == False): + print_debug("\n\nStability validation run\n\n", False, "") + stability = options_for_drivers() +# stability constant options + stability.random = False + stability.ispc_flags = "" + stability.compiler_exe = None + stability.num_jobs = 1024 + stability.verbose = False + stability.time = False + stability.non_interactive = True + stability.update = update + stability.include_file = None + stability.silent = True + stability.in_file = "." + os.sep + f_date + os.sep + "run_tests_log.log" + stability.verify = False +# stability varying options + stability.target = "" + stability.arch = "" + stability.no_opt = False + stability.wrapexe = "" +# prepare parameters of run + common.check_tools(1) + [targets_t, sde_targets_t] = check_targets() + rebuild = True + opts = [] + archs = [] + LLVM = [] + targets = [] + sde_targets = [] +# parsing option only, update parameters of run + if "-O2" in only: + opts.append(False) + if "-O0" in only: + opts.append(True) + if "x86" in only and not ("x86-64" in only): + archs.append("x86") + if "x86-64" in only: + archs.append("x86-64") + if "native" in only: + sde_targets_t = [] + for i in ["3.1", "3.2", "3.3", "head"]: + if i in only: + LLVM.append(i) + if "current" in only: + LLVM = [" "] + rebuild = False + if only_targets != "": + only_targets_t = only_targets.split(" ") + for i in only_targets_t: + err = True + for j in range(0,len(targets_t)): + if i in targets_t[j]: + targets.append(targets_t[j]) + err = False + for j in range(0,len(sde_targets_t)): + if i in sde_targets_t[j][1]: + sde_targets.append(sde_targets_t[j]) + err = False + if err == True: + error("You haven't sde for target " + i, 1) + else: + targets = targets_t[:-4] + sde_targets = sde_targets_t + if "build" in only: + targets = [] + sde_targets = [] + only = only + " stability " +# finish parameters of run, prepare LLVM + if len(opts) == 0: + opts = [False] + if len(archs) == 0: + archs = ["x86", "x86-64"] + if len(LLVM) == 0: + LLVM = ["3.1", "3.2", "3.3", "head"] + gen_archs = ["x86-64"] + need_LLVM = check_LLVM(LLVM) + for i in range(0,len(need_LLVM)): + build_LLVM(need_LLVM[i], "", "", "", False, False, True, False) +# begin validation run for stabitily + 
common.remove_if_exists(stability.in_file) + R = [[[],[]],[[],[]],[[],[]],[[],[]]] + print_debug("\n_________________________STABILITY REPORT_________________________\n", False, stability_log) + for i in range(0,len(LLVM)): + print_version = 2 + if rebuild: + build_ispc(LLVM[i]) + for j in range(0,len(targets)): + stability.target = targets[j] + stability.wrapexe = "" + if "generic" in targets[j]: + arch = gen_archs + else: + arch = archs + for i1 in range(0,len(arch)): + for i2 in range(0,len(opts)): + stability.arch = arch[i1] + stability.no_opt = opts[i2] + execute_stability(stability, R, print_version) + print_version = 0 + for j in range(0,len(sde_targets)): + stability.target = sde_targets[j][1] + stability.wrapexe = os.environ["SDE_HOME"] + "/sde " + sde_targets[j][0] + " -- " + for i1 in range(0,len(archs)): + for i2 in range(0,len(opts)): + stability.arch = archs[i1] + stability.no_opt = opts[i2] + execute_stability(stability, R, print_version) + print_version = 0 +# run special tests like embree +# + run_special_tests() + ttt = ["NEW RUNFAILS: ", "NEW COMPFAILS: ", "NEW PASSES RUNFAILS: ", "NEW PASSES COMPFAILS: "] + for j in range(0,4): + if len(R[j][0]) == 0: + print_debug("NO " + ttt[j][:-2] + "\n", False, stability_log) + else: + print_debug(ttt[j] + str(len(R[j][0])) + "\n", False, stability_log) + temp5 = [[],[]] + for i in range(0,len(R[j][0])): + er = True + for k in range(0,len(temp5[0])): + if R[j][0][i] == temp5[0][k]: + temp5[1][k].append(R[j][1][i]) + er = False + if er == True: + temp5[0].append(R[j][0][i]) + temp5[1].append([R[j][1][i]]) + for i in range(0,len(temp5[0])): + print_debug("\t" + temp5[0][i] + "\n", True, stability_log) + for k in range(0,len(temp5[1][i])): + print_debug("\t\t\t" + temp5[1][i][k], True, stability_log) + print_debug("__________________Watch stability.log for details_________________\n", False, stability_log) + if options.notify != "": + attach_mail_file(msg, stability.in_file, "run_tests_log.log") + attach_mail_file(msg, stability_log, "stability.log") + +# *** *** *** +# Performance validation run +# *** *** *** + if ((("performance" in only) == True) or ("stability" in only) == False): + print_debug("\n\nPerformance validation run\n\n", False, "") + performance = options_for_drivers() +# performance constant options + performance.number = 5 + performance.config = "./perf.ini" + performance.path = "./" + performance.silent = True + performance.output = "" + performance.compiler = "" + performance.ref = "ispc_ref" + performance.in_file = "." + os.sep + f_date + os.sep + "performance.log" +# prepare LLVM 3.3 as newest LLVM + need_LLVM = check_LLVM(["3.3"]) + if len(need_LLVM) != 0: + build_LLVM(need_LLVM[i], "", "", "", False, False, True, False) +# prepare reference point. build both test and reference compilers + os.system("git branch > " + temp_alloy_file) + br = open(temp_alloy_file) + temp4 = br.readlines() + br.close() + for line in temp4: + if "*" in line: + current_branch = line[2:-1] + stashing = True + sys.stdout.write("Please, don't interrupt script here! 
You can have not sync git status after interruption!\n") + if "No local changes" in detect_version("git stash"): + stashing = False + #try_do_LLVM("stash current branch ", "git stash", True) + try_do_LLVM("checkout reference branch " + reference_branch + " ", "git checkout " + reference_branch, True) + sys.stdout.write(".\n") + build_ispc("3.3") + sys.stdout.write(".\n") + os.rename("ispc", "ispc_ref") + try_do_LLVM("checkout test branch " + current_branch + " ", "git checkout " + current_branch, True) + if stashing: + try_do_LLVM("return current branch ", "git stash pop", True) + sys.stdout.write("You can interrupt script now.\n") + build_ispc("3.3") +# begin validation run for performance. output is inserted into perf() + perf.perf(performance, []) + if options.notify != "": + attach_mail_file(msg, performance.in_file, "performance.log") + attach_mail_file(msg, "." + os.sep + "logs" + os.sep + "perf_build.log", "perf_build.log") + + print_debug("Logs are in alloy_results_[date]", False, "") + +# sending e-mail with results + if options.notify != "": + fp = open(os.environ["ISPC_HOME"] + os.sep + "all_answer.txt", 'rb') + f_lines = fp.readlines() + fp.close() + line = "" + for i in range(0,len(f_lines)): + line = line + f_lines[i][:-1] + line = line + ' \n' + text = MIMEText(line, "", "KOI-8") + msg.attach(text) + attach_mail_file(msg, alloy_build, "alloy_build.log") + s = smtplib.SMTP(smtp_server) + s.sendmail('ISPC_test_system', options.notify, msg.as_string()) + s.quit() +# exit of validation routine + common.remove_if_exists(temp_alloy_file) + os.chdir(current_path) + +def Main(): + if (platform.system() == 'Windows' or 'CYGWIN_NT' in platform.system()) == True: + error("Windows isn't supported now", 1) + if (options.build_llvm == False and + options.validation_run == False and + options.llvm_home == "" and + options.ispc_home == "" and + options.sde_home == ""): + parser.print_help() + exit(0) + global f_date + f_date = "logs" + common.remove_if_exists(f_date) + os.makedirs(f_date) + global temp_alloy_file + temp_alloy_file = os.getcwd() + os.sep + f_date + os.sep + "temp_detect_version" + global alloy_build + alloy_build = os.getcwd() + os.sep + f_date + os.sep + "alloy_build.log" + common.remove_if_exists(alloy_build) + global stability_log + stability_log = os.getcwd() + os.sep + f_date + os.sep + "stability.log" + common.remove_if_exists(stability_log) + setting_paths(options.llvm_home, options.ispc_home, options.sde_home) + if os.environ.get("LLVM_HOME") == None: + error("you have no LLVM_HOME", 1) + if os.environ.get("ISPC_HOME") == None: + error("you have no ISPC_HOME", 1) + if options.build_llvm: + build_LLVM(options.version, options.revision, options.folder, options.tarball, + options.debug, options.selfbuild, False, options.force) + if options.validation_run: + validation_run(options.only, options.only_targets, options.branch, options.notify, options.update) + os.rename(f_date, "alloy_results_" + datetime.datetime.now().strftime('%H_%M_%d_%m_%Y')) + +###Main### +from optparse import OptionParser +import sys +import os +import operator +import time +import glob +import string +import platform +import smtplib +import datetime +import copy +from email.MIMEMultipart import MIMEMultipart +from email.MIMEBase import MIMEBase +from email.mime.text import MIMEText +from email.Encoders import encode_base64 +# our drivers +import run_tests +import perf +import common +error = common.error +detect_version = common.detect_version +print_debug = common.print_debug +# parsing options 
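# typical invocations (illustrative only -- the exact option spellings are
# defined by the parser set up below, and the script name alloy.py is assumed):
#   alloy.py -b --version=3.3                        build LLVM 3.3 only
#   alloy.py -r --only="stability 3.3 -O2"           stability run with LLVM 3.3 at -O2
#   alloy.py -r --only-targets="avx2-i32x8 sse4"     restrict a validation run to chosen targets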
+parser = OptionParser() +# options for activity "build LLVM" +parser.add_option('-b', '--build-llvm', dest='build_llvm', + help='ask to build LLVM', default=False, action="store_true") +parser.add_option('--version', dest='version', + help='version of llvm to build', default="head") +parser.add_option('--revision', dest='revision', + help='revision of llvm to build', default="") +parser.add_option('--debug', dest='debug', + help='debug build of LLVM?', default=False, action="store_true") +parser.add_option('--folder', dest='folder', + help='folder to build LLVM in', default="") +parser.add_option('--tarball', dest='tarball', + help='"llvm_tarball clang_tarball"', default="") +parser.add_option('--selfbuild', dest='selfbuild', + help='make selfbuild of LLVM and clang', default=False, action="store_true") +parser.add_option('--force', dest='force', + help='rebuild LLVM', default=False, action='store_true') +# options for activity "setup PATHS" +parser.add_option('--llvm_home', dest='llvm_home',help='path to LLVM',default="") +parser.add_option('--ispc_home', dest='ispc_home',help='path to ISPC',default="") +parser.add_option('--sde_home', dest='sde_home',help='path to SDE',default="") +# options for activity "validation run" +parser.add_option('-r', '--run', dest='validation_run', + help='ask for validation run', default=False, action="store_true") +parser.add_option('--compare-with', dest='branch', + help='set performance reference point', default="master") +parser.add_option('--only-targets', dest='only_targets', + help='set list of targets to test. Possible values - all subnames of targets.\n' + + 'Example: --only-targets="avx2-i32x8 sse4 i32x16 sse2"', default="") +parser.add_option('--notify', dest='notify', + help='sent results to email', default="") +parser.add_option('--only', dest='only', + help='set types of tests. Possible values:\n' + + '-O0, -O2, x86, x86-64, stability (test only stability), performance (test only performance)\n' + + 'build (only build with different LLVM), 3.1, 3.2, 3.3, head, native (do not use SDE), current (do not rebuild ISPC).\n' + + 'Example: --only="3.2 -O0 stability 3.3"', default="") +parser.add_option('--update-errors', dest='update', + help='rewrite fail_db.txt file according to received results (F or FP)', default="") +(options, args) = parser.parse_args() +Main() diff --git a/check_env.py b/check_env.py new file mode 100755 index 00000000..98deb235 --- /dev/null +++ b/check_env.py @@ -0,0 +1,102 @@ +#!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# // Author: Filippov Ilia + +import common +import sys +import os +import string +print_debug = common.print_debug +error = common.error +detect_version = common.detect_version + +exists = [False, False, False, False, False, False, False, False] +names = ["m4", "bison", "flex", "sde", "ispc", "clang", "gcc", "icc"] + +PATH_dir = string.split(os.getenv("PATH"), os.pathsep) +for counter in PATH_dir: + for i in range(0,8): + if os.path.exists(counter + os.sep + names[i]): + exists[i] = True + +print_debug("=== in PATH: ===\n", False, "") +print_debug("Tools:\n", False, "") +for i in range(0,3): + if exists[i]: + print_debug(detect_version(names[i] + " --version"), False, "") + else: + error("you don't have " + names[i], 0) +if exists[0] and exists[1] and exists[2]: + if common.check_tools(2): + print_debug("versions are ok\n", False, "") +print_debug("\nSDE:\n", False, "") +if exists[3]: + print_debug(detect_version(names[3] + " --version"), False, "") +else: + error("you don't have " + names[3], 2) +print_debug("\nISPC:\n", False, "") +if exists[4]: + print_debug(detect_version(names[4] + " --version"), False, "") +else: + error("you don't have " + names[4], 2) +print_debug("\nC/C++ compilers:\n", False, "") +for i in range(5,8): + if exists[i]: + print_debug(detect_version(names[i] + " --version"), False, "") + else: + error("you don't have " + names[i], 2) + +print_debug("\n=== in ISPC specific environment variables: ===\n", False, "") +if os.environ.get("LLVM_HOME") == None: + error("you have no LLVM_HOME", 2) +else: + print_debug("Your LLVM_HOME:" + os.environ.get("LLVM_HOME") + "\n", False, "") +if os.environ.get("ISPC_HOME") == None: + error("you have no ISPC_HOME", 2) +else: + print_debug("Your ISPC_HOME:" + os.environ.get("ISPC_HOME") + "\n", False, "") + if os.path.exists(os.environ.get("ISPC_HOME") + os.sep + "ispc"): + print_debug("You have ISPC in your ISPC_HOME: " + + detect_version(os.environ.get("ISPC_HOME") + os.sep + "ispc" + " --version"), False, "") + else: + error("you don't have ISPC in your ISPC_HOME", 2) +if os.environ.get("SDE_HOME") == None: + error("You have no SDE_HOME", 2) +else: + print_debug("Your SDE_HOME:" + os.environ.get("SDE_HOME") + "\n", False, "") + if os.path.exists(os.environ.get("SDE_HOME") + os.sep + "sde"): + print_debug("You have sde in your SDE_HOME: " + + detect_version(os.environ.get("SDE_HOME") + os.sep + "sde" + " --version"), False, "") + else: + error("you don't have any SDE in your ISPC_HOME", 2) diff --git a/common.py b/common.py new file mode 100644 index 00000000..dd8fb388 --- /dev/null +++ b/common.py @@ -0,0 +1,120 @@ +#!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# // Author: Filippov Ilia +import sys +import os +import shutil + +def write_to_file(filename, line): + f = open(filename, 'a') + f.writelines(line) + f.close() + +#remove file if it exists +def remove_if_exists(filename): + if os.path.exists(filename): + if os.path.isdir(filename): + shutil.rmtree(filename) + else: + os.remove(filename) + +# detect version which is printed after command +def detect_version(command): + os.system(command + " > " + "temp_detect_version") + version = open("temp_detect_version") + answer = version.readline() + version.close() + remove_if_exists("temp_detect_version") + return answer + +# print versions of compilers +def print_version(ispc_test, ispc_ref, ref_compiler, s, perf_log, is_windows): + print_debug("\nUsing test compiler: " + detect_version(ispc_test + " --version"), s, perf_log) + if ispc_ref != "": + print_debug("Using ref compiler: " + detect_version(ispc_ref + " --version"), s, perf_log) + if is_windows == False: + temp1 = detect_version(ref_compiler + " --version") + else: + os.system(ref_compiler + " 2>&1" + " 2> temp_detect_version > temp_detect_version1" ) + version = open("temp_detect_version") + temp1 = version.readline() + version.close() + remove_if_exists("temp_detect_version") + remove_if_exists("temp_detect_version1") + print_debug("Using C/C++ compiler: " + temp1 + "\n", s, perf_log) + +# print everything from scripts instead errors +def print_debug(line, silent, filename): + if silent == False: + sys.stdout.write(line) + sys.stdout.flush() + if os.environ.get("ISPC_HOME") != None: + write_to_file(os.environ["ISPC_HOME"] + os.sep + "all_answer.txt", line) + if filename != "": + write_to_file(filename, line) + +# print errors from scripts +# type 1 for error in environment +# type 2 for warning +# type 3 for error of compiler or test which isn't the goal of script +def error(line, error_type): + line = line + "\n" + if error_type == 1: + sys.stderr.write("Fatal error: " + line) + sys.exit(1) + if error_type == 2: + sys.stderr.write("Warning: " + line) + if error_type == 0: + print_debug("FIND ERROR: " + line, False, "") + +def check_tools(m): + input_tools=[[[1,4],"m4 --version", "bad m4 version"], + [[2,4],"bison --version", "bad bison version"], + [[2,5], "flex --version", "bad flex version"]] + + for t in range(0,len(input_tools)): + t1 = 
((detect_version(input_tools[t][1]))[:-1].split(" ")) + for i in range(0,len(t1)): + t11 = t1[i].split(".") + f = True + for j in range(0,len(t11)): + if not t11[j].isdigit(): + f = False + if f == True: + for j in range(0,len(t11)): + if j < len(input_tools[t][0]): + if int(t11[j])> "+build_log) - return os.system("make CXX="+ref_compiler+" CC="+refc_compiler+" >> "+build_log+" 2>> "+build_log) - else: - os.system("msbuild /t:clean >> " + build_log) - return os.system("msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /t:rebuild >> " + build_log) - -def execute_test(command): - global perf_temp - r = 0 - if os.path.exists(perf_temp): - os.remove(perf_temp) - for k in range(int(options.number)): - r = r + os.system(command) - return r - -#gathers all tests results and made an item test from answer structure -def run_test(command, c1, c2, test, b_serial): - global perf_temp - if build_test() != 0: - sys.stdout.write("ERROR: Compilation fails\n") - return - if execute_test(command) != 0: - sys.stdout.write("ERROR: Execution fails\n") - return - tasks = [] #list of results with tasks, it will be test[2] - ispc = [] #list of results without tasks, it will be test[1] - absolute_tasks = [] #list of absolute results with tasks, it will be test[4] - absolute_ispc = [] #list of absolute results without tasks, ut will be test[3] - serial = [] #list serial times, it will be test[5] - j = 1 - for line in open(perf_temp): # we take test output - if "speedup" in line: # we are interested only in lines with speedup - if j == c1: # we are interested only in lines with c1 numbers - line = line.expandtabs(0) - line = line.replace("("," ") - line = line.split(",") - for i in range(len(line)): - subline = line[i].split(" ") - number = float(subline[1][:-1]) - if "speedup from ISPC + tasks" in line[i]: - tasks.append(number) - else: - ispc.append(number) - c1 = c1 + c2 - j+=1 - if "million cycles" in line: - if j == c1: - line = line.replace("]","[") - line = line.split("[") - number = float(line[3]) - if "tasks" in line[1]: - absolute_tasks.append(number) - else: - if "ispc" in line[1]: - absolute_ispc.append(number) - if "serial" in line[1]: - serial.append(number) - - if len(ispc) != 0: - if len(tasks) != 0: - print_debug("ISPC speedup / ISPC + tasks speedup / ISPC time / ISPC + tasks time / serial time\n") - for i in range(0,len(serial)): - print_debug("%10s /\t%10s\t /%9s / %10s\t /%10s\n" % - (ispc[i], tasks[i], absolute_ispc[i], absolute_tasks[i], serial[i])) - else: - print_debug("ISPC speedup / ISPC time / serial time\n") - for i in range(0,len(serial)): - print_debug("%10s /%9s /%10s\n" % (ispc[i], absolute_ispc[i], serial[i])) - else: - if len(tasks) != 0: - print_debug("ISPC + tasks speedup / ISPC + tasks time / serial time\n") - for i in range(0,len(serial)): - print_debug("%10s\t / %10s\t /%10s\n" % (tasks[i], absolute_tasks[i], serial[i])) - - test[1] = test[1] + ispc - test[2] = test[2] + tasks - test[3] = test[3] + absolute_ispc - test[4] = test[4] + absolute_tasks - if b_serial == True: - #if we concatenate outputs we should use only the first serial answer. 
- test[5] = test[5] + serial - -def cpu_get(): - p = open("/proc/stat", 'r') - cpu = p.readline() - p.close() - cpu = cpu.split(" ") - cpu_usage = (int(cpu[2]) + int(cpu[3]) + int(cpu[4])) - cpu_all = cpu_usage + int(cpu[5]) - return [cpu_usage, cpu_all] - -#returns cpu_usage -def cpu_check(): - if is_windows == False: - if is_mac == False: - cpu1 = cpu_get() - time.sleep(1) - cpu2 = cpu_get() - cpu_percent = (float(cpu1[0] - cpu2[0])/float(cpu1[1] - cpu2[1]))*100 - else: - os.system("sysctl -n vm.loadavg > cpu_temp") - c = open("cpu_temp", 'r') - c_line = c.readline() - c.close - os.remove("cpu_temp") - R = c_line.split(' ') - cpu_percent = float(R[1]) * 3 - else: - os.system("wmic cpu get loadpercentage /value > cpu_temp") - c = open("cpu_temp", 'r') - c_lines = c.readlines() - c.close() - os.remove("cpu_temp") - t = "0" - for i in c_lines[2]: - if i.isdigit(): - t = t + i - cpu_percent = int(t) - return cpu_percent - -#returns geomean of list -def geomean(par): - temp = 1 - l = len(par) - for i in range(l): - temp = temp * par[i] - temp = temp ** (1.0/l) - return round(temp, 2) - -#takes an answer struct and print it. -#answer struct: list answer contains lists test -#test[0] - name of test -#test[1] - list of results without tasks -#test[2] - list of results with tasks -#test[3] - list of absolute results without tasks -#test[4] - list of absolute results with tasks -#test[5] - list of absolute time without ISPC (serial) -#test[1..4] may be empty -def print_answer(answer): - filelist = [] - print_debug("--------------------------------------------------------------------------\n") - print_debug("test name:\t ISPC speedup: ISPC + tasks speedup: | " + - "ISPC time: ISPC + tasks time: serial:\n") - filelist.append("test name,ISPC speedup,diff," + - "ISPC + tasks speedup,diff,ISPC time,diff,ISPC + tasks time,diff,serial,diff\n") - max_t = [0,0,0,0,0] - diff_t = [0,0,0,0,0] - geomean_t = [0,0,0,0,0] - list_of_max = [[],[],[],[],[]] - for i in range(len(answer)): - for t in range(1,6): - if len(answer[i][t]) == 0: - max_t[t-1] = "n/a" - diff_t[t-1] = "n/a" - else: - if t < 3: - mm = max(answer[i][t]) - else: - mm = min(answer[i][t]) - max_t[t-1] = '%.2f' % mm - list_of_max[t-1].append(mm) - diff_t[t-1] = '%.2f' % (max(answer[i][t]) - min(answer[i][t])) - print_debug("%s:\n" % answer[i][0]) - print_debug("\t\tmax:\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % - (max_t[0], max_t[1], max_t[2], max_t[3], max_t[4])) - print_debug("\t\tdiff:\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % - (diff_t[0], diff_t[1], diff_t[2], diff_t[3], diff_t[4])) - for t in range(0,5): - if max_t[t] == "n/a": - max_t[t] = "" - if diff_t[t] == "n/a": - diff_t[t] = "" - filelist.append(answer[i][0] + "," + - max_t[0] + "," + diff_t[0] + "," + max_t[1] + "," + diff_t[1] + "," + - max_t[2] + "," + diff_t[2] + "," + max_t[3] + "," + diff_t[3] + "," + - max_t[4] + "," + diff_t[4] + "\n") - for i in range(0,5): - geomean_t[i] = geomean(list_of_max[i]) - print_debug("---------------------------------------------------------------------------------\n") - print_debug("Geomean:\t\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % - (geomean_t[0], geomean_t[1], geomean_t[2], geomean_t[3], geomean_t[4])) - filelist.append("Geomean," + str(geomean_t[0]) + ",," + str(geomean_t[1]) - + ",," + str(geomean_t[2]) + ",," + str(geomean_t[3]) + ",," + str(geomean_t[4]) + "\n") - print_file(filelist) - - -###Main### -# parsing options -parser = OptionParser() -parser.add_option('-n', '--number', dest='number', - help='number of repeats', default="3") 
-parser.add_option('-c', '--config', dest='config', - help='config file of tests', default="./perf.ini") -parser.add_option('-p', '--path', dest='path', - help='path to examples directory', default="./") -parser.add_option('-s', '--silent', dest='silent', - help='silent mode, only table output', default=False, action="store_true") -parser.add_option('-o', '--output', dest='output', - help='output file for script reading', default="") -parser.add_option('--compiler', dest='compiler', - help='reference compiler', default="") -(options, args) = parser.parse_args() - -global is_windows -is_windows = (platform.system() == 'Windows' or - 'CYGWIN_NT' in platform.system()) -global is_mac -is_mac = (platform.system() == 'Darwin') - -# save corrent path -pwd = os.getcwd() -pwd = pwd + os.sep -if is_windows: - pwd = "..\\" - -# check if cpu usage is low now -cpu_percent = cpu_check() -if cpu_percent > 20: - sys.stdout.write("Warning: CPU Usage is very high.\n") - sys.stdout.write("Close other applications.\n") - -# check that required compilers exist -PATH_dir = string.split(os.getenv("PATH"), os.pathsep) -compiler_exists = False -ref_compiler_exists = False -if is_windows == False: - compiler = "ispc" - ref_compiler = "g++" - refc_compiler = "gcc" - if options.compiler != "": - if options.compiler == "clang" or options.compiler == "clang++": - ref_compiler = "clang++" - refc_compiler = "clang" - if options.compiler == "icc" or options.compiler == "icpc": - ref_compiler = "icpc" - refc_compiler = "icc" -else: - compiler = "ispc.exe" - ref_compiler = "cl.exe" -for counter in PATH_dir: - if os.path.exists(counter + os.sep + compiler): - compiler_exists = True - if os.path.exists(counter + os.sep + ref_compiler): - ref_compiler_exists = True -if not compiler_exists: - sys.stderr.write("Fatal error: ISPC compiler not found.\n") - sys.stderr.write("Added path to ispc compiler to your PATH variable.\n") - sys.exit() -if not ref_compiler_exists: - sys.stderr.write("Fatal error: reference compiler %s not found.\n" % ref_compiler) - sys.stderr.write("Added path to %s compiler to your PATH variable.\n" % ref_compiler) - sys.exit() - -# checks that config file exists -path_config = os.path.normpath(options.config) -if os.path.exists(path_config) == False: - sys.stderr.write("Fatal error: config file not found: %s.\n" % options.config) - sys.stderr.write("Set path to your config file in --config.\n") - sys.exit() - -# read lines from config file except comments -f = open(path_config, 'r') -f_lines = f.readlines() -f.close() -lines =[] -for i in range(len(f_lines)): - if f_lines[i][0] != "%": - lines.append(f_lines[i]) -length = len(lines) - -# prepare build.log and perf_temp files -global build_log -build_log = pwd + "build.log" -if is_windows == False: - if os.path.exists(build_log): - os.remove(build_log) -else: - if os.path.exists("build.log"): - os.remove("build.log") -global perf_temp -perf_temp = pwd + "perf_temp" - -i = 0 -answer = [] -print_debug("Okey go go go!\n\n") -os.system(compiler + " --version >" + build_log) -version = open(build_log) -print_debug("Using test compiler: " + version.readline()) -version.close() - -if is_windows == False: - os.system(ref_compiler + " --version >" + build_log) -else: - os.system(ref_compiler + " 2>" + build_log + " 1>&2") - -version = open(build_log) -print_debug("Using reference compiler: " + version.readline()) -version.close() - - -# loop for all tests -while i < length-2: - # we read name of test - print_debug("%s" % lines[i]) - test = 
[lines[i][:-1],[],[],[],[],[]] - # read location of test - folder = lines[i+1] - folder = folder[:-1] - folder = os.path.normpath(options.path + os.sep + folder) - # check that test exists - if os.path.exists(folder) == False: - sys.stdout.write("Fatal error: Can't find test %s. Your path is: \"%s\".\n" % (lines[i][:-1], options.path)) - sys.stdout.write("Change current location to /examples or set path to /examples in --path.\n") - exit(0) - os.chdir(folder) - # read parameters of test - command = lines[i+2] - command = command[:-1] - if is_windows == False: - command = "./"+command + " >> " + perf_temp - else: - command = "x64\\Release\\"+command + " >> " + perf_temp - # parsing config parameters - next_line = lines[i+3] - if next_line[0] == "!": # we should take only one part of test output - R = next_line.split(' ') - c1 = int(R[1]) #c1 is a number of string which we want to use in test output - c2 = int(R[2]) #c2 is total number of strings in test output - i = i+1 - else: - c1 = 1 - c2 = 1 - next_line = lines[i+3] - if next_line[0] == "^": #we should concatenate result of this test with previous one - run_test(command, c1, c2, answer[len(answer)-1], False) - i = i+1 - else: #we run this test and append it's result to answer structure - run_test(command, c1, c2, test, True) - answer.append(test) - # preparing next loop iteration - os.chdir(pwd) - i+=4 - -# delete temp file -if os.path.exists(perf_temp): - os.remove(perf_temp) -#print collected answer -print_answer(answer) diff --git a/fail_db.txt b/fail_db.txt new file mode 100644 index 00000000..7adc3e41 --- /dev/null +++ b/fail_db.txt @@ -0,0 +1 @@ +% List of known fails diff --git a/llvm_patches/r183327-AVX2-GATHER.patch b/llvm_patches/3_3_r183327-AVX2-GATHER.patch similarity index 100% rename from llvm_patches/r183327-AVX2-GATHER.patch rename to llvm_patches/3_3_r183327-AVX2-GATHER.patch diff --git a/llvm_patches/r184575-x86-shift.patch b/llvm_patches/3_3_r184575-x86-shift.patch similarity index 100% rename from llvm_patches/r184575-x86-shift.patch rename to llvm_patches/3_3_r184575-x86-shift.patch diff --git a/examples/perf.ini b/perf.ini similarity index 84% rename from examples/perf.ini rename to perf.ini index d2a5c73e..d8c7fe71 100755 --- a/examples/perf.ini +++ b/perf.ini @@ -10,44 +10,48 @@ %**************************************************************************************************** AOBench aobench -ao 10 512 512 +10 512 512 #*** Deferred Shading deferred -deferred_shading data/pp1280x720.bin +data/pp1280x720.bin #*** Mandelbrot Set mandelbrot -mandelbrot + #*** Mandelbrot Set mandelbrot_tasks -mandelbrot_tasks + ^ #*** Perlin Noise Function noise -noise + #*** Binomial Options options -options + ! 1 2 #*** Black-Scholes Options options -options + ! 2 2 #*** Ray Tracer rt -rt sponza +sponza #*** 3D Stencil stencil -stencil + #*** Volume Rendering volume_rendering -volume camera.dat density_highres.vol +camera.dat density_highres.vol #*** +%Sort +%sort +% +%#*** diff --git a/perf.py b/perf.py new file mode 100755 index 00000000..d1d7654b --- /dev/null +++ b/perf.py @@ -0,0 +1,489 @@ +#!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# // Author: Filippov Ilia + +def print_file(line): + if options.output != "": + output = open(options.output, 'w') + output.writelines(line) + output.close() + +def build_test(commands): + os.system(commands[4]) + test = os.system(commands[1]) + if options.ref: + ref = os.system(commands[3]) + return (options.ref and ref) or test + +def execute_test(commands): + r = 0 + common.remove_if_exists(perf_temp+"_test") + common.remove_if_exists(perf_temp+"_ref") + for k in range(int(options.number)): + r = r + os.system(commands[0]) + if options.ref: + r = r + os.system(commands[2]) + return r + +#gathers all tests results and made an item test from answer structure +def run_test(commands, c1, c2, test, test_ref, b_serial): + if build_test(commands) != 0: + error("Compilation fails of test %s\n" % test[0], 0) + return + if execute_test(commands) != 0: + error("Execution fails of test %s\n" % test[0], 0) + return + print_debug("TEST COMPILER:\n", s, perf_log) + analyse_test(c1, c2, test, b_serial, perf_temp+"_test") + if options.ref: + print_debug("REFERENCE COMPILER:\n", s, perf_log) + analyse_test(c1, c2, test_ref, b_serial, perf_temp+"_ref") + + +def analyse_test(c1, c2, test, b_serial, perf_temp_n): + tasks = [] #list of results with tasks, it will be test[2] + ispc = [] #list of results without tasks, it will be test[1] + absolute_tasks = [] #list of absolute results with tasks, it will be test[4] + absolute_ispc = [] #list of absolute results without tasks, ut will be test[3] + serial = [] #list serial times, it will be test[5] + j = 1 + for line in open(perf_temp_n): # we take test output + if "speedup" in line: # we are interested only in lines with speedup + if j == c1: # we are interested only in lines with c1 numbers + line = line.expandtabs(0) + line = line.replace("("," ") + line = line.split(",") + for i in range(len(line)): + subline = line[i].split(" ") + number = float(subline[1][:-1]) + if "speedup from ISPC + tasks" in line[i]: + tasks.append(number) + else: + ispc.append(number) + c1 = c1 + c2 + j+=1 + if "million cycles" in line: + if j == c1: + line = line.replace("]","[") + line = line.split("[") + number = float(line[3]) + if "tasks" in line[1]: + absolute_tasks.append(number) + else: + if "ispc" in line[1]: + absolute_ispc.append(number) + if "serial" in line[1]: + 
serial.append(number) + + if len(ispc) != 0: + if len(tasks) != 0: + print_debug("ISPC speedup / ISPC + tasks speedup / ISPC time / ISPC + tasks time / serial time\n", s, perf_log) + for i in range(0,len(serial)): + print_debug("%10s /\t%10s\t /%9s / %10s\t /%10s\n" % + (ispc[i], tasks[i], absolute_ispc[i], absolute_tasks[i], serial[i]), s, perf_log) + else: + print_debug("ISPC speedup / ISPC time / serial time\n", s, perf_log) + for i in range(0,len(serial)): + print_debug("%10s /%9s /%10s\n" % (ispc[i], absolute_ispc[i], serial[i]), s, perf_log) + else: + if len(tasks) != 0: + print_debug("ISPC + tasks speedup / ISPC + tasks time / serial time\n", s, perf_log) + for i in range(0,len(serial)): + print_debug("%10s\t / %10s\t /%10s\n" % (tasks[i], absolute_tasks[i], serial[i]), s, perf_log) + + test[1] = test[1] + ispc + test[2] = test[2] + tasks + test[3] = test[3] + absolute_ispc + test[4] = test[4] + absolute_tasks + if b_serial == True: + #if we concatenate outputs we should use only the first serial answer. + test[5] = test[5] + serial + +def cpu_get(): + p = open("/proc/stat", 'r') + cpu = p.readline() + p.close() + cpu = cpu.split(" ") + cpu_usage = (int(cpu[2]) + int(cpu[3]) + int(cpu[4])) + cpu_all = cpu_usage + int(cpu[5]) + return [cpu_usage, cpu_all] + +#returns cpu_usage +def cpu_check(): + if is_windows == False: + if is_mac == False: + cpu1 = cpu_get() + time.sleep(1) + cpu2 = cpu_get() + cpu_percent = (float(cpu1[0] - cpu2[0])/float(cpu1[1] - cpu2[1]))*100 + else: + os.system("sysctl -n vm.loadavg > cpu_temp") + c = open("cpu_temp", 'r') + c_line = c.readline() + c.close + os.remove("cpu_temp") + R = c_line.split(' ') + cpu_percent = float(R[1]) * 3 + else: + os.system("wmic cpu get loadpercentage /value > cpu_temp") + c = open("cpu_temp", 'r') + c_lines = c.readlines() + c.close() + os.remove("cpu_temp") + t = "0" + for i in c_lines[2]: + if i.isdigit(): + t = t + i + cpu_percent = int(t) + return cpu_percent + +#returns geomean of list +def geomean(par): + temp = 1 + l = len(par) + for i in range(l): + temp = temp * par[i] + temp = temp ** (1.0/l) + return round(temp, 2) + +#takes an answer struct and print it. 
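+#besides printing, it returns the list_of_compare structure that compare() below
+#uses to match the test compiler results against the reference compiler.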
+#answer struct: list answer contains lists test +#test[0] - name of test +#test[1] - list of results without tasks +#test[2] - list of results with tasks +#test[3] - list of absolute results without tasks +#test[4] - list of absolute results with tasks +#test[5] - list of absolute time without ISPC (serial) +#test[1..4] may be empty +def print_answer(answer): + filelist = [] + print_debug("--------------------------------------------------------------------------\n", s, perf_log) + print_debug("test name:\t ISPC speedup: ISPC + tasks speedup: | " + + "ISPC time: ISPC + tasks time: serial:\n", s, perf_log) + filelist.append("test name,ISPC speedup,diff," + + "ISPC + tasks speedup,diff,ISPC time,diff,ISPC + tasks time,diff,serial,diff\n") + max_t = [0,0,0,0,0] + diff_t = [0,0,0,0,0] + geomean_t = [0,0,0,0,0] + list_of_max = [[],[],[],[],[]] + list_of_compare = [[],[],[],[],[],[]] + for i in range(len(answer)): + list_of_compare[0].append(answer[i][0]) + for t in range(1,6): + if len(answer[i][t]) == 0: + max_t[t-1] = "n/a" + diff_t[t-1] = "n/a" + list_of_compare[t].append(0); + else: + if t < 3: + mm = max(answer[i][t]) + else: + mm = min(answer[i][t]) + list_of_compare[t].append(mm) + max_t[t-1] = '%.2f' % mm + list_of_max[t-1].append(mm) + diff_t[t-1] = '%.2f' % (max(answer[i][t]) - min(answer[i][t])) + print_debug("%s:\n" % answer[i][0], s, perf_log) + print_debug("\t\tmax:\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % + (max_t[0], max_t[1], max_t[2], max_t[3], max_t[4]), s, perf_log) + print_debug("\t\tdiff:\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % + (diff_t[0], diff_t[1], diff_t[2], diff_t[3], diff_t[4]), s, perf_log) + for t in range(0,5): + if max_t[t] == "n/a": + max_t[t] = "" + if diff_t[t] == "n/a": + diff_t[t] = "" + filelist.append(answer[i][0] + "," + + max_t[0] + "," + diff_t[0] + "," + max_t[1] + "," + diff_t[1] + "," + + max_t[2] + "," + diff_t[2] + "," + max_t[3] + "," + diff_t[3] + "," + + max_t[4] + "," + diff_t[4] + "\n") + for i in range(0,5): + geomean_t[i] = geomean(list_of_max[i]) + print_debug("---------------------------------------------------------------------------------\n", s, perf_log) + print_debug("Geomean:\t\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % + (geomean_t[0], geomean_t[1], geomean_t[2], geomean_t[3], geomean_t[4]), s, perf_log) + filelist.append("Geomean," + str(geomean_t[0]) + ",," + str(geomean_t[1]) + + ",," + str(geomean_t[2]) + ",," + str(geomean_t[3]) + ",," + str(geomean_t[4]) + "\n") + print_file(filelist) + return list_of_compare + + +def compare(A, B): + print_debug("\n\n_____________________PERFORMANCE REPORT____________________________\n", False, "") + print_debug("test name: ISPC time: ISPC time ref: %:\n", False, "") + for i in range(0,len(A[0])): + if B[3][i] == 0: + p1 = 0 + else: + p1 = 100 - 100 * A[3][i]/B[3][i] + print_debug("%21s: %10.2f %10.2f %10.2f" % (A[0][i], A[3][i], B[3][i], p1), False, "") + if p1 < -1: + print_debug(" <-", False, "") + if p1 > 1: + print_debug(" <+", False, "") + print_debug("\n", False, "") + print_debug("\n", False, "") + + print_debug("test name: TASKS time: TASKS time ref: %:\n", False, "") + for i in range(0,len(A[0])): + if B[4][i] == 0: + p2 = 0 + else: + p2 = 100 - 100 * A[4][i]/B[4][i] + print_debug("%21s: %10.2f %10.2f %10.2f" % (A[0][i], A[4][i], B[4][i], p2), False, "") + if p2 < -1: + print_debug(" <-", False, "") + if p2 > 1: + print_debug(" <+", False, "") + print_debug("\n", False, "") + if "performance.log" in options.in_file: + print_debug("\n\n_________________Watch performance.log for 
details________________\n", False, "") + else: + print_debug("\n\n__________________________________________________________________\n", False, "") + + + +def perf(options1, args): + global options + options = options1 + global s + s = options.silent + + # save current OS + global is_windows + is_windows = (platform.system() == 'Windows' or + 'CYGWIN_NT' in platform.system()) + global is_mac + is_mac = (platform.system() == 'Darwin') + + # save current path + pwd = os.getcwd() + pwd = pwd + os.sep + pwd1 = pwd + if is_windows: + pwd1 = "..\\..\\" + + # check if cpu usage is low now + cpu_percent = cpu_check() + if cpu_percent > 20: + error("CPU Usage is very high.\nClose other applications.\n", 2) + + global ispc_test + global ispc_ref + global ref_compiler + global refc_compiler + # check that required compilers exist + PATH_dir = string.split(os.getenv("PATH"), os.pathsep) + ispc_test_exists = False + ispc_ref_exists = False + ref_compiler_exists = False + if is_windows == False: + ispc_test = "ispc" + ref_compiler = "g++" + refc_compiler = "gcc" + if options.compiler != "": + if options.compiler == "clang" or options.compiler == "clang++": + ref_compiler = "clang++" + refc_compiler = "clang" + if options.compiler == "icc" or options.compiler == "icpc": + ref_compiler = "icpc" + refc_compiler = "icc" + else: + ispc_test = "ispc.exe" + ref_compiler = "cl.exe" + ispc_ref = options.ref + if options.ref != "": + options.ref = True + for counter in PATH_dir: + if os.path.exists(counter + os.sep + ispc_test): + ispc_test_exists = True + if os.path.exists(counter + os.sep + ref_compiler): + ref_compiler_exists = True + if os.path.exists(counter + os.sep + ispc_ref): + ispc_ref_exists = True + if not ispc_test_exists: + error("ISPC compiler not found.\nAdded path to ispc compiler to your PATH variable.\n", 1) + if not ref_compiler_exists: + error("C/C++ compiler %s not found.\nAdded path to %s compiler to your PATH variable.\n" % (ref_compiler, ref_compiler), 1) + if options.ref: + if not ispc_ref_exists: + error("ISPC reference compiler not found.\nAdded path to ispc reference compiler to your PATH variable.\n", 1) + + # checks that config file exists + path_config = os.path.normpath(options.config) + if os.path.exists(path_config) == False: + error("config file not found: %s.\nSet path to your config file in --config.\n" % options.config, 1) + sys.exit() + + # read lines from config file except comments + f = open(path_config, 'r') + f_lines = f.readlines() + f.close() + lines =[] + for i in range(len(f_lines)): + if f_lines[i][0] != "%": + lines.append(f_lines[i]) + length = len(lines) + + # prepare build.log, perf_temp and perf.log files + global perf_log + if options.in_file: + perf_log = pwd + options.in_file + common.remove_if_exists(perf_log) + else: + perf_log = "" + global build_log + build_log = pwd + os.sep + "logs" + os.sep + "perf_build.log" + common.remove_if_exists(build_log) + if os.path.exists(pwd + os.sep + "logs") == False: + os.makedirs(pwd + os.sep + "logs") + + global perf_temp + perf_temp = pwd + "perf_temp" + # end of preparations + + print_debug("Okey go go go!\n\n", s, perf_log) + + #print compilers versions + common.print_version(ispc_test, ispc_ref, ref_compiler, False, perf_log, is_windows) + + # begin + i = 0 + answer = [] + answer_ref = [] + + # loop for all tests + while i < length-2: + # we read name of test + print_debug("%s" % lines[i], s, perf_log) + test = [lines[i][:-1],[],[],[],[],[]] + test_ref = [lines[i][:-1],[],[],[],[],[]] + # read location of test + 
folder = lines[i+1] + folder = folder[:-1] + folder = os.path.normpath(options.path + os.sep + "examples" + os.sep + folder) + # check that test exists + if os.path.exists(folder) == False: + error("Can't find test %s. Your path is: \"%s\".\nChange current location to ISPC_HOME or set path to ISPC_HOME in --path.\n" % + (lines[i][:-1], options.path), 1) + os.chdir(folder) + # read parameters of test + command = lines[i+2] + command = command[:-1] + if is_windows == False: + ex_command_ref = "./ref " + command + " >> " + perf_temp + "_ref" + ex_command = "./test " + command + " >> " + perf_temp + "_test" + bu_command_ref = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=ref ISPC="+ispc_ref+" >> "+build_log+" 2>> "+build_log + bu_command = "make CXX="+ref_compiler+" CC="+refc_compiler+ " EXAMPLE=test ISPC="+ispc_test+" >> "+build_log+" 2>> "+build_log + re_command = "make clean >> "+build_log + else: + ex_command_ref = "x64\\Release\\ref.exe " + command + " >> " + perf_temp + "_ref" + ex_command = "x64\\Release\\test.exe " + command + " >> " + perf_temp + "_test" + bu_command_ref = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=ref /t:rebuild >> " + build_log + bu_command = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /p:TargetName=test /t:rebuild >> " + build_log + re_command = "msbuild /t:clean >> " + build_log + commands = [ex_command, bu_command, ex_command_ref, bu_command_ref, re_command] + # parsing config parameters + next_line = lines[i+3] + if next_line[0] == "!": # we should take only one part of test output + R = next_line.split(' ') + c1 = int(R[1]) #c1 is a number of string which we want to use in test output + c2 = int(R[2]) #c2 is total number of strings in test output + i = i+1 + else: + c1 = 1 + c2 = 1 + next_line = lines[i+3] + if next_line[0] == "^": #we should concatenate result of this test with previous one + run_test(commands, c1, c2, answer[len(answer)-1], answer_ref[len(answer)-1], False) + i = i+1 + else: #we run this test and append it's result to answer structure + run_test(commands, c1, c2, test, test_ref, True) + answer.append(test) + answer_ref.append(test_ref) + + # preparing next loop iteration + os.chdir(pwd1) + i+=4 + + # delete temp file + common.remove_if_exists(perf_temp+"_test") + common.remove_if_exists(perf_temp+"_ref") + + #print collected answer + print_debug("\n\nTEST COMPILER:\n", s, perf_log) + A = print_answer(answer) + if options.ref != "": + print_debug("\n\nREFERENCE COMPILER:\n", s, perf_log) + B = print_answer(answer_ref) + # print perf report + compare(A,B) + + + +###Main### +from optparse import OptionParser +import sys +import os +import operator +import time +import glob +import string +import platform +# our functions +import common +print_debug = common.print_debug +error = common.error + +if __name__ == "__main__": + # parsing options + parser = OptionParser() + parser.add_option('-n', '--number', dest='number', + help='number of repeats', default="3") + parser.add_option('-c', '--config', dest='config', + help='config file of tests', default="./perf.ini") + parser.add_option('-p', '--path', dest='path', + help='path to test_system directory', default=".") + parser.add_option('-s', '--silent', dest='silent', + help='silent mode, only table output', default=False, action="store_true") + parser.add_option('-o', '--output', dest='output', + help='output file for script reading', default="") + parser.add_option('--compiler', dest='compiler', + help='C/C++ compiler', 
default="") + parser.add_option('-r', '--ref', dest='ref', + help='set reference compiler for compare', default="") + parser.add_option('-f', '--file', dest='in_file', + help='file to save perf output', default="") + (options, args) = parser.parse_args() + perf(options, args) diff --git a/run_tests.py b/run_tests.py index 9729930f..2471b6cb 100755 --- a/run_tests.py +++ b/run_tests.py @@ -1,165 +1,37 @@ #!/usr/bin/python +# +# Copyright (c) 2013, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # test-running driver for ispc - -from optparse import OptionParser -import multiprocessing -from ctypes import c_int -import os -import sys -import glob -import re -import signal -import random -import string -import subprocess -import shlex -import platform -import tempfile -import os.path -import time - -# disable fancy error/warning printing with ANSI colors, so grepping for error -# messages doesn't get confused -os.environ["TERM"] = "dumb" - -# This script is affected by http://bugs.python.org/issue5261 on OSX 10.5 Leopard -# git history has a workaround for that issue. 
- -is_windows = (platform.system() == 'Windows' or - 'CYGWIN_NT' in platform.system()) - -parser = OptionParser() -parser.add_option("-r", "--random-shuffle", dest="random", help="Randomly order tests", - default=False, action="store_true") -parser.add_option("-g", "--generics-include", dest="include_file", help="Filename for header implementing functions for generics", - default=None) -parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", - default="") -parser.add_option('-t', '--target', dest='target', - help='Set compilation target (sse2-i32x4, sse2-i32x8, sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, avx1-i32x8, avx1-i32x16, avx1.1-i32x8, avx1.1-i32x16, avx2-i32x8, avx2-i32x16, generic-x1, generic-x4, generic-x8, generic-x16, generic-x32, generic-x64)', - default="sse4") -parser.add_option('-a', '--arch', dest='arch', - help='Set architecture (arm, x86, x86-64)', - default="x86-64") -parser.add_option("-c", "--compiler", dest="compiler_exe", help="Compiler binary to use to run tests", - default=None) -parser.add_option('-o', '--no-opt', dest='no_opt', help='Disable optimization', - default=False, action="store_true") -parser.add_option('-j', '--jobs', dest='num_jobs', help='Maximum number of jobs to run in parallel', - default="1024", type="int") -parser.add_option('-v', '--verbose', dest='verbose', help='Enable verbose output', - default=False, action="store_true") -parser.add_option('--wrap-exe', dest='wrapexe', - help='Executable to wrap test runs with (e.g. "valgrind")', - default="") -parser.add_option('--time', dest='time', help='Enable time output', - default=False, action="store_true") -parser.add_option('--non-interactive', dest='non_interactive', help='Disable interactive status updates', - default=False, action="store_true") - -(options, args) = parser.parse_args() - -if options.target == 'neon': - options.arch = 'arm' - -# use relative path to not depend on host directory, which may possibly -# have white spaces and unicode characters. 
-if not is_windows: - ispc_exe = "./ispc" -else: - ispc_exe = ".\\Release\\ispc.exe" - -# checks the required ispc compiler otherwise prints an error message -if not os.path.exists(ispc_exe): - sys.stderr.write("Fatal error: missing ispc compiler: %s\n" % ispc_exe) - sys.exit() - -ispc_exe += " " + options.ispc_flags - -if __name__ == '__main__': - sys.stdout.write("ispc compiler: %s\n" % ispc_exe) - -is_generic_target = (options.target.find("generic-") != -1 and - options.target != "generic-1") -if is_generic_target and options.include_file == None: - if options.target == "generic-4": - sys.stderr.write("No generics #include specified; using examples/intrinsics/sse4.h\n") - options.include_file = "examples/intrinsics/sse4.h" - elif options.target == "generic-8": - sys.stderr.write("No generics #include specified and no default available for \"generic-8\" target.\n") - sys.exit(1) - elif options.target == "generic-16": - sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-16.h\n") - options.include_file = "examples/intrinsics/generic-16.h" - elif options.target == "generic-32": - sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-32.h\n") - options.include_file = "examples/intrinsics/generic-32.h" - elif options.target == "generic-64": - sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-64.h\n") - options.include_file = "examples/intrinsics/generic-64.h" - -if options.compiler_exe == None: - if is_windows: - options.compiler_exe = "cl.exe" - else: - options.compiler_exe = "g++" - -# checks the required compiler otherwise prints an error message -PATH_dir = string.split(os.getenv("PATH"), os.pathsep) -compiler_exists = False - -for counter in PATH_dir: - if os.path.exists(counter + os.sep + options.compiler_exe): - compiler_exists = True - break - -if not compiler_exists: - sys.stderr.write("Fatal error: missing the required compiler: %s \n" % - options.compiler_exe) - sys.exit() - -ispc_root = "." - -# if no specific test files are specified, run all of the tests in tests/, -# failing_tests/, and tests_errors/ -if len(args) == 0: - files = glob.glob(ispc_root + os.sep + "tests" + os.sep + "*ispc") + \ - glob.glob(ispc_root + os.sep + "failing_tests" + os.sep + "*ispc") + \ - glob.glob(ispc_root + os.sep + "tests_errors" + os.sep + "*ispc") -else: - if is_windows: - argfiles = [ ] - for f in args: - # we have to glob ourselves if this is being run under a DOS - # shell, as it passes wildcard as is. - argfiles += glob.glob(f) - else: - argfiles = args - - files = [ ] - for f in argfiles: - if os.path.splitext(string.lower(f))[1] != ".ispc": - sys.stdout.write("Ignoring file %s, which doesn't have an .ispc extension.\n" % f) - else: - files += [ f ] - -# max_test_length is used to issue exact number of whitespace characters when -# updating status. Otherwise update causes new lines standard 80 char terminal -# on both Linux and Windows. -max_test_length = 0 -for f in files: - max_test_length = max(max_test_length, len(f)) - -# randomly shuffle the tests if asked to do so -if (options.random): - random.seed() - random.shuffle(files) - -# counter -total_tests = 0 - - # utility routine to print an update on the number of tests that have been # finished. Should be called with the lock held.. 
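A note on the counter referred to here: the finished-test count is shared between the parent and the worker processes, so it must only be updated while the corresponding lock is held. A minimal, self-contained sketch of that pattern, illustrative only and not part of the patch:

    import multiprocessing
    from ctypes import c_int

    def bump(counter, lock):
        # increment the shared counter only while holding the lock
        with lock:
            counter.value += 1

    if __name__ == "__main__":
        counter = multiprocessing.Value(c_int, 0)
        lock = multiprocessing.Lock()
        workers = [multiprocessing.Process(target=bump, args=(counter, lock))
                   for _ in range(4)]
        for w in workers:
            w.start()
        for w in workers:
            w.join()
        print(counter.value)  # expected: 4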
def update_progress(fn, total_tests_arg, counter, max_test_length_arg): @@ -176,7 +48,7 @@ def update_progress(fn, total_tests_arg, counter, max_test_length_arg): def run_command(cmd): if options.verbose: - sys.stdout.write("Running: %s\n" % cmd) + print_debug("Running: %s\n" % cmd, s, run_tests_log) # Here's a bit tricky part. To pass a command for execution we should # break down the line in to arguments. shlex class is designed exactly @@ -204,9 +76,9 @@ def run_cmds(compile_cmds, run_cmd, filename, expect_failure): (return_code, output) = run_command(cmd) compile_failed = (return_code != 0) if compile_failed: - sys.stdout.write("Compilation of test %s failed \n" % filename) + print_debug("Compilation of test %s failed \n" % filename, s, run_tests_log) if output != "": - sys.stdout.write("%s" % output.encode("utf-8")) + print_debug("%s" % output.encode("utf-8"), s, run_tests_log) return (1, 0) (return_code, output) = run_command(run_cmd) @@ -215,11 +87,11 @@ def run_cmds(compile_cmds, run_cmd, filename, expect_failure): surprise = ((expect_failure and not run_failed) or (not expect_failure and run_failed)) if surprise == True: - sys.stderr.write("Test %s %s (return code %d) \n" % \ + print_debug("Test %s %s (return code %d) \n" % \ (filename, "unexpectedly passed" if expect_failure else "failed", - return_code)) + return_code), s, run_tests_log) if output != "": - sys.stdout.write("%s\n" % output.encode("utf-8")) + print_debug("%s\n" % output.encode("utf-8"), s, run_tests_log) if surprise == True: return (0, 1) else: @@ -298,11 +170,11 @@ def run_test(testname): file.close() if re.search(firstline, output) == None: - sys.stderr.write("Didn't see expected error message %s from test %s.\nActual output:\n%s\n" % \ - (firstline, testname, output)) + print_debug("Didn't see expected error message %s from test %s.\nActual output:\n%s\n" % \ + (firstline, testname, output), s, run_tests_log) return (1, 0) elif got_error == False: - sys.stderr.write("Unexpectedly no errors issued from test %s\n" % testname) + print_debug("Unexpectedly no errors issued from test %s\n" % testname, s, run_tests_log) return (1, 0) else: return (0, 0) @@ -328,8 +200,7 @@ def run_test(testname): break file.close() if match == -1: - sys.stderr.write("Fatal error: unable to find function signature " + \ - "in test %s\n" % testname) + error("unable to find function signature in test %s\n" % testname, 0) return (1, 0) else: global is_generic_target @@ -404,7 +275,21 @@ def run_test(testname): # pull tests to run from the given queue and run them. Multiple copies of # this function will be running in parallel across all of the CPU cores of # the system. 
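The reworked run_tasks_from_queue below takes an extra glob_var argument because, on Windows, multiprocessing starts workers as fresh processes that do not inherit globals the parent set at run time, so the state each worker needs has to be handed over explicitly. A minimal sketch of that pattern; the names and values are invented for illustration and are not the patch's code:

    import multiprocessing

    def worker(task_queue, state):
        # 'state' plays the role of glob_var: everything the worker needs,
        # passed explicitly instead of relying on module-level globals
        is_windows, options = state
        for task in iter(task_queue.get, None):   # None is the stop sentinel
            pass                                  # a real worker would compile and run the test here

    if __name__ == "__main__":
        q = multiprocessing.Queue()
        for name in ["tests/a.ispc", "tests/b.ispc"]:
            q.put(name)
        state = (False, {"target": "sse4"})
        workers = []
        for _ in range(2):
            q.put(None)                           # one sentinel per worker
            p = multiprocessing.Process(target=worker, args=(q, state))
            p.start()
            workers.append(p)
        for p in workers:
            p.join()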
-def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test_length_arg, counter, mutex): +def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test_length_arg, counter, mutex, glob_var): + # This is needed on windows because windows doen't copy globals from parent process whili multiprocessing + global is_windows + is_windows = glob_var[0] + global options + options = glob_var[1] + global s + s = glob_var[2] + global ispc_exe + ispc_exe = glob_var[3] + global is_generic_target + is_generic_target = glob_var[4] + global run_tests_log + run_tests_log = glob_var[5] + if is_windows: tmpdir = "tmp%d" % os.getpid() os.mkdir(tmpdir) @@ -454,7 +339,256 @@ def sigint(signum, frame): t.terminate() sys.exit(1) -if __name__ == '__main__': + +def file_check(compfails, runfails): + errors = len(compfails) + len(runfails) + new_compfails = [] + new_runfails = [] + new_passes_compfails = [] + new_passes_runfails = [] +# Open file fail_db.txt + f = open(test_states, 'r') + f_lines = f.readlines() + f.close() +# Detect OS + if platform.system() == 'Windows' or 'CYGWIN_NT' in platform.system(): + OS = "Windows" + else: + if platform.system() == 'Darwin': + OS = "Mac" + else: + OS = "Linux" +# Detect opt_set + if options.no_opt == True: + opt = "-O0" + else: + opt = "-O2" +# Detect LLVM version + temp1 = common.detect_version(ispc_exe + " --version") + llvm_version = temp1[-10:-2] +#Detect compiler version + if is_windows == False: + temp1 = common.detect_version(options.compiler_exe + " --version") + temp2 = temp1.split(" ") + compiler_version = temp2[0] + temp2[2][0:4] + else: + compiler_version = "cl" + new_line = " "+options.arch.rjust(6)+" "+options.target.rjust(14)+" "+OS.rjust(7)+" "+llvm_version+" "+compiler_version.rjust(10)+" "+opt+" *\n" + + new_compfails = compfails[:] + new_runfails = runfails[:] + new_f_lines = f_lines[:] + for j in range(0, len(f_lines)): + if (((" "+options.arch+" ") in f_lines[j]) and + ((" "+options.target+" ") in f_lines[j]) and + ((" "+OS+" ") in f_lines[j]) and + ((" "+llvm_version+" ") in f_lines[j]) and + ((" "+compiler_version+" ") in f_lines[j]) and + ((" "+opt+" ") in f_lines[j])): + if (" compfail " in f_lines[j]): + f = 0 + for i in range(0, len(compfails)): + if compfails[i] in f_lines[j]: + new_compfails.remove(compfails[i]) + else: + f = f + 1 + if f == len(compfails): + temp3 = f_lines[j].split(" ") + new_passes_compfails.append(temp3[0]) + if options.update == "FP": + new_f_lines.remove(f_lines[j]) + if (" runfail " in f_lines[j]): + f = 0 + for i in range(0, len(runfails)): + if runfails[i] in f_lines[j]: + new_runfails.remove(runfails[i]) + else: + f = f + 1 + if f == len(runfails): + temp3 = f_lines[j].split(" ") + new_passes_runfails.append(temp3[0]) + if options.update == "FP": + new_f_lines.remove(f_lines[j]) + if len(new_runfails) != 0: + print_debug("NEW RUNFAILS:\n", s, run_tests_log) + for i in range (0,len(new_runfails)): + new_f_lines.append(new_runfails[i] + " runfail " + new_line) + print_debug("\t" + new_runfails[i] + "\n", s, run_tests_log) + if len(new_compfails) != 0: + print_debug("NEW COMPFAILS:\n", s, run_tests_log) + for i in range (0,len(new_compfails)): + new_f_lines.append(new_compfails[i] + " compfail " + new_line) + print_debug("\t" + new_compfails[i] + "\n", s, run_tests_log) + if len(new_passes_runfails) != 0: + print_debug("NEW PASSES after RUNFAILS:\n", s, run_tests_log) + for i in range (0,len(new_passes_runfails)): + print_debug("\t" + new_passes_runfails[i] + "\n", s, run_tests_log) 
+ if len(new_passes_compfails) != 0: + print_debug("NEW PASSES after COMPFAILS:\n", s, run_tests_log) + for i in range (0,len(new_passes_compfails)): + print_debug("\t" + new_passes_compfails[i] + "\n", s, run_tests_log) + + if options.update != "": + output = open(test_states, 'w') + output.writelines(new_f_lines) + output.close() + return [new_runfails, new_compfails, new_passes_runfails, new_passes_compfails, new_line, errors] + +def verify(): + # Open file fail_db.txt + f = open(test_states, 'r') + f_lines = f.readlines() + f.close() + check = [["g++", "clang", "cl"],["-O0", "-O2"],["x86","x86-64"], + ["Linux","Windows","Mac"],["LLVM 3.1","LLVM 3.2","LLVM 3.3","LLVM head"], + ["sse2-i32x4", "sse2-i32x8", "sse4-i32x4", "sse4-i32x8", "sse4-i16x8", + "sse4-i8x16", "avx1-i32x8", "avx1-i32x16", "avx1.1-i32x8", "avx1.1-i32x16", + "avx2-i32x8", "avx2-i32x16", "generic-1", "generic-4", "generic-8", + "generic-16", "generic-32", "generic-64"]] + for i in range (0,len(f_lines)): + if f_lines[i][0] == "%": + continue + for j in range(0,len(check)): + temp = 0 + for t in range(0,len(check[j])): + if " " + check[j][t] + " " in f_lines[i]: + temp = temp + 1 + if temp != 1: + print_debug("error in line " + str(i) + "\n", False, run_tests_log) + break + + +def run_tests(options1, args, print_version): + global options + options = options1 + global s + s = options.silent + + # prepare run_tests_log and test_states files + global run_tests_log + if options.in_file: + run_tests_log = os.getcwd() + os.sep + options.in_file + if print_version == 1: + common.remove_if_exists(run_tests_log) + else: + run_tests_log = "" + global test_states + test_states = "fail_db.txt" + if options.verify: + verify() + return 0 + + # disable fancy error/warning printing with ANSI colors, so grepping for error + # messages doesn't get confused + os.environ["TERM"] = "dumb" + + # This script is affected by http://bugs.python.org/issue5261 on OSX 10.5 Leopard + # git history has a workaround for that issue. + global is_windows + is_windows = (platform.system() == 'Windows' or + 'CYGWIN_NT' in platform.system()) + + if options.target == 'neon': + options.arch = 'arm' + + # use relative path to not depend on host directory, which may possibly + # have white spaces and unicode characters. 
+ global ispc_exe + if not is_windows: + ispc_exe = "./ispc" + else: + ispc_exe = ".\\Release\\ispc.exe" + + # checks the required ispc compiler otherwise prints an error message + if not os.path.exists(ispc_exe): + error("missing ispc compiler: %s\n" % ispc_exe, 1) + ispc_exe += " " + options.ispc_flags + print_debug("ispc compiler: %s\n" % ispc_exe, s, run_tests_log) + + global is_generic_target + is_generic_target = (options.target.find("generic-") != -1 and + options.target != "generic-1" and options.target != "generic-x1") + if is_generic_target and options.include_file == None: + if options.target == "generic-4" or options.target == "generic-x4": + error("No generics #include specified; using examples/intrinsics/sse4.h\n", 2) + options.include_file = "examples/intrinsics/sse4.h" + options.target = "generic-4" + elif options.target == "generic-8" or options.target == "generic-x8": + error("No generics #include specified and no default available for \"generic-8\" target.\n", 1) + options.target = "generic-8" + elif options.target == "generic-16" or options.target == "generic-x16": + error("No generics #include specified; using examples/intrinsics/generic-16.h\n", 2) + options.include_file = "examples/intrinsics/generic-16.h" + options.target = "generic-16" + elif options.target == "generic-32" or options.target == "generic-x32": + error("No generics #include specified; using examples/intrinsics/generic-32.h\n", 2) + options.include_file = "examples/intrinsics/generic-32.h" + options.target = "generic-32" + elif options.target == "generic-64" or options.target == "generic-x64": + error("No generics #include specified; using examples/intrinsics/generic-64.h\n", 2) + options.include_file = "examples/intrinsics/generic-64.h" + options.target = "generic-64" + + if options.compiler_exe == None: + if is_windows: + options.compiler_exe = "cl.exe" + else: + options.compiler_exe = "g++" + + # checks the required compiler otherwise prints an error message + PATH_dir = string.split(os.getenv("PATH"), os.pathsep) + compiler_exists = False + + for counter in PATH_dir: + if os.path.exists(counter + os.sep + options.compiler_exe): + compiler_exists = True + break + + if not compiler_exists: + error("missing the required compiler: %s \n" % options.compiler_exe, 1) + + # print compilers versions + if print_version > 0: + common.print_version(ispc_exe, "", options.compiler_exe, False, run_tests_log, is_windows) + + ispc_root = "." + + # if no specific test files are specified, run all of the tests in tests/, + # failing_tests/, and tests_errors/ + if len(args) == 0: + files = glob.glob(ispc_root + os.sep + "tests" + os.sep + "*ispc") + \ + glob.glob(ispc_root + os.sep + "failing_tests" + os.sep + "*ispc") + \ + glob.glob(ispc_root + os.sep + "tests_errors" + os.sep + "*ispc") + else: + if is_windows: + argfiles = [ ] + for f in args: + # we have to glob ourselves if this is being run under a DOS + # shell, as it passes wildcard as is. + argfiles += glob.glob(f) + else: + argfiles = args + + files = [ ] + for f in argfiles: + if os.path.splitext(string.lower(f))[1] != ".ispc": + error("Ignoring file %s, which doesn't have an .ispc extension.\n" % f, 2) + else: + files += [ f ] + + # max_test_length is used to issue exact number of whitespace characters when + # updating status. Otherwise update causes new lines standard 80 char terminal + # on both Linux and Windows. 
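The comment above is the motivation for max_test_length: the interactive status line is rewritten in place, so each update is padded to a fixed width so that a short test name fully overwrites a longer one already on the terminal line. For illustration only (this is not the patch's code), the trick looks roughly like this:

    import sys

    def show_status(name, done, total, width):
        # pad to a fixed width so a short update fully overwrites a longer one
        line = "[%d/%d] %s" % (done, total, name)
        sys.stdout.write("\r" + line.ljust(width))
        sys.stdout.flush()

    if __name__ == "__main__":
        tests = ["tests/a.ispc", "tests/a-much-longer-name.ispc", "tests/b.ispc"]
        width = max(len(t) for t in tests) + 16
        for i, t in enumerate(tests):
            show_status(t, i + 1, len(tests), width)
        sys.stdout.write("\n")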
+ max_test_length = 0 + for f in files: + max_test_length = max(max_test_length, len(f)) + + # randomly shuffle the tests if asked to do so + if (options.random): + random.seed() + random.shuffle(files) + + # counter total_tests = len(files) compile_error_files = [ ] @@ -463,7 +597,7 @@ if __name__ == '__main__': nthreads = min(multiprocessing.cpu_count(), options.num_jobs) nthreads = min(nthreads, len(files)) - sys.stdout.write("Running %d jobs in parallel. Running %d tests.\n" % (nthreads, total_tests)) + print_debug("Running %d jobs in parallel. Running %d tests.\n" % (nthreads, total_tests), s, run_tests_log) # put each of the test filenames into a queue q = multiprocessing.Queue() @@ -483,8 +617,10 @@ if __name__ == '__main__': start_time = time.time() # launch jobs to run tests + glob_var = [is_windows, options, s, ispc_exe, is_generic_target, run_tests_log] for x in range(nthreads): - t = multiprocessing.Process(target=run_tasks_from_queue, args=(q, qret, qskip, total_tests, max_test_length, finished_tests_counter, finished_tests_counter_lock)) + t = multiprocessing.Process(target=run_tasks_from_queue, args=(q, qret, qskip, total_tests, + max_test_length, finished_tests_counter, finished_tests_counter_lock, glob_var)) task_threads.append(t) t.start() @@ -493,35 +629,97 @@ if __name__ == '__main__': for t in task_threads: t.join() if options.non_interactive == False: - sys.stdout.write("\n") + print_debug("\n", s, run_tests_log) elapsed_time = time.time() - start_time while not qret.empty(): - (c, r, s) = qret.get() + (c, r, skip) = qret.get() compile_error_files += c run_error_files += r - skip_files += s + skip_files += skip if options.non_interactive: - sys.stdout.write(" Done %d / %d\n" % (finished_tests_counter.value, total_tests)) + print_debug(" Done %d / %d\n" % (finished_tests_counter.value, total_tests), s, run_tests_log) if len(skip_files) > 0: skip_files.sort() - sys.stdout.write("%d / %d tests SKIPPED:\n" % (len(skip_files), total_tests)) + print_debug("%d / %d tests SKIPPED:\n" % (len(skip_files), total_tests), s, run_tests_log) for f in skip_files: - sys.stdout.write("\t%s\n" % f) + print_debug("\t%s\n" % f, s, run_tests_log) if len(compile_error_files) > 0: compile_error_files.sort() - sys.stdout.write("%d / %d tests FAILED compilation:\n" % (len(compile_error_files), total_tests)) + print_debug("%d / %d tests FAILED compilation:\n" % (len(compile_error_files), total_tests), s, run_tests_log) for f in compile_error_files: - sys.stdout.write("\t%s\n" % f) + print_debug("\t%s\n" % f, s, run_tests_log) if len(run_error_files) > 0: run_error_files.sort() - sys.stdout.write("%d / %d tests FAILED execution:\n" % (len(run_error_files), total_tests)) + print_debug("%d / %d tests FAILED execution:\n" % (len(run_error_files), total_tests), s, run_tests_log) for f in run_error_files: - sys.stdout.write("\t%s\n" % f) + print_debug("\t%s\n" % f, s, run_tests_log) + + R = file_check(compile_error_files, run_error_files) if options.time: - sys.stdout.write("Elapsed time: %d s\n" % elapsed_time) + print_debug("Elapsed time: %d s\n" % elapsed_time, s, run_tests_log) - sys.exit(len(compile_error_files) + len(run_error_files)) + return R + + +from optparse import OptionParser +import multiprocessing +from ctypes import c_int +import os +import sys +import glob +import re +import signal +import random +import string +import subprocess +import shlex +import platform +import tempfile +import os.path +import time +# our functions +import common +print_debug = common.print_debug +error = 
common.error + +if __name__ == "__main__": + parser = OptionParser() + parser.add_option("-r", "--random-shuffle", dest="random", help="Randomly order tests", + default=False, action="store_true") + parser.add_option("-g", "--generics-include", dest="include_file", help="Filename for header implementing functions for generics", + default=None) + parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", + default="") + parser.add_option('-t', '--target', dest='target', + help='Set compilation target (sse2-i32x4, sse2-i32x8, sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, avx1-i32x8, avx1-i32x16, avx1.1-i32x8, avx1.1-i32x16, avx2-i32x8, avx2-i32x16, generic-x1, generic-x4, generic-x8, generic-x16, generic-x32, generic-x64)', + default="sse4") + parser.add_option('-a', '--arch', dest='arch', + help='Set architecture (arm, x86, x86-64)', + default="x86-64") + parser.add_option("-c", "--compiler", dest="compiler_exe", help="C/C++ compiler binary to use to run tests", + default=None) + parser.add_option('-o', '--no-opt', dest='no_opt', help='Disable optimization', + default=False, action="store_true") + parser.add_option('-j', '--jobs', dest='num_jobs', help='Maximum number of jobs to run in parallel', + default="1024", type="int") + parser.add_option('-v', '--verbose', dest='verbose', help='Enable verbose output', + default=False, action="store_true") + parser.add_option('--wrap-exe', dest='wrapexe', + help='Executable to wrap test runs with (e.g. "valgrind")', + default="") + parser.add_option('--time', dest='time', help='Enable time output', + default=False, action="store_true") + parser.add_option('--non-interactive', dest='non_interactive', help='Disable interactive status updates', + default=False, action="store_true") + parser.add_option('-u', "--update", dest='update', help='Update file with fails (F of FP)', default="") + parser.add_option('-s', "--silent", dest='silent', help='enable silent mode without any output', default=False, + action = "store_true") + parser.add_option("--file", dest='in_file', help='file to save run_tests output', default="") + parser.add_option("--verify", dest='verify', help='verify the file fail_db.txt', default=False, action="store_true") + (options, args) = parser.parse_args() + L = run_tests(options, args, 1) + exit(0) From f45f6cb32a390d834e53037751365cd1932929e3 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 17 Sep 2013 23:36:16 +0400 Subject: [PATCH 083/124] Test, documentation and vim support for double precision constants --- contrib/ispc.vim | 5 +++++ docs/ispc.rst | 11 ++++++++++- tests/double-consts.ispc | 23 +++++++++++++++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 tests/double-consts.ispc diff --git a/contrib/ispc.vim b/contrib/ispc.vim index cc8493f0..4d870dcd 100644 --- a/contrib/ispc.vim +++ b/contrib/ispc.vim @@ -19,6 +19,11 @@ syn keyword ispcRepeat cdo cfor cwhile syn keyword ispcBuiltin programCount programIndex syn keyword ispcType export uniform varying int8 int16 int32 int64 +"double precision floating point number, with dot, optional exponent +syn match cFloat display contained "\d\+\.\d*d[-+]\=\d*\>" +"double precision floating point number, without dot, with exponent +syn match cFloat display contained "\d\+d[-+]\=\d\+\>" + " Default highlighting command -nargs=+ HiLink hi def link HiLink ispcStatement Statement diff --git a/docs/ispc.rst b/docs/ispc.rst index ff07f6d8..224faaa9 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -270,6 
+270,14 @@ new reserved words: ``unmasked``, ``foreach_unique``, ``foreach_active``, and
 ``in``. Any program that happens to have a variable or function with one of
 these names must be modified to rename that symbol.
 
+Updating ISPC Programs For Changes In ISPC 1.4.5
+------------------------------------------------
+
+This release adds support for double precision floating point constants.
+Double precision floating point constants are floating point numbers with
+a ``d`` suffix and an optional exponent part. Here are some examples: 3.14d,
+31.4d-1, 1.d, 1.0d, 1d-2. Note that a floating point number without a suffix
+is treated as a single precision constant.
 
 Getting Started with ISPC
 =========================
@@ -1349,7 +1357,8 @@ but are likely to be supported in future releases:
 
 * Bitfield members of ``struct`` types
 * Variable numbers of arguments to functions
 * Literal floating-point constants (even without a ``f`` suffix) are
-  currently treated as being ``float`` type, not ``double``
+  currently treated as being ``float`` type, not ``double``. To have a double
+  precision floating point constant, use the ``d`` suffix.
 * The ``volatile`` qualifier
 * The ``register`` storage class for variables. (Will be ignored).
diff --git a/tests/double-consts.ispc b/tests/double-consts.ispc
new file mode 100644
index 00000000..3259156a
--- /dev/null
+++ b/tests/double-consts.ispc
@@ -0,0 +1,23 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    double a = aFOO[programIndex];
+    // Test parsing of double constants.
+    double d1 = 1.0d40;
+    double d2 = 1.d40;
+    double d3 = 1d40;
+    double d4 = 10000000000000000000000000000000000000000.d;
+    double d5 = 10000000000000000000000000000000000000000.0d;
+
+    // All the constants should be equal and if it's evaluated as "float",
+    // then sqrt will evaluate to +inf.
+ if (d1 == d2 && d1 == d3 && d1 == d4 && d1 == d5 && + ((float)sqrt(d1)) < 2e20) { + RET[programIndex] = a; + } +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} From 1c527ae34cf7c257f8deaf0261af447b238cab56 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Wed, 18 Sep 2013 11:48:24 +0400 Subject: [PATCH 084/124] Adding tests and vim support for double constant of the form .1d41 --- contrib/ispc.vim | 2 ++ tests/double-consts.ispc | 7 ++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/contrib/ispc.vim b/contrib/ispc.vim index 4d870dcd..f3cb413b 100644 --- a/contrib/ispc.vim +++ b/contrib/ispc.vim @@ -21,6 +21,8 @@ syn keyword ispcType export uniform varying int8 int16 int32 int64 "double precision floating point number, with dot, optional exponent syn match cFloat display contained "\d\+\.\d*d[-+]\=\d*\>" +"double precision floating point number, starting with dot, optional exponent +syn match cFloat display contained ".\d*d[-+]\=\d*\>" "double precision floating point number, without dot, with exponent syn match cFloat display contained "\d\+d[-+]\=\d\+\>" diff --git a/tests/double-consts.ispc b/tests/double-consts.ispc index 3259156a..4096aa1c 100644 --- a/tests/double-consts.ispc +++ b/tests/double-consts.ispc @@ -7,12 +7,13 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { double d1 = 1.0d40; double d2 = 1.d40; double d3 = 1d40; - double d4 = 10000000000000000000000000000000000000000.d; - double d5 = 10000000000000000000000000000000000000000.0d; + double d4 = .1d41; + double d5 = 10000000000000000000000000000000000000000.d; + double d6 = 10000000000000000000000000000000000000000.0d; // All the constants should be equal and if it's evaluated as "float", // then sqrt will evaluate to +inf. 
- if (d1 == d2 && d1 == d3 && d1 == d4 && d1 == d5 && + if (d1 == d2 && d1 == d3 && d1 == d4 && d1 == d5 && d1 == d6 ((float)sqrt(d1)) < 2e20) { RET[programIndex] = a; } From bb8f7d4e3f2a226a8f4b7b7ae2de6fce7d609791 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Thu, 19 Sep 2013 14:37:26 +0400 Subject: [PATCH 085/124] removing LLVM 3.1 and 3.2 from default testing --- alloy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alloy.py b/alloy.py index 67f534ca..06025324 100755 --- a/alloy.py +++ b/alloy.py @@ -367,7 +367,7 @@ def validation_run(only, only_targets, reference_branch, notify, update): if len(archs) == 0: archs = ["x86", "x86-64"] if len(LLVM) == 0: - LLVM = ["3.1", "3.2", "3.3", "head"] + LLVM = ["3.3", "head"] gen_archs = ["x86-64"] need_LLVM = check_LLVM(LLVM) for i in range(0,len(need_LLVM)): From 6a21218c13aa14666d11150c265f542afd79818e Mon Sep 17 00:00:00 2001 From: evghenii Date: Thu, 19 Sep 2013 13:45:31 +0300 Subject: [PATCH 086/124] fix warrning and add KNC 1 --- examples/intrinsics/knc-i1x16.h | 4 ++-- examples/intrinsics/knc-i1x8.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index b7d3a7f1..c535e61a 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -45,13 +45,13 @@ #define roundf(x) (floorf(x + .5f)) #define round(x) (floor(x + .5)) #else -#define FORCEINLINE __attribute__((always_inline)) +#define FORCEINLINE __forceinline #define PRE_ALIGN(x) #define POST_ALIGN(x) __attribute__ ((aligned(x))) #endif -#if 0 #define KNC 1 +#if 0 extern "C" { int printf(const unsigned char *, ...); diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h index de9bddcc..573d232c 100644 --- a/examples/intrinsics/knc-i1x8.h +++ b/examples/intrinsics/knc-i1x8.h @@ -50,13 +50,13 @@ #define roundf(x) (floorf(x + .5f)) #define round(x) (floor(x + .5)) #else -#define FORCEINLINE __attribute__((always_inline)) +#define FORCEINLINE __forceinline #define PRE_ALIGN(x) #define POST_ALIGN(x) __attribute__ ((aligned(x))) #endif -#if 0 #define KNC 1 +#if 0 extern "C" { int printf(const unsigned char *, ...); From 43245bbc118c1b415c9c538c98555fc110ad1f3c Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Wed, 18 Sep 2013 14:24:46 +0400 Subject: [PATCH 087/124] Adding check for OS AVX support to auto-dispatch code --- builtins/dispatch.ll | 81 +++++++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 32 deletions(-) diff --git a/builtins/dispatch.ll b/builtins/dispatch.ll index f1d5a969..ba216df7 100644 --- a/builtins/dispatch.ll +++ b/builtins/dispatch.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2011, Intel Corporation +;; Copyright (c) 2011-2013, Intel Corporation ;; All rights reserved. ;; ;; Redistribution and use in source and binary forms, with or without @@ -41,15 +41,13 @@ @__system_best_isa = internal global i32 -1 -declare void @abort() noreturn - ;; The below is the result of running "clang -O2 -emit-llvm -c -o -" on the ;; following code... Specifically, __get_system_isa should return a value ;; corresponding to one of the Target::ISA enumerant values that gives the ;; most capable ISA that the curremt system can run. 
;; -;; Note: clang from LLVM 3.0 should be used if this is updated, for maximum -;; backwards compatibility for anyone building ispc with LLVM 3.0 +;; Note: clang from LLVM 3.1 should be used if this is updated, for maximum +;; backwards compatibility for anyone building ispc with LLVM 3.1 ;; ;; #include ;; #include @@ -60,7 +58,7 @@ declare void @abort() noreturn ;; : "0" (infoType)); ;; } ;; -;; /* Save %ebx in case it's the PIC register */ +;; // Save %ebx in case it's the PIC register. ;; static void __cpuid_count(int info[4], int level, int count) { ;; __asm__ __volatile__ ("xchg{l}\t{%%}ebx, %1\n\t" ;; "cpuid\n\t" @@ -69,13 +67,23 @@ declare void @abort() noreturn ;; : "0" (level), "2" (count)); ;; } ;; +;; static int __os_has_avx_support() { +;; // Check xgetbv; this uses a .byte sequence instead of the instruction +;; // directly because older assemblers do not include support for xgetbv and +;; // there is no easy way to conditionally compile based on the assembler used. +;; int rEAX, rEDX; +;; __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0)); +;; return (rEAX & 6) == 6; +;; } +;; ;; int32_t __get_system_isa() { ;; int info[4]; ;; __cpuid(info, 1); ;; -;; /* NOTE: the values returned below must be the same as the -;; corresponding enumerant values in Target::ISA. */ -;; if ((info[2] & (1 << 28)) != 0) { +;; // NOTE: the values returned below must be the same as the +;; // corresponding enumerant values in Target::ISA. +;; if ((info[2] & (1 << 28)) != 0 && +;; __os_has_avx_support()) { ;; if ((info[2] & (1 << 29)) != 0 && // F16C ;; (info[2] & (1 << 30)) != 0) { // RDRAND ;; // So far, so good. AVX2? @@ -98,47 +106,56 @@ declare void @abort() noreturn ;; abort(); ;; } -define i32 @__get_system_isa() nounwind uwtable ssp { +define i32 @__get_system_isa() nounwind uwtable { entry: %0 = tail call { i32, i32, i32, i32 } asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind %asmresult5.i = extractvalue { i32, i32, i32, i32 } %0, 2 %asmresult6.i = extractvalue { i32, i32, i32, i32 } %0, 3 %and = and i32 %asmresult5.i, 268435456 %cmp = icmp eq i32 %and, 0 - br i1 %cmp, label %if.else13, label %if.then + br i1 %cmp, label %if.else14, label %land.lhs.true -if.then: ; preds = %entry - %1 = and i32 %asmresult5.i, 1610612736 - %2 = icmp eq i32 %1, 1610612736 - br i1 %2, label %if.then7, label %return +land.lhs.true: ; preds = %entry + %1 = tail call { i32, i32 } asm sideeffect ".byte 0x0f, 0x01, 0xd0", "={ax},={dx},{cx},~{dirflag},~{fpsr},~{flags}"(i32 0) nounwind + %asmresult.i25 = extractvalue { i32, i32 } %1, 0 + %and.i = and i32 %asmresult.i25, 6 + %cmp.i = icmp eq i32 %and.i, 6 + br i1 %cmp.i, label %if.then, label %if.else14 -if.then7: ; preds = %if.then - %3 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind - %asmresult4.i28 = extractvalue { i32, i32, i32, i32 } %3, 1 - %and10 = lshr i32 %asmresult4.i28, 5 - %4 = and i32 %and10, 1 - %5 = add i32 %4, 3 +if.then: ; preds = %land.lhs.true + %2 = and i32 %asmresult5.i, 1610612736 + %3 = icmp eq i32 %2, 1610612736 + br i1 %3, label %if.then8, label %return + +if.then8: ; preds = %if.then + %4 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind + %asmresult4.i30 = 
extractvalue { i32, i32, i32, i32 } %4, 1 + %and11 = lshr i32 %asmresult4.i30, 5 + %5 = and i32 %and11, 1 + %6 = add i32 %5, 3 br label %return -if.else13: ; preds = %entry - %and15 = and i32 %asmresult5.i, 524288 - %cmp16 = icmp eq i32 %and15, 0 - br i1 %cmp16, label %if.else18, label %return +if.else14: ; preds = %land.lhs.true, %entry + %and16 = and i32 %asmresult5.i, 524288 + %cmp17 = icmp eq i32 %and16, 0 + br i1 %cmp17, label %if.else19, label %return -if.else18: ; preds = %if.else13 - %and20 = and i32 %asmresult6.i, 67108864 - %cmp21 = icmp eq i32 %and20, 0 - br i1 %cmp21, label %if.else23, label %return +if.else19: ; preds = %if.else14 + %and21 = and i32 %asmresult6.i, 67108864 + %cmp22 = icmp eq i32 %and21, 0 + br i1 %cmp22, label %if.else24, label %return -if.else23: ; preds = %if.else18 +if.else24: ; preds = %if.else19 tail call void @abort() noreturn nounwind unreachable -return: ; preds = %if.else18, %if.else13, %if.then7, %if.then - %retval.0 = phi i32 [ %5, %if.then7 ], [ 2, %if.then ], [ 1, %if.else13 ], [ 0, %if.else18 ] +return: ; preds = %if.else19, %if.else14, %if.then8, %if.then + %retval.0 = phi i32 [ %6, %if.then8 ], [ 2, %if.then ], [ 1, %if.else14 ], [ 0, %if.else19 ] ret i32 %retval.0 } +declare void @abort() noreturn nounwind + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; This function is called by each of the dispatch functions we generate; ;; it sets @__system_best_isa if it is unset. From dbef4fd7d7d270e350f8af26f76846ba24deb8a0 Mon Sep 17 00:00:00 2001 From: evghenii Date: Thu, 19 Sep 2013 14:52:22 +0300 Subject: [PATCH 088/124] fixed notation --- examples/intrinsics/knc-i1x8.h | 17 ++++++----------- examples/intrinsics/knc-i1x8unsafe_fast.h | 2 +- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h index 573d232c..c17b7238 100644 --- a/examples/intrinsics/knc-i1x8.h +++ b/examples/intrinsics/knc-i1x8.h @@ -38,11 +38,6 @@ #include #include -#if 0 -#define __ZMM32BIT__ -#endif - - #ifdef _MSC_VER #define FORCEINLINE __forceinline #define PRE_ALIGN(x) /*__declspec(align(x))*/ @@ -110,7 +105,7 @@ struct vec8 { /****************/ -#ifndef __ZMM32BIT__ +#ifndef __ZMM64BIT__ struct PRE_ALIGN(32) __vec8_i32 : public vec8 { __vec8_i32() { } FORCEINLINE __vec8_i32(int32_t v0, int32_t v1, int32_t v2, int32_t v3, @@ -135,7 +130,7 @@ struct PRE_ALIGN(32) __vec8_i32 : public vec8 { data[7],data[6],data[5],data[4],data[3],data[2],data[1],data[0]); } } POST_ALIGN(32); -#else /* __ZMM32BIT__ */ +#else /* __ZMM64BIT__ */ struct PRE_ALIGN(32) __vec8_i32 { __m512i v; @@ -150,9 +145,9 @@ struct PRE_ALIGN(32) __vec8_i32 FORCEINLINE const int32_t& operator[](const int i) const { return ((int32_t*)this)[i]; } FORCEINLINE int32_t& operator[](const int i) { return ((int32_t*)this)[i]; } } POST_ALIGN(32); -#endif /* __ZMM32BIT__ */ +#endif /* __ZMM64BIT__ */ -#ifndef __ZMM32BIT__ /* __ZMM32BIT__ */ +#ifndef __ZMM64BIT__ /* __ZMM64BIT__ */ PRE_ALIGN(32) struct __vec8_f : public vec8 { __vec8_f() { } FORCEINLINE __vec8_f(float v0, float v1, float v2, float v3, @@ -177,7 +172,7 @@ PRE_ALIGN(32) struct __vec8_f : public vec8 { data[7] = val.s[7]; } } POST_ALIGN(32); -#else /* __ZMM32BIT__ */ +#else /* __ZMM64BIT__ */ PRE_ALIGN(32) struct __vec8_f { __m512 v; @@ -192,7 +187,7 @@ PRE_ALIGN(32) struct __vec8_f FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; } FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } } POST_ALIGN(32); 
-#endif /* __ZMM32BIT__ */ +#endif /* __ZMM64BIT__ */ struct PRE_ALIGN(64) __vec8_d { diff --git a/examples/intrinsics/knc-i1x8unsafe_fast.h b/examples/intrinsics/knc-i1x8unsafe_fast.h index ce66ea11..2e00a567 100644 --- a/examples/intrinsics/knc-i1x8unsafe_fast.h +++ b/examples/intrinsics/knc-i1x8unsafe_fast.h @@ -1,4 +1,4 @@ -#define __ZMM32BIT__ +#define __ZMM64BIT__ #include "knc-i1x8.h" /* the following tests fails because vec8_i32 and vec8_float are 512 and not 256 bit in size. From 0c274212c2104a4547018fd3be31f33e153b82d3 Mon Sep 17 00:00:00 2001 From: evghenii Date: Thu, 19 Sep 2013 16:07:22 +0300 Subject: [PATCH 089/124] performance tuning for knc-i1x8.h. this gives goed enough performance for double only. float performance is terrible --- examples/intrinsics/knc-i1x8.h | 167 +++++++++++++-------------------- 1 file changed, 64 insertions(+), 103 deletions(-) diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h index c17b7238..d7696117 100644 --- a/examples/intrinsics/knc-i1x8.h +++ b/examples/intrinsics/knc-i1x8.h @@ -73,9 +73,9 @@ typedef int64_t __vec1_i64; struct __vec8_i1 { __vec8_i1() { } - __vec8_i1(const __mmask16 &vv) : v(vv) { } + __vec8_i1(const __mmask8 &vv) : v(vv) { } __vec8_i1(bool v0, bool v1, bool v2, bool v3, - bool v4, bool v5, bool v6, bool v7) { + bool v4, bool v5, bool v6, bool v7) { v = ((v0 & 1) | ((v1 & 1) << 1) | ((v2 & 1) << 2) | @@ -87,7 +87,7 @@ struct __vec8_i1 { } __mmask8 v; - FORCEINLINE operator __mmask8() const { return v; } + FORCEINLINE operator __mmask8() const { return v; }//0xFF & v; } }; @@ -105,89 +105,66 @@ struct vec8 { /****************/ -#ifndef __ZMM64BIT__ -struct PRE_ALIGN(32) __vec8_i32 : public vec8 { +struct PRE_ALIGN(32) __vec8_i32 +{ +#ifdef __ZMM64BIT__ + __m512i _data; + FORCEINLINE __vec8_i32(const __m512i &in) : _data(in) {} + FORCEINLINE operator __m512i() const { return _data; } +#else /* __ZMM64BIT__ */ + typedef int32_t _v8si __attribute__((vector_size(32))); + _v8si _data; + FORCEINLINE __vec8_i32(const __m512i &in) + { + _mm512_mask_extpackstorelo_epi32((__m512i*)&_data, 0xFF, in, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + } + FORCEINLINE operator __m512i() const + { + return _mm512_extloadunpacklo_epi32(_mm512_setzero_epi32(), (uint8_t*)&_data, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + } +#endif /* __ZMM64BIT__ */ + __vec8_i32() { } FORCEINLINE __vec8_i32(int32_t v0, int32_t v1, int32_t v2, int32_t v3, - int32_t v4, int32_t v5, int32_t v6, int32_t v7) - : vec8(v0, v1, v2, v3, v4, v5, v6, v7) { } - FORCEINLINE __vec8_i32(__m512i v) + int32_t v4, int32_t v5, int32_t v6, int32_t v7) { - union { __m512i v; int32_t s[8]; } val = {v}; - data[0] = val.s[0]; - data[1] = val.s[1]; - data[2] = val.s[2]; - data[3] = val.s[3]; - data[4] = val.s[4]; - data[5] = val.s[5]; - data[6] = val.s[6]; - data[7] = val.s[7]; + const __m512i v = _mm512_set_16to16_pi(0,0,0,0,0,0,0,0, v7, v6, v5, v4, v3, v2, v1, v0); + *this = __vec8_i32(v); } - FORCEINLINE operator __m512i() const - { - return _mm512_set_16to16_pi( - 0,0,0,0, 0,0,0,0, - data[7],data[6],data[5],data[4],data[3],data[2],data[1],data[0]); - } -} POST_ALIGN(32); -#else /* __ZMM64BIT__ */ -struct PRE_ALIGN(32) __vec8_i32 -{ - __m512i v; - FORCEINLINE operator __m512i() const { return v; } - FORCEINLINE __vec8_i32() : v(_mm512_undefined_epi32()) {} - FORCEINLINE __vec8_i32(const __m512i &in) : v(in) {} - FORCEINLINE __vec8_i32(const __vec8_i32 &o) : v(o.v) {} - FORCEINLINE __vec8_i32& operator =(const __vec8_i32 &o) { v=o.v; return *this; } - FORCEINLINE 
__vec8_i32(int32_t v00, int32_t v01, int32_t v02, int32_t v03, - int32_t v04, int32_t v05, int32_t v06, int32_t v07) : - v ( _mm512_set_16to16_pi(0,0,0,0,0,0,0,0, v07, v06, v05, v04, v03, v02, v01, v00) ) {} - FORCEINLINE const int32_t& operator[](const int i) const { return ((int32_t*)this)[i]; } - FORCEINLINE int32_t& operator[](const int i) { return ((int32_t*)this)[i]; } -} POST_ALIGN(32); -#endif /* __ZMM64BIT__ */ -#ifndef __ZMM64BIT__ /* __ZMM64BIT__ */ -PRE_ALIGN(32) struct __vec8_f : public vec8 { - __vec8_f() { } - FORCEINLINE __vec8_f(float v0, float v1, float v2, float v3, - float v4, float v5, float v6, float v7) - : vec8(v0, v1, v2, v3, v4, v5, v6, v7) { } - FORCEINLINE operator __m512() const - { - return _mm512_set_16to16_ps( - 0,0,0,0,0,0,0,0, - data[7],data[6],data[5],data[4],data[3],data[2],data[1],data[0]); - } - FORCEINLINE __vec8_f(__m512 v) - { - union { __m512 v; float s[8]; } val = {v}; - data[0] = val.s[0]; - data[1] = val.s[1]; - data[2] = val.s[2]; - data[3] = val.s[3]; - data[4] = val.s[4]; - data[5] = val.s[5]; - data[6] = val.s[6]; - data[7] = val.s[7]; - } + FORCEINLINE const int32_t& operator[](const int i) const { return ((int32_t*)this)[i]; } + FORCEINLINE int32_t& operator[](const int i) { return ((int32_t*)this)[i]; } } POST_ALIGN(32); -#else /* __ZMM64BIT__ */ + PRE_ALIGN(32) struct __vec8_f { - __m512 v; - FORCEINLINE operator __m512() const { return v; } - FORCEINLINE __vec8_f() : v(_mm512_undefined_ps()) { } - FORCEINLINE __vec8_f(const __m512 &in) : v(in) {} - FORCEINLINE __vec8_f(const __vec8_f &o) : v(o.v) {} - FORCEINLINE __vec8_f& operator =(const __vec8_f &o) { v=o.v; return *this; } - FORCEINLINE __vec8_f(float v00, float v01, float v02, float v03, - float v04, float v05, float v06, float v07) : - v ( _mm512_set_16to16_ps(0,0,0,0,0,0,0,0, v07, v06, v05, v04, v03, v02, v01, v00) ) {} - FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; } - FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } -} POST_ALIGN(32); +#ifdef __ZMM64BIT__ + __m512 _data; + FORCEINLINE __vec8_f(const __m512 &in) : _data(in) {} + FORCEINLINE operator __m512() const { return _data; } +#else /* __ZMM64BIT__ */ + typedef float _v8sf __attribute__((vector_size(32))); + _v8sf _data; + FORCEINLINE __vec8_f(const __m512 &in) + { + _mm512_mask_extpackstorelo_ps((__m512*)&_data, 0xFF, in, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + } + FORCEINLINE operator __m512() const + { + return _mm512_extloadunpacklo_ps(_mm512_setzero_ps(), (uint8_t*)&_data, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + } #endif /* __ZMM64BIT__ */ + FORCEINLINE __vec8_f() { } + FORCEINLINE __vec8_f(float v0, float v1, float v2, float v3, + float v4, float v5, float v6, float v7) + { + const __m512 v = _mm512_set_16to16_ps(0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, v7, v6, v5, v4, v3, v2, v1, v0); + *this = __vec8_f(v); + } + + FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; } + FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } +} POST_ALIGN(32); struct PRE_ALIGN(64) __vec8_d { @@ -438,8 +415,8 @@ INSERT_EXTRACT(__vec1_d, double) /////////////////////////////////////////////////////////////////////////// // mask ops -static FORCEINLINE uint64_t __movmsk(__vec8_i1 mask) { - return (uint64_t)mask.v; +static FORCEINLINE __vec8_i1 __movmsk(__vec8_i1 mask) { + return mask.v; } static FORCEINLINE bool __any(__vec8_i1 mask) { @@ -455,52 +432,36 @@ static FORCEINLINE bool __none(__vec8_i1 mask) { } static FORCEINLINE __vec8_i1 
__equal_i1(__vec8_i1 a, __vec8_i1 b) { - __vec8_i1 r; - r.v = (a.v & b.v) | (~a.v & ~b.v); - return r; + return (a.v & b.v) | (~a.v & ~b.v); } static FORCEINLINE __vec8_i1 __and(__vec8_i1 a, __vec8_i1 b) { - __vec8_i1 r; - r.v = a.v & b.v; - return r; + return a.v & b.v; } static FORCEINLINE __vec8_i1 __xor(__vec8_i1 a, __vec8_i1 b) { - __vec8_i1 r; - r.v = a.v ^ b.v; - return r; + return a.v ^ b.v; } static FORCEINLINE __vec8_i1 __or(__vec8_i1 a, __vec8_i1 b) { - __vec8_i1 r; - r.v = a.v | b.v; - return r; + return a.v | b.v; } static FORCEINLINE __vec8_i1 __not(__vec8_i1 v) { - __vec8_i1 r; - r.v = ~v.v; - return r; + return ~v; } static FORCEINLINE __vec8_i1 __and_not1(__vec8_i1 a, __vec8_i1 b) { - __vec8_i1 r; - r.v = ~a.v & b.v; - return r; + return ~a.v & b.v; } static FORCEINLINE __vec8_i1 __and_not2(__vec8_i1 a, __vec8_i1 b) { - __vec8_i1 r; - r.v = a.v & ~b.v; - return r; + return a.v & ~b.v; } static FORCEINLINE __vec8_i1 __select(__vec8_i1 mask, __vec8_i1 a, __vec8_i1 b) { - __vec8_i1 r; - r.v = (a.v & mask.v) | (b.v & ~mask.v); - return r; + return (a.v & mask.v) | (b.v & ~mask.v); } static FORCEINLINE __vec8_i1 __select(bool cond, __vec8_i1 a, __vec8_i1 b) { From b2678b43388914e4eb94a9cd5845bfea16ae0e3e Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 19 Sep 2013 17:27:58 +0400 Subject: [PATCH 090/124] Typo fix is tests/double-consts.ispc --- tests/double-consts.ispc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/double-consts.ispc b/tests/double-consts.ispc index 4096aa1c..5f9a66d5 100644 --- a/tests/double-consts.ispc +++ b/tests/double-consts.ispc @@ -13,7 +13,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { // All the constants should be equal and if it's evaluated as "float", // then sqrt will evaluate to +inf. - if (d1 == d2 && d1 == d3 && d1 == d4 && d1 == d5 && d1 == d6 + if (d1 == d2 && d1 == d3 && d1 == d4 && d1 == d5 && d1 == d6 && ((float)sqrt(d1)) < 2e20) { RET[programIndex] = a; } From 0ed89e93fa309796867c0e8729c16dac0c27bbb8 Mon Sep 17 00:00:00 2001 From: evghenii Date: Thu, 19 Sep 2013 16:34:06 +0300 Subject: [PATCH 091/124] added fails info --- examples/intrinsics/knc-i1x8unsafe_fast.h | 103 +++++++++++++--------- 1 file changed, 60 insertions(+), 43 deletions(-) diff --git a/examples/intrinsics/knc-i1x8unsafe_fast.h b/examples/intrinsics/knc-i1x8unsafe_fast.h index 2e00a567..05be27bd 100644 --- a/examples/intrinsics/knc-i1x8unsafe_fast.h +++ b/examples/intrinsics/knc-i1x8unsafe_fast.h @@ -1,61 +1,78 @@ #define __ZMM64BIT__ #include "knc-i1x8.h" -/* the following tests fails because vec8_i32 and vec8_float are 512 and not 256 bit in size. - * not sure how it is possible to fix this, any suggestions? +/* the following tests fails because on KNC native vec8_i32 and vec8_float are 512 and not 256 bit in size. 
+ * + * Using test compiler: Intel(r) SPMD Program Compiler (ispc), 1.4.5dev (build commit d68dbbc7bce74803 @ 20130919, LLVM 3.3) + * Using C/C++ compiler: icpc (ICC) 14.0.0 20130728 + * + */ + +/* knc-i1x8unsafe_fast.h fails: + * ---------------------------- +1 / 1206 tests FAILED compilation: + ./tests/ptr-assign-lhs-math-1.ispc 33 / 1206 tests FAILED execution: - ./tests/array-gather-simple.ispc - ./tests/array-gather-vary.ispc - ./tests/array-multidim-gather-scatter.ispc - ./tests/array-scatter-vary.ispc - ./tests/atomics-5.ispc - ./tests/atomics-swap.ispc - ./tests/cfor-array-gather-vary.ispc - ./tests/cfor-gs-improve-varying-1.ispc - ./tests/cfor-struct-gather-2.ispc - ./tests/cfor-struct-gather-3.ispc - ./tests/cfor-struct-gather.ispc - ./tests/gather-struct-vector.ispc - ./tests/global-array-4.ispc - ./tests/gs-improve-varying-1.ispc - ./tests/half-1.ispc - ./tests/half-3.ispc - ./tests/half.ispc - ./tests/launch-3.ispc - ./tests/launch-4.ispc - ./tests/masked-scatter-vector.ispc - ./tests/masked-struct-scatter-varying.ispc - ./tests/new-delete-6.ispc - ./tests/ptr-24.ispc - ./tests/ptr-25.ispc - ./tests/short-vec-15.ispc - ./tests/struct-gather-2.ispc - ./tests/struct-gather-3.ispc - ./tests/struct-gather.ispc - ./tests/struct-ref-lvalue.ispc - ./tests/struct-test-118.ispc - ./tests/struct-vary-index-expr.ispc - ./tests/typedef-2.ispc - ./tests/vector-varying-scatter.ispc + ./tests/array-gather-simple.ispc + ./tests/array-gather-vary.ispc + ./tests/array-multidim-gather-scatter.ispc + ./tests/array-scatter-vary.ispc + ./tests/atomics-5.ispc + ./tests/atomics-swap.ispc + ./tests/cfor-array-gather-vary.ispc + ./tests/cfor-gs-improve-varying-1.ispc + ./tests/cfor-struct-gather-2.ispc + ./tests/cfor-struct-gather-3.ispc + ./tests/cfor-struct-gather.ispc + ./tests/gather-struct-vector.ispc + ./tests/global-array-4.ispc + ./tests/gs-improve-varying-1.ispc + ./tests/half-1.ispc + ./tests/half-3.ispc + ./tests/half.ispc + ./tests/launch-3.ispc + ./tests/launch-4.ispc + ./tests/masked-scatter-vector.ispc + ./tests/masked-struct-scatter-varying.ispc + ./tests/new-delete-6.ispc + ./tests/ptr-24.ispc + ./tests/ptr-25.ispc + ./tests/short-vec-15.ispc + ./tests/struct-gather-2.ispc + ./tests/struct-gather-3.ispc + ./tests/struct-gather.ispc + ./tests/struct-ref-lvalue.ispc + ./tests/struct-test-118.ispc + ./tests/struct-vary-index-expr.ispc + ./tests/typedef-2.ispc + ./tests/vector-varying-scatter.ispc */ -/* knc-i1x8.h has the following fails: +/* knc-i1x8.h fails: + * ---------------------------- +1 / 1206 tests FAILED compilation: + ./tests/ptr-assign-lhs-math-1.ispc 3 / 1206 tests FAILED execution: - ./tests/half-1.ispc - ./tests/half-3.ispc - ./tests/half.ispc + ./tests/half-1.ispc + ./tests/half-3.ispc + ./tests/half.ispc */ -/* knc-i1x16.h has the following fails: -5 / 1206 tests FAILED execution: - ./tests/assert-3.ispc +/* knc-i1x8.h fails: + * ---------------------------- +1 / 1206 tests FAILED compilation: + ./tests/ptr-assign-lhs-math-1.ispc +4 / 1206 tests FAILED execution: ./tests/half-1.ispc ./tests/half-3.ispc ./tests/half.ispc ./tests/test-141.ispc */ -/* generics-16, from which these knc-i1x*.h are derived, has the following fails: +/* generic-16.h fails: (from these knc-i1x8.h & knc-i1x16.h are derived + * ---------------------------- +1 / 1206 tests FAILED compilation: + ./tests/ptr-assign-lhs-math-1.ispc 6 / 1206 tests FAILED execution: ./tests/func-overload-max.ispc ./tests/half-1.ispc From 491c58aef374a1de7987ba8d5919a641a65cb853 Mon Sep 17 00:00:00 2001 From: Ilia 
Filippov Date: Thu, 19 Sep 2013 17:47:10 +0400 Subject: [PATCH 092/124] change head to trunk --- alloy.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/alloy.py b/alloy.py index 06025324..119874b8 100755 --- a/alloy.py +++ b/alloy.py @@ -81,7 +81,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_v llvm_home = os.environ["LLVM_HOME"] os.chdir(llvm_home) FOLDER_NAME=version_LLVM - if version_LLVM == "head": + if version_LLVM == "trunk": SVN_PATH="trunk" if version_LLVM == "3.3": SVN_PATH="tags/RELEASE_33/final" @@ -334,7 +334,7 @@ def validation_run(only, only_targets, reference_branch, notify, update): archs.append("x86-64") if "native" in only: sde_targets_t = [] - for i in ["3.1", "3.2", "3.3", "head"]: + for i in ["3.1", "3.2", "3.3", "trunk"]: if i in only: LLVM.append(i) if "current" in only: @@ -367,7 +367,7 @@ def validation_run(only, only_targets, reference_branch, notify, update): if len(archs) == 0: archs = ["x86", "x86-64"] if len(LLVM) == 0: - LLVM = ["3.3", "head"] + LLVM = ["3.3", "trunk"] gen_archs = ["x86-64"] need_LLVM = check_LLVM(LLVM) for i in range(0,len(need_LLVM)): @@ -562,9 +562,9 @@ parser = OptionParser() parser.add_option('-b', '--build-llvm', dest='build_llvm', help='ask to build LLVM', default=False, action="store_true") parser.add_option('--version', dest='version', - help='version of llvm to build', default="head") + help='version of llvm to build: 3.1 3.2 3.3 trunk', default="trunk") parser.add_option('--revision', dest='revision', - help='revision of llvm to build', default="") + help='revision of llvm to build in format r172870', default="") parser.add_option('--debug', dest='debug', help='debug build of LLVM?', default=False, action="store_true") parser.add_option('--folder', dest='folder', @@ -592,7 +592,7 @@ parser.add_option('--notify', dest='notify', parser.add_option('--only', dest='only', help='set types of tests. Possible values:\n' + '-O0, -O2, x86, x86-64, stability (test only stability), performance (test only performance)\n' + - 'build (only build with different LLVM), 3.1, 3.2, 3.3, head, native (do not use SDE), current (do not rebuild ISPC).\n' + + 'build (only build with different LLVM), 3.1, 3.2, 3.3, trunk, native (do not use SDE), current (do not rebuild ISPC).\n' + 'Example: --only="3.2 -O0 stability 3.3"', default="") parser.add_option('--update-errors', dest='update', help='rewrite fail_db.txt file according to received results (F or FP)', default="") From 5cabf0bef06af579571046cae63dcd82768c1220 Mon Sep 17 00:00:00 2001 From: evghenii Date: Fri, 20 Sep 2013 14:13:40 +0300 Subject: [PATCH 093/124] adding int64 support form knc.h, phase 1. 
bugs: __lshr & __ashr fail idiv.ispc test, __equal_i64 & __equal_i64_and_mask fails reduce_equal_8.ispc test --- examples/intrinsics/knc-i1x16.h | 290 ++++++++++++++++++++++++++++---- 1 file changed, 259 insertions(+), 31 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index c535e61a..628a38b8 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -208,7 +208,7 @@ struct PRE_ALIGN(128) __vec16_d } POST_ALIGN(128); #endif /* evghenii::d */ -#if 1 /* evghenii::i64 */ +#if 0 /* evghenii::i64 */ PRE_ALIGN(128) struct __vec16_i64 : public vec16 { __vec16_i64() { } __vec16_i64(int64_t v0, int64_t v1, int64_t v2, int64_t v3, @@ -219,34 +219,66 @@ PRE_ALIGN(128) struct __vec16_i64 : public vec16 { v8, v9, v10, v11, v12, v13, v14, v15) { } } POST_ALIGN(128); #else /* evghenii::i64 */ -struct PRE_ALIGN(64) __vec16_i64 { - FORCEINLINE __vec16_i64() : v_lo(_mm512_undefined_epi32()), v_hi(_mm512_undefined_epi32()) {} - FORCEINLINE __vec16_i64(const __vec16_i64 &o) : v_lo(o.v_lo), v_hi(o.v_hi) {} - FORCEINLINE __vec16_i64(__m512i l, __m512i h) : v_lo(l), v_hi(h) {} - FORCEINLINE __vec16_i64& operator =(const __vec16_i64 &o) { v_lo=o.v_lo; v_hi=o.v_hi; return *this; } - FORCEINLINE __vec16_i64(int64_t v00, int64_t v01, int64_t v02, int64_t v03, - int64_t v04, int64_t v05, int64_t v06, int64_t v07, - int64_t v08, int64_t v09, int64_t v10, int64_t v11, - int64_t v12, int64_t v13, int64_t v14, int64_t v15) { - __m512i v1 = _mm512_set_8to8_epi64(v15, v14, v13, v12, v11, v10, v09, v08); - __m512i v2 = _mm512_set_8to8_epi64(v07, v06, v05, v04, v03, v02, v01, v00); - v_hi = _mm512_mask_permutevar_epi32(v_hi, 0xFF00, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v1); - v_hi = _mm512_mask_permutevar_epi32(v_hi, 0x00FF, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v2); - v_lo = _mm512_mask_permutevar_epi32(v_lo, 0xFF00, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v1); - v_lo = _mm512_mask_permutevar_epi32(v_lo, 0x00FF, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v2); - } +struct PRE_ALIGN(128) __vec16_i64 +{ + union { + __m512i v1; __m512i v_hi; + }; + union + { + __m512i v2; __m512i v_lo; -} POST_ALIGN(64); + }; + FORCEINLINE __vec16_i64() : v1(_mm512_undefined_epi32()), v2(_mm512_undefined_epi32()) {} + FORCEINLINE __vec16_i64(const __m512i _v1, const __m512i _v2) : v1(_v1), v2(_v2) {} + FORCEINLINE __vec16_i64(const __vec16_i64 &o) : v1(o.v1), v2(o.v2) {} + FORCEINLINE __vec16_i64& operator =(const __vec16_i64 &o) { v1=o.v1; v2=o.v2; return *this; } + FORCEINLINE __vec16_i64(int64_t v00, int64_t v01, int64_t v02, int64_t v03, + int64_t v04, int64_t v05, int64_t v06, int64_t v07, + int64_t v08, int64_t v09, int64_t v10, int64_t v11, + int64_t v12, int64_t v13, int64_t v14, int64_t v15) { + v2 = _mm512_set_8to8_epi64(v15, v14, v13, v12, v11, v10, v09, v08); + v1 = _mm512_set_8to8_epi64(v07, v06, v05, v04, v03, v02, v01, v00); + } + FORCEINLINE const int64_t& operator[](const int i) const { return ((int64_t*)this)[i]; } + FORCEINLINE int64_t& operator[](const int i) { return ((int64_t*)this)[i]; } + FORCEINLINE __vec16_i64 cvt2hilo() const + { + __m512i _hi, _lo; + _hi = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v1); + _hi = _mm512_mask_permutevar_epi32(_hi, 0x00FF, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v2); + _lo = 
_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v1); + _lo = _mm512_mask_permutevar_epi32(_lo, 0x00FF, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v2); + return __vec16_i64(_hi, _lo); + } + FORCEINLINE __vec16_i64 cvt2zmm() const + { + __m512i _v1, _v2; + _v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v_hi); + _v1 = _mm512_mask_permutevar_epi32(_v1, 0x5555, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v_lo); + _v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v_hi); + _v2 = _mm512_mask_permutevar_epi32(_v2, 0x5555, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v_lo); + return __vec16_i64(_v1, _v2); + } +} POST_ALIGN(128); #endif /* evghenii::i64 */ PRE_ALIGN(16) struct __vec16_i8 : public vec16 { @@ -959,30 +991,162 @@ template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) { /////////////////////////////////////////////////////////////////////////// // int64 +// evghenii::int64 +#if 0 BINARY_OP(__vec16_i64, __add, +) BINARY_OP(__vec16_i64, __sub, -) BINARY_OP(__vec16_i64, __mul, *) +#else +static FORCEINLINE __vec16_i64 __add(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_add_epi64(a.v1, b.v1), _mm512_add_epi64(a.v2,b.v2)); +} +static FORCEINLINE __vec16_i64 __sub(__vec16_i64 _a, __vec16_i64 _b) { +// return __vec16_i64(_mm512_sub_epi64(_a.v1, _b.v1), _mm512_sub_epi64(_a.v2,_b.v2)); + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + __vec16_i64 ret; + __mmask16 borrow = 0; + ret.v_lo = _mm512_subsetb_epi32(a.v_lo, b.v_lo, &borrow); + ret.v_hi = _mm512_sbb_epi32 (a.v_hi, borrow, b.v_hi, &borrow); + return ret.cvt2zmm(); +} + +static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &_b) +{ + const __vec16_i64 b = _b.cvt2hilo(); + return __vec16_i64(_mm512_mullo_epi32(a.v,b.v_lo), + _mm512_add_epi32(_mm512_mullo_epi32(a.v, b.v_hi), + _mm512_mulhi_epi32(a.v, b.v_lo))).cvt2zmm(); +} + +static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_mullox_epi64(a.v1, b.v1), _mm512_mullox_epi64(a.v2,b.v2)); +} +#endif + +#if 0 BINARY_OP(__vec16_i64, __or, |) BINARY_OP(__vec16_i64, __and, &) BINARY_OP(__vec16_i64, __xor, ^) BINARY_OP(__vec16_i64, __shl, <<) +#else +static FORCEINLINE __vec16_i64 __or(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_or_epi64(a.v1, b.v1), _mm512_or_epi64(a.v2, b.v2)); +} +static FORCEINLINE __vec16_i64 __and(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_and_epi64(a.v1, b.v1), _mm512_and_epi64(a.v2, b.v2)); +} + +static FORCEINLINE __vec16_i64 __xor(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_xor_epi64(a.v1, b.v1), _mm512_xor_epi64(a.v2, b.v2)); +} + +static FORCEINLINE __vec16_i64 __shl(__vec16_i64 _a, __vec16_i64 _b) { + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + __vec16_i32 xfer = _mm512_srlv_epi32(a.v_lo, _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); + __vec16_i32 hi = _mm512_or_epi32(_mm512_sllv_epi32(a.v_hi, b.v_lo), xfer); + __vec16_i32 lo = _mm512_sllv_epi32(a.v_lo, b.v_lo); + return __vec16_i64(hi,lo).cvt2zmm(); +} +#endif + +#if 0 BINARY_OP_CAST(__vec16_i64, uint64_t, __udiv, /) BINARY_OP_CAST(__vec16_i64, int64_t, __sdiv, /) +#else +static FORCEINLINE 
__vec16_i64 __udiv(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_div_epu64(a.v1,b.v1), _mm512_div_epu64(a.v2,b.v2)); +} +static FORCEINLINE __vec16_i64 __sdiv(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_div_epi64(a.v1,b.v1), _mm512_div_epi64(a.v2,b.v2)); +} +#endif +#if 0 BINARY_OP_CAST(__vec16_i64, uint64_t, __urem, %) BINARY_OP_CAST(__vec16_i64, int64_t, __srem, %) +#else +static FORCEINLINE __vec16_i64 __urem(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_rem_epu64(a.v1,b.v1), _mm512_rem_epu64(a.v2,b.v2)); +} +static FORCEINLINE __vec16_i64 __srem(__vec16_i64 a, __vec16_i64 b) { + return __vec16_i64(_mm512_rem_epi64(a.v1,b.v1), _mm512_rem_epi64(a.v2,b.v2)); +} +#endif + +#if 1 BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>) +#else /* evghenii::fails idiv.ispc */ +static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 _a, __vec16_i64 _b) { + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + __vec16_i32 shift = _mm512_sub_epi32(__ispc_thirty_two, b.v_lo); +#if 0 + __vec16_i32 xfer = _mm512_and_epi32(_mm512_sllv_epi32(__ispc_ffffffff, shift), _mm512_sllv_epi32(a.v_hi, shift)); +#else + __vec16_i32 xfer = _mm512_sllv_epi32(_mm512_and_epi32(a.v_hi, + _mm512_sub_epi32(_mm512_sllv_epi32(__ispc_one, b.v_lo), __ispc_one)), + _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); +#endif + __vec16_i32 hi = _mm512_srlv_epi32(a.v_hi, b.v_lo); + __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo)); + return __vec16_i64(hi,lo).cvt2zmm(); +} + +#endif + +#if 1 BINARY_OP_CAST(__vec16_i64, int64_t, __ashr, >>) +#else /* evghenii::fails idiv.ispc */ +static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 _a, __vec16_i64 _b) { + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + __vec16_i32 xfer = _mm512_sllv_epi32(_mm512_and_epi32(a.v_hi, + _mm512_sub_epi32(_mm512_sllv_epi32(__ispc_one, b.v_lo), __ispc_one)), + _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); + __vec16_i32 hi = _mm512_srav_epi32(a.v_hi, b.v_lo); + __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo)); + return __vec16_i64(hi,lo).cvt2zmm(); +} +#endif SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>) SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>) SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<) +#if 1 CMP_OP(__vec16_i64, i64, int64_t, __equal, ==) CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) +#else /* evghenii::fails ./tests/reduce-equal-8.ispc, some other test hang... 
*/ +static FORCEINLINE __vec16_i1 __equal_i64(const __vec16_i64 &_a, const __vec16_i64 &_b) { + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + const __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo); + return _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); +} +static FORCEINLINE __vec16_i1 __equal_i64_and_mask(const __vec16_i64 &_a, const __vec16_i64 &_b, + __vec16_i1 mask) { + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo); + __mmask16 full_match = _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); + return _mm512_kand(full_match, (__mmask16)mask); +} + +static FORCEINLINE __vec16_i1 __not_equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) { + return __not(__equal_i64(a,b)); +} +static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(const __vec16_i64 &a, const __vec16_i64 &b, + __vec16_i1 mask) { + return __and(__not(__equal_i64(a,b)), mask); +} +#endif + + CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_equal, <=) CMP_OP(__vec16_i64, i64, int64_t, __signed_less_equal, <=) CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_equal, >=) @@ -992,15 +1156,84 @@ CMP_OP(__vec16_i64, i64, int64_t, __signed_less_than, <) CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_than, >) CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >) +#if 0 SELECT(__vec16_i64) +#else +static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask, + __vec16_i64 a, __vec16_i64 b) { + __vec16_i64 ret; + ret.v_hi = _mm512_mask_mov_epi64(b.v_hi, mask, a.v_hi); + ret.v_lo = _mm512_mask_mov_epi64(b.v_lo, mask >> 8, a.v_lo); + return ret; +} +#endif + INSERT_EXTRACT(__vec16_i64, int64_t) +#if 0 SMEAR(__vec16_i64, i64, int64_t) SETZERO(__vec16_i64, i64) UNDEF(__vec16_i64, i64) BROADCAST(__vec16_i64, i64, int64_t) +#else +template RetVecType __smear_i64(const int64_t &l); +template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) { return __vec16_i64(_mm512_set1_epi64(l), _mm512_set1_epi64(l)); } + +template RetVecType __setzero_i64(); +template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { return __vec16_i64(_mm512_setzero_epi32(), _mm512_setzero_epi32()); } + +template RetVecType __undef_i64(); +template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { return __vec16_i64(_mm512_undefined_epi32(), _mm512_undefined_epi32()); } + +static FORCEINLINE __vec16_i64 __broadcast_i64(__vec16_i64 v, int index) { + int64_t val = __extract_element(v, index & 0xf); + return __smear_i64<__vec16_i64>(val); +} +#endif ROTATE(__vec16_i64, i64, int64_t) SHUFFLES(__vec16_i64, i64, int64_t) +#if 0 LOAD_STORE(__vec16_i64, int64_t) +#else +template static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) +{ + __vec16_i32 v1; + __vec16_i32 v2; + v2 = _mm512_extloadunpacklo_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v2 = _mm512_extloadunpackhi_epi32(v2, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v1 = _mm512_extloadunpacklo_epi32(v1, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v1 = _mm512_extloadunpackhi_epi32(v1, (uint8_t*)p+128, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + return __vec16_i64(v2,v1); +} + +template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) +{ + __m512i v2 = _mm512_load_epi32(p); + __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64); + return __vec16_i64(v2,v1); +} + +template <> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { return __load<64>(p); } + +template 
static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64 v) +{ + __m512i v1 = v.v2; + __m512i v2 = v.v1; + _mm512_extpackstorelo_epi32(p, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_epi32((uint8_t*)p+64, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+128, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +} + +template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) +{ + __m512i v1 = v.v2; + __m512i v2 = v.v1; + _mm512_store_epi64(p, v2); + _mm512_store_epi64(((uint8_t*)p)+64, v1); +} + +template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); } +#endif #if 0 /* evghenii::float */ @@ -1062,7 +1295,6 @@ static FORCEINLINE __vec16_f __sub(__vec16_f a, __vec16_f b) { return _mm512_sub_ps(a, b); } -#if 1 /* evghenii::this two fails assert-3.ispc test */ static FORCEINLINE __vec16_f __mul(__vec16_f a, __vec16_f b) { return _mm512_mul_ps(a, b); } @@ -1070,10 +1302,6 @@ static FORCEINLINE __vec16_f __mul(__vec16_f a, __vec16_f b) { static FORCEINLINE __vec16_f __div(__vec16_f a, __vec16_f b) { return _mm512_div_ps(a, b); } -#else -BINARY_OP(__vec16_f, __mul, *) -BINARY_OP(__vec16_f, __div, /) -#endif static FORCEINLINE __vec16_i1 __equal_float(__vec16_f a, __vec16_f b) { From ddecdeb8349e1d3db7d6c4ef949c9fb86734609d Mon Sep 17 00:00:00 2001 From: evghenii Date: Fri, 20 Sep 2013 14:55:15 +0300 Subject: [PATCH 094/124] move remaining int64 from knc.h some of fails to pass tests, grep for evghenii::fails to find out which functions fail and on what tests --- examples/intrinsics/knc-i1x16.h | 170 +++++++++++++++++++++++++++++--- 1 file changed, 157 insertions(+), 13 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 628a38b8..1f5a6056 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -1120,7 +1120,6 @@ SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<) #if 1 CMP_OP(__vec16_i64, i64, int64_t, __equal, ==) -CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) #else /* evghenii::fails ./tests/reduce-equal-8.ispc, some other test hang... */ static FORCEINLINE __vec16_i1 __equal_i64(const __vec16_i64 &_a, const __vec16_i64 &_b) { const __vec16_i64 a = _a.cvt2hilo(); @@ -1128,6 +1127,14 @@ static FORCEINLINE __vec16_i1 __equal_i64(const __vec16_i64 &_a, const __vec16_i const __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo); return _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); } +static FORCEINLINE __vec16_i1 __not_equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) { + return __not(__equal_i64(a,b)); +} +#endif + +#if 1 +CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) +#else /* evghenii::fails ./tests/reduce-equal-8.ispc, some other test hang... 
*/ static FORCEINLINE __vec16_i1 __equal_i64_and_mask(const __vec16_i64 &_a, const __vec16_i64 &_b, __vec16_i1 mask) { const __vec16_i64 a = _a.cvt2hilo(); @@ -1136,10 +1143,6 @@ static FORCEINLINE __vec16_i1 __equal_i64_and_mask(const __vec16_i64 &_a, const __mmask16 full_match = _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); return _mm512_kand(full_match, (__mmask16)mask); } - -static FORCEINLINE __vec16_i1 __not_equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) { - return __not(__equal_i64(a,b)); -} static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(const __vec16_i64 &a, const __vec16_i64 &b, __vec16_i1 mask) { return __and(__not(__equal_i64(a,b)), mask); @@ -1147,6 +1150,7 @@ static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(const __vec16_i64 &a, con #endif + CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_equal, <=) CMP_OP(__vec16_i64, i64, int64_t, __signed_less_equal, <=) CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_equal, >=) @@ -1843,7 +1847,14 @@ static FORCEINLINE TO FUNC(TO, FROM val) { \ } // sign extension conversions +#if 1 CAST(__vec16_i64, int64_t, __vec16_i32, int32_t, __cast_sext) +#else /* evghenii::fails on soa-9 soa-13 soa-10 soa-29 soa-3 ... and others */ +static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val) +{ + return __vec16_i64(_mm512_srai_epi32(val.v,31), val.v).cvt2zmm(); +} +#endif CAST(__vec16_i64, int64_t, __vec16_i16, int16_t, __cast_sext) CAST(__vec16_i64, int64_t, __vec16_i8, int8_t, __cast_sext) CAST(__vec16_i32, int32_t, __vec16_i16, int16_t, __cast_sext) @@ -1868,15 +1879,23 @@ CAST_SEXT_I1(__vec16_i32) #else static FORCEINLINE __vec16_i32 __cast_sext(const __vec16_i32 &, const __vec16_i1 &val) { - __vec16_i32 ret = _mm512_setzero_epi32(); - __vec16_i32 one = _mm512_set1_epi32(-1); - return _mm512_mask_mov_epi32(ret, val, one); + __vec16_i32 ret = _mm512_setzero_epi32(); + __vec16_i32 one = _mm512_set1_epi32(-1); + return _mm512_mask_mov_epi32(ret, val, one); } #endif CAST_SEXT_I1(__vec16_i64) // zero extension +#if 0 CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext) +#else +static FORCEINLINE __vec16_i64 __cast_zext(const __vec16_i64 &, const __vec16_i32 &val) +{ + return __vec16_i64(_mm512_setzero_epi32(), val.v).cvt2zmm(); +} + +#endif CAST(__vec16_i64, uint64_t, __vec16_i16, uint16_t, __cast_zext) CAST(__vec16_i64, uint64_t, __vec16_i8, uint8_t, __cast_zext) CAST(__vec16_i32, uint32_t, __vec16_i16, uint16_t, __cast_zext) @@ -2714,8 +2733,34 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t _mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); return ret; } -#endif +#if 0 /* evghenii::fails on gather-int8-2 & gather-int8-4 */ +static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + __vec16_i1 still_to_do = mask; + __vec16_i32 tmp; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + tmp = _mm512_mask_i32extgather_epi32(tmp, match, offsets.v_lo, base, + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match,still_to_do); + } + __vec16_i8 ret; + 
_mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; +} +#else GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) +#endif +#endif /****************/ GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) @@ -2729,8 +2774,35 @@ static FORCEINLINE __vec16_i32 __gather_base_offsets32_i32(uint8_t *base, uint32 base, _MM_UPCONV_EPI32_NONE, scale, _MM_HINT_NONE); } -#endif +#if 0 /* evghenii::fails on gather-int32-2 & gather-int32-4 */ +static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + // There is no gather instruction with 64-bit offsets in KNC. + // We have to manually iterate over the upper 32 bits ;-) + __vec16_i1 still_to_do = mask; + __vec16_i32 ret; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + ret = _mm512_mask_i32extgather_epi32(ret, match, offsets.v_lo, base, + _MM_UPCONV_EPI32_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match, still_to_do); + } + + return ret; +} +#else GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) +#endif +#endif /****************/ #if 0 GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float) @@ -2741,8 +2813,35 @@ static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32 base, _MM_UPCONV_PS_NONE, scale, _MM_HINT_NONE); } -#endif +#if 0 /* evghenii::fails on gather-float-2 gather-float-4 & soa-14 */ +static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + // There is no gather instruction with 64-bit offsets in KNC. 
+ // We have to manually iterate over the upper 32 bits ;-) + __vec16_i1 still_to_do = mask; + __vec16_f ret; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + ret = _mm512_mask_i32extgather_ps(ret, match, offsets.v_lo, base, + _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match, still_to_do); + } + + return ret; +} +#else GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float) +#endif +#endif /****************/ GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) @@ -2824,6 +2923,7 @@ SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64 /*****************/ #if 0 SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32) +SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) #else static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec16_i32 offsets, __vec16_i32 val, __vec16_i1 mask) { @@ -2831,8 +2931,28 @@ static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, _MM_DOWNCONV_EPI32_NONE, scale, _MM_HINT_NONE); } +static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i32 value, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + + __vec16_i1 still_to_do = mask; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + _mm512_mask_i32extscatter_epi32(base, match, offsets.v_lo, + value, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match,still_to_do); + } +} #endif -SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) /*****************/ #if 0 SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float) @@ -2844,8 +2964,32 @@ static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scal _MM_DOWNCONV_PS_NONE, scale, _MM_HINT_NONE); } -#endif +#if 0 /* evghenii::fails on soa-10 & soa-13 , it is very similar to __scatter_base_offsets64_it32, but that passes tests, why ?!? 
*/ +static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_f value, __vec16_i1 mask) +{ + const __vec16_i64 offsets = _offsets.cvt2hilo(); + + __vec16_i1 still_to_do = mask; + while (still_to_do) { + int first_active_lane = _mm_tzcnt_32((int)still_to_do); + const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; + __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); + + void * base = (void*)((unsigned long)_base + + ((scale*(unsigned long)hi32) << 32)); + _mm512_mask_i32extscatter_ps(base, match, offsets.v_lo, + value, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); + still_to_do = _mm512_kxor(match,still_to_do); + } +} +#else SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float) +#endif +#endif /*****************/ SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) From 87cecddabb69f0a5794c6d6c325c8ccd329165c9 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Fri, 20 Sep 2013 18:57:20 +0400 Subject: [PATCH 095/124] adding sort to performance checking --- examples/sort/sort.cpp | 16 +++++++++------- perf.ini | 8 ++++---- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/examples/sort/sort.cpp b/examples/sort/sort.cpp index 4f402c75..f5e4264a 100644 --- a/examples/sort/sort.cpp +++ b/examples/sort/sort.cpp @@ -86,7 +86,8 @@ int main (int argc, char *argv[]) tISPC1 += get_elapsed_mcycles(); - progressbar (i, m); + if (argc != 3) + progressbar (i, m); } printf("[sort ispc]:\t[%.3f] million cycles\n", tISPC1); @@ -103,10 +104,11 @@ int main (int argc, char *argv[]) tISPC2 += get_elapsed_mcycles(); - progressbar (i, m); + if (argc != 3) + progressbar (i, m); } - - printf("[sort ispc+tasks]:\t[%.3f] million cycles\n", tISPC2); + + printf("[sort ispc + tasks]:\t[%.3f] million cycles\n", tISPC2); srand (0); @@ -120,13 +122,13 @@ int main (int argc, char *argv[]) tSerial += get_elapsed_mcycles(); - progressbar (i, m); + if (argc != 3) + progressbar (i, m); } printf("[sort serial]:\t\t[%.3f] million cycles\n", tSerial); - printf("\t\t\t\t(%.2fx speedup from ISPC serial)\n", tSerial/tISPC1); - printf("\t\t\t\t(%.2fx speedup from ISPC with tasks)\n", tSerial/tISPC2); + printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", tSerial/tISPC1, tSerial/tISPC2); delete code; delete order; diff --git a/perf.ini b/perf.ini index d8c7fe71..249c25f4 100755 --- a/perf.ini +++ b/perf.ini @@ -51,7 +51,7 @@ Volume Rendering volume_rendering camera.dat density_highres.vol #*** -%Sort -%sort -% -%#*** +Sort +sort +1000000 1 +#*** From 9e0e9dbecc484fdbc6fd16a3fca283df71572f65 Mon Sep 17 00:00:00 2001 From: Preston Gurd Date: Fri, 20 Sep 2013 14:42:46 -0400 Subject: [PATCH 096/124] - Add Silvermont (--cpu=slm) option for llvm 3.4+. - Change default Sandybridge isa name to avx1-i32x8 from avx-i32x8, to conform with replacement of avx-i32x8 by avx1-i32x8 everywhere else. - Add "target-cpu" attribute, when using AttrBuilder, to correct a problem whereby llvm would switch from the command line cpu setting to the native (auto-detected) cpu setting on second and subsequent functions. e.g. if I wanted to build for Silvermont on a Sandy Bridge machine, ispc/llvm would correctly use Silvermont and turn on the Silvermont scheduler. 
For the second and subsequent functions, it would auto-detect Sandy Bridge, but still run the Silvermont scheduler. --- ispc.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/ispc.cpp b/ispc.cpp index 82f0518b..ea7bfcd7 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -126,7 +126,7 @@ lGetSystemISA() { return "avx1.1-i32x8"; } // Regular AVX - return "avx-i32x8"; + return "avx1-i32x8"; } else if ((info[2] & (1 << 19)) != 0) return "sse4-i32x4"; @@ -149,8 +149,11 @@ static const char *supportedCPUs[] = { #endif "atom", "penryn", "core2", "corei7", "corei7-avx" #if !defined(LLVM_3_1) - , "core-avx-i", "core-avx2" + , "core-avx-i", "core-avx2", "slm" #endif // LLVM 3.2+ +#if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) + , "slm" +#endif // LLVM 3.4+ }; Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : @@ -196,9 +199,10 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : isa = "avx1.1-i32x8"; else if (!strcmp(cpu, "sandybridge") || !strcmp(cpu, "corei7-avx")) - isa = "avx-i32x8"; + isa = "avx1-i32x8"; else if (!strcmp(cpu, "corei7") || - !strcmp(cpu, "penryn")) + !strcmp(cpu, "penryn") || + !strcmp(cpu, "slm")) isa = "sse4-i32x4"; else isa = "sse2-i32x4"; @@ -660,6 +664,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // Initialize target-specific "target-feature" attribute. if (!m_attributes.empty()) { llvm::AttrBuilder attrBuilder; + attrBuilder.addAttribute("target-cpu", this->m_cpu); attrBuilder.addAttribute("target-features", this->m_attributes); this->m_tf_attributes = new llvm::AttributeSet( llvm::AttributeSet::get( From 4b26b8b4309ffb3295db16815620d2ab751c61c7 Mon Sep 17 00:00:00 2001 From: Preston Gurd Date: Fri, 20 Sep 2013 16:44:01 -0400 Subject: [PATCH 097/124] Remove redundant "slm". --- ispc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ispc.cpp b/ispc.cpp index ea7bfcd7..bec7baf7 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -149,7 +149,7 @@ static const char *supportedCPUs[] = { #endif "atom", "penryn", "core2", "corei7", "corei7-avx" #if !defined(LLVM_3_1) - , "core-avx-i", "core-avx2", "slm" + , "core-avx-i", "core-avx2" #endif // LLVM 3.2+ #if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) , "slm" From 019043f55ee13865fe6f672fcce544028ff63e2f Mon Sep 17 00:00:00 2001 From: evghenii Date: Mon, 23 Sep 2013 09:55:55 +0300 Subject: [PATCH 098/124] patched half2float & float2half to pass the tests. Now only test-141 is failed. 
but it seems to be test rather than knc-i1x16.h related --- examples/intrinsics/knc-i1x16.h | 138 +++++++++++++++++++++++++++----- 1 file changed, 117 insertions(+), 21 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 1f5a6056..2ee6d2f5 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -1478,23 +1478,101 @@ static FORCEINLINE float __floatbits(int v) { return u.f; } +/* source : + * http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion */ +class Float16Compressor +{ + union Bits + { + float f; + int32_t si; + uint32_t ui; + }; + + static int const shift = 13; + static int const shiftSign = 16; + + static int32_t const infN = 0x7F800000; // flt32 infinity + static int32_t const maxN = 0x477FE000; // max flt16 normal as a flt32 + static int32_t const minN = 0x38800000; // min flt16 normal as a flt32 + static int32_t const signN = 0x80000000; // flt32 sign bit + + static int32_t const infC = infN >> shift; + static int32_t const nanN = (infC + 1) << shift; // minimum flt16 nan as a flt32 + static int32_t const maxC = maxN >> shift; + static int32_t const minC = minN >> shift; + static int32_t const signC = signN >> shiftSign; // flt16 sign bit + + static int32_t const mulN = 0x52000000; // (1 << 23) / minN + static int32_t const mulC = 0x33800000; // minN / (1 << (23 - shift)) + + static int32_t const subC = 0x003FF; // max flt32 subnormal down shifted + static int32_t const norC = 0x00400; // min flt32 normal down shifted + + static int32_t const maxD = infC - maxC - 1; + static int32_t const minD = minC - subC - 1; + + public: + + static uint16_t compress(float value) + { + Bits v, s; + v.f = value; + uint32_t sign = v.si & signN; + v.si ^= sign; + sign >>= shiftSign; // logical shift + s.si = mulN; + s.si = s.f * v.f; // correct subnormals + v.si ^= (s.si ^ v.si) & -(minN > v.si); + v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); + v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); + v.ui >>= shift; // logical shift + v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); + v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC); + return v.ui | sign; + } + + static float decompress(uint16_t value) + { + Bits v; + v.ui = value; + int32_t sign = v.si & signC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + } +}; + static FORCEINLINE float __half_to_float_uniform(int16_t h) { - static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift +#if 0 + static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift - int32_t o = ((int32_t)(h & 0x7fff)) << 13; // exponent/mantissa bits - uint32_t exp = shifted_exp & o; // just the exponent - o += (127 - 15) << 23; // exponent adjust + int32_t o = ((int32_t)(h & 0x7fff)) << 13; // exponent/mantissa bits + uint32_t exp = shifted_exp & o; // just the exponent + o += (127 - 15) << 23; // exponent adjust - // handle exponent special cases - if (exp == shifted_exp) // Inf/NaN? - o += (128 - 16) << 23; // extra exp adjust - else if (exp == 0) { // Zero/Denormal? - o += 1 << 23; // extra exp adjust - o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize - } + // handle exponent special cases + if (exp == shifted_exp) // Inf/NaN? 
+ o += (128 - 16) << 23; // extra exp adjust + else if (exp == 0) { // Zero/Denormal? + o += 1 << 23; // extra exp adjust + o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize + } - o |= ((int32_t)(h & 0x8000)) << 16; // sign bit - return __floatbits(o); + o |= ((int32_t)(h & 0x8000)) << 16; // sign bit + return __floatbits(o); +#else + return Float16Compressor::decompress(h); +#endif } @@ -1507,6 +1585,7 @@ static FORCEINLINE __vec16_f __half_to_float_varying(__vec16_i16 v) { static FORCEINLINE int16_t __float_to_half_uniform(float f) { +#if 0 uint32_t sign_mask = 0x80000000u; int32_t o; @@ -1531,6 +1610,9 @@ static FORCEINLINE int16_t __float_to_half_uniform(float f) { o = fint2 >> 13; // Take the bits! return (o | (sign >> 16)); +#else + return Float16Compressor::compress(f); +#endif } @@ -2075,9 +2157,8 @@ CAST(__vec16_i32, uint32_t, __vec16_d, double, __cast_fptoui) CAST(__vec16_i64, uint64_t, __vec16_d, double, __cast_fptoui) // float/double conversions -#if 1 +#if 0 CAST(__vec16_f, float, __vec16_d, double, __cast_fptrunc) -CAST(__vec16_d, double, __vec16_f, float, __cast_fpext) #else static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) { __m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1)); @@ -2085,11 +2166,16 @@ static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) { return _mm512_mask_permute4f128_epi32(r0i, 0xFF00, r1i, _MM_PERM_BABA); } +#endif + +#if 0 +CAST(__vec16_d, double, __vec16_f, float, __cast_fpext) +#else static FORCEINLINE __vec16_d __cast_fpext(__vec16_d, __vec16_f val) { __vec16_d ret; - ret.v2 = _mm512_cvtpslo_pd(val.v); + ret.v1 = _mm512_cvtpslo_pd(val.v); __vec16_f other8 = _mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC); - ret.v1 = _mm512_cvtpslo_pd(other8); + ret.v2 = _mm512_cvtpslo_pd(other8); return ret; } #endif @@ -2325,14 +2411,24 @@ static FORCEINLINE __vec16_d __sqrt_varying_double(__vec16_d v) { return __ve // svml /////////////////////////////////////////////////////////////////////////// -static FORCEINLINE __vec16_f __svml_logf(__vec16_f v) { return _mm512_log_ps(v); } -static FORCEINLINE __vec16_f __svml_expf(__vec16_f v) { return _mm512_exp_ps(v); } +static FORCEINLINE __vec16_f __svml_sinf(__vec16_f v) { return _mm512_sin_ps(v); } +static FORCEINLINE __vec16_f __svml_asinf(__vec16_f v) { return _mm512_asin_ps(v); } static FORCEINLINE __vec16_f __svml_cosf(__vec16_f v) { return _mm512_cos_ps(v); } +static FORCEINLINE __vec16_f __svml_tanf(__vec16_f v) { return _mm512_tan_ps(v); } +static FORCEINLINE __vec16_f __svml_atanf(__vec16_f v) { return _mm512_atan_ps(v); } +static FORCEINLINE __vec16_f __svml_atan2f(__vec16_f a, __vec16_f b) { return _mm512_atan2_ps(a,b); } +static FORCEINLINE __vec16_f __svml_expf(__vec16_f v) { return _mm512_exp_ps(v); } +static FORCEINLINE __vec16_f __svml_logf(__vec16_f v) { return _mm512_log_ps(v); } static FORCEINLINE __vec16_f __svml_powf(__vec16_f a, __vec16_f b) { return _mm512_pow_ps(a,b); } -static FORCEINLINE __vec16_d __svml_logd(__vec16_d v) { return __vec16_d(_mm512_log_pd(v.v1), _mm512_log_pd(v.v2)); } -static FORCEINLINE __vec16_d __svml_expd(__vec16_d v) { return __vec16_d(_mm512_exp_pd(v.v1), _mm512_exp_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_sind(__vec16_d v) { return __vec16_d(_mm512_sin_pd(v.v1), _mm512_sin_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_asind(__vec16_d v) { return __vec16_d(_mm512_asin_pd(v.v1), _mm512_asin_pd(v.v2)); } static FORCEINLINE __vec16_d __svml_cosd(__vec16_d v) { return 
__vec16_d(_mm512_cos_pd(v.v1), _mm512_cos_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_tand(__vec16_d v) { return __vec16_d(_mm512_tan_pd(v.v1), _mm512_tan_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_atand(__vec16_d v) { return __vec16_d(_mm512_atan_pd(v.v1), _mm512_atan_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_atan2d(__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_atan2_pd(a.v1,b.v1), _mm512_atan2_pd(a.v2,b.v2)); } +static FORCEINLINE __vec16_d __svml_expd(__vec16_d v) { return __vec16_d(_mm512_exp_pd(v.v1), _mm512_exp_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_logd(__vec16_d v) { return __vec16_d(_mm512_log_pd(v.v1), _mm512_log_pd(v.v2)); } static FORCEINLINE __vec16_d __svml_powd(__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_pow_pd(a.v1,b.v1), _mm512_pow_pd(a.v2,b.v2)); } /////////////////////////////////////////////////////////////////////////// From 5a9b3b3abb592d19fbe298467bcb631b25c8bd76 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Fri, 20 Sep 2013 19:03:58 +0400 Subject: [PATCH 099/124] adding patch for LLVM 3.3 which increases performance after regression --- .../3_3_r172868-vmovups-vinsertf128.patch | 102 ++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 llvm_patches/3_3_r172868-vmovups-vinsertf128.patch diff --git a/llvm_patches/3_3_r172868-vmovups-vinsertf128.patch b/llvm_patches/3_3_r172868-vmovups-vinsertf128.patch new file mode 100644 index 00000000..36bb5572 --- /dev/null +++ b/llvm_patches/3_3_r172868-vmovups-vinsertf128.patch @@ -0,0 +1,102 @@ +This patch needs to be applied to LLVM 3.3 to fix performance regression after r172868 revision. +This regression is due to increased register pressure after revision causing spills in case of multiple loads +This regression is fixed in 3.4 but the changes in 3.4 is not back portable, +so we roll back r172868 to avoid regression with 3.3. + +Index: test/CodeGen/X86/sandybridge-loads.ll +=================================================================== +--- test/CodeGen/X86/sandybridge-loads.ll (revision 191082) ++++ test/CodeGen/X86/sandybridge-loads.ll (working copy) +@@ -1,24 +1,5 @@ + ; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s + +-;CHECK: wideloads +-;CHECK: vmovaps +-;CHECK: vinsertf128 +-;CHECK: vmovaps +-;CHECK-NOT: vinsertf128 +-;CHECK: ret +- +-define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp { +- %v0 = load <8 x float>* %a, align 16 ; <---- unaligned! +- %v1 = load <8 x float>* %b, align 32 ; <---- aligned! +- %m0 = fcmp olt <8 x float> %v1, %v0 +- %v2 = load <8 x float>* %c, align 32 ; <---- aligned! 
+- %m1 = fcmp olt <8 x float> %v2, %v0 +- %mand = and <8 x i1> %m1, %m0 +- %r = zext <8 x i1> %mand to <8 x i32> +- store <8 x i32> %r, <8 x i32>* undef, align 32 +- ret void +-} +- + ; CHECK: widestores + ; loads: + ; CHECK: vmovaps +Index: test/CodeGen/X86/v8i1-masks.ll +=================================================================== +--- test/CodeGen/X86/v8i1-masks.ll (revision 172868) ++++ test/CodeGen/X86/v8i1-masks.ll (revision 172866) +@@ -1,7 +1,7 @@ + ; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s + + ;CHECK: and_masks +-;CHECK: vmovaps ++;CHECK: vmovups + ;CHECK: vcmpltp + ;CHECK: vcmpltp + ;CHECK: vandps +Index: lib/Target/X86/X86ISelLowering.cpp +=================================================================== +--- lib/Target/X86/X86ISelLowering.cpp (revision 191077) ++++ lib/Target/X86/X86ISelLowering.cpp (working copy) +@@ -16756,42 +16756,9 @@ + EVT MemVT = Ld->getMemoryVT(); + DebugLoc dl = Ld->getDebugLoc(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); +- unsigned RegSz = RegVT.getSizeInBits(); + +- // On Sandybridge unaligned 256bit loads are inefficient. + ISD::LoadExtType Ext = Ld->getExtensionType(); +- unsigned Alignment = Ld->getAlignment(); +- bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8; +- if (RegVT.is256BitVector() && !Subtarget->hasInt256() && +- !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) { +- unsigned NumElems = RegVT.getVectorNumElements(); +- if (NumElems < 2) +- return SDValue(); + +- SDValue Ptr = Ld->getBasePtr(); +- SDValue Increment = DAG.getConstant(16, TLI.getPointerTy()); +- +- EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), +- NumElems/2); +- SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, +- Ld->getPointerInfo(), Ld->isVolatile(), +- Ld->isNonTemporal(), Ld->isInvariant(), +- Alignment); +- Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); +- SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, +- Ld->getPointerInfo(), Ld->isVolatile(), +- Ld->isNonTemporal(), Ld->isInvariant(), +- std::min(16U, Alignment)); +- SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, +- Load1.getValue(1), +- Load2.getValue(1)); +- +- SDValue NewVec = DAG.getUNDEF(RegVT); +- NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl); +- NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl); +- return DCI.CombineTo(N, NewVec, TF, true); +- } +- + // If this is a vector EXT Load then attempt to optimize it using a + // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the + // expansion is still better than scalar code. 
+@@ -16805,6 +16772,7 @@ + assert(MemVT.isVector() && "Must load a vector from memory"); + + unsigned NumElems = RegVT.getVectorNumElements(); ++ unsigned RegSz = RegVT.getSizeInBits(); + unsigned MemSz = MemVT.getSizeInBits(); + assert(RegSz > MemSz && "Register size must be greater than the mem size"); + From af5da885a56b32798f4c6dc94ccbbe60bc40b28e Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Fri, 20 Sep 2013 17:28:07 +0400 Subject: [PATCH 100/124] small corrections of test system --- .gitignore | 6 ++ alloy.py | 223 ++++++++++++++++++++++++++++++--------------------- check_env.py | 16 ++-- common.py | 23 +++--- perf.py | 12 +-- run_tests.py | 6 +- 6 files changed, 169 insertions(+), 117 deletions(-) diff --git a/.gitignore b/.gitignore index 88fb0197..429199bb 100644 --- a/.gitignore +++ b/.gitignore @@ -3,14 +3,20 @@ depend ispc ispc_test +ispc_ref objs docs/doxygen docs/*.html tests*/*cpp tests*/*run +logs/ +notify_log.log +alloy_results_* examples/*/*.png examples/*/*.ppm examples/*/objs/* +examples/*/ref +examples/*/test *.swp diff --git a/alloy.py b/alloy.py index 119874b8..31399a37 100755 --- a/alloy.py +++ b/alloy.py @@ -101,8 +101,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_v LLVM_BUILD="build-" + folder LLVM_BIN="bin-" + folder if os.path.exists(LLVM_BIN) and not force: - print_debug("You have folder " + LLVM_BIN + ". If you want to rebuild use --force\n", False, "") - exit(0) + error("you have folder " + LLVM_BIN + ".\nIf you want to rebuild use --force", 1) LLVM_BUILD_selfbuild = LLVM_BUILD + "_temp" LLVM_BIN_selfbuild = LLVM_BIN + "_temp" common.remove_if_exists(LLVM_SRC) @@ -188,26 +187,45 @@ def check_targets(): AVX = False; AVX11 = False; AVX2 = False; - cpu = open("/proc/cpuinfo") - f_lines = cpu.readlines() - cpu.close() - # check what native targets do we have - for i in range(0,len(f_lines)): - if SSE2 == False and "sse2" in f_lines[i]: + if current_OS == "Linux": + cpu = open("/proc/cpuinfo") + f_lines = cpu.readlines() + cpu.close() + # check what native targets do we have + for i in range(0,len(f_lines)): + if SSE2 == False and "sse2" in f_lines[i]: + SSE2 = True; + answer = answer + ["sse2-i32x4", "sse2-i32x8"] + if SSE4 == False and "sse4_1" in f_lines[i]: + SSE4 = True; + answer = answer + ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"] + if AVX == False and "avx" in f_lines[i]: + AVX = True; + answer = answer + ["avx1-i32x8", "avx1-i32x16"] + if AVX11 == False and "rdrand" in f_lines[i]: + AVX11 = True; + answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"] + if AVX2 == False and "avx2" in f_lines[i]: + AVX2 = True; + answer = answer + ["avx2-i32x8", "avx2-i32x16"] + if current_OS == "MacOS": + f_lines = take_lines("sysctl machdep.cpu.features", "first") + if "SSE2" in f_lines: SSE2 = True; answer = answer + ["sse2-i32x4", "sse2-i32x8"] - if SSE4 == False and "sse4_1" in f_lines[i]: + if "SSE4.1" in f_lines: SSE4 = True; answer = answer + ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"] - if AVX == False and "avx" in f_lines[i]: + if "AVX1.0" in f_lines: AVX = True; answer = answer + ["avx1-i32x8", "avx1-i32x16"] - if AVX11 == False and "rdrand" in f_lines[i]: + if "RDRAND" in f_lines: AVX11 = True; answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"] - if AVX2 == False and "avx2" in f_lines[i]: + if "AVX2.0" in f_lines: AVX2 = True; answer = answer + ["avx2-i32x8", "avx2-i32x16"] + answer = answer + ["generic-4", "generic-16", "generic-8", "generic-1", "generic-32", "generic-64"] # now check what 
targets we have with the help of SDE sde_exists = "" @@ -224,17 +242,14 @@ def check_targets(): "Please refer to http://www.intel.com/software/sde for SDE download information.", 2) return [answer, answer_sde] # here we have SDE - os.system(sde_exists + " -help > " + temp_alloy_file) - cpu = open(temp_alloy_file) - f_lines = cpu.readlines() - cpu.close() + f_lines = take_lines(sde_exists + " -help", "all") for i in range(0,len(f_lines)): if SSE4 == False and "wsm" in f_lines[i]: answer_sde = answer_sde + [["-wsm", "sse4-i32x4"], ["-wsm", "sse4-i32x8"], ["-wsm", "sse4-i16x8"], ["-wsm", "sse4-i8x16"]] if AVX == False and "snb" in f_lines[i]: answer_sde = answer_sde + [["-snb", "avx1-i32x8"], ["-snb", "avx1-i32x16"]] if AVX11 == False and "ivb" in f_lines[i]: - answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["ivb", "avx1.1-i32x16"]] + answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["-ivb", "avx1.1-i32x16"]] if AVX2 == False and "hsw" in f_lines[i]: answer_sde = answer_sde + [["-hsw", "avx2-i32x8"], ["-hsw", "avx2-i32x16"]] return [answer, answer_sde] @@ -271,14 +286,11 @@ def execute_stability(stability, R, print_version): def run_special_tests(): i = 5 -def validation_run(only, only_targets, reference_branch, notify, update): - current_path = os.getcwd() +def validation_run(only, only_targets, reference_branch, number, notify, update): os.chdir(os.environ["ISPC_HOME"]) os.environ["PATH"] = os.environ["ISPC_HOME"] + ":" + os.environ["PATH"] if options.notify != "": - if os.environ.get("SMTP_ISPC") == None: - error("you have no SMTP_ISPC in your environment for option notify", 1) - common.remove_if_exists(os.environ["ISPC_HOME"] + os.sep + "all_answer.txt") + common.remove_if_exists(os.environ["ISPC_HOME"] + os.sep + "notify_log.log") smtp_server = os.environ["SMTP_ISPC"] msg = MIMEMultipart() msg['Subject'] = 'ISPC test system results' @@ -437,7 +449,7 @@ def validation_run(only, only_targets, reference_branch, notify, update): print_debug("\n\nPerformance validation run\n\n", False, "") performance = options_for_drivers() # performance constant options - performance.number = 5 + performance.number = number performance.config = "./perf.ini" performance.path = "./" performance.silent = True @@ -450,16 +462,13 @@ def validation_run(only, only_targets, reference_branch, notify, update): if len(need_LLVM) != 0: build_LLVM(need_LLVM[i], "", "", "", False, False, True, False) # prepare reference point. build both test and reference compilers - os.system("git branch > " + temp_alloy_file) - br = open(temp_alloy_file) - temp4 = br.readlines() - br.close() + temp4 = take_lines("git branch", "all") for line in temp4: if "*" in line: current_branch = line[2:-1] stashing = True sys.stdout.write("Please, don't interrupt script here! You can have not sync git status after interruption!\n") - if "No local changes" in detect_version("git stash"): + if "No local changes" in take_lines("git stash", "first"): stashing = False #try_do_LLVM("stash current branch ", "git stash", True) try_do_LLVM("checkout reference branch " + reference_branch + " ", "git checkout " + reference_branch, True) @@ -478,11 +487,9 @@ def validation_run(only, only_targets, reference_branch, notify, update): attach_mail_file(msg, performance.in_file, "performance.log") attach_mail_file(msg, "." 
+ os.sep + "logs" + os.sep + "perf_build.log", "perf_build.log") - print_debug("Logs are in alloy_results_[date]", False, "") - # sending e-mail with results if options.notify != "": - fp = open(os.environ["ISPC_HOME"] + os.sep + "all_answer.txt", 'rb') + fp = open(os.environ["ISPC_HOME"] + os.sep + "notify_log.log", 'rb') f_lines = fp.readlines() fp.close() line = "" @@ -495,46 +502,56 @@ def validation_run(only, only_targets, reference_branch, notify, update): s = smtplib.SMTP(smtp_server) s.sendmail('ISPC_test_system', options.notify, msg.as_string()) s.quit() -# exit of validation routine - common.remove_if_exists(temp_alloy_file) - os.chdir(current_path) def Main(): + global current_OS if (platform.system() == 'Windows' or 'CYGWIN_NT' in platform.system()) == True: + current_OS = "Windows" error("Windows isn't supported now", 1) - if (options.build_llvm == False and - options.validation_run == False and - options.llvm_home == "" and - options.ispc_home == "" and - options.sde_home == ""): + else: + if (platform.system() == 'Darwin'): + current_OS = "MacOS" + else: + current_OS = "Linux" + + if (options.build_llvm == False and options.validation_run == False): parser.print_help() exit(0) - global f_date - f_date = "logs" - common.remove_if_exists(f_date) - os.makedirs(f_date) - global temp_alloy_file - temp_alloy_file = os.getcwd() + os.sep + f_date + os.sep + "temp_detect_version" - global alloy_build - alloy_build = os.getcwd() + os.sep + f_date + os.sep + "alloy_build.log" - common.remove_if_exists(alloy_build) - global stability_log - stability_log = os.getcwd() + os.sep + f_date + os.sep + "stability.log" - common.remove_if_exists(stability_log) + setting_paths(options.llvm_home, options.ispc_home, options.sde_home) if os.environ.get("LLVM_HOME") == None: error("you have no LLVM_HOME", 1) if os.environ.get("ISPC_HOME") == None: error("you have no ISPC_HOME", 1) - if options.build_llvm: - build_LLVM(options.version, options.revision, options.folder, options.tarball, + if options.notify != "": + if os.environ.get("SMTP_ISPC") == None: + error("you have no SMTP_ISPC in your environment for option notify", 1) + + global f_date + f_date = "logs" + common.remove_if_exists(f_date) + os.makedirs(f_date) + global alloy_build + alloy_build = os.getcwd() + os.sep + f_date + os.sep + "alloy_build.log" + global stability_log + stability_log = os.getcwd() + os.sep + f_date + os.sep + "stability.log" + current_path = os.getcwd() + try: + if options.build_llvm: + build_LLVM(options.version, options.revision, options.folder, options.tarball, options.debug, options.selfbuild, False, options.force) - if options.validation_run: - validation_run(options.only, options.only_targets, options.branch, options.notify, options.update) - os.rename(f_date, "alloy_results_" + datetime.datetime.now().strftime('%H_%M_%d_%m_%Y')) + if options.validation_run: + validation_run(options.only, options.only_targets, options.branch, + options.number_for_performance, options.notify, options.update) + finally: + os.chdir(current_path) + date_name = "alloy_results_" + datetime.datetime.now().strftime('%H_%M_%d_%m_%Y') + os.rename(f_date, date_name) + print_debug("Logs are in " + date_name + "\n", False, "") ###Main### from optparse import OptionParser +from optparse import OptionGroup import sys import os import operator @@ -554,47 +571,73 @@ import run_tests import perf import common error = common.error -detect_version = common.detect_version +take_lines = common.take_lines print_debug = common.print_debug # parsing 
options -parser = OptionParser() -# options for activity "build LLVM" +class MyParser(OptionParser): + def format_epilog(self, formatter): + return self.epilog +examples = ("Examples:\n" + +"Load and build LLVM from trunk\n\talloy.py -b\n" + +"Load and build LLVM 3.3. Rewrite LLVM folders\n\talloy.py -b --version=3.3 --force\n" + +"Untar files llvm.tgz clang.tgz, build LLVM from them in folder bin-from_tar\n\talloy.py -b --tarball='llvm.tgz clang.tgz' --folder=from_tar\n" + +"Load LLVM from trunk, revision r172870. Build it. Do selfbuild\n\talloy.py -b --revision=r172870 --selfbuild\n" + +"Validation run with LLVM 3.3, trunk; x86, x86-64; -O2;\nall supported targets; performance\n\talloy.py -r\n" + +"Validation run with all avx targets and sse4-i8x16 without performance\n\talloy.py -r --only=stability --only-targets='avx sse4-i8x16'\n" + +"Validation run with avx2-i32x8, all sse4 and sse2 targets\nand all targets with i32x16\n\talloy.py -r --only-targets='avx2-i32x8 sse4 i32x16 sse2'\n" + +"Stability validation run with LLVM 3.2, 3.3; -O0; x86,\nupdate fail_db.txt with passes and fails\n\talloy.py -r --only='3.2 -O0 stability 3.3 x86' --update-errors=FP\n" + +"Try to build compiler with all LLVM\n\talloy.py -r --only=build\n" + +"Performance validation run with 10 runs of each test and comparing to branch 'old'\n\talloy.py -r --only=performance --compare-with=old --number=10\n" + +"Validation run. Update fail_db.txt with new fails, send results to my@my.com\n\talloy.py -r --update-errors=F --notify='my@my.com'\n") +parser = MyParser(usage="Usage: alloy.py -r/-b [options]", epilog=examples) parser.add_option('-b', '--build-llvm', dest='build_llvm', help='ask to build LLVM', default=False, action="store_true") -parser.add_option('--version', dest='version', - help='version of llvm to build: 3.1 3.2 3.3 trunk', default="trunk") -parser.add_option('--revision', dest='revision', - help='revision of llvm to build in format r172870', default="") -parser.add_option('--debug', dest='debug', - help='debug build of LLVM?', default=False, action="store_true") -parser.add_option('--folder', dest='folder', - help='folder to build LLVM in', default="") -parser.add_option('--tarball', dest='tarball', - help='"llvm_tarball clang_tarball"', default="") -parser.add_option('--selfbuild', dest='selfbuild', - help='make selfbuild of LLVM and clang', default=False, action="store_true") -parser.add_option('--force', dest='force', - help='rebuild LLVM', default=False, action='store_true') -# options for activity "setup PATHS" -parser.add_option('--llvm_home', dest='llvm_home',help='path to LLVM',default="") -parser.add_option('--ispc_home', dest='ispc_home',help='path to ISPC',default="") -parser.add_option('--sde_home', dest='sde_home',help='path to SDE',default="") -# options for activity "validation run" parser.add_option('-r', '--run', dest='validation_run', help='ask for validation run', default=False, action="store_true") -parser.add_option('--compare-with', dest='branch', - help='set performance reference point', default="master") -parser.add_option('--only-targets', dest='only_targets', - help='set list of targets to test. 
Possible values - all subnames of targets.\n' + - 'Example: --only-targets="avx2-i32x8 sse4 i32x16 sse2"', default="") -parser.add_option('--notify', dest='notify', - help='sent results to email', default="") -parser.add_option('--only', dest='only', +# options for activity "build LLVM" +llvm_group = OptionGroup(parser, "Options for building LLVM", + "These options must be used with -b option.") +llvm_group.add_option('--version', dest='version', + help='version of llvm to build: 3.1 3.2 3.3 trunk. Default: trunk', default="trunk") +llvm_group.add_option('--revision', dest='revision', + help='revision of llvm to build in format r172870', default="") +llvm_group.add_option('--debug', dest='debug', + help='debug build of LLVM?', default=False, action="store_true") +llvm_group.add_option('--folder', dest='folder', + help='folder to build LLVM in', default="") +llvm_group.add_option('--tarball', dest='tarball', + help='"llvm_tarball clang_tarball"', default="") +llvm_group.add_option('--selfbuild', dest='selfbuild', + help='make selfbuild of LLVM and clang', default=False, action="store_true") +llvm_group.add_option('--force', dest='force', + help='rebuild LLVM', default=False, action='store_true') +parser.add_option_group(llvm_group) +# options for activity "validation run" +run_group = OptionGroup(parser, "Options for validation run", + "These options must be used with -r option.") +run_group.add_option('--compare-with', dest='branch', + help='set performance reference point. Dafault: master', default="master") +run_group.add_option('--number', dest='number_for_performance', + help='number of performance runs for each test. Default: 5', default=5) +run_group.add_option('--notify', dest='notify', + help='email to sent results to', default="") +run_group.add_option('--update-errors', dest='update', + help='rewrite fail_db.txt file according to received results (F or FP)', default="") +run_group.add_option('--only-targets', dest='only_targets', + help='set list of targets to test. Possible values - all subnames of targets.', + default="") +run_group.add_option('--only', dest='only', help='set types of tests. 
Possible values:\n' + '-O0, -O2, x86, x86-64, stability (test only stability), performance (test only performance)\n' + - 'build (only build with different LLVM), 3.1, 3.2, 3.3, trunk, native (do not use SDE), current (do not rebuild ISPC).\n' + - 'Example: --only="3.2 -O0 stability 3.3"', default="") -parser.add_option('--update-errors', dest='update', - help='rewrite fail_db.txt file according to received results (F or FP)', default="") + 'build (only build with different LLVM), 3.1, 3.2, 3.3, trunk, native (do not use SDE), current (do not rebuild ISPC).', + default="") +parser.add_option_group(run_group) +# options for activity "setup PATHS" +setup_group = OptionGroup(parser, "Options for setup", + "These options must be use with -r or -b to setup environment variables") +setup_group.add_option('--llvm_home', dest='llvm_home',help='path to LLVM',default="") +setup_group.add_option('--ispc_home', dest='ispc_home',help='path to ISPC',default="") +setup_group.add_option('--sde_home', dest='sde_home',help='path to SDE',default="") +parser.add_option_group(setup_group) (options, args) = parser.parse_args() Main() diff --git a/check_env.py b/check_env.py index 98deb235..8c90d895 100755 --- a/check_env.py +++ b/check_env.py @@ -39,7 +39,7 @@ import os import string print_debug = common.print_debug error = common.error -detect_version = common.detect_version +take_lines = common.take_lines exists = [False, False, False, False, False, False, False, False] names = ["m4", "bison", "flex", "sde", "ispc", "clang", "gcc", "icc"] @@ -54,26 +54,26 @@ print_debug("=== in PATH: ===\n", False, "") print_debug("Tools:\n", False, "") for i in range(0,3): if exists[i]: - print_debug(detect_version(names[i] + " --version"), False, "") + print_debug(take_lines(names[i] + " --version", "first"), False, "") else: error("you don't have " + names[i], 0) if exists[0] and exists[1] and exists[2]: if common.check_tools(2): - print_debug("versions are ok\n", False, "") + print_debug("Tools' versions are ok\n", False, "") print_debug("\nSDE:\n", False, "") if exists[3]: - print_debug(detect_version(names[3] + " --version"), False, "") + print_debug(take_lines(names[3] + " --version", "first"), False, "") else: error("you don't have " + names[3], 2) print_debug("\nISPC:\n", False, "") if exists[4]: - print_debug(detect_version(names[4] + " --version"), False, "") + print_debug(take_lines(names[4] + " --version", "first"), False, "") else: error("you don't have " + names[4], 2) print_debug("\nC/C++ compilers:\n", False, "") for i in range(5,8): if exists[i]: - print_debug(detect_version(names[i] + " --version"), False, "") + print_debug(take_lines(names[i] + " --version", "first"), False, "") else: error("you don't have " + names[i], 2) @@ -88,7 +88,7 @@ else: print_debug("Your ISPC_HOME:" + os.environ.get("ISPC_HOME") + "\n", False, "") if os.path.exists(os.environ.get("ISPC_HOME") + os.sep + "ispc"): print_debug("You have ISPC in your ISPC_HOME: " + - detect_version(os.environ.get("ISPC_HOME") + os.sep + "ispc" + " --version"), False, "") + take_lines(os.environ.get("ISPC_HOME") + os.sep + "ispc" + " --version", "first"), False, "") else: error("you don't have ISPC in your ISPC_HOME", 2) if os.environ.get("SDE_HOME") == None: @@ -97,6 +97,6 @@ else: print_debug("Your SDE_HOME:" + os.environ.get("SDE_HOME") + "\n", False, "") if os.path.exists(os.environ.get("SDE_HOME") + os.sep + "sde"): print_debug("You have sde in your SDE_HOME: " + - detect_version(os.environ.get("SDE_HOME") + os.sep + "sde" + " --version"), False, 
"") + take_lines(os.environ.get("SDE_HOME") + os.sep + "sde" + " --version", "first"), False, "") else: error("you don't have any SDE in your ISPC_HOME", 2) diff --git a/common.py b/common.py index dd8fb388..19d09e4d 100644 --- a/common.py +++ b/common.py @@ -50,21 +50,24 @@ def remove_if_exists(filename): os.remove(filename) # detect version which is printed after command -def detect_version(command): +def take_lines(command, which): os.system(command + " > " + "temp_detect_version") version = open("temp_detect_version") - answer = version.readline() + if which == "first": + answer = version.readline() + if which == "all": + answer = version.readlines() version.close() remove_if_exists("temp_detect_version") return answer # print versions of compilers def print_version(ispc_test, ispc_ref, ref_compiler, s, perf_log, is_windows): - print_debug("\nUsing test compiler: " + detect_version(ispc_test + " --version"), s, perf_log) + print_debug("\nUsing test compiler: " + take_lines(ispc_test + " --version", "first"), s, perf_log) if ispc_ref != "": - print_debug("Using ref compiler: " + detect_version(ispc_ref + " --version"), s, perf_log) + print_debug("Using ref compiler: " + take_lines(ispc_ref + " --version", "first"), s, perf_log) if is_windows == False: - temp1 = detect_version(ref_compiler + " --version") + temp1 = take_lines(ref_compiler + " --version", "first") else: os.system(ref_compiler + " 2>&1" + " 2> temp_detect_version > temp_detect_version1" ) version = open("temp_detect_version") @@ -80,7 +83,7 @@ def print_debug(line, silent, filename): sys.stdout.write(line) sys.stdout.flush() if os.environ.get("ISPC_HOME") != None: - write_to_file(os.environ["ISPC_HOME"] + os.sep + "all_answer.txt", line) + write_to_file(os.environ["ISPC_HOME"] + os.sep + "notify_log.log", line) if filename != "": write_to_file(filename, line) @@ -102,9 +105,9 @@ def check_tools(m): input_tools=[[[1,4],"m4 --version", "bad m4 version"], [[2,4],"bison --version", "bad bison version"], [[2,5], "flex --version", "bad flex version"]] - + ret = 1 for t in range(0,len(input_tools)): - t1 = ((detect_version(input_tools[t][1]))[:-1].split(" ")) + t1 = ((take_lines(input_tools[t][1], "first"))[:-1].split(" ")) for i in range(0,len(t1)): t11 = t1[i].split(".") f = True @@ -116,5 +119,5 @@ def check_tools(m): if j < len(input_tools[t][0]): if int(t11[j]) 1: print_debug(" <+", False, "") + if p1 > 1: + print_debug(" <-", False, "") print_debug("\n", False, "") print_debug("\n", False, "") @@ -261,11 +261,11 @@ def compare(A, B): p2 = 0 else: p2 = 100 - 100 * A[4][i]/B[4][i] - print_debug("%21s: %10.2f %10.2f %10.2f" % (A[0][i], A[4][i], B[4][i], p2), False, "") + print_debug("%21s: %10.2f %10.2f %10.2f" % (A[0][i], A[4][i], B[4][i], abs(p2)), False, "") if p2 < -1: - print_debug(" <-", False, "") - if p2 > 1: print_debug(" <+", False, "") + if p2 > 1: + print_debug(" <-", False, "") print_debug("\n", False, "") if "performance.log" in options.in_file: print_debug("\n\n_________________Watch performance.log for details________________\n", False, "") diff --git a/run_tests.py b/run_tests.py index 2471b6cb..914f22a7 100755 --- a/run_tests.py +++ b/run_tests.py @@ -364,11 +364,11 @@ def file_check(compfails, runfails): else: opt = "-O2" # Detect LLVM version - temp1 = common.detect_version(ispc_exe + " --version") + temp1 = common.take_lines(ispc_exe + " --version", "first") llvm_version = temp1[-10:-2] -#Detect compiler version +# Detect compiler version if is_windows == False: - temp1 = 
common.detect_version(options.compiler_exe + " --version") + temp1 = common.take_lines(options.compiler_exe + " --version", "first") temp2 = temp1.split(" ") compiler_version = temp2[0] + temp2[2][0:4] else: From 1c858c34f795c1b2fb29d9c07ae5c448dab287a0 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Tue, 24 Sep 2013 17:37:39 +0400 Subject: [PATCH 101/124] correction of test system --- alloy.py | 53 +++++++++++++++++++++++++++++----------------- examples/common.mk | 2 +- perf.py | 10 ++++----- run_tests.py | 12 +++++++---- 4 files changed, 47 insertions(+), 30 deletions(-) diff --git a/alloy.py b/alloy.py index 31399a37..7ae972b4 100755 --- a/alloy.py +++ b/alloy.py @@ -70,7 +70,7 @@ def try_do_LLVM(text, command, from_validation): error("can't " + text, 1) print_debug("DONE.\n", from_validation, alloy_build) -def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_validation, force): +def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_validation, force, make): print_debug("Building LLVM. Version: " + version_LLVM + ". ", from_validation, alloy_build) if revision != "": print_debug("Revision: " + revision + ".\n", from_validation, alloy_build) @@ -100,7 +100,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_v LLVM_SRC="llvm-" + folder LLVM_BUILD="build-" + folder LLVM_BIN="bin-" + folder - if os.path.exists(LLVM_BIN) and not force: + if os.path.exists(LLVM_BIN + os.sep + "bin") and not force: error("you have folder " + LLVM_BIN + ".\nIf you want to rebuild use --force", 1) LLVM_BUILD_selfbuild = LLVM_BUILD + "_temp" LLVM_BIN_selfbuild = LLVM_BIN + "_temp" @@ -110,7 +110,6 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_v if selfbuild: common.remove_if_exists(LLVM_BUILD_selfbuild) common.remove_if_exists(LLVM_BIN_selfbuild) - MAKE = "gmake" print_debug("Using folders: " + LLVM_SRC + " " + LLVM_BUILD + " " + LLVM_BIN + " in " + llvm_home + "\n", from_validation, alloy_build) # load llvm @@ -156,9 +155,9 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_v LLVM_BIN_selfbuild + " --enable-optimized", from_validation) try_do_LLVM("build release version for selfbuild ", - MAKE + " -j32", from_validation) + make, from_validation) try_do_LLVM("install release version for selfbuild ", - MAKE + " install", + "make install", from_validation) os.chdir("../") selfbuild_compiler = " CC="+llvm_home+ "/" + LLVM_BIN_selfbuild + "/bin/clang" @@ -175,8 +174,8 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, from_v " --enable-debug-runtime --enable-debug-symbols --enable-keep-symbols" + selfbuild_compiler, from_validation) # building llvm - try_do_LLVM("build LLVM ", MAKE + " -j32", from_validation) - try_do_LLVM("install LLVM ", MAKE + " install", from_validation) + try_do_LLVM("build LLVM ", make, from_validation) + try_do_LLVM("install LLVM ", "make install", from_validation) os.chdir(current_path) def check_targets(): @@ -254,13 +253,13 @@ def check_targets(): answer_sde = answer_sde + [["-hsw", "avx2-i32x8"], ["-hsw", "avx2-i32x16"]] return [answer, answer_sde] -def build_ispc(version_LLVM): +def build_ispc(version_LLVM, make): current_path = os.getcwd() os.chdir(os.environ["ISPC_HOME"]) p_temp = os.getenv("PATH") os.environ["PATH"] = os.environ["LLVM_HOME"] + "/bin-" + version_LLVM + "/bin:" + os.environ["PATH"] - os.system("make clean >> " + alloy_build) - try_do_LLVM("build ISPC with LLVM version " + version_LLVM + " ", 
"make -j32", True) + try_do_LLVM("clean ISPC for building", "make clean", True) + try_do_LLVM("build ISPC with LLVM version " + version_LLVM + " ", make, True) os.environ["PATH"] = p_temp os.chdir(current_path) @@ -286,7 +285,7 @@ def execute_stability(stability, R, print_version): def run_special_tests(): i = 5 -def validation_run(only, only_targets, reference_branch, number, notify, update): +def validation_run(only, only_targets, reference_branch, number, notify, update, make): os.chdir(os.environ["ISPC_HOME"]) os.environ["PATH"] = os.environ["ISPC_HOME"] + ":" + os.environ["PATH"] if options.notify != "": @@ -327,7 +326,6 @@ def validation_run(only, only_targets, reference_branch, number, notify, update) stability.no_opt = False stability.wrapexe = "" # prepare parameters of run - common.check_tools(1) [targets_t, sde_targets_t] = check_targets() rebuild = True opts = [] @@ -352,6 +350,8 @@ def validation_run(only, only_targets, reference_branch, number, notify, update) if "current" in only: LLVM = [" "] rebuild = False + else: + common.check_tools(1) if only_targets != "": only_targets_t = only_targets.split(" ") for i in only_targets_t: @@ -383,7 +383,7 @@ def validation_run(only, only_targets, reference_branch, number, notify, update) gen_archs = ["x86-64"] need_LLVM = check_LLVM(LLVM) for i in range(0,len(need_LLVM)): - build_LLVM(need_LLVM[i], "", "", "", False, False, True, False) + build_LLVM(need_LLVM[i], "", "", "", False, False, True, False, make) # begin validation run for stabitily common.remove_if_exists(stability.in_file) R = [[[],[]],[[],[]],[[],[]],[[],[]]] @@ -391,7 +391,7 @@ def validation_run(only, only_targets, reference_branch, number, notify, update) for i in range(0,len(LLVM)): print_version = 2 if rebuild: - build_ispc(LLVM[i]) + build_ispc(LLVM[i], make) for j in range(0,len(targets)): stability.target = targets[j] stability.wrapexe = "" @@ -447,6 +447,7 @@ def validation_run(only, only_targets, reference_branch, number, notify, update) # *** *** *** if ((("performance" in only) == True) or ("stability" in only) == False): print_debug("\n\nPerformance validation run\n\n", False, "") + common.check_tools(1) performance = options_for_drivers() # performance constant options performance.number = number @@ -460,8 +461,9 @@ def validation_run(only, only_targets, reference_branch, number, notify, update) # prepare LLVM 3.3 as newest LLVM need_LLVM = check_LLVM(["3.3"]) if len(need_LLVM) != 0: - build_LLVM(need_LLVM[i], "", "", "", False, False, True, False) + build_LLVM(need_LLVM[i], "", "", "", False, False, True, False, make) # prepare reference point. build both test and reference compilers + try_do_LLVM("apply git", "git branch", True) temp4 = take_lines("git branch", "all") for line in temp4: if "*" in line: @@ -473,14 +475,14 @@ def validation_run(only, only_targets, reference_branch, number, notify, update) #try_do_LLVM("stash current branch ", "git stash", True) try_do_LLVM("checkout reference branch " + reference_branch + " ", "git checkout " + reference_branch, True) sys.stdout.write(".\n") - build_ispc("3.3") + build_ispc("3.3", make) sys.stdout.write(".\n") os.rename("ispc", "ispc_ref") try_do_LLVM("checkout test branch " + current_branch + " ", "git checkout " + current_branch, True) if stashing: try_do_LLVM("return current branch ", "git stash pop", True) sys.stdout.write("You can interrupt script now.\n") - build_ispc("3.3") + build_ispc("3.3", make) # begin validation run for performance. 
output is inserted into perf() perf.perf(performance, []) if options.notify != "": @@ -526,6 +528,12 @@ def Main(): if options.notify != "": if os.environ.get("SMTP_ISPC") == None: error("you have no SMTP_ISPC in your environment for option notify", 1) + if options.only != "": + test_only_r = " 3.1 3.2 3.3 trunk current build stability performance x86 x86-64 -O0 -O2 native " + test_only = options.only.split(" ") + for iterator in test_only: + if not (" " + iterator + " " in test_only_r): + error("unknow option for only: " + iterator, 1) global f_date f_date = "logs" @@ -536,16 +544,19 @@ def Main(): global stability_log stability_log = os.getcwd() + os.sep + f_date + os.sep + "stability.log" current_path = os.getcwd() + make = "make -j" + options.speed try: if options.build_llvm: build_LLVM(options.version, options.revision, options.folder, options.tarball, - options.debug, options.selfbuild, False, options.force) + options.debug, options.selfbuild, False, options.force, make) if options.validation_run: validation_run(options.only, options.only_targets, options.branch, - options.number_for_performance, options.notify, options.update) + options.number_for_performance, options.notify, options.update, make) finally: os.chdir(current_path) - date_name = "alloy_results_" + datetime.datetime.now().strftime('%H_%M_%d_%m_%Y') + date_name = "alloy_results_" + datetime.datetime.now().strftime('%d_%m_%Y_%H_%M_%S') + if os.path.exists(date_name): + error("It's forbidden to run alloy two times in a second, logs are in ./logs", 1) os.rename(f_date, date_name) print_debug("Logs are in " + date_name + "\n", False, "") @@ -594,6 +605,8 @@ parser.add_option('-b', '--build-llvm', dest='build_llvm', help='ask to build LLVM', default=False, action="store_true") parser.add_option('-r', '--run', dest='validation_run', help='ask for validation run', default=False, action="store_true") +parser.add_option('-j', dest='speed', + help='set -j for make', default="8") # options for activity "build LLVM" llvm_group = OptionGroup(parser, "Options for building LLVM", "These options must be used with -b option.") diff --git a/examples/common.mk b/examples/common.mk index cdfc4c6a..95ec7ccb 100644 --- a/examples/common.mk +++ b/examples/common.mk @@ -44,7 +44,7 @@ dirs: objs/%.cpp objs/%.o objs/%.h: dirs clean: - /bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 + /bin/rm -rf objs *~ $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 ref test $(EXAMPLE): $(OBJS) $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) diff --git a/perf.py b/perf.py index b33e1f25..576a5c7d 100755 --- a/perf.py +++ b/perf.py @@ -190,7 +190,7 @@ def print_answer(answer): filelist = [] print_debug("--------------------------------------------------------------------------\n", s, perf_log) print_debug("test name:\t ISPC speedup: ISPC + tasks speedup: | " + - "ISPC time: ISPC + tasks time: serial:\n", s, perf_log) + " ISPC time: ISPC + tasks time: serial:\n", s, perf_log) filelist.append("test name,ISPC speedup,diff," + "ISPC + tasks speedup,diff,ISPC time,diff,ISPC + tasks time,diff,serial,diff\n") max_t = [0,0,0,0,0] @@ -215,9 +215,9 @@ def print_answer(answer): list_of_max[t-1].append(mm) diff_t[t-1] = '%.2f' % (max(answer[i][t]) - min(answer[i][t])) print_debug("%s:\n" % answer[i][0], s, perf_log) - print_debug("\t\tmax:\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % + print_debug("\t\tmax:\t%5s\t\t%10s\t|min:%10s\t%10s\t%10s\n" % (max_t[0], max_t[1], max_t[2], max_t[3], max_t[4]), s, perf_log) - print_debug("\t\tdiff:\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % + 
print_debug("\t\tdiff:\t%5s\t\t%10s\t|%14s\t%10s\t%10s\n" % (diff_t[0], diff_t[1], diff_t[2], diff_t[3], diff_t[4]), s, perf_log) for t in range(0,5): if max_t[t] == "n/a": @@ -231,7 +231,7 @@ def print_answer(answer): for i in range(0,5): geomean_t[i] = geomean(list_of_max[i]) print_debug("---------------------------------------------------------------------------------\n", s, perf_log) - print_debug("Geomean:\t\t%5s\t\t%10s\t|%10s\t%10s\t%10s\n" % + print_debug("Geomean:\t\t%5s\t\t%10s\t|%14s\t%10s\t%10s\n" % (geomean_t[0], geomean_t[1], geomean_t[2], geomean_t[3], geomean_t[4]), s, perf_log) filelist.append("Geomean," + str(geomean_t[0]) + ",," + str(geomean_t[1]) + ",," + str(geomean_t[2]) + ",," + str(geomean_t[3]) + ",," + str(geomean_t[4]) + "\n") @@ -474,7 +474,7 @@ if __name__ == "__main__": parser.add_option('-c', '--config', dest='config', help='config file of tests', default="./perf.ini") parser.add_option('-p', '--path', dest='path', - help='path to test_system directory', default=".") + help='path to ispc root', default=".") parser.add_option('-s', '--silent', dest='silent', help='silent mode, only table output', default=False, action="store_true") parser.add_option('-o', '--output', dest='output', diff --git a/run_tests.py b/run_tests.py index 914f22a7..abc9b656 100755 --- a/run_tests.py +++ b/run_tests.py @@ -369,8 +369,12 @@ def file_check(compfails, runfails): # Detect compiler version if is_windows == False: temp1 = common.take_lines(options.compiler_exe + " --version", "first") - temp2 = temp1.split(" ") - compiler_version = temp2[0] + temp2[2][0:4] + temp2 = re.search("[0-9]*\.[0-9]*\.[0-9]", temp1) + if temp2 == None: + temp3 = re.search("[0-9]*\.[0-9]*", temp1) + else: + temp3 = re.search("[0-9]*\.[0-9]*", temp2.group()) + compiler_version = options.compiler_exe + temp3.group() else: compiler_version = "cl" new_line = " "+options.arch.rjust(6)+" "+options.target.rjust(14)+" "+OS.rjust(7)+" "+llvm_version+" "+compiler_version.rjust(10)+" "+opt+" *\n" @@ -464,7 +468,7 @@ def run_tests(options1, args, print_version): global s s = options.silent - # prepare run_tests_log and test_states files + # prepare run_tests_log and fail_db files global run_tests_log if options.in_file: run_tests_log = os.getcwd() + os.sep + options.in_file @@ -715,7 +719,7 @@ if __name__ == "__main__": default=False, action="store_true") parser.add_option('--non-interactive', dest='non_interactive', help='Disable interactive status updates', default=False, action="store_true") - parser.add_option('-u', "--update", dest='update', help='Update file with fails (F of FP)', default="") + parser.add_option('-u', "--update-errors", dest='update', help='Update file with fails (F of FP)', default="") parser.add_option('-s', "--silent", dest='silent', help='enable silent mode without any output', default=False, action = "store_true") parser.add_option("--file", dest='in_file', help='file to save run_tests output', default="") From dfc723bc1958f39d4526897fdfd5173a936c09f7 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Mon, 23 Sep 2013 21:35:33 +0400 Subject: [PATCH 102/124] Add fails with gcc 4.4 on Linux --- fail_db.txt | 326 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 326 insertions(+) diff --git a/fail_db.txt b/fail_db.txt index 7adc3e41..23a6c8ca 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -1 +1,327 @@ % List of known fails +./tests/masked-scatter-vector.ispc runfail x86-64 sse2-i32x4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i16x8 Linux LLVM 
3.3 g++4.4 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-11.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-11.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-uint16.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-uint8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-up-uint16.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-up-uint8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/broadcast-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * 
+./tests/broadcast-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/count-leading-trailing-zeros-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/count-leading-trailing-zeros-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/exclusive-scan-add-10.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/exclusive-scan-add-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/exclusive-scan-add-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/exclusive-scan-and-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/exclusive-scan-or-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-uniform-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-uniform-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-uniform-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-varying-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-varying-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-varying-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-varying-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/half-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/idiv.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/int64-max-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/int64-max.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/int64-min-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/int64-min.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-11.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-12.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-13.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/local-atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/new-delete-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/phi-opts-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/phi-opts-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/popcnt-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/popcnt-2.ispc 
runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/popcnt-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-add-int16-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-add-int16.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-10.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-12.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-13.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/reduce-equal-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/rotate-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/rotate-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/rotate-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/rotate-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/rotate-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/rotate.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/short-vec-14.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle-flatten.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-11.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/shuffle2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/soa-27.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/soa-28.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-128.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-129.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-130.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-57.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/uint64-max-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/uint64-max.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/uint64-min-1.ispc runfail x86-64 generic-4 
Linux LLVM 3.3 g++4.4 -O2 * +./tests/uint64-min.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 g++4.4 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-gather-ifs.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-gather-multi-unif.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-gather-unif.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-mixed-unif-vary-indexing-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-mixed-unif-vary-indexing-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-mixed-unif-vary-indexing.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-multidim-gather.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-scatter-unif-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-scatter-vary.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/array-struct-gather.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-down-uint16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-down-uint8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-up-uint16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-up-uint8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/broadcast-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * 
+./tests/broadcast-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/cfor-array-gather-ifs.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/cfor-array-gather-unif.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/cfor-array-multidim-gather.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/cfor-array-struct-gather.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/cfor-struct-test-114.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/cfor-unif-struct-test-114.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/count-leading-trailing-zeros-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/count-leading-trailing-zeros-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/exclusive-scan-add-10.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/exclusive-scan-add-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/exclusive-scan-add-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/exclusive-scan-and-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/exclusive-scan-or-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-uniform-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-uniform-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-uniform-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-varying-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-varying-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-varying-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/funcptr-varying-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/gather-int16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/gather-to-vload-neg-offset.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/global-array-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/half-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/idiv.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/int64-max-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/int64-max.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/int64-min-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/int64-min.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-11.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-12.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-13.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 
g++4.4 -O2 * +./tests/local-atomics-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/masked-scatter-struct.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/masked-scatter-vector.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/nested-structs-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/new-delete-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/pass-varying-lvalue-to-ref.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/phi-opts-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/phi-opts-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/popcnt-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/popcnt-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/popcnt-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-add-int16-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-add-int16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-10.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-12.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-13.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/rotate-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/rotate-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/rotate-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/rotate-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/rotate-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/rotate.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/scatter-int16-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/scatter-int16.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/scatter-mask-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/scatter-mask-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/short-vec-12.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/short-vec-14.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle-flatten.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-1.ispc runfail 
x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-11.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/shuffle2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/soa-28.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/struct-test-114.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-128.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-129.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-130.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-57.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/uint64-max-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/uint64-max.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/uint64-min-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/uint64-min.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/unif-struct-test-114.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/varying-struct-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/varying-struct-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/varying-struct-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/write-same-loc.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-11.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * 
+./tests/local-atomics-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-8.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-swap.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/local-atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/memset-varying.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-1.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/reduce-equal.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.4 -O2 * +./tests/atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * +./tests/masked-scatter-struct.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * From 2a83cefd5b0d3f19f968e9f91702e073211375bb Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 26 Sep 2013 19:07:38 +0400 Subject: [PATCH 103/124] Add fails with gcc 4.7 on Linux --- fail_db.txt | 175 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) diff --git a/fail_db.txt b/fail_db.txt index 23a6c8ca..9cc7a884 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -325,3 +325,178 @@ ./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.4 -O2 * ./tests/atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.4 -O2 * ./tests/masked-scatter-struct.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.4 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i16x8 Linux LLVM 3.3 g++4.7 -O2 * 
+./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/broadcast-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/half-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-11.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-12.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-13.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/memset-varying.ispc runfail x86-64 generic-4 
Linux LLVM 3.3 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/rotate-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/rotate-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle-flatten.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-1.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-10.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-11.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-3.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-4.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-5.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-6.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-7.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-8.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-9.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/test-129.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/test-130.ispc runfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.3 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/rotate.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shift1.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/shift1.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 
avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/broadcast-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/half-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-11.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-12.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-13.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/memset-varying.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/rotate-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/rotate-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle-flatten.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-1.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-10.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-11.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-3.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-4.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-5.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-6.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-7.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-8.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-9.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/test-129.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/test-130.ispc runfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Linux LLVM 3.4 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/test-143.ispc 
runfail x86-64 generic-16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-11.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-8.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-swap.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/memset-varying.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-1.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-12.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-2.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-3.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-4.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-7.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal.ispc compfail x86-64 avx2-i32x8 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shift1.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/shift1.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 * From 5855ae746021553cea0cb4c81c913a71e4fc71f9 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Fri, 27 Sep 2013 02:32:01 +0400 Subject: [PATCH 104/124] Add fails with gcc 4.7 on Mac --- fail_db.txt | 146 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) diff --git a/fail_db.txt b/fail_db.txt index 9cc7a884..b8e58d8b 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -500,3 +500,149 @@ ./tests/test-141.ispc runfail x86 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 * ./tests/shift1.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 * ./tests/test-141.ispc runfail x86-64 avx2-i32x16 Linux LLVM 3.4 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86 
sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i16x8 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/atomics-13.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-10.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-11.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/broadcast-1.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/half-3.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-1.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-13.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-5.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-9.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 
-O2 * +./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/memset-varying.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/rotate-2.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle-4.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-1.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-10.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-11.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-2.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-3.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-4.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-5.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-6.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-7.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-8.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2-9.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/shuffle2.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/test-129.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/test-130.ispc runfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Mac LLVM 3.3 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Mac LLVM 3.3 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86 avx1.1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-11.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-12.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-13.ispc compfail x86-64 avx2-i32x8 Mac 
LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-5.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-6.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-7.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-8.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-swap.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-2.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-3.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/memset-varying.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-1.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-12.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-13.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-2.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-3.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-4.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-5.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-6.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal-7.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/reduce-equal.ispc compfail x86-64 avx2-i32x8 Mac LLVM 3.4 g++4.7 -O2 * +./tests/test-141.ispc runfail x86 avx2-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 avx2-i32x16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/broadcast-1.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/half-3.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-1.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-13.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-5.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-9.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-swap.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/local-atomics-varyingptr-3.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/memset-varying.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/rotate-2.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle-4.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-1.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-10.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-11.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-2.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-3.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-4.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-5.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-6.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-7.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2-8.ispc runfail x86-64 generic-4 Mac LLVM 
3.4 g++4.7 -O2 * +./tests/shuffle2-9.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/shuffle2.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/test-129.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/test-130.ispc runfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/short-vec-8.ispc compfail x86-64 generic-4 Mac LLVM 3.4 g++4.7 -O2 * +./tests/ptr-15.ispc runfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/test-141.ispc runfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/test-143.ispc runfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 * +./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 * From 396aaae098abc2e7a5ed5a02c97254a9f292086e Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Fri, 27 Sep 2013 17:00:17 +0400 Subject: [PATCH 105/124] Add fails with VS2010 on Windows --- fail_db.txt | 216 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 216 insertions(+) diff --git a/fail_db.txt b/fail_db.txt index b8e58d8b..a6608c12 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -646,3 +646,219 @@ ./tests/test-141.ispc runfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 * ./tests/test-143.ispc runfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 * ./tests/ptr-assign-lhs-math-1.ispc compfail x86-64 generic-16 Mac LLVM 3.4 g++4.7 -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 sse2-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 sse2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 sse4-i32x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 sse4-i32x4 Windows 
LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 sse4-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\atomics-13.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-11.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-13.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-5.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-6.ispc compfail x86 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-4.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-5.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-6.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\atomics-13.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * 
+.\tests\reduce-equal-10.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-11.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-13.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-5.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-6.ispc compfail x86 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 
avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\avg-down-int8.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\avg-up-int8.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 
avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\avg-down-int8.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\avg-up-int8.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\atomics-13.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-11.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-13.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-5.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-6.ispc compfail x86-64 sse4-i16x8 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-4.ispc runfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-5.ispc runfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\funcptr-null-6.ispc runfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\atomics-13.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-11.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-13.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-5.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-6.ispc compfail x86-64 sse4-i8x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\avg-down-int8.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\avg-up-int8.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * 
+.\tests\switch-8.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1.1-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\avg-down-int8.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\avg-up-int8.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * From da52ae844f95ef617ef81af0f0588395109d2994 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Fri, 27 Sep 2013 18:06:28 +0400 Subject: [PATCH 106/124] Adding AVX2 fails on Windows --- fail_db.txt | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/fail_db.txt b/fail_db.txt index a6608c12..59e0a7a6 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -862,3 +862,65 @@ .\tests\switch-12.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-8.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-9.ispc compfail x86-64 avx1.1-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\test-141.ispc runfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86-64 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl 
-O2 * +.\tests\uint64-min-1.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx2-i32x8 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-equal-10.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint64.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\test-141.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-10.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-11.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-12.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-8.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\switch-9.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * From 8e71dbd6c12b0fde77ed58c21e4083c84227114e Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Fri, 27 Sep 2013 18:12:12 +0400 Subject: [PATCH 107/124] Adding comments to fail_db.txt --- fail_db.txt | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fail_db.txt b/fail_db.txt index 59e0a7a6..eb3c0fe9 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -1,4 +1,11 @@ -% List of known fails +% List of known fails. +% The list is unordered and contains information about commonly used platforms / configurations. +% Our goas is to maintain this list for Linux, MacOS and Windows with reasonably new compilers. +% Note, that it's important which C++ compiler was used. For example, gcc 4.4 is know to produce +% considerably more fails with generic targets, than gcc 4.7 or later. +% Using old compilers (gcc 4.4 is considered to be relatively old) may cause LLVM bugs. 
+% To avoid them you can use an LLVM self-build.
+%
 ./tests/masked-scatter-vector.ispc runfail x86-64 sse2-i32x4 Linux LLVM 3.3 g++4.4 -O2 *
 ./tests/atomics-13.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 *
 ./tests/reduce-equal-10.ispc compfail x86 sse4-i16x8 Linux LLVM 3.3 g++4.4 -O2 *

From 8a39af8f7204640fa802f6eb07403526523d1ea3 Mon Sep 17 00:00:00 2001
From: Dmitry Babokin
Date: Fri, 27 Sep 2013 23:27:05 +0400
Subject: [PATCH 108/124] Release 1.5.0

---
 docs/ReleaseNotes.txt | 60 +++++++++++++++++++++++++++++++++++++++++++
 docs/news.rst         |  8 ++++++
 doxygen.cfg           |  2 +-
 ispc.h                |  2 +-
 4 files changed, 70 insertions(+), 2 deletions(-)

diff --git a/docs/ReleaseNotes.txt b/docs/ReleaseNotes.txt
index 007f283e..a8575ea0 100644
--- a/docs/ReleaseNotes.txt
+++ b/docs/ReleaseNotes.txt
@@ -1,3 +1,63 @@
+=== v1.5.0 === (27 September 2013)
+
+A major new version of ISPC with several new targets and important bug fixes.
+Here's a list of the most important changes, if you are using pre-built
+binaries (which are based on a patched version of LLVM 3.3):
+
+* The naming of targets was changed to explicitly include the data type width
+  and the number of threads in the gang. For example, avx2-i32x8 is an avx2
+  target, which uses 32 bit types as a base and has 8 threads in a gang. The
+  old naming scheme is still supported, but deprecated.
+
+* New SSE4 targets for calculations based on 8 bit and 16 bit data types:
+  sse4-i8x16 and sse4-i16x8.
+
+* New AVX1 target for calculations based on 64 bit data types: avx1-i64x4.
+
+* SVML support was extended and improved.
+
+* The behavior of the -g switch was changed to not affect the optimization level.
+
+* The ISPC debug infrastructure was redesigned. See --help-dev for more info
+  and enjoy the capabilities of the new --debug-phase= and --off-phase=
+  switches.
+
+* Fixed an auto-dispatch bug, which caused AVX code execution when the OS
+  doesn't support AVX (but the hardware does).
+
+* Fixed a bug which discarded the uniform/varying keyword in typedefs.
+
+* Several performance regressions were fixed.
+
+If you are building ISPC yourself, then the following changes are also
+available to you:
+
+* --cpu=slm for targeting Intel Atom codename Silvermont (if LLVM 3.4 is used).
+
+* ARM NEON targets are available (if enabled in the build system).
+
+* --debug-ir= is available to generate debug information based on LLVM
+  IR (if LLVM 3.4 is used). In the debugger you'll see LLVM IR instead of the
+  source code.
+
+* A redesigned and improved test and configuration management system is
+  available to facilitate the process of building LLVM and testing the ISPC
+  compiler.
+
+Standard library changes/fixes:
+
+* The __pause() function was removed from the standard library.
+
+* Fixed the reduce_[min|max]_[float|double] intrinsics, which were producing
+  incorrect code under some conditions.
+
+Language changes:
+
+* By default, a floating point constant without a suffix is a single precision
+  constant (32 bit). A new suffix "d" was introduced to allow double precision
+  constants (64 bit). Please refer to tests/double-consts.ispc for syntax
+  examples.
+
 === v1.4.4 === (19 July 2013)
 
 A minor version update with several stability fixes requested by the customers.
diff --git a/docs/news.rst b/docs/news.rst
index c1c35de3..7d78a662 100644
--- a/docs/news.rst
+++ b/docs/news.rst
@@ -2,6 +2,14 @@
 ispc News
 =========
 
+ispc 1.5.0 is Released
+----------------------
+
+A major update of ``ispc`` has been released with several new targets available
+and a bunch of performance and stability fixes.
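As an illustration of the double precision constant syntax described in the v1.5.0 notes above, here is a minimal hypothetical ISPC kernel; it is not taken from tests/double-consts.ispc, and the function name and the target named in the comment are assumptions:

    // Compile with, e.g.: ispc -O2 --target=avx2-i32x8 scale.ispc -o scale_ispc.o
    export void scale(uniform double vals[], uniform int count) {
        foreach (i = 0 ... count) {
            // Without a suffix, 0.5 is a single precision (32 bit) constant;
            // the "d" suffix makes it a double precision (64 bit) constant.
            vals[i] = vals[i] * 0.5d + 1.0d;
        }
    }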
The released binaries are built +with patched version of LLVM 3.3. Please refer to Release Notes for complete +set of changes. + ispc 1.4.4 is Released ---------------------- diff --git a/doxygen.cfg b/doxygen.cfg index 480d9331..ab4eec20 100644 --- a/doxygen.cfg +++ b/doxygen.cfg @@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler" # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = 1.4.5dev +PROJECT_NUMBER = 1.5.0 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. diff --git a/ispc.h b/ispc.h index 4804832f..4b7ae732 100644 --- a/ispc.h +++ b/ispc.h @@ -38,7 +38,7 @@ #ifndef ISPC_H #define ISPC_H -#define ISPC_VERSION "1.4.5dev" +#define ISPC_VERSION "1.5.0" #if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) #error "Only LLVM 3.1, 3.2, 3.3 and the 3.4 development branch are supported" From 3b4cc9080046983932ea461345344deccd0ad33e Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Sat, 28 Sep 2013 01:32:00 +0400 Subject: [PATCH 109/124] Changing ISPC to 1.5.dev --- doxygen.cfg | 2 +- ispc.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doxygen.cfg b/doxygen.cfg index ab4eec20..a0ad3176 100644 --- a/doxygen.cfg +++ b/doxygen.cfg @@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler" # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = 1.5.0 +PROJECT_NUMBER = 1.5.1dev # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. diff --git a/ispc.h b/ispc.h index 4b7ae732..82cb9050 100644 --- a/ispc.h +++ b/ispc.h @@ -38,7 +38,7 @@ #ifndef ISPC_H #define ISPC_H -#define ISPC_VERSION "1.5.0" +#define ISPC_VERSION "1.5.1dev" #if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) #error "Only LLVM 3.1, 3.2, 3.3 and the 3.4 development branch are supported" From 758efebb3cc166e46169931490fbb42c5f9ffd65 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Mon, 30 Sep 2013 17:54:59 +0400 Subject: [PATCH 110/124] Add missing testing support for avx1-i64x4 target --- alloy.py | 6 +++--- ispc.cpp | 4 ++-- run_tests.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/alloy.py b/alloy.py index 7ae972b4..3f05f4fd 100755 --- a/alloy.py +++ b/alloy.py @@ -200,7 +200,7 @@ def check_targets(): answer = answer + ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"] if AVX == False and "avx" in f_lines[i]: AVX = True; - answer = answer + ["avx1-i32x8", "avx1-i32x16"] + answer = answer + ["avx1-i32x8", "avx1-i32x16", "avx1-i64x4"] if AVX11 == False and "rdrand" in f_lines[i]: AVX11 = True; answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"] @@ -217,7 +217,7 @@ def check_targets(): answer = answer + ["sse4-i32x4", "sse4-i32x8", "sse4-i16x8", "sse4-i8x16"] if "AVX1.0" in f_lines: AVX = True; - answer = answer + ["avx1-i32x8", "avx1-i32x16"] + answer = answer + ["avx1-i32x8", "avx1-i32x16", "avx1-i64x4"] if "RDRAND" in f_lines: AVX11 = True; answer = answer + ["avx1.1-i32x8", "avx1.1-i32x16"] @@ -246,7 +246,7 @@ def check_targets(): if SSE4 == False and "wsm" in f_lines[i]: answer_sde = answer_sde + [["-wsm", "sse4-i32x4"], ["-wsm", "sse4-i32x8"], ["-wsm", "sse4-i16x8"], ["-wsm", "sse4-i8x16"]] if AVX == False and "snb" in f_lines[i]: - answer_sde = answer_sde + [["-snb", 
"avx1-i32x8"], ["-snb", "avx1-i32x16"]] + answer_sde = answer_sde + [["-snb", "avx1-i32x8"], ["-snb", "avx1-i32x16"], ["-snb", "avx1-i64x4"]] if AVX11 == False and "ivb" in f_lines[i]: answer_sde = answer_sde + [["-ivb", "avx1.1-i32x8"], ["-ivb", "avx1.1-i32x16"]] if AVX2 == False and "hsw" in f_lines[i]: diff --git a/ispc.cpp b/ispc.cpp index bec7baf7..56b0a25f 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -714,11 +714,11 @@ Target::SupportedTargets() { #endif "sse2-i32x4, sse2-i32x8, " "sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, " - "avx1-i32x8, avx1-i32x16, " + "avx1-i32x8, avx1-i32x16, avx1-i64x4, " "avx1.1-i32x8, avx1.1-i32x16, " "avx2-i32x8, avx2-i32x16, " "generic-x1, generic-x4, generic-x8, generic-x16, " - "generic-x32, generic-x64"; + "generic-x32, generic-x64"; } diff --git a/run_tests.py b/run_tests.py index 64d3462a..4146576c 100755 --- a/run_tests.py +++ b/run_tests.py @@ -449,7 +449,7 @@ def verify(): check = [["g++", "clang", "cl"],["-O0", "-O2"],["x86","x86-64"], ["Linux","Windows","Mac"],["LLVM 3.1","LLVM 3.2","LLVM 3.3","LLVM head"], ["sse2-i32x4", "sse2-i32x8", "sse4-i32x4", "sse4-i32x8", "sse4-i16x8", - "sse4-i8x16", "avx1-i32x8", "avx1-i32x16", "avx1.1-i32x8", "avx1.1-i32x16", + "sse4-i8x16", "avx1-i32x8", "avx1-i32x16", "avx1-i64x4", "avx1.1-i32x8", "avx1.1-i32x16", "avx2-i32x8", "avx2-i32x16", "generic-1", "generic-4", "generic-8", "generic-16", "generic-32", "generic-64"]] for i in range (0,len(f_lines)): From 7942bdb728f8fc9b6cc560303cf6193ed5aba647 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Mon, 30 Sep 2013 18:09:59 +0400 Subject: [PATCH 111/124] Typo fix and copyright update --- docs/ispc.rst | 4 ++-- docs/template-news.txt | 2 +- docs/template-perf.txt | 2 +- docs/template.txt | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/ispc.rst b/docs/ispc.rst index 224faaa9..eac9b24e 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -270,8 +270,8 @@ new reserved words: ``unmasked``, ``foreach_unique``, ``foreach_active``, and ``in``. Any program that happens to have a variable or function with one of these names must be modified to rename that symbol. -Updating ISPC Programs For Changes In ISPC 1.4.5 ----------------------------------------------- +Updating ISPC Programs For Changes In ISPC 1.5.0 +------------------------------------------------ This release adds support for double precision floating point constants. Double precision floating point constants are floating point number with diff --git a/docs/template-news.txt b/docs/template-news.txt index 9a41fbdb..d5eebdd1 100644 --- a/docs/template-news.txt +++ b/docs/template-news.txt @@ -57,7 +57,7 @@ %(body)s
diff --git a/docs/template-perf.txt b/docs/template-perf.txt index 4932e332..9537a836 100644 --- a/docs/template-perf.txt +++ b/docs/template-perf.txt @@ -57,7 +57,7 @@ %(body)s
diff --git a/docs/template.txt b/docs/template.txt index 8cb4f5ab..b9041f19 100644 --- a/docs/template.txt +++ b/docs/template.txt @@ -57,7 +57,7 @@ %(body)s
- From 49cefc2e972bb3d742f74f855cd40b09b57f029b Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Mon, 30 Sep 2013 19:20:18 +0400 Subject: [PATCH 112/124] Updating fail_db for new target --- fail_db.txt | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fail_db.txt b/fail_db.txt index eb3c0fe9..31db9961 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -931,3 +931,21 @@ .\tests\switch-12.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-8.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * .\tests\switch-9.ispc compfail x86 avx2-i32x16 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-10.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\exclusive-scan-add-9.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\max-uint.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\min-uint-2.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-load-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\packed-store.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-add-uint64.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-max-uint.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-max.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min-1.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\uint64-min.ispc runfail x86 avx1-i64x4 Windows LLVM 3.3 cl -O2 * +.\tests\reduce-min-uint64.ispc runfail x86 avx1-i64x4 Windows LLVM 3.4 cl -O2 * From 2d6f7a7c93bcbe89c2ec55e99a995d309c2d85b5 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 1 Oct 2013 17:37:34 +0400 Subject: [PATCH 113/124] Support i686 architecture recognition as x86 and enable 32 bit x86 platforms --- examples/common.mk | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/examples/common.mk b/examples/common.mk index 95ec7ccb..330a2453 100644 --- a/examples/common.mk +++ b/examples/common.mk @@ -12,15 +12,22 @@ LIBS=-lm $(TASK_LIB) -lstdc++ ISPC=ispc -O2 $(ISPC_FLAGS) ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h) -ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/arm.*/arm/ -e s/sa110/arm/) +ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/) ifeq ($(ARCH),x86) ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o $(ISPC_SRC:.ispc=)_ispc_sse2.o \ $(ISPC_SRC:.ispc=)_ispc_sse4.o $(ISPC_SRC:.ispc=)_ispc_avx.o) ISPC_TARGETS=$(ISPC_IA_TARGETS) - ISPC_FLAGS += --arch=x86-64 - CXXFLAGS += -m64 - CCFLAGS += -m64 + ARCH_BIT:=$(shell getconf LONG_BIT) + ifeq ($(ARCH_BIT),32) + ISPC_FLAGS += --arch=x86 + CXXFLAGS += -m32 + CCFLAGS += -m32 + else + ISPC_FLAGS += --arch=x86-64 + CXXFLAGS += -m64 + CCFLAGS += -m64 + endif else ifeq ($(ARCH),arm) ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=_ispc.o)) ISPC_TARGETS=$(ISPC_ARM_TARGETS) From b2cf0209b153c072f5e531e23203a68e05d47d87 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Tue, 1 Oct 2013 18:01:29 +0400 Subject: [PATCH 114/124] pipe correction and some other small 
changes in test system --- alloy.py | 4 ++++ common.py | 3 ++- run_tests.py | 15 ++++++++------- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/alloy.py b/alloy.py index 7ae972b4..6b55f85b 100755 --- a/alloy.py +++ b/alloy.py @@ -353,8 +353,12 @@ def validation_run(only, only_targets, reference_branch, number, notify, update, else: common.check_tools(1) if only_targets != "": + only_targets += " " + only_targets = only_targets.replace("generic "," generic-4 generic-16 ") only_targets_t = only_targets.split(" ") for i in only_targets_t: + if i == "": + continue err = True for j in range(0,len(targets_t)): if i in targets_t[j]: diff --git a/common.py b/common.py index 19d09e4d..be3e9526 100644 --- a/common.py +++ b/common.py @@ -83,7 +83,8 @@ def print_debug(line, silent, filename): sys.stdout.write(line) sys.stdout.flush() if os.environ.get("ISPC_HOME") != None: - write_to_file(os.environ["ISPC_HOME"] + os.sep + "notify_log.log", line) + if os.path.exists(os.environ.get("ISPC_HOME")): + write_to_file(os.environ["ISPC_HOME"] + os.sep + "notify_log.log", line) if filename != "": write_to_file(filename, line) diff --git a/run_tests.py b/run_tests.py index abc9b656..7b2f5f29 100755 --- a/run_tests.py +++ b/run_tests.py @@ -332,8 +332,6 @@ def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test skip_files += [ filename ] -task_threads = [] - def sigint(signum, frame): for t in task_threads: t.terminate() @@ -423,6 +421,8 @@ def file_check(compfails, runfails): for i in range (0,len(new_compfails)): new_f_lines.append(new_compfails[i] + " compfail " + new_line) print_debug("\t" + new_compfails[i] + "\n", s, run_tests_log) + if len(new_runfails) == 0 and len(new_compfails) == 0: + print_debug("No new fails\n", s, run_tests_log) if len(new_passes_runfails) != 0: print_debug("NEW PASSES after RUNFAILS:\n", s, run_tests_log) for i in range (0,len(new_passes_runfails)): @@ -561,7 +561,6 @@ def run_tests(options1, args, print_version): # failing_tests/, and tests_errors/ if len(args) == 0: files = glob.glob(ispc_root + os.sep + "tests" + os.sep + "*ispc") + \ - glob.glob(ispc_root + os.sep + "failing_tests" + os.sep + "*ispc") + \ glob.glob(ispc_root + os.sep + "tests_errors" + os.sep + "*ispc") else: if is_windows: @@ -622,12 +621,12 @@ def run_tests(options1, args, print_version): start_time = time.time() # launch jobs to run tests glob_var = [is_windows, options, s, ispc_exe, is_generic_target, run_tests_log] + global task_threads + task_threads = [0] * nthreads for x in range(nthreads): - t = multiprocessing.Process(target=run_tasks_from_queue, args=(q, qret, qskip, total_tests, + task_threads[x] = multiprocessing.Process(target=run_tasks_from_queue, args=(q, qret, qskip, total_tests, max_test_length, finished_tests_counter, finished_tests_counter_lock, glob_var)) - task_threads.append(t) - t.start() - + task_threads[x].start() # wait for them to all finish and then return the number that failed # (i.e. 
return 0 if all is ok) for t in task_threads: @@ -660,6 +659,8 @@ def run_tests(options1, args, print_version): print_debug("%d / %d tests FAILED execution:\n" % (len(run_error_files), total_tests), s, run_tests_log) for f in run_error_files: print_debug("\t%s\n" % f, s, run_tests_log) + if len(compile_error_files) == 0 and len(run_error_files) == 0: + print_debug("No fails\n", s, run_tests_log) R = file_check(compile_error_files, run_error_files) From c7b4164122f7a9cf45a1a2ea30c90064650258dd Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 1 Oct 2013 18:40:26 +0400 Subject: [PATCH 115/124] Redefining ISPC should not discard ISPC_FLAGS --- examples/common.mk | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/common.mk b/examples/common.mk index 330a2453..367d3eb3 100644 --- a/examples/common.mk +++ b/examples/common.mk @@ -9,7 +9,8 @@ CC=gcc CCFLAGS=-Iobjs/ -O2 LIBS=-lm $(TASK_LIB) -lstdc++ -ISPC=ispc -O2 $(ISPC_FLAGS) +ISPC=ispc +ISPC_FLAGS=-O2 ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h) ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/i686/x86/ -e s/arm.*/arm/ -e s/sa110/arm/) @@ -68,10 +69,10 @@ objs/%.o: ../%.cpp dirs objs/$(EXAMPLE).o: objs/$(EXAMPLE)_ispc.h objs/%_ispc.h objs/%_ispc.o objs/%_ispc_sse2.o objs/%_ispc_sse4.o objs/%_ispc_avx.o: %.ispc - $(ISPC) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h + $(ISPC) $(ISPC_FLAGS) --target=$(ISPC_TARGETS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h objs/$(ISPC_SRC:.ispc=)_sse4.cpp: $(ISPC_SRC) - $(ISPC) $< -o $@ --target=generic-4 --emit-c++ --c++-include-file=sse4.h + $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-4 --emit-c++ --c++-include-file=sse4.h objs/$(ISPC_SRC:.ispc=)_sse4.o: objs/$(ISPC_SRC:.ispc=)_sse4.cpp $(CXX) -I../intrinsics -msse4.2 $< $(CXXFLAGS) -c -o $@ @@ -80,7 +81,7 @@ $(EXAMPLE)-sse4: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_sse4.o $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) objs/$(ISPC_SRC:.ispc=)_generic16.cpp: $(ISPC_SRC) - $(ISPC) $< -o $@ --target=generic-16 --emit-c++ --c++-include-file=generic-16.h + $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-16 --emit-c++ --c++-include-file=generic-16.h objs/$(ISPC_SRC:.ispc=)_generic16.o: objs/$(ISPC_SRC:.ispc=)_generic16.cpp $(CXX) -I../intrinsics $< $(CXXFLAGS) -c -o $@ @@ -89,7 +90,7 @@ $(EXAMPLE)-generic16: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_generic16.o $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) objs/$(ISPC_SRC:.ispc=)_scalar.o: $(ISPC_SRC) - $(ISPC) $< -o $@ --target=generic-1 + $(ISPC) $(ISPC_FLAGS) $< -o $@ --target=generic-1 $(EXAMPLE)-scalar: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_scalar.o $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) From dc8895352af94d7042e9e7658035c3c9d35ba8b7 Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Tue, 1 Oct 2013 11:53:56 -0400 Subject: [PATCH 116/124] Adding missing typecasts and guarding i64 __mul with compiler version check --- examples/intrinsics/knc-i1x16.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 2ee6d2f5..ae9c4130 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -1021,9 +1021,13 @@ static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &_b _mm512_mulhi_epi32(a.v, b.v_lo))).cvt2zmm(); } +#if __ICC_VERSION == 1400 static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_mullox_epi64(a.v1, b.v1), _mm512_mullox_epi64(a.v2,b.v2)); } +#else +BINARY_OP(__vec16_i64, __mul, *) +#endif #endif #if 0 @@ 
-2164,7 +2168,7 @@ static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) { __m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1)); __m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2)); - return _mm512_mask_permute4f128_epi32(r0i, 0xFF00, r1i, _MM_PERM_BABA); + return _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(r0i, 0xFF00, r1i, _MM_PERM_BABA)); } #endif @@ -2174,7 +2178,7 @@ CAST(__vec16_d, double, __vec16_f, float, __cast_fpext) static FORCEINLINE __vec16_d __cast_fpext(__vec16_d, __vec16_f val) { __vec16_d ret; ret.v1 = _mm512_cvtpslo_pd(val.v); - __vec16_f other8 = _mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC); + __vec16_f other8 = _mm512_castsi512_ps(_mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC)); ret.v2 = _mm512_cvtpslo_pd(other8); return ret; } From 32c77be2f3537b24890e1334b1a7d2579c58d2c1 Mon Sep 17 00:00:00 2001 From: evghenii Date: Fri, 4 Oct 2013 11:42:52 +0300 Subject: [PATCH 117/124] cleaned mask & int32, only test141 fails --- examples/intrinsics/knc-i1x16.h | 656 +++++++++----------------------- 1 file changed, 190 insertions(+), 466 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index ae9c4130..aae4be57 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -64,69 +64,48 @@ extern "C" } #endif -typedef float __vec1_f; -typedef double __vec1_d; -typedef int8_t __vec1_i8; +typedef float __vec1_f; +typedef double __vec1_d; +typedef int8_t __vec1_i8; typedef int16_t __vec1_i16; typedef int32_t __vec1_i32; typedef int64_t __vec1_i64; -struct __vec16_i1 { - __vec16_i1() { } - __vec16_i1(const __mmask16 &vv) : v(vv) { } - __vec16_i1(bool v0, bool v1, bool v2, bool v3, - bool v4, bool v5, bool v6, bool v7, - bool v8, bool v9, bool v10, bool v11, - bool v12, bool v13, bool v14, bool v15) { - v = ((v0 & 1) | - ((v1 & 1) << 1) | - ((v2 & 1) << 2) | - ((v3 & 1) << 3) | - ((v4 & 1) << 4) | - ((v5 & 1) << 5) | - ((v6 & 1) << 6) | - ((v7 & 1) << 7) | - ((v8 & 1) << 8) | - ((v9 & 1) << 9) | - ((v10 & 1) << 10) | - ((v11 & 1) << 11) | - ((v12 & 1) << 12) | - ((v13 & 1) << 13) | - ((v14 & 1) << 14) | - ((v15 & 1) << 15)); - } - - __mmask16 v; - FORCEINLINE operator __mmask16() const { return v; } +/************ mask **************/ + +struct __vec16_i1 +{ + __mmask16 v; + + FORCEINLINE __vec16_i1() { } + FORCEINLINE __vec16_i1(const __mmask16 &vv) : v(vv) { } + FORCEINLINE __vec16_i1(bool v0, bool v1, bool v2, bool v3, + bool v4, bool v5, bool v6, bool v7, + bool v8, bool v9, bool v10, bool v11, + bool v12, bool v13, bool v14, bool v15) { + v = ((v0 & 1) | + ((v1 & 1) << 1) | + ((v2 & 1) << 2) | + ((v3 & 1) << 3) | + ((v4 & 1) << 4) | + ((v5 & 1) << 5) | + ((v6 & 1) << 6) | + ((v7 & 1) << 7) | + ((v8 & 1) << 8) | + ((v9 & 1) << 9) | + ((v10 & 1) << 10) | + ((v11 & 1) << 11) | + ((v12 & 1) << 12) | + ((v13 & 1) << 13) | + ((v14 & 1) << 14) | + ((v15 & 1) << 15)); + } + + FORCEINLINE operator __mmask16() const { return v; } }; +/************ vector **************/ -template -struct vec16 { - vec16() { } - vec16(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, - T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) { - data[0] = v0; data[1] = v1; data[2] = v2; data[3] = v3; - data[4] = v4; data[5] = v5; data[6] = v6; data[7] = v7; - data[8] = v8; data[9] = v9; data[10] = v10; data[11] = v11; - data[12] = v12; data[13] = v13; data[14] = v14; data[15] = v15; - } - T data[16]; - FORCEINLINE const T& operator[](const int i) const { return 
data[i]; } - FORCEINLINE T& operator[](const int i) { return data[i]; } -}; - -#if 0 /* evghenii:i32 */ -struct PRE_ALIGN(64) __vec16_i32 : public vec16 { - __vec16_i32() { } - __vec16_i32(int32_t v0, int32_t v1, int32_t v2, int32_t v3, - int32_t v4, int32_t v5, int32_t v6, int32_t v7, - int32_t v8, int32_t v9, int32_t v10, int32_t v11, - int32_t v12, int32_t v13, int32_t v14, int32_t v15) - : vec16(v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15) { } -} POST_ALIGN(64); -#else /* evghenii:i32 */ struct PRE_ALIGN(64) __vec16_i32 { __m512i v; @@ -144,81 +123,43 @@ struct PRE_ALIGN(64) __vec16_i32 FORCEINLINE const int32_t& operator[](const int i) const { return ((int32_t*)this)[i]; } FORCEINLINE int32_t& operator[](const int i) { return ((int32_t*)this)[i]; } } POST_ALIGN(64); -#endif /* evghenii:i32 */ -#if 0 /* evghenii::f */ -PRE_ALIGN(64) struct __vec16_f : public vec16 { - __vec16_f() { } - __vec16_f(float v0, float v1, float v2, float v3, - float v4, float v5, float v6, float v7, - float v8, float v9, float v10, float v11, - float v12, float v13, float v14, float v15) - : vec16(v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15) { } - -} POST_ALIGN(64); -#else /* evghenii::f */ PRE_ALIGN(64) struct __vec16_f { - __m512 v; - FORCEINLINE operator __m512() const { return v; } - FORCEINLINE __vec16_f() : v(_mm512_undefined_ps()) { } - FORCEINLINE __vec16_f(const __m512 &in) : v(in) {} - FORCEINLINE __vec16_f(const __vec16_f &o) : v(o.v) {} - FORCEINLINE __vec16_f& operator =(const __vec16_f &o) { v=o.v; return *this; } - FORCEINLINE __vec16_f(float v00, float v01, float v02, float v03, - float v04, float v05, float v06, float v07, - float v08, float v09, float v10, float v11, - float v12, float v13, float v14, float v15) : - v ( _mm512_set_16to16_ps(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00) ) {} - FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; } - FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } + __m512 v; + FORCEINLINE operator __m512() const { return v; } + FORCEINLINE __vec16_f() : v(_mm512_undefined_ps()) { } + FORCEINLINE __vec16_f(const __m512 &in) : v(in) {} + FORCEINLINE __vec16_f(const __vec16_f &o) : v(o.v) {} + FORCEINLINE __vec16_f& operator =(const __vec16_f &o) { v=o.v; return *this; } + FORCEINLINE __vec16_f(float v00, float v01, float v02, float v03, + float v04, float v05, float v06, float v07, + float v08, float v09, float v10, float v11, + float v12, float v13, float v14, float v15) : + v ( _mm512_set_16to16_ps(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00) ) {} + FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; } + FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } } POST_ALIGN(64); -#endif /* evghenii::f */ -#if 0 /* evghenii::d */ -PRE_ALIGN(128) struct __vec16_d : public vec16 { - __vec16_d() { } - __vec16_d(double v0, double v1, double v2, double v3, - double v4, double v5, double v6, double v7, - double v8, double v9, double v10, double v11, - double v12, double v13, double v14, double v15) - : vec16(v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15) { } - -} POST_ALIGN(128); -#else /* evghenii::d */ struct PRE_ALIGN(128) __vec16_d { - __m512d v1; - __m512d v2; - FORCEINLINE __vec16_d() : v1(_mm512_undefined_pd()), v2(_mm512_undefined_pd()) {} - FORCEINLINE __vec16_d(const __m512d _v1, const __m512d _v2) : v1(_v1), 
v2(_v2) {} - FORCEINLINE __vec16_d(const __vec16_d &o) : v1(o.v1), v2(o.v2) {} - FORCEINLINE __vec16_d& operator =(const __vec16_d &o) { v1=o.v1; v2=o.v2; return *this; } - FORCEINLINE __vec16_d(double v00, double v01, double v02, double v03, - double v04, double v05, double v06, double v07, - double v08, double v09, double v10, double v11, - double v12, double v13, double v14, double v15) { - v1 = _mm512_set_8to8_pd(v15, v14, v13, v12, v11, v10, v09, v08); - v2 = _mm512_set_8to8_pd(v07, v06, v05, v04, v03, v02, v01, v00); - } - FORCEINLINE const double& operator[](const int i) const { return ((double*)this)[i]; } - FORCEINLINE double& operator[](const int i) { return ((double*)this)[i]; } + __m512d v1; + __m512d v2; + FORCEINLINE __vec16_d() : v1(_mm512_undefined_pd()), v2(_mm512_undefined_pd()) {} + FORCEINLINE __vec16_d(const __m512d _v1, const __m512d _v2) : v1(_v1), v2(_v2) {} + FORCEINLINE __vec16_d(const __vec16_d &o) : v1(o.v1), v2(o.v2) {} + FORCEINLINE __vec16_d& operator =(const __vec16_d &o) { v1=o.v1; v2=o.v2; return *this; } + FORCEINLINE __vec16_d(double v00, double v01, double v02, double v03, + double v04, double v05, double v06, double v07, + double v08, double v09, double v10, double v11, + double v12, double v13, double v14, double v15) { + v1 = _mm512_set_8to8_pd(v15, v14, v13, v12, v11, v10, v09, v08); + v2 = _mm512_set_8to8_pd(v07, v06, v05, v04, v03, v02, v01, v00); + } + FORCEINLINE const double& operator[](const int i) const { return ((double*)this)[i]; } + FORCEINLINE double& operator[](const int i) { return ((double*)this)[i]; } } POST_ALIGN(128); -#endif /* evghenii::d */ -#if 0 /* evghenii::i64 */ -PRE_ALIGN(128) struct __vec16_i64 : public vec16 { - __vec16_i64() { } - __vec16_i64(int64_t v0, int64_t v1, int64_t v2, int64_t v3, - int64_t v4, int64_t v5, int64_t v6, int64_t v7, - int64_t v8, int64_t v9, int64_t v10, int64_t v11, - int64_t v12, int64_t v13, int64_t v14, int64_t v15) - : vec16(v0, v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15) { } -} POST_ALIGN(128); -#else /* evghenii::i64 */ struct PRE_ALIGN(128) __vec16_i64 { union { @@ -279,7 +220,24 @@ struct PRE_ALIGN(128) __vec16_i64 return __vec16_i64(_v1, _v2); } } POST_ALIGN(128); -#endif /* evghenii::i64 */ + +/************ scalar **************/ + +template +struct vec16 +{ + FORCEINLINE vec16() { } + FORCEINLINE vec16(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) { + data[0] = v0; data[1] = v1; data[2] = v2; data[3] = v3; + data[4] = v4; data[5] = v5; data[6] = v6; data[7] = v7; + data[8] = v8; data[9] = v9; data[10] = v10; data[11] = v11; + data[12] = v12; data[13] = v13; data[14] = v14; data[15] = v15; + } + T data[16]; + FORCEINLINE const T& operator[](const int i) const { return data[i]; } + FORCEINLINE T& operator[](const int i) { return data[i]; } +}; PRE_ALIGN(16) struct __vec16_i8 : public vec16 { __vec16_i8() { } @@ -510,104 +468,54 @@ INSERT_EXTRACT(__vec1_f, float) INSERT_EXTRACT(__vec1_d, double) /////////////////////////////////////////////////////////////////////////// -// mask ops +// mask +/////////////////////////////////////////////////////////////////////////// -static FORCEINLINE __vec16_i1 __movmsk(__vec16_i1 mask) { - return _mm512_kmov(mask); +static FORCEINLINE __vec16_i1 __movmsk(__vec16_i1 mask) { return _mm512_kmov (mask); } +static FORCEINLINE bool __any (__vec16_i1 mask) { return !_mm512_kortestz(mask, mask); } +static FORCEINLINE bool __all (__vec16_i1 mask) { return _mm512_kortestc(mask, 
mask); } +static FORCEINLINE bool __none (__vec16_i1 mask) { return _mm512_kortestz(mask, mask); } +static FORCEINLINE __vec16_i1 __not (__vec16_i1 mask) { return _mm512_knot (mask); } + +static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { return _mm512_kxnor (a,b); } +static FORCEINLINE __vec16_i1 __and (__vec16_i1 a, __vec16_i1 b) { return _mm512_kand (a,b); } +static FORCEINLINE __vec16_i1 __xor (__vec16_i1 a, __vec16_i1 b) { return _mm512_kxor (a,b); } +static FORCEINLINE __vec16_i1 __or (__vec16_i1 a, __vec16_i1 b) { return _mm512_kor (a,b); } +static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) { return _mm512_kandn (a,b); } +static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { return _mm512_kandnr(a,b); } + +static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a, __vec16_i1 b) { return __or(__and(a, mask), __and_not2(b, mask)); } +static FORCEINLINE __vec16_i1 __select( bool cond, __vec16_i1 a, __vec16_i1 b) { return cond ? a : b; } + +static FORCEINLINE bool __extract_element(__vec16_i1 vec, int index) { return (vec.v & (1 << index)) ? true : false; } +static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index, bool val) +{ + if (val == false) vec->v &= ~(1 << index); + else vec->v |= (1 << index); } -static FORCEINLINE bool __any(__vec16_i1 mask) { - return !_mm512_kortestz(mask, mask); +template static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) +{ + return *p; } -static FORCEINLINE bool __all(__vec16_i1 mask) { - return _mm512_kortestc(mask, mask); -} - -static FORCEINLINE bool __none(__vec16_i1 mask) { - return _mm512_kortestz(mask, mask); -} - -static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kxnor(a,b); -} -static FORCEINLINE __vec16_i1 __and(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kand(a, b); -} - -static FORCEINLINE __vec16_i1 __xor(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kxor(a, b); -} - -static FORCEINLINE __vec16_i1 __or(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kor(a, b); -} - -static FORCEINLINE __vec16_i1 __not(__vec16_i1 a) { - return _mm512_knot(a); -} - -static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kandn(a, b); -} - -static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { - return _mm512_kandnr(a, b); -} - -static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a, - __vec16_i1 b) { -// return ((a & mask) | (b & ~mask)); - return __or(__and(a, mask), __and_not2(b, mask)); -} - -static FORCEINLINE __vec16_i1 __select(bool cond, __vec16_i1 a, __vec16_i1 b) { - return cond ? a : b; -} - - -static FORCEINLINE bool __extract_element(__vec16_i1 vec, int index) { - return (vec.v & (1 << index)) ? 
true : false; -} - -static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index, - bool val) { - if (val == false) - vec->v &= ~(1 << index); - else - vec->v |= (1 << index); -} - -template static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) { - uint16_t *ptr = (uint16_t *)p; - __vec16_i1 r; - r.v = *ptr; - return r; -} - -template static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v) { - uint16_t *ptr = (uint16_t *)p; - *ptr = v.v; +template static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v) +{ + *p = v; } template RetVecType __smear_i1(int i); -template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { - return i?0xFFFF:0x0; -} +template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { return i?0xFFFF:0x0; } template RetVecType __setzero_i1(); -template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { - return 0; -} +template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { return 0; } template __vec16_i1 __undef_i1(); -template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { - return __vec16_i1(); -} - +template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { return __vec16_i1(); } /////////////////////////////////////////////////////////////////////////// // int8 +/////////////////////////////////////////////////////////////////////////// BINARY_OP(__vec16_i8, __add, +) BINARY_OP(__vec16_i8, __sub, -) @@ -653,6 +561,7 @@ LOAD_STORE(__vec16_i8, int8_t) /////////////////////////////////////////////////////////////////////////// // int16 +/////////////////////////////////////////////////////////////////////////// BINARY_OP(__vec16_i16, __add, +) BINARY_OP(__vec16_i16, __sub, -) @@ -696,232 +605,57 @@ ROTATE(__vec16_i16, i16, int16_t) SHUFFLES(__vec16_i16, i16, int16_t) LOAD_STORE(__vec16_i16, int16_t) -#if 0 /* evghenii::int32 */ -/////////////////////////////////////////////////////////////////////////// -// int32 - -BINARY_OP(__vec16_i32, __add, +) -BINARY_OP(__vec16_i32, __sub, -) -BINARY_OP(__vec16_i32, __mul, *) - -BINARY_OP(__vec16_i32, __or, |) -BINARY_OP(__vec16_i32, __and, &) -BINARY_OP(__vec16_i32, __xor, ^) -BINARY_OP(__vec16_i32, __shl, <<) - -BINARY_OP_CAST(__vec16_i32, uint32_t, __udiv, /) -BINARY_OP_CAST(__vec16_i32, int32_t, __sdiv, /) - -BINARY_OP_CAST(__vec16_i32, uint32_t, __urem, %) -BINARY_OP_CAST(__vec16_i32, int32_t, __srem, %) -BINARY_OP_CAST(__vec16_i32, uint32_t, __lshr, >>) -BINARY_OP_CAST(__vec16_i32, int32_t, __ashr, >>) - -SHIFT_UNIFORM(__vec16_i32, uint32_t, __lshr, >>) -SHIFT_UNIFORM(__vec16_i32, int32_t, __ashr, >>) -SHIFT_UNIFORM(__vec16_i32, int32_t, __shl, <<) - -CMP_OP(__vec16_i32, i32, int32_t, __equal, ==) -CMP_OP(__vec16_i32, i32, int32_t, __not_equal, !=) -CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_equal, <=) -CMP_OP(__vec16_i32, i32, int32_t, __signed_less_equal, <=) -CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_equal, >=) -CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_equal, >=) -CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_than, <) -CMP_OP(__vec16_i32, i32, int32_t, __signed_less_than, <) -CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_than, >) -CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_than, >) - -SELECT(__vec16_i32) -INSERT_EXTRACT(__vec16_i32, int32_t) -SMEAR(__vec16_i32, i32, int32_t) -SETZERO(__vec16_i32, i32) -UNDEF(__vec16_i32, i32) -BROADCAST(__vec16_i32, i32, int32_t) -ROTATE(__vec16_i32, i32, int32_t) -SHUFFLES(__vec16_i32, i32, int32_t) -LOAD_STORE(__vec16_i32, int32_t) - -#else /* 
evghenii::int32 */ /////////////////////////////////////////////////////////////////////////// // int32 /////////////////////////////////////////////////////////////////////////// -static FORCEINLINE __vec16_i32 __add(__vec16_i32 a, __vec16_i32 b) { - return _mm512_add_epi32(a, b); -} +static FORCEINLINE __vec16_i32 __add (__vec16_i32 a, __vec16_i32 b) { return _mm512_add_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __sub (__vec16_i32 a, __vec16_i32 b) { return _mm512_sub_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __mul (__vec16_i32 a, __vec16_i32 b) { return _mm512_mullo_epi32(a,b); } +static FORCEINLINE __vec16_i32 __udiv(__vec16_i32 a, __vec16_i32 b) { return _mm512_div_epu32 (a,b); } +static FORCEINLINE __vec16_i32 __sdiv(__vec16_i32 a, __vec16_i32 b) { return _mm512_div_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __urem(__vec16_i32 a, __vec16_i32 b) { return _mm512_rem_epu32 (a,b); } +static FORCEINLINE __vec16_i32 __srem(__vec16_i32 a, __vec16_i32 b) { return _mm512_rem_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __or (__vec16_i32 a, __vec16_i32 b) { return _mm512_or_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __and (__vec16_i32 a, __vec16_i32 b) { return _mm512_and_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __xor (__vec16_i32 a, __vec16_i32 b) { return _mm512_xor_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __shl (__vec16_i32 a, __vec16_i32 b) { return _mm512_sllv_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, __vec16_i32 b) { return _mm512_srlv_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, __vec16_i32 b) { return _mm512_srav_epi32 (a,b); } +static FORCEINLINE __vec16_i32 __shl (__vec16_i32 a, int32_t n) { return _mm512_slli_epi32 (a,n); } +static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, int32_t n) { return _mm512_srli_epi32 (a,n); } +static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, int32_t n) { return _mm512_srai_epi32 (a,n); } -static FORCEINLINE __vec16_i32 __sub(__vec16_i32 a, __vec16_i32 b) { - return _mm512_sub_epi32(a, b); -} +static FORCEINLINE __vec16_i1 __equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpeq_epi32_mask (a,b); } +static FORCEINLINE __vec16_i1 __not_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpneq_epi32_mask(a,b); } +static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmple_epu32_mask (a,b); } +static FORCEINLINE __vec16_i1 __signed_less_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmple_epi32_mask (a,b); } +static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpge_epu32_mask (a,b); } +static FORCEINLINE __vec16_i1 __signed_greater_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpge_epi32_mask (a,b); } +static FORCEINLINE __vec16_i1 __unsigned_less_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmplt_epu32_mask (a,b); } +static FORCEINLINE __vec16_i1 __signed_less_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmplt_epi32_mask (a,b); } +static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpgt_epu32_mask (a,b); } +static FORCEINLINE __vec16_i1 __signed_greater_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpgt_epi32_mask (a,b); } -static FORCEINLINE __vec16_i32 __mul(__vec16_i32 a, __vec16_i32 b) { - return _mm512_mullo_epi32(a, b); -} +static FORCEINLINE __vec16_i1 __equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return 
_mm512_mask_cmpeq_epi32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __not_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpneq_epi32_mask(m,a,b); } +static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmple_epu32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __signed_less_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmple_epi32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpge_epu32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __signed_greater_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpge_epi32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __unsigned_less_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmplt_epu32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __signed_less_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmplt_epi32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpgt_epu32_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __signed_greater_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpgt_epi32_mask (m,a,b); } -static FORCEINLINE __vec16_i32 __udiv(__vec16_i32 a, __vec16_i32 b) { - return _mm512_div_epu32(a, b); -} +static FORCEINLINE __vec16_i32 __select(__vec16_i1 mask, __vec16_i32 a, __vec16_i32 b) { return _mm512_mask_mov_epi32(b, mask, a); } +static FORCEINLINE __vec16_i32 __select( bool cond, __vec16_i32 a, __vec16_i32 b) { return cond ? 
a : b; } -static FORCEINLINE __vec16_i32 __sdiv(__vec16_i32 a, __vec16_i32 b) { - return _mm512_div_epi32(a, b); -} - -static FORCEINLINE __vec16_i32 __urem(__vec16_i32 a, __vec16_i32 b) { - return _mm512_rem_epu32(a, b); -} - -static FORCEINLINE __vec16_i32 __srem(__vec16_i32 a, __vec16_i32 b) { - return _mm512_rem_epi32(a, b); -} - -static FORCEINLINE __vec16_i32 __or(__vec16_i32 a, __vec16_i32 b) { - return _mm512_or_epi32(a, b); -} - -static FORCEINLINE __vec16_i32 __and(__vec16_i32 a, __vec16_i32 b) { - return _mm512_and_epi32(a, b); -} - -static FORCEINLINE __vec16_i32 __xor(__vec16_i32 a, __vec16_i32 b) { - return _mm512_xor_epi32(a, b); -} - -static FORCEINLINE __vec16_i32 __shl(__vec16_i32 a, __vec16_i32 b) { - return _mm512_sllv_epi32(a, b); -} - -static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, __vec16_i32 b) { - return _mm512_srlv_epi32(a, b); -} - -static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, __vec16_i32 b) { - return _mm512_srav_epi32(a, b); -} - -static FORCEINLINE __vec16_i32 __shl(__vec16_i32 a, int32_t n) { - return _mm512_slli_epi32(a, n); -} - -static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, int32_t n) { - return _mm512_srli_epi32(a, n); -} - -static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, int32_t n) { - return _mm512_srai_epi32(a, n); -} - -static FORCEINLINE __vec16_i1 __equal_i32(const __vec16_i32 &a, const __vec16_i32 &b) { - return _mm512_cmpeq_epi32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __equal_i32_and_mask(const __vec16_i32 &a, const __vec16_i32 &b, - __vec16_i1 m) { - return _mm512_mask_cmpeq_epi32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __not_equal_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmpneq_epi32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __not_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmpneq_epi32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmple_epu32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmple_epu32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __signed_less_equal_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmple_epi32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __signed_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmple_epi32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmpge_epu32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmpge_epu32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __signed_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmpge_epi32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __signed_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmpge_epi32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __unsigned_less_than_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmplt_epu32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __unsigned_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmplt_epu32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __signed_less_than_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmplt_epi32_mask(a, b); -} - -static FORCEINLINE 
__vec16_i1 __signed_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmplt_epi32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmpgt_epu32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmpgt_epu32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __signed_greater_than_i32(__vec16_i32 a, __vec16_i32 b) { - return _mm512_cmpgt_epi32_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __signed_greater_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { - return _mm512_mask_cmpgt_epi32_mask(m, a, b); -} - -static FORCEINLINE __vec16_i32 __select(__vec16_i1 mask, - __vec16_i32 a, __vec16_i32 b) { - return _mm512_mask_mov_epi32(b.v, mask, a.v); -} - -static FORCEINLINE __vec16_i32 __select(bool cond, __vec16_i32 a, __vec16_i32 b) { - return cond ? a : b; -} - -static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int index) { //uint32_t index) { - return ((int32_t *)&v)[index]; -} - -static FORCEINLINE void __insert_element(__vec16_i32 *v, uint32_t index, int32_t val) { - ((int32_t *)v)[index] = val; -} +static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int32_t index) { return v[index]; } +static FORCEINLINE void __insert_element (__vec16_i32 *v, uint32_t index, int32_t val) { (*v)[index] = val; } template RetVecType __smear_i32(int32_t i); -template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { - return _mm512_set1_epi32(i); -} +template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); } static const __vec16_i32 __ispc_one = __smear_i32<__vec16_i32>(1); static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32); @@ -929,66 +663,56 @@ static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1); static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); template RetVecType __setzero_i32(); -template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { - return _mm512_setzero_epi32(); -} +template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { return _mm512_setzero_epi32(); } template RetVecType __undef_i32(); -template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { - return __vec16_i32(); +template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { return __vec16_i32(); } + +static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) { return _mm512_mask_permutevar_epi32(v, 0xFFFF, _mm512_set1_epi32(index), v); } + +static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index) +{ + __vec16_i32 idx = __smear_i32<__vec16_i32>(index); + __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0xF)); + return _mm512_mask_permutevar_epi32(v, 0xFFFF, shuffle, v); } -static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) { - int32_t val = __extract_element(v, index & 0xf); - return _mm512_set1_epi32(val); +static FORCEINLINE __vec16_i32 __shuffle_i32 (__vec16_i32 v, __vec16_i32 index) +{ + return _mm512_mask_permutevar_epi32(v, 0xFFFF, __and(index, __smear_i32<__vec16_i32>(0xF)), v); +} +static FORCEINLINE __vec16_i32 __shuffle2_i32(__vec16_i32 v0, __vec16_i32 v1, __vec16_i32 index) +{ + const __vec16_i1 mask = __signed_less_than_i32(index, 
__smear_i32<__vec16_i32>(0x10)); + index = __and(index, __smear_i32<__vec16_i32>(0xF)); + __vec16_i32 ret = __undef_i32<__vec16_i32>(); + ret = _mm512_mask_permutevar_epi32(ret, mask, index, v0); + ret = _mm512_mask_permutevar_epi32(ret, __not(mask), index, v1); + return ret; } -#if 0 /* evghenii::doesn't work */ -static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index) { - __vec16_i32 idx = __smear_i32<__vec16_i32>(index); - __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0x7)); - return _mm512_mask_permutevar_epi32(v, 0xffff, shuffle, v); -} -#else -ROTATE(__vec16_i32, i32, int32_t) -#endif - -static FORCEINLINE __vec16_i32 __shuffle_i32(__vec16_i32 v, __vec16_i32 index) { - return _mm512_mask_permutevar_epi32(v, 0xffff, index, v); -} -SHUFFLE2(__vec16_i32, i32, int32_t) /* evghenii::to implement */ - -template static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) { +template static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - return _mm512_load_epi32(p); + return __load<64>(p); #else - __vec16_i32 v; - v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - v = _mm512_extloadunpackhi_epi32(v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - return v; + __vec16_i32 v; + v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_extloadunpackhi_epi32(v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + return v; #endif } - -template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 v) { +template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 v) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - _mm512_store_epi32(p, v); + __store<64>(p,v); #else - _mm512_extpackstorelo_epi32(p, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_epi32((uint8_t*)p+64, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_epi32( p, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); #endif } -#if 0 -template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) { - return _mm512_load_epi32(p); -} -template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) { - _mm512_store_epi32(p, v); -} -#endif -#endif /* evghenii::int32 */ - /////////////////////////////////////////////////////////////////////////// // int64 // evghenii::int64 From 57f019a6e02db5b90f9310b1f19114c0c93926ee Mon Sep 17 00:00:00 2001 From: evghenii Date: Fri, 4 Oct 2013 13:39:15 +0300 Subject: [PATCH 118/124] cleaned int64 added fails info --- examples/intrinsics/knc-i1x16.h | 162 +++++++++++++------------------- 1 file changed, 67 insertions(+), 95 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index aae4be57..934d90b6 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -715,19 +715,18 @@ template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 /////////////////////////////////////////////////////////////////////////// // int64 -// evghenii::int64 +/////////////////////////////////////////////////////////////////////////// -#if 0 -BINARY_OP(__vec16_i64, __add, +) -BINARY_OP(__vec16_i64, __sub, -) -BINARY_OP(__vec16_i64, __mul, *) -#else -static FORCEINLINE __vec16_i64 __add(__vec16_i64 a, __vec16_i64 b) { +static FORCEINLINE __vec16_i64 __add(__vec16_i64 a, __vec16_i64 b) +{ return 
__vec16_i64(_mm512_add_epi64(a.v1, b.v1), _mm512_add_epi64(a.v2,b.v2)); } -static FORCEINLINE __vec16_i64 __sub(__vec16_i64 _a, __vec16_i64 _b) { -// return __vec16_i64(_mm512_sub_epi64(_a.v1, _b.v1), _mm512_sub_epi64(_a.v2,_b.v2)); +static FORCEINLINE __vec16_i64 __sub(__vec16_i64 _a, __vec16_i64 _b) +{ + // this intrinsic doesn't exist :S + // return __vec16_i64(_mm512_sub_epi64(_a.v1, _b.v1), _mm512_sub_epi64(_a.v2,_b.v2)); + // use knc.h implementation const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); __vec16_i64 ret; @@ -745,34 +744,30 @@ static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &_b _mm512_mulhi_epi32(a.v, b.v_lo))).cvt2zmm(); } -#if __ICC_VERSION == 1400 -static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_mullox_epi64(a.v1, b.v1), _mm512_mullox_epi64(a.v2,b.v2)); -} +static FORCEINLINE __vec16_i64 __mul(__vec16_i64 _a, __vec16_i64 _b) +{ +#if __ICC >= 1400 + return __vec16_i64(_mm512_mullox_epi64(_a.v1,_b.v1), _mm512_mullox_epi64(_a.v2,_b.v2)); #else -BINARY_OP(__vec16_i64, __mul, *) + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + __vec16_i32 lo = _mm512_mullo_epi32(a.v_lo,b.v_lo); + __vec16_i32 hi_m1 = _mm512_mulhi_epi32(a.v_lo, b.v_lo); + __vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo); + __vec16_i32 hi_m3 = _mm512_mullo_epi32(a.v_lo, b.v_hi); + __mmask16 carry = 0; + __vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m1, &carry); + __vec16_i32 hi = _mm512_adc_epi32(hi_m3, carry, hi_p23, &carry); + return __vec16_i64(hi,lo).cvt2zmm(); #endif -#endif - -#if 0 -BINARY_OP(__vec16_i64, __or, |) -BINARY_OP(__vec16_i64, __and, &) -BINARY_OP(__vec16_i64, __xor, ^) -BINARY_OP(__vec16_i64, __shl, <<) -#else -static FORCEINLINE __vec16_i64 __or(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_or_epi64(a.v1, b.v1), _mm512_or_epi64(a.v2, b.v2)); } -static FORCEINLINE __vec16_i64 __and(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_and_epi64(a.v1, b.v1), _mm512_and_epi64(a.v2, b.v2)); -} +static FORCEINLINE __vec16_i64 __or (__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_or_epi64 (a.v1, b.v1), _mm512_or_epi64 (a.v2, b.v2)); } +static FORCEINLINE __vec16_i64 __and(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_and_epi64(a.v1, b.v1), _mm512_and_epi64(a.v2, b.v2)); } +static FORCEINLINE __vec16_i64 __xor(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_xor_epi64(a.v1, b.v1), _mm512_xor_epi64(a.v2, b.v2)); } -static FORCEINLINE __vec16_i64 __xor(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_xor_epi64(a.v1, b.v1), _mm512_xor_epi64(a.v2, b.v2)); -} - -static FORCEINLINE __vec16_i64 __shl(__vec16_i64 _a, __vec16_i64 _b) { +static FORCEINLINE __vec16_i64 __shl(__vec16_i64 _a, __vec16_i64 _b) +{ const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); __vec16_i32 xfer = _mm512_srlv_epi32(a.v_lo, _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); @@ -780,35 +775,16 @@ static FORCEINLINE __vec16_i64 __shl(__vec16_i64 _a, __vec16_i64 _b) { __vec16_i32 lo = _mm512_sllv_epi32(a.v_lo, b.v_lo); return __vec16_i64(hi,lo).cvt2zmm(); } -#endif -#if 0 -BINARY_OP_CAST(__vec16_i64, uint64_t, __udiv, /) -BINARY_OP_CAST(__vec16_i64, int64_t, __sdiv, /) -#else -static FORCEINLINE __vec16_i64 __udiv(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_div_epu64(a.v1,b.v1), _mm512_div_epu64(a.v2,b.v2)); -} -static FORCEINLINE __vec16_i64 __sdiv(__vec16_i64 a, __vec16_i64 b) { - return 
__vec16_i64(_mm512_div_epi64(a.v1,b.v1), _mm512_div_epi64(a.v2,b.v2)); -} -#endif +static FORCEINLINE __vec16_i64 __udiv(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_div_epu64(a.v1,b.v1), _mm512_div_epu64(a.v2,b.v2)); } +static FORCEINLINE __vec16_i64 __sdiv(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_div_epi64(a.v1,b.v1), _mm512_div_epi64(a.v2,b.v2)); } -#if 0 -BINARY_OP_CAST(__vec16_i64, uint64_t, __urem, %) -BINARY_OP_CAST(__vec16_i64, int64_t, __srem, %) -#else -static FORCEINLINE __vec16_i64 __urem(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_rem_epu64(a.v1,b.v1), _mm512_rem_epu64(a.v2,b.v2)); -} -static FORCEINLINE __vec16_i64 __srem(__vec16_i64 a, __vec16_i64 b) { - return __vec16_i64(_mm512_rem_epi64(a.v1,b.v1), _mm512_rem_epi64(a.v2,b.v2)); -} -#endif +static FORCEINLINE __vec16_i64 __urem(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_rem_epu64(a.v1,b.v1), _mm512_rem_epu64(a.v2,b.v2)); } +static FORCEINLINE __vec16_i64 __srem(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_rem_epi64(a.v1,b.v1), _mm512_rem_epi64(a.v2,b.v2)); } #if 1 BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>) -#else /* evghenii::fails idiv.ispc */ +#else /* knc::fails ./tests/idiv.ispc */ static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 _a, __vec16_i64 _b) { const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); @@ -829,7 +805,7 @@ static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 _a, __vec16_i64 _b) { #if 1 BINARY_OP_CAST(__vec16_i64, int64_t, __ashr, >>) -#else /* evghenii::fails idiv.ispc */ +#else /* knc::fails ./tests/idiv.ispc */ static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 _a, __vec16_i64 _b) { const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); @@ -848,31 +824,30 @@ SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<) #if 1 CMP_OP(__vec16_i64, i64, int64_t, __equal, ==) -#else /* evghenii::fails ./tests/reduce-equal-8.ispc, some other test hang... */ -static FORCEINLINE __vec16_i1 __equal_i64(const __vec16_i64 &_a, const __vec16_i64 &_b) { +CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) +#else /* knc::fails ./tests/reduce-equal-8.ispc , knc::hangs foreach-unique-6.ispc funcptr-null-[2-6].ispc funcptr-uniform-9.ispc funcptr-varying-5.ispc */ +static FORCEINLINE __vec16_i1 __equal_i64(__vec16_i64 _a, __vec16_i64 _b) +{ const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); const __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo); return _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); } -static FORCEINLINE __vec16_i1 __not_equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) { - return __not(__equal_i64(a,b)); -} -#endif - -#if 1 -CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) -#else /* evghenii::fails ./tests/reduce-equal-8.ispc, some other test hang... 
*/ -static FORCEINLINE __vec16_i1 __equal_i64_and_mask(const __vec16_i64 &_a, const __vec16_i64 &_b, - __vec16_i1 mask) { +static FORCEINLINE __vec16_i1 __equal_i64_and_mask(__vec16_i64 _a, __vec16_i64 _b, __vec16_i1 mask) +{ const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo); __mmask16 full_match = _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); return _mm512_kand(full_match, (__mmask16)mask); } -static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(const __vec16_i64 &a, const __vec16_i64 &b, - __vec16_i1 mask) { + +static FORCEINLINE __vec16_i1 __not_equal_i64(__vec16_i64 a, __vec16_i64 b) +{ + return __not(__equal_i64(a,b)); +} +static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(__vec16_i64 a, __vec16_i64 b, __vec16_i1 mask) +{ return __and(__not(__equal_i64(a,b)), mask); } #endif @@ -888,46 +863,39 @@ CMP_OP(__vec16_i64, i64, int64_t, __signed_less_than, <) CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_than, >) CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >) -#if 0 -SELECT(__vec16_i64) -#else -static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask, - __vec16_i64 a, __vec16_i64 b) { +static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask, __vec16_i64 a, __vec16_i64 b) +{ __vec16_i64 ret; ret.v_hi = _mm512_mask_mov_epi64(b.v_hi, mask, a.v_hi); ret.v_lo = _mm512_mask_mov_epi64(b.v_lo, mask >> 8, a.v_lo); return ret; } -#endif INSERT_EXTRACT(__vec16_i64, int64_t) -#if 0 -SMEAR(__vec16_i64, i64, int64_t) -SETZERO(__vec16_i64, i64) -UNDEF(__vec16_i64, i64) -BROADCAST(__vec16_i64, i64, int64_t) -#else + template RetVecType __smear_i64(const int64_t &l); -template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) { return __vec16_i64(_mm512_set1_epi64(l), _mm512_set1_epi64(l)); } +template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) { return __vec16_i64(_mm512_set1_epi64(l), _mm512_set1_epi64(l)); } template RetVecType __setzero_i64(); -template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { return __vec16_i64(_mm512_setzero_epi32(), _mm512_setzero_epi32()); } +template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { return __vec16_i64(_mm512_setzero_epi32(), _mm512_setzero_epi32()); } template RetVecType __undef_i64(); -template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { return __vec16_i64(_mm512_undefined_epi32(), _mm512_undefined_epi32()); } +template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { return __vec16_i64(_mm512_undefined_epi32(), _mm512_undefined_epi32()); } -static FORCEINLINE __vec16_i64 __broadcast_i64(__vec16_i64 v, int index) { - int64_t val = __extract_element(v, index & 0xf); - return __smear_i64<__vec16_i64>(val); +static FORCEINLINE __vec16_i64 __broadcast_i64(__vec16_i64 v, int index) +{ + int64_t val = __extract_element(v, index & 0xf); + return __smear_i64<__vec16_i64>(val); } -#endif -ROTATE(__vec16_i64, i64, int64_t) + +ROTATE (__vec16_i64, i64, int64_t) SHUFFLES(__vec16_i64, i64, int64_t) -#if 0 -LOAD_STORE(__vec16_i64, int64_t) -#else + template static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return __load<128>(p); +#else __vec16_i32 v1; __vec16_i32 v2; v2 = _mm512_extloadunpacklo_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); @@ -935,6 +903,7 @@ template static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) v1 = _mm512_extloadunpacklo_epi32(v1, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); v1 
= _mm512_extloadunpackhi_epi32(v1, (uint8_t*)p+128, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); return __vec16_i64(v2,v1); +#endif } template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) @@ -948,12 +917,16 @@ template <> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { template static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64 v) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY + return __store<128>(p,v); +#else __m512i v1 = v.v2; __m512i v2 = v.v1; _mm512_extpackstorelo_epi32(p, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); _mm512_extpackstorehi_epi32((uint8_t*)p+64, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); _mm512_extpackstorelo_epi32((uint8_t*)p+64, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); _mm512_extpackstorehi_epi32((uint8_t*)p+128, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +#endif } template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) @@ -965,7 +938,6 @@ template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) } template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); } -#endif #if 0 /* evghenii::float */ From 8a6789ef61e006866ead9e0c5d0cfa1db39cd8c5 Mon Sep 17 00:00:00 2001 From: evghenii Date: Fri, 4 Oct 2013 14:11:09 +0300 Subject: [PATCH 119/124] cleaned float added fails info --- examples/intrinsics/knc-i1x16.h | 348 ++++++++++---------------------- 1 file changed, 107 insertions(+), 241 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 934d90b6..87f54dfa 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -940,217 +940,113 @@ template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); } -#if 0 /* evghenii::float */ -/////////////////////////////////////////////////////////////////////////// -// float - -BINARY_OP(__vec16_f, __add, +) -BINARY_OP(__vec16_f, __sub, -) -BINARY_OP(__vec16_f, __mul, *) -BINARY_OP(__vec16_f, __div, /) - -CMP_OP(__vec16_f, float, float, __equal, ==) -CMP_OP(__vec16_f, float, float, __not_equal, !=) -CMP_OP(__vec16_f, float, float, __less_than, <) -CMP_OP(__vec16_f, float, float, __less_equal, <=) -CMP_OP(__vec16_f, float, float, __greater_than, >) -CMP_OP(__vec16_f, float, float, __greater_equal, >=) - -static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) { - __vec16_i1 ret; - ret.v = 0; - for (int i = 0; i < 16; ++i) - ret.v |= ((a[i] == a[i]) && (b[i] == b[i])) ? (1 << i) : 0; - return ret; -} - -static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) { - __vec16_i1 ret; - ret.v = 0; - for (int i = 0; i < 16; ++i) - ret.v |= ((a[i] != a[i]) || (b[i] != b[i])) ? 
(1 << i) : 0; - return ret; -} - -#if 0 - case Instruction::FRem: intrinsic = "__frem"; break; -#endif - -SELECT(__vec16_f) -INSERT_EXTRACT(__vec16_f, float) -SMEAR(__vec16_f, float, float) -SETZERO(__vec16_f, float) -UNDEF(__vec16_f, float) -BROADCAST(__vec16_f, float, float) -ROTATE(__vec16_f, float, float) -SHUFFLES(__vec16_f, float, float) -LOAD_STORE(__vec16_f, float) -#else /* evghenii::float */ - /////////////////////////////////////////////////////////////////////////// // float /////////////////////////////////////////////////////////////////////////// -static FORCEINLINE __vec16_f __add(__vec16_f a, __vec16_f b) { - return _mm512_add_ps(a, b); -} +static FORCEINLINE __vec16_f __add(__vec16_f a, __vec16_f b) { return _mm512_add_ps(a,b); } +static FORCEINLINE __vec16_f __sub(__vec16_f a, __vec16_f b) { return _mm512_sub_ps(a,b); } +static FORCEINLINE __vec16_f __mul(__vec16_f a, __vec16_f b) { return _mm512_mul_ps(a,b); } +static FORCEINLINE __vec16_f __div(__vec16_f a, __vec16_f b) { return _mm512_div_ps(a,b); } -static FORCEINLINE __vec16_f __sub(__vec16_f a, __vec16_f b) { - return _mm512_sub_ps(a, b); -} +static FORCEINLINE __vec16_i1 __equal_float (__vec16_f a, __vec16_f b) { return _mm512_cmpeq_ps_mask (a,b); } +static FORCEINLINE __vec16_i1 __not_equal_float (__vec16_f a, __vec16_f b) { return _mm512_cmpneq_ps_mask(a,b); } +static FORCEINLINE __vec16_i1 __less_than_float (__vec16_f a, __vec16_f b) { return _mm512_cmplt_ps_mask (a,b); } +static FORCEINLINE __vec16_i1 __less_equal_float (__vec16_f a, __vec16_f b) { return _mm512_cmple_ps_mask (a,b); } +static FORCEINLINE __vec16_i1 __greater_than_float (__vec16_f a, __vec16_f b) { return _mm512_cmp_ps_mask (a,b,_CMP_GT_OS); } +static FORCEINLINE __vec16_i1 __greater_equal_float(__vec16_f a, __vec16_f b) { return _mm512_cmp_ps_mask (a,b,_CMP_GE_OS); } -static FORCEINLINE __vec16_f __mul(__vec16_f a, __vec16_f b) { - return _mm512_mul_ps(a, b); -} +static FORCEINLINE __vec16_i1 __equal_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmpeq_ps_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __not_equal_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmpneq_ps_mask(m,a,b); } +static FORCEINLINE __vec16_i1 __less_than_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmplt_ps_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __less_equal_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmple_ps_mask (m,a,b); } +static FORCEINLINE __vec16_i1 __greater_than_float_and_mask (__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmp_ps_mask (m,a,b,_CMP_GT_OS); } +static FORCEINLINE __vec16_i1 __greater_equal_float_and_mask(__vec16_f a, __vec16_f b, __vec16_i1 m) { return _mm512_mask_cmp_ps_mask (m,a,b,_CMP_GE_OS); } -static FORCEINLINE __vec16_f __div(__vec16_f a, __vec16_f b) { - return _mm512_div_ps(a, b); -} +static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) { return _mm512_cmpord_ps_mask (a,b); } +static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) { return _mm512_cmpunord_ps_mask(a,b); } +static FORCEINLINE __vec16_f __select(__vec16_i1 mask, __vec16_f a, __vec16_f b) { return _mm512_mask_mov_ps(b, mask, a); } +static FORCEINLINE __vec16_f __select( bool cond, __vec16_f a, __vec16_f b) { return cond ? 
a : b; } -static FORCEINLINE __vec16_i1 __equal_float(__vec16_f a, __vec16_f b) { - return _mm512_cmpeq_ps_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __equal_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { - return _mm512_mask_cmpeq_ps_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __not_equal_float(__vec16_f a, __vec16_f b) { - return _mm512_cmpneq_ps_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __not_equal_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { - return _mm512_mask_cmpneq_ps_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __less_than_float(__vec16_f a, __vec16_f b) { - return _mm512_cmplt_ps_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __less_than_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { - return _mm512_mask_cmplt_ps_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __less_equal_float(__vec16_f a, __vec16_f b) { - return _mm512_cmple_ps_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __less_equal_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { - return _mm512_mask_cmple_ps_mask(m, a, b); -} - -static FORCEINLINE __vec16_i1 __greater_than_float(__vec16_f a, __vec16_f b) { -// return _mm512_cmpnle_ps_mask(a, b); - return _mm512_cmp_ps_mask(a, b,_CMP_GT_OS); -} - -static FORCEINLINE __vec16_i1 __greater_than_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { -// return _mm512_mask_cmpnle_ps_mask(m, a, b); - return _mm512_mask_cmp_ps_mask(m,a, b,_CMP_GT_OS); -} - -static FORCEINLINE __vec16_i1 __greater_equal_float(__vec16_f a, __vec16_f b) { -// return _mm512_cmpnlt_ps_mask(a, b); - return _mm512_cmp_ps_mask(a, b,_CMP_GE_OS); -} - -static FORCEINLINE __vec16_i1 __greater_equal_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { -// return _mm512_mask_cmpnlt_ps_mask(m, a, b); - return _mm512_mask_cmp_ps_mask(m,a, b,_CMP_GE_OS); -} - -static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) { - return _mm512_cmpord_ps_mask(a, b); -} - -static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) { - return _mm512_cmpunord_ps_mask(a, b); -} - -static FORCEINLINE __vec16_f __select(__vec16_i1 mask, __vec16_f a, __vec16_f b) { - return _mm512_mask_mov_ps(b, mask, a); -} - -static FORCEINLINE __vec16_f __select(bool cond, __vec16_f a, __vec16_f b) { - return cond ? 
a : b; -} - -static FORCEINLINE float __extract_element(__vec16_f v, uint32_t index) { - return v[index]; - // return ((float *)&v)[index]; -} - -static FORCEINLINE void __insert_element(__vec16_f *v, uint32_t index, float val) { - (*v)[index] = val; -// ((float *)v)[index] = val; -} +static FORCEINLINE float __extract_element(__vec16_f v, uint32_t index) { return v[index]; } +static FORCEINLINE void __insert_element(__vec16_f *v, uint32_t index, float val) { (*v)[index] = val; } template RetVecType __smear_float(float f); -template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { - return _mm512_set_1to16_ps(f); -} +template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { return _mm512_set_1to16_ps(f); } template RetVecType __setzero_float(); -template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { - return _mm512_setzero_ps(); -} +template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { return _mm512_setzero_ps(); } template RetVecType __undef_float(); -template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() { - return __vec16_f(); -} +template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() { return __vec16_f(); } -static FORCEINLINE __vec16_f __broadcast_float(__vec16_f v, int index) { - float val = __extract_element(v, index & 0xf); - return _mm512_set1_ps(val); +static FORCEINLINE __vec16_f __broadcast_float(__vec16_f _v, int index) +{ + const __vec16_i32 v = _mm512_castps_si512(_v); + return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(v, 0xFFFF, _mm512_set1_epi32(index), v)); } -#if 1 -static FORCEINLINE __vec16_f __shuffle_float(__vec16_f v, __vec16_i32 index) { - return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v))); +static FORCEINLINE __vec16_f __rotate_float(__vec16_f _v, int index) +{ + const __vec16_i32 v = _mm512_castps_si512(_v); + const __vec16_i32 idx = __smear_i32<__vec16_i32>(index); + const __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0xF)); + return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(v, 0xFFFF, shuffle, v)); +} +static FORCEINLINE __vec16_f __shuffle_float(__vec16_f v, __vec16_i32 index) +{ + return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v))); +} +static FORCEINLINE __vec16_f __shuffle2_float(__vec16_f _v0, __vec16_f _v1, __vec16_i32 index) +{ + const __vec16_i32 v0 = _mm512_castps_si512(_v0); + const __vec16_i32 v1 = _mm512_castps_si512(_v1); + const __vec16_i1 mask = __signed_less_than_i32(index, __smear_i32<__vec16_i32>(0x10)); + index = __and(index, __smear_i32<__vec16_i32>(0xF)); + __vec16_i32 ret = __undef_i32<__vec16_i32>(); + ret = _mm512_mask_permutevar_epi32(ret, mask, index, v0); + ret = _mm512_mask_permutevar_epi32(ret, __not(mask), index, v1); + return _mm512_castsi512_ps(ret); } -#endif -ROTATE(__vec16_f, float, float) -SHUFFLE2(__vec16_f, float, float) -template static FORCEINLINE __vec16_f __load(const __vec16_f *p) { +template static FORCEINLINE __vec16_f __load(const __vec16_f *p) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - return _mm512_load_ps(p); + return __load<64>(p); #else - __vec16_f v; - v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - v = _mm512_extloadunpackhi_ps(v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - return v; + __vec16_f v; + v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + v = 
_mm512_extloadunpackhi_ps(v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + return v; #endif } -template static FORCEINLINE void __store(__vec16_f *p, __vec16_f v) { +template static FORCEINLINE void __store(__vec16_f *p, __vec16_f v) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - _mm512_store_ps(p, v); + __store<64>(p,v); #else - _mm512_extpackstorelo_ps( p, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_ps((uint8_t*)p+64, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_ps( p, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_ps((uint8_t*)p+64, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); #endif } -#if 0 -template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) { - _mm512_store_ps(p, v); -} -template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) { +#if 0 /* knc::fails ./tests/gs-improve-progindex.ispc with segfault */ +template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) +{ return _mm512_load_ps(p); } +/* this one doesn't fail but it is commented out for completenes, no aligned load/stores */ +template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) +{ + _mm512_store_ps(p, v); +} #endif -#endif /* evghenii::float */ +/******** math ******/ +/*** float ***/ static FORCEINLINE float __exp_uniform_float(float v) { return expf(v);} static FORCEINLINE __vec16_f __exp_varying_float(__vec16_f v) { return _mm512_exp_ps(v); } @@ -1160,6 +1056,18 @@ static FORCEINLINE __vec16_f __log_varying_float(__vec16_f v) { return _mm512_lo static FORCEINLINE float __pow_uniform_float(float a, float b) { return powf(a, b);} static FORCEINLINE __vec16_f __pow_varying_float(__vec16_f a, __vec16_f b) { return _mm512_pow_ps(a,b); } +/*** double ***/ +static FORCEINLINE double __exp_uniform_double(double v) { return exp(v);} +static FORCEINLINE __vec16_d __exp_varying_double(__vec16_d v) { return __vec16_d(_mm512_exp_pd(v.v1),_mm512_exp_pd(v.v2)); } + +static FORCEINLINE double __log_uniform_double(double v) { return log(v);} +static FORCEINLINE __vec16_d __log_varying_double(__vec16_d v) { return __vec16_d(_mm512_log_pd(v.v1),_mm512_log_pd(v.v2)); } + +static FORCEINLINE double __pow_uniform_double(double a, double b) { return pow(a,b);} +static FORCEINLINE __vec16_d __pow_varying_double(__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_pow_pd(a.v1,b.v1),_mm512_pow_pd(a.v2,b.v2)); } + +/******** bitcast ******/ + static FORCEINLINE int __intbits(float v) { union { float f; @@ -1178,8 +1086,11 @@ static FORCEINLINE float __floatbits(int v) { return u.f; } -/* source : - * http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion */ +/////////////////////////////////////////////////////////////////////////// +// half<->float : this one passes the tests +// source : +// http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion +/////////////////////////////////////////////////////////////////////////// class Float16Compressor { union Bits @@ -1252,81 +1163,36 @@ class Float16Compressor } }; -static FORCEINLINE float __half_to_float_uniform(int16_t h) { -#if 0 - static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift - - int32_t o = ((int32_t)(h & 0x7fff)) << 13; // exponent/mantissa bits - uint32_t exp = shifted_exp & o; // just the exponent - o += (127 - 15) << 23; // exponent adjust - - // handle exponent special cases - if (exp == shifted_exp) // Inf/NaN? 
- o += (128 - 16) << 23; // extra exp adjust - else if (exp == 0) { // Zero/Denormal? - o += 1 << 23; // extra exp adjust - o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize - } - - o |= ((int32_t)(h & 0x8000)) << 16; // sign bit - return __floatbits(o); -#else +static FORCEINLINE float __half_to_float_uniform(int16_t h) +{ return Float16Compressor::decompress(h); -#endif +} +static FORCEINLINE __vec16_f __half_to_float_varying(__vec16_i16 v) +{ + __vec16_f ret; + for (int i = 0; i < 16; ++i) + ret[i] = __half_to_float_uniform(v[i]); + return ret; } -static FORCEINLINE __vec16_f __half_to_float_varying(__vec16_i16 v) { - __vec16_f ret; - for (int i = 0; i < 16; ++i) - ret[i] = __half_to_float_uniform(v[i]); - return ret; -} - - -static FORCEINLINE int16_t __float_to_half_uniform(float f) { -#if 0 - uint32_t sign_mask = 0x80000000u; - int32_t o; - - int32_t fint = __intbits(f); - int32_t sign = fint & sign_mask; - fint ^= sign; - - int32_t f32infty = 255 << 23; - o = (fint > f32infty) ? 0x7e00 : 0x7c00; - - // (De)normalized number or zero - // update fint unconditionally to save the blending; we don't need it - // anymore for the Inf/NaN case anyway. - const uint32_t round_mask = ~0xfffu; - const int32_t magic = 15 << 23; - const int32_t f16infty = 31 << 23; - - int32_t fint2 = __intbits(__floatbits(fint & round_mask) * __floatbits(magic)) - round_mask; - fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed - - if (fint < f32infty) - o = fint2 >> 13; // Take the bits! - - return (o | (sign >> 16)); -#else +static FORCEINLINE int16_t __float_to_half_uniform(float f) +{ return Float16Compressor::compress(f); -#endif } - - -static FORCEINLINE __vec16_i16 __float_to_half_varying(__vec16_f v) { - __vec16_i16 ret; - for (int i = 0; i < 16; ++i) - ret[i] = __float_to_half_uniform(v[i]); - return ret; +static FORCEINLINE __vec16_i16 __float_to_half_varying(__vec16_f v) +{ + __vec16_i16 ret; + for (int i = 0; i < 16; ++i) + ret[i] = __float_to_half_uniform(v[i]); + return ret; } #if 0 /* evghenii::double */ /////////////////////////////////////////////////////////////////////////// // double +/////////////////////////////////////////////////////////////////////////// BINARY_OP(__vec16_d, __add, +) BINARY_OP(__vec16_d, __sub, -) From 8b0fc558cb88a1675f903058a1695b70b60efefe Mon Sep 17 00:00:00 2001 From: evghenii Date: Sat, 5 Oct 2013 14:15:33 +0300 Subject: [PATCH 120/124] complete cleaning --- examples/intrinsics/knc-i1x16.h | 1322 ++++++++++--------------------- 1 file changed, 438 insertions(+), 884 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 87f54dfa..e712c969 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -1,5 +1,5 @@ /** - Copyright (c) 2010-2012, Intel Corporation + Copyright (c) 2010-2013, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without @@ -265,6 +265,7 @@ static inline int32_t __extract_element(__vec16_i32, int); /////////////////////////////////////////////////////////////////////////// // macros... 
+/* knc::macro::not used */ #define UNARY_OP(TYPE, NAME, OP) \ static FORCEINLINE TYPE NAME(TYPE v) { \ TYPE ret; \ @@ -273,6 +274,7 @@ static FORCEINLINE TYPE NAME(TYPE v) { \ return ret; \ } +/* knc::macro::used */ #define BINARY_OP(TYPE, NAME, OP) \ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ TYPE ret; \ @@ -281,6 +283,7 @@ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ return ret; \ } +/* knc::macro::used */ #define BINARY_OP_CAST(TYPE, CAST, NAME, OP) \ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ TYPE ret; \ @@ -289,6 +292,7 @@ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ return ret; \ } +/* knc::macro::used */ #define BINARY_OP_FUNC(TYPE, NAME, FUNC) \ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ TYPE ret; \ @@ -297,6 +301,7 @@ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ return ret; \ } +/* knc::macro::used */ #define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP) \ static FORCEINLINE __vec16_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \ __vec16_i1 ret; \ @@ -315,6 +320,7 @@ static FORCEINLINE __vec16_i1 NAME##_##SUFFIX##_and_mask(TYPE a, TYPE b, \ return ret; \ } +/* knc::macro::used */ #define INSERT_EXTRACT(VTYPE, STYPE) \ static FORCEINLINE STYPE __extract_element(VTYPE v, int index) { \ return ((STYPE *)&v)[index]; \ @@ -323,6 +329,7 @@ static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \ ((STYPE *)v)[index] = val; \ } +/* knc::macro::used */ #define LOAD_STORE(VTYPE, STYPE) \ template \ static FORCEINLINE VTYPE __load(const VTYPE *p) { \ @@ -339,24 +346,7 @@ static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \ ptr[i] = v[i]; \ } -#define LOADS(VTYPE, STYPE) \ -template \ -static FORCEINLINE VTYPE __load(const VTYPE *p) { \ - STYPE *ptr = (STYPE *)p; \ - VTYPE ret; \ - for (int i = 0; i < 16; ++i) \ - ret[i] = ptr[i]; \ - return ret; \ -} \ - -#define STORES(VTYPE, STYPE) \ -template \ -static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \ - STYPE *ptr = (STYPE *)p; \ - for (int i = 0; i < 16; ++i) \ - ptr[i] = v[i]; \ -} - +/* knc::macro::used */ #define REDUCE_ADD(TYPE, VTYPE, NAME) \ static FORCEINLINE TYPE NAME(VTYPE v) { \ TYPE ret = v[0]; \ @@ -365,6 +355,7 @@ static FORCEINLINE TYPE NAME(VTYPE v) { \ return ret; \ } +/* knc::macro::used */ #define REDUCE_MINMAX(TYPE, VTYPE, NAME, OP) \ static FORCEINLINE TYPE NAME(VTYPE v) { \ TYPE ret = v[0]; \ @@ -373,6 +364,7 @@ static FORCEINLINE TYPE NAME(VTYPE v) { \ return ret; \ } +/* knc::macro::used */ #define SELECT(TYPE) \ static FORCEINLINE TYPE __select(__vec16_i1 mask, TYPE a, TYPE b) { \ TYPE ret; \ @@ -384,6 +376,7 @@ static FORCEINLINE TYPE __select(bool cond, TYPE a, TYPE b) { \ return cond ? 
a : b; \ } +/* knc::macro::used */ #define SHIFT_UNIFORM(TYPE, CAST, NAME, OP) \ static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \ TYPE ret; \ @@ -392,6 +385,7 @@ static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \ return ret; \ } +/* knc::macro::used */ #define SMEAR(VTYPE, NAME, STYPE) \ template VTYPE __smear_##NAME(STYPE); \ template <> FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \ @@ -401,6 +395,7 @@ template <> FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \ return ret; \ } +/* knc::macro::used */ #define SETZERO(VTYPE, NAME) \ template VTYPE __setzero_##NAME(); \ template <> FORCEINLINE VTYPE __setzero_##NAME() { \ @@ -410,12 +405,14 @@ template <> FORCEINLINE VTYPE __setzero_##NAME() { \ return ret; \ } +/* knc::macro::used */ #define UNDEF(VTYPE, NAME) \ template VTYPE __undef_##NAME(); \ template <> FORCEINLINE VTYPE __undef_##NAME() { \ return VTYPE(); \ } +/* knc::macro::used */ #define BROADCAST(VTYPE, NAME, STYPE) \ static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \ VTYPE ret; \ @@ -424,6 +421,7 @@ static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \ return ret; \ } \ +/* knc::macro::used */ #define ROTATE(VTYPE, NAME, STYPE) \ static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) { \ VTYPE ret; \ @@ -432,6 +430,7 @@ static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) { \ return ret; \ } \ +/* knc::macro::used */ #define SHUFFLES(VTYPE, NAME, STYPE) \ static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec16_i32 index) { \ VTYPE ret; \ @@ -448,16 +447,6 @@ static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec16_i32 index return ret; \ } -#define SHUFFLE2(VTYPE, NAME, STYPE) \ -static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec16_i32 index) { \ - VTYPE ret; \ - for (int i = 0; i < 16; ++i) { \ - int ii = __extract_element(index, i) & 0x1f; \ - ret[i] = (ii < 16) ? 
v0[ii] : v1[ii-16]; \ - } \ - return ret; \ -} - /////////////////////////////////////////////////////////////////////////// INSERT_EXTRACT(__vec1_i8, int8_t) @@ -724,9 +713,9 @@ static FORCEINLINE __vec16_i64 __add(__vec16_i64 a, __vec16_i64 b) static FORCEINLINE __vec16_i64 __sub(__vec16_i64 _a, __vec16_i64 _b) { - // this intrinsic doesn't exist :S - // return __vec16_i64(_mm512_sub_epi64(_a.v1, _b.v1), _mm512_sub_epi64(_a.v2,_b.v2)); - // use knc.h implementation +#if __ICC >= 99999 /* compiler gate, icc >= 99999 will hopefully support _mm512_sub_epi64 */ + return __vec16_i64(_mm512_sub_epi64(_a.v1, _b.v1), _mm512_sub_epi64(_a.v2,_b.v2)); +#else const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); __vec16_i64 ret; @@ -734,6 +723,7 @@ static FORCEINLINE __vec16_i64 __sub(__vec16_i64 _a, __vec16_i64 _b) ret.v_lo = _mm512_subsetb_epi32(a.v_lo, b.v_lo, &borrow); ret.v_hi = _mm512_sbb_epi32 (a.v_hi, borrow, b.v_hi, &borrow); return ret.cvt2zmm(); +#endif } static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &_b) @@ -744,11 +734,15 @@ static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &_b _mm512_mulhi_epi32(a.v, b.v_lo))).cvt2zmm(); } +#if __ICC >= 1400 /* compiler gate, icc >= 14.0.0 support _mm512_mullox_epi64 */ +static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b) +{ + return __vec16_i64(_mm512_mullox_epi64(a.v1,b.v1), _mm512_mullox_epi64(a.v2,b.v2)); +} +#else /* __ICC >= 1400 */ +#if 0 /* knc::fails ./tests/int64-min-1.ispc ./tests/idiv.ispc */ static FORCEINLINE __vec16_i64 __mul(__vec16_i64 _a, __vec16_i64 _b) { -#if __ICC >= 1400 - return __vec16_i64(_mm512_mullox_epi64(_a.v1,_b.v1), _mm512_mullox_epi64(_a.v2,_b.v2)); -#else const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); __vec16_i32 lo = _mm512_mullo_epi32(a.v_lo,b.v_lo); @@ -759,8 +753,11 @@ static FORCEINLINE __vec16_i64 __mul(__vec16_i64 _a, __vec16_i64 _b) __vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m1, &carry); __vec16_i32 hi = _mm512_adc_epi32(hi_m3, carry, hi_p23, &carry); return __vec16_i64(hi,lo).cvt2zmm(); -#endif } +#else +BINARY_OP(__vec16_i64, __mul, *) +#endif +#endif /* __ICC >= 1400 */ static FORCEINLINE __vec16_i64 __or (__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_or_epi64 (a.v1, b.v1), _mm512_or_epi64 (a.v2, b.v2)); } static FORCEINLINE __vec16_i64 __and(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_and_epi64(a.v1, b.v1), _mm512_and_epi64(a.v2, b.v2)); } @@ -782,9 +779,7 @@ static FORCEINLINE __vec16_i64 __sdiv(__vec16_i64 a, __vec16_i64 b) { return __v static FORCEINLINE __vec16_i64 __urem(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_rem_epu64(a.v1,b.v1), _mm512_rem_epu64(a.v2,b.v2)); } static FORCEINLINE __vec16_i64 __srem(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_rem_epi64(a.v1,b.v1), _mm512_rem_epi64(a.v2,b.v2)); } -#if 1 -BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>) -#else /* knc::fails ./tests/idiv.ispc */ +#if 0 /* knc::fails ./tests/idiv.ispc */ static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 _a, __vec16_i64 _b) { const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); @@ -800,12 +795,11 @@ static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 _a, __vec16_i64 _b) { __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo)); return __vec16_i64(hi,lo).cvt2zmm(); } - +#else +BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>) #endif -#if 1 -BINARY_OP_CAST(__vec16_i64, int64_t, __ashr, >>) -#else /* 
knc::fails ./tests/idiv.ispc */ +#if 0 /* knc::fails ./tests/idiv.ispc */ static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 _a, __vec16_i64 _b) { const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); @@ -816,16 +810,15 @@ static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 _a, __vec16_i64 _b) { __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo)); return __vec16_i64(hi,lo).cvt2zmm(); } +#else +BINARY_OP_CAST(__vec16_i64, int64_t, __ashr, >>) #endif SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>) SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>) SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<) -#if 1 -CMP_OP(__vec16_i64, i64, int64_t, __equal, ==) -CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) -#else /* knc::fails ./tests/reduce-equal-8.ispc , knc::hangs foreach-unique-6.ispc funcptr-null-[2-6].ispc funcptr-uniform-9.ispc funcptr-varying-5.ispc */ +#if 0 /* knc::fails ./tests/reduce-equal-8.ispc , knc::hangs foreach-unique-6.ispc funcptr-null-[2-6].ispc funcptr-uniform-9.ispc funcptr-varying-5.ispc */ static FORCEINLINE __vec16_i1 __equal_i64(__vec16_i64 _a, __vec16_i64 _b) { const __vec16_i64 a = _a.cvt2hilo(); @@ -850,6 +843,9 @@ static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(__vec16_i64 a, __vec16_i6 { return __and(__not(__equal_i64(a,b)), mask); } +#else +CMP_OP(__vec16_i64, i64, int64_t, __equal, ==) +CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) #endif @@ -1037,7 +1033,7 @@ template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) { return _mm512_load_ps(p); } -/* this one doesn't fail but it is commented out for completenes, no aligned load/stores */ +/* this one doesn't fail but it is commented out for completeness, no aligned load/stores */ template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) { _mm512_store_ps(p, v); @@ -1189,303 +1185,110 @@ static FORCEINLINE __vec16_i16 __float_to_half_varying(__vec16_f v) } -#if 0 /* evghenii::double */ /////////////////////////////////////////////////////////////////////////// // double /////////////////////////////////////////////////////////////////////////// -BINARY_OP(__vec16_d, __add, +) -BINARY_OP(__vec16_d, __sub, -) -BINARY_OP(__vec16_d, __mul, *) -BINARY_OP(__vec16_d, __div, /) +#define VECOP(OP) __vec16_d(_mm512_##OP(a.v1,b.v1),_mm512_##OP(a.v2,b.v2)) +static FORCEINLINE __vec16_d __add(__vec16_d a, __vec16_d b) { return VECOP(add_pd); } +static FORCEINLINE __vec16_d __sub(__vec16_d a, __vec16_d b) { return VECOP(sub_pd); } +static FORCEINLINE __vec16_d __mul(__vec16_d a, __vec16_d b) { return VECOP(mul_pd); } +static FORCEINLINE __vec16_d __div(__vec16_d a, __vec16_d b) { return VECOP(div_pd); } +#undef VECOP -CMP_OP(__vec16_d, double, double, __equal, ==) -CMP_OP(__vec16_d, double, double, __not_equal, !=) -CMP_OP(__vec16_d, double, double, __less_than, <) -CMP_OP(__vec16_d, double, double, __less_equal, <=) -CMP_OP(__vec16_d, double, double, __greater_than, >) -CMP_OP(__vec16_d, double, double, __greater_equal, >=) +#define CMPOP(OP) _mm512_kmovlhb(_mm512_##OP(a.v1,b.v1),_mm512_##OP(a.v2,b.v2)) +static FORCEINLINE __vec16_i1 __equal_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpeq_pd_mask); } +static FORCEINLINE __vec16_i1 __not_equal_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpneq_pd_mask); } +static FORCEINLINE __vec16_i1 __less_than_double (__vec16_d a, __vec16_d b) { return CMPOP(cmplt_pd_mask); } +static FORCEINLINE __vec16_i1 __less_equal_double (__vec16_d a, __vec16_d b) { return CMPOP(cmple_pd_mask); } +static 
FORCEINLINE __vec16_i1 __greater_than_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpnle_pd_mask); } +static FORCEINLINE __vec16_i1 __greater_equal_double(__vec16_d a, __vec16_d b) { return CMPOP(cmpnlt_pd_mask); } +static FORCEINLINE __vec16_i1 __ordered_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpord_pd_mask); } +static FORCEINLINE __vec16_i1 __unordered_double (__vec16_d a, __vec16_d b) { return CMPOP(cmpunord_pd_mask); } +#undef CMPOP -static FORCEINLINE __vec16_i1 __ordered_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret; - ret.v = 0; - for (int i = 0; i < 16; ++i) - ret.v |= ((a[i] == a[i]) && (b[i] == b[i])) ? (1 << i) : 0; - return ret; +#define CMPOPMASK(OP) _mm512_kmovlhb(_mm512_mask_##OP(m,a.v1,b.v1),_mm512_mask_##OP(_mm512_kswapb(m,m),a.v2,b.v2)) +static FORCEINLINE __vec16_i1 __equal_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmpeq_pd_mask); } +static FORCEINLINE __vec16_i1 __not_equal_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmpneq_pd_mask); } +static FORCEINLINE __vec16_i1 __less_than_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmplt_pd_mask); } +static FORCEINLINE __vec16_i1 __less_equal_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmple_pd_mask); } +static FORCEINLINE __vec16_i1 __greater_than_double_and_mask (__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmpnle_pd_mask); } +static FORCEINLINE __vec16_i1 __greater_equal_double_and_mask(__vec16_d a, __vec16_d b, __vec16_i1 m) { return CMPOPMASK(cmpnlt_pd_mask); } +#undef CMOPMASK + + +static FORCEINLINE __vec16_d __select(__vec16_i1 m, __vec16_d a, __vec16_d b) +{ + return __vec16_d(_mm512_mask_mov_pd(b.v1, m, a.v1), _mm512_mask_mov_pd(b.v2, _mm512_kswapb(m, m), a.v2)); } - -static FORCEINLINE __vec16_i1 __unordered_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret; - ret.v = 0; - for (int i = 0; i < 16; ++i) - ret.v |= ((a[i] != a[i]) || (b[i] != b[i])) ? 
(1 << i) : 0; - return ret; -} - -#if 0 - case Instruction::FRem: intrinsic = "__frem"; break; -#endif - -SELECT(__vec16_d) -INSERT_EXTRACT(__vec16_d, double) -SMEAR(__vec16_d, double, double) -SETZERO(__vec16_d, double) -UNDEF(__vec16_d, double) -BROADCAST(__vec16_d, double, double) -ROTATE(__vec16_d, double, double) -SHUFFLES(__vec16_d, double, double) -LOAD_STORE(__vec16_d, double) -#else /* evghenii::double */ -/////////////////////////////////////////////////////////////////////////// -// double -/////////////////////////////////////////////////////////////////////////// - -static FORCEINLINE __vec16_d __add(__vec16_d a, __vec16_d b) { - __vec16_d ret; - ret.v1 = _mm512_add_pd(a.v1, b.v1); - ret.v2 = _mm512_add_pd(a.v2, b.v2); - return ret; -} - -static FORCEINLINE __vec16_d __sub(__vec16_d a, __vec16_d b) { - __vec16_d ret; - ret.v1 = _mm512_sub_pd(a.v1, b.v1); - ret.v2 = _mm512_sub_pd(a.v2, b.v2); - return ret; -} - -static FORCEINLINE __vec16_d __mul(__vec16_d a, __vec16_d b) { - __vec16_d ret; - ret.v1 = _mm512_mul_pd(a.v1, b.v1); - ret.v2 = _mm512_mul_pd(a.v2, b.v2); - return ret; -} - -static FORCEINLINE __vec16_d __div(__vec16_d a, __vec16_d b) { - __vec16_d ret; - ret.v1 = _mm512_div_pd(a.v1, b.v1); - ret.v2 = _mm512_div_pd(a.v2, b.v2); - return ret; -} - -static FORCEINLINE __vec16_i1 __equal_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpeq_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpeq_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __equal_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_mask_cmpeq_pd_mask(m, a.v1, b.v1); - __vec16_i1 tmp_m = m; - ret2 = _mm512_mask_cmpeq_pd_mask(_mm512_kswapb(tmp_m,tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __not_equal_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpneq_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpneq_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __not_equal_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - __vec16_i1 tmp_m = m; - ret1 = _mm512_mask_cmpneq_pd_mask(m, a.v1, b.v1); - ret2 = _mm512_mask_cmpneq_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __less_than_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmplt_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmplt_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __less_than_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - __vec16_i1 tmp_m = m; - ret1 = _mm512_mask_cmplt_pd_mask(m, a.v1, b.v1); - ret2 = _mm512_mask_cmplt_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __less_equal_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmple_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmple_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __less_equal_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - __vec16_i1 tmp_m = m; - ret1 = _mm512_mask_cmple_pd_mask(m, a.v1, b.v1); - ret2 = _mm512_mask_cmple_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, 
b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __greater_than_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpnle_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpnle_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __greater_than_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - __vec16_i1 tmp_m = m; - ret1 = _mm512_mask_cmpnle_pd_mask(m, a.v1, b.v1); - ret2 = _mm512_mask_cmpnle_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __greater_equal_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpnlt_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpnlt_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __greater_equal_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { - __vec16_i1 ret1; - __vec16_i1 ret2; - __vec16_i1 tmp_m = m; - ret1 = _mm512_mask_cmpnlt_pd_mask(m, a.v1, b.v1); - ret2 = _mm512_mask_cmpnlt_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __ordered_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpord_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpord_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_i1 __unordered_double(__vec16_d a, __vec16_d b) { - __vec16_i1 ret1; - __vec16_i1 ret2; - ret1 = _mm512_cmpunord_pd_mask(a.v1, b.v1); - ret2 = _mm512_cmpunord_pd_mask(a.v2, b.v2); - return _mm512_kmovlhb(ret1, ret2); -} - -static FORCEINLINE __vec16_d __select(__vec16_i1 mask, __vec16_d a, __vec16_d b) { - __vec16_d ret; - __vec16_i1 tmp_m = mask; - ret.v1 = _mm512_mask_mov_pd(b.v1, mask, a.v1); - ret.v2 = _mm512_mask_mov_pd(b.v2, _mm512_kswapb(tmp_m, tmp_m), a.v2); - return ret; -} - - -static FORCEINLINE __vec16_d __select(bool cond, __vec16_d a, __vec16_d b) { +static FORCEINLINE __vec16_d __select(bool cond, __vec16_d a, __vec16_d b) +{ return cond ? 
a : b; } -static FORCEINLINE double __extract_element(__vec16_d v, uint32_t index) { - return ((double *)&v)[index]; -} - -static FORCEINLINE void __insert_element(__vec16_d *v, uint32_t index, double val) { - ((double *)v)[index] = val; -} +static FORCEINLINE double __extract_element(__vec16_d v, uint32_t index) { return v[index]; } +static FORCEINLINE void __insert_element(__vec16_d *v, uint32_t index, double val) { (*v)[index] = val; } template RetVecType __smear_double(double d); -template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { - __vec16_d ret; - ret.v1 = _mm512_set1_pd(d); - ret.v2 = _mm512_set1_pd(d); - return ret; -} +template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { return __vec16_d(_mm512_set1_pd(d), _mm512_set1_pd(d)); } template RetVecType __setzero_double(); -template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { - __vec16_d ret; - ret.v1 = _mm512_setzero_pd(); - ret.v2 = _mm512_setzero_pd(); - return ret; -} +template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { return __vec16_d(_mm512_setzero_pd(), _mm512_setzero_pd()); } template RetVecType __undef_double(); -template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() { - return __vec16_d(); -} +template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() { return __vec16_d(); } -static FORCEINLINE __vec16_d __broadcast_double(__vec16_d v, int index) { - __vec16_d ret; - double val = __extract_element(v, index & 0xf); - ret.v1 = _mm512_set1_pd(val); - ret.v2 = _mm512_set1_pd(val); - return ret; +static FORCEINLINE __vec16_d __broadcast_double(__vec16_d v, int index) +{ + __vec16_d ret; + double val = __extract_element(v, index & 0xf); + ret.v1 = _mm512_set1_pd(val); + ret.v2 = _mm512_set1_pd(val); + return ret; } ROTATE(__vec16_d, double, double) SHUFFLES(__vec16_d, double, double) -template static FORCEINLINE __vec16_d __load(const __vec16_d *p) { - __vec16_d ret; - ret.v1 = _mm512_extloadunpacklo_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - ret.v1 = _mm512_extloadunpackhi_pd(ret.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - ret.v2 = _mm512_extloadunpacklo_pd(ret.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - ret.v2 = _mm512_extloadunpackhi_pd(ret.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - return ret; +template static FORCEINLINE __vec16_d __load(const __vec16_d *p) \ +{ + __vec16_d ret; + ret.v1 = _mm512_extloadunpacklo_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v1 = _mm512_extloadunpackhi_pd(ret.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v2 = _mm512_extloadunpacklo_pd(ret.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + ret.v2 = _mm512_extloadunpackhi_pd(ret.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + return ret; } -template static FORCEINLINE void __store(__vec16_d *p, __vec16_d v) { - _mm512_extpackstorelo_pd(p, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_pd((uint8_t*)p+64, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorelo_pd((uint8_t*)p+64, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_pd((uint8_t*)p+128, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); +template static FORCEINLINE void __store(__vec16_d *p, __vec16_d v) +{ + _mm512_extpackstorelo_pd(p, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_pd((uint8_t*)p+64, v.v2, 
_MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+128, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); } -#if 0 -template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) { - __vec16_d ret; - ret.v1 = _mm512_load_pd(p); - ret.v2 = _mm512_load_pd(((uint8_t*)p)+64); - return ret; +#if 1 +template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) +{ + return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64)); } -template <> static FORCEINLINE __vec16_d __load<128>(const __vec16_d *p) { - return __load<64>(p); -} -template <> static FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) { - _mm512_store_pd(p, v.v1); - _mm512_store_pd(((uint8_t*)p)+64, v.v2); -} -template <> static FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { - __store<64>(p, v); +template <> static FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) +{ + _mm512_store_pd(p, v.v1); + _mm512_store_pd(((uint8_t*)p)+64, v.v2); } +template <> static FORCEINLINE __vec16_d __load <128>(const __vec16_d *p) { return __load<64>(p); } +template <> static FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { __store<64>(p, v); } #endif -#endif /* evghenii::double */ /////////////////////////////////////////////////////////////////////////// // casts +/////////////////////////////////////////////////////////////////////////// +/* knc::macro::used */ #define CAST(TO, STO, FROM, SFROM, FUNC) \ static FORCEINLINE TO FUNC(TO, FROM val) { \ TO ret; \ @@ -1495,13 +1298,13 @@ static FORCEINLINE TO FUNC(TO, FROM val) { \ } // sign extension conversions -#if 1 -CAST(__vec16_i64, int64_t, __vec16_i32, int32_t, __cast_sext) -#else /* evghenii::fails on soa-9 soa-13 soa-10 soa-29 soa-3 ... and others */ +#if 0 /* knc::fails on soa-9 soa-13 soa-10 soa-29 soa-3 ... 
and others */ static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val) { return __vec16_i64(_mm512_srai_epi32(val.v,31), val.v).cvt2zmm(); } +#else +CAST(__vec16_i64, int64_t, __vec16_i32, int32_t, __cast_sext) #endif CAST(__vec16_i64, int64_t, __vec16_i16, int16_t, __cast_sext) CAST(__vec16_i64, int64_t, __vec16_i8, int8_t, __cast_sext) @@ -1509,6 +1312,7 @@ CAST(__vec16_i32, int32_t, __vec16_i16, int16_t, __cast_sext) CAST(__vec16_i32, int32_t, __vec16_i8, int8_t, __cast_sext) CAST(__vec16_i16, int16_t, __vec16_i8, int8_t, __cast_sext) +/* knc::macro::used */ #define CAST_SEXT_I1(TYPE) \ static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) { \ TYPE ret; \ @@ -1522,34 +1326,31 @@ static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) { \ CAST_SEXT_I1(__vec16_i8) CAST_SEXT_I1(__vec16_i16) -#if 0 -CAST_SEXT_I1(__vec16_i32) -#else + +//CAST_SEXT_I1(__vec16_i32) static FORCEINLINE __vec16_i32 __cast_sext(const __vec16_i32 &, const __vec16_i1 &val) { __vec16_i32 ret = _mm512_setzero_epi32(); __vec16_i32 one = _mm512_set1_epi32(-1); return _mm512_mask_mov_epi32(ret, val, one); } -#endif + CAST_SEXT_I1(__vec16_i64) // zero extension -#if 0 -CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext) -#else +// CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext) static FORCEINLINE __vec16_i64 __cast_zext(const __vec16_i64 &, const __vec16_i32 &val) { return __vec16_i64(_mm512_setzero_epi32(), val.v).cvt2zmm(); } -#endif CAST(__vec16_i64, uint64_t, __vec16_i16, uint16_t, __cast_zext) CAST(__vec16_i64, uint64_t, __vec16_i8, uint8_t, __cast_zext) CAST(__vec16_i32, uint32_t, __vec16_i16, uint16_t, __cast_zext) CAST(__vec16_i32, uint32_t, __vec16_i8, uint8_t, __cast_zext) CAST(__vec16_i16, uint16_t, __vec16_i8, uint8_t, __cast_zext) +/* knc::macro::used */ #define CAST_ZEXT_I1(TYPE) \ static FORCEINLINE TYPE __cast_zext(TYPE, __vec16_i1 v) { \ TYPE ret; \ @@ -1560,16 +1361,15 @@ static FORCEINLINE TYPE __cast_zext(TYPE, __vec16_i1 v) { \ CAST_ZEXT_I1(__vec16_i8) CAST_ZEXT_I1(__vec16_i16) -#if 0 -CAST_ZEXT_I1(__vec16_i32) -#else + +//CAST_ZEXT_I1(__vec16_i32) static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1 &val) { - __vec16_i32 ret = _mm512_setzero_epi32(); - __vec16_i32 one = _mm512_set1_epi32(1); - return _mm512_mask_mov_epi32(ret, val, one); + __vec16_i32 ret = _mm512_setzero_epi32(); + __vec16_i32 one = _mm512_set1_epi32(1); + return _mm512_mask_mov_epi32(ret, val, one); } -#endif + CAST_ZEXT_I1(__vec16_i64) // truncations @@ -1581,170 +1381,160 @@ CAST(__vec16_i8, int8_t, __vec16_i32, int32_t, __cast_trunc) CAST(__vec16_i8, int8_t, __vec16_i16, int16_t, __cast_trunc) // signed int to float/double -#if 0 -CAST(__vec16_f, float, __vec16_i8, int8_t, __cast_sitofp) -CAST(__vec16_f, float, __vec16_i16, int16_t, __cast_sitofp) -CAST(__vec16_f, float, __vec16_i32, int32_t, __cast_sitofp) -#else + +//CAST(__vec16_f, float, __vec16_i8, int8_t, __cast_sitofp) static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i8 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +//CAST(__vec16_f, float, __vec16_i16, int16_t, __cast_sitofp) static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i16 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +//CAST(__vec16_f, float, __vec16_i32, int32_t, __cast_sitofp) static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i32 val) {return _mm512_cvtfxpnt_round_adjustepi32_ps(val, 
_MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} -#endif + CAST(__vec16_f, float, __vec16_i64, int64_t, __cast_sitofp) -#if 0 -CAST(__vec16_d, double, __vec16_i8, int8_t, __cast_sitofp) -CAST(__vec16_d, double, __vec16_i16, int16_t, __cast_sitofp) -CAST(__vec16_d, double, __vec16_i32, int32_t, __cast_sitofp) -#else -static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i8 val) { - __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); - __vec16_d ret; - ret.v1 = _mm512_cvtepi32lo_pd(vi); - __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); - ret.v2 = _mm512_cvtepi32lo_pd(other8); - return ret; + +//CAST(__vec16_d, double, __vec16_i8, int8_t, __cast_sitofp) +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i8 val) +{ + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; } -static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i16 val) { - __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); - __vec16_d ret; - ret.v1 = _mm512_cvtepi32lo_pd(vi); - __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); - ret.v2 = _mm512_cvtepi32lo_pd(other8); - return ret; +// CAST(__vec16_d, double, __vec16_i16, int16_t, __cast_sitofp) +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i16 val) +{ + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; } -static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i32 val) { - __vec16_d ret; - ret.v1 = _mm512_cvtepi32lo_pd(val); - __vec16_i32 other8 = _mm512_permute4f128_epi32(val, _MM_PERM_DCDC); - ret.v2 = _mm512_cvtepi32lo_pd(other8); - return ret; +// CAST(__vec16_d, double, __vec16_i32, int32_t, __cast_sitofp) +static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i32 val) +{ + __vec16_d ret; + ret.v1 = _mm512_cvtepi32lo_pd(val); + __vec16_i32 other8 = _mm512_permute4f128_epi32(val, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepi32lo_pd(other8); + return ret; } -#endif + CAST(__vec16_d, double, __vec16_i64, int64_t, __cast_sitofp) // unsigned int to float/double -#if 0 -CAST(__vec16_f, float, __vec16_i8, uint8_t, __cast_uitofp) -CAST(__vec16_f, float, __vec16_i16, uint16_t, __cast_uitofp) -CAST(__vec16_f, float, __vec16_i32, uint32_t, __cast_uitofp) -#else + +// CAST(__vec16_f, float, __vec16_i8, uint8_t, __cast_uitofp) static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i8 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +//CAST(__vec16_f, float, __vec16_i16, uint16_t, __cast_uitofp) static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i16 val) {return _mm512_extload_ps(&val, _MM_UPCONV_PS_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE);} +//CAST(__vec16_f, float, __vec16_i32, uint32_t, __cast_uitofp) static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i32 val) {return _mm512_cvtfxpnt_round_adjustepu32_ps(val, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);} -#endif + CAST(__vec16_f, float, __vec16_i64, uint64_t, __cast_uitofp) -#if 0 -CAST(__vec16_d, double, __vec16_i8, 
uint8_t, __cast_uitofp) -CAST(__vec16_d, double, __vec16_i16, uint16_t, __cast_uitofp) -CAST(__vec16_d, double, __vec16_i32, uint32_t, __cast_uitofp) -#else -static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i8 val) { - __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); - __vec16_d ret; - ret.v1 = _mm512_cvtepu32lo_pd(vi); - __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); - ret.v2 = _mm512_cvtepu32lo_pd(other8); - return ret; + +// CAST(__vec16_d, double, __vec16_i8, uint8_t, __cast_uitofp) +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i8 val) +{ + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; } -static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i16 val) { - __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); - __vec16_d ret; - ret.v1 = _mm512_cvtepu32lo_pd(vi); - __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); - ret.v2 = _mm512_cvtepu32lo_pd(other8); - return ret; +// CAST(__vec16_d, double, __vec16_i16, uint16_t, __cast_uitofp) +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i16 val) +{ + __vec16_i32 vi = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(vi); + __vec16_i32 other8 = _mm512_permute4f128_epi32(vi, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; } -static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i32 val) { - __vec16_d ret; - ret.v1 = _mm512_cvtepu32lo_pd(val); - __vec16_i32 other8 = _mm512_permute4f128_epi32(val, _MM_PERM_DCDC); - ret.v2 = _mm512_cvtepu32lo_pd(other8); - return ret; +// CAST(__vec16_d, double, __vec16_i32, uint32_t, __cast_uitofp) +static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i32 val) +{ + __vec16_d ret; + ret.v1 = _mm512_cvtepu32lo_pd(val); + __vec16_i32 other8 = _mm512_permute4f128_epi32(val, _MM_PERM_DCDC); + ret.v2 = _mm512_cvtepu32lo_pd(other8); + return ret; } -#endif + CAST(__vec16_d, double, __vec16_i64, uint64_t, __cast_uitofp) -#if 0 -static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i1 v) { - __vec16_f ret; - for (int i = 0; i < 16; ++i) - ret[i] = (v.v & (1 << i)) ? 1. 
: 0.; - return ret; -} -#else static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i1 v) { - const __m512 ret = _mm512_setzero_ps(); - const __m512 one = _mm512_set1_ps(1.0); - return _mm512_mask_mov_ps(ret, v, one); + const __m512 ret = _mm512_setzero_ps(); + const __m512 one = _mm512_set1_ps(1.0); + return _mm512_mask_mov_ps(ret, v, one); } -#endif // float/double to signed int CAST(__vec16_i8, int8_t, __vec16_f, float, __cast_fptosi) CAST(__vec16_i16, int16_t, __vec16_f, float, __cast_fptosi) -#if 0 -CAST(__vec16_i32, int32_t, __vec16_f, float, __cast_fptosi) -#else -static FORCEINLINE __vec16_i32 __cast_fptosi(__vec16_i32, __vec16_f val) { + +// CAST(__vec16_i32, int32_t, __vec16_f, float, __cast_fptosi) +static FORCEINLINE __vec16_i32 __cast_fptosi(__vec16_i32, __vec16_f val) +{ return _mm512_cvtfxpnt_round_adjustps_epi32(val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); } -#endif + CAST(__vec16_i64, int64_t, __vec16_f, float, __cast_fptosi) CAST(__vec16_i8, int8_t, __vec16_d, double, __cast_fptosi) CAST(__vec16_i16, int16_t, __vec16_d, double, __cast_fptosi) -#if 1 -CAST(__vec16_i32, int32_t, __vec16_d, double, __cast_fptosi) +#if 0 /* knc::2implement */ #else +CAST(__vec16_i32, int32_t, __vec16_d, double, __cast_fptosi) #endif CAST(__vec16_i64, int64_t, __vec16_d, double, __cast_fptosi) // float/double to unsigned int CAST(__vec16_i8, uint8_t, __vec16_f, float, __cast_fptoui) CAST(__vec16_i16, uint16_t, __vec16_f, float, __cast_fptoui) -#if 0 -CAST(__vec16_i32, uint32_t, __vec16_f, float, __cast_fptoui) -#else -static FORCEINLINE __vec16_i32 __cast_fptoui(__vec16_i32, __vec16_f val) { + +// CAST(__vec16_i32, uint32_t, __vec16_f, float, __cast_fptoui) +static FORCEINLINE __vec16_i32 __cast_fptoui(__vec16_i32, __vec16_f val) +{ return _mm512_cvtfxpnt_round_adjustps_epu32(val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); } -#endif + CAST(__vec16_i64, uint64_t, __vec16_f, float, __cast_fptoui) CAST(__vec16_i8, uint8_t, __vec16_d, double, __cast_fptoui) CAST(__vec16_i16, uint16_t, __vec16_d, double, __cast_fptoui) -#if 1 -CAST(__vec16_i32, uint32_t, __vec16_d, double, __cast_fptoui) +#if 0 /* knc::2implement */ #else +CAST(__vec16_i32, uint32_t, __vec16_d, double, __cast_fptoui) #endif CAST(__vec16_i64, uint64_t, __vec16_d, double, __cast_fptoui) // float/double conversions -#if 0 -CAST(__vec16_f, float, __vec16_d, double, __cast_fptrunc) -#else -static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) { - __m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1)); - __m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2)); - return _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(r0i, 0xFF00, r1i, _MM_PERM_BABA)); -} -#endif +// CAST(__vec16_f, float, __vec16_d, double, __cast_fptrunc) +static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) +{ + __m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1)); + __m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2)); -#if 0 -CAST(__vec16_d, double, __vec16_f, float, __cast_fpext) -#else -static FORCEINLINE __vec16_d __cast_fpext(__vec16_d, __vec16_f val) { - __vec16_d ret; - ret.v1 = _mm512_cvtpslo_pd(val.v); - __vec16_f other8 = _mm512_castsi512_ps(_mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC)); - ret.v2 = _mm512_cvtpslo_pd(other8); - return ret; + return _mm512_castsi512_ps(_mm512_mask_permute4f128_epi32(r0i, 0xFF00, r1i, _MM_PERM_BABA)); +} + +// CAST(__vec16_d, double, __vec16_f, float, __cast_fpext) +static FORCEINLINE __vec16_d __cast_fpext(__vec16_d, 
__vec16_f val) +{ + __vec16_d ret; + ret.v1 = _mm512_cvtpslo_pd(val.v); + __vec16_f other8 = _mm512_castsi512_ps(_mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC)); + ret.v2 = _mm512_cvtpslo_pd(other8); + return ret; } -#endif typedef union { int32_t i32; @@ -1753,6 +1543,7 @@ typedef union { double d; } BitcastUnion; +/* knc::macro::not used */ #define CAST_BITS(TO, TO_ELT, FROM, FROM_ELT) \ static FORCEINLINE TO __cast_bits(TO, FROM val) { \ TO r; \ @@ -1764,30 +1555,17 @@ static FORCEINLINE TO __cast_bits(TO, FROM val) { \ return r; \ } -#if 0 -CAST_BITS(__vec16_f, f, __vec16_i32, i32) -CAST_BITS(__vec16_i32, i32, __vec16_f, f) -#else -static FORCEINLINE __vec16_f __cast_bits(__vec16_f, __vec16_i32 val) { - return _mm512_castsi512_ps(val); -} -static FORCEINLINE __vec16_i32 __cast_bits(__vec16_i32, __vec16_f val) { - return _mm512_castps_si512(val); -} -#endif +// CAST_BITS(__vec16_f, f, __vec16_i32, i32) +static FORCEINLINE __vec16_f __cast_bits(__vec16_f, __vec16_i32 val) { return _mm512_castsi512_ps(val); } +// CAST_BITS(__vec16_i32, i32, __vec16_f, f) +static FORCEINLINE __vec16_i32 __cast_bits(__vec16_i32, __vec16_f val) { return _mm512_castps_si512(val); } -#if 0 -CAST_BITS(__vec16_d, d, __vec16_i64, i64) -CAST_BITS(__vec16_i64, i64, __vec16_d, d) -#else -static FORCEINLINE __vec16_i64 __cast_bits(__vec16_i64, __vec16_d val) { - return *(__vec16_i64*)&val; -} -static FORCEINLINE __vec16_d __cast_bits(__vec16_d, __vec16_i64 val) { - return *(__vec16_d*)&val; -} -#endif +// CAST_BITS(__vec16_d, d, __vec16_i64, i64) +static FORCEINLINE __vec16_i64 __cast_bits(__vec16_i64, __vec16_d val) { return *(__vec16_i64*)&val; } +// CAST_BITS(__vec16_i64, i64, __vec16_d, d) +static FORCEINLINE __vec16_d __cast_bits(__vec16_d, __vec16_i64 val) { return *(__vec16_d*)&val; } +/* knc::macro::used */ #define CAST_BITS_SCALAR(TO, FROM) \ static FORCEINLINE TO __cast_bits(TO, FROM v) { \ union { \ @@ -1809,6 +1587,7 @@ CAST_BITS_SCALAR(double, int64_t) /////////////////////////////////////////////////////////////////////////// // various math functions +/////////////////////////////////////////////////////////////////////////// static FORCEINLINE void __fastmath() { } @@ -1837,168 +1616,100 @@ static FORCEINLINE double __ceil_uniform_double(double v) { return ceil(v); } -#if 0 -UNARY_OP(__vec16_f, __round_varying_float, roundf) -UNARY_OP(__vec16_f, __floor_varying_float, floorf) -UNARY_OP(__vec16_f, __ceil_varying_float, ceilf) -#else -static FORCEINLINE __vec16_f __round_varying_float(__vec16_f v) { - return _mm512_round_ps(v, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE); -} - -static FORCEINLINE __vec16_f __floor_varying_float(__vec16_f v) { - return _mm512_floor_ps(v); -} - -static FORCEINLINE __vec16_f __ceil_varying_float(__vec16_f v) { - return _mm512_ceil_ps(v); -} -#endif - -#if 0 -UNARY_OP(__vec16_d, __round_varying_double, round) -UNARY_OP(__vec16_d, __floor_varying_double, floor) -UNARY_OP(__vec16_d, __ceil_varying_double, ceil) -#else -static FORCEINLINE __vec16_d __round_varying_float(__vec16_d v) { - __vec16_d ret; - ret.v1 = _mm512_svml_round_pd(v.v1); - ret.v2 = _mm512_svml_round_pd(v.v2); - return ret; -} - -static FORCEINLINE __vec16_d __floor_varying_float(__vec16_d v) { - __vec16_d ret; - ret.v1 = _mm512_floor_pd(v.v1); - ret.v2 = _mm512_floor_pd(v.v2); - return ret; -} - -static FORCEINLINE __vec16_d __ceil_varying_float(__vec16_d v) { - __vec16_d ret; - ret.v1 = _mm512_ceil_pd(v.v1); - ret.v2 = _mm512_ceil_pd(v.v2); - return ret; -} -#endif +static FORCEINLINE 
__vec16_f __round_varying_float(__vec16_f v) { return _mm512_round_ps(v, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE); } +static FORCEINLINE __vec16_f __floor_varying_float(__vec16_f v) { return _mm512_floor_ps(v); } +static FORCEINLINE __vec16_f __ceil_varying_float(__vec16_f v) { return _mm512_ceil_ps(v); } +static FORCEINLINE __vec16_d __round_varying_float(__vec16_d v) { return __vec16_d(_mm512_svml_round_pd(v.v1), _mm512_svml_round_pd(v.v2)); } +static FORCEINLINE __vec16_d __floor_varying_float(__vec16_d v) { return __vec16_d(_mm512_floor_pd(v.v1), _mm512_floor_pd(v.v2)); } +static FORCEINLINE __vec16_d __ceil_varying_float(__vec16_d v) { return __vec16_d(_mm512_ceil_pd(v.v1), _mm512_ceil_pd(v.v2)); } // min/max -static FORCEINLINE float __min_uniform_float(float a, float b) { return (ab) ? a : b; } +static FORCEINLINE float __min_uniform_float (float a, float b) { return (ab) ? a : b; } static FORCEINLINE double __min_uniform_double(double a, double b) { return (ab) ? a : b; } -static FORCEINLINE int32_t __min_uniform_int32(int32_t a, int32_t b) { return (ab) ? a : b; } +static FORCEINLINE int32_t __min_uniform_int32 ( int32_t a, int32_t b) { return (ab) ? a : b; } static FORCEINLINE int32_t __min_uniform_uint32(uint32_t a, uint32_t b) { return (ab) ? a : b; } -static FORCEINLINE int64_t __min_uniform_int64(int64_t a, int64_t b) { return (ab) ? a : b; } +static FORCEINLINE int64_t __min_uniform_int64 ( int64_t a, int64_t b) { return (ab) ? a : b; } static FORCEINLINE int64_t __min_uniform_uint64(uint64_t a, uint64_t b) { return (ab) ? a : b; } - -#if 0 -BINARY_OP_FUNC(__vec16_f, __max_varying_float, __max_uniform_float) -BINARY_OP_FUNC(__vec16_f, __min_varying_float, __min_uniform_float) -BINARY_OP_FUNC(__vec16_d, __max_varying_double, __max_uniform_double) -BINARY_OP_FUNC(__vec16_d, __min_varying_double, __min_uniform_double) -#else static FORCEINLINE __vec16_f __max_varying_float (__vec16_f v1, __vec16_f v2) { return _mm512_gmax_ps(v1, v2);} static FORCEINLINE __vec16_f __min_varying_float (__vec16_f v1, __vec16_f v2) { return _mm512_gmin_ps(v1, v2);} static FORCEINLINE __vec16_d __max_varying_double(__vec16_d v1, __vec16_d v2) { return __vec16_d(_mm512_gmax_pd(v1.v1, v2.v1),_mm512_gmax_pd(v1.v2,v2.v2));} static FORCEINLINE __vec16_d __min_varying_double(__vec16_d v1, __vec16_d v2) { return __vec16_d(_mm512_gmin_pd(v1.v1, v2.v1),_mm512_gmin_pd(v1.v2,v2.v2));} -#endif -#if 0 -BINARY_OP_FUNC(__vec16_i32, __max_varying_int32, __max_uniform_int32) -BINARY_OP_FUNC(__vec16_i32, __min_varying_int32, __min_uniform_int32) -BINARY_OP_FUNC(__vec16_i32, __max_varying_uint32, __max_uniform_uint32) -BINARY_OP_FUNC(__vec16_i32, __min_varying_uint32, __min_uniform_uint32) -#else static FORCEINLINE __vec16_i32 __max_varying_int32 (__vec16_i32 v1, __vec16_i32 v2) { return _mm512_max_epi32(v1, v2);} static FORCEINLINE __vec16_i32 __min_varying_int32 (__vec16_i32 v1, __vec16_i32 v2) { return _mm512_min_epi32(v1, v2);} static FORCEINLINE __vec16_i32 __max_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) { return _mm512_max_epu32(v1, v2);} static FORCEINLINE __vec16_i32 __min_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) { return _mm512_min_epu32(v1, v2);} -#endif -BINARY_OP_FUNC(__vec16_i64, __max_varying_int64, __max_uniform_int64) -BINARY_OP_FUNC(__vec16_i64, __min_varying_int64, __min_uniform_int64) +BINARY_OP_FUNC(__vec16_i64, __max_varying_int64, __max_uniform_int64) +BINARY_OP_FUNC(__vec16_i64, __min_varying_int64, __min_uniform_int64) BINARY_OP_FUNC(__vec16_i64, __max_varying_uint64, 
__max_uniform_uint64) BINARY_OP_FUNC(__vec16_i64, __min_varying_uint64, __min_uniform_uint64) // sqrt/rsqrt/rcp -static FORCEINLINE float __rsqrt_uniform_float(float v) { - return 1.f / sqrtf(v); -} +static FORCEINLINE float __rsqrt_uniform_float(float v) { return 1.f / sqrtf(v); } +static FORCEINLINE float __rcp_uniform_float (float v) { return 1.f / v; } +static FORCEINLINE float __sqrt_uniform_float (float v) { return sqrtf(v); } +static FORCEINLINE double __sqrt_uniform_double(double v) { return sqrt (v); } -static FORCEINLINE float __rcp_uniform_float(float v) { - return 1.f / v; -} - -static FORCEINLINE float __sqrt_uniform_float(float v) { - return sqrtf(v); -} - -static FORCEINLINE double __sqrt_uniform_double(double v) { - return sqrt(v); -} - -#if 0 -UNARY_OP(__vec16_f, __rcp_varying_float, __rcp_uniform_float) -UNARY_OP(__vec16_f, __rsqrt_varying_float, __rsqrt_uniform_float) -UNARY_OP(__vec16_f, __sqrt_varying_float, __sqrt_uniform_float) -UNARY_OP(__vec16_d, __sqrt_varying_double, __sqrt_uniform_double) -#else -static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) { +static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) +{ #ifdef ISPC_FAST_MATH - return _mm512_rcp23_ps(v); // Approximation with 23 bits of accuracy. + return _mm512_rcp23_ps(v); // Approximation with 23 bits of accuracy. #else - return _mm512_recip_ps(v); + return _mm512_recip_ps(v); #endif } -static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) { +static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) +{ #ifdef ISPC_FAST_MATH - return _mm512_rsqrt23_ps(v); // Approximation with 0.775ULP accuracy + return _mm512_rsqrt23_ps(v); // Approximation with 0.775ULP accuracy #else - return _mm512_invsqrt_ps(v); + return _mm512_invsqrt_ps(v); #endif } -static FORCEINLINE __vec16_f __sqrt_varying_float (__vec16_f v) { return _mm512_sqrt_ps(v);} -static FORCEINLINE __vec16_d __sqrt_varying_double(__vec16_d v) { return __vec16_d(_mm512_sqrt_pd(v.v1),_mm512_sqrt_pd(v.v2));} -#endif +static FORCEINLINE __vec16_f __sqrt_varying_float (__vec16_f v) { return _mm512_sqrt_ps(v);} +static FORCEINLINE __vec16_d __sqrt_varying_double(__vec16_d v) { return __vec16_d(_mm512_sqrt_pd(v.v1),_mm512_sqrt_pd(v.v2));} /////////////////////////////////////////////////////////////////////////// // svml /////////////////////////////////////////////////////////////////////////// -static FORCEINLINE __vec16_f __svml_sinf(__vec16_f v) { return _mm512_sin_ps(v); } -static FORCEINLINE __vec16_f __svml_asinf(__vec16_f v) { return _mm512_asin_ps(v); } -static FORCEINLINE __vec16_f __svml_cosf(__vec16_f v) { return _mm512_cos_ps(v); } -static FORCEINLINE __vec16_f __svml_tanf(__vec16_f v) { return _mm512_tan_ps(v); } -static FORCEINLINE __vec16_f __svml_atanf(__vec16_f v) { return _mm512_atan_ps(v); } +static FORCEINLINE __vec16_f __svml_sinf (__vec16_f v) { return _mm512_sin_ps(v); } +static FORCEINLINE __vec16_f __svml_asinf (__vec16_f v) { return _mm512_asin_ps(v); } +static FORCEINLINE __vec16_f __svml_cosf (__vec16_f v) { return _mm512_cos_ps(v); } +static FORCEINLINE __vec16_f __svml_tanf (__vec16_f v) { return _mm512_tan_ps(v); } +static FORCEINLINE __vec16_f __svml_atanf (__vec16_f v) { return _mm512_atan_ps(v); } static FORCEINLINE __vec16_f __svml_atan2f(__vec16_f a, __vec16_f b) { return _mm512_atan2_ps(a,b); } -static FORCEINLINE __vec16_f __svml_expf(__vec16_f v) { return _mm512_exp_ps(v); } -static FORCEINLINE __vec16_f __svml_logf(__vec16_f v) { return _mm512_log_ps(v); } -static FORCEINLINE __vec16_f 
__svml_powf(__vec16_f a, __vec16_f b) { return _mm512_pow_ps(a,b); } +static FORCEINLINE __vec16_f __svml_expf (__vec16_f v) { return _mm512_exp_ps(v); } +static FORCEINLINE __vec16_f __svml_logf (__vec16_f v) { return _mm512_log_ps(v); } +static FORCEINLINE __vec16_f __svml_powf (__vec16_f a, __vec16_f b) { return _mm512_pow_ps(a,b); } -static FORCEINLINE __vec16_d __svml_sind(__vec16_d v) { return __vec16_d(_mm512_sin_pd(v.v1), _mm512_sin_pd(v.v2)); } -static FORCEINLINE __vec16_d __svml_asind(__vec16_d v) { return __vec16_d(_mm512_asin_pd(v.v1), _mm512_asin_pd(v.v2)); } -static FORCEINLINE __vec16_d __svml_cosd(__vec16_d v) { return __vec16_d(_mm512_cos_pd(v.v1), _mm512_cos_pd(v.v2)); } -static FORCEINLINE __vec16_d __svml_tand(__vec16_d v) { return __vec16_d(_mm512_tan_pd(v.v1), _mm512_tan_pd(v.v2)); } -static FORCEINLINE __vec16_d __svml_atand(__vec16_d v) { return __vec16_d(_mm512_atan_pd(v.v1), _mm512_atan_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_sind (__vec16_d v) { return __vec16_d(_mm512_sin_pd(v.v1), _mm512_sin_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_asind (__vec16_d v) { return __vec16_d(_mm512_asin_pd(v.v1), _mm512_asin_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_cosd (__vec16_d v) { return __vec16_d(_mm512_cos_pd(v.v1), _mm512_cos_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_tand (__vec16_d v) { return __vec16_d(_mm512_tan_pd(v.v1), _mm512_tan_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_atand (__vec16_d v) { return __vec16_d(_mm512_atan_pd(v.v1), _mm512_atan_pd(v.v2)); } static FORCEINLINE __vec16_d __svml_atan2d(__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_atan2_pd(a.v1,b.v1), _mm512_atan2_pd(a.v2,b.v2)); } -static FORCEINLINE __vec16_d __svml_expd(__vec16_d v) { return __vec16_d(_mm512_exp_pd(v.v1), _mm512_exp_pd(v.v2)); } -static FORCEINLINE __vec16_d __svml_logd(__vec16_d v) { return __vec16_d(_mm512_log_pd(v.v1), _mm512_log_pd(v.v2)); } -static FORCEINLINE __vec16_d __svml_powd(__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_pow_pd(a.v1,b.v1), _mm512_pow_pd(a.v2,b.v2)); } +static FORCEINLINE __vec16_d __svml_expd (__vec16_d v) { return __vec16_d(_mm512_exp_pd(v.v1), _mm512_exp_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_logd (__vec16_d v) { return __vec16_d(_mm512_log_pd(v.v1), _mm512_log_pd(v.v2)); } +static FORCEINLINE __vec16_d __svml_powd (__vec16_d a, __vec16_d b) { return __vec16_d(_mm512_pow_pd(a.v1,b.v1), _mm512_pow_pd(a.v2,b.v2)); } /////////////////////////////////////////////////////////////////////////// // bit ops +/////////////////////////////////////////////////////////////////////////// static FORCEINLINE int32_t __popcnt_int32(uint32_t v) { int count = 0; @@ -2064,42 +1775,23 @@ static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { /////////////////////////////////////////////////////////////////////////// // reductions +/////////////////////////////////////////////////////////////////////////// -#if 0 -REDUCE_ADD(float, __vec16_f, __reduce_add_float) -REDUCE_MINMAX(float, __vec16_f, __reduce_min_float, <) -REDUCE_MINMAX(float, __vec16_f, __reduce_max_float, >) -#else static FORCEINLINE float __reduce_add_float(__vec16_f v) { return _mm512_reduce_add_ps(v); } static FORCEINLINE float __reduce_min_float(__vec16_f v) { return _mm512_reduce_min_ps(v); } static FORCEINLINE float __reduce_max_float(__vec16_f v) { return _mm512_reduce_max_ps(v); } -#endif -#if 0 -REDUCE_ADD(double, __vec16_d, __reduce_add_double) -REDUCE_MINMAX(double, __vec16_d, __reduce_min_double, <) -REDUCE_MINMAX(double, 
__vec16_d, __reduce_max_double, >) -#else static FORCEINLINE float __reduce_add_double(__vec16_d v) { return _mm512_reduce_add_pd(v.v1) + _mm512_reduce_add_pd(v.v2); } static FORCEINLINE float __reduce_min_double(__vec16_d v) { return std::min(_mm512_reduce_min_pd(v.v1), _mm512_reduce_min_pd(v.v2)); } static FORCEINLINE float __reduce_max_double(__vec16_d v) { return std::max(_mm512_reduce_max_pd(v.v1), _mm512_reduce_max_pd(v.v2)); } -#endif -#if 0 -REDUCE_ADD (int64_t, __vec16_i32, __reduce_add_int32) -REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_min_int32, <) -REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_max_int32, >) -REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_min_uint32, <) -REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_max_uint32, >) -#else static FORCEINLINE int64_t __reduce_add_int32 (__vec16_i32 v) { return _mm512_reduce_add_epi32(v);} static FORCEINLINE int32_t __reduce_min_int32 (__vec16_i32 v) { return _mm512_reduce_min_epi32(v);} static FORCEINLINE int32_t __reduce_max_int32 (__vec16_i32 v) { return _mm512_reduce_max_epi32(v);} static FORCEINLINE uint32_t __reduce_min_uint32 (__vec16_i32 v) { return _mm512_reduce_min_epu32(v);} static FORCEINLINE uint32_t __reduce_max_uint32 (__vec16_i32 v) { return _mm512_reduce_max_epu32(v);} -#endif REDUCE_ADD ( int16_t, __vec16_i8, __reduce_add_int8) REDUCE_ADD ( int32_t, __vec16_i16, __reduce_add_int16) @@ -2111,6 +1803,7 @@ REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >) /////////////////////////////////////////////////////////////////////////// // masked load/store +/////////////////////////////////////////////////////////////////////////// static FORCEINLINE __vec16_i8 __masked_load_i8(void *p, __vec16_i1 mask) { @@ -2132,53 +1825,31 @@ static FORCEINLINE __vec16_i16 __masked_load_i16(void *p, return ret; } -#if 0 -static FORCEINLINE __vec16_i32 __masked_load_i32(void *p, - __vec16_i1 mask) { - __vec16_i32 ret; - int32_t *ptr = (int32_t *)p; - for (int i = 0; i < 16; ++i) - if ((mask.v & (1 << i)) != 0) - ret[i] = ptr[i]; - return ret; -} -#else -static FORCEINLINE __vec16_i32 __masked_load_i32(void *p, __vec16_i1 mask) { +static FORCEINLINE __vec16_i32 __masked_load_i32(void *p, __vec16_i1 mask) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - return _mm512_mask_load_epi32(__vec16_i32(), mask, p); + return _mm512_mask_load_epi32(__vec16_i32(), mask, p); #else - __vec16_i32 tmp; - tmp.v = _mm512_mask_extloadunpacklo_epi32(tmp.v, 0xFFFF, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - tmp.v = _mm512_mask_extloadunpackhi_epi32(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - __vec16_i32 ret; - return _mm512_mask_mov_epi32(ret.v, mask, tmp.v); + __vec16_i32 tmp; + tmp.v = _mm512_mask_extloadunpacklo_epi32(tmp.v, 0xFFFF, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_extloadunpackhi_epi32(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __vec16_i32 ret; + return _mm512_mask_mov_epi32(ret.v, mask, tmp.v); #endif } -#endif -#if 0 -static FORCEINLINE __vec16_f __masked_load_float(void *p, - __vec16_i1 mask) { - __vec16_f ret; - float *ptr = (float *)p; - for (int i = 0; i < 16; ++i) - if ((mask.v & (1 << i)) != 0) - ret[i] = ptr[i]; - return ret; -} -#else -static FORCEINLINE __vec16_f __masked_load_float(void *p, __vec16_i1 mask) { +static FORCEINLINE __vec16_f __masked_load_float(void *p, __vec16_i1 mask) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - return _mm512_mask_load_ps(_mm512_undefined_ps(), mask,p); + return _mm512_mask_load_ps(_mm512_undefined_ps(), mask,p); #else - __vec16_f 
tmp; - tmp.v = _mm512_mask_extloadunpacklo_ps(tmp.v, 0xFFFF, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - tmp.v = _mm512_mask_extloadunpackhi_ps(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - __vec16_f ret; - return _mm512_mask_mov_ps(ret.v, mask, tmp.v); + __vec16_f tmp; + tmp.v = _mm512_mask_extloadunpacklo_ps(tmp.v, 0xFFFF, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_extloadunpackhi_ps(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + __vec16_f ret; + return _mm512_mask_mov_ps(ret.v, mask, tmp.v); #endif } -#endif static FORCEINLINE __vec16_i64 __masked_load_i64(void *p, __vec16_i1 mask) { @@ -2190,40 +1861,29 @@ static FORCEINLINE __vec16_i64 __masked_load_i64(void *p, return ret; } -#if 0 -static FORCEINLINE __vec16_d __masked_load_double(void *p, - __vec16_i1 mask) { - __vec16_d ret; - double *ptr = (double *)p; - for (int i = 0; i < 16; ++i) - if ((mask.v & (1 << i)) != 0) - ret[i] = ptr[i]; - return ret; -} -#else -static FORCEINLINE __vec16_d __masked_load_double(void *p, __vec16_i1 mask) { +static FORCEINLINE __vec16_d __masked_load_double(void *p, __vec16_i1 mask) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - __vec16_d ret; - __vec16_i1 tmp_m = mask; - tmp_m = _mm512_kswapb(tmp_m, tmp_m); - ret.v1 = _mm512_mask_load_pd(ret.v1, mask, p); - ret.v2 = _mm512_mask_load_pd(ret.v2, tmp_m, (uint8_t*)p+64); - return ret; + __vec16_d ret; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + ret.v1 = _mm512_mask_load_pd(ret.v1, mask, p); + ret.v2 = _mm512_mask_load_pd(ret.v2, tmp_m, (uint8_t*)p+64); + return ret; #else - __vec16_d tmp; - tmp.v1 = _mm512_mask_extloadunpacklo_pd(tmp.v1, 0xFF, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v1 = _mm512_mask_extloadunpackhi_pd(tmp.v1, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v2 = _mm512_mask_extloadunpacklo_pd(tmp.v2, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v2 = _mm512_mask_extloadunpackhi_pd(tmp.v2, 0xFF, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - __vec16_d ret; - __vec16_i1 tmp_m = mask; - tmp_m = _mm512_kswapb(tmp_m, tmp_m); - ret.v1 = _mm512_mask_mov_pd(ret.v1, mask, tmp.v1); - ret.v2 = _mm512_mask_mov_pd(ret.v2, tmp_m, tmp.v2); - return ret; + __vec16_d tmp; + tmp.v1 = _mm512_mask_extloadunpacklo_pd(tmp.v1, 0xFF, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_mask_extloadunpackhi_pd(tmp.v1, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_mask_extloadunpacklo_pd(tmp.v2, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_mask_extloadunpackhi_pd(tmp.v2, 0xFF, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + __vec16_d ret; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + ret.v1 = _mm512_mask_mov_pd(ret.v1, mask, tmp.v1); + ret.v2 = _mm512_mask_mov_pd(ret.v2, tmp_m, tmp.v2); + return ret; #endif } -#endif static FORCEINLINE void __masked_store_i8(void *p, __vec16_i8 val, @@ -2242,52 +1902,33 @@ static FORCEINLINE void __masked_store_i16(void *p, __vec16_i16 val, ptr[i] = val[i]; } -#if 0 -static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, - __vec16_i1 mask) { - int32_t *ptr = (int32_t *)p; - for (int i = 0; i < 16; ++i) - if ((mask.v & (1 << i)) != 0) - ptr[i] = val[i]; -} -#else -static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1 mask) { +static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1 mask) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - _mm512_mask_store_epi32(p, mask, 
val.v); + _mm512_mask_store_epi32(p, mask, val.v); #else - __vec16_i32 tmp; - tmp.v = _mm512_extloadunpacklo_epi32(tmp.v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - tmp.v = _mm512_extloadunpackhi_epi32(tmp.v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - tmp.v = _mm512_mask_mov_epi32(tmp.v, mask, val.v); - _mm512_extpackstorelo_epi32(p, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_epi32((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + __vec16_i32 tmp; + tmp.v = _mm512_extloadunpacklo_epi32(tmp.v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_extloadunpackhi_epi32(tmp.v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_mov_epi32(tmp.v, mask, val.v); + _mm512_extpackstorelo_epi32(p, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); #endif } -#endif -#if 0 -static FORCEINLINE void __masked_store_float(void *p, __vec16_f val, - __vec16_i1 mask) { - float *ptr = (float *)p; - for (int i = 0; i < 16; ++i) - if ((mask.v & (1 << i)) != 0) - ptr[i] = val[i]; -} -#else -static FORCEINLINE void __masked_store_float(void *p, __vec16_f val, - __vec16_i1 mask) { +static FORCEINLINE void __masked_store_float(void *p, __vec16_f val, __vec16_i1 mask) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - _mm512_mask_store_ps(p, mask, val.v); + _mm512_mask_store_ps(p, mask, val.v); #else - __vec16_f tmp; - tmp.v = _mm512_extloadunpacklo_ps(tmp.v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - tmp.v = _mm512_extloadunpackhi_ps(tmp.v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); - tmp.v = _mm512_mask_mov_ps(tmp.v, mask, val.v); - _mm512_extpackstorelo_ps(p, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_ps((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + __vec16_f tmp; + tmp.v = _mm512_extloadunpacklo_ps(tmp.v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_extloadunpackhi_ps(tmp.v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + tmp.v = _mm512_mask_mov_ps(tmp.v, mask, val.v); + _mm512_extpackstorelo_ps(p, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_ps((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); #endif } -#endif static FORCEINLINE void __masked_store_i64(void *p, __vec16_i64 val, __vec16_i1 mask) { @@ -2297,39 +1938,29 @@ static FORCEINLINE void __masked_store_i64(void *p, __vec16_i64 val, ptr[i] = val[i]; } -#if 0 -static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, - __vec16_i1 mask) { - double *ptr = (double *)p; - for (int i = 0; i < 16; ++i) - if ((mask.v & (1 << i)) != 0) - ptr[i] = val[i]; -} -#else -static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, - __vec16_i1 mask) { +static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, __vec16_i1 mask) +{ #ifdef ISPC_FORCE_ALIGNED_MEMORY - __vec16_i1 tmp_m = mask; - tmp_m = _mm512_kswapb(tmp_m, tmp_m); - _mm512_mask_store_pd(p, mask, val.v1); - _mm512_mask_store_pd((uint8_t*)p+64, tmp_m, val.v2); + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + _mm512_mask_store_pd(p, mask, val.v1); + _mm512_mask_store_pd((uint8_t*)p+64, tmp_m, val.v2); #else - __vec16_d tmp; - __vec16_i1 tmp_m = mask; - tmp_m = _mm512_kswapb(tmp_m, tmp_m); - tmp.v1 = _mm512_extloadunpacklo_pd(tmp.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v1 = _mm512_extloadunpackhi_pd(tmp.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v2 = 
_mm512_extloadunpacklo_pd(tmp.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v2 = _mm512_extloadunpackhi_pd(tmp.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); - tmp.v1 = _mm512_mask_mov_pd(tmp.v1, mask, val.v1); - tmp.v2 = _mm512_mask_mov_pd(tmp.v2, tmp_m, val.v2); - _mm512_extpackstorelo_pd(p, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_pd((uint8_t*)p+64, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorelo_pd((uint8_t*)p+64, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); - _mm512_extpackstorehi_pd((uint8_t*)p+128, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + __vec16_d tmp; + __vec16_i1 tmp_m = mask; + tmp_m = _mm512_kswapb(tmp_m, tmp_m); + tmp.v1 = _mm512_extloadunpacklo_pd(tmp.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_extloadunpackhi_pd(tmp.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_extloadunpacklo_pd(tmp.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v2 = _mm512_extloadunpackhi_pd(tmp.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); + tmp.v1 = _mm512_mask_mov_pd(tmp.v1, mask, val.v1); + tmp.v2 = _mm512_mask_mov_pd(tmp.v2, tmp_m, val.v2); + _mm512_extpackstorelo_pd(p, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+64, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_pd((uint8_t*)p+64, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_pd((uint8_t*)p+128, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); #endif } -#endif static FORCEINLINE void __masked_store_blend_i8(void *p, __vec16_i8 val, __vec16_i1 mask) { @@ -2363,9 +1994,11 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val, /////////////////////////////////////////////////////////////////////////// // gather/scatter +/////////////////////////////////////////////////////////////////////////// // offsets * offsetScale is in bytes (for all of these) +/* knc::macro::used */ #define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ OTYPE offset, __vec16_i1 mask) { \ @@ -2381,21 +2014,19 @@ static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ /****************/ -#if 0 -GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) -#else +// GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) static FORCEINLINE __vec16_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) { - // (iw): need to temporarily store as int because gathers can only return ints. - __vec16_i32 tmp = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, base, - _MM_UPCONV_EPI32_SINT8, scale, - _MM_HINT_NONE); - // now, downconverting to chars into temporary char vector - __vec16_i8 ret; - _mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); - return ret; + // (iw): need to temporarily store as int because gathers can only return ints. 
+ __vec16_i32 tmp = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, base, + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); + // now, downconverting to chars into temporary char vector + __vec16_i8 ret; + _mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; } -#if 0 /* evghenii::fails on gather-int8-2 & gather-int8-4 */ +#if 0 /* knc::fails on gather-int8-2 & gather-int8-4 */ static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2422,21 +2053,18 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_ #else GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) #endif -#endif /****************/ GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) /****************/ -#if 0 -GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32) -#else +// GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32) static FORCEINLINE __vec16_i32 __gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) { - return _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, - base, _MM_UPCONV_EPI32_NONE, scale, - _MM_HINT_NONE); + return _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, + base, _MM_UPCONV_EPI32_NONE, scale, + _MM_HINT_NONE); } -#if 0 /* evghenii::fails on gather-int32-2 & gather-int32-4 */ +#if 0 /* knc::fails on gather-int32-2 & gather-int32-4 */ static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2464,18 +2092,15 @@ static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint3 #else GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) #endif -#endif /****************/ -#if 0 -GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float) -#else +// GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float) static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) { - return _mm512_mask_i32extgather_ps(_mm512_undefined_ps(), mask, offsets, - base, _MM_UPCONV_PS_NONE, scale, - _MM_HINT_NONE); + return _mm512_mask_i32extgather_ps(_mm512_undefined_ps(), mask, offsets, + base, _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); } -#if 0 /* evghenii::fails on gather-float-2 gather-float-4 & soa-14 */ +#if 0 /* knc::fails on gather-float-2 gather-float-4 & soa-14 */ static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2503,30 +2128,27 @@ static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint3 #else GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float) #endif -#endif /****************/ GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) /****************/ -#if 0 -GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_base_offsets32_double) 
-#else +// GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_base_offsets32_double) static FORCEINLINE __vec16_d __gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) { - __vec16_d ret; - ret.v1 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets, - base, _MM_UPCONV_PD_NONE, scale, - _MM_HINT_NONE); - __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); - const __mmask8 mask8 = 0x00FF & (mask >> 8); /* evghenii::testme */ - ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask8, shuffled_offsets, - base, _MM_UPCONV_PD_NONE, scale, - _MM_HINT_NONE); - return ret; + __vec16_d ret; + ret.v1 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); + __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); + const __mmask8 mask8 = 0x00FF & (mask >> 8); /* knc::testme */ + ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask8, shuffled_offsets, + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); + return ret; } -#endif GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_base_offsets64_double) +/* knc::macro::used */ #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ VTYPE ret; \ @@ -2537,13 +2159,13 @@ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ } \ return ret; \ } +/* knc::macro::used */ #define GATHER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ return FUNC1(0, 1, ptrs, mask); \ } -#if 1 /***********/ GATHER_GENERALF(__vec16_i8, int8_t, __vec16_i32, __gather32_i8, __gather_base_offsets32_i8) GATHER_GENERALF(__vec16_i16, int16_t, __vec16_i32, __gather32_i16, __gather_base_offsets32_i16) @@ -2559,10 +2181,10 @@ GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64); GATHER_GENERAL(__vec16_f, float, __vec16_i64, __gather64_float); GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double); /***********/ -#endif // scatter +/* knc::macro::used */ #define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \ OTYPE offset, VTYPE val, \ @@ -2583,16 +2205,14 @@ SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64 SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16) SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16) /*****************/ -#if 0 -SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32) -SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) -#else +// SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32) static FORCEINLINE void __scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec16_i32 offsets, __vec16_i32 val, __vec16_i1 mask) { - _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, - _MM_DOWNCONV_EPI32_NONE, scale, - _MM_HINT_NONE); + _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); } +// SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i32 value, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2614,19 +2234,16 
@@ static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t sc still_to_do = _mm512_kxor(match,still_to_do); } } -#endif /*****************/ -#if 0 -SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float) -#else +// SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float) static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets, __vec16_f val, __vec16_i1 mask) { - _mm512_mask_i32extscatter_ps(base, mask, offsets, val, - _MM_DOWNCONV_PS_NONE, scale, - _MM_HINT_NONE); + _mm512_mask_i32extscatter_ps(base, mask, offsets, val, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); } -#if 0 /* evghenii::fails on soa-10 & soa-13 , it is very similar to __scatter_base_offsets64_it32, but that passes tests, why ?!? */ +#if 0 /* knc::fails on soa-10 & soa-13 , it is very similar to __scatter_base_offsets64_it32, but that passes tests, why ?!? */ static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_f value, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2651,29 +2268,26 @@ static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t #else SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float) #endif -#endif /*****************/ SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) /*****************/ -#if 0 /* evghenii::to implement */ -SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_base_offsets32_double) -#else /* evghenii:testme */ +// SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_base_offsets32_double) static FORCEINLINE void __scatter_base_offsets32_double(void *base, uint32_t scale, __vec16_i32 offsets, __vec16_d val, __vec16_i1 mask) { - _mm512_mask_i32loextscatter_pd(base, mask, offsets, val.v1, + _mm512_mask_i32loextscatter_pd(base, mask, offsets, val.v1, _MM_DOWNCONV_PD_NONE, scale, _MM_HINT_NONE); - __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); - const __mmask8 mask8 = 0x00FF & (mask >> 8); /* evghenii::testme */ - _mm512_mask_i32loextscatter_pd(base, mask8, shuffled_offsets, val.v2, + __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); + const __mmask8 mask8 = 0x00FF & (mask >> 8); /* knc::testme */ + _mm512_mask_i32loextscatter_pd(base, mask8, shuffled_offsets, val.v2, _MM_DOWNCONV_PD_NONE, scale, _MM_HINT_NONE); } -#endif SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_base_offsets64_double) +/* knc::macro::used */ #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ VTYPE ret; \ @@ -2683,12 +2297,12 @@ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ *ptr = val[i]; \ } \ } +/* knc::macro::used */ #define SCATTER_GENERALF(VTYPE, STYPE, PTRTYPE, FUNC,FUNC1) \ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ return FUNC1(0, 1, ptrs, val, mask); \ } -#if 1 /***********/ SCATTER_GENERALF(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8, __scatter_base_offsets32_i8) SCATTER_GENERALF(__vec16_i16, int16_t, __vec16_i32, __scatter32_i16, __scatter_base_offsets32_i16) @@ -2704,109 +2318,47 @@ SCATTER_GENERAL(__vec16_f, float, __vec16_i64, __scatter64_float) SCATTER_GENERAL(__vec16_i64, int64_t, 
__vec16_i64, __scatter64_i64) SCATTER_GENERAL(__vec16_d, double, __vec16_i64, __scatter64_double) /***********/ -#endif /////////////////////////////////////////////////////////////////////////// // packed load/store +/////////////////////////////////////////////////////////////////////////// -#if 0 -static FORCEINLINE int32_t __packed_load_active(int32_t *ptr, __vec16_i32 *val, - __vec16_i1 mask) { - int count = 0; - for (int i = 0; i < 16; ++i) { - if ((mask.v & (1 << i)) != 0) { - val->operator[](i) = *ptr++; - ++count; - } - } - return count; -} -#endif -#if 0 -static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, - __vec16_i32 val, - __vec16_i1 mask) { - int count = 0; - for (int i = 0; i < 16; ++i) { - if ((mask.v & (1 << i)) != 0) { - *ptr++ = val[i]; - ++count; - } - } - return count; +static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec16_i32 *val, __vec16_i1 mask) +{ + __vec16_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(mask)); } -#endif -#if 0 -static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, - __vec16_i32 *val, - __vec16_i1 mask) { - int count = 0; - for (int i = 0; i < 16; ++i) { - if ((mask.v & (1 << i)) != 0) { - val->operator[](i) = *ptr++; - ++count; - } - } - return count; +static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, __vec16_i1 mask) +{ + _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return _mm_countbits_32(uint32_t(mask)); } -static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, - __vec16_i32 val, - __vec16_i1 mask) { - int count = 0; - for (int i = 0; i < 16; ++i) { - if ((mask.v & (1 << i)) != 0) { - *ptr++ = val[i]; - ++count; - } - } - return count; -} -#endif -#if 1 -static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec16_i32 *val, - __vec16_i1 mask) { - __vec16_i32 v = __load<64>(val); - v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - __store<64>(val, v); - return _mm_countbits_32(uint32_t(mask)); +static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec16_i32 *val, __vec16_i1 mask) +{ + __vec16_i32 v = __load<64>(val); + v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + __store<64>(val, v); + return _mm_countbits_32(uint32_t(mask)); } -#endif -#if 1 -static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, - __vec16_i1 mask) { - _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - return _mm_countbits_32(uint32_t(mask)); +static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec16_i32 val, __vec16_i1 mask) +{ + _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + return 
_mm_countbits_32(uint32_t(mask)); } -#endif - -#if 1 -static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec16_i32 *val, - __vec16_i1 mask) { - __vec16_i32 v = __load<64>(val); - v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - __store<64>(val, v); - return _mm_countbits_32(uint32_t(mask)); -} -#endif - -#if 1 -static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec16_i32 val, - __vec16_i1 mask) { - _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - return _mm_countbits_32(uint32_t(mask)); -} -#endif /////////////////////////////////////////////////////////////////////////// // aos/soa +/////////////////////////////////////////////////////////////////////////// static FORCEINLINE void __soa_to_aos3_float(__vec16_f v0, __vec16_f v1, __vec16_f v2, float *ptr) { @@ -2848,6 +2400,7 @@ static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec16_f *out0, __vec16 /////////////////////////////////////////////////////////////////////////// // prefetch +/////////////////////////////////////////////////////////////////////////// static FORCEINLINE void __prefetch_read_uniform_1(unsigned char *p) { _mm_prefetch((char *)p, _MM_HINT_T0); // prefetch into L1$ @@ -2868,6 +2421,7 @@ static FORCEINLINE void __prefetch_read_uniform_nt(unsigned char *p) { /////////////////////////////////////////////////////////////////////////// // atomics +/////////////////////////////////////////////////////////////////////////// static FORCEINLINE uint32_t __atomic_add(uint32_t *p, uint32_t v) { #ifdef _MSC_VER From 10223cfac3a8d0f5d80bd5eff095055e593764cd Mon Sep 17 00:00:00 2001 From: evghenii Date: Sat, 5 Oct 2013 15:23:55 +0300 Subject: [PATCH 121/124] workong on shuffle/rotate for double, there seems to be a bug in cvt2zmm cvt2hilo --- examples/intrinsics/knc-i1x16.h | 85 ++++++++++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 2 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index e712c969..807781f0 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -143,8 +143,14 @@ PRE_ALIGN(64) struct __vec16_f struct PRE_ALIGN(128) __vec16_d { - __m512d v1; - __m512d v2; + union { + __m512d v1; + __m512d v_hi; + }; + union { + __m512d v2; + __m512d v_lo; + }; FORCEINLINE __vec16_d() : v1(_mm512_undefined_pd()), v2(_mm512_undefined_pd()) {} FORCEINLINE __vec16_d(const __m512d _v1, const __m512d _v2) : v1(_v1), v2(_v2) {} FORCEINLINE __vec16_d(const __vec16_d &o) : v1(o.v1), v2(o.v2) {} @@ -158,6 +164,40 @@ struct PRE_ALIGN(128) __vec16_d } FORCEINLINE const double& operator[](const int i) const { return ((double*)this)[i]; } FORCEINLINE double& operator[](const int i) { return ((double*)this)[i]; } + FORCEINLINE __vec16_d cvt2hilo() const + { + __m512i _hi, _lo; + _hi = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + _mm512_castpd_si512(v1)); + _hi = _mm512_mask_permutevar_epi32(_hi, 0x00FF, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + _mm512_castpd_si512(v2)); + _lo = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + _mm512_castpd_si512(v1)); + _lo = 
_mm512_mask_permutevar_epi32(_lo, 0x00FF, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + _mm512_castpd_si512(v2)); + return __vec16_d(_mm512_castsi512_pd(_hi), _mm512_castsi512_pd(_lo)); + } + FORCEINLINE __vec16_d cvt2zmm() const + { + __m512i _v1, _v2; + _v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + _mm512_castpd_si512(v_hi)); + _v1 = _mm512_mask_permutevar_epi32(_v1, 0x5555, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + _mm512_castpd_si512(v_lo)); + _v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + _mm512_castpd_si512(v_hi)); + _v2 = _mm512_mask_permutevar_epi32(_v2, 0x5555, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + _mm512_castpd_si512(v_lo)); + return __vec16_d(_mm512_castsi512_pd(_v1), _mm512_castsi512_pd(_v2)); + } } POST_ALIGN(128); struct PRE_ALIGN(128) __vec16_i64 @@ -1247,8 +1287,49 @@ static FORCEINLINE __vec16_d __broadcast_double(__vec16_d v, int index) return ret; } +#define CASTD2F(_v_, _v_hi_, _v_lo_) \ + __vec16_f _v_hi_, _v_lo_; \ + { \ + const __vec16_d v = _v_.cvt2hilo(); \ + _v_hi_ = _mm512_castpd_ps(v.v_hi); \ + _v_lo_ = _mm512_castpd_ps(v.v_lo); } +#define CASTF2D(_ret_hi_, _ret_lo_) \ + __vec16_d(_mm512_castps_pd(_ret_hi_), _mm512_castps_pd(_ret_lo_)).cvt2zmm() + +#if 0 /* knc::testme there appears to be no tests in ./tests for checking this functionality */ +static FORCEINLINE __vec16_d __rotate_double(const __vec16_d _v, const int index) +{ +// return _v; /* this one passes all tests , but most not */ + CASTD2F(_v, v_hi, v_lo); + const __vec16_f ret_hi = __rotate_float(v_hi, index); + const __vec16_f ret_lo = __rotate_float(v_lo, index); + return CASTF2D(ret_hi, ret_lo); +} +#else ROTATE(__vec16_d, double, double) +#endif + +#if 0 /* knc::fails ./tests/shuffle2-4.ispc ./tests/shuffle2-5.ispc */ +static FORCEINLINE __vec16_d __shuffle_double(__vec16_d _v, const __vec16_i32 index) +{ + CASTD2F(_v, v_hi, v_lo); + const __vec16_f ret_hi = __shuffle_float(v_hi, index); + const __vec16_f ret_lo = __shuffle_float(v_lo, index); + return CASTF2D(ret_hi, ret_lo); +} +static FORCEINLINE __vec16_d __shuffle2_double(__vec16_d _v0, __vec16_d _v1, const __vec16_i32 index) +{ + CASTD2F(_v0, v0_hi, v0_lo); + CASTD2F(_v1, v1_hi, v1_lo); + const __vec16_f ret_hi = __shuffle2_float(v0_hi, v1_hi, index); + const __vec16_f ret_lo = __shuffle2_float(v0_lo, v1_lo, index); + return CASTF2D(ret_hi, ret_lo); +} +#else SHUFFLES(__vec16_d, double, double) +#endif +#undef CASTD2F +#undef CASTF2D template static FORCEINLINE __vec16_d __load(const __vec16_d *p) \ { From 1b196520f6877c14203e5bc88ab37db6deeb88a7 Mon Sep 17 00:00:00 2001 From: evghenii Date: Sat, 5 Oct 2013 22:10:05 +0300 Subject: [PATCH 122/124] knc-i1x16.h is cleaned: int32,float,double are complete, int64 is partially complete --- examples/intrinsics/knc-i1x16.h | 271 ++++++++++++++++---------------- 1 file changed, 133 insertions(+), 138 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 807781f0..fb2cf618 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -141,6 +141,37 @@ PRE_ALIGN(64) struct __vec16_f FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; } } POST_ALIGN(64); +static void zmm2hilo(const __m512i v1, const __m512i v2, __m512i &_hi, __m512i &_lo) +{ + _hi = 
_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v2); + _hi = _mm512_mask_permutevar_epi32(_hi, 0x00FF, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v1); + _lo = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v2); + _lo = _mm512_mask_permutevar_epi32(_lo, 0x00FF, + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v1); +} +static void hilo2zmm(const __m512i v_hi, const __m512i v_lo, __m512i &_v1, __m512i &_v2) +{ + _v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v_hi); + _v2 = _mm512_mask_permutevar_epi32(_v2, 0x5555, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v_lo); + _v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v_hi); + _v1 = _mm512_mask_permutevar_epi32(_v1, 0x5555, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v_lo); +} + struct PRE_ALIGN(128) __vec16_d { union { @@ -166,36 +197,18 @@ struct PRE_ALIGN(128) __vec16_d FORCEINLINE double& operator[](const int i) { return ((double*)this)[i]; } FORCEINLINE __vec16_d cvt2hilo() const { + const __m512i _v1 = _mm512_castpd_si512(v1); + const __m512i _v2 = _mm512_castpd_si512(v2); __m512i _hi, _lo; - _hi = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - _mm512_castpd_si512(v1)); - _hi = _mm512_mask_permutevar_epi32(_hi, 0x00FF, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - _mm512_castpd_si512(v2)); - _lo = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - _mm512_castpd_si512(v1)); - _lo = _mm512_mask_permutevar_epi32(_lo, 0x00FF, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - _mm512_castpd_si512(v2)); + zmm2hilo(_v1, _v2, _hi, _lo); return __vec16_d(_mm512_castsi512_pd(_hi), _mm512_castsi512_pd(_lo)); } FORCEINLINE __vec16_d cvt2zmm() const { + const __m512i _hi = _mm512_castpd_si512(v_hi); + const __m512i _lo = _mm512_castpd_si512(v_lo); __m512i _v1, _v2; - _v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, - _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), - _mm512_castpd_si512(v_hi)); - _v1 = _mm512_mask_permutevar_epi32(_v1, 0x5555, - _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), - _mm512_castpd_si512(v_lo)); - _v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, - _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), - _mm512_castpd_si512(v_hi)); - _v2 = _mm512_mask_permutevar_epi32(_v2, 0x5555, - _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), - _mm512_castpd_si512(v_lo)); + hilo2zmm(_hi,_lo, _v1,_v2); return __vec16_d(_mm512_castsi512_pd(_v1), _mm512_castsi512_pd(_v2)); } } POST_ALIGN(128); @@ -226,38 +239,15 @@ struct PRE_ALIGN(128) __vec16_i64 FORCEINLINE int64_t& operator[](const int i) { return ((int64_t*)this)[i]; } FORCEINLINE __vec16_i64 cvt2hilo() const { - __m512i _hi, _lo; - _hi = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v1); - _hi = _mm512_mask_permutevar_epi32(_hi, 0x00FF, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v2); - _lo = 
_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v1); - _lo = _mm512_mask_permutevar_epi32(_lo, 0x00FF, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v2); - return __vec16_i64(_hi, _lo); + __vec16_i64 ret; + zmm2hilo(v1,v2,ret.v_hi,ret.v_lo); + return ret; } FORCEINLINE __vec16_i64 cvt2zmm() const { - __m512i _v1, _v2; - _v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, - _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), - v_hi); - _v1 = _mm512_mask_permutevar_epi32(_v1, 0x5555, - _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), - v_lo); - - _v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, - _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), - v_hi); - _v2 = _mm512_mask_permutevar_epi32(_v2, 0x5555, - _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), - v_lo); - return __vec16_i64(_v1, _v2); + __vec16_i64 ret; + hilo2zmm(v_hi,v_lo, ret.v1, ret.v2); + return ret; } } POST_ALIGN(128); @@ -305,15 +295,6 @@ static inline int32_t __extract_element(__vec16_i32, int); /////////////////////////////////////////////////////////////////////////// // macros... -/* knc::macro::not used */ -#define UNARY_OP(TYPE, NAME, OP) \ -static FORCEINLINE TYPE NAME(TYPE v) { \ - TYPE ret; \ - for (int i = 0; i < 16; ++i) \ - ret[i] = OP(v[i]); \ - return ret; \ -} - /* knc::macro::used */ #define BINARY_OP(TYPE, NAME, OP) \ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ @@ -722,7 +703,7 @@ static FORCEINLINE __vec16_i32 __shuffle2_i32(__vec16_i32 v0, __vec16_i32 v1, __ template static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) { -#ifdef ISPC_FORCE_ALIGNED_MEMORY +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED return __load<64>(p); #else __vec16_i32 v; @@ -734,7 +715,7 @@ template static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 v) { -#ifdef ISPC_FORCE_ALIGNED_MEMORY +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED __store<64>(p,v); #else _mm512_extpackstorelo_epi32( p, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); @@ -742,6 +723,17 @@ template static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 #endif } +#if 0 /* knc::fails ./tests/foreach-25.ispc ./tests/forach-26.ispc ./tests/foreach-27.ispc */ +template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) +{ + return _mm512_load_epi32(p); +} +template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) +{ + _mm512_store_epi32(p, v); +} +#endif + /////////////////////////////////////////////////////////////////////////// // int64 /////////////////////////////////////////////////////////////////////////// @@ -783,8 +775,8 @@ static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b) #if 0 /* knc::fails ./tests/int64-min-1.ispc ./tests/idiv.ispc */ static FORCEINLINE __vec16_i64 __mul(__vec16_i64 _a, __vec16_i64 _b) { - const __vec16_i64 a = _a.cvt2hilo(); - const __vec16_i64 b = _b.cvt2hilo(); + __vec16_i64 a = _a.cvt2hilo(); + __vec16_i64 b = _b.cvt2hilo(); __vec16_i32 lo = _mm512_mullo_epi32(a.v_lo,b.v_lo); __vec16_i32 hi_m1 = _mm512_mulhi_epi32(a.v_lo, b.v_lo); __vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo); @@ -858,7 +850,6 @@ SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>) SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>) SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<) -#if 0 /* knc::fails ./tests/reduce-equal-8.ispc , 
knc::hangs foreach-unique-6.ispc funcptr-null-[2-6].ispc funcptr-uniform-9.ispc funcptr-varying-5.ispc */ static FORCEINLINE __vec16_i1 __equal_i64(__vec16_i64 _a, __vec16_i64 _b) { const __vec16_i64 a = _a.cvt2hilo(); @@ -874,22 +865,14 @@ static FORCEINLINE __vec16_i1 __equal_i64_and_mask(__vec16_i64 _a, __vec16_i64 _ __mmask16 full_match = _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); return _mm512_kand(full_match, (__mmask16)mask); } - static FORCEINLINE __vec16_i1 __not_equal_i64(__vec16_i64 a, __vec16_i64 b) { - return __not(__equal_i64(a,b)); + return __not(__equal_i64(a,b)); } static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(__vec16_i64 a, __vec16_i64 b, __vec16_i1 mask) { - return __and(__not(__equal_i64(a,b)), mask); + return __and(__not(__equal_i64(a,b)), mask); } -#else -CMP_OP(__vec16_i64, i64, int64_t, __equal, ==) -CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) -#endif - - - CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_equal, <=) CMP_OP(__vec16_i64, i64, int64_t, __signed_less_equal, <=) CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_equal, >=) @@ -918,18 +901,49 @@ template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { return __vec template RetVecType __undef_i64(); template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { return __vec16_i64(_mm512_undefined_epi32(), _mm512_undefined_epi32()); } -static FORCEINLINE __vec16_i64 __broadcast_i64(__vec16_i64 v, int index) +#define CASTL2I(_v_, _v_hi_, _v_lo_) \ + __vec16_i32 _v_hi_, _v_lo_; \ + { \ + const __vec16_i64 v = _v_.cvt2hilo(); \ + _v_hi_ = v.v_hi; \ + _v_lo_ = v.v_lo; } +#define CASTI2L(_ret_hi_, _ret_lo_) \ + __vec16_i64(_ret_hi_, _ret_lo_).cvt2zmm() +static FORCEINLINE __vec16_i64 __broadcast_i64(__vec16_i64 _v, int index) { - int64_t val = __extract_element(v, index & 0xf); - return __smear_i64<__vec16_i64>(val); + CASTL2I(_v, v_hi, v_lo); + const __vec16_i32 ret_hi = __broadcast_i32(v_hi, index); + const __vec16_i32 ret_lo = __broadcast_i32(v_lo, index); + return CASTI2L(ret_hi, ret_lo); } - -ROTATE (__vec16_i64, i64, int64_t) -SHUFFLES(__vec16_i64, i64, int64_t) +static FORCEINLINE __vec16_i64 __rotate_i64(const __vec16_i64 _v, const int index) +{ + CASTL2I(_v, v_hi, v_lo); + const __vec16_i32 ret_hi = __rotate_i32(v_hi, index); + const __vec16_i32 ret_lo = __rotate_i32(v_lo, index); + return CASTI2L(ret_hi, ret_lo); +} +static FORCEINLINE __vec16_i64 __shuffle_double(__vec16_i64 _v, const __vec16_i32 index) +{ + CASTL2I(_v, v_hi, v_lo); + const __vec16_i32 ret_hi = __shuffle_i32(v_hi, index); + const __vec16_i32 ret_lo = __shuffle_i32(v_lo, index); + return CASTI2L(ret_hi, ret_lo); +} +static FORCEINLINE __vec16_i64 __shuffle2_double(__vec16_i64 _v0, __vec16_i64 _v1, const __vec16_i32 index) +{ + CASTL2I(_v0, v0_hi, v0_lo); + CASTL2I(_v1, v1_hi, v1_lo); + const __vec16_i32 ret_hi = __shuffle2_i32(v0_hi, v1_hi, index); + const __vec16_i32 ret_lo = __shuffle2_i32(v0_lo, v1_lo, index); + return CASTI2L(ret_hi, ret_lo); +} +#undef CASTI2L +#undef CASTL2I template static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) { -#ifdef ISPC_FORCE_ALIGNED_MEMORY +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED return __load<128>(p); #else __vec16_i32 v1; @@ -942,18 +956,10 @@ template static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) #endif } -template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) -{ - __m512i v2 = _mm512_load_epi32(p); - __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64); - return __vec16_i64(v2,v1); -} - -template 
<> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { return __load<64>(p); } template static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64 v) { -#ifdef ISPC_FORCE_ALIGNED_MEMORY +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED return __store<128>(p,v); #else __m512i v1 = v.v2; @@ -965,6 +971,14 @@ template static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64 #endif } +#if 0 /* knc::fails as with _i32 this may generate fails ... so commetining it out */ +template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) +{ + __m512i v2 = _mm512_load_epi32(p); + __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64); + return __vec16_i64(v2,v1); +} +template <> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { return __load<64>(p); } template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) { __m512i v1 = v.v2; @@ -972,8 +986,8 @@ template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) _mm512_store_epi64(p, v2); _mm512_store_epi64(((uint8_t*)p)+64, v1); } - template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); } +#endif /////////////////////////////////////////////////////////////////////////// @@ -1048,7 +1062,7 @@ static FORCEINLINE __vec16_f __shuffle2_float(__vec16_f _v0, __vec16_f _v1, __ve template static FORCEINLINE __vec16_f __load(const __vec16_f *p) { -#ifdef ISPC_FORCE_ALIGNED_MEMORY +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED return __load<64>(p); #else __vec16_f v; @@ -1060,7 +1074,7 @@ template static FORCEINLINE __vec16_f __load(const __vec16_f *p) template static FORCEINLINE void __store(__vec16_f *p, __vec16_f v) { -#ifdef ISPC_FORCE_ALIGNED_MEMORY +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED __store<64>(p,v); #else _mm512_extpackstorelo_ps( p, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); @@ -1278,15 +1292,6 @@ template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { return template RetVecType __undef_double(); template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() { return __vec16_d(); } -static FORCEINLINE __vec16_d __broadcast_double(__vec16_d v, int index) -{ - __vec16_d ret; - double val = __extract_element(v, index & 0xf); - ret.v1 = _mm512_set1_pd(val); - ret.v2 = _mm512_set1_pd(val); - return ret; -} - #define CASTD2F(_v_, _v_hi_, _v_lo_) \ __vec16_f _v_hi_, _v_lo_; \ { \ @@ -1295,21 +1300,20 @@ static FORCEINLINE __vec16_d __broadcast_double(__vec16_d v, int index) _v_lo_ = _mm512_castpd_ps(v.v_lo); } #define CASTF2D(_ret_hi_, _ret_lo_) \ __vec16_d(_mm512_castps_pd(_ret_hi_), _mm512_castps_pd(_ret_lo_)).cvt2zmm() - -#if 0 /* knc::testme there appears to be no tests in ./tests for checking this functionality */ +static FORCEINLINE __vec16_d __broadcast_double(__vec16_d _v, int index) +{ + CASTD2F(_v, v_hi, v_lo); + const __vec16_f ret_hi = __broadcast_float(v_hi, index); + const __vec16_f ret_lo = __broadcast_float(v_lo, index); + return CASTF2D(ret_hi, ret_lo); +} static FORCEINLINE __vec16_d __rotate_double(const __vec16_d _v, const int index) { -// return _v; /* this one passes all tests , but most not */ CASTD2F(_v, v_hi, v_lo); const __vec16_f ret_hi = __rotate_float(v_hi, index); const __vec16_f ret_lo = __rotate_float(v_lo, index); return CASTF2D(ret_hi, ret_lo); } -#else -ROTATE(__vec16_d, double, double) -#endif - -#if 0 /* knc::fails ./tests/shuffle2-4.ispc ./tests/shuffle2-5.ispc */ static FORCEINLINE __vec16_d __shuffle_double(__vec16_d _v, const __vec16_i32 
index) { CASTD2F(_v, v_hi, v_lo); @@ -1325,32 +1329,37 @@ static FORCEINLINE __vec16_d __shuffle2_double(__vec16_d _v0, __vec16_d _v1, con const __vec16_f ret_lo = __shuffle2_float(v0_lo, v1_lo, index); return CASTF2D(ret_hi, ret_lo); } -#else -SHUFFLES(__vec16_d, double, double) -#endif -#undef CASTD2F #undef CASTF2D +#undef CASTD2F template static FORCEINLINE __vec16_d __load(const __vec16_d *p) \ { +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED + return __load<128>(p); +#else __vec16_d ret; ret.v1 = _mm512_extloadunpacklo_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); ret.v1 = _mm512_extloadunpackhi_pd(ret.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); ret.v2 = _mm512_extloadunpacklo_pd(ret.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); ret.v2 = _mm512_extloadunpackhi_pd(ret.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); return ret; +#endif } template static FORCEINLINE void __store(__vec16_d *p, __vec16_d v) { +#ifdef ISPC_FORCE_ALIGNED_MEMORY__REMOVETHIS_WHEN_FIXED + return __store<128>(p,v); +#else _mm512_extpackstorelo_pd(p, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); _mm512_extpackstorehi_pd((uint8_t*)p+64, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); _mm512_extpackstorelo_pd((uint8_t*)p+64, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); _mm512_extpackstorehi_pd((uint8_t*)p+128, v.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); +#endif } -#if 1 +#if 0 /* knc::fails as with _f this may generate fails ... so commetining it out */ template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) { return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64)); @@ -1379,14 +1388,12 @@ static FORCEINLINE TO FUNC(TO, FROM val) { \ } // sign extension conversions -#if 0 /* knc::fails on soa-9 soa-13 soa-10 soa-29 soa-3 ... 
and others */ + +// CAST(__vec16_i64, int64_t, __vec16_i32, int32_t, __cast_sext) static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val) { return __vec16_i64(_mm512_srai_epi32(val.v,31), val.v).cvt2zmm(); } -#else -CAST(__vec16_i64, int64_t, __vec16_i32, int32_t, __cast_sext) -#endif CAST(__vec16_i64, int64_t, __vec16_i16, int16_t, __cast_sext) CAST(__vec16_i64, int64_t, __vec16_i8, int8_t, __cast_sext) CAST(__vec16_i32, int32_t, __vec16_i16, int16_t, __cast_sext) @@ -2107,7 +2114,7 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t _mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); return ret; } -#if 0 /* knc::fails on gather-int8-2 & gather-int8-4 */ +// GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2131,9 +2138,6 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_ _mm512_extstore_epi32(ret.data,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); return ret; } -#else -GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) -#endif /****************/ GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) @@ -2145,7 +2149,7 @@ static FORCEINLINE __vec16_i32 __gather_base_offsets32_i32(uint8_t *base, uint32 base, _MM_UPCONV_EPI32_NONE, scale, _MM_HINT_NONE); } -#if 0 /* knc::fails on gather-int32-2 & gather-int32-4 */ +// GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2170,9 +2174,6 @@ static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint3 return ret; } -#else -GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) -#endif /****************/ // GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float) static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) @@ -2181,7 +2182,7 @@ static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32 base, _MM_UPCONV_PS_NONE, scale, _MM_HINT_NONE); } -#if 0 /* knc::fails on gather-float-2 gather-float-4 & soa-14 */ +// GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float) static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2206,9 +2207,6 @@ static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint3 return ret; } -#else -GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float) -#endif /****************/ GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) @@ -2324,7 +2322,7 @@ static FORCEINLINE void __scatter_base_offsets32_float(void *base, uint32_t scal _MM_DOWNCONV_PS_NONE, scale, _MM_HINT_NONE); } -#if 0 /* knc::fails on soa-10 & soa-13 , it is very similar to 
__scatter_base_offsets64_it32, but that passes tests, why ?!? */ +//SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float) static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 _offsets, __vec16_f value, __vec16_i1 mask) { const __vec16_i64 offsets = _offsets.cvt2hilo(); @@ -2346,9 +2344,6 @@ static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t still_to_do = _mm512_kxor(match,still_to_do); } } -#else -SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float) -#endif /*****************/ SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) From 4222605f873e589aa9dc905fb0c2e6dcb9353d01 Mon Sep 17 00:00:00 2001 From: evghenii Date: Mon, 7 Oct 2013 14:24:27 +0300 Subject: [PATCH 123/124] fixed lshr/ashr/shl shifts. __mul i64 vector version for icc < 14.0.0 works only on signed, so commented it out in favour of sequential --- examples/intrinsics/knc-i1x16.h | 115 ++++++++++++++++---------------- 1 file changed, 58 insertions(+), 57 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index fb2cf618..ffe8fb56 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -668,6 +668,7 @@ template RetVecType __smear_i32(int32_t i); template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); } static const __vec16_i32 __ispc_one = __smear_i32<__vec16_i32>(1); +static const __vec16_i32 __ispc_zero = __smear_i32<__vec16_i32>(0); static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32); static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1); static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); @@ -772,18 +773,18 @@ static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b) return __vec16_i64(_mm512_mullox_epi64(a.v1,b.v1), _mm512_mullox_epi64(a.v2,b.v2)); } #else /* __ICC >= 1400 */ -#if 0 /* knc::fails ./tests/int64-min-1.ispc ./tests/idiv.ispc */ +#if 0 /* knc::fails ./tests/int64-min-1.ispc ./tests/idiv.ispc cause: if one or both numbers are negative multiplication fails */ static FORCEINLINE __vec16_i64 __mul(__vec16_i64 _a, __vec16_i64 _b) { - __vec16_i64 a = _a.cvt2hilo(); - __vec16_i64 b = _b.cvt2hilo(); - __vec16_i32 lo = _mm512_mullo_epi32(a.v_lo,b.v_lo); - __vec16_i32 hi_m1 = _mm512_mulhi_epi32(a.v_lo, b.v_lo); - __vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo); - __vec16_i32 hi_m3 = _mm512_mullo_epi32(a.v_lo, b.v_hi); - __mmask16 carry = 0; - __vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m1, &carry); - __vec16_i32 hi = _mm512_adc_epi32(hi_m3, carry, hi_p23, &carry); + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + const __vec16_i32 lo = _mm512_mullo_epi32(a.v_lo, b.v_lo); + const __vec16_i32 hi_m1 = _mm512_mulhi_epi32(a.v_lo, b.v_lo); + const __vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo); + const __vec16_i32 hi_m3 = _mm512_mullo_epi32(a.v_lo, b.v_hi); + __mmask16 carry; + const __vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m3, &carry); + const __vec16_i32 hi = _mm512_adc_epi32(hi_p23, carry, hi_m1, &carry); return __vec16_i64(hi,lo).cvt2zmm(); } #else @@ -795,60 +796,68 @@ static FORCEINLINE __vec16_i64 __or (__vec16_i64 a, __vec16_i64 b) { return __ve static FORCEINLINE __vec16_i64 __and(__vec16_i64 
a, __vec16_i64 b) { return __vec16_i64(_mm512_and_epi64(a.v1, b.v1), _mm512_and_epi64(a.v2, b.v2)); } static FORCEINLINE __vec16_i64 __xor(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_xor_epi64(a.v1, b.v1), _mm512_xor_epi64(a.v2, b.v2)); } -static FORCEINLINE __vec16_i64 __shl(__vec16_i64 _a, __vec16_i64 _b) -{ - const __vec16_i64 a = _a.cvt2hilo(); - const __vec16_i64 b = _b.cvt2hilo(); - __vec16_i32 xfer = _mm512_srlv_epi32(a.v_lo, _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); - __vec16_i32 hi = _mm512_or_epi32(_mm512_sllv_epi32(a.v_hi, b.v_lo), xfer); - __vec16_i32 lo = _mm512_sllv_epi32(a.v_lo, b.v_lo); - return __vec16_i64(hi,lo).cvt2zmm(); -} - static FORCEINLINE __vec16_i64 __udiv(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_div_epu64(a.v1,b.v1), _mm512_div_epu64(a.v2,b.v2)); } static FORCEINLINE __vec16_i64 __sdiv(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_div_epi64(a.v1,b.v1), _mm512_div_epi64(a.v2,b.v2)); } static FORCEINLINE __vec16_i64 __urem(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_rem_epu64(a.v1,b.v1), _mm512_rem_epu64(a.v2,b.v2)); } static FORCEINLINE __vec16_i64 __srem(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_rem_epi64(a.v1,b.v1), _mm512_rem_epi64(a.v2,b.v2)); } -#if 0 /* knc::fails ./tests/idiv.ispc */ -static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 _a, __vec16_i64 _b) { + +static FORCEINLINE __vec16_i64 __shl(__vec16_i64 _a, __vec16_i64 _b) +{ const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); - __vec16_i32 shift = _mm512_sub_epi32(__ispc_thirty_two, b.v_lo); -#if 0 - __vec16_i32 xfer = _mm512_and_epi32(_mm512_sllv_epi32(__ispc_ffffffff, shift), _mm512_sllv_epi32(a.v_hi, shift)); -#else - __vec16_i32 xfer = _mm512_sllv_epi32(_mm512_and_epi32(a.v_hi, - _mm512_sub_epi32(_mm512_sllv_epi32(__ispc_one, b.v_lo), __ispc_one)), - _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); -#endif - __vec16_i32 hi = _mm512_srlv_epi32(a.v_hi, b.v_lo); - __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo)); + /* this is a safety gate in case b-shift >= 32 */ + const __vec16_i32 xfer = __select( + __signed_less_than_i32(b.v_lo, __ispc_thirty_two), + __lshr(a.v_lo, __sub(__ispc_thirty_two, b.v_lo)), + __shl (a.v_lo, __sub(b.v_lo, __ispc_thirty_two)) + ); + const __vec16_i32 hi = __or(__shl(a.v_hi, b.v_lo), xfer); + const __vec16_i32 lo = __shl(a.v_lo, b.v_lo); return __vec16_i64(hi,lo).cvt2zmm(); } -#else -BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>) -#endif - -#if 0 /* knc::fails ./tests/idiv.ispc */ -static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 _a, __vec16_i64 _b) { +static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 _a, __vec16_i64 _b) +{ const __vec16_i64 a = _a.cvt2hilo(); const __vec16_i64 b = _b.cvt2hilo(); - __vec16_i32 xfer = _mm512_sllv_epi32(_mm512_and_epi32(a.v_hi, - _mm512_sub_epi32(_mm512_sllv_epi32(__ispc_one, b.v_lo), __ispc_one)), - _mm512_sub_epi32(__ispc_thirty_two, b.v_lo)); - __vec16_i32 hi = _mm512_srav_epi32(a.v_hi, b.v_lo); - __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo)); + /* this is a safety gate in case b-shift >= 32 */ + const __vec16_i32 xfer = __select( + __signed_less_than_i32(b.v_lo, __ispc_thirty_two), + __shl (a.v_hi, __sub(__ispc_thirty_two, b.v_lo)), + __lshr(a.v_hi, __sub(b.v_lo, __ispc_thirty_two)) + ); + const __vec16_i32 lo = __or(__lshr(a.v_lo, b.v_lo), xfer); + const __vec16_i32 hi = __lshr(a.v_hi, b.v_lo); + return __vec16_i64(hi,lo).cvt2zmm(); +} +static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 
_a, __vec16_i64 _b) +{ + const __vec16_i64 a = _a.cvt2hilo(); + const __vec16_i64 b = _b.cvt2hilo(); + /* this is a safety gate in case b-shift >= 32 */ + const __vec16_i32 xfer = __select( + __signed_less_than_i32(b.v_lo, __ispc_thirty_two), + __shl (a.v_hi, __sub(__ispc_thirty_two, b.v_lo)), + __ashr(a.v_hi, __sub(b.v_lo, __ispc_thirty_two)) + ); + const __vec16_i32 lo = __or(__lshr(a.v_lo, b.v_lo), xfer); + const __vec16_i32 hi = __ashr(a.v_hi, b.v_lo); return __vec16_i64(hi,lo).cvt2zmm(); } -#else -BINARY_OP_CAST(__vec16_i64, int64_t, __ashr, >>) -#endif -SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>) -SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>) -SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<) +template RetVecType __smear_i64(const int64_t &l); +template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) { return __vec16_i64(_mm512_set1_epi64(l), _mm512_set1_epi64(l)); } + +template RetVecType __setzero_i64(); +template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { return __vec16_i64(_mm512_setzero_epi32(), _mm512_setzero_epi32()); } + +template RetVecType __undef_i64(); +template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { return __vec16_i64(_mm512_undefined_epi32(), _mm512_undefined_epi32()); } + +static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 a, uint64_t shift) { return __lshr(a, __smear_i64<__vec16_i64>(shift)); } +static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 a, int64_t shift) { return __ashr(a, __smear_i64<__vec16_i64>(shift)); } +static FORCEINLINE __vec16_i64 __shl (__vec16_i64 a, int64_t shift) { return __shl (a, __smear_i64<__vec16_i64>(shift)); } static FORCEINLINE __vec16_i1 __equal_i64(__vec16_i64 _a, __vec16_i64 _b) { @@ -892,14 +901,6 @@ static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask, __vec16_i64 a, __vec16_ INSERT_EXTRACT(__vec16_i64, int64_t) -template RetVecType __smear_i64(const int64_t &l); -template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) { return __vec16_i64(_mm512_set1_epi64(l), _mm512_set1_epi64(l)); } - -template RetVecType __setzero_i64(); -template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { return __vec16_i64(_mm512_setzero_epi32(), _mm512_setzero_epi32()); } - -template RetVecType __undef_i64(); -template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { return __vec16_i64(_mm512_undefined_epi32(), _mm512_undefined_epi32()); } #define CASTL2I(_v_, _v_hi_, _v_lo_) \ __vec16_i32 _v_hi_, _v_lo_; \ From 3da152a150d5b99f856368317031f181835afb9e Mon Sep 17 00:00:00 2001 From: evghenii Date: Mon, 7 Oct 2013 18:30:22 +0300 Subject: [PATCH 124/124] fixed zmm __mul for i64 with icc < 14.0.0, 4 knc::fails lefts, but I doubt these are due to this include.. 
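
The low 64 bits of the product are assembled from 32-bit partial products of
the absolute values, and the result is negated when exactly one operand is
negative. A scalar C++ model of that approach, for reference only (the names
below are illustrative and not part of knc-i1x16.h):

    #include <cstdint>
    #include <cstdio>

    // Build a signed 64x64 -> 64-bit multiply out of 32-bit operations the
    // same way the vector code does: take abs of both operands, form the
    // unsigned partial products that reach the low 64 bits, restore the sign.
    static int64_t mul64_via_32(int64_t a, int64_t b) {
        bool negate = ((a ^ b) < 0);                            // signs differ => negative product
        uint64_t ua = (a < 0) ? 0 - uint64_t(a) : uint64_t(a);  // abs(a)
        uint64_t ub = (b < 0) ? 0 - uint64_t(b) : uint64_t(b);  // abs(b)

        uint32_t a_lo = uint32_t(ua), a_hi = uint32_t(ua >> 32);
        uint32_t b_lo = uint32_t(ub), b_hi = uint32_t(ub >> 32);

        uint32_t lo    = a_lo * b_lo;                             // mullo(a_lo, b_lo)
        uint32_t hi_m1 = uint32_t((uint64_t(a_lo) * b_lo) >> 32); // mulhi_epu32(a_lo, b_lo)
        uint32_t hi_m2 = a_hi * b_lo;                             // mullo(a_hi, b_lo)
        uint32_t hi_m3 = a_lo * b_hi;                             // mullo(a_lo, b_hi)
        uint32_t hi    = hi_m1 + hi_m2 + hi_m3;

        uint64_t abs_result = (uint64_t(hi) << 32) | lo;
        return int64_t(negate ? 0 - abs_result : abs_result);
    }

    int main() {
        printf("%lld\n", (long long)mul64_via_32(-123456789LL, 987654321LL));
        return 0;
    }

The vector version below does the same per lane, using _mm512_mulhi_epu32 for
the unsigned high half after __abs_i32i64, and selecting per lane (via
__select) between the result and its negation, a subtract from zero, when the
operand signs differ.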
--- examples/intrinsics/knc-i1x16.h | 50 ++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index ffe8fb56..78d35ddc 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -767,31 +767,56 @@ static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &_b _mm512_mulhi_epi32(a.v, b.v_lo))).cvt2zmm(); } +static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask, __vec16_i64 a, __vec16_i64 b) +{ + __vec16_i64 ret; + ret.v1 = _mm512_mask_mov_epi64(b.v1, mask, a.v1); + ret.v2 = _mm512_mask_mov_epi64(b.v2, mask >> 8, a.v2); + return ret; +} + #if __ICC >= 1400 /* compiler gate, icc >= 14.0.0 support _mm512_mullox_epi64 */ static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_mullox_epi64(a.v1,b.v1), _mm512_mullox_epi64(a.v2,b.v2)); } #else /* __ICC >= 1400 */ -#if 0 /* knc::fails ./tests/int64-min-1.ispc ./tests/idiv.ispc cause: if one or both numbers are negative multiplication fails */ +static FORCEINLINE void __abs_i32i64(__m512i &_hi, __m512i &_lo) +{ + /* abs(x) : + * mask = x >> 32; + * abs(x) = (x^mask) - mask + */ + const __vec16_i32 mask = __ashr(_hi, __ispc_thirty_two); + __vec16_i32 hi = __xor(_hi, mask); + __vec16_i32 lo = __xor(_lo, mask); + __mmask16 borrow = 0; + _lo = _mm512_subsetb_epi32(lo, mask, &borrow); + _hi = _mm512_sbb_epi32 (hi, borrow, mask, &borrow); +} static FORCEINLINE __vec16_i64 __mul(__vec16_i64 _a, __vec16_i64 _b) { - const __vec16_i64 a = _a.cvt2hilo(); - const __vec16_i64 b = _b.cvt2hilo(); - const __vec16_i32 lo = _mm512_mullo_epi32(a.v_lo, b.v_lo); - const __vec16_i32 hi_m1 = _mm512_mulhi_epi32(a.v_lo, b.v_lo); + __vec16_i64 a = _a.cvt2hilo(); + __vec16_i64 b = _b.cvt2hilo(); + /* sign = (a^b) >> 32, if sign == 0 then a*b >= 0, otherwise a*b < 0 */ + const __vec16_i1 sign = __not_equal_i32(__ashr(__xor(a.v_hi, b.v_hi), __ispc_thirty_two), __ispc_zero); + __abs_i32i64(a.v_hi, a.v_lo); /* abs(a) */ + __abs_i32i64(b.v_hi, b.v_lo); /* abs(b) */ + const __vec16_i32 lo_m1 = _mm512_mullo_epi32(a.v_lo, b.v_lo); + const __vec16_i32 hi_m1 = _mm512_mulhi_epu32(a.v_lo, b.v_lo); const __vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo); const __vec16_i32 hi_m3 = _mm512_mullo_epi32(a.v_lo, b.v_hi); __mmask16 carry; const __vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m3, &carry); const __vec16_i32 hi = _mm512_adc_epi32(hi_p23, carry, hi_m1, &carry); - return __vec16_i64(hi,lo).cvt2zmm(); + const __vec16_i32 lo = lo_m1; + const __vec16_i64 ret_abs = __vec16_i64(hi,lo).cvt2zmm(); + /* if sign != 0, means either a or b is negative, then negate the result */ + return __select(sign, __sub(__vec16_i64(__ispc_zero, __ispc_zero), ret_abs), ret_abs); } -#else -BINARY_OP(__vec16_i64, __mul, *) -#endif #endif /* __ICC >= 1400 */ + static FORCEINLINE __vec16_i64 __or (__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_or_epi64 (a.v1, b.v1), _mm512_or_epi64 (a.v2, b.v2)); } static FORCEINLINE __vec16_i64 __and(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_and_epi64(a.v1, b.v1), _mm512_and_epi64(a.v2, b.v2)); } static FORCEINLINE __vec16_i64 __xor(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_xor_epi64(a.v1, b.v1), _mm512_xor_epi64(a.v2, b.v2)); } @@ -891,13 +916,6 @@ CMP_OP(__vec16_i64, i64, int64_t, __signed_less_than, <) CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_than, >) CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >) -static 
FORCEINLINE __vec16_i64 __select(__vec16_i1 mask, __vec16_i64 a, __vec16_i64 b) -{ - __vec16_i64 ret; - ret.v_hi = _mm512_mask_mov_epi64(b.v_hi, mask, a.v_hi); - ret.v_lo = _mm512_mask_mov_epi64(b.v_lo, mask >> 8, a.v_lo); - return ret; -} INSERT_EXTRACT(__vec16_i64, int64_t)
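
The 64-bit shifts fixed earlier in this series (__lshr/__ashr/__shl for
__vec16_i64) split each lane into 32-bit hi/lo halves and gate the cross-half
transfer on whether the shift amount is below 32. Plain C++ shifts are
undefined for counts of 32 or more, so the scalar sketch below models the
zeroing behaviour that the vector code appears to rely on; the helper names
are hypothetical and not part of the header:

    #include <cstdint>
    #include <cstdio>

    // Model the per-lane variable shift as producing 0 once the count
    // reaches 32, matching what the hi/lo recombination below assumes.
    static uint32_t shl32(uint32_t x, uint32_t n)  { return n < 32 ? x << n : 0; }
    static uint32_t lshr32(uint32_t x, uint32_t n) { return n < 32 ? x >> n : 0; }

    // Logical right shift of a 64-bit value held as two 32-bit halves,
    // for shift amounts 0..63.
    static void lshr64(uint32_t &hi, uint32_t &lo, uint32_t n) {
        // "safety gate": for n >= 32 the bits entering the low half come
        // from hi >> (n - 32) rather than hi << (32 - n)
        uint32_t xfer = (n < 32) ? shl32(hi, 32 - n) : lshr32(hi, n - 32);
        lo = lshr32(lo, n) | xfer;
        hi = lshr32(hi, n);
    }

    int main() {
        uint32_t hi = 0x12345678u, lo = 0x9ABCDEF0u;
        lshr64(hi, lo, 36);                 // expect 00000000 01234567
        printf("%08x %08x\n", hi, lo);
        return 0;
    }

__shl follows the same pattern with the transfer moving from the low half into
the high half, and __ashr uses an arithmetic shift of the high half while the
low half is recombined as above.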