From 0277ba1aaa8a3c2b9441b149942bbc9c0ed3be5d Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Tue, 23 Jul 2013 16:49:07 -0700
Subject: [PATCH 01/34] Improve warnings for right shift by varying amounts.

Fixes:

- Don't issue a warning when the shift is by the same amount in all vector lanes.
- Do issue a warning when it's a compile-time constant but the values are different in different lanes.

Previously, we warned iff the shift amount wasn't a compile-time constant.
---
 expr.cpp | 39 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

diff --git a/expr.cpp b/expr.cpp
index fc3d295a..894942d2 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -1911,6 +1911,40 @@ lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1,
 }
 
 
+/* Returns true if shifting right by the given amount will lead to
+   inefficient code. (Assumes x86 target. May also warn inaccurately if
+   later optimizations simplify the shift amount more than we are able to
+   see at this point.) */
+static bool
+lIsDifficultShiftAmount(Expr *expr) {
+    // Uniform shifts (of uniform values) are no problem.
+    if (expr->GetType()->IsVaryingType() == false)
+        return false;
+
+    ConstExpr *ce = dynamic_cast<ConstExpr *>(expr);
+    if (ce) {
+        // If the shift is by a constant amount, *and* it's the same amount
+        // in all vector lanes, we're in good shape.
+        uint32_t amount[ISPC_MAX_NVEC];
+        int count = ce->GetValues(amount);
+        for (int i = 1; i < count; ++i)
+            if (amount[i] != amount[0])
+                return true;
+        return false;
+    }
+
+    TypeCastExpr *tce = dynamic_cast<TypeCastExpr *>(expr);
+    if (tce && tce->expr) {
+        // Finally, if the shift amount is given by a uniform value that's
+        // been smeared out into a varying, we have the same shift for all
+        // lanes and are also in good shape.
+        return (tce->expr->GetType()->IsUniformType() == false);
+    }
+
+    return true;
+}
+
+
 llvm::Value *
 BinaryExpr::GetValue(FunctionEmitContext *ctx) const {
     if (!arg0 || !arg1) {
@@ -1951,9 +1985,8 @@ BinaryExpr::GetValue(FunctionEmitContext *ctx) const {
     case BitAnd:
     case BitXor:
     case BitOr: {
-        if (op == Shr && arg1->GetType()->IsVaryingType() &&
-            dynamic_cast<ConstExpr *>(arg1) == NULL)
-            PerformanceWarning(pos, "Shift right is extremely inefficient for "
+        if (op == Shr && lIsDifficultShiftAmount(arg1))
+            PerformanceWarning(pos, "Shift right is inefficient for "
                                "varying shift amounts.");
         return lEmitBinaryBitOp(op, value0, value1,
                                 arg0->GetType()->IsUnsignedType(), ctx);

From 83e1630fbcfde4aa67b50245cd96e36cbe033660 Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Tue, 23 Jul 2013 16:49:56 -0700
Subject: [PATCH 02/34] Add support for fast division of varying int values by small constants.

For varying int8/16/32 types, divides by small constants can be
implemented efficiently through multiplies and shifts with integer types
of twice the bit-width; this commit adds this optimization.
(Implementation is based on Halide.)
---
 expr.cpp | 69 +++++
 stdlib.ispc | 675 ++++++++++++++++++++++++++++++++++++++++++++++++
 tests/idiv.ispc | 75 ++++++
 3 files changed, 819 insertions(+)
 create mode 100644 tests/idiv.ispc

diff --git a/expr.cpp b/expr.cpp
index 894942d2..3baaabaf 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -2240,6 +2240,49 @@ lConstFoldBinaryIntOp(ConstExpr *constArg0, ConstExpr *constArg1,
 }
 
 
+/* Returns true if the given arguments (which are assumed to be the
+   operands of a divide) represent a divide that can be performed by one of
+   the __fast_idiv functions.
+ */
+static bool
+lCanImproveVectorDivide(Expr *arg0, Expr *arg1, int *divisor) {
+    const Type *type = arg0->GetType();
+    if (!type)
+        return false;
+
+    // The value being divided must be an int8/16/32.
+    if (!(Type::EqualIgnoringConst(type, AtomicType::VaryingInt8) ||
+          Type::EqualIgnoringConst(type, AtomicType::VaryingUInt8) ||
+          Type::EqualIgnoringConst(type, AtomicType::VaryingInt16) ||
+          Type::EqualIgnoringConst(type, AtomicType::VaryingUInt16) ||
+          Type::EqualIgnoringConst(type, AtomicType::VaryingInt32) ||
+          Type::EqualIgnoringConst(type, AtomicType::VaryingUInt32)))
+        return false;
+
+    // The divisor must be the same compile-time constant value for all of
+    // the vector lanes.
+    ConstExpr *ce = dynamic_cast<ConstExpr *>(arg1);
+    if (!ce)
+        return false;
+    int64_t div[ISPC_MAX_NVEC];
+    int count = ce->GetValues(div);
+    for (int i = 1; i < count; ++i)
+        if (div[i] != div[0])
+            return false;
+    *divisor = div[0];
+
+    // And finally, the divisor must be >= 2 and <128 (for 8-bit divides),
+    // and <256 otherwise.
+    if (*divisor < 2)
+        return false;
+    if (Type::EqualIgnoringConst(type, AtomicType::VaryingInt8) ||
+        Type::EqualIgnoringConst(type, AtomicType::VaryingUInt8))
+        return *divisor < 128;
+    else
+        return *divisor < 256;
+}
+
+
 Expr *
 BinaryExpr::Optimize() {
     if (arg0 == NULL || arg1 == NULL)
@@ -2302,6 +2345,32 @@ BinaryExpr::Optimize() {
     }
     }
 
+    int divisor;
+    if (op == Div && lCanImproveVectorDivide(arg0, arg1, &divisor)) {
+        Debug(pos, "Improving vector divide by constant %d", divisor);
+
+        std::vector<Symbol *> idivFuns;
+        m->symbolTable->LookupFunction("__fast_idiv", &idivFuns);
+        if (idivFuns.size() == 0) {
+            Warning(pos, "Couldn't find __fast_idiv to optimize integer divide. "
+                    "Are you compiling with --nostdlib?");
+            return this;
+        }
+
+        Expr *idivSymExpr = new FunctionSymbolExpr("__fast_idiv", idivFuns, pos);
+        ExprList *args = new ExprList(arg0, pos);
+        args->exprs.push_back(new ConstExpr(AtomicType::UniformInt32, divisor, arg1->pos));
+        Expr *idivCall = new FunctionCallExpr(idivSymExpr, args, pos);
+
+        idivCall = ::TypeCheck(idivCall);
+        if (idivCall == NULL)
+            return NULL;
+
+        Assert(Type::EqualIgnoringConst(GetType(), idivCall->GetType()));
+        idivCall = new TypeCastExpr(GetType(), idivCall, pos);
+        return ::Optimize(idivCall);
+    }
+
     // From here on out, we're just doing constant folding, so if both args
     // aren't constants then we're done...
     if (constArg0 == NULL || constArg1 == NULL)

diff --git a/stdlib.ispc b/stdlib.ispc
index 4e06f5da..b8ed2057 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -4264,3 +4264,678 @@ static inline bool rdrand(int64 * ptr) {
         return success;
     }
 }
+
+///////////////////////////////////////////////////////////////////////////
+// Fast vector integer division
+
+/* These tables and the algorithms in the __fast_idiv() functions below are
+   from Halide; the idea is based on the paper "Division by Invariant
+   Integers using Multiplication" by Granlund and Montgomery.
+ + Copyright (c) 2012 MIT CSAIL + + Developed by: + + The Halide team + MIT CSAIL + http://halide-lang.org + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +static const uniform int64 __idiv_table_u8[][3] = { + {0, 0LL, 1}, {1, 171LL, 1}, {0, 0LL, 2}, + {1, 205LL, 2}, {1, 171LL, 2}, {2, 37LL, 2}, + {0, 0LL, 3}, {1, 57LL, 1}, {1, 205LL, 3}, + {2, 117LL, 3}, {1, 171LL, 3}, {1, 79LL, 2}, + {2, 37LL, 3}, {1, 137LL, 3}, {0, 0LL, 4}, + {1, 241LL, 4}, {1, 57LL, 2}, {1, 27LL, 1}, + {1, 205LL, 4}, {2, 135LL, 4}, {2, 117LL, 4}, + {2, 101LL, 4}, {1, 171LL, 4}, {1, 41LL, 2}, + {1, 79LL, 3}, {1, 19LL, 1}, {2, 37LL, 4}, + {2, 27LL, 4}, {1, 137LL, 4}, {2, 9LL, 4}, + {0, 0LL, 5}, {1, 249LL, 5}, {1, 241LL, 5}, + {1, 235LL, 5}, {1, 57LL, 3}, {1, 111LL, 4}, + {1, 27LL, 2}, {2, 165LL, 5}, {1, 205LL, 5}, + {1, 25LL, 2}, {2, 135LL, 5}, {1, 191LL, 5}, + {1, 187LL, 5}, {2, 109LL, 5}, {2, 101LL, 5}, + {1, 175LL, 5}, {1, 171LL, 5}, {2, 79LL, 5}, + {1, 41LL, 3}, {1, 161LL, 5}, {1, 79LL, 4}, + {1, 155LL, 5}, {1, 19LL, 2}, {1, 149LL, 5}, + {2, 37LL, 5}, {1, 9LL, 1}, {2, 27LL, 5}, + {1, 139LL, 5}, {1, 137LL, 5}, {2, 13LL, 5}, + {2, 9LL, 5}, {2, 5LL, 5}, {0, 0LL, 6}, + {1, 253LL, 6}, {1, 249LL, 6}, {1, 245LL, 6}, + {1, 121LL, 5}, {1, 119LL, 5}, {1, 235LL, 6}, + {1, 231LL, 6}, {1, 57LL, 4}, {1, 225LL, 6}, + {1, 111LL, 5}, {1, 219LL, 6}, {1, 27LL, 3}, + {1, 213LL, 6}, {2, 165LL, 6}, {1, 13LL, 2}, + {1, 205LL, 6}, {1, 203LL, 6}, {1, 25LL, 3}, + {1, 99LL, 5}, {2, 135LL, 6}, {1, 193LL, 6}, + {1, 191LL, 6}, {1, 189LL, 6}, {1, 187LL, 6}, + {1, 185LL, 6}, {1, 183LL, 6}, {1, 181LL, 6}, + {1, 179LL, 6}, {1, 177LL, 6}, {1, 175LL, 6}, + {1, 173LL, 6}, {1, 171LL, 6}, {1, 169LL, 6}, + {1, 21LL, 3}, {1, 83LL, 5}, {1, 41LL, 4}, + {1, 163LL, 6}, {1, 161LL, 6}, {2, 63LL, 6}, + {1, 79LL, 5}, {2, 57LL, 6}, {1, 155LL, 6}, + {2, 51LL, 6}, {1, 19LL, 3}, {1, 151LL, 6}, + {1, 149LL, 6}, {1, 37LL, 4}, {2, 37LL, 6}, + {1, 145LL, 6}, {1, 9LL, 2}, {1, 143LL, 6}, + {2, 27LL, 6}, {2, 25LL, 6}, {1, 139LL, 6}, + {1, 69LL, 5}, {1, 137LL, 6}, {2, 15LL, 6}, + {2, 13LL, 6}, {2, 11LL, 6}, {2, 9LL, 6}, + {2, 7LL, 6}, {2, 5LL, 6}, {2, 3LL, 6}, + {0, 0LL, 7}, {1, 255LL, 7}, {1, 127LL, 6}, + {1, 63LL, 5}, {1, 125LL, 6}, {1, 31LL, 4}, + {1, 123LL, 6}, {1, 61LL, 5}, {1, 121LL, 6}, + {1, 15LL, 3}, {1, 119LL, 6}, {1, 59LL, 5}, + {1, 235LL, 7}, {1, 117LL, 6}, {1, 29LL, 4}, + {1, 115LL, 6}, {1, 57LL, 5}, {1, 113LL, 6}, + {1, 225LL, 7}, {1, 7LL, 2}, {1, 111LL, 6}, + {1, 55LL, 5}, {1, 219LL, 7}, {1, 109LL, 6}, + {1, 27LL, 4}, {1, 215LL, 7}, {1, 107LL, 6}, + {1, 53LL, 5}, 
{1, 211LL, 7}, {1, 105LL, 6}, + {1, 13LL, 3}, {1, 207LL, 7}, {1, 103LL, 6}, + {1, 51LL, 5}, {1, 203LL, 7}, {1, 101LL, 6}, + {1, 25LL, 4}, {1, 199LL, 7}, {1, 99LL, 6}, + {1, 197LL, 7}, {1, 49LL, 5}, {1, 97LL, 6}, + {1, 193LL, 7}, {1, 3LL, 1}, {1, 191LL, 7}, + {1, 95LL, 6}, {1, 189LL, 7}, {1, 47LL, 5}, + {1, 187LL, 7}, {1, 93LL, 6}, {1, 185LL, 7}, + {1, 23LL, 4}, {1, 183LL, 7}, {1, 91LL, 6}, + {1, 181LL, 7}, {1, 45LL, 5}, {1, 179LL, 7}, + {1, 89LL, 6}, {1, 177LL, 7}, {1, 11LL, 3}, + {1, 175LL, 7}, {1, 87LL, 6}, {1, 173LL, 7}, + {1, 43LL, 5}, {1, 171LL, 7}, {1, 85LL, 6}, + {1, 169LL, 7}, {2, 81LL, 7}, {1, 21LL, 4}, + {1, 167LL, 7}, {1, 83LL, 6}, {1, 165LL, 7}, + {1, 41LL, 5}, {2, 71LL, 7}, {1, 163LL, 7}, + {1, 81LL, 6}, {1, 161LL, 7}, {1, 5LL, 2}, + {2, 63LL, 7}, {1, 159LL, 7}, {1, 79LL, 6}, + {1, 157LL, 7}, {2, 57LL, 7}, {1, 39LL, 5}, + {1, 155LL, 7}, {1, 77LL, 6}, {2, 51LL, 7}, + {1, 153LL, 7}, {1, 19LL, 4}, {2, 47LL, 7}, + {1, 151LL, 7}, {1, 75LL, 6}, {1, 149LL, 7}, + {2, 41LL, 7}, {1, 37LL, 5}, {1, 147LL, 7}, + {2, 37LL, 7}, {1, 73LL, 6}, {1, 145LL, 7}, + {2, 33LL, 7}, {1, 9LL, 3}, {2, 31LL, 7}, + {1, 143LL, 7}, {1, 71LL, 6}, {2, 27LL, 7}, + {1, 141LL, 7}, {2, 25LL, 7}, {1, 35LL, 5}, + {1, 139LL, 7}, {2, 21LL, 7}, {1, 69LL, 6}, + {2, 19LL, 7}, {1, 137LL, 7}, {1, 17LL, 4}, + {2, 15LL, 7}, {1, 135LL, 7}, {2, 13LL, 7}, + {1, 67LL, 6}, {2, 11LL, 7}, {1, 133LL, 7}, + {2, 9LL, 7}, {1, 33LL, 5}, {2, 7LL, 7}, + {1, 131LL, 7}, {2, 5LL, 7}, {1, 65LL, 6}, + {2, 3LL, 7}, {1, 129LL, 7}, {0, 0LL, 8}, +}; +static const uniform int64 __idiv_table_s8[][3] = { + {0, 0LL, 1}, {1, 86LL, 0}, {0, 0LL, 2}, + {1, 103LL, 1}, {1, 43LL, 0}, {1, 147LL, 2}, + {0, 0LL, 3}, {1, 57LL, 1}, {1, 103LL, 2}, + {1, 187LL, 3}, {1, 43LL, 1}, {1, 79LL, 2}, + {1, 147LL, 3}, {1, 137LL, 3}, {0, 0LL, 4}, + {1, 121LL, 3}, {1, 57LL, 2}, {1, 27LL, 1}, + {1, 103LL, 3}, {1, 49LL, 2}, {1, 187LL, 4}, + {1, 179LL, 4}, {1, 43LL, 2}, {1, 41LL, 2}, + {1, 79LL, 3}, {1, 19LL, 1}, {1, 147LL, 4}, + {1, 71LL, 3}, {1, 137LL, 4}, {1, 133LL, 4}, + {0, 0LL, 5}, {1, 125LL, 4}, {1, 121LL, 4}, + {1, 59LL, 3}, {1, 57LL, 3}, {1, 111LL, 4}, + {1, 27LL, 2}, {1, 211LL, 5}, {1, 103LL, 4}, + {1, 25LL, 2}, {1, 49LL, 3}, {1, 6LL, 0}, + {1, 47LL, 3}, {1, 23LL, 2}, {1, 45LL, 3}, + {1, 11LL, 1}, {1, 43LL, 3}, {1, 21LL, 2}, + {1, 41LL, 3}, {1, 81LL, 4}, {1, 79LL, 4}, + {1, 39LL, 3}, {1, 19LL, 2}, {1, 75LL, 4}, + {1, 147LL, 5}, {1, 9LL, 1}, {1, 71LL, 4}, + {1, 35LL, 3}, {1, 137LL, 5}, {1, 135LL, 5}, + {1, 133LL, 5}, {1, 131LL, 5}, {0, 0LL, 6}, + {1, 127LL, 5}, {1, 63LL, 4}, {1, 31LL, 3}, + {1, 61LL, 4}, {1, 15LL, 2}, {1, 59LL, 4}, + {1, 29LL, 3}, {1, 57LL, 4}, {1, 113LL, 5}, + {1, 7LL, 1}, {1, 55LL, 4}, {1, 27LL, 3}, + {1, 107LL, 5}, {1, 53LL, 4}, {1, 13LL, 2}, + {1, 103LL, 5}, {1, 51LL, 4}, {1, 25LL, 3}, + {1, 99LL, 5}, {1, 49LL, 4}, {1, 97LL, 5}, + {1, 3LL, 0}, {1, 95LL, 5}, {1, 47LL, 4}, + {1, 93LL, 5}, {1, 23LL, 3}, {1, 91LL, 5}, + {1, 45LL, 4}, {1, 89LL, 5}, {1, 11LL, 2}, + {1, 87LL, 5}, {1, 43LL, 4}, {1, 85LL, 5}, + {1, 21LL, 3}, {1, 83LL, 5}, {1, 41LL, 4}, + {1, 163LL, 6}, {1, 81LL, 5}, {1, 5LL, 1}, + {1, 79LL, 5}, {1, 157LL, 6}, {1, 39LL, 4}, + {1, 77LL, 5}, {1, 19LL, 3}, {1, 151LL, 6}, + {1, 75LL, 5}, {1, 37LL, 4}, {1, 147LL, 6}, + {1, 73LL, 5}, {1, 9LL, 2}, {1, 143LL, 6}, + {1, 71LL, 5}, {1, 141LL, 6}, {1, 35LL, 4}, + {1, 69LL, 5}, {1, 137LL, 6}, {1, 17LL, 3}, + {1, 135LL, 6}, {1, 67LL, 5}, {1, 133LL, 6}, + {1, 33LL, 4}, {1, 131LL, 6}, {1, 65LL, 5}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, 
{0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, + {0, 0LL, 7}, {0, 0LL, 7}, {0, 0LL, 7}, +}; +static const uniform int64 __idiv_table_u16[][3] = { + {0, 0LL, 1}, {1, 43691LL, 1}, {0, 0LL, 2}, + {1, 52429LL, 2}, {1, 43691LL, 2}, {2, 9363LL, 2}, + {0, 0LL, 3}, {1, 58255LL, 3}, {1, 52429LL, 3}, + {1, 47663LL, 3}, {1, 43691LL, 3}, {1, 20165LL, 2}, + {2, 9363LL, 3}, {1, 34953LL, 3}, {0, 0LL, 4}, + {1, 61681LL, 4}, {1, 58255LL, 4}, {1, 55189LL, 4}, + {1, 52429LL, 4}, {2, 34329LL, 4}, {1, 47663LL, 4}, + {2, 25645LL, 4}, {1, 43691LL, 4}, {2, 18351LL, 4}, + {1, 20165LL, 3}, {2, 12137LL, 4}, {2, 9363LL, 4}, + {1, 18079LL, 3}, {1, 34953LL, 4}, {2, 2115LL, 4}, + {0, 0LL, 5}, {1, 63551LL, 5}, {1, 61681LL, 5}, + {1, 59919LL, 5}, {1, 58255LL, 5}, {1, 7085LL, 2}, + {1, 55189LL, 5}, {2, 42011LL, 5}, {1, 52429LL, 5}, + {2, 36765LL, 5}, {2, 34329LL, 5}, {1, 48771LL, 5}, + {1, 47663LL, 5}, {1, 11651LL, 3}, {2, 25645LL, 5}, + {2, 23705LL, 5}, {1, 43691LL, 5}, {2, 20063LL, 5}, + {2, 18351LL, 5}, {1, 41121LL, 5}, {1, 20165LL, 4}, + {1, 39569LL, 5}, {2, 12137LL, 5}, {2, 10725LL, 5}, + {2, 9363LL, 5}, {2, 8049LL, 5}, {1, 18079LL, 4}, + {1, 35545LL, 5}, {1, 34953LL, 5}, {1, 8595LL, 3}, + {2, 2115LL, 5}, {2, 1041LL, 5}, {0, 0LL, 6}, + {1, 4033LL, 2}, {1, 63551LL, 6}, {1, 31301LL, 5}, + {1, 61681LL, 6}, {2, 56039LL, 6}, {1, 59919LL, 6}, + {1, 59075LL, 6}, {1, 58255LL, 6}, {1, 57457LL, 6}, + {1, 7085LL, 3}, {2, 46313LL, 6}, {1, 55189LL, 6}, + {1, 6809LL, 3}, {2, 42011LL, 6}, {1, 53093LL, 6}, + {1, 52429LL, 6}, {1, 25891LL, 5}, {2, 36765LL, 6}, + {1, 25267LL, 5}, {2, 34329LL, 6}, {1, 49345LL, 6}, + {1, 48771LL, 6}, {1, 48211LL, 6}, {1, 47663LL, 6}, + {2, 28719LL, 6}, {1, 11651LL, 4}, {2, 26647LL, 6}, + {2, 25645LL, 6}, {2, 24665LL, 6}, {2, 23705LL, 6}, + {1, 44151LL, 6}, {1, 43691LL, 6}, {2, 20945LL, 6}, + {2, 20063LL, 6}, {1, 42367LL, 6}, {2, 18351LL, 6}, + {1, 5191LL, 3}, {1, 41121LL, 6}, {1, 20361LL, 5}, + {1, 20165LL, 5}, {1, 19973LL, 5}, {1, 39569LL, 6}, + {2, 12863LL, 6}, {2, 
12137LL, 6}, {1, 2405LL, 2}, + {2, 10725LL, 6}, {1, 37787LL, 6}, {2, 9363LL, 6}, + {1, 18559LL, 5}, {2, 8049LL, 6}, {2, 7409LL, 6}, + {1, 18079LL, 5}, {1, 35849LL, 6}, {1, 35545LL, 6}, + {2, 4957LL, 6}, {1, 34953LL, 6}, {1, 4333LL, 3}, + {1, 8595LL, 4}, {2, 2665LL, 6}, {2, 2115LL, 6}, + {2, 1573LL, 6}, {2, 1041LL, 6}, {2, 517LL, 6}, + {0, 0LL, 7}, {1, 16257LL, 5}, {1, 4033LL, 3}, + {1, 16009LL, 5}, {1, 63551LL, 7}, {1, 63073LL, 7}, + {1, 31301LL, 6}, {1, 31069LL, 6}, {1, 61681LL, 7}, + {1, 61231LL, 7}, {2, 56039LL, 7}, {1, 30175LL, 6}, + {1, 59919LL, 7}, {1, 29747LL, 6}, {1, 59075LL, 7}, + {1, 29331LL, 6}, {1, 58255LL, 7}, {1, 57853LL, 7}, + {1, 57457LL, 7}, {1, 28533LL, 6}, {1, 7085LL, 4}, + {1, 14075LL, 5}, {2, 46313LL, 7}, {1, 27777LL, 6}, + {1, 55189LL, 7}, {1, 13707LL, 5}, {1, 6809LL, 4}, + {2, 42705LL, 7}, {2, 42011LL, 7}, {1, 53431LL, 7}, + {1, 53093LL, 7}, {1, 52759LL, 7}, {1, 52429LL, 7}, + {2, 38671LL, 7}, {1, 25891LL, 6}, {1, 6433LL, 4}, + {2, 36765LL, 7}, {2, 36145LL, 7}, {1, 25267LL, 6}, + {2, 34927LL, 7}, {2, 34329LL, 7}, {1, 49637LL, 7}, + {1, 49345LL, 7}, {2, 32577LL, 7}, {1, 48771LL, 7}, + {2, 31443LL, 7}, {1, 48211LL, 7}, {1, 47935LL, 7}, + {1, 47663LL, 7}, {2, 29251LL, 7}, {2, 28719LL, 7}, + {1, 2929LL, 3}, {1, 11651LL, 5}, {1, 23173LL, 6}, + {2, 26647LL, 7}, {1, 2865LL, 3}, {2, 25645LL, 7}, + {1, 1417LL, 2}, {2, 24665LL, 7}, {1, 44859LL, 7}, + {2, 23705LL, 7}, {2, 23233LL, 7}, {1, 44151LL, 7}, + {1, 2745LL, 3}, {1, 43691LL, 7}, {2, 21393LL, 7}, + {2, 20945LL, 7}, {1, 43019LL, 7}, {2, 20063LL, 7}, + {1, 21291LL, 6}, {1, 42367LL, 7}, {1, 21077LL, 6}, + {2, 18351LL, 7}, {1, 41735LL, 7}, {1, 5191LL, 4}, + {2, 17111LL, 7}, {1, 41121LL, 7}, {2, 16305LL, 7}, + {1, 20361LL, 6}, {1, 40525LL, 7}, {1, 20165LL, 6}, + {1, 40137LL, 7}, {1, 19973LL, 6}, {1, 39757LL, 7}, + {1, 39569LL, 7}, {2, 13231LL, 7}, {2, 12863LL, 7}, + {1, 39017LL, 7}, {2, 12137LL, 7}, {2, 11779LL, 7}, + {1, 2405LL, 3}, {2, 11073LL, 7}, {2, 10725LL, 7}, + {1, 18979LL, 6}, {1, 37787LL, 7}, {2, 9699LL, 7}, + {2, 9363LL, 7}, {1, 37283LL, 7}, {1, 18559LL, 6}, + {2, 8373LL, 7}, {2, 8049LL, 7}, {1, 4579LL, 4}, + {2, 7409LL, 7}, {2, 7093LL, 7}, {1, 18079LL, 6}, + {1, 36003LL, 7}, {1, 35849LL, 7}, {2, 5857LL, 7}, + {1, 35545LL, 7}, {1, 35395LL, 7}, {2, 4957LL, 7}, + {1, 35099LL, 7}, {1, 34953LL, 7}, {1, 4351LL, 4}, + {1, 4333LL, 4}, {2, 3507LL, 7}, {1, 8595LL, 5}, + {2, 2943LL, 7}, {2, 2665LL, 7}, {1, 16981LL, 6}, + {2, 2115LL, 7}, {2, 1843LL, 7}, {2, 1573LL, 7}, + {1, 33421LL, 7}, {2, 1041LL, 7}, {1, 33157LL, 7}, + {2, 517LL, 7}, {1, 32897LL, 7}, {0, 0LL, 8}, +}; +static const uniform int64 __idiv_table_s16[][3] = { + {0, 0LL, 1}, {1, 21846LL, 0}, {0, 0LL, 2}, + {1, 26215LL, 1}, {1, 10923LL, 0}, {1, 18725LL, 1}, + {0, 0LL, 3}, {1, 7282LL, 0}, {1, 26215LL, 2}, + {1, 5958LL, 0}, {1, 10923LL, 1}, {1, 20165LL, 2}, + {1, 18725LL, 2}, {1, 34953LL, 3}, {0, 0LL, 4}, + {1, 30841LL, 3}, {1, 3641LL, 0}, {1, 55189LL, 4}, + {1, 26215LL, 3}, {1, 49933LL, 4}, {1, 2979LL, 0}, + {1, 45591LL, 4}, {1, 10923LL, 2}, {1, 5243LL, 1}, + {1, 20165LL, 3}, {1, 38837LL, 4}, {1, 18725LL, 3}, + {1, 18079LL, 3}, {1, 34953LL, 4}, {1, 16913LL, 3}, + {0, 0LL, 5}, {1, 1986LL, 0}, {1, 30841LL, 4}, + {1, 3745LL, 1}, {1, 3641LL, 1}, {1, 7085LL, 2}, + {1, 55189LL, 5}, {1, 26887LL, 4}, {1, 26215LL, 4}, + {1, 51151LL, 5}, {1, 49933LL, 5}, {1, 12193LL, 3}, + {1, 2979LL, 1}, {1, 11651LL, 3}, {1, 45591LL, 5}, + {1, 44621LL, 5}, {1, 10923LL, 3}, {1, 2675LL, 1}, + {1, 5243LL, 2}, {1, 41121LL, 5}, {1, 20165LL, 4}, + {1, 19785LL, 4}, {1, 38837LL, 5}, {1, 38131LL, 5}, 
+ {1, 18725LL, 4}, {1, 36793LL, 5}, {1, 18079LL, 4}, + {1, 17773LL, 4}, {1, 34953LL, 5}, {1, 8595LL, 3}, + {1, 16913LL, 4}, {1, 33289LL, 5}, {0, 0LL, 6}, + {1, 4033LL, 2}, {1, 993LL, 0}, {1, 31301LL, 5}, + {1, 30841LL, 5}, {1, 15197LL, 4}, {1, 3745LL, 2}, + {1, 14769LL, 4}, {1, 3641LL, 2}, {1, 57457LL, 6}, + {1, 7085LL, 3}, {1, 55925LL, 6}, {1, 55189LL, 6}, + {1, 6809LL, 3}, {1, 26887LL, 5}, {1, 26547LL, 5}, + {1, 26215LL, 5}, {1, 25891LL, 5}, {1, 51151LL, 6}, + {1, 25267LL, 5}, {1, 49933LL, 6}, {1, 24673LL, 5}, + {1, 12193LL, 4}, {1, 48211LL, 6}, {1, 2979LL, 2}, + {1, 5891LL, 3}, {1, 11651LL, 4}, {1, 11523LL, 4}, + {1, 45591LL, 6}, {1, 45101LL, 6}, {1, 44621LL, 6}, + {1, 44151LL, 6}, {1, 10923LL, 4}, {1, 43241LL, 6}, + {1, 2675LL, 2}, {1, 662LL, 0}, {1, 5243LL, 3}, + {1, 5191LL, 3}, {1, 41121LL, 6}, {1, 20361LL, 5}, + {1, 20165LL, 5}, {1, 19973LL, 5}, {1, 19785LL, 5}, + {1, 1225LL, 1}, {1, 38837LL, 6}, {1, 2405LL, 2}, + {1, 38131LL, 6}, {1, 37787LL, 6}, {1, 18725LL, 5}, + {1, 18559LL, 5}, {1, 36793LL, 6}, {1, 36473LL, 6}, + {1, 18079LL, 5}, {1, 35849LL, 6}, {1, 17773LL, 5}, + {1, 35247LL, 6}, {1, 34953LL, 6}, {1, 4333LL, 3}, + {1, 8595LL, 4}, {1, 34101LL, 6}, {1, 16913LL, 5}, + {1, 33555LL, 6}, {1, 33289LL, 6}, {1, 33027LL, 6}, + {0, 0LL, 7}, {1, 16257LL, 5}, {1, 4033LL, 3}, + {1, 16009LL, 5}, {1, 993LL, 1}, {1, 31537LL, 6}, + {1, 31301LL, 6}, {1, 31069LL, 6}, {1, 30841LL, 6}, + {1, 3827LL, 3}, {1, 15197LL, 5}, {1, 30175LL, 6}, + {1, 3745LL, 3}, {1, 29747LL, 6}, {1, 14769LL, 5}, + {1, 29331LL, 6}, {1, 3641LL, 3}, {1, 28927LL, 6}, + {1, 57457LL, 7}, {1, 28533LL, 6}, {1, 7085LL, 4}, + {1, 14075LL, 5}, {1, 55925LL, 7}, {1, 27777LL, 6}, + {1, 55189LL, 7}, {1, 13707LL, 5}, {1, 6809LL, 4}, + {1, 54121LL, 7}, {1, 26887LL, 6}, {1, 6679LL, 4}, + {1, 26547LL, 6}, {1, 6595LL, 4}, {1, 26215LL, 6}, + {1, 6513LL, 4}, {1, 25891LL, 6}, {1, 6433LL, 4}, + {1, 51151LL, 7}, {1, 50841LL, 7}, {1, 25267LL, 6}, + {1, 6279LL, 4}, {1, 49933LL, 7}, {1, 24819LL, 6}, + {1, 24673LL, 6}, {1, 49057LL, 7}, {1, 12193LL, 5}, + {1, 24245LL, 6}, {1, 48211LL, 7}, {1, 749LL, 1}, + {1, 2979LL, 3}, {1, 23697LL, 6}, {1, 5891LL, 4}, + {1, 2929LL, 3}, {1, 11651LL, 5}, {1, 23173LL, 6}, + {1, 11523LL, 5}, {1, 2865LL, 3}, {1, 45591LL, 7}, + {1, 1417LL, 2}, {1, 45101LL, 7}, {1, 11215LL, 5}, + {1, 44621LL, 7}, {1, 44385LL, 7}, {1, 44151LL, 7}, + {1, 2745LL, 3}, {1, 10923LL, 5}, {1, 43465LL, 7}, + {1, 43241LL, 7}, {1, 43019LL, 7}, {1, 2675LL, 3}, + {1, 21291LL, 6}, {1, 331LL, 0}, {1, 21077LL, 6}, + {1, 5243LL, 4}, {1, 41735LL, 7}, {1, 5191LL, 4}, + {1, 10331LL, 5}, {1, 41121LL, 7}, {1, 40921LL, 7}, + {1, 20361LL, 6}, {1, 40525LL, 7}, {1, 20165LL, 6}, + {1, 20069LL, 6}, {1, 19973LL, 6}, {1, 39757LL, 7}, + {1, 19785LL, 6}, {1, 4923LL, 4}, {1, 1225LL, 2}, + {1, 39017LL, 7}, {1, 38837LL, 7}, {1, 19329LL, 6}, + {1, 2405LL, 3}, {1, 38305LL, 7}, {1, 38131LL, 7}, + {1, 18979LL, 6}, {1, 37787LL, 7}, {1, 18809LL, 6}, + {1, 18725LL, 6}, {1, 37283LL, 7}, {1, 18559LL, 6}, + {1, 36955LL, 7}, {1, 36793LL, 7}, {1, 4579LL, 4}, + {1, 36473LL, 7}, {1, 36315LL, 7}, {1, 18079LL, 6}, + {1, 36003LL, 7}, {1, 35849LL, 7}, {1, 35697LL, 7}, + {1, 17773LL, 6}, {1, 8849LL, 5}, {1, 35247LL, 7}, + {1, 35099LL, 7}, {1, 34953LL, 7}, {1, 4351LL, 4}, + {1, 4333LL, 4}, {1, 17261LL, 6}, {1, 8595LL, 5}, + {1, 535LL, 1}, {1, 34101LL, 7}, {1, 16981LL, 6}, + {1, 16913LL, 6}, {1, 16845LL, 6}, {1, 33555LL, 7}, + {1, 33421LL, 7}, {1, 33289LL, 7}, {1, 33157LL, 7}, + {1, 33027LL, 7}, {1, 32897LL, 7}, {1, 32769LL, 7}, +}; +static const uniform int64 __idiv_table_u32[][3] = { + {0, 
0LL, 1}, {1, 2863311531LL, 1}, {0, 0LL, 2}, + {1, 3435973837LL, 2}, {1, 2863311531LL, 2}, {2, 613566757LL, 2}, + {0, 0LL, 3}, {1, 954437177LL, 1}, {1, 3435973837LL, 3}, + {1, 3123612579LL, 3}, {1, 2863311531LL, 3}, {1, 1321528399LL, 2}, + {2, 613566757LL, 3}, {1, 2290649225LL, 3}, {0, 0LL, 4}, + {1, 4042322161LL, 4}, {1, 954437177LL, 2}, {2, 2938661835LL, 4}, + {1, 3435973837LL, 4}, {2, 2249744775LL, 4}, {1, 3123612579LL, 4}, + {1, 2987803337LL, 4}, {1, 2863311531LL, 4}, {1, 1374389535LL, 3}, + {1, 1321528399LL, 3}, {2, 795364315LL, 4}, {2, 613566757LL, 4}, + {1, 2369637129LL, 4}, {1, 2290649225LL, 4}, {2, 138547333LL, 4}, + {0, 0LL, 5}, {1, 1041204193LL, 3}, {1, 4042322161LL, 5}, + {2, 3558687189LL, 5}, {1, 954437177LL, 3}, {2, 3134165325LL, 5}, + {2, 2938661835LL, 5}, {2, 2753184165LL, 5}, {1, 3435973837LL, 5}, + {1, 3352169597LL, 5}, {2, 2249744775LL, 5}, {1, 799063683LL, 3}, + {1, 3123612579LL, 5}, {2, 1813430637LL, 5}, {1, 2987803337LL, 5}, + {1, 2924233053LL, 5}, {1, 2863311531LL, 5}, {1, 1402438301LL, 4}, + {1, 1374389535LL, 4}, {1, 2694881441LL, 5}, {1, 1321528399LL, 4}, + {2, 891408307LL, 5}, {2, 795364315LL, 5}, {2, 702812831LL, 5}, + {2, 613566757LL, 5}, {2, 527452125LL, 5}, {1, 2369637129LL, 5}, + {1, 582368447LL, 3}, {1, 2290649225LL, 5}, {1, 1126548799LL, 4}, + {2, 138547333LL, 5}, {2, 68174085LL, 5}, {0, 0LL, 6}, + {1, 4228890877LL, 6}, {1, 1041204193LL, 4}, {1, 128207979LL, 1}, + {1, 4042322161LL, 6}, {1, 1991868891LL, 5}, {2, 3558687189LL, 6}, + {1, 3871519817LL, 6}, {1, 954437177LL, 4}, {2, 3235934265LL, 6}, + {2, 3134165325LL, 6}, {1, 458129845LL, 3}, {2, 2938661835LL, 6}, + {1, 892460737LL, 4}, {2, 2753184165LL, 6}, {1, 3479467177LL, 6}, + {1, 3435973837LL, 6}, {1, 3393554407LL, 6}, {1, 3352169597LL, 6}, + {1, 827945503LL, 4}, {2, 2249744775LL, 6}, {1, 3233857729LL, 6}, + {1, 799063683LL, 4}, {1, 789879043LL, 4}, {1, 3123612579LL, 6}, + {1, 3088515809LL, 6}, {2, 1813430637LL, 6}, {2, 1746305385LL, 6}, + {1, 2987803337LL, 6}, {1, 2955676419LL, 6}, {1, 2924233053LL, 6}, + {2, 1491936009LL, 6}, {1, 2863311531LL, 6}, {2, 1372618415LL, 6}, + {1, 1402438301LL, 5}, {1, 2776544515LL, 6}, {1, 1374389535LL, 5}, + {2, 1148159575LL, 6}, {1, 2694881441LL, 6}, {2, 1042467791LL, 6}, + {1, 1321528399LL, 5}, {2, 940802361LL, 6}, {2, 891408307LL, 6}, + {2, 842937507LL, 6}, {2, 795364315LL, 6}, {2, 748664025LL, 6}, + {2, 702812831LL, 6}, {2, 657787785LL, 6}, {2, 613566757LL, 6}, + {2, 570128403LL, 6}, {2, 527452125LL, 6}, {2, 485518043LL, 6}, + {1, 2369637129LL, 6}, {2, 403800345LL, 6}, {1, 582368447LL, 4}, + {1, 1154949189LL, 5}, {1, 2290649225LL, 6}, {2, 248469183LL, 6}, + {1, 1126548799LL, 5}, {2, 174592167LL, 6}, {2, 138547333LL, 6}, + {1, 274877907LL, 3}, {2, 68174085LL, 6}, {2, 33818641LL, 6}, + {0, 0LL, 7}, {1, 266354561LL, 3}, {1, 4228890877LL, 7}, + {1, 4196609267LL, 7}, {1, 1041204193LL, 5}, {1, 4133502361LL, 7}, + {1, 128207979LL, 2}, {1, 4072265289LL, 7}, {1, 4042322161LL, 7}, + {1, 125400505LL, 2}, {1, 1991868891LL, 6}, {1, 1977538899LL, 6}, + {2, 3558687189LL, 7}, {1, 974744351LL, 5}, {1, 3871519817LL, 7}, + {1, 3844446251LL, 7}, {1, 954437177LL, 5}, {1, 3791419407LL, 7}, + {2, 3235934265LL, 7}, {1, 3739835469LL, 7}, {2, 3134165325LL, 7}, + {1, 3689636335LL, 7}, {1, 458129845LL, 4}, {1, 910191745LL, 5}, + {2, 2938661835LL, 7}, {1, 3593175255LL, 7}, {1, 892460737LL, 5}, + {1, 3546811703LL, 7}, {2, 2753184165LL, 7}, {1, 875407347LL, 5}, + {1, 3479467177LL, 7}, {2, 2620200175LL, 7}, {1, 3435973837LL, 7}, + {1, 3414632385LL, 7}, {1, 3393554407LL, 7}, {1, 3372735055LL, 7}, + {1, 
3352169597LL, 7}, {1, 1665926709LL, 6}, {1, 827945503LL, 5}, + {1, 1645975491LL, 6}, {2, 2249744775LL, 7}, {1, 1626496491LL, 6}, + {1, 3233857729LL, 7}, {2, 2134925265LL, 7}, {1, 799063683LL, 5}, + {2, 2060591247LL, 7}, {1, 789879043LL, 5}, {1, 1570730897LL, 6}, + {1, 3123612579LL, 7}, {2, 1916962805LL, 7}, {1, 3088515809LL, 7}, + {2, 1847555765LL, 7}, {2, 1813430637LL, 7}, {1, 3037324939LL, 7}, + {2, 1746305385LL, 7}, {1, 3004130131LL, 7}, {1, 2987803337LL, 7}, + {2, 1648338801LL, 7}, {1, 2955676419LL, 7}, {1, 2939870663LL, 7}, + {1, 2924233053LL, 7}, {2, 1522554545LL, 7}, {2, 1491936009LL, 7}, + {1, 2878302691LL, 7}, {1, 2863311531LL, 7}, {1, 356059465LL, 4}, + {2, 1372618415LL, 7}, {2, 1343553873LL, 7}, {1, 1402438301LL, 6}, + {2, 1286310003LL, 7}, {1, 2776544515LL, 7}, {1, 1381296015LL, 6}, + {1, 1374389535LL, 6}, {1, 42735993LL, 1}, {2, 1148159575LL, 7}, + {1, 2708156719LL, 7}, {1, 2694881441LL, 7}, {1, 1340867839LL, 6}, + {2, 1042467791LL, 7}, {1, 663956297LL, 5}, {1, 1321528399LL, 6}, + {1, 2630410593LL, 7}, {2, 940802361LL, 7}, {1, 2605477791LL, 7}, + {2, 891408307LL, 7}, {1, 2581013211LL, 7}, {2, 842937507LL, 7}, + {1, 1278501893LL, 6}, {2, 795364315LL, 7}, {2, 771906565LL, 7}, + {2, 748664025LL, 7}, {2, 725633745LL, 7}, {2, 702812831LL, 7}, + {2, 680198441LL, 7}, {2, 657787785LL, 7}, {2, 635578121LL, 7}, + {2, 613566757LL, 7}, {1, 2443359173LL, 7}, {2, 570128403LL, 7}, + {2, 548696263LL, 7}, {2, 527452125LL, 7}, {1, 1200340205LL, 6}, + {2, 485518043LL, 7}, {2, 464823301LL, 7}, {1, 2369637129LL, 7}, + {2, 423966729LL, 7}, {2, 403800345LL, 7}, {2, 383805589LL, 7}, + {1, 582368447LL, 5}, {2, 344322273LL, 7}, {1, 1154949189LL, 6}, + {1, 2300233531LL, 7}, {1, 2290649225LL, 7}, {1, 285143057LL, 4}, + {2, 248469183LL, 7}, {1, 2262369605LL, 7}, {1, 1126548799LL, 6}, + {2, 192835267LL, 7}, {2, 174592167LL, 7}, {2, 156496785LL, 7}, + {2, 138547333LL, 7}, {2, 120742053LL, 7}, {1, 274877907LL, 4}, + {1, 2190262207LL, 7}, {2, 68174085LL, 7}, {1, 2172947881LL, 7}, + {2, 33818641LL, 7}, {1, 2155905153LL, 7}, {0, 0LL, 8}, +}; +static const uniform int64 __idiv_table_s32[][3] = { + {0, 0LL, 1}, {1, 1431655766LL, 0}, {0, 0LL, 2}, + {1, 1717986919LL, 1}, {1, 715827883LL, 0}, {1, 2454267027LL, 2}, + {0, 0LL, 3}, {1, 954437177LL, 1}, {1, 1717986919LL, 2}, + {1, 780903145LL, 1}, {1, 715827883LL, 1}, {1, 1321528399LL, 2}, + {1, 2454267027LL, 3}, {1, 2290649225LL, 3}, {0, 0LL, 4}, + {1, 2021161081LL, 3}, {1, 954437177LL, 2}, {1, 1808407283LL, 3}, + {1, 1717986919LL, 3}, {1, 818089009LL, 2}, {1, 780903145LL, 2}, + {1, 2987803337LL, 4}, {1, 715827883LL, 2}, {1, 1374389535LL, 3}, + {1, 1321528399LL, 3}, {1, 1272582903LL, 3}, {1, 2454267027LL, 4}, + {1, 2369637129LL, 4}, {1, 2290649225LL, 4}, {1, 2216757315LL, 4}, + {0, 0LL, 5}, {1, 1041204193LL, 3}, {1, 2021161081LL, 4}, + {1, 3926827243LL, 5}, {1, 954437177LL, 3}, {1, 3714566311LL, 5}, + {1, 1808407283LL, 4}, {1, 3524075731LL, 5}, {1, 1717986919LL, 4}, + {1, 1676084799LL, 4}, {1, 818089009LL, 3}, {1, 799063683LL, 3}, + {1, 780903145LL, 3}, {1, 3054198967LL, 5}, {1, 2987803337LL, 5}, + {1, 2924233053LL, 5}, {1, 715827883LL, 3}, {1, 1402438301LL, 4}, + {1, 1374389535LL, 4}, {1, 2694881441LL, 5}, {1, 1321528399LL, 4}, + {1, 1296593901LL, 4}, {1, 1272582903LL, 4}, {1, 156180629LL, 1}, + {1, 2454267027LL, 5}, {1, 2411209711LL, 5}, {1, 2369637129LL, 5}, + {1, 582368447LL, 3}, {1, 2290649225LL, 5}, {1, 1126548799LL, 4}, + {1, 2216757315LL, 5}, {1, 2181570691LL, 5}, {0, 0LL, 6}, + {1, 2114445439LL, 5}, {1, 1041204193LL, 4}, {1, 128207979LL, 1}, + {1, 
2021161081LL, 5}, {1, 1991868891LL, 5}, {1, 3926827243LL, 6}, + {1, 3871519817LL, 6}, {1, 954437177LL, 4}, {1, 3765450781LL, 6}, + {1, 3714566311LL, 6}, {1, 458129845LL, 3}, {1, 1808407283LL, 5}, + {1, 892460737LL, 4}, {1, 3524075731LL, 6}, {1, 1739733589LL, 5}, + {1, 1717986919LL, 5}, {1, 424194301LL, 3}, {1, 1676084799LL, 5}, + {1, 827945503LL, 4}, {1, 818089009LL, 4}, {1, 1616928865LL, 5}, + {1, 799063683LL, 4}, {1, 789879043LL, 4}, {1, 780903145LL, 4}, + {1, 3088515809LL, 6}, {1, 3054198967LL, 6}, {1, 3020636341LL, 6}, + {1, 2987803337LL, 6}, {1, 738919105LL, 4}, {1, 2924233053LL, 6}, + {1, 2893451653LL, 6}, {1, 715827883LL, 4}, {1, 354224107LL, 3}, + {1, 1402438301LL, 5}, {1, 2776544515LL, 6}, {1, 1374389535LL, 5}, + {1, 680390859LL, 4}, {1, 2694881441LL, 6}, {1, 333589693LL, 3}, + {1, 1321528399LL, 5}, {1, 2617884829LL, 6}, {1, 1296593901LL, 5}, + {1, 1284476201LL, 5}, {1, 1272582903LL, 5}, {1, 2521815661LL, 6}, + {1, 156180629LL, 2}, {1, 2476377541LL, 6}, {1, 2454267027LL, 6}, + {1, 1216273925LL, 5}, {1, 2411209711LL, 6}, {1, 1195121335LL, 5}, + {1, 2369637129LL, 6}, {1, 2349383821LL, 6}, {1, 582368447LL, 4}, + {1, 1154949189LL, 5}, {1, 2290649225LL, 6}, {1, 70991195LL, 1}, + {1, 1126548799LL, 5}, {1, 558694933LL, 4}, {1, 2216757315LL, 6}, + {1, 274877907LL, 3}, {1, 2181570691LL, 6}, {1, 2164392969LL, 6}, + {0, 0LL, 7}, {1, 266354561LL, 3}, {1, 2114445439LL, 6}, + {1, 1049152317LL, 5}, {1, 1041204193LL, 5}, {1, 4133502361LL, 7}, + {1, 128207979LL, 2}, {1, 4072265289LL, 7}, {1, 2021161081LL, 6}, + {1, 125400505LL, 2}, {1, 1991868891LL, 6}, {1, 1977538899LL, 6}, + {1, 3926827243LL, 7}, {1, 974744351LL, 5}, {1, 3871519817LL, 7}, + {1, 961111563LL, 5}, {1, 954437177LL, 5}, {1, 3791419407LL, 7}, + {1, 3765450781LL, 7}, {1, 1869917735LL, 6}, {1, 3714566311LL, 7}, + {1, 230602271LL, 3}, {1, 458129845LL, 4}, {1, 910191745LL, 5}, + {1, 1808407283LL, 6}, {1, 3593175255LL, 7}, {1, 892460737LL, 5}, + {1, 443351463LL, 4}, {1, 3524075731LL, 7}, {1, 875407347LL, 5}, + {1, 1739733589LL, 6}, {1, 432197967LL, 4}, {1, 1717986919LL, 6}, + {1, 3414632385LL, 7}, {1, 424194301LL, 4}, {1, 210795941LL, 3}, + {1, 1676084799LL, 6}, {1, 1665926709LL, 6}, {1, 827945503LL, 5}, + {1, 1645975491LL, 6}, {1, 818089009LL, 5}, {1, 1626496491LL, 6}, + {1, 1616928865LL, 6}, {1, 3214946281LL, 7}, {1, 799063683LL, 5}, + {1, 397222409LL, 4}, {1, 789879043LL, 5}, {1, 1570730897LL, 6}, + {1, 780903145LL, 5}, {1, 3105965051LL, 7}, {1, 3088515809LL, 7}, + {1, 3071261531LL, 7}, {1, 3054198967LL, 7}, {1, 759331235LL, 5}, + {1, 3020636341LL, 7}, {1, 3004130131LL, 7}, {1, 2987803337LL, 7}, + {1, 2971653049LL, 7}, {1, 738919105LL, 5}, {1, 2939870663LL, 7}, + {1, 2924233053LL, 7}, {1, 2908760921LL, 7}, {1, 2893451653LL, 7}, + {1, 2878302691LL, 7}, {1, 715827883LL, 5}, {1, 356059465LL, 4}, + {1, 354224107LL, 4}, {1, 2819260585LL, 7}, {1, 1402438301LL, 6}, + {1, 1395319325LL, 6}, {1, 2776544515LL, 7}, {1, 1381296015LL, 6}, + {1, 1374389535LL, 6}, {1, 42735993LL, 1}, {1, 680390859LL, 5}, + {1, 2708156719LL, 7}, {1, 2694881441LL, 7}, {1, 1340867839LL, 6}, + {1, 333589693LL, 4}, {1, 663956297LL, 5}, {1, 1321528399LL, 6}, + {1, 2630410593LL, 7}, {1, 2617884829LL, 7}, {1, 81421181LL, 2}, + {1, 1296593901LL, 6}, {1, 2581013211LL, 7}, {1, 1284476201LL, 6}, + {1, 1278501893LL, 6}, {1, 1272582903LL, 6}, {1, 2533436931LL, 7}, + {1, 2521815661LL, 7}, {1, 2510300521LL, 7}, {1, 156180629LL, 3}, + {1, 2487582869LL, 7}, {1, 2476377541LL, 7}, {1, 2465272709LL, 7}, + {1, 2454267027LL, 7}, {1, 2443359173LL, 7}, {1, 1216273925LL, 6}, + {1, 605457945LL, 
5}, {1, 2411209711LL, 7}, {1, 1200340205LL, 6}, + {1, 1195121335LL, 6}, {1, 2379895299LL, 7}, {1, 2369637129LL, 7}, + {1, 2359467013LL, 7}, {1, 2349383821LL, 7}, {1, 2339386443LL, 7}, + {1, 582368447LL, 5}, {1, 2319644785LL, 7}, {1, 1154949189LL, 6}, + {1, 2300233531LL, 7}, {1, 2290649225LL, 7}, {1, 285143057LL, 4}, + {1, 70991195LL, 2}, {1, 2262369605LL, 7}, {1, 1126548799LL, 6}, + {1, 1121950641LL, 6}, {1, 558694933LL, 5}, {1, 2225732041LL, 7}, + {1, 2216757315LL, 7}, {1, 2207854675LL, 7}, {1, 274877907LL, 4}, + {1, 2190262207LL, 7}, {1, 2181570691LL, 7}, {1, 2172947881LL, 7}, + {1, 2164392969LL, 7}, {1, 2155905153LL, 7}, {1, 2147483649LL, 7}, +}; + +__declspec(safe) +static unmasked unsigned int8 __fast_idiv(unsigned int8 numerator, + uniform unsigned int8 divisor) { + uniform int64 method = __idiv_table_u8[divisor-2][0]; + uniform int64 multiplier = __idiv_table_u8[divisor-2][1]; + uniform int64 shift = __idiv_table_u8[divisor-2][2]; + + unsigned int16 mult = multiplier; + unsigned int16 val = numerator; + if (method == 0) + return numerator >> shift; + else if (method == 1) + return (val * mult) >> (8 + shift); + else { + val *= mult; + val >>= 8; + val += (numerator-val)>>1; + return (val >> shift); + } +} + +__declspec(safe) +static unmasked int8 __fast_idiv(int8 numerator, uniform int8 divisor) { + uniform int8 method = __idiv_table_s8[divisor-2][0]; + uniform int16 multiplier = __idiv_table_s8[divisor-2][1]; + uniform int8 shift = __idiv_table_s8[divisor-2][2]; + + if (method == 0) + return numerator >> shift; + else { + unsigned int8 sign = numerator >> 7; + numerator ^= sign; + int16 mul = (int16)numerator * (int16)multiplier; + mul >>= 8 + shift; + return (int8)mul ^ sign; + } +} + +__declspec(safe) +static unmasked unsigned int16 __fast_idiv(unsigned int16 numerator, + uniform unsigned int16 divisor) { + uniform int64 method = __idiv_table_u16[divisor-2][0]; + uniform int64 multiplier = __idiv_table_u16[divisor-2][1]; + uniform int64 shift = __idiv_table_u16[divisor-2][2]; + + unsigned int32 mult = multiplier; + unsigned int32 val = numerator; + if (method == 0) + return numerator >> shift; + else if (method == 1) + return (val * mult) >> (16 + shift); + else { + val *= mult; + val >>= 16; + val += (numerator-val)>>1; + return val >> shift; + } +} + +__declspec(safe) +static unmasked int16 __fast_idiv(int16 numerator, uniform int16 divisor) { + uniform int64 method = __idiv_table_s16[divisor-2][0]; + uniform int64 multiplier = __idiv_table_s16[divisor-2][1]; + uniform int64 shift = __idiv_table_s16[divisor-2][2]; + + if (method == 0) + return numerator >> shift; + else { + unsigned int16 sign = numerator >> 15; + numerator ^= sign; + int32 mul = (int32)numerator * (int32)multiplier; + mul >>= 16 + shift; + int16 result = mul; + return result ^ sign; + } +} + +__declspec(safe) +static unmasked inline unsigned int32 __fast_idiv(unsigned int32 numerator, + uniform unsigned int32 divisor) { + uniform int64 method = __idiv_table_u32[divisor-2][0]; + uniform int64 multiplier = __idiv_table_u32[divisor-2][1]; + uniform int64 shift = __idiv_table_u32[divisor-2][2]; + + unsigned int64 mult = multiplier; + unsigned int64 val = numerator; + if (method == 0) + return numerator >> shift; + else if (method == 1) + return (val * mult) >> (32 + shift); + else { + val *= mult; + val >>= 32; + val += (numerator-val)>>1; + return val >> shift; + } +} + +__declspec(safe) +static unmasked int32 __fast_idiv(int32 numerator, uniform int32 divisor) { + uniform int64 method = 
__idiv_table_s32[divisor-2][0]; + uniform int64 multiplier = __idiv_table_s32[divisor-2][1]; + uniform int64 shift = __idiv_table_s32[divisor-2][2]; + + if (method == 0) + return numerator >> shift; + else { + unsigned int32 sign = numerator >> 31; + numerator ^= sign; + int64 mul = (int64)numerator * (int64)multiplier; + mul >>= 32 + shift; + int32 result = mul; + return result ^ sign; + } +} + diff --git a/tests/idiv.ispc b/tests/idiv.ispc new file mode 100644 index 00000000..b7bd78dc --- /dev/null +++ b/tests/idiv.ispc @@ -0,0 +1,75 @@ + +export uniform int width() { return programCount; } + + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int errorCount = 0; + + for (unsigned int8 num = 0; num < 255; ++num) { + for (uniform unsigned int8 div = 2; div < 255; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + } + } + } + + for (int8 num = 0; num < 127; ++num) { + for (uniform int8 div = 2; div < 127; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + } + } + } + + for (int16 num = 0; num < 32767; ++num) { + for (uniform int16 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + } + } + } + + for (unsigned int16 num = 0; num < 0xffff; ++num) { + for (uniform unsigned int16 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + } + } + } + + // randomly sample int32s... + uniform RNGState state; + seed_rng(&state, 1234); + for (uniform int i = 0; i < 1M; ++i) { + unsigned int32 num = random(&state); + for (uniform unsigned int32 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("ui32 error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + } + } + } + + for (uniform int64 i = 0; i < 1M; ++i) { + int32 num = random(&state); + if (num < 0) + continue; + for (uniform int32 div = 2; div < 256; ++div) { + if (__fast_idiv(num, div) != num/div) { + ++errorCount; + print("si32 error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + } + } + } + + RET[programIndex] = errorCount; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} + From e7abf3f2eacd50b0b8cb194fc87e878bdc25ddec Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 23 Jul 2013 16:38:10 -0700 Subject: [PATCH 03/34] Add support for mask vectors of 8 and 16-bit element types. There were a number of places throughout the system that assumed that the execution mask would only have either 32-bit or 1-bit elements. This commit makes it possible to have a target with an 8- or 16-bit mask. 
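As a concrete illustration of the new representation (an illustrative sketch only, for
a hypothetical 8-wide target with an 8-bit mask, mirroring the convertmask helpers this
patch adds to builtins/util.m4; the value names are made up for the example): a varying
comparison still produces an i1 vector, which is then sign-extended to the mask's
element width rather than always to i32.

    %lt   = icmp slt <8 x i32> %a, %b        ; comparison result is <8 x i1>
    %mask = sext <8 x i1> %lt to <8 x i8>    ; execution mask with 8-bit elements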
--- Makefile | 29 ++++++--- builtins.cpp | 35 +++++++---- builtins/util.m4 | 161 ++++++++++++++++++++++++++++++++--------------- ctx.cpp | 26 +++----- expr.cpp | 36 +++++------ llvmutil.cpp | 73 +++++++++++++++++---- parse.yy | 20 +++++- stdlib.ispc | 37 ++++++----- 8 files changed, 284 insertions(+), 133 deletions(-) diff --git a/Makefile b/Makefile index 835f8e15..043ab4cf 100644 --- a/Makefile +++ b/Makefile @@ -137,7 +137,7 @@ BISON_SRC=parse.yy FLEX_SRC=lex.ll OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \ - stdlib_generic_ispc.o stdlib_x86_ispc.o \ + stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o \ $(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o)) default: ispc @@ -243,12 +243,23 @@ objs/builtins-c-64.cpp: builtins/builtins.c @echo Creating C++ source from builtins definition file $< @$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c 64 > $@ -objs/stdlib_generic_ispc.cpp: stdlib.ispc - @echo Creating C++ source from $< for generic - @$(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \ - python stdlib2cpp.py generic > $@ +objs/stdlib_mask1_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask1 + @$(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + python stdlib2cpp.py mask1 > $@ + +objs/stdlib_mask8_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask8 + @$(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + python stdlib2cpp.py mask8 > $@ + +objs/stdlib_mask16_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask16 + @$(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + python stdlib2cpp.py mask16 > $@ + +objs/stdlib_mask32_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask32 + @$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + python stdlib2cpp.py mask32 > $@ -objs/stdlib_x86_ispc.cpp: stdlib.ispc - @echo Creating C++ source from $< for x86 - @$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \ - python stdlib2cpp.py x86 > $@ diff --git a/builtins.cpp b/builtins.cpp index 3e03de10..d3bbaa6a 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -112,10 +112,7 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) { return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64; // varying - if (LLVMTypes::MaskType != LLVMTypes::Int32VectorType && - t == LLVMTypes::MaskType) - return AtomicType::VaryingBool; - else if (t == LLVMTypes::Int8VectorType) + if (t == LLVMTypes::Int8VectorType) return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8; else if (t == LLVMTypes::Int16VectorType) return intAsUnsigned ? AtomicType::VaryingUInt16 : AtomicType::VaryingInt16; @@ -127,6 +124,8 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) { return AtomicType::VaryingDouble; else if (t == LLVMTypes::Int64VectorType) return intAsUnsigned ? AtomicType::VaryingUInt64 : AtomicType::VaryingInt64; + else if (t == LLVMTypes::MaskType) + return AtomicType::VaryingBool; // pointers to uniform else if (t == LLVMTypes::Int8PointerType) @@ -1038,16 +1037,30 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // If the user wants the standard library to be included, parse the // serialized version of the stdlib.ispc file to get its // definitions added. 
+ extern char stdlib_mask1_code[], stdlib_mask8_code[]; + extern char stdlib_mask16_code[], stdlib_mask32_code[]; if (g->target->getISA() == Target::GENERIC && - g->target->getVectorWidth() != 1) { // 1 wide uses x86 stdlib - extern char stdlib_generic_code[]; - yy_scan_string(stdlib_generic_code); - yyparse(); + g->target->getVectorWidth() == 1) { // 1 wide uses 32 stdlib + yy_scan_string(stdlib_mask32_code); } else { - extern char stdlib_x86_code[]; - yy_scan_string(stdlib_x86_code); - yyparse(); + switch (g->target->getMaskBitCount()) { + case 1: + yy_scan_string(stdlib_mask1_code); + break; + case 8: + yy_scan_string(stdlib_mask8_code); + break; + case 16: + yy_scan_string(stdlib_mask16_code); + break; + case 32: + yy_scan_string(stdlib_mask32_code); + break; + default: + FATAL("Unhandled mask bit size for stdlib.ispc"); + } } + yyparse(); } } diff --git a/builtins/util.m4 b/builtins/util.m4 index c19d4930..d6f3e5c3 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -690,6 +690,75 @@ shuffles(i64, 8) ;; $4: return type of the LLVM atomic type, in ispc naming paralance (e.g. int32) ;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...) +define(`mask_converts', ` +define internal <$1 x i8> @convertmask_i1_i8_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i1_i16_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i1_i32_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i1_i64_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i8_i8_$1(<$1 x i8>) { + ret <$1 x i8> %0 +} +define internal <$1 x i16> @convertmask_i8_i86_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i8_i32_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i8_i64_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i16_i8_$1(<$1 x i16>) { + %r = trunc <$1 x i16> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i16_i16_$1(<$1 x i16>) { + ret <$1 x i16> %0 +} +define internal <$1 x i32> @convertmask_i16_i32_$1(<$1 x i16>) { + %r = sext <$1 x i16> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i16_i64_$1(<$1 x i16>) { + %r = sext <$1 x i16> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i32_i8_$1(<$1 x i32>) { + %r = trunc <$1 x i32> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i32_i16_$1(<$1 x i32>) { + %r = trunc <$1 x i32> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i32_i32_$1(<$1 x i32>) { + ret <$1 x i32> %0 +} +define internal <$1 x i64> @convertmask_i32_i64_$1(<$1 x i32>) { + %r = sext <$1 x i32> %0 to <$1 x i64> + ret <$1 x i64> %r +} +') + +mask_converts(WIDTH) + define(`global_atomic_associative', ` define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val, @@ -697,17 +766,10 @@ define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val, ; first, for any lanes where the mask is off, compute a vector where those lanes ; hold the identity value.. 
- ; for the bit tricks below, we need the mask to be sign extended to be - ; the size of the element type. - ifelse( - MASK,i1,`%mask = sext <$1 x MASK> %m to <$1 x $3>', - $3,i64, `%mask = sext <$1 x MASK> %m to <$1 x i64>', - $3,i32, ` - ; silly workaround to do %mask = %m, which is not possible directly.. - %maskmem = alloca <$1 x i32> - store <$1 x i32> %m, <$1 x i32> * %maskmem - %mask = load <$1 x i32> * %maskmem' - ) + ; for the bit tricks below, we need the mask to have the + ; the same element size as the element type. + %mask = call <$1 x $3> @convertmask_`'MASK`'_$3_$1(<$1 x MASK> %m) + ; zero out any lanes that are off %valoff = and <$1 x $3> %val, %mask @@ -2440,13 +2502,12 @@ define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline { } define @__sext_varying_bool() nounwind readnone alwaysinline { - ifelse(MASK,i1, ` - %se = sext %0 to - ret %se - ', ` - ret %0') + ifelse(MASK,i32, `ret %0', + `%se = sext %0 to + ret %se') } + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; memcpy/memmove/memset @@ -3201,8 +3262,8 @@ return: ;; $1: llvm type of elements (and suffix for function name) define(`gen_masked_store', ` -define void @__masked_store_$1(* nocapture, , ) nounwind alwaysinline { - per_lane(WIDTH, %2, ` +define void @__masked_store_$1(* nocapture, , ) nounwind alwaysinline { + per_lane(WIDTH, %2, ` %ptr_LANE_ID = getelementptr * %0, i32 0, i32 LANE %storeval_LANE_ID = extractelement %1, i32 LANE store $1 %storeval_LANE_ID, $1 * %ptr_LANE_ID') @@ -3378,10 +3439,10 @@ define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, define(`packed_load_and_store', ` define i32 @__packed_load_active(i32 * %startptr, * %val_ptr, - %full_mask) nounwind alwaysinline { + %full_mask) nounwind alwaysinline { entry: - %mask = call i64 @__movmsk( %full_mask) - %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) + %mask = call i64 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: @@ -3432,10 +3493,10 @@ done: } define i32 @__packed_store_active(i32 * %startptr, %vals, - %full_mask) nounwind alwaysinline { + %full_mask) nounwind alwaysinline { entry: - %mask = call i64 @__movmsk( %full_mask) - %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) + %mask = call i64 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: @@ -3544,10 +3605,10 @@ check_neighbors: %castvr = call <$1 x $4> @__rotate_i$6(<$1 x $4> %castvec, i32 1) %vr = bitcast <$1 x $4> %castvr to <$1 x $2> %eq = $5 $7 <$1 x $2> %vec, %vr - ifelse(MASK,i32, ` - %eq32 = sext <$1 x i1> %eq to <$1 x i32> - %eqmm = call i64 @__movmsk(<$1 x i32> %eq32)', ` - %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)') + ifelse(MASK,i1, ` + %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)', + `%eqm = sext <$1 x i1> %eq to <$1 x MASK> + %eqmm = call i64 @__movmsk(<$1 x MASK> %eqm)') %alleq = icmp eq i64 %eqmm, ALL_ON_MASK br i1 %alleq, label %all_equal, label %not_all_equal ', ` @@ -3722,9 +3783,9 @@ pl_done: define(`gen_gather_general', ` ; fully general 32-bit gather, takes array of pointers encoded as vector of i32s define @__gather32_$1( %ptrs, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %ret_ptr = alloca - per_lane(WIDTH, %vecmask, ` + per_lane(WIDTH, %vecmask, ` %iptr_LANE_ID = extractelement %ptrs, i32 
LANE %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * %val_LANE_ID = load $1 * %ptr_LANE_ID @@ -3738,9 +3799,9 @@ define @__gather32_$1( %ptrs, ; fully general 64-bit gather, takes array of pointers encoded as vector of i32s define @__gather64_$1( %ptrs, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %ret_ptr = alloca - per_lane(WIDTH, %vecmask, ` + per_lane(WIDTH, %vecmask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * %val_LANE_ID = load $1 * %ptr_LANE_ID @@ -3804,7 +3865,7 @@ define @__gather_elt64_$1(i8 * %ptr, %offsets, i32 %o define @__gather_factored_base_offsets32_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing ; to require that the 0th element of the array being gathered from is always ; legal to read from (and we do indeed require that, given the benefits!) @@ -3813,13 +3874,13 @@ define @__gather_factored_base_offsets32_$1(i8 * %ptr, store zeroinitializer, * %offsetsPtr call void @__masked_store_blend_i32( * %offsetsPtr, %offsets, - %vecmask) + %vecmask) %newOffsets = load * %offsetsPtr %deltaPtr = alloca store zeroinitializer, * %deltaPtr call void @__masked_store_blend_i32( * %deltaPtr, %offset_delta, - %vecmask) + %vecmask) %newDelta = load * %deltaPtr %ret0 = call @__gather_elt32_$1(i8 * %ptr, %newOffsets, @@ -3835,7 +3896,7 @@ define @__gather_factored_base_offsets32_$1(i8 * %ptr, @__gather_factored_base_offsets64_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing ; to require that the 0th element of the array being gathered from is always ; legal to read from (and we do indeed require that, given the benefits!) 
@@ -3844,13 +3905,13 @@ define @__gather_factored_base_offsets64_$1(i8 * %ptr, store zeroinitializer, * %offsetsPtr call void @__masked_store_blend_i64( * %offsetsPtr, %offsets, - %vecmask) + %vecmask) %newOffsets = load * %offsetsPtr %deltaPtr = alloca store zeroinitializer, * %deltaPtr call void @__masked_store_blend_i64( * %deltaPtr, %offset_delta, - %vecmask) + %vecmask) %newDelta = load * %deltaPtr %ret0 = call @__gather_elt64_$1(i8 * %ptr, %newOffsets, @@ -3876,27 +3937,27 @@ gen_gather_factored($1) define @__gather_base_offsets32_$1(i8 * %ptr, i32 %offset_scale, %offsets, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %scale_vec = bitcast i32 %offset_scale to <1 x i32> %smear_scale = shufflevector <1 x i32> %scale_vec, <1 x i32> undef, < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > %scaled_offsets = mul %smear_scale, %offsets %v = call @__gather_factored_base_offsets32_$1(i8 * %ptr, %scaled_offsets, i32 1, - zeroinitializer, %vecmask) + zeroinitializer, %vecmask) ret %v } define @__gather_base_offsets64_$1(i8 * %ptr, i32 %offset_scale, %offsets, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %scale64 = zext i32 %offset_scale to i64 %scale_vec = bitcast i64 %scale64 to <1 x i64> %smear_scale = shufflevector <1 x i64> %scale_vec, <1 x i64> undef, < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > %scaled_offsets = mul %smear_scale, %offsets %v = call @__gather_factored_base_offsets64_$1(i8 * %ptr, %scaled_offsets, - i32 1, zeroinitializer, %vecmask) + i32 1, zeroinitializer, %vecmask) ret %v } @@ -3955,9 +4016,9 @@ define void @__scatter_elt64_$1(i8 * %ptr, %offsets, i32 %offset_s define void @__scatter_factored_base_offsets32_$1(i8* %base, %offsets, i32 %offset_scale, %offset_delta, %values, - %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... - per_lane(WIDTH, %mask, ` + per_lane(WIDTH, %mask, ` call void @__scatter_elt32_$1(i8 * %base, %offsets, i32 %offset_scale, %offset_delta, %values, i32 LANE)') ret void @@ -3965,9 +4026,9 @@ define void @__scatter_factored_base_offsets32_$1(i8* %base, %offs define void @__scatter_factored_base_offsets64_$1(i8* %base, %offsets, i32 %offset_scale, %offset_delta, %values, - %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... 
- per_lane(WIDTH, %mask, ` + per_lane(WIDTH, %mask, ` call void @__scatter_elt64_$1(i8 * %base, %offsets, i32 %offset_scale, %offset_delta, %values, i32 LANE)') ret void @@ -3975,8 +4036,8 @@ define void @__scatter_factored_base_offsets64_$1(i8* %base, %offs ; fully general 32-bit scatter, takes array of pointers encoded as vector of i32s define void @__scatter32_$1( %ptrs, %values, - %mask) nounwind alwaysinline { - per_lane(WIDTH, %mask, ` + %mask) nounwind alwaysinline { + per_lane(WIDTH, %mask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * %val_LANE_ID = extractelement %values, i32 LANE @@ -3987,8 +4048,8 @@ define void @__scatter32_$1( %ptrs, %values, ; fully general 64-bit scatter, takes array of pointers encoded as vector of i64s define void @__scatter64_$1( %ptrs, %values, - %mask) nounwind alwaysinline { - per_lane(WIDTH, %mask, ` + %mask) nounwind alwaysinline { + per_lane(WIDTH, %mask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * %val_LANE_ID = extractelement %values, i32 LANE diff --git a/ctx.cpp b/ctx.cpp index 1e79c97b..c50d22f9 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -1456,13 +1456,13 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) { for (unsigned int i = 0; i < at->getNumElements(); ++i) { llvm::Value *elt = ExtractInst(b, i); llvm::Value *sext = SExtInst(elt, LLVMTypes::BoolVectorType, - LLVMGetName(elt, "_to_boolvec32")); + LLVMGetName(elt, "_to_boolvec")); ret = InsertInst(ret, sext, i); } return ret; } else - return SExtInst(b, LLVMTypes::BoolVectorType, LLVMGetName(b, "_to_i32")); + return SExtInst(b, LLVMTypes::BoolVectorType, LLVMGetName(b, "_to_boolvec")); } @@ -2781,6 +2781,7 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, // Figure out if we need a 8, 16, 32 or 64-bit masked store. 
llvm::Function *maskedStoreFunc = NULL; + llvm::Type *llvmValueType = value->getType(); const PointerType *pt = CastType(valueType); if (pt != NULL) { @@ -2809,8 +2810,7 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, else maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64"); } - else if (Type::Equal(valueType, AtomicType::VaryingBool) && - g->target->getMaskBitCount() == 1) { + else if (llvmValueType == LLVMTypes::Int1VectorType) { llvm::Value *notMask = BinaryOperator(llvm::Instruction::Xor, mask, LLVMMaskAllOn, "~mask"); llvm::Value *old = LoadInst(ptr); @@ -2823,28 +2823,22 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, StoreInst(final, ptr); return; } - else if (Type::Equal(valueType, AtomicType::VaryingDouble)) { + else if (llvmValueType == LLVMTypes::DoubleVectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_double"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt64) || - Type::Equal(valueType, AtomicType::VaryingUInt64)) { + else if (llvmValueType == LLVMTypes::Int64VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64"); } - else if (Type::Equal(valueType, AtomicType::VaryingFloat)) { + else if (llvmValueType == LLVMTypes::FloatVectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_float"); } - else if (Type::Equal(valueType, AtomicType::VaryingBool) || - Type::Equal(valueType, AtomicType::VaryingInt32) || - Type::Equal(valueType, AtomicType::VaryingUInt32) || - CastType(valueType) != NULL) { + else if (llvmValueType == LLVMTypes::Int32VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i32"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt16) || - Type::Equal(valueType, AtomicType::VaryingUInt16)) { + else if (llvmValueType == LLVMTypes::Int16VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i16"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt8) || - Type::Equal(valueType, AtomicType::VaryingUInt8)) { + else if (llvmValueType == LLVMTypes::Int8VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i8"); } AssertPos(currentPos, maskedStoreFunc != NULL); diff --git a/expr.cpp b/expr.cpp index 3baaabaf..6bde2acb 100644 --- a/expr.cpp +++ b/expr.cpp @@ -6161,9 +6161,9 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) - // If we have a bool vector of i32 elements, first truncate - // down to a single bit + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) + // If we have a bool vector of non-i1 elements, first + // truncate down to a single bit. 
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); // And then do an unisgned int->float cast cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int @@ -6205,8 +6205,8 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) - // truncate i32 bool vector values to i1s + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) + // truncate bool vector values to i1s exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int to double exprVal, targetType, cOpName); @@ -6243,7 +6243,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6279,7 +6279,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6321,7 +6321,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6361,7 +6361,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6407,7 +6407,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6447,7 +6447,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6493,7 +6493,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != 
LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6531,7 +6531,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6625,12 +6625,12 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, if (fromType->IsUniformType()) { if (toType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) { - // extend out to i32 bool values from i1 here. then we'll - // turn into a vector below, the way it does for everyone - // else... + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) { + // extend out to an bool as an i8/i16/i32 from the i1 here. + // Then we'll turn that into a vector below, the way it + // does for everyone else... cast = ctx->SExtInst(cast, LLVMTypes::BoolVectorType->getElementType(), - LLVMGetName(cast, "to_i32bool")); + LLVMGetName(cast, "to_i_bool")); } } else diff --git a/llvmutil.cpp b/llvmutil.cpp index 26c18bf5..180c8676 100644 --- a/llvmutil.cpp +++ b/llvmutil.cpp @@ -115,13 +115,25 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0); LLVMTypes::DoublePointerType = llvm::PointerType::get(LLVMTypes::DoubleType, 0); - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: LLVMTypes::MaskType = LLVMTypes::BoolVectorType = llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.getVectorWidth()); - else { - Assert(target.getMaskBitCount() == 32); + break; + case 8: + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt8Ty(*ctx), target.getVectorWidth()); + break; + case 16: + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt16Ty(*ctx), target.getVectorWidth()); + break; + case 32: LLVMTypes::MaskType = LLVMTypes::BoolVectorType = llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.getVectorWidth()); + break; + default: + FATAL("Unhandled mask width for initializing MaskType"); } LLVMTypes::Int1VectorType = @@ -154,12 +166,26 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { std::vector maskOnes; llvm::Constant *onMask = NULL; - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: onMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 1, false /*unsigned*/); // 0x1 - else + break; + case 8: + onMask = llvm::ConstantInt::get(llvm::Type::getInt8Ty(*ctx), -1, + true /*signed*/); // 0xff + break; + case 16: + onMask = llvm::ConstantInt::get(llvm::Type::getInt16Ty(*ctx), -1, + true /*signed*/); // 0xffff + break; + case 32: onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1, true /*signed*/); // 0xffffffff + break; + default: + FATAL("Unhandled mask width for onMask"); + } for (int i = 0; i < target.getVectorWidth(); ++i) maskOnes.push_back(onMask); @@ -167,13 +193,26 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { std::vector maskZeros; llvm::Constant *offMask = NULL; - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: offMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 0, 
true /*signed*/); - else + break; + case 8: + offMask = llvm::ConstantInt::get(llvm::Type::getInt8Ty(*ctx), 0, + true /*signed*/); + break; + case 16: + offMask = llvm::ConstantInt::get(llvm::Type::getInt16Ty(*ctx), 0, + true /*signed*/); + break; + case 32: offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0, true /*signed*/); - + break; + default: + FATAL("Unhandled mask width for offMask"); + } for (int i = 0; i < target.getVectorWidth(); ++i) maskZeros.push_back(offMask); LLVMMaskAllOff = llvm::ConstantVector::get(maskZeros); @@ -444,9 +483,14 @@ LLVMBoolVector(bool b) { if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) v = llvm::ConstantInt::get(LLVMTypes::Int32Type, b ? 0xffffffff : 0, false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int16Type, b ? 0xffff : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int8VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int8Type, b ? 0xff : 0, + false /*unsigned*/); else { - Assert(LLVMTypes::BoolVectorType->getElementType() == - llvm::Type::getInt1Ty(*g->ctx)); + Assert(LLVMTypes::BoolVectorType == LLVMTypes::Int1VectorType); v = b ? LLVMTrue : LLVMFalse; } @@ -465,9 +509,14 @@ LLVMBoolVector(const bool *bvec) { if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) v = llvm::ConstantInt::get(LLVMTypes::Int32Type, bvec[i] ? 0xffffffff : 0, false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int16Type, bvec[i] ? 0xffff : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int8VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int8Type, bvec[i] ? 0xff : 0, + false /*unsigned*/); else { - Assert(LLVMTypes::BoolVectorType->getElementType() == - llvm::Type::getInt1Ty(*g->ctx)); + Assert(LLVMTypes::BoolVectorType == LLVMTypes::Int1VectorType); v = bvec[i] ? LLVMTrue : LLVMFalse; } diff --git a/parse.yy b/parse.yy index 3ad815cf..488c864a 100644 --- a/parse.yy +++ b/parse.yy @@ -2148,8 +2148,24 @@ lAddFunctionParams(Declarator *decl) { /** Add a symbol for the built-in mask variable to the symbol table */ static void lAddMaskToSymbolTable(SourcePos pos) { - const Type *t = g->target->getMaskBitCount() == 1 ? 
- AtomicType::VaryingBool : AtomicType::VaryingUInt32; + const Type *t; + switch (g->target->getMaskBitCount()) { + case 1: + t = AtomicType::VaryingBool; + break; + case 8: + t = AtomicType::VaryingUInt8; + break; + case 16: + t = AtomicType::VaryingUInt16; + break; + case 32: + t = AtomicType::VaryingUInt32; + break; + default: + FATAL("Unhandled mask bitsize in lAddMaskToSymbolTable"); + } + t = t->GetAsConstType(); Symbol *maskSymbol = new Symbol("__mask", pos, t); m->symbolTable->AddVariable(maskSymbol); diff --git a/stdlib.ispc b/stdlib.ispc index b8ed2057..8ad5aa49 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -38,12 +38,20 @@ ispc code */ -#ifdef ISPC_TARGET_GENERIC -#define IntMaskType bool -#define UIntMaskType bool +#if (ISPC_MASK_BITS == 1) + #define IntMaskType bool + #define UIntMaskType bool +#elif (ISPC_MASK_BITS == 8) + #define IntMaskType int8 + #define UIntMaskType unsigned int8 +#elif (ISPC_MASK_BITS == 16) + #define IntMaskType int16 + #define UIntMaskType unsigned int16 +#elif (ISPC_MASK_BITS == 32) + #define IntMaskType int32 + #define UIntMaskType unsigned int32 #else -#define IntMaskType int32 -#define UIntMaskType unsigned int32 + #error Unknown value of ISPC_MASK_BITS #endif /////////////////////////////////////////////////////////////////////////// @@ -335,14 +343,15 @@ static inline int32 sign_extend(bool v) { return __sext_varying_bool(v); } + __declspec(safe) static inline uniform bool any(bool v) { // We only care about whether "any" is true for the active program instances, // so we have to make v with the current program mask. -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __any(v & __mask); #else - return __any(__sext_varying_bool(v) & __mask); + return __any((UIntMaskType)__sext_varying_bool(v) & __mask); #endif } @@ -350,11 +359,10 @@ __declspec(safe) static inline uniform bool all(bool v) { // As with any(), we need to explicitly mask v with the current program mask // so we're only looking at the current lanes - -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __all(v | !__mask); #else - return __all(__sext_varying_bool(v) | !__mask); + return __all((UIntMaskType)__sext_varying_bool(v) | !__mask); #endif } @@ -362,11 +370,10 @@ __declspec(safe) static inline uniform bool none(bool v) { // As with any(), we need to explicitly mask v with the current program mask // so we're only looking at the current lanes - -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __none(v & __mask); #else - return __none(__sext_varying_bool(v) & __mask); + return __none((UIntMaskType)__sext_varying_bool(v) & __mask); #endif } @@ -399,10 +406,10 @@ static inline int popcnt(int64 v) { __declspec(safe) static inline uniform int popcnt(bool v) { // As with any() and all(), only count across the active lanes -#ifdef ISPC_TARGET_GENERIC +#if (ISPC_MASK_BITS == 1) return __popcnt_int64(__movmsk(v & __mask)); #else - return __popcnt_int64(__movmsk(__sext_varying_bool(v) & __mask)); + return __popcnt_int64(__movmsk((UIntMaskType)__sext_varying_bool(v) & __mask)); #endif } From 9ba49eabb21c7971f529fda25bad5fc1e84a6e3e Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 23 Jul 2013 16:52:43 -0700 Subject: [PATCH 04/34] Reduce estimated costs for 8 and 16-bit min() and max() in stdlib. These actually compile to a single instruction. 
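For context, the lower cost estimate reflects the fact that SSE provides packed byte/word min and max directly: signed int8 min/max are single pminsb/pmaxsb instructions on SSE4.1, and the unsigned int8 and int16 variants are likewise single instructions. A rough C++ intrinsics sketch of the signed int8 case (illustrative only, assuming SSE4.1 and <smmintrin.h>; this is not code from the patch):

    #include <smmintrin.h>   // SSE4.1 intrinsics

    // Sixteen lanes of signed int8 min in a single instruction (pminsb).
    static inline __m128i min_int8x16(__m128i a, __m128i b) {
        return _mm_min_epi8(a, b);
    }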
--- stdlib.ispc | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/stdlib.ispc b/stdlib.ispc index 8ad5aa49..9a2b191f 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1332,88 +1332,88 @@ static inline uniform double max(uniform double a, uniform double b) { // int8 -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int8 min(uniform unsigned int8 a, uniform unsigned int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int8 max(uniform unsigned int8 a, uniform unsigned int8 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int8 min(uniform int8 a, uniform int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int8 max(uniform int8 a, uniform int8 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int8 min(unsigned int8 a, unsigned int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int8 max(unsigned int8 a, unsigned int8 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int8 min(int8 a, int8 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int8 max(int8 a, int8 b) { return (a > b) ? a : b; } // int16 -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int16 min(uniform unsigned int16 a, uniform unsigned int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform unsigned int16 max(uniform unsigned int16 a, uniform unsigned int16 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int16 min(uniform int16 a, uniform int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline uniform int16 max(uniform int16 a, uniform int16 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int16 min(unsigned int16 a, unsigned int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline unsigned int16 max(unsigned int16 a, unsigned int16 b) { return (a > b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int16 min(int16 a, int16 b) { return (a < b) ? a : b; } -__declspec(safe,cost2) +__declspec(safe,cost1) static inline int16 max(int16 a, int16 b) { return (a > b) ? a : b; } From f7f281a256c38c1986860baec81736fcb4f5b6d1 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 23 Jul 2013 17:01:03 -0700 Subject: [PATCH 05/34] Choose type for integer literals to match the target mask size (if possible). On a target with a 16-bit mask (for example), we would choose the type of an integer literal "1024" to be an int16. Previously, we used an int32, which is a worse fit and leads to less efficient code than an int16 on a 16-bit mask target. (However, we'd still give an integer literal 1000000 the type int32, even in a 16-bit target.) Updated the tests to still pass with 8 and 16-bit targets, given this change. 
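To make the rule concrete, the following sketch summarizes the literal-typing logic that the lex.ll change below implements; the helper name and enum are hypothetical and only for illustration, and the 64-bit fallback mirrors the pre-existing behavior that is outside the hunk:

    #include <cstdint>

    enum class LitType { Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64 };

    // Pick the narrowest type that fits an unsuffixed literal and matches the
    // target's mask width; otherwise fall back to the usual 32-/64-bit choice.
    static LitType classifyIntLiteral(uint64_t v, int maskBitCount) {
        if (maskBitCount == 8) {
            if (v <= 0x7full)               return LitType::Int8;
            if (v <= 0xffull)               return LitType::UInt8;
        }
        if (maskBitCount == 16) {
            if (v <= 0x7fffull)             return LitType::Int16;
            if (v <= 0xffffull)             return LitType::UInt16;
        }
        if (v <= 0x7fffffffull)             return LitType::Int32;
        if (v <= 0xffffffffull)             return LitType::UInt32;
        if (v <= 0x7fffffffffffffffull)     return LitType::Int64;
        return LitType::UInt64;
    }

Under this rule a 16-bit mask target classifies 1024 as int16 but still classifies 1000000 as int32, matching the description above.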
--- lex.ll | 27 +++++++- parse.yy | 23 ++++++- run_tests.py | 4 +- stdlib.ispc | 74 ++++++++++----------- tests/aossoa-1.ispc | 4 +- tests/aossoa-2.ispc | 4 +- tests/aossoa-5.ispc | 4 +- tests/aossoa-6.ispc | 4 +- tests/atomics-12.ispc | 4 +- tests/atomics-13.ispc | 2 +- tests/atomics-4.ispc | 4 +- tests/coalesce-1.ispc | 4 +- tests/coalesce-2.ispc | 4 +- tests/coalesce-3.ispc | 4 +- tests/coalesce-4.ispc | 4 +- tests/coalesce-5.ispc | 4 +- tests/coalesce-6.ispc | 4 +- tests/coalesce-7.ispc | 4 +- tests/coalesce-8.ispc | 4 +- tests/count-leading-trailing-zeros-1.ispc | 2 +- tests/count-leading-trailing-zeros-4.ispc | 2 +- tests/exclusive-scan-and-2.ispc | 4 +- tests/exclusive-scan-or-1.ispc | 4 +- tests/frexp-double-1.ispc | 2 +- tests/frexp-double.ispc | 2 +- tests/frexp-float-1.ispc | 2 +- tests/frexp-float.ispc | 2 +- tests/kilo-mega-giga-2.ispc | 2 +- tests/ldexp-double.ispc | 4 +- tests/ldexp-float.ispc | 4 +- tests/local-atomics-12.ispc | 4 +- tests/local-atomics-13.ispc | 2 +- tests/local-atomics-14.ispc | 4 +- tests/local-atomics-4.ispc | 4 +- tests/rand-distrib-1.ispc | 2 +- tests/sizeof-9.ispc | 2 +- tests/test-83.ispc | 2 +- tests/test-84.ispc | 2 +- tests/test-85.ispc | 2 +- tests_errors/array-plus-equals.ispc | 2 +- tests_errors/array-pointer-assign.ispc | 2 +- tests_errors/float-logical.ispc | 2 +- tests_errors/fptr-typecheck-2.ispc | 2 +- tests_errors/fptr-typecheck-3.ispc | 2 +- tests_errors/initexpr-2.ispc | 2 +- tests_errors/int-ptr-fail.ispc | 4 +- tests_errors/lvalue-2.ispc | 2 +- tests_errors/lvalue-3.ispc | 2 +- tests_errors/new-delete-3.ispc | 2 +- tests_errors/new-delete-6.ispc | 2 +- tests_errors/ptr-1.ispc | 2 +- tests_errors/ptr-const-1.ispc | 2 +- tests_errors/ptrcast-lose-info.ispc | 2 +- tests_errors/ref-3.ispc | 2 +- tests_errors/soa-11.ispc | 2 +- tests_errors/soa-12.ispc | 2 +- tests_errors/soa-3.ispc | 2 +- tests_errors/soa-4.ispc | 2 +- tests_errors/soa-9.ispc | 2 +- tests_errors/struct_arith.ispc | 2 +- tests_errors/vec-size-compile-constant.ispc | 2 +- 61 files changed, 166 insertions(+), 120 deletions(-) diff --git a/lex.ll b/lex.ll index f6633fce..8baa627a 100644 --- a/lex.ll +++ b/lex.ll @@ -77,6 +77,8 @@ static int allTokens[] = { TOKEN_UNSIGNED, TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE, TOKEN_STRING_C_LITERAL, TOKEN_DOTDOTDOT, TOKEN_FLOAT_CONSTANT, + TOKEN_INT8_CONSTANT, TOKEN_UINT8_CONSTANT, + TOKEN_INT16_CONSTANT, TOKEN_UINT16_CONSTANT, TOKEN_INT32_CONSTANT, TOKEN_UINT32_CONSTANT, TOKEN_INT64_CONSTANT, TOKEN_UINT64_CONSTANT, TOKEN_INC_OP, TOKEN_DEC_OP, TOKEN_LEFT_OP, TOKEN_RIGHT_OP, TOKEN_LE_OP, @@ -150,6 +152,10 @@ void ParserInit() { tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\""; tokenToName[TOKEN_DOTDOTDOT] = "..."; tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT"; + tokenToName[TOKEN_INT8_CONSTANT] = "TOKEN_INT8_CONSTANT"; + tokenToName[TOKEN_UINT8_CONSTANT] = "TOKEN_UINT8_CONSTANT"; + tokenToName[TOKEN_INT16_CONSTANT] = "TOKEN_INT16_CONSTANT"; + tokenToName[TOKEN_UINT16_CONSTANT] = "TOKEN_UINT16_CONSTANT"; tokenToName[TOKEN_INT32_CONSTANT] = "TOKEN_INT32_CONSTANT"; tokenToName[TOKEN_UINT32_CONSTANT] = "TOKEN_UINT32_CONSTANT"; tokenToName[TOKEN_INT64_CONSTANT] = "TOKEN_INT64_CONSTANT"; @@ -260,6 +266,10 @@ void ParserInit() { tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\""; tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'"; tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant"; + tokenNameRemap["TOKEN_INT8_CONSTANT"] = "int8 constant"; + tokenNameRemap["TOKEN_UINT8_CONSTANT"] = "unsigned int8 constant"; + 
tokenNameRemap["TOKEN_INT16_CONSTANT"] = "int16 constant"; + tokenNameRemap["TOKEN_UINT16_CONSTANT"] = "unsigned int16 constant"; tokenNameRemap["TOKEN_INT32_CONSTANT"] = "int32 constant"; tokenNameRemap["TOKEN_UINT32_CONSTANT"] = "unsigned int32 constant"; tokenNameRemap["TOKEN_INT64_CONSTANT"] = "int64 constant"; @@ -599,7 +609,22 @@ lParseInteger(bool dotdotdot) { } else { // No u or l suffix - // First, see if we can fit this into a 32-bit integer... + // If we're compiling to an 8-bit mask target and the constant + // fits into 8 bits, return an 8-bit int. + if (g->target->getMaskBitCount() == 8) { + if (yylval.intVal <= 0x7fULL) + return TOKEN_INT8_CONSTANT; + else if (yylval.intVal <= 0xffULL) + return TOKEN_UINT8_CONSTANT; + } + // And similarly for 16-bit masks and constants + if (g->target->getMaskBitCount() == 16) { + if (yylval.intVal <= 0x7fffULL) + return TOKEN_INT16_CONSTANT; + else if (yylval.intVal <= 0xffffULL) + return TOKEN_UINT16_CONSTANT; + } + // Otherwise, see if we can fit this into a 32-bit integer... if (yylval.intVal <= 0x7fffffffULL) return TOKEN_INT32_CONSTANT; else if (yylval.intVal <= 0xffffffffULL) diff --git a/parse.yy b/parse.yy index 488c864a..6ed2a43d 100644 --- a/parse.yy +++ b/parse.yy @@ -179,6 +179,8 @@ struct ForeachDimension { } +%token TOKEN_INT8_CONSTANT TOKEN_UINT8_CONSTANT +%token TOKEN_INT16_CONSTANT TOKEN_UINT16_CONSTANT %token TOKEN_INT32_CONSTANT TOKEN_UINT32_CONSTANT %token TOKEN_INT64_CONSTANT TOKEN_UINT64_CONSTANT %token TOKEN_INT32DOTDOTDOT_CONSTANT TOKEN_UINT32DOTDOTDOT_CONSTANT @@ -291,6 +293,22 @@ primary_expression Error(@1, "Undeclared symbol \"%s\".%s", name, alts.c_str()); } } + | TOKEN_INT8_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformInt8->GetAsConstType(), + (int8_t)yylval.intVal, @1); + } + | TOKEN_UINT8_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformUInt8->GetAsConstType(), + (uint8_t)yylval.intVal, @1); + } + | TOKEN_INT16_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformInt16->GetAsConstType(), + (int16_t)yylval.intVal, @1); + } + | TOKEN_UINT16_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformUInt16->GetAsConstType(), + (uint16_t)yylval.intVal, @1); + } | TOKEN_INT32_CONSTANT { $$ = new ConstExpr(AtomicType::UniformInt32->GetAsConstType(), (int32_t)yylval.intVal, @1); @@ -1233,7 +1251,10 @@ declarator ; int_constant - : TOKEN_INT32_CONSTANT { $$ = yylval.intVal; } + : TOKEN_INT8_CONSTANT { $$ = yylval.intVal; } + | TOKEN_INT16_CONSTANT { $$ = yylval.intVal; } + | TOKEN_INT32_CONSTANT { $$ = yylval.intVal; } + | TOKEN_INT64_CONSTANT { $$ = yylval.intVal; } ; direct_declarator diff --git a/run_tests.py b/run_tests.py index 7c6b1eb8..296db867 100755 --- a/run_tests.py +++ b/run_tests.py @@ -37,7 +37,7 @@ parser.add_option("-g", "--generics-include", dest="include_file", help="Filenam parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", default="") parser.add_option('-t', '--target', dest='target', - help='Set compilation target (neon, sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', + help='Set compilation target (neon, sse2, sse2-x2, sse4, sse4-x2, sse4-8, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', default="sse4") parser.add_option('-a', '--arch', dest='arch', help='Set architecture (arm, x86, x86-64)', @@ -294,7 +294,7 @@ def run_test(testname): firstline = firstline.rstrip() file.close() - if (output.find(firstline) == -1): + if re.search(firstline, output) == None: sys.stderr.write("Didn't see 
expected error message %s from test %s.\nActual output:\n%s\n" % \ (firstline, testname, output)) return (1, 0) diff --git a/stdlib.ispc b/stdlib.ispc index 9a2b191f..7e848481 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -3126,7 +3126,7 @@ static inline void __range_reduce_log(float input, varying float * uniform reduc static const int nonexponent_mask = 0x807FFFFF; // We want the reduced version to have an exponent of -1 which is -1 + 127 after biasing or 126 - static const int exponent_neg1 = (126 << 23); + static const int exponent_neg1 = (126l << 23); // NOTE(boulos): We don't need to mask anything out since we know // the sign bit has to be 0. If it's 1, we need to return infinity/nan // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN). @@ -3149,7 +3149,7 @@ static inline void __range_reduce_log(uniform float input, uniform float * unifo uniform int int_version = intbits(input); static const uniform int nonexponent_mask = 0x807FFFFF; - static const uniform int exponent_neg1 = (126 << 23); + static const uniform int exponent_neg1 = (126ul << 23); uniform int biased_exponent = int_version >> 23; uniform int offset_exponent = biased_exponent + 1; *exponent = offset_exponent - 127; // get the real value @@ -3647,18 +3647,18 @@ static inline uniform float half_to_float(uniform unsigned int16 h) { else { // https://gist.github.com/2144712 // Fabian "ryg" Giesen. - static const uniform unsigned int32 shifted_exp = 0x7c00 << 13; // exponent mask after shift + static const uniform unsigned int32 shifted_exp = 0x7c00ul << 13; // exponent mask after shift uniform int32 o = ((int32)(h & 0x7fff)) << 13; // exponent/mantissa bits uniform unsigned int32 exp = shifted_exp & o; // just the exponent - o += (127 - 15) << 23; // exponent adjust + o += (uniform int32)(127 - 15) << 23; // exponent adjust // handle exponent special cases if (exp == shifted_exp) // Inf/NaN? - o += (128 - 16) << 23; // extra exp adjust + o += (uniform unsigned int32)(128 - 16) << 23; // extra exp adjust else if (exp == 0) { // Zero/Denormal? - o += 1 << 23; // extra exp adjust - o = intbits(floatbits(o) - floatbits(113 << 23)); // renormalize + o += 1ul << 23; // extra exp adjust + o = intbits(floatbits(o) - floatbits(113ul << 23)); // renormalize } o |= ((int32)(h & 0x8000)) << 16; // sign bit @@ -3675,17 +3675,17 @@ static inline float half_to_float(unsigned int16 h) { // https://gist.github.com/2144712 // Fabian "ryg" Giesen. - const unsigned int32 shifted_exp = 0x7c00 << 13; // exponent mask after shift + const unsigned int32 shifted_exp = 0x7c00ul << 13; // exponent mask after shift - int32 o = ((int32)(h & 0x7fff)) << 13; // exponent/mantissa bits + int32 o = ((int32)(h & 0x7ffful)) << 13; // exponent/mantissa bits unsigned int32 exp = shifted_exp & o; // just the exponent - o += (127 - 15) << 23; // exponent adjust + o += (int32)(127 - 15) << 23; // exponent adjust - int32 infnan_val = o + ((128 - 16) << 23); - int32 zerodenorm_val = intbits(floatbits(o + (1<<23)) - floatbits(113 << 23)); + int32 infnan_val = o + ((int32)(128 - 16) << 23); + int32 zerodenorm_val = intbits(floatbits(o + (1ul<<23)) - floatbits(113ul << 23)); int32 reg_val = (exp == 0) ? zerodenorm_val : o; - int32 sign_bit = ((int32)(h & 0x8000)) << 16; + int32 sign_bit = ((int32)(h & 0x8000ul)) << 16; return floatbits(((exp == shifted_exp) ? 
infnan_val : reg_val) | sign_bit); } } @@ -3715,16 +3715,16 @@ static inline uniform int16 float_to_half(uniform float f) { // NaN->qNaN and Inf->Inf // unconditional assignment here, will override with right value for // the regular case below. - uniform int32 f32infty = 255 << 23; - o = (fint > f32infty) ? 0x7e00 : 0x7c00; + uniform int32 f32infty = 255ul << 23; + o = (fint > f32infty) ? 0x7e00u : 0x7c00u; // (De)normalized number or zero // update fint unconditionally to save the blending; we don't need it // anymore for the Inf/NaN case anyway. - const uniform unsigned int32 round_mask = ~0xfffu; - const uniform int32 magic = 15 << 23; - const uniform int32 f16infty = 31 << 23; + const uniform unsigned int32 round_mask = ~0xffful; + const uniform int32 magic = 15ul << 23; + const uniform int32 f16infty = 31ul << 23; uniform int32 fint2 = intbits(floatbits(fint & round_mask) * floatbits(magic)) - round_mask; fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed @@ -3761,16 +3761,16 @@ static inline int16 float_to_half(float f) { // NaN->qNaN and Inf->Inf // unconditional assignment here, will override with right value for // the regular case below. - int32 f32infty = 255 << 23; - o = (fint > f32infty) ? 0x7e00 : 0x7c00; + int32 f32infty = 255ul << 23; + o = (fint > f32infty) ? 0x7e00u : 0x7c00u; // (De)normalized number or zero // update fint unconditionally to save the blending; we don't need it // anymore for the Inf/NaN case anyway. - const unsigned int32 round_mask = ~0xfffu; - const int32 magic = 15 << 23; - const int32 f16infty = 31 << 23; + const unsigned int32 round_mask = ~0xffful; + const int32 magic = 15ul << 23; + const int32 f16infty = 31ul << 23; // Shift exponent down, denormalize if necessary. // NOTE This represents half-float denormals using single precision denormals. @@ -3789,7 +3789,7 @@ static inline int16 float_to_half(float f) { // FP16 denormals are rare in practice, I don't know. Whatever slow path your HW // may or may not have for denormals, this may well hit it. 
float fscale = floatbits(fint & round_mask) * floatbits(magic); - fscale = min(fscale, floatbits((31 << 23) - 0x1000)); + fscale = min(fscale, floatbits((31ul << 23) - 0x1000ul)); int32 fint2 = intbits(fscale) - round_mask; if (fint < f32infty) @@ -3956,7 +3956,7 @@ float_to_srgb8(float inval) // Do the table lookup and unpack bias, scale unsigned int tab = table[(intbits(inval) - 0x39000000u) >> 20]; unsigned int bias = (tab >> 16) << 9; - unsigned int scale = tab & 0xffff; + unsigned int scale = tab & 0xfffful; // Grab next-highest mantissa bits and perform linear interpolation unsigned int t = (intbits(inval) >> 12) & 0xff; @@ -4006,7 +4006,7 @@ float_to_srgb8(uniform float inval) // Do the table lookup and unpack bias, scale uniform unsigned int tab = table[(intbits(inval) - 0x39000000u) >> 20]; uniform unsigned int bias = (tab >> 16) << 9; - uniform unsigned int scale = tab & 0xffff; + uniform unsigned int scale = tab & 0xfffful; // Grab next-highest mantissa bits and perform linear interpolation uniform unsigned int t = (intbits(inval) >> 12) & 0xff; @@ -4053,14 +4053,14 @@ static inline uniform unsigned int random(uniform RNGState * uniform state) static inline float frandom(varying RNGState * uniform state) { unsigned int irand = random(state); - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; return floatbits(0x3F800000 | irand)-1.0f; } static inline uniform float frandom(uniform RNGState * uniform state) { uniform unsigned int irand = random(state); - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; return floatbits(0x3F800000 | irand)-1.0f; } @@ -4068,18 +4068,18 @@ static inline void seed_rng(varying RNGState * uniform state, unsigned int seed) { state->z1 = seed; state->z2 = seed ^ 0xbeeff00d; - state->z3 = ((seed & 0xffff) << 16) | (seed >> 16); - state->z4 = (((seed & 0xff) << 24) | ((seed & 0xff00) << 8) | - ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24); + state->z3 = ((seed & 0xfffful) << 16) | (seed >> 16); + state->z4 = (((seed & 0xfful) << 24) | ((seed & 0xff00ul) << 8) | + ((seed & 0xff0000ul) >> 8) | (seed & 0xff000000ul) >> 24); } static inline void seed_rng(uniform RNGState * uniform state, uniform unsigned int seed) { state->z1 = seed; state->z2 = seed ^ 0xbeeff00d; - state->z3 = ((seed & 0xffff) << 16) | (seed >> 16); - state->z4 = (((seed & 0xff) << 24) | ((seed & 0xff00) << 8) | - ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24); + state->z3 = ((seed & 0xfffful) << 16) | (seed >> 16); + state->z4 = (((seed & 0xfful) << 24) | ((seed & 0xff00ul) << 8) | + ((seed & 0xff0000ul) >> 8) | (seed & 0xff000000ul) >> 24); } @@ -4097,7 +4097,7 @@ static inline uniform bool rdrand(float * uniform ptr) { uniform int32 irand; uniform bool success = __rdrand_i32(&irand); if (success) { - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; *ptr = floatbits(0x3F800000 | irand)-1.0f; } return success; @@ -4117,7 +4117,7 @@ static inline bool rdrand(varying float * uniform ptr) { // in vector form. However, we need to be careful to not // clobber any existing already-set values in *ptr with // inactive lanes here... 
- irand &= (1<<23)-1; + irand &= (1ul<<23)-1; *ptr = floatbits(0x3F800000 | irand)-1.0f; success = true; } @@ -4137,7 +4137,7 @@ static inline bool rdrand(float * ptr) { foreach_active (index) { uniform int32 irand; if (__rdrand_i32(&irand)) { - irand &= (1<<23)-1; + irand &= (1ul<<23)-1; *ptrs[index] = floatbits(0x3F800000 | irand)-1.0f; success = true; } diff --git a/tests/aossoa-1.ispc b/tests/aossoa-1.ispc index 59964d6d..32d3bcba 100644 --- a/tests/aossoa-1.ispc +++ b/tests/aossoa-1.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 3 -#define maxProgramCount 64 +#define width 3ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 3; diff --git a/tests/aossoa-2.ispc b/tests/aossoa-2.ispc index 9ff82226..df8eae5c 100644 --- a/tests/aossoa-2.ispc +++ b/tests/aossoa-2.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 4 -#define maxProgramCount 64 +#define width 4ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 4; diff --git a/tests/aossoa-5.ispc b/tests/aossoa-5.ispc index eb4fed3a..d6346455 100644 --- a/tests/aossoa-5.ispc +++ b/tests/aossoa-5.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 3 -#define maxProgramCount 64 +#define width 3ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 3; diff --git a/tests/aossoa-6.ispc b/tests/aossoa-6.ispc index b64cd10b..7c177fde 100644 --- a/tests/aossoa-6.ispc +++ b/tests/aossoa-6.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_v(uniform float RET[]) { -#define width 4 -#define maxProgramCount 64 +#define width 4ul +#define maxProgramCount 64ul assert(programCount <= maxProgramCount); //CO const uniform int width = 4; diff --git a/tests/atomics-12.ispc b/tests/atomics-12.ispc index c27ad99c..d6359555 100644 --- a/tests/atomics-12.ispc +++ b/tests/atomics-12.ispc @@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; float b = 0; if (programIndex < 30 && programIndex & 1) - b = atomic_or_global(&s, (1 << programIndex)); + b = atomic_or_global(&s, (1ul << programIndex)); RET[programIndex] = s; } @@ -15,6 +15,6 @@ export void result(uniform float RET[]) { uniform int sum = 0; for (uniform int i = 0; i < min(30, programCount); ++i) if (i & 1) - sum += (1 << i); + sum += (1ul << i); RET[programIndex] = sum; } diff --git a/tests/atomics-13.ispc b/tests/atomics-13.ispc index 86faaddb..dea3bfc3 100644 --- a/tests/atomics-13.ispc +++ b/tests/atomics-13.ispc @@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; int32 b = 0; if (programIndex < 32 && programIndex & 1) - b = atomic_or_global(&s, (1 << programIndex)); + b = atomic_or_global(&s, (1ul << programIndex)); RET[programIndex] = popcnt(reduce_max((int32)b)); } diff --git a/tests/atomics-4.ispc b/tests/atomics-4.ispc index 30b343d1..ac746ad2 100644 --- a/tests/atomics-4.ispc +++ b/tests/atomics-4.ispc @@ -5,10 +5,10 @@ uniform int32 s = 0; export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; - float b = atomic_or_global(&s, (1<> 2) * 16 + (programIndex & 3)]; diff --git a/tests/coalesce-4.ispc b/tests/coalesce-4.ispc index 
1ddd4b89..182a4d4f 100644 --- a/tests/coalesce-4.ispc +++ b/tests/coalesce-4.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[2*programIndex]; diff --git a/tests/coalesce-5.ispc b/tests/coalesce-5.ispc index 2dd8d44e..385e8526 100644 --- a/tests/coalesce-5.ispc +++ b/tests/coalesce-5.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[4*programIndex]; diff --git a/tests/coalesce-6.ispc b/tests/coalesce-6.ispc index 2a54a2db..8c630a45 100644 --- a/tests/coalesce-6.ispc +++ b/tests/coalesce-6.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[4*programIndex]; diff --git a/tests/coalesce-7.ispc b/tests/coalesce-7.ispc index 8ed628bd..29b56b8d 100644 --- a/tests/coalesce-7.ispc +++ b/tests/coalesce-7.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; float a = buf[4*programIndex]; diff --git a/tests/coalesce-8.ispc b/tests/coalesce-8.ispc index dfefaa19..f01ca9c3 100644 --- a/tests/coalesce-8.ispc +++ b/tests/coalesce-8.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform float * uniform buf = uniform new uniform float[32*32]; - for (uniform int i = 0; i < 32*32; ++i) + uniform float * uniform buf = uniform new uniform float[32l*32l]; + for (uniform int i = 0; i < 32l*32l; ++i) buf[i] = i; int index = (programIndex < 4) ? 
(programIndex & 1) : diff --git a/tests/count-leading-trailing-zeros-1.ispc b/tests/count-leading-trailing-zeros-1.ispc index 221d066d..3f12c07d 100644 --- a/tests/count-leading-trailing-zeros-1.ispc +++ b/tests/count-leading-trailing-zeros-1.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - RET[programIndex] = count_trailing_zeros(0xf0); + RET[programIndex] = count_trailing_zeros(0xf0ul); } export void result(uniform float RET[]) { diff --git a/tests/count-leading-trailing-zeros-4.ispc b/tests/count-leading-trailing-zeros-4.ispc index 475c18ca..4b849018 100644 --- a/tests/count-leading-trailing-zeros-4.ispc +++ b/tests/count-leading-trailing-zeros-4.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - int32 i = (1 << (programIndex % 28)); + int32 i = (1ul << (programIndex % 28)); RET[programIndex] = count_leading_zeros(i); } diff --git a/tests/exclusive-scan-and-2.ispc b/tests/exclusive-scan-and-2.ispc index 5d2bcd1f..b742a91e 100644 --- a/tests/exclusive-scan-and-2.ispc +++ b/tests/exclusive-scan-and-2.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { RET[programIndex] = -1; - int32 a = ~(1 << programIndex); + int32 a = ~(1ul << programIndex); if ((programIndex < 32) && (programIndex & 1) == 0) { RET[programIndex] = exclusive_scan_and(a); } @@ -15,7 +15,7 @@ export void result(uniform float RET[]) { if ((programIndex & 1) == 0 && programIndex > 0 && programIndex < 32) { int val = 0xffffffff; for (int i = 0; i < programIndex-1; i += 2) - val &= ~(1<>20); } @@ -15,6 +15,6 @@ export void result(uniform float RET[]) { uniform int sum = 0; for (uniform int i = 0; i < min(32, programCount); ++i) if (i & 1) - sum += (1 << i); + sum += (1ul << i); RET[programIndex] = ((unsigned int64)(0xffffffffff000000 | sum)) >> 20; } diff --git a/tests/local-atomics-4.ispc b/tests/local-atomics-4.ispc index f7f6a04a..b3648ab5 100644 --- a/tests/local-atomics-4.ispc +++ b/tests/local-atomics-4.ispc @@ -7,10 +7,10 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; float b = 0; if (programIndex < 29) - atomic_or_local(&s, (1< struct Foo" for assignment operator is not possible +// Type conversion from "const uniform int[0-9]*" to "soa<4> struct Foo" for assignment operator is not possible struct Pt { float x, y, z; }; diff --git a/tests_errors/soa-12.ispc b/tests_errors/soa-12.ispc index e2cd3242..c0420614 100644 --- a/tests_errors/soa-12.ispc +++ b/tests_errors/soa-12.ispc @@ -1,4 +1,4 @@ -// Can't convert between types "const uniform int32" and "soa<4> float" with different SOA widths +// Can't convert between types "const uniform int[0-9]*" and "soa<4> float" with different SOA widths struct Pt { float x, y, z; }; diff --git a/tests_errors/soa-3.ispc b/tests_errors/soa-3.ispc index b2be1b59..04dc84bc 100644 --- a/tests_errors/soa-3.ispc +++ b/tests_errors/soa-3.ispc @@ -1,4 +1,4 @@ -// syntax error, unexpected '-', expecting int32 constant +// syntax error, unexpected '-', expecting int struct F { float a, b, c; }; diff --git a/tests_errors/soa-4.ispc b/tests_errors/soa-4.ispc index b2be1b59..04dc84bc 100644 --- a/tests_errors/soa-4.ispc +++ b/tests_errors/soa-4.ispc @@ -1,4 +1,4 @@ -// syntax error, unexpected '-', expecting int32 constant +// syntax error, unexpected '-', expecting int struct F { float a, b, c; }; 
diff --git a/tests_errors/soa-9.ispc b/tests_errors/soa-9.ispc index 7c6a1df9..e9e7509a 100644 --- a/tests_errors/soa-9.ispc +++ b/tests_errors/soa-9.ispc @@ -1,4 +1,4 @@ -// Can't convert from pointer to SOA type "soa<8> struct A * uniform" to pointer to non-SOA type "void * varying" +// Can't convert from pointer to SOA type "soa<8> struct A \* uniform" to pointer to non-SOA type "void \* varying" struct A { float a, b; }; diff --git a/tests_errors/struct_arith.ispc b/tests_errors/struct_arith.ispc index 9d942880..df729d02 100644 --- a/tests_errors/struct_arith.ispc +++ b/tests_errors/struct_arith.ispc @@ -1,4 +1,4 @@ -// Assignment operator "+=" is illegal with struct type +// Assignment operator "\+=" is illegal with struct type struct Point { float x, y, z; }; diff --git a/tests_errors/vec-size-compile-constant.ispc b/tests_errors/vec-size-compile-constant.ispc index b9e61721..0eb6f90e 100644 --- a/tests_errors/vec-size-compile-constant.ispc +++ b/tests_errors/vec-size-compile-constant.ispc @@ -1,4 +1,4 @@ -// syntax error, unexpected identifier, expecting int32 constant +// syntax error, unexpected identifier, expecting int void foo(uniform int i) { float a; From c14659c6754f4d91a3bec3cbb48c4e67b7421d13 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 23 Jul 2013 17:02:49 -0700 Subject: [PATCH 06/34] Fix bug in lGetConstantInt() in parse.yy. Previously, we weren't handling signed/unsigned constant types correctly. --- parse.yy | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/parse.yy b/parse.yy index 6ed2a43d..4b315776 100644 --- a/parse.yy +++ b/parse.yy @@ -2278,7 +2278,11 @@ lGetConstantInt(Expr *expr, int *value, SourcePos pos, const char *usage) { Error(pos, "%s must be representable with a 32-bit integer.", usage); return false; } - *value = (int)ci->getZExtValue(); + const Type *type = expr->GetType(); + if (type->IsUnsignedType()) + *value = (int)ci->getZExtValue(); + else + *value = (int)ci->getSExtValue(); return true; } } From 15a3ef370a433eedcf6e6650f07ec81775d0322d Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 23 Jul 2013 17:11:01 -0700 Subject: [PATCH 07/34] Use @llvm.readcyclecounter to implement stdlib clock() function. Also added a test for the clock builtin. 
--- builtins/util.m4 | 14 ++++---------- tests/clock.ispc | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 10 deletions(-) create mode 100644 tests/clock.ispc diff --git a/builtins/util.m4 b/builtins/util.m4 index d6f3e5c3..8c379781 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -2891,17 +2891,11 @@ m4exit(`1') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; read hw clock +declare i64 @llvm.readcyclecounter() + define i64 @__clock() nounwind { -entry: - tail call void asm sideeffect "xorl %eax,%eax \0A cpuid", "~{rax},~{rbx},~{rcx},~{rdx},~{dirflag},~{fpsr},~{flags}"() nounwind - %0 = tail call { i32, i32 } asm sideeffect "rdtsc", "={ax},={dx},~{dirflag},~{fpsr},~{flags}"() nounwind - %asmresult = extractvalue { i32, i32 } %0, 0 - %asmresult1 = extractvalue { i32, i32 } %0, 1 - %conv = zext i32 %asmresult1 to i64 - %shl = shl nuw i64 %conv, 32 - %conv2 = zext i32 %asmresult to i64 - %or = or i64 %shl, %conv2 - ret i64 %or + %r = call i64 @llvm.readcyclecounter() + ret i64 %r } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/tests/clock.ispc b/tests/clock.ispc new file mode 100644 index 00000000..0e95379b --- /dev/null +++ b/tests/clock.ispc @@ -0,0 +1,14 @@ + +export uniform int width() { return programCount; } + + +export void f_f(uniform float RET[], uniform float aFOO[]) { + unsigned uniform int64 a = clock(); + float x = pow(sqrt(aFOO[programIndex]), 5.5); + unsigned uniform int64 b = clock(); + RET[programIndex] = (b - a) > 0 ? 1 : 0; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1; +} From 53414f12e6ce7d1615cd650cc7b2152063da6556 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 23 Jul 2013 17:30:32 -0700 Subject: [PATCH 08/34] Add SSE4 target optimized for computation with 8-bit datatypes. This change adds a new 'sse4-8' target, where programCount is 16 and the mask element size is 8-bits. (i.e. the most appropriate sizing of the mask for SIMD computation with 8-bit datatypes.) --- Makefile | 2 +- builtins.cpp | 9 + builtins/target-sse4-8.ll | 444 ++++++++++++++++++++++++++++++++++++++ builtins/util.m4 | 104 ++++++++- expr.cpp | 5 + ispc.cpp | 8 + opt.cpp | 13 +- 7 files changed, 578 insertions(+), 7 deletions(-) create mode 100644 builtins/target-sse4-8.ll diff --git a/Makefile b/Makefile index 043ab4cf..054a3da1 100644 --- a/Makefile +++ b/Makefile @@ -123,7 +123,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h TARGETS=neon avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \ - generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 + sse4-8 generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 # These files need to be compiled in two versions - 32 and 64 bits. BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) # These are files to be compiled in single version. 
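The sizing falls out of the register width: 16 lanes times an 8-bit mask element fills exactly one 128-bit XMM register, so the whole execution mask can be manipulated with byte-oriented SSE instructions. As a rough illustration (plain C++ with SSE2 intrinsics, not code from this patch), collapsing all 16 mask lanes to bits is a single pmovmskb, which is what the new target's __movmsk/__any/__all/__none built-ins below rely on:

    #include <emmintrin.h>   // SSE2 intrinsics
    #include <cstdint>

    // With 8-bit mask elements and 16 program instances, the mask is one
    // <16 x i8> vector; pmovmskb yields one bit per lane (bits 0..15).
    static inline uint32_t movmsk16(__m128i mask) {
        return static_cast<uint32_t>(_mm_movemask_epi8(mask));
    }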
diff --git a/builtins.cpp b/builtins.cpp index d3bbaa6a..6c586595 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -868,6 +868,15 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod EXPORT_MODULE(builtins_bitcode_sse4_x2_64bit); } break; + case 16: + Assert(g->target->getMaskBitCount() == 8); + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_sse4_8_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_sse4_8_64bit); + } + break; default: FATAL("logic error in DefineStdlib"); } diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll new file mode 100644 index 00000000..c85209ba --- /dev/null +++ b/builtins/target-sse4-8.ll @@ -0,0 +1,444 @@ +;; Copyright (c) 2013, Google, Inc. +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Google, Inc. nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Define common 4-wide stuff +define(`WIDTH',`16') +define(`MASK',`i8') +include(`util.m4') + +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse4-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare float @__half_to_float_uniform(i16 %v) nounwind readnone +declare @__half_to_float_varying( %v) nounwind readnone +declare i16 @__float_to_half_uniform(float %v) nounwind readnone +declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + +define @__rcp_varying_float() nounwind readonly alwaysinline { + unary4to16(call, float, @llvm.x86.sse.rcp.ps, %0) + ; do one N-R iteration to improve precision + ; float iv = __rcp_v(v); + ; return iv * (2. 
- v * iv); + %v_iv = fmul <16 x float> %0, %call + %two_minus = fsub <16 x float> , %v_iv + %iv_mul = fmul <16 x float> %call, %two_minus + ret <16 x float> %iv_mul +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; rsqrt + +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + unary4to16(is, float, @llvm.x86.sse.rsqrt.ps, %v) + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. - (v * is) * is); + %v_is = fmul <16 x float> %v, %is + %v_is_is = fmul <16 x float> %v_is, %is + %three_sub = fsub <16 x float> , %v_is_is + %is_mul = fmul <16 x float> %is, %three_sub + %half_scale = fmul <16 x float> , %is_mul + ret <16 x float> %half_scale +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; sqrt + +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + +define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline { + unary4to16(call, float, @llvm.x86.sse.sqrt.ps, %0) + ret <16 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone + +define <16 x double> @__sqrt_varying_double(<16 x double>) nounwind +alwaysinline { + unary2to16(ret, double, @llvm.x86.sse2.sqrt.pd, %0) + ret <16 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding floats + +declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone + +define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 + round4to16(%0, 8) +} + +define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + round4to16(%0, 9) +} + +define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + round4to16(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding doubles + +declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone + +define <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline { +; XXXround2to4double(%0, 8) + ; FIXME: need round2to16double in util.m4... 
+ ret <16 x double> undef +} + +define <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline { + ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 +; XXXround2to4double(%0, 9) + ret <16 x double> undef +} + +define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline { + ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10 +; XXXround2to4double(%0, 10) + ret <16 x double> undef +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float min/max + +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + +define <16 x float> @__max_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline { + binary4to16(call, float, @llvm.x86.sse.max.ps, %0, %1) + ret <16 x float> %call +} + +define <16 x float> @__min_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline { + binary4to16(call, float, @llvm.x86.sse.min.ps, %0, %1) + ret <16 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 min/max + +define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pminsd, %0, %1) + ret <16 x i32> %call +} + +define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1) + ret <16 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unsigned int min/max + +define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pminud, %0, %1) + ret <16 x i32> %call +} + +define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pmaxud, %0, %1) + ret <16 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone +declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone + +define <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone { + binary2to16(ret, double, @llvm.x86.sse2.min.pd, %0, %1) + ret <16 x double> %ret +} + +define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone { + binary2to16(ret, double, @llvm.x86.sse2.max.pd, %0, %1) + ret <16 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +; FIXME + +declare <16 x float> @__svml_sin(<16 x float>) +declare <16 x float> @__svml_cos(<16 x float>) +declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) +declare <16 x float> @__svml_tan(<16 x float>) +declare <16 x float> @__svml_atan(<16 x float>) +declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) +declare <16 x float> @__svml_exp(<16 x float>) +declare <16 x float> @__svml_log(<16 x float>) +declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; horizontal ops / reductions + +declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone + +define i64 @__movmsk(<16 x i8>) nounwind readnone 
alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %m64 = zext i32 %m to i64 + ret i64 %m64 +} + +define i1 @__any(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %mne = icmp ne i32 %m, 0 + ret i1 %mne +} + +define i1 @__all(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %meq = icmp eq i32 %m, ALL_ON_MASK + ret i1 %meq +} + +define i1 @__none(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %meq = icmp eq i32 %m, 0 + ret i1 %meq +} + +define internal <16 x float> @__add_varying_float(<16 x float>, <16 x float>) { + %r = fadd <16 x float> %0, %1 + ret <16 x float> %r +} + +define internal float @__add_uniform_float(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline { + reduce16(float, @__add_varying_float, @__add_uniform_float) +} + +define float @__reduce_min_float(<16 x float>) nounwind readnone { + reduce16(float, @__min_varying_float, @__min_uniform_float) +} + +define float @__reduce_max_float(<16 x float>) nounwind readnone { + reduce16(float, @__max_varying_float, @__max_uniform_float) +} + +define internal <16 x i32> @__add_varying_int32(<16 x i32>, <16 x i32>) { + %r = add <16 x i32> %0, %1 + ret <16 x i32> %r +} + +define internal i32 @__add_uniform_int32(i32, i32) { + %r = add i32 %0, %1 + ret i32 %r +} + +define i32 @__reduce_add_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__add_varying_int32, @__add_uniform_int32) +} + +define i32 @__reduce_min_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__min_varying_int32, @__min_uniform_int32) +} + +define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__max_varying_int32, @__max_uniform_int32) +} + +define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone { + reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32) +} + +define i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone { + reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32) +} + +define internal <16 x double> @__add_varying_double(<16 x double>, <16 x double>) { + %r = fadd <16 x double> %0, %1 + ret <16 x double> %r +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define double @__reduce_add_double(<16 x double>) nounwind readnone { + reduce16(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double(<16 x double>) nounwind readnone { + reduce16(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double(<16 x double>) nounwind readnone { + reduce16(double, @__max_varying_double, @__max_uniform_double) +} + +define internal <16 x i64> @__add_varying_int64(<16 x i64>, <16 x i64>) { + %r = add <16 x i64> %0, %1 + ret <16 x i64> %r +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define i64 @__reduce_add_int64(<16 x i64>) nounwind readnone { + reduce16(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define i64 @__reduce_min_int64(<16 x i64>) nounwind readnone { + reduce16(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone { + reduce16(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone { + reduce16(i64, 
@__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone { + reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +reduce_equal(16) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +define void @__masked_store_blend_i64(<16 x i64>* nocapture, <16 x i64>, + <16 x i8> %mask) nounwind + alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i64>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i64> %1, <16 x i64> %old + store <16 x i64> %blend, <16 x i64>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i32(<16 x i32>* nocapture, <16 x i32>, + <16 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i32>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i32> %1, <16 x i32> %old + store <16 x i32> %blend, <16 x i32>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, + <16 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i16>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i16> %1, <16 x i16> %old + store <16 x i16> %blend, <16 x i16>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>, + <16 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i8>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i8> %1, <16 x i8> %old + store <16 x i8> %blend, <16 x i8>* %0, align 4 + ret void +} + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) + +masked_store_float_double() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather/scatter + +; define these with the macros from stdlib.m4 + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) diff --git a/builtins/util.m4 b/builtins/util.m4 index 8c379781..ee45ebc7 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -411,6 +411,42 @@ define(`unary2to8', ` ' ) +define(`unary2to16', ` + %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1) + %$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2) + %$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3) + %$1_4 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4) + %$1_5 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5) + %$1_6 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6) + %$1_7 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7) + 
%$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> + %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, + <4 x i32> + %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, + <4 x i32> + %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, + <8 x i32> + + %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, + <16 x i32> +' +) + ;; Maps an 2-wide binary function to two 8-wide vector operands ;; $1: name of variable into which the final result should go ;; $2: scalar type of the vector elements @@ -432,12 +468,58 @@ define(`binary2to8', ` %$1_3b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> +' +) + +define(`binary2to16', ` + %$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b) + %$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b) + %$1_2a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_2b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2a, <2 x $2> %$1_2b) + %$1_3a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_3b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) + %$1_4a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_4b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4a, <2 x $2> %$1_4b) + %$1_5a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_5b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5a, <2 x $2> %$1_5b) + %$1_6a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_6b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6a, <2 x $2> %$1_6b) + %$1_7a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_7b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7a, <2 x $2> %$1_7b) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, <4 x i32> %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, <4 x i32> - %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, <8 x i32> + + %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, + <4 x i32> + %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, + <4 x i32> + %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, + <8 x i32> + + %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, + <16 x i32> ' ) @@ -460,6 +542,26 @@ ret <8 x float> %ret ' ) +define(`round4to16', ` +%v0 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v1 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v2 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v3 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%r0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v0, i32 $2) +%r1 = 
call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v1, i32 $2) +%r2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v2, i32 $2) +%r3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v3, i32 $2) +%ret01 = shufflevector <4 x float> %r0, <4 x float> %r1, + <8 x i32> +%ret23 = shufflevector <4 x float> %r2, <4 x float> %r3, + <8 x i32> +%ret = shufflevector <8 x float> %ret01, <8 x float> %ret23, + <16 x i32> +ret <16 x float> %ret +' +) + define(`round8to16', ` %v0 = shufflevector <16 x float> $1, <16 x float> undef, <8 x i32> diff --git a/expr.cpp b/expr.cpp index 6bde2acb..f81037f6 100644 --- a/expr.cpp +++ b/expr.cpp @@ -3123,6 +3123,10 @@ static llvm::Value * lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test, llvm::Value *expr1, llvm::Value *expr2, const Type *type) { +#if !defined(LLVM_3_1) + test = ctx->TruncInst(test, LLVMTypes::Int1VectorType); + return ctx->SelectInst(test, expr1, expr2, "select"); +#else llvm::Value *resultPtr = ctx->AllocaInst(expr1->getType(), "selectexpr_tmp"); // Don't need to worry about masking here ctx->StoreInst(expr2, resultPtr); @@ -3131,6 +3135,7 @@ lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test, PointerType::GetUniform(type)->LLVMType(g->ctx)); ctx->StoreInst(expr1, resultPtr, test, type, PointerType::GetUniform(type)); return ctx->LoadInst(resultPtr, "selectexpr_final"); +#endif // !LLVM_3_1 } diff --git a/ispc.cpp b/ispc.cpp index 887f6ca3..6ac23781 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -310,6 +310,14 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } + else if (!strcasecmp(isa, "sse4-8")) { + this->m_isa = Target::SSE4; + this->m_nativeVectorWidth = 16; + this->m_vectorWidth = 16; + this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + this->m_maskingIsFree = false; + this->m_maskBitCount = 8; + } else if (!strcasecmp(isa, "generic-4")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 4; diff --git a/opt.cpp b/opt.cpp index ba32c639..4701e7df 100644 --- a/opt.cpp +++ b/opt.cpp @@ -670,14 +670,17 @@ IntrinsicsOpt::IntrinsicsOpt() // All of the mask instructions we may encounter. Note that even if // compiling for AVX, we may still encounter the regular 4-wide SSE // MOVMSK instruction. - llvm::Function *sseMovmsk = + llvm::Function *ssei8Movmsk = + llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse2_pmovmskb_128); + maskInstructions.push_back(ssei8Movmsk); + llvm::Function *sseFloatMovmsk = llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse_movmsk_ps); - maskInstructions.push_back(sseMovmsk); + maskInstructions.push_back(sseFloatMovmsk); maskInstructions.push_back(m->module->getFunction("__movmsk")); - llvm::Function *avxMovmsk = + llvm::Function *avxFloatMovmsk = llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_movmsk_ps_256); - Assert(avxMovmsk != NULL); - maskInstructions.push_back(avxMovmsk); + Assert(avxFloatMovmsk != NULL); + maskInstructions.push_back(avxFloatMovmsk); // And all of the blend instructions blendInstructions.push_back(BlendInstruction( From 04d61afa23a64d9fc5f95648509bd5ec002da53e Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 25 Jul 2013 09:40:48 -0700 Subject: [PATCH 09/34] Fix bug in lEmitVaryingSelect() for targets with i1 mask types. 
Commit 53414f12e6c introduced a bug where lEmitVaryingSelect() would try
to truncate a vector of i1s to a vector of i1s, which in turn made LLVM's
IR analyzer unhappy.
---
 expr.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/expr.cpp b/expr.cpp
index f81037f6..856d363c 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -3124,7 +3124,8 @@ lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test,
                    llvm::Value *expr1, llvm::Value *expr2,
                    const Type *type) {
 #if !defined(LLVM_3_1)
-    test = ctx->TruncInst(test, LLVMTypes::Int1VectorType);
+    if (test->getType() != LLVMTypes::Int1VectorType)
+        test = ctx->TruncInst(test, LLVMTypes::Int1VectorType);
     return ctx->SelectInst(test, expr1, expr2, "select");
 #else
     llvm::Value *resultPtr = ctx->AllocaInst(expr1->getType(), "selectexpr_tmp");

From 780b0dfe47a770785c4fe1f224813e3a518cd135 Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Wed, 24 Jul 2013 09:40:50 -0700
Subject: [PATCH 10/34] Add SSE4-16 target.

Along the lines of sse4-8, this is an 8-wide target for SSE4, using
16-bit elements for the mask. It's thus (in principle) the best target
for SIMD computation with 16-bit datatypes.
---
 Makefile                   |   2 +-
 builtins.cpp               |  16 +-
 builtins/target-sse4-16.ll | 436 +++++++++++++++++++++++++++++++++++++
 ispc.cpp                   |  14 +-
 run_tests.py               |   2 +-
 5 files changed, 463 insertions(+), 7 deletions(-)
 create mode 100644 builtins/target-sse4-16.ll

diff --git a/Makefile b/Makefile
index 054a3da1..fc064dbd 100644
--- a/Makefile
+++ b/Makefile
@@ -123,7 +123,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
 TARGETS=neon avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \
-	sse4-8 generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
+	sse4-8 sse4-16 generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
 # These files need to be compiled in two versions - 32 and 64 bits.
 BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS)))
 # These are files to be compiled in single version.

diff --git a/builtins.cpp b/builtins.cpp
index 6c586595..c4a2f3b5 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -862,10 +862,22 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
         break;
     case 8:
         if (runtime32) {
-            EXPORT_MODULE(builtins_bitcode_sse4_x2_32bit);
+            if (g->target->getMaskBitCount() == 16) {
+                EXPORT_MODULE(builtins_bitcode_sse4_16_32bit);
+            }
+            else {
+                Assert(g->target->getMaskBitCount() == 32);
+                EXPORT_MODULE(builtins_bitcode_sse4_x2_32bit);
+            }
         }
         else {
-            EXPORT_MODULE(builtins_bitcode_sse4_x2_64bit);
+            if (g->target->getMaskBitCount() == 16) {
+                EXPORT_MODULE(builtins_bitcode_sse4_16_64bit);
+            }
+            else {
+                Assert(g->target->getMaskBitCount() == 32);
+                EXPORT_MODULE(builtins_bitcode_sse4_x2_64bit);
+            }
         }
         break;
     case 16:

diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll
new file mode 100644
index 00000000..2044fbee
--- /dev/null
+++ b/builtins/target-sse4-16.ll
@@ -0,0 +1,436 @@
+;; Copyright (c) 2013, Google, Inc.
+;; All rights reserved.
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are
+;; met:
+;;
+;;   * Redistributions of source code must retain the above copyright
+;;     notice, this list of conditions and the following disclaimer.
+;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Google, Inc. nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Define common 4-wide stuff +define(`WIDTH',`8') +define(`MASK',`i16') +include(`util.m4') + +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse4-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare float @__half_to_float_uniform(i16 %v) nounwind readnone +declare @__half_to_float_varying( %v) nounwind readnone +declare i16 @__float_to_half_uniform(float %v) nounwind readnone +declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + +define @__rcp_varying_float() nounwind readonly alwaysinline { + unary4to8(call, float, @llvm.x86.sse.rcp.ps, %0) + ; do one N-R iteration to improve precision + ; float iv = __rcp_v(v); + ; return iv * (2. - v * iv); + %v_iv = fmul <8 x float> %0, %call + %two_minus = fsub <8 x float> , %v_iv + %iv_mul = fmul <8 x float> %call, %two_minus + ret <8 x float> %iv_mul +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; rsqrt + +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v) + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. 
- (v * is) * is); + %v_is = fmul <8 x float> %v, %is + %v_is_is = fmul <8 x float> %v_is, %is + %three_sub = fsub <8 x float> , %v_is_is + %is_mul = fmul <8 x float> %is, %three_sub + %half_scale = fmul <8 x float> , %is_mul + ret <8 x float> %half_scale +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; sqrt + +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + +define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline { + unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0) + ret <8 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone + +define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind +alwaysinline { + unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0) + ret <8 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding floats + +declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone + +define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 + round4to8(%0, 8) +} + +define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + round4to8(%0, 9) +} + +define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + round4to8(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding doubles + +declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone + +define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline { + round2to8double(%0, 8) +} + +define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline { + round2to8double(%0, 9) +} + +define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline { + round2to8double(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float min/max + +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + +define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline { + binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1) + ret <8 x float> %call +} + +define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline { + binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1) + ret <8 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 min/max + +define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pminsd, %0, %1) + ret <8 x i32> %call +} + +define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1) + ret <8 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unsigned int min/max + +define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + 
binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1) + ret <8 x i32> %call +} + +define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1) + ret <8 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone +declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone + +define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone { + binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1) + ret <8 x double> %ret +} + +define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone { + binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1) + ret <8 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +; FIXME + +declare <8 x float> @__svml_sin(<8 x float>) +declare <8 x float> @__svml_cos(<8 x float>) +declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) +declare <8 x float> @__svml_tan(<8 x float>) +declare <8 x float> @__svml_atan(<8 x float>) +declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) +declare <8 x float> @__svml_exp(<8 x float>) +declare <8 x float> @__svml_log(<8 x float>) +declare <8 x float> @__svml_pow(<8 x float>, <8 x float>) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; horizontal ops / reductions + +declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone + +define i64 @__movmsk(<8 x MASK>) nounwind readnone alwaysinline { + %m8 = trunc <8 x MASK> %0 to <8 x i8> + %mask8 = shufflevector <8 x i8> %m8, <8 x i8> zeroinitializer, + <16 x i32> + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %mask8) + %m64 = zext i32 %m to i64 + ret i64 %m64 +} + +define i1 @__any(<8 x MASK>) nounwind readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %mne = icmp ne i64 %m, 0 + ret i1 %mne +} + +define i1 @__all(<8 x MASK>) nounwind readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %meq = icmp eq i64 %m, ALL_ON_MASK + ret i1 %meq +} + +define i1 @__none(<8 x MASK>) nounwind readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %meq = icmp eq i64 %m, 0 + ret i1 %meq +} + +define internal <8 x float> @__add_varying_float(<8 x float>, <8 x float>) { + %r = fadd <8 x float> %0, %1 + ret <8 x float> %r +} + +define internal float @__add_uniform_float(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline { + reduce8(float, @__add_varying_float, @__add_uniform_float) +} + +define float @__reduce_min_float(<8 x float>) nounwind readnone { + reduce8(float, @__min_varying_float, @__min_uniform_float) +} + +define float @__reduce_max_float(<8 x float>) nounwind readnone { + reduce8(float, @__max_varying_float, @__max_uniform_float) +} + +define internal <8 x i32> @__add_varying_int32(<8 x i32>, <8 x i32>) { + %r = add <8 x i32> %0, %1 + ret <8 x i32> %r +} + +define internal i32 @__add_uniform_int32(i32, i32) { + %r = add i32 %0, %1 + ret i32 %r +} + +define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__add_varying_int32, @__add_uniform_int32) +} + +define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__min_varying_int32, @__min_uniform_int32) +} + +define i32 
@__reduce_max_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__max_varying_int32, @__max_uniform_int32) +} + +define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone { + reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32) +} + +define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone { + reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32) +} + +define internal <8 x double> @__add_varying_double(<8 x double>, <8 x double>) { + %r = fadd <8 x double> %0, %1 + ret <8 x double> %r +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define double @__reduce_add_double(<8 x double>) nounwind readnone { + reduce8(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double(<8 x double>) nounwind readnone { + reduce8(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double(<8 x double>) nounwind readnone { + reduce8(double, @__max_varying_double, @__max_uniform_double) +} + +define internal <8 x i64> @__add_varying_int64(<8 x i64>, <8 x i64>) { + %r = add <8 x i64> %0, %1 + ret <8 x i64> %r +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone { + reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone { + reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +reduce_equal(8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +define void @__masked_store_blend_i64(<8 x i64>* nocapture, <8 x i64>, + <8 x MASK> %mask) nounwind + alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i64>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i64> %1, <8 x i64> %old + store <8 x i64> %blend, <8 x i64>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i32>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i32> %1, <8 x i32> %old + store <8 x i32> %blend, <8 x i32>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i16(<8 x i16>* nocapture, <8 x i16>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i16>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i16> %1, <8 x i16> %old + store <8 x i16> %blend, <8 x i16>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i8>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i8> %1, <8 x i8> %old + store <8 x i8> %blend, <8 x i8>* %0, align 4 + ret void +} + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) + +masked_store_float_double() + 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather/scatter + +; define these with the macros from stdlib.m4 + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) diff --git a/ispc.cpp b/ispc.cpp index 6ac23781..a9f5ff5c 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -318,6 +318,14 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 8; } + else if (!strcasecmp(isa, "sse4-16")) { + this->m_isa = Target::SSE4; + this->m_nativeVectorWidth = 8; + this->m_vectorWidth = 8; + this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + this->m_maskingIsFree = false; + this->m_maskBitCount = 16; + } else if (!strcasecmp(isa, "generic-4")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 4; @@ -575,9 +583,9 @@ Target::SupportedTargetArchs() { const char * Target::SupportedTargetISAs() { - return "neon, sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2" - ", avx1.1, avx1.1-x2, avx2, avx2-x2" - ", generic-1, generic-4, generic-8, generic-16, generic-32"; + return "neon, sse2, sse2-x2, sse4, sse4-8, sse4-16, sse4-x2, " + "avx, avx-x2, avx1.1, avx1.1-x2, avx2, avx2-x2," + "generic-1, generic-4, generic-8, generic-16, generic-32"; } diff --git a/run_tests.py b/run_tests.py index 296db867..ea819ea4 100755 --- a/run_tests.py +++ b/run_tests.py @@ -37,7 +37,7 @@ parser.add_option("-g", "--generics-include", dest="include_file", help="Filenam parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", default="") parser.add_option('-t', '--target', dest='target', - help='Set compilation target (neon, sse2, sse2-x2, sse4, sse4-x2, sse4-8, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', + help='Set compilation target (neon, sse2, sse2-x2, sse4, sse4-x2, sse4-8, sse4-16, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', default="sse4") parser.add_option('-a', '--arch', dest='arch', help='Set architecture (arm, x86, x86-64)', From bba84f247c34f67ed28a357d19a4a7414c590c2b Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 24 Jul 2013 15:08:07 -0700 Subject: [PATCH 11/34] Improved optimization of vector select instructions. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Various LLVM optimization passes are turning code like: %cmp = icmp lt <8 x i32> %foo, %bar %cmp32 = sext <8 x i1> %cmp to <8 x i32> . . . %cmp1 = trunc <8 x i32> %cmp32 to <8 x i1> %result = select <8 x i1> %cmp1, . . . Into: %cmp = icmp lt <8 x i32> %foo, %bar %cmp32 = zext <8 x i1> %cmp to <8 x i32> # note: zext . . . %cmp1 = icmp ne <8 x i32> %cmp32, zeroinitializer %result = select <8 x i1> %cmp1, … Which in turn isn't matched well by the LLVM code generators, which in turn leads to fairly inefficient code. (i.e. it doesn't just emit a vector compare and blend instruction.) Also, renamed VSelMovmskOptPass to InstructionSimplifyPass to better describe its functionality. 
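For reference, here is a hand-written sketch of the form we would like the
select to be in by the time it reaches the code generator, with the condition
taken straight from the original i1 comparison (the register names here are
made up for illustration and are not output from the new pass):

    %cmp = icmp slt <8 x i32> %foo, %bar
    %result = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b

With the mask in this form, the backends can emit a single vector compare
followed by a blend. The renamed pass also keeps the earlier simplifications:
selects whose mask is known to be all-on or all-off are replaced with the
corresponding operand, and __movmsk calls on compile-time constant masks are
folded to scalar constants.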
--- opt.cpp | 175 ++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 126 insertions(+), 49 deletions(-) diff --git a/opt.cpp b/opt.cpp index 4701e7df..8efdbc67 100644 --- a/opt.cpp +++ b/opt.cpp @@ -108,7 +108,7 @@ #endif static llvm::Pass *CreateIntrinsicsOptPass(); -static llvm::Pass *CreateVSelMovmskOptPass(); +static llvm::Pass *CreateInstructionSimplifyPass(); static llvm::Pass *CreateImproveMemoryOpsPass(); static llvm::Pass *CreateGatherCoalescePass(); @@ -476,7 +476,7 @@ Optimize(llvm::Module *module, int optLevel) { } if (!g->opt.disableMaskAllOnOptimizations) { optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); } optPM.add(llvm::createDeadInstEliminationPass()); @@ -519,7 +519,7 @@ Optimize(llvm::Module *module, int optLevel) { if (!g->opt.disableMaskAllOnOptimizations) { optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); } if (g->opt.disableGatherScatterOptimizations == false && @@ -539,7 +539,7 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createFunctionInliningPass()); optPM.add(llvm::createConstantPropagationPass()); optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) { @@ -555,18 +555,20 @@ Optimize(llvm::Module *module, int optLevel) { if (g->opt.disableHandlePseudoMemoryOps == false) optPM.add(CreateReplacePseudoMemoryOpsPass()); optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createFunctionInliningPass()); optPM.add(llvm::createArgumentPromotionPass()); optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold, false)); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createReassociatePass()); optPM.add(llvm::createLoopRotatePass()); optPM.add(llvm::createLICMPass()); optPM.add(llvm::createLoopUnswitchPass(false)); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createIndVarSimplifyPass()); optPM.add(llvm::createLoopIdiomPass()); optPM.add(llvm::createLoopDeletionPass()); @@ -576,17 +578,19 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(CreateIsCompileTimeConstantPass(true)); optPM.add(CreateIntrinsicsOptPass()); - optPM.add(CreateVSelMovmskOptPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createMemCpyOptPass()); optPM.add(llvm::createSCCPPass()); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createJumpThreadingPass()); optPM.add(llvm::createCorrelatedValuePropagationPass()); optPM.add(llvm::createDeadStoreEliminationPass()); optPM.add(llvm::createAggressiveDCEPass()); optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(CreateInstructionSimplifyPass()); optPM.add(llvm::createStripDeadPrototypesPass()); optPM.add(CreateMakeInternalFuncsStaticPass()); optPM.add(llvm::createGlobalDCEPass()); @@ -927,80 +931,153 @@ CreateIntrinsicsOptPass() { @todo The better thing to do would be to submit a patch to LLVM to get these; they're presumably pretty simple patterns to match. 
*/ -class VSelMovmskOpt : public llvm::BasicBlockPass { +class InstructionSimplifyPass : public llvm::BasicBlockPass { public: - VSelMovmskOpt() + InstructionSimplifyPass() : BasicBlockPass(ID) { } const char *getPassName() const { return "Vector Select Optimization"; } bool runOnBasicBlock(llvm::BasicBlock &BB); static char ID; + +private: + static bool simplifySelect(llvm::SelectInst *selectInst, + llvm::BasicBlock::iterator iter); + static llvm::Value *simplifyBoolVec(llvm::Value *value); + static bool simplifyCall(llvm::CallInst *callInst, + llvm::BasicBlock::iterator iter); }; -char VSelMovmskOpt::ID = 0; +char InstructionSimplifyPass::ID = 0; + + +llvm::Value * +InstructionSimplifyPass::simplifyBoolVec(llvm::Value *value) { + llvm::TruncInst *trunc = llvm::dyn_cast(value); + if (trunc != NULL) { + // Convert trunc({sext,zext}(i1 vector)) -> (i1 vector) + llvm::SExtInst *sext = llvm::dyn_cast(value); + if (sext && + sext->getOperand(0)->getType() == LLVMTypes::Int1VectorType) + return sext->getOperand(0); + + llvm::ZExtInst *zext = llvm::dyn_cast(value); + if (zext && + zext->getOperand(0)->getType() == LLVMTypes::Int1VectorType) + return zext->getOperand(0); + } + + llvm::ICmpInst *icmp = llvm::dyn_cast(value); + if (icmp != NULL) { + // icmp(ne, {sext,zext}(foo), zeroinitializer) -> foo + if (icmp->getSignedPredicate() == llvm::CmpInst::ICMP_NE) { + llvm::Value *op1 = icmp->getOperand(1); + if (llvm::isa(op1)) { + llvm::Value *op0 = icmp->getOperand(0); + llvm::SExtInst *sext = llvm::dyn_cast(op0); + if (sext) + return sext->getOperand(0); + llvm::ZExtInst *zext = llvm::dyn_cast(op0); + if (zext) + return zext->getOperand(0); + } + } + } + return NULL; +} bool -VSelMovmskOpt::runOnBasicBlock(llvm::BasicBlock &bb) { - DEBUG_START_PASS("VSelMovmaskOpt"); +InstructionSimplifyPass::simplifySelect(llvm::SelectInst *selectInst, + llvm::BasicBlock::iterator iter) { + if (selectInst->getType()->isVectorTy() == false) + return false; + + llvm::Value *factor = selectInst->getOperand(0); + + // Simplify all-on or all-off mask values + MaskStatus maskStatus = lGetMaskStatus(factor); + llvm::Value *value = NULL; + if (maskStatus == ALL_ON) + // Mask all on -> replace with the first select value + value = selectInst->getOperand(1); + else if (maskStatus == ALL_OFF) + // Mask all off -> replace with the second select value + value = selectInst->getOperand(2); + if (value != NULL) { + llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), + iter, value); + return true; + } + + // Sometimes earlier LLVM optimization passes generate unnecessarily + // complex expressions for the selection vector, which in turn confuses + // the code generators and leads to sub-optimal code (particularly for + // 8 and 16-bit masks). We'll try to simplify them out here so that + // the code generator patterns match.. + if ((factor = simplifyBoolVec(factor)) != NULL) { + llvm::Instruction *newSelect = + llvm::SelectInst::Create(factor, selectInst->getOperand(1), + selectInst->getOperand(2), + selectInst->getName()); + llvm::ReplaceInstWithInst(selectInst, newSelect); + return true; + } + + return false; +} + + +bool +InstructionSimplifyPass::simplifyCall(llvm::CallInst *callInst, + llvm::BasicBlock::iterator iter) { + llvm::Function *calledFunc = callInst->getCalledFunction(); + + // Turn a __movmsk call with a compile-time constant vector into the + // equivalent scalar value. 
+ if (calledFunc == NULL || calledFunc != m->module->getFunction("__movmsk")) + return false; + + uint64_t mask; + if (lGetMask(callInst->getArgOperand(0), &mask) == true) { + llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), + iter, LLVMInt64(mask)); + return true; + } + return false; +} + + +bool +InstructionSimplifyPass::runOnBasicBlock(llvm::BasicBlock &bb) { + DEBUG_START_PASS("InstructionSimplify"); bool modifiedAny = false; restart: for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) { llvm::SelectInst *selectInst = llvm::dyn_cast(&*iter); - if (selectInst != NULL && selectInst->getType()->isVectorTy()) { - llvm::Value *factor = selectInst->getOperand(0); - - MaskStatus maskStatus = lGetMaskStatus(factor); - llvm::Value *value = NULL; - if (maskStatus == ALL_ON) - // Mask all on -> replace with the first select value - value = selectInst->getOperand(1); - else if (maskStatus == ALL_OFF) - // Mask all off -> replace with the second select value - value = selectInst->getOperand(2); - - if (value != NULL) { - llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), - iter, value); - modifiedAny = true; - goto restart; - } + if (selectInst && simplifySelect(selectInst, iter)) { + modifiedAny = true; + goto restart; } - llvm::CallInst *callInst = llvm::dyn_cast(&*iter); - if (callInst == NULL) - continue; - - llvm::Function *calledFunc = callInst->getCalledFunction(); - if (calledFunc == NULL || calledFunc != m->module->getFunction("__movmsk")) - continue; - - uint64_t mask; - if (lGetMask(callInst->getArgOperand(0), &mask) == true) { -#if 0 - fprintf(stderr, "mask %d\n", mask); - callInst->getArgOperand(0)->dump(); - fprintf(stderr, "-----------\n"); -#endif - llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), - iter, LLVMInt64(mask)); + if (callInst && simplifyCall(callInst, iter)) { modifiedAny = true; goto restart; } } - DEBUG_END_PASS("VSelMovMskOpt"); + DEBUG_END_PASS("InstructionSimplify"); return modifiedAny; } static llvm::Pass * -CreateVSelMovmskOptPass() { - return new VSelMovmskOpt; +CreateInstructionSimplifyPass() { + return new InstructionSimplifyPass; } From 2d063925a1d5ab758bcdd22454c201ac7d617dd3 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 24 Jul 2013 15:10:08 -0700 Subject: [PATCH 12/34] Explicitly call the PBLENDVB intrinsic for i8 blending with sse4-8. This is slightly cleaner than trunc-ing the i8 mask to i1 and using a vector select. (And is probably more safe in terms of good code.) 
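As background (a hand-written sketch, not part of the change below): PBLENDVB
selects between its first two operands byte by byte, based on the high bit of
each byte of the third operand, so the all-zeros/all-ones per-lane i8 masks
this target uses can be handed to it directly. Roughly:

    ;; result byte i = (mask byte i has its high bit set) ? %new[i] : %old[i]
    %blend = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %old, <16 x i8> %new, <16 x i8> %mask)

Here %new stands in for the value being stored; the actual names are in the
diff below.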
--- builtins/target-sse4-8.ll | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index c85209ba..cd8fdce2 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -398,11 +398,13 @@ define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, ret void } +declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone + define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>, <16 x MASK> %mask) nounwind alwaysinline { - %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> %old = load <16 x i8>* %0, align 4 - %blend = select <16 x i1> %mask_as_i1, <16 x i8> %1, <16 x i8> %old + %blend = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %old, <16 x i8> %1, + <16 x i8> %mask) store <16 x i8> %blend, <16 x i8>* %0, align 4 ret void } From b6df447b550507ba77dde70758a5bdaf0e079f95 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 25 Jul 2013 09:11:39 -0700 Subject: [PATCH 13/34] Add reduce_add() for int8 and int16 types. This maps to specialized instructions (e.g. PSADBW) when available. --- builtins.cpp | 2 ++ builtins/target-avx-x2.ll | 27 ++++++++++++++++++ builtins/target-avx.ll | 46 ++++++++++++++++++++++++------- builtins/target-generic-1.ll | 9 ++++++ builtins/target-generic-common.ll | 7 +++-- builtins/target-neon.ll | 33 ++++++++++++++++++---- builtins/target-sse2-x2.ll | 30 ++++++++++++++++++++ builtins/target-sse2.ll | 30 ++++++++++++++++++++ builtins/target-sse4-16.ll | 30 ++++++++++++++++++++ builtins/target-sse4-8.ll | 27 ++++++++++++++++++ builtins/target-sse4-x2.ll | 30 ++++++++++++++++++++ builtins/target-sse4.ll | 30 ++++++++++++++++++++ docs/ispc.rst | 39 ++++++++++++++++++-------- examples/intrinsics/generic-16.h | 9 +++--- examples/intrinsics/generic-32.h | 9 +++--- examples/intrinsics/generic-64.h | 9 +++--- examples/intrinsics/knc.h | 16 +++++++++++ examples/intrinsics/knc2x.h | 3 ++ examples/intrinsics/sse4.h | 16 +++++++++++ stdlib.ispc | 25 +++++++++++++++-- tests/reduce-add-int16-1.ispc | 21 ++++++++++++++ tests/reduce-add-int16.ispc | 21 ++++++++++++++ tests/reduce-add-int8-1.ispc | 21 ++++++++++++++ tests/reduce-add-int8.ispc | 18 ++++++++++++ 24 files changed, 464 insertions(+), 44 deletions(-) create mode 100644 tests/reduce-add-int16-1.ispc create mode 100644 tests/reduce-add-int16.ispc create mode 100644 tests/reduce-add-int8-1.ispc create mode 100644 tests/reduce-add-int8.ispc diff --git a/builtins.cpp b/builtins.cpp index c4a2f3b5..08472623 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -501,6 +501,8 @@ lSetInternalFunctions(llvm::Module *module) { "__rdrand_i64", "__reduce_add_double", "__reduce_add_float", + "__reduce_add_int8", + "__reduce_add_int16", "__reduce_add_int32", "__reduce_add_int64", "__reduce_equal_double", diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index 8c6b7753..d9e0322b 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -271,6 +271,33 @@ reduce_equal(16) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal int32 ops +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline { + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} 
+ +define internal <16 x i16> @__add_varying_i16(<16 x i16>, + <16 x i16>) nounwind readnone alwaysinline { + %r = add <16 x i16> %0, %1 + ret <16 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline { + reduce16(i16, @__add_varying_i16, @__add_uniform_i16) +} + define <16 x i32> @__add_varying_int32(<16 x i32>, <16 x i32>) nounwind readnone alwaysinline { %s = add <16 x i32> %0, %1 diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index e6ab3a4b..90e2f3ac 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -217,7 +217,6 @@ define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline { ret float %sum } - define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline { reduce8(float, @__min_varying_float, @__min_uniform_float) } @@ -229,6 +228,42 @@ define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline { reduce_equal(8) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int8 ops + +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int16 ops + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal int32 ops @@ -257,20 +292,14 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__max_varying_int32, @__max_uniform_int32) } - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;; horizontal uint32 ops - define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32) } - define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32) } - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal double ops @@ -329,9 +358,6 @@ define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline { } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;; horizontal uint64 ops - define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline { reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) } diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 9b747e2e..3dec76b0 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -471,6 +471,15 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline { ret i64 %call } +define i8 
@__reduce_add_int8(<1 x i8> %v) nounwind readonly alwaysinline { + %r = extractelement <1 x i8> %v, i32 0 + ret i8 %r +} + +define i16 @__reduce_add_int16(<1 x i16> %v) nounwind readonly alwaysinline { + %r = extractelement <1 x i16> %v, i32 0 + ret i16 %r +} define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline { %r = extractelement <1 x float> %v, i32 0 diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index bbf1b842..76d1faf3 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -226,14 +226,16 @@ declare i1 @__any() nounwind readnone declare i1 @__all() nounwind readnone declare i1 @__none() nounwind readnone +declare i16 @__reduce_add_int8() nounwind readnone +declare i32 @__reduce_add_int16() nounwind readnone + declare float @__reduce_add_float() nounwind readnone declare float @__reduce_min_float() nounwind readnone declare float @__reduce_max_float() nounwind readnone -declare i32 @__reduce_add_int32() nounwind readnone +declare i64 @__reduce_add_int32() nounwind readnone declare i32 @__reduce_min_int32() nounwind readnone declare i32 @__reduce_max_int32() nounwind readnone - declare i32 @__reduce_min_uint32() nounwind readnone declare i32 @__reduce_max_uint32() nounwind readnone @@ -244,7 +246,6 @@ declare double @__reduce_max_double() nounwind readnone declare i64 @__reduce_add_int64() nounwind readnone declare i64 @__reduce_min_int64() nounwind readnone declare i64 @__reduce_max_int64() nounwind readnone - declare i64 @__reduce_min_uint64() nounwind readnone declare i64 @__reduce_max_uint64() nounwind readnone diff --git a/builtins/target-neon.ll b/builtins/target-neon.ll index e70b774b..fbeac352 100644 --- a/builtins/target-neon.ll +++ b/builtins/target-neon.ll @@ -509,15 +509,38 @@ define float @__reduce_max_float(<4 x float>) nounwind readnone { neon_reduce(float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) } -define internal i32 @add_i32(i32, i32) { - %r = add i32 %0, %1 +declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone + +define i16 @__reduce_add_int8() nounwind readnone { + %v8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <8 x i32> + %a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %v8) + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 + %r16 = trunc i32 %r to i16 + ret i16 %r16 +} + +declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone + +define i32 @__reduce_add_int16() nounwind readnone { + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %0) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 ret i32 %r } -declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone -define i32 @__reduce_add_int32() nounwind readnone { - neon_reduce(i32, @llvm.arm.neon.vpadd.v2i32, @add_i32) +define i64 @__reduce_add_int32() nounwind readnone { + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %0) + %a0 = extractelement <2 x i64> %a64, i32 0 + %a1 = extractelement <2 x i64> %a64, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r } declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone diff --git a/builtins/target-sse2-x2.ll 
b/builtins/target-sse2-x2.ll index 73361720..da22a66c 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -367,6 +367,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + define <4 x float> @__vec4_add_float(<4 x float> %v0, <4 x float> %v1) nounwind readnone alwaysinline { %v = fadd <4 x float> %v0, %v1 diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index 2bb06391..a6b206b6 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -267,6 +267,36 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <4 x i16> @__add_varying_i16(<4 x i16>, + <4 x i16>) nounwind readnone alwaysinline { + %r = add <4 x i16> %0, %1 + ret <4 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline { + reduce4(i16, @__add_varying_i16, @__add_uniform_i16) +} + define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline { %v1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index 2044fbee..d1563988 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -253,6 +253,36 @@ define i1 @__none(<8 x MASK>) nounwind readnone alwaysinline { ret i1 %meq } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline 
{ + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + define internal <8 x float> @__add_varying_float(<8 x float>, <8 x float>) { %r = fadd <8 x float> %0, %1 ret <8 x float> %r diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index cd8fdce2..85b7bbe7 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -261,6 +261,33 @@ define i1 @__none(<16 x i8>) nounwind readnone alwaysinline { ret i1 %meq } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline { + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <16 x i16> @__add_varying_i16(<16 x i16>, + <16 x i16>) nounwind readnone alwaysinline { + %r = add <16 x i16> %0, %1 + ret <16 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline { + reduce16(i16, @__add_varying_i16, @__add_uniform_i16) +} + define internal <16 x float> @__add_varying_float(<16 x float>, <16 x float>) { %r = fadd <16 x float> %0, %1 ret <16 x float> %r diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index ccae4d51..e2debbc2 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -309,6 +309,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline { reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float) } diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index f622b839..98a7ef69 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -299,6 +299,36 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 
+  ret i16 %r16
+}
+
+define internal <4 x i16> @__add_varying_i16(<4 x i16>,
+                                             <4 x i16>) nounwind readnone alwaysinline {
+  %r = add <4 x i16> %0, %1
+  ret <4 x i16> %r
+}
+
+define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
+  %r = add i16 %0, %1
+  ret i16 %r
+}
+
+define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
+  reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
+}
+
 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
 
 define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
diff --git a/docs/ispc.rst b/docs/ispc.rst
index c6c63172..39d3a5c8 100755
--- a/docs/ispc.rst
+++ b/docs/ispc.rst
@@ -3711,29 +3711,44 @@ instances are added together by the ``reduce_add()`` function.
 
 ::
 
-    uniform float reduce_add(float x)
-    uniform int reduce_add(int x)
-    uniform unsigned int reduce_add(unsigned int x)
+    uniform int16 reduce_add(int8 x)
+    uniform unsigned int16 reduce_add(unsigned int8 x)
+    uniform int32 reduce_add(int16 x)
+    uniform unsigned int32 reduce_add(unsigned int16 x)
+    uniform int64 reduce_add(int32 x)
+    uniform unsigned int64 reduce_add(unsigned int32 x)
+    uniform int64 reduce_add(int64 x)
+    uniform unsigned int64 reduce_add(unsigned int64 x)
 
-You can also use functions to compute the minimum and maximum value of the
-given value across all of the currently-executing program instances.
+    uniform float reduce_add(float x)
+    uniform double reduce_add(double x)
+
+You can also use functions to compute the minimum value of the given value
+across all of the currently-executing program instances.
 
 ::
 
-    uniform float reduce_min(float a)
     uniform int32 reduce_min(int32 a)
     uniform unsigned int32 reduce_min(unsigned int32 a)
-    uniform double reduce_min(double a)
     uniform int64 reduce_min(int64 a)
     uniform unsigned int64 reduce_min(unsigned int64 a)
-    uniform float reduce_max(float a)
+    uniform float reduce_min(float a)
+    uniform double reduce_min(double a)
+
+Equivalent functions are available to compute the maximum of the given
+varying variable over the active program instances.
+
+::
+
     uniform int32 reduce_max(int32 a)
     uniform unsigned int32 reduce_max(unsigned int32 a)
-    uniform double reduce_max(double a)
     uniform int64 reduce_max(int64 a)
     uniform unsigned int64 reduce_max(unsigned int64 a)
+    uniform float reduce_max(float a)
+    uniform double reduce_max(double a)
+
 Finally, you can check to see if a particular value has the same value in
 all of the currently-running program instances:
@@ -3741,9 +3756,10 @@ all of the currently-running program instances:
 
     uniform bool reduce_equal(int32 v)
     uniform bool reduce_equal(unsigned int32 v)
-    uniform bool reduce_equal(float v)
     uniform bool reduce_equal(int64 v)
     uniform bool reduce_equal(unsigned int64 v)
+
+    uniform bool reduce_equal(float v)
     uniform bool reduce_equal(double)
 
 There are also variants of these functions that return the value as a
@@ -3758,10 +3774,11 @@ performance in the `Performance Guide`_.
uniform bool reduce_equal(int32 v, uniform int32 * uniform sameval) uniform bool reduce_equal(unsigned int32 v, uniform unsigned int32 * uniform sameval) - uniform bool reduce_equal(float v, uniform float * uniform sameval) uniform bool reduce_equal(int64 v, uniform int64 * uniform sameval) uniform bool reduce_equal(unsigned int64 v, uniform unsigned int64 * uniform sameval) + + uniform bool reduce_equal(float v, uniform float * uniform sameval) uniform bool reduce_equal(double, uniform double * uniform sameval) If called when none of the program instances are running, diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index 828c1ab4..6d4fe1f4 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1162,19 +1162,20 @@ REDUCE_ADD(double, __vec16_d, __reduce_add_double) REDUCE_MINMAX(double, __vec16_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec16_d, __reduce_max_double, >) -REDUCE_ADD(uint32_t, __vec16_i32, __reduce_add_int32) +REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec16_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec16_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec16_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >) diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 64b82cb1..12c4f84e 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -1231,19 +1231,20 @@ REDUCE_ADD(double, __vec32_d, __reduce_add_double) REDUCE_MINMAX(double, __vec32_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec32_d, __reduce_max_double, >) -REDUCE_ADD(uint32_t, __vec32_i32, __reduce_add_int32) +REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec32_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec32_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec32_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec32_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, __vec32_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec32_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec32_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec32_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec32_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec32_i64, __reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec32_i64, __reduce_max_uint64, >) diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index 7869faa5..a3648f42 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -1364,19 +1364,20 @@ REDUCE_ADD(double, __vec64_d, __reduce_add_double) REDUCE_MINMAX(double, __vec64_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec64_d, __reduce_max_double, >) 
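+// The reduce_add result type is wider than the element type being summed
+// (int8 -> int16, int16 -> int32, int32 -> int64), so the sum over all of
+// the program instances cannot overflow it.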
-REDUCE_ADD(uint32_t, __vec64_i32, __reduce_add_int32) +REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec64_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec64_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec64_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec64_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec64_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec64_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, __vec64_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec64_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec64_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec64_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec64_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec64_i64, __reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec64_i64, __reduce_max_uint64, >) diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index bf383c88..41c4cbc0 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1511,6 +1511,22 @@ static FORCEINLINE int64_t __count_trailing_zeros_i64(const __vec1_i64 mask) { // reductions /////////////////////////////////////////////////////////////////////////// +static FORCEINLINE int16_t __reduce_add_i8(__vec16_i8 v) { + // TODO: improve this! + int16_t ret = 0; + for (int i = 0; i < 16; ++i) + ret += v.v[i]; + return ret; +} + +static FORCEINLINE int32_t __reduce_add_i16(__vec16_i16 v) { + // TODO: improve this! + int32_t ret = 0; + for (int i = 0; i < 16; ++i) + ret += v.v[i]; + return ret; +} + static FORCEINLINE uint32_t __reduce_add_i32(__vec16_i32 v) { return _mm512_reduce_add_epi32(v); } diff --git a/examples/intrinsics/knc2x.h b/examples/intrinsics/knc2x.h index 0041a6c9..5b6e5295 100644 --- a/examples/intrinsics/knc2x.h +++ b/examples/intrinsics/knc2x.h @@ -1607,6 +1607,9 @@ static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { /////////////////////////////////////////////////////////////////////////// // reductions +REDUCE_ADD(int16_t, __vec32_i8, __reduce_add_int8) +REDUCE_ADD(int32_t, __vec32_i16, __reduce_add_int16) + static FORCEINLINE float __reduce_add_float(__vec32_f v) { return _mm512_reduce_add_ps(v.v1) + _mm512_reduce_add_ps(v.v2); } diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index d4739d61..30f90b31 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -2528,6 +2528,22 @@ static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { /////////////////////////////////////////////////////////////////////////// // reductions +static FORCEINLINE int16_t __reduce_add_int8(__vec4_i8 v) { + // TODO: improve + int16_t ret = 0; + for (int i = 0; i < 4; ++i) + ret += v.v[i]; + return ret; +} + +static FORCEINLINE int32_t __reduce_add_int16(__vec4_i16 v) { + // TODO: improve + int32_t ret = 0; + for (int i = 0; i < 4; ++i) + ret += v.v[i]; + return ret; +} + static FORCEINLINE float __reduce_add_float(__vec4_f v) { float r = bits_as_float(_mm_extract_ps(v.v, 0)); r += bits_as_float(_mm_extract_ps(v.v, 1)); diff --git a/stdlib.ispc b/stdlib.ispc index 7e848481..c9c66252 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -887,13 +887,32 @@ static inline uniform double select(uniform bool c, uniform double a, /////////////////////////////////////////////////////////////////////////// // Horizontal ops / reductions +__declspec(safe) +static inline uniform int16 reduce_add(int8 x) { + return __reduce_add_int8(__mask ? 
x : (int8)0); +} + +__declspec(safe) +static inline uniform unsigned int16 reduce_add(unsigned int8 x) { + return __reduce_add_int8(__mask ? x : (int8)0); +} + +__declspec(safe) +static inline uniform int32 reduce_add(int16 x) { + return __reduce_add_int16(__mask ? x : (int16)0); +} + +__declspec(safe) +static inline uniform unsigned int32 reduce_add(unsigned int16 x) { + return __reduce_add_int16(__mask ? x : (int16)0); +} + __declspec(safe) static inline uniform float reduce_add(float x) { // zero the lanes where the mask is off return __reduce_add_float(__mask ? x : 0.); } - __declspec(safe) static inline uniform float reduce_min(float v) { // For the lanes where the mask is off, replace the given value with @@ -915,7 +934,7 @@ static inline uniform float reduce_max(float v) { } __declspec(safe) -static inline uniform int reduce_add(int x) { +static inline uniform int64 reduce_add(int32 x) { // Zero out the values for lanes that aren't running return __reduce_add_int32(__mask ? x : 0); } @@ -937,7 +956,7 @@ static inline uniform int reduce_max(int v) { } __declspec(safe) -static inline uniform unsigned int reduce_add(unsigned int x) { +static inline uniform unsigned int64 reduce_add(unsigned int32 x) { // Set values for non-running lanes to zero so they don't affect the // result. return __reduce_add_int32(__mask ? x : 0); diff --git a/tests/reduce-add-int16-1.ispc b/tests/reduce-add-int16-1.ispc new file mode 100644 index 00000000..58529ca1 --- /dev/null +++ b/tests/reduce-add-int16-1.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + float v = aFOO[programIndex]; + uniform float m; + int16 iv = (int)v; + if (iv & 1) + m = reduce_add(iv); + RET[programIndex] = m; +} + +export void result(uniform float RET[]) { + uniform int x = 0; + for (uniform int i = 1; i <= programCount; i += 2) + x += i; + RET[programIndex] = x; +} + diff --git a/tests/reduce-add-int16.ispc b/tests/reduce-add-int16.ispc new file mode 100644 index 00000000..8657b201 --- /dev/null +++ b/tests/reduce-add-int16.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + float v = aFOO[programIndex]; + uniform float m; + int16 iv = (int)v; +/*CO if (iv & 1)*/ + m = reduce_add(iv); + RET[programIndex] = m; +} + +export void result(uniform float RET[]) { + uniform int x = 0; + for (uniform int i = 1; i <= programCount; ++i) + x += i; + RET[programIndex] = x; +} + diff --git a/tests/reduce-add-int8-1.ispc b/tests/reduce-add-int8-1.ispc new file mode 100644 index 00000000..e5310aae --- /dev/null +++ b/tests/reduce-add-int8-1.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + float v = aFOO[programIndex]; + uniform float m; + int8 iv = (int)v; + if (iv & 1) + m = reduce_add(iv); + RET[programIndex] = m; +} + +export void result(uniform float RET[]) { + uniform int x = 0; + for (uniform int i = 1; i <= programCount; i += 2) + x += i; + RET[programIndex] = x; +} + diff --git a/tests/reduce-add-int8.ispc b/tests/reduce-add-int8.ispc new file mode 100644 index 00000000..7e0dd027 --- /dev/null +++ b/tests/reduce-add-int8.ispc @@ -0,0 +1,18 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int8 db = b-4; + int8 
iv = programIndex + db; + int m = reduce_add(iv); + RET[programIndex] = m; +} + +export void result(uniform float RET[]) { + uniform int x = 0; + for (uniform int i = 1; i <= programCount; ++i) + x += i; + RET[programIndex] = x; +} + From ab3b633733ec05f3778e46f792a98844e9ee5900 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Mon, 29 Jul 2013 16:14:58 -0700 Subject: [PATCH 14/34] Add 8-bit and 16-bit specialized NEON targets. Like SSE4-8 and SSE4-16, these use 8-bit and 16-bit values for mask elements, respectively, and thus should generate the best code when used for computation with datatypes of those sizes. --- Makefile | 6 +- builtins.cpp | 28 +- builtins/target-neon-16.ll | 458 ++++++++++++++++ .../{target-neon.ll => target-neon-32.ll} | 305 +---------- builtins/target-neon-8.ll | 508 ++++++++++++++++++ builtins/target-neon-common.ll | 351 ++++++++++++ builtins/util.m4 | 120 ++++- ispc.cpp | 41 +- ispc.h | 3 +- ispc.vcxproj | 111 ++-- module.cpp | 1 + run_tests.py | 2 +- 12 files changed, 1561 insertions(+), 373 deletions(-) create mode 100644 builtins/target-neon-16.ll rename builtins/{target-neon.ll => target-neon-32.ll} (62%) create mode 100644 builtins/target-neon-8.ll create mode 100644 builtins/target-neon-common.ll diff --git a/Makefile b/Makefile index fc064dbd..98729bfc 100644 --- a/Makefile +++ b/Makefile @@ -122,8 +122,10 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ type.cpp util.cpp HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h -TARGETS=neon avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \ - sse4-8 sse4-16 generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 +TARGETS=neon-32 neon-16 neon-8 \ + avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ + sse2 sse2-x2 sse4 sse4-x2 sse4-8 sse4-16 \ + generic-1 generic-4 generic-8 generic-16 generic-32 generic-64 # These files need to be compiled in two versions - 32 and 64 bits. BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) # These are files to be compiled in single version. diff --git a/builtins.cpp b/builtins.cpp index 08472623..e671a491 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -657,7 +657,9 @@ AddBitcodeToModule(const unsigned char *bitcode, int length, // the values for an ARM target. This maybe won't cause problems // in the generated code, since bulitins.c doesn't do anything too // complex w.r.t. struct layouts, etc. - if (g->target->getISA() != Target::NEON) + if (g->target->getISA() != Target::NEON32 && + g->target->getISA() != Target::NEON16 && + g->target->getISA() != Target::NEON8) #endif // !__arm__ { Assert(bcTriple.getArch() == llvm::Triple::UnknownArch || @@ -820,12 +822,30 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // Next, add the target's custom implementations of the various needed // builtin functions (e.g. __masked_store_32(), etc). 
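+    // For the NEON targets, the bitcode module suffix encodes the mask
+    // element width (8, 16, or 32 bits) and whether the 32-bit or 64-bit
+    // runtime is being targeted.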
switch (g->target->getISA()) { - case Target::NEON: { + case Target::NEON8: { if (runtime32) { - EXPORT_MODULE(builtins_bitcode_neon_32bit); + EXPORT_MODULE(builtins_bitcode_neon_8_32bit); } else { - EXPORT_MODULE(builtins_bitcode_neon_64bit); + EXPORT_MODULE(builtins_bitcode_neon_8_64bit); + } + break; + } + case Target::NEON16: { + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_neon_16_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_neon_16_64bit); + } + break; + } + case Target::NEON32: { + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_neon_32_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_neon_32_64bit); } break; } diff --git a/builtins/target-neon-16.ll b/builtins/target-neon-16.ll new file mode 100644 index 00000000..fd15eb0b --- /dev/null +++ b/builtins/target-neon-16.ll @@ -0,0 +1,458 @@ +;; +;; target-neon-16.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`8') +define(`MASK',`i16') + +include(`util.m4') +include(`target-neon-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone { + unary4to8conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v) + ret <8 x float> %r +} + +define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone { + unary4to8conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v) + ret <8 x i16> %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? 
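+;; The approach borrowed from SSE2 works like this: clear the sign bit to get
+;; |x|, add and then subtract 2^23 (8.388608e+06) so that the float mantissa
+;; drops its fractional bits and |x| is rounded to the nearest integer (valid
+;; for |x| < 2^23), then xor the saved sign bit back in.  floor() and ceil()
+;; are derived from round() by adding -1.0 or +1.0 in the lanes where the
+;; rounded value overshot or undershot the original value.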
+ +define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast <8 x float> %0 to <8 x i32> + %bitop.i.i = and <8 x i32> %float_to_int_bitcast.i.i.i.i, + + %bitop.i = xor <8 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i40.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %int_to_float_bitcast.i.i40.i, + + %binop21.i = fadd <8 x float> %binop.i, + + %float_to_int_bitcast.i.i.i = bitcast <8 x float> %binop21.i to <8 x i32> + %bitop31.i = xor <8 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop31.i to <8 x float> + ret <8 x float> %int_to_float_bitcast.i.i.i +} + +define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind + %bincmp.i = fcmp ogt <8 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32> + %bitop.i = and <8 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <8 x float> %binop.i +} + +define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind + %bincmp.i = fcmp olt <8 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32> + %bitop.i = and <8 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <8 x float> %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare @__round_varying_double() nounwind readnone +declare @__floor_varying_double() nounwind readnone +declare @__ceil_varying_double() nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__max_varying_float(, + ) nounwind readnone { + binary4to8(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1) + ret %r +} + +define @__min_varying_float(, + ) nounwind readnone { + binary4to8(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1) + ret %r +} + +declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +define @__min_varying_int32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1) + ret %r +} + +define @__max_varying_int32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1) + ret %r +} + +define @__min_varying_uint32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1) + ret %r +} + +define @__max_varying_uint32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1) + ret %r +} + +;; sqrt/rsqrt/rcp + +declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) 
nounwind readnone + +define @__rcp_varying_float( %d) nounwind readnone { + unary4to8(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d) + binary4to8(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0) + %x1 = fmul %x0, %x0_nr + binary4to8(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %d) nounwind readnone { + unary4to8(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d) + %x0_2 = fmul %x0, %x0 + binary4to8(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2) + %x1 = fmul %x0, %x0_nr + %x1_2 = fmul %x1, %x1 + binary4to8(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +define float @__rsqrt_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <8 x i32> + %vr = call <8 x float> @__rsqrt_varying_float(<8 x float> %vs) + %r = extractelement <8 x float> %vr, i32 0 + ret float %r +} + +define float @__rcp_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <8 x i32> + %vr = call <8 x float> @__rcp_varying_float(<8 x float> %vs) + %r = extractelement <8 x float> %vr, i32 0 + ret float %r +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + +define @__sqrt_varying_float() nounwind readnone { + unary4to8(result, float, @llvm.sqrt.v4f32, %0) +;; this returns nan for v=0, which is undesirable.. +;; %rsqrt = call @__rsqrt_varying_float( %0) +;; %result = fmul <4 x float> %rsqrt, %0 + ret <8 x float> %result +} + +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) + +define @__sqrt_varying_double() nounwind readnone { + unary4to8(r, double, @llvm.sqrt.v4f64, %0) + ret %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reductions + +define i64 @__movmsk() nounwind readnone { + %and_mask = and %0, + + %v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %and_mask) + %v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4) + %va = extractelement <2 x i64> %v2, i32 0 + %vb = extractelement <2 x i64> %v2, i32 1 + %v = or i64 %va, %vb + ret i64 %v +} + +define i1 @__any() nounwind readnone alwaysinline { + v8tov4(MASK, %0, %v0123, %v4567) + %vor = or <4 x MASK> %v0123, %v4567 + %v0 = extractelement <4 x MASK> %vor, i32 0 + %v1 = extractelement <4 x MASK> %vor, i32 1 + %v2 = extractelement <4 x MASK> %vor, i32 2 + %v3 = extractelement <4 x MASK> %vor, i32 3 + %v01 = or MASK %v0, %v1 + %v23 = or MASK %v2, %v3 + %v = or MASK %v01, %v23 + %cmp = icmp ne MASK %v, 0 + ret i1 %cmp +} + +define i1 @__all() nounwind readnone alwaysinline { + v8tov4(MASK, %0, %v0123, %v4567) + %vand = and <4 x MASK> %v0123, %v4567 + %v0 = extractelement <4 x MASK> %vand, i32 0 + %v1 = extractelement <4 x MASK> %vand, i32 1 + %v2 = extractelement <4 x MASK> %vand, i32 2 + %v3 = extractelement <4 x MASK> %vand, i32 3 + %v01 = and MASK %v0, %v1 + %v23 = and MASK %v2, %v3 + %v = and MASK %v01, %v23 + %cmp = icmp ne MASK %v, 0 + ret i1 %cmp +} + +define i1 @__none() nounwind readnone alwaysinline { + %any = call i1 @__any( %0) + %none = icmp eq i1 %any, 0 + ret i1 %none +} + +;; $1: scalar type +;; $2: vector/vector reduce function (2 x -> ) +;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>) 
+;; $4: scalar reduce function + +define(`neon_reduce', ` + v8tov4($1, %0, %v0123, %v4567) + %v0123_8 = shufflevector <4 x $1> %v0123, <4 x $1> undef, + <8 x i32> + %v4567_8 = shufflevector <4 x $1> %v4567, <4 x $1> undef, + <8 x i32> + %vfirst = call <8 x $1> $2(<8 x $1> %v0123_8, <8 x $1> %v4567_8) + %vfirst_4 = shufflevector <8 x $1> %vfirst, <8 x $1> undef, + <4 x i32> + v4tov2($1, %vfirst_4, %v0, %v1) + %vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1) + %vh0 = extractelement <2 x $1> %vh, i32 0 + %vh1 = extractelement <2 x $1> %vh, i32 1 + %r = call $1 $4($1 %vh0, $1 %vh1) + ret $1 %r +') + +declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @add_f32(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define internal @__add_varying_float(, ) { + %r = fadd %0, %1 + ret %r +} + +define float @__reduce_add_float() nounwind readnone { + neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @min_f32(float, float) { + %cmp = fcmp olt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_min_float() nounwind readnone { + neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @max_f32(float, float) { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_max_float() nounwind readnone { + neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) +} + +declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone + +define i16 @__reduce_add_int8() nounwind readnone { + %a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %0) + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 + %r16 = trunc i32 %r to i16 + ret i16 %r16 +} + +declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16() + +define i64 @__reduce_add_int16() nounwind readnone { + %a1 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16( %0) + %a2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a1) + %aa = extractelement <2 x i64> %a2, i32 0 + %ab = extractelement <2 x i64> %a2, i32 1 + %r = add i64 %aa, %ab + ret i64 %r +} + +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone + +define i64 @__reduce_add_int32() nounwind readnone { + v8tov4(i32, %0, %va, %vb) + %pa = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va) + %pb = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb) + %psum = add <2 x i64> %pa, %pb + %a0 = extractelement <2 x i64> %psum, i32 0 + %a1 = extractelement <2 x i64> %psum, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_si32(i32, i32) { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_int32() nounwind readnone { + neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32) +} + +declare <2 x i32> 
@llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_si32(i32, i32) { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_int32() nounwind readnone { + neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_ui32(i32, i32) { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_uint32() nounwind readnone { + neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpmins.v2i32, @min_ui32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_ui32(i32, i32) { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_uint32() nounwind readnone { + neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxs.v2i32, @max_ui32) +} + +define double @__reduce_add_double() nounwind readnone { + v8tov2(double, %0, %v0, %v1, %v2, %v3) + %v01 = fadd <2 x double> %v0, %v1 + %v23 = fadd <2 x double> %v2, %v3 + %sum = fadd <2 x double> %v01, %v23 + %e0 = extractelement <2 x double> %sum, i32 0 + %e1 = extractelement <2 x double> %sum, i32 1 + %m = fadd double %e0, %e1 + ret double %m +} + +define double @__reduce_min_double() nounwind readnone { + reduce8(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double() nounwind readnone { + reduce8(double, @__max_varying_double, @__max_uniform_double) +} + +define i64 @__reduce_add_int64() nounwind readnone { + v8tov2(i64, %0, %v0, %v1, %v2, %v3) + %v01 = add <2 x i64> %v0, %v1 + %v23 = add <2 x i64> %v2, %v3 + %sum = add <2 x i64> %v01, %v23 + %e0 = extractelement <2 x i64> %sum, i32 0 + %e1 = extractelement <2 x i64> %sum, i32 1 + %m = add i64 %e0, %e1 + ret i64 %m +} + +define i64 @__reduce_min_int64() nounwind readnone { + reduce8(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64() nounwind readnone { + reduce8(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64() nounwind readnone { + reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64() nounwind readnone { + reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) +} diff --git a/builtins/target-neon.ll b/builtins/target-neon-32.ll similarity index 62% rename from builtins/target-neon.ll rename to builtins/target-neon-32.ll index fbeac352..1f8003d7 100644 --- a/builtins/target-neon.ll +++ b/builtins/target-neon-32.ll @@ -1,5 +1,5 @@ ;; -;; target-neon.ll +;; target-neon-32.ll ;; ;; Copyright(c) 2012-2013 Matt Pharr ;; Copyright(c) 2013 Google, Inc. @@ -34,52 +34,20 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
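+;; The datalayout string and the scalar/uniform helpers that this file used to
+;; define are presumably provided by target-neon-common.ll (included below),
+;; so only the 4-wide varying code remains here.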
-target datalayout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-a0:0:64-n32" - define(`WIDTH',`4') - define(`MASK',`i32') include(`util.m4') - -stdlib_core() -scans() -reduce_equal(WIDTH) -rdrand_decls() -define_shuffles() -aossoa() -ctlztz() +include(`target-neon-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines -declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone -declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone - -define float @__half_to_float_uniform(i16 %v) nounwind readnone { - %v1 = bitcast i16 %v to <1 x i16> - %vec = shufflevector <1 x i16> %v1, <1 x i16> undef, - <4 x i32> - %h = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vec) - %r = extractelement <4 x float> %h, i32 0 - ret float %r -} - define <4 x float> @__half_to_float_varying(<4 x i16> %v) nounwind readnone { %r = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %v) ret <4 x float> %r } -define i16 @__float_to_half_uniform(float %v) nounwind readnone { - %v1 = bitcast float %v to <1 x float> - %vec = shufflevector <1 x float> %v1, <1 x float> undef, - <4 x i32> - %h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %vec) - %r = extractelement <4 x i16> %h, i32 0 - ret i16 %r -} - - define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone { %r = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %v) ret <4 x i16> %r @@ -88,48 +56,11 @@ define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; math -define void @__fastmath() nounwind { - ret void -} - ;; round/floor/ceil ;; FIXME: grabbed these from the sse2 target, which does not have native ;; instructions for these. Is there a better approach for NEON? 
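+;; (Same trick as in the 8- and 16-wide NEON targets: round via the 2^23
+;; add/subtract on the sign-cleared value, then derive floor/ceil by adding
+;; -1.0/+1.0 where the rounded result overshoots/undershoots the input.)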
-define float @__round_uniform_float(float) nounwind readonly alwaysinline { - %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32 - %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648 - %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i - %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06 - %binop21.i = fadd float %binop.i, -8.388608e+06 - %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32 - %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float - ret float %int_to_float_bitcast.i.i.i -} - -define float @__floor_uniform_float(float) nounwind readonly alwaysinline { - %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind - %bincmp.i = fcmp ogt float %calltmp.i, %0 - %selectexpr.i = sext i1 %bincmp.i to i32 - %bitop.i = and i32 %selectexpr.i, -1082130432 - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i - ret float %binop.i -} - -define float @__ceil_uniform_float(float) nounwind readonly alwaysinline { - %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind - %bincmp.i = fcmp olt float %calltmp.i, %0 - %selectexpr.i = sext i1 %bincmp.i to i32 - %bitop.i = and i32 %selectexpr.i, 1065353216 - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i - ret float %binop.i -} - define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline { %float_to_int_bitcast.i.i.i.i = bitcast <4 x float> %0 to <4 x i32> %bitop.i.i = and <4 x i32> %float_to_int_bitcast.i.i.i.i, @@ -164,10 +95,6 @@ define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysin } ;; FIXME: rounding doubles and double vectors needs to be implemented -declare double @__round_uniform_double(double) nounwind readnone -declare double @__floor_uniform_double(double) nounwind readnone -declare double @__ceil_uniform_double(double) nounwind readnone - declare @__round_varying_double() nounwind readnone declare @__floor_varying_double() nounwind readnone declare @__ceil_varying_double() nounwind readnone @@ -175,78 +102,6 @@ declare @__ceil_varying_double() nounwind readn ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; min/max -define float @__max_uniform_float(float, float) nounwind readnone { - %cmp = fcmp ugt float %0, %1 - %r = select i1 %cmp, float %0, float %1 - ret float %r -} - -define float @__min_uniform_float(float, float) nounwind readnone { - %cmp = fcmp ult float %0, %1 - %r = select i1 %cmp, float %0, float %1 - ret float %r -} - -define i32 @__min_uniform_int32(i32, i32) nounwind readnone { - %cmp = icmp slt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__max_uniform_int32(i32, i32) nounwind readnone { - %cmp = icmp sgt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__min_uniform_uint32(i32, i32) nounwind readnone { - %cmp = icmp ult i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__max_uniform_uint32(i32, i32) nounwind readnone { - %cmp = icmp ugt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i64 @__min_uniform_int64(i64, i64) nounwind readnone { - %cmp = icmp slt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__max_uniform_int64(i64, 
i64) nounwind readnone { - %cmp = icmp sgt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__min_uniform_uint64(i64, i64) nounwind readnone { - %cmp = icmp ult i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__max_uniform_uint64(i64, i64) nounwind readnone { - %cmp = icmp ugt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define double @__min_uniform_double(double, double) nounwind readnone { - %cmp = fcmp olt double %0, %1 - %r = select i1 %cmp, double %0, double %1 - ret double %r -} - -define double @__max_uniform_double(double, double) nounwind readnone { - %cmp = fcmp ogt double %0, %1 - %r = select i1 %cmp, double %0, double %1 - ret double %r -} - declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone @@ -287,44 +142,6 @@ define @__max_varying_uint32(, ) nounwin ret <4 x i32> %r } -define @__min_varying_int64(, ) nounwind readnone { - %m = icmp slt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_int64(, ) nounwind readnone { - %m = icmp sgt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__min_varying_uint64(, ) nounwind readnone { - %m = icmp ult %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_uint64(, ) nounwind readnone { - %m = icmp ugt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__min_varying_double(, - ) nounwind readnone { - %m = fcmp olt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_double(, - ) nounwind readnone { - %m = fcmp ogt %0, %1 - %r = select %m, %0, %1 - ret %r -} - ;; sqrt/rsqrt/rcp declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone @@ -371,13 +188,6 @@ define float @__rcp_uniform_float(float) nounwind readnone { ret float %r } -declare float @llvm.sqrt.f32(float) - -define float @__sqrt_uniform_float(float) nounwind readnone { - %r = call float @llvm.sqrt.f32(float %0) - ret float %r -} - declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) define @__sqrt_varying_float() nounwind readnone { @@ -388,13 +198,6 @@ define @__sqrt_varying_float() nounwind readnone ret <4 x float> %result } -declare double @llvm.sqrt.f64(double) - -define double @__sqrt_uniform_double(double) nounwind readnone { - %r = call double @llvm.sqrt.f64(double %0) - ret double %r -} - declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) define @__sqrt_varying_double() nounwind readnone { @@ -402,21 +205,6 @@ define @__sqrt_varying_double() nounwind readno ret <4 x double> %r } -;; bit ops - -declare i32 @llvm.ctpop.i32(i32) nounwind readnone -declare i64 @llvm.ctpop.i64(i64) nounwind readnone - -define i32 @__popcnt_int32(i32) nounwind readnone { - %v = call i32 @llvm.ctpop.i32(i32 %0) - ret i32 %v -} - -define i64 @__popcnt_int64(i64) nounwind readnone { - %v = call i64 @llvm.ctpop.i64(i64 %0) - ret i64 %v -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions @@ -638,92 +426,3 @@ define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone { define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone { reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64) } - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; unaligned loads/loads+broadcasts - -masked_load(i8, 1) -masked_load(i16, 2) -masked_load(i32, 4) -masked_load(float, 4) -masked_load(i64, 8) -masked_load(double, 8) - -gen_masked_store(i8) 
-gen_masked_store(i16) -gen_masked_store(i32) -gen_masked_store(i64) -masked_store_float_double() - -define void @__masked_store_blend_i8(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i8> %new, <4 x i8> %old - store %result, * %ptr - ret void -} - -define void @__masked_store_blend_i16(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i16> %new, <4 x i16> %old - store %result, * %ptr - ret void -} - -define void @__masked_store_blend_i32(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i32> %new, <4 x i32> %old - store %result, * %ptr - ret void -} - -define void @__masked_store_blend_i64(* nocapture %ptr, - %new, %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i64> %new, <4 x i64> %old - store %result, * %ptr - ret void -} - -;; yuck. We need declarations of these, even though we shouldnt ever -;; actually generate calls to them for the NEON target... - -declare @__svml_sin() -declare @__svml_cos() -declare void @__svml_sincos(, *, *) -declare @__svml_tan() -declare @__svml_atan() -declare @__svml_atan2(, ) -declare @__svml_exp() -declare @__svml_log() -declare @__svml_pow(, ) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; gather - -gen_gather_factored(i8) -gen_gather_factored(i16) -gen_gather_factored(i32) -gen_gather_factored(float) -gen_gather_factored(i64) -gen_gather_factored(double) - -gen_scatter(i8) -gen_scatter(i16) -gen_scatter(i32) -gen_scatter(float) -gen_scatter(i64) -gen_scatter(double) - -packed_load_and_store(4) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; prefetch - -define_prefetches() diff --git a/builtins/target-neon-8.ll b/builtins/target-neon-8.ll new file mode 100644 index 00000000..eb65f224 --- /dev/null +++ b/builtins/target-neon-8.ll @@ -0,0 +1,508 @@ +;; +;; target-neon-8.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`16') +define(`MASK',`i8') + +include(`util.m4') +include(`target-neon-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone { + unary4to16conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v) + ret <16 x float> %r +} + +define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone { + unary4to16conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v) + ret <16 x i16> %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast <16 x float> %0 to <16 x i32> + %bitop.i.i = and <16 x i32> %float_to_int_bitcast.i.i.i.i, + + %bitop.i = xor <16 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i40.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %int_to_float_bitcast.i.i40.i, + + %binop21.i = fadd <16 x float> %binop.i, + + %float_to_int_bitcast.i.i.i = bitcast <16 x float> %binop21.i to <16 x i32> + %bitop31.i = xor <16 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop31.i to <16 x float> + ret <16 x float> %int_to_float_bitcast.i.i.i +} + +define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind + %bincmp.i = fcmp ogt <16 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32> + %bitop.i = and <16 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <16 x float> %binop.i +} + +define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind + %bincmp.i = fcmp olt <16 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32> + %bitop.i = and <16 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <16 x float> %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare @__round_varying_double() nounwind readnone +declare @__floor_varying_double() nounwind readnone +declare @__ceil_varying_double() nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind 
readnone +declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__max_varying_float(, + ) nounwind readnone { + binary4to16(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1) + ret %r +} + +define @__min_varying_float(, + ) nounwind readnone { + binary4to16(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1) + ret %r +} + +declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +define @__min_varying_int32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1) + ret %r +} + +define @__max_varying_int32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1) + ret %r +} + +define @__min_varying_uint32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1) + ret %r +} + +define @__max_varying_uint32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1) + ret %r +} + +;; sqrt/rsqrt/rcp + +declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rcp_varying_float( %d) nounwind readnone { + unary4to16(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d) + binary4to16(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0) + %x1 = fmul %x0, %x0_nr + binary4to16(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %d) nounwind readnone { + unary4to16(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d) + %x0_2 = fmul %x0, %x0 + binary4to16(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2) + %x1 = fmul %x0, %x0_nr + %x1_2 = fmul %x1, %x1 + binary4to16(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +define float @__rsqrt_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <16 x i32> + %vr = call <16 x float> @__rsqrt_varying_float(<16 x float> %vs) + %r = extractelement <16 x float> %vr, i32 0 + ret float %r +} + +define float @__rcp_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <16 x i32> + %vr = call <16 x float> @__rcp_varying_float(<16 x float> %vs) + %r = extractelement <16 x float> %vr, i32 0 + ret float %r +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + +define @__sqrt_varying_float() nounwind readnone { + unary4to16(result, float, @llvm.sqrt.v4f32, %0) +;; this returns nan for v=0, which is undesirable.. 
+;; %rsqrt = call @__rsqrt_varying_float( %0) +;; %result = fmul <4 x float> %rsqrt, %0 + ret <16 x float> %result +} + +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) + +define @__sqrt_varying_double() nounwind readnone { + unary4to16(r, double, @llvm.sqrt.v4f64, %0) + ret %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reductions + +define i64 @__movmsk() nounwind readnone { + %and_mask = and %0, + + %v8 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %and_mask) + %v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %v8) + %v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4) + %va = extractelement <2 x i64> %v2, i32 0 + %vb = extractelement <2 x i64> %v2, i32 1 + %vbshift = shl i64 %vb, 8 + %v = or i64 %va, %vbshift + ret i64 %v +} + +define i1 @__any() nounwind readnone alwaysinline { + v16tov8(MASK, %0, %v8a, %v8b) + %vor8 = or <8 x MASK> %v8a, %v8b + %v16 = sext <8 x i8> %vor8 to <8 x i16> + v8tov4(i16, %v16, %v16a, %v16b) + %vor16 = or <4 x i16> %v16a, %v16b + %v32 = sext <4 x i16> %vor16 to <4 x i32> + v4tov2(i32, %v32, %v32a, %v32b) + %vor32 = or <2 x i32> %v32a, %v32b + %v0 = extractelement <2 x i32> %vor32, i32 0 + %v1 = extractelement <2 x i32> %vor32, i32 1 + %v = or i32 %v0, %v1 + %cmp = icmp ne i32 %v, 0 + ret i1 %cmp +} + +define i1 @__all() nounwind readnone alwaysinline { + v16tov8(MASK, %0, %v8a, %v8b) + %vand8 = and <8 x MASK> %v8a, %v8b + %v16 = sext <8 x i8> %vand8 to <8 x i16> + v8tov4(i16, %v16, %v16a, %v16b) + %vand16 = and <4 x i16> %v16a, %v16b + %v32 = sext <4 x i16> %vand16 to <4 x i32> + v4tov2(i32, %v32, %v32a, %v32b) + %vand32 = and <2 x i32> %v32a, %v32b + %v0 = extractelement <2 x i32> %vand32, i32 0 + %v1 = extractelement <2 x i32> %vand32, i32 1 + %v = and i32 %v0, %v1 + %cmp = icmp ne i32 %v, 0 + ret i1 %cmp +} + +define i1 @__none() nounwind readnone alwaysinline { + %any = call i1 @__any( %0) + %none = icmp eq i1 %any, 0 + ret i1 %none +} + +;; $1: scalar type +;; $2: vector/vector reduce function (2 x -> ) +;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>) +;; $4: scalar reduce function + +define(`neon_reduce', ` + v16tov8($1, %0, %va, %vb) + %va_16 = shufflevector <8 x $1> %va, <8 x $1> undef, + <16 x i32> + %vb_16 = shufflevector <8 x $1> %vb, <8 x $1> undef, + <16 x i32> + %v8 = call <16 x $1> $2(<16 x $1> %va_16, <16 x $1> %vb_16) + + %v8a = shufflevector <16 x $1> %v8, <16 x $1> undef, + <16 x i32> + %v8b = shufflevector <16 x $1> %v8, <16 x $1> undef, + <16 x i32> + + %v4 = call <16 x $1> $2(<16 x $1> %v8a, <16 x $1> %v8b) + + %vfirst_4 = shufflevector <16 x $1> %v4, <16 x $1> undef, + <4 x i32> + v4tov2($1, %vfirst_4, %v0, %v1) + %vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1) + %vh0 = extractelement <2 x $1> %vh, i32 0 + %vh1 = extractelement <2 x $1> %vh, i32 1 + %r = call $1 $4($1 %vh0, $1 %vh1) + ret $1 %r +') + +declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @add_f32(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define internal @__add_varying_float(, ) { + %r = fadd %0, %1 + ret %r +} + +define float @__reduce_add_float() nounwind readnone { + neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @min_f32(float, float) { + %cmp = fcmp olt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret 
float %r +} + +define float @__reduce_min_float() nounwind readnone { + neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @max_f32(float, float) { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_max_float() nounwind readnone { + neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) +} + +declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone + +define i64 @__reduce_add_int8() nounwind readnone { + %a16 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %0) + %a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a16) + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32) + %a0 = extractelement <2 x i64> %a64, i32 0 + %a1 = extractelement <2 x i64> %a64, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +define i64 @__reduce_add_int16() nounwind readnone { + v16tov8(i16, %0, %va, %vb) + %a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %va) + %b32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %vb) + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32) + %b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %b32) + %sum = add <2 x i64> %a64, %b64 + %a0 = extractelement <2 x i64> %sum, i32 0 + %a1 = extractelement <2 x i64> %sum, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +define i64 @__reduce_add_int32() nounwind readnone { + v16tov4(i32, %0, %va, %vb, %vc, %vd) + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va) + %b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb) + %c64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vc) + %d64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vd) + %ab = add <2 x i64> %a64, %b64 + %cd = add <2 x i64> %c64, %d64 + %sum = add <2 x i64> %ab, %cd + %a0 = extractelement <2 x i64> %sum, i32 0 + %a1 = extractelement <2 x i64> %sum, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_si32(i32, i32) { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_int32() nounwind readnone { + neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_si32(i32, i32) { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_int32() nounwind readnone { + neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_ui32(i32, i32) { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_uint32() nounwind readnone { + neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpmins.v2i32, @min_ui32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 
@max_ui32(i32, i32) { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_uint32() nounwind readnone { + neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxs.v2i32, @max_ui32) +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define internal @__add_varying_double(, ) { + %r = fadd %0, %1 + ret %r +} + +define double @__reduce_add_double() nounwind readnone { + reduce16(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double() nounwind readnone { + reduce16(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double() nounwind readnone { + reduce16(double, @__max_varying_double, @__max_uniform_double) +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define internal @__add_varying_int64(, ) { + %r = add %0, %1 + ret %r +} + +define i64 @__reduce_add_int64() nounwind readnone { + reduce16(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define i64 @__reduce_min_int64() nounwind readnone { + reduce16(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64() nounwind readnone { + reduce16(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64() nounwind readnone { + reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64() nounwind readnone { + reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64) +} diff --git a/builtins/target-neon-common.ll b/builtins/target-neon-common.ll new file mode 100644 index 00000000..696b0748 --- /dev/null +++ b/builtins/target-neon-common.ll @@ -0,0 +1,351 @@ +;; +;; target-neon-common.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
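;; Illustrative note (not part of the original patch): the 16-wide NEON
;; target above implements operations wider than the 128-bit NEON
;; registers by splitting values into <4 x ...> pieces, applying the
;; 4-wide intrinsic to each piece, and reassembling the result with
;; shufflevector.  For example,
;;   binary4to16(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1)
;; in @__max_varying_float expands to lane-extracting shufflevectors for
;; lanes 0-3, 4-7, 8-11, and 12-15 of each operand, four calls to
;; @llvm.arm.neon.vmaxs.v4f32, and concatenating shufflevectors that
;; rebuild the 16-wide %r; the unary4to16() definition in the
;; builtins/util.m4 hunk later in this series shows the one-operand
;; analogue of this expansion.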
+ +target datalayout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-a0:0:64-n32" + +stdlib_core() +scans() +reduce_equal(WIDTH) +rdrand_decls() +define_shuffles() +aossoa() +ctlztz() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone + +define float @__half_to_float_uniform(i16 %v) nounwind readnone { + %v1 = bitcast i16 %v to <1 x i16> + %vec = shufflevector <1 x i16> %v1, <1 x i16> undef, + <4 x i32> + %h = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vec) + %r = extractelement <4 x float> %h, i32 0 + ret float %r +} + +define i16 @__float_to_half_uniform(float %v) nounwind readnone { + %v1 = bitcast float %v to <1 x float> + %vec = shufflevector <1 x float> %v1, <1 x float> undef, + <4 x i32> + %h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %vec) + %r = extractelement <4 x i16> %h, i32 0 + ret i16 %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +define void @__fastmath() nounwind { + ret void +} + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define float @__round_uniform_float(float) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32 + %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648 + %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i + %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06 + %binop21.i = fadd float %binop.i, -8.388608e+06 + %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32 + %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float + ret float %int_to_float_bitcast.i.i.i +} + +define float @__floor_uniform_float(float) nounwind readonly alwaysinline { + %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind + %bincmp.i = fcmp ogt float %calltmp.i, %0 + %selectexpr.i = sext i1 %bincmp.i to i32 + %bitop.i = and i32 %selectexpr.i, -1082130432 + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i + ret float %binop.i +} + +define float @__ceil_uniform_float(float) nounwind readonly alwaysinline { + %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind + %bincmp.i = fcmp olt float %calltmp.i, %0 + %selectexpr.i = sext i1 %bincmp.i to i32 + %bitop.i = and i32 %selectexpr.i, 1065353216 + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i + ret float %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare double @__round_uniform_double(double) nounwind readnone +declare double @__floor_uniform_double(double) nounwind readnone +declare double @__ceil_uniform_double(double) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +define float @__max_uniform_float(float, float) nounwind readnone { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__min_uniform_float(float, 
float) nounwind readnone { + %cmp = fcmp ult float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define i32 @__min_uniform_int32(i32, i32) nounwind readnone { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__max_uniform_int32(i32, i32) nounwind readnone { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__min_uniform_uint32(i32, i32) nounwind readnone { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__max_uniform_uint32(i32, i32) nounwind readnone { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i64 @__min_uniform_int64(i64, i64) nounwind readnone { + %cmp = icmp slt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__max_uniform_int64(i64, i64) nounwind readnone { + %cmp = icmp sgt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__min_uniform_uint64(i64, i64) nounwind readnone { + %cmp = icmp ult i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__max_uniform_uint64(i64, i64) nounwind readnone { + %cmp = icmp ugt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define double @__min_uniform_double(double, double) nounwind readnone { + %cmp = fcmp olt double %0, %1 + %r = select i1 %cmp, double %0, double %1 + ret double %r +} + +define double @__max_uniform_double(double, double) nounwind readnone { + %cmp = fcmp ogt double %0, %1 + %r = select i1 %cmp, double %0, double %1 + ret double %r +} + +define @__min_varying_int64(, ) nounwind readnone { + %m = icmp slt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_int64(, ) nounwind readnone { + %m = icmp sgt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__min_varying_uint64(, ) nounwind readnone { + %m = icmp ult %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_uint64(, ) nounwind readnone { + %m = icmp ugt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__min_varying_double(, + ) nounwind readnone { + %m = fcmp olt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_double(, + ) nounwind readnone { + %m = fcmp ogt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +;; sqrt/rsqrt/rcp + +declare float @llvm.sqrt.f32(float) + +define float @__sqrt_uniform_float(float) nounwind readnone { + %r = call float @llvm.sqrt.f32(float %0) + ret float %r +} + +declare double @llvm.sqrt.f64(double) + +define double @__sqrt_uniform_double(double) nounwind readnone { + %r = call double @llvm.sqrt.f64(double %0) + ret double %r +} + +;; bit ops + +declare i32 @llvm.ctpop.i32(i32) nounwind readnone +declare i64 @llvm.ctpop.i64(i64) nounwind readnone + +define i32 @__popcnt_int32(i32) nounwind readnone { + %v = call i32 @llvm.ctpop.i32(i32 %0) + ret i32 %v +} + +define i64 @__popcnt_int64(i64) nounwind readnone { + %v = call i64 @llvm.ctpop.i64(i64 %0) + ret i64 %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) +masked_store_float_double() + +define void @__masked_store_blend_i8(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc 
%mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i16(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i32(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i64(* nocapture %ptr, + %new, %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +;; yuck. We need declarations of these, even though we shouldnt ever +;; actually generate calls to them for the NEON target... + +declare @__svml_sin() +declare @__svml_cos() +declare void @__svml_sincos(, *, *) +declare @__svml_tan() +declare @__svml_atan() +declare @__svml_atan2(, ) +declare @__svml_exp() +declare @__svml_log() +declare @__svml_pow(, ) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) + +packed_load_and_store(4) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; prefetch + +define_prefetches() diff --git a/builtins/util.m4 b/builtins/util.m4 index ee45ebc7..1f85e2cc 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -49,6 +49,53 @@ define(`MASK_HIGH_BIT_ON', ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; vector assembly and deconstruction utilities +;; split 8-wide vector into 2 4-wide vectors +;; +;; $1: vector element type +;; $2: 8-wide vector +;; $3: first 4-wide vector +;; $4: second 4-wide vector + +define(`v8tov4', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, + <4 x i32> + $4 = shufflevector <8 x $1> $2, <8 x $1> undef, + <4 x i32> +') + +define(`v16tov8', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> + $4 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> +') + +;; 4-wide into 2 2-wide +;; args as above +;; + +define(`v4tov2', ` + $3 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> + $4 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> +') + +define(`v8tov2', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $4 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $5 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $6 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> +') + +define(`v16tov4', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $4 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $5 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $6 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Helper macro for calling various SSE instructions for scalar values ;; but where the instruction takes a vector parameter. 
;; $1 : name of variable to put the final value in @@ -156,10 +203,7 @@ define(`reduce16', ` ;; the final reduction define(`reduce8by4', ` - %v1 = shufflevector <8 x $1> %0, <8 x $1> undef, - <4 x i32> - %v2 = shufflevector <8 x $1> %0, <8 x $1> undef, - <4 x i32> + v8tov4($1, %0, %v1, %v2) %m1 = call <4 x $1> $2(<4 x $1> %v1, <4 x $1> %v2) %v3 = shufflevector <4 x $1> %m1, <4 x $1> undef, <4 x i32> @@ -266,30 +310,66 @@ define(`binary2to4', ` ;; $4: 8-wide operand value define(`unary4to8', ` - %$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> - %v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0) - %$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> - %v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1) - %$1 = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, + %__$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> + %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0) + %__$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> + %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1) + %$1 = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, + <8 x i32> +' +) + +;; $1: name of variable into which the final result should go +;; $2: scalar type of the input vector elements +;; $3: scalar type of the result vector elements +;; $4: 4-wide unary vector function to apply +;; $5: 8-wide operand value + +define(`unary4to8conv', ` + %$1_0 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> + %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0) + %$1_1 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> + %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1) + %$1 = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, <8 x i32> ' ) define(`unary4to16', ` - %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0) - %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1) - %$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_2 = call <4 x $2> $3(<4 x $2> %$1_2) - %$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_3 = call <4 x $2> $3(<4 x $2> %$1_3) + %__$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0) + %__$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1) + %__$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_2 = call <4 x $2> $3(<4 x $2> %__$1_2) + %__$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_3 = call <4 x $2> $3(<4 x $2> %__$1_3) - %$1a = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, + %__$1a = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, <8 x i32> - %$1b = shufflevector <4 x $2> %v$1_2, <4 x $2> %v$1_3, + %__$1b = shufflevector <4 x $2> %__v$1_2, <4 x $2> %__v$1_3, <8 x i32> - %$1 = shufflevector <8 x $2> %$1a, <8 x $2> %$1b, + %$1 = shufflevector <8 x $2> %__$1a, <8 x $2> %__$1b, + <16 x i32> +' +) + +define(`unary4to16conv', ` + %$1_0 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1) + %$1_2 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_2 = call <4 x $3> $4(<4 x $2> %$1_2) + %$1_3 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_3 = call <4 x $3> $4(<4 x $2> %$1_3) + + %$1a = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, + <8 x i32> + %$1b = shufflevector <4 x $3> %v$1_2, <4 x $3> %v$1_3, + <8 x i32> + %$1 = 
shufflevector <8 x $3> %$1a, <8 x $3> %$1b, <16 x i32> ' diff --git a/ispc.cpp b/ispc.cpp index a9f5ff5c..de8fba4d 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -106,7 +106,7 @@ static void __cpuidex(int info[4], int level, int count) { static const char * lGetSystemISA() { #ifdef __arm__ - return "neon"; + return "neon-32"; #else int info[4]; __cpuid(info, 1); @@ -187,7 +187,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : isa = "avx2"; else if (!strcmp(cpu, "cortex-a9") || !strcmp(cpu, "cortex-a15")) - isa = "neon"; + isa = "neon-32"; else if (!strcmp(cpu, "core-avx-i")) isa = "avx1.1"; else if (!strcmp(cpu, "sandybridge") || @@ -212,7 +212,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } #if !defined(__arm__) - if (cpu == NULL && !strcmp(isa, "neon")) + if (cpu == NULL && !strncmp(isa, "neon", 4)) // If we're compiling NEON on an x86 host and the CPU wasn't // supplied, don't go and set the CPU based on the host... cpu = "cortex-a9"; @@ -246,7 +246,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_cpu = cpu; if (arch == NULL) { - if (!strcmp(isa, "neon")) + if (!strncmp(isa, "neon", 4)) arch = "arm"; else arch = "x86-64"; @@ -461,8 +461,26 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasGather = true; #endif } - else if (!strcasecmp(isa, "neon")) { - this->m_isa = Target::NEON; + else if (!strcasecmp(isa, "neon-8")) { + this->m_isa = Target::NEON8; + this->m_nativeVectorWidth = 16; + this->m_vectorWidth = 16; + this->m_attributes = "+neon,+fp16"; + this->m_hasHalf = true; // ?? + this->m_maskingIsFree = false; + this->m_maskBitCount = 8; + } + else if (!strcasecmp(isa, "neon-16")) { + this->m_isa = Target::NEON16; + this->m_nativeVectorWidth = 8; + this->m_vectorWidth = 8; + this->m_attributes = "+neon,+fp16"; + this->m_hasHalf = true; // ?? + this->m_maskingIsFree = false; + this->m_maskBitCount = 16; + } + else if (!strcasecmp(isa, "neon-32") || !strcasecmp(isa, "neon")) { + this->m_isa = Target::NEON32; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; this->m_attributes = "+neon,+fp16"; @@ -484,7 +502,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : llvm::Reloc::Default; std::string featuresString = m_attributes; llvm::TargetOptions options; - if (m_isa == Target::NEON) + if (m_isa == Target::NEON8 || m_isa == Target::NEON16 || + m_isa == Target::NEON32) options.FloatABIType = llvm::FloatABI::Hard; #if !defined(LLVM_3_1) if (g->opt.disableFMA == false) @@ -618,8 +637,12 @@ Target::GetTripleString() const { const char * Target::ISAToString(ISA isa) { switch (isa) { - case Target::NEON: - return "neon"; + case Target::NEON8: + return "neon-8"; + case Target::NEON16: + return "neon-16"; + case Target::NEON32: + return "neon-32"; case Target::SSE2: return "sse2"; case Target::SSE4: diff --git a/ispc.h b/ispc.h index 7d10b908..bf6d2642 100644 --- a/ispc.h +++ b/ispc.h @@ -175,7 +175,8 @@ public: flexible/performant of them will apear last in the enumerant. Note also that __best_available_isa() needs to be updated if ISAs are added or the enumerant values are reordered. */ - enum ISA { NEON, SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, NUM_ISAS }; + enum ISA { NEON32, NEON16, NEON8, SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, + NUM_ISAS }; /** Initializes the given Target pointer for a target of the given name, if the name is a known target. 
Returns true if the diff --git a/ispc.vcxproj b/ispc.vcxproj index 96682fe3..e9bf9d97 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -45,8 +45,12 @@ - - + + + + + + @@ -187,37 +191,78 @@ Building gen-bitcode-sse2-x2-64bit.cpp - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-neon.ll | python bitcode2cpp.py builtins\target-neon.ll > gen-bitcode-neon.cpp - gen-bitcode-neon.cpp - builtins\util.m4 - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-neon.ll | python bitcode2cpp.py builtins\target-neon.ll > gen-bitcode-neon.cpp - gen-bitcode-neon.cpp - builtins\util.m4 - Building gen-bitcode-neon.cpp - Building gen-bitcode-neon.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp - $(Configuration)/gen-bitcode-avx1-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx1-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > $(Configuration)/gen-bitcode-avx1-64bit.cpp - $(Configuration)/gen-bitcode-avx1-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx1-64bit.cpp - - + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-8.ll | python bitcode2cpp.py builtins\target-neon-8.ll 32bit > $(Configuration)/gen-bitcode-neon-8-32bit.cpp + $(Configuration)/gen-bitcode-neon-8-32bit.cpp + builtins\util.m4;builtins\target-neon-common.ll + Building gen-bitcode-neon-8-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-8.ll | python bitcode2cpp.py builtins\target-neon-8.ll 64bit > $(Configuration)/gen-bitcode-neon-8-64bit.cpp + $(Configuration)/gen-bitcode-neon-8-64bit.cpp + builtins\util.m4;builtins\target-neon-common.ll + Building gen-bitcode-neon-8-64bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-16.ll | python bitcode2cpp.py builtins\target-neon-16.ll 32bit > $(Configuration)/gen-bitcode-neon-16-32bit.cpp + $(Configuration)/gen-bitcode-neon-16-32bit.cpp + builtins\util.m4;builtins\target-neon-common.ll + Building gen-bitcode-neon-16-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-16.ll | python bitcode2cpp.py builtins\target-neon-16.ll 64bit > $(Configuration)/gen-bitcode-neon-16-64bit.cpp + $(Configuration)/gen-bitcode-neon-16-64bit.cpp + builtins\util.m4;builtins\target-neon-common.ll + Building gen-bitcode-neon-16-64bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-32.ll | python bitcode2cpp.py builtins\target-neon-32.ll 32bit > $(Configuration)/gen-bitcode-neon-32-32bit.cpp + $(Configuration)/gen-bitcode-neon-32-32bit.cpp + builtins\util.m4;builtins\target-neon-common.ll + Building gen-bitcode-neon-32-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-32.ll | python bitcode2cpp.py builtins\target-neon-32.ll 64bit > $(Configuration)/gen-bitcode-neon-32-64bit.cpp + $(Configuration)/gen-bitcode-neon-32-64bit.cpp 
+ builtins\util.m4;builtins\target-neon-common.ll + Building gen-bitcode-neon-32-64bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp + $(Configuration)/gen-bitcode-avx1-32bit.cpp + builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + Building gen-bitcode-avx1-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > $(Configuration)/gen-bitcode-avx1-64bit.cpp + $(Configuration)/gen-bitcode-avx1-64bit.cpp + builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll + Building gen-bitcode-avx1-64bit.cpp + + Document diff --git a/module.cpp b/module.cpp index 85bf242c..755a5dc4 100644 --- a/module.cpp +++ b/module.cpp @@ -1877,6 +1877,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre char *p = targetMacro; while (*p) { *p = toupper(*p); + if (*p == '-') *p = '_'; ++p; } opts.addMacroDef(targetMacro); diff --git a/run_tests.py b/run_tests.py index ea819ea4..c9dd8b76 100755 --- a/run_tests.py +++ b/run_tests.py @@ -37,7 +37,7 @@ parser.add_option("-g", "--generics-include", dest="include_file", help="Filenam parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", default="") parser.add_option('-t', '--target', dest='target', - help='Set compilation target (neon, sse2, sse2-x2, sse4, sse4-x2, sse4-8, sse4-16, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', + help='Set compilation target (neon8, neon16, neon32, sse2, sse2-x2, sse4, sse4-x2, sse4-8, sse4-16, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)', default="sse4") parser.add_option('-a', '--arch', dest='arch', help='Set architecture (arm, x86, x86-64)', From 48ff03112fd30d12a85eaf7cee3636ee6bfbedb4 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Mon, 29 Jul 2013 16:20:46 -0700 Subject: [PATCH 15/34] Remove __pause from stdlib_core() in utils.m4. It wasn't ever being used, and was breaking compilation on ARM. --- builtins.cpp | 1 - builtins/util.m4 | 5 ----- 2 files changed, 6 deletions(-) diff --git a/builtins.cpp b/builtins.cpp index e671a491..b2896388 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -487,7 +487,6 @@ lSetInternalFunctions(llvm::Module *module) { "__num_cores", "__packed_load_active", "__packed_store_active", - "__pause", "__popcnt_int32", "__popcnt_int64", "__prefetch_read_uniform_1", diff --git a/builtins/util.m4 b/builtins/util.m4 index 1f85e2cc..025030d5 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -1795,11 +1795,6 @@ declare i1 @__is_compile_time_constant_mask( %mask) declare i1 @__is_compile_time_constant_uniform_int32(i32) declare i1 @__is_compile_time_constant_varying_int32() -define void @__pause() nounwind readnone { - call void asm sideeffect "pause", "~{dirflag},~{fpsr},~{flags}"() nounwind - ret void -} - ; This function declares placeholder masked store functions for the ; front-end to use. 
; From d3c567503bf64ec9066c09cb8959c31d4aa1be0e Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 31 Jul 2013 06:46:45 -0700 Subject: [PATCH 16/34] Remove support for building with LLVM 3.1 --- builtins.cpp | 2 - builtins/target-avx11-x2.ll | 4 +- builtins/target-avx11.ll | 4 +- builtins/target-avx2-x2.ll | 25 +------- builtins/target-avx2.ll | 25 +------- cbackend.cpp | 115 +++++++++--------------------------- ctx.cpp | 4 +- ctx.h | 11 +--- expr.cpp | 2 +- func.cpp | 10 +--- ispc.cpp | 68 ++++----------------- ispc.h | 18 +----- llvmutil.cpp | 2 +- llvmutil.h | 2 +- main.cpp | 4 +- module.cpp | 47 +++------------ opt.cpp | 22 ++----- stmt.cpp | 2 +- type.cpp | 26 +++----- type.h | 2 +- util.cpp | 9 +-- 21 files changed, 84 insertions(+), 320 deletions(-) diff --git a/builtins.cpp b/builtins.cpp index b2896388..17582d68 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -49,8 +49,6 @@ #include #if defined(LLVM_3_2) #include -#endif -#if defined(LLVM_3_1) || defined(LLVM_3_2) #include #include #include diff --git a/builtins/target-avx11-x2.ll b/builtins/target-avx11-x2.ll index 1aa6345c..2aee1e1c 100644 --- a/builtins/target-avx11-x2.ll +++ b/builtins/target-avx11-x2.ll @@ -31,9 +31,7 @@ include(`target-avx-x2.ll') -ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', - LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', - `rdrand_definition()') +rdrand_definition() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx11.ll b/builtins/target-avx11.ll index fea0a7c2..44593113 100644 --- a/builtins/target-avx11.ll +++ b/builtins/target-avx11.ll @@ -31,9 +31,7 @@ include(`target-avx.ll') -ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', - LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', - `rdrand_definition()') +rdrand_definition() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx2-x2.ll b/builtins/target-avx2-x2.ll index 053fd078..19f1845d 100644 --- a/builtins/target-avx2-x2.ll +++ b/builtins/target-avx2-x2.ll @@ -29,15 +29,11 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -ifelse(LLVM_VERSION, `LLVM_3_0', `', - LLVM_VERSION, `LLVM_3_1', `', - `define(`HAVE_GATHER', `1')') +define(`HAVE_GATHER', `1') include(`target-avx-x2.ll') -ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', - LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', - `rdrand_definition()') +rdrand_definition() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max @@ -176,21 +172,6 @@ define(`assemble_4s', ` assemble_8s($1, $2, $2_1, $2_2) ') -ifelse(LLVM_VERSION, `LLVM_3_0', ` -gen_gather_factored(i8) -gen_gather_factored(i16) -gen_gather_factored(i32) -gen_gather_factored(float) -gen_gather_factored(i64) -gen_gather_factored(double)', -LLVM_VERSION, `LLVM_3_1', ` -gen_gather_factored(i8) -gen_gather_factored(i16) -gen_gather_factored(i32) -gen_gather_factored(float) -gen_gather_factored(i64) -gen_gather_factored(double)', ` - gen_gather(i8) gen_gather(i16) @@ -557,5 +538,3 @@ define <16 x double> @__gather64_double(<16 x i64> %ptrs, ret <16 x double> %v } - -') diff --git a/builtins/target-avx2.ll b/builtins/target-avx2.ll index f4a0ee07..d3410011 100644 --- a/builtins/target-avx2.ll +++ b/builtins/target-avx2.ll @@ -29,15 +29,11 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
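;; (With LLVM 3.0/3.1 no longer supported, the ifelse(LLVM_VERSION, ...)
;; m4 conditionals in the surrounding target-avx2 hunks always take their
;; final branch, so they are replaced by the unconditional
;; define(`HAVE_GATHER', `1'), rdrand_definition(), and the native
;; gen_gather()/gather definitions.)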
-ifelse(LLVM_VERSION, `LLVM_3_0', `', - LLVM_VERSION, `LLVM_3_1', `', - `define(`HAVE_GATHER', `1')') +define(`HAVE_GATHER', `1') include(`target-avx.ll') -ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', - LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', - `rdrand_definition()') +rdrand_definition() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max @@ -123,21 +119,6 @@ define(`extract_4s', ` %$2_2 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> ') -ifelse(LLVM_VERSION, `LLVM_3_0', ` -gen_gather_factored(i8) -gen_gather_factored(i16) -gen_gather_factored(i32) -gen_gather_factored(float) -gen_gather_factored(i64) -gen_gather_factored(double)', -LLVM_VERSION, `LLVM_3_1', ` -gen_gather_factored(i8) -gen_gather_factored(i16) -gen_gather_factored(i32) -gen_gather_factored(float) -gen_gather_factored(i64) -gen_gather_factored(double)', ` - gen_gather(i8) gen_gather(i16) @@ -429,5 +410,3 @@ define <8 x double> @__gather64_double(<8 x i64> %ptrs, ret <8 x double> %v } - -') diff --git a/cbackend.cpp b/cbackend.cpp index d23bcc20..d54f48fb 100644 --- a/cbackend.cpp +++ b/cbackend.cpp @@ -29,7 +29,7 @@ #include "llvmutil.h" -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/CallingConv.h" @@ -38,6 +38,7 @@ #include "llvm/Intrinsics.h" #include "llvm/IntrinsicInst.h" #include "llvm/InlineAsm.h" + #include "llvm/TypeFinder.h" #else #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -47,16 +48,10 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/InlineAsm.h" + #include "llvm/IR/TypeFinder.h" #endif #include "llvm/Pass.h" #include "llvm/PassManager.h" -#if !defined(LLVM_3_1) - #if defined(LLVM_3_2) - #include "llvm/TypeFinder.h" - #else // LLVM_3_3 + - #include "llvm/IR/TypeFinder.h" - #endif -#endif // LLVM_3_2 + #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/STLExtras.h" @@ -76,9 +71,7 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" -#if defined(LLVM_3_1) - #include "llvm/Target/TargetData.h" -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) #include "llvm/DataLayout.h" #else // LLVM 3.3+ #include "llvm/IR/DataLayout.h" @@ -88,7 +81,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/GetElementPtrTypeIterator.h" -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include "llvm/Support/InstVisitor.h" #else // LLVM 3.3+ #include "llvm/InstVisitor.h" @@ -258,14 +251,10 @@ namespace { const llvm::MCRegisterInfo *MRI; const llvm::MCObjectFileInfo *MOFI; llvm::MCContext *TCtx; -#if defined(LLVM_3_1) - const llvm::TargetData* TD; -#else // FIXME: it's ugly to have the name be "TD" here, but it saves us // lots of ifdefs in the below since the new DataLayout and the old // TargetData have generally similar interfaces... 
const llvm::DataLayout* TD; -#endif std::map FPConstantMap; std::map VectorConstantMap; @@ -352,7 +341,7 @@ namespace { bool isSigned = false, const std::string &VariableName = "", bool IgnoreName = false, -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) const llvm::AttrListPtr &PAL = llvm::AttrListPtr() #else const llvm::AttributeSet &PAL = llvm::AttributeSet() @@ -363,7 +352,7 @@ namespace { const std::string &NameSoFar = ""); void printStructReturnPointerFunctionType(llvm::raw_ostream &Out, -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) const llvm::AttrListPtr &PAL, #else const llvm::AttributeSet &PAL, @@ -586,7 +575,7 @@ std::string CWriter::getArrayName(llvm::ArrayType *AT) { /// return type, except, instead of printing the type as void (*)(Struct*, ...) /// print it as "Struct (*)(...)", for struct return functions. void CWriter::printStructReturnPointerFunctionType(llvm::raw_ostream &Out, -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) const llvm::AttrListPtr &PAL, #else const llvm::AttributeSet &PAL, @@ -605,20 +594,16 @@ void CWriter::printStructReturnPointerFunctionType(llvm::raw_ostream &Out, if (PrintedType) FunctionInnards << ", "; llvm::Type *ArgTy = *I; -#if defined(LLVM_3_1) - if (PAL.paramHasAttr(Idx, llvm::Attribute::ByVal)) { -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) { #else - if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { + if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { #endif assert(ArgTy->isPointerTy()); ArgTy = llvm::cast(ArgTy)->getElementType(); } printType(FunctionInnards, ArgTy, -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(Idx, llvm::Attribute::SExt), -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt), @@ -635,9 +620,7 @@ void CWriter::printStructReturnPointerFunctionType(llvm::raw_ostream &Out, } FunctionInnards << ')'; printType(Out, RetTy, -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(0, llvm::Attribute::SExt), -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(0).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(0).hasAttribute(llvm::AttributeSet::ReturnIndex, llvm::Attribute::SExt), @@ -737,7 +720,7 @@ CWriter::printSimpleType(llvm::raw_ostream &Out, llvm::Type *Ty, bool isSigned, llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty, bool isSigned, const std::string &NameSoFar, bool IgnoreName, -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) const llvm::AttrListPtr &PAL #else const llvm::AttributeSet &PAL @@ -759,9 +742,7 @@ llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty, for (llvm::FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); I != E; ++I) { llvm::Type *ArgTy = *I; -#if defined(LLVM_3_1) - if (PAL.paramHasAttr(Idx, llvm::Attribute::ByVal)) { -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) { #else if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { @@ -772,9 +753,7 @@ llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty, if (I != 
FTy->param_begin()) FunctionInnards << ", "; printType(FunctionInnards, ArgTy, -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(Idx, llvm::Attribute::SExt), -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt), @@ -791,9 +770,7 @@ llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty, } FunctionInnards << ')'; printType(Out, FTy->getReturnType(), -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(0, llvm::Attribute::SExt), -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(0).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(0).hasAttribute(llvm::AttributeSet::ReturnIndex, llvm::Attribute::SExt), @@ -1972,11 +1949,7 @@ void CWriter::writeOperandWithCast(llvm::Value* Operand, const llvm::ICmpInst &C // directives to cater to specific compilers as need be. // static void generateCompilerSpecificCode(llvm::formatted_raw_ostream& Out, -#if defined(LLVM_3_1) - const llvm::TargetData *TD) { -#else const llvm::DataLayout *TD) { -#endif // We output GCC specific attributes to preserve 'linkonce'ness on globals. // If we aren't being compiled with GCC, just drop these attributes. Out << "#ifndef __GNUC__ /* Can only support \"linkonce\" vars with GCC */\n" @@ -2169,11 +2142,7 @@ bool CWriter::doInitialization(llvm::Module &M) { // Initialize TheModule = &M; -#if defined(LLVM_3_1) - TD = new llvm::TargetData(&M); -#else TD = new llvm::DataLayout(&M); -#endif IL = new llvm::IntrinsicLowering(*TD); IL->AddPrototypes(M); @@ -2656,15 +2625,11 @@ void CWriter::printModuleTypes() { // Get all of the struct types used in the module. std::vector StructTypes; -#if defined(LLVM_3_1) - TheModule->findUsedStructTypes(StructTypes); -#else llvm::TypeFinder typeFinder; typeFinder.run(*TheModule, false); for (llvm::TypeFinder::iterator iter = typeFinder.begin(); iter != typeFinder.end(); ++iter) StructTypes.push_back(*iter); -#endif // Get all of the array types used in the module std::vector ArrayTypes; @@ -2785,7 +2750,7 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { // Loop over the arguments, printing them... 
llvm::FunctionType *FT = llvm::cast(F->getFunctionType()); -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) const llvm::AttrListPtr &PAL = F->getAttributes(); #else const llvm::AttributeSet &PAL = F->getAttributes(); @@ -2819,20 +2784,16 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { else ArgName = ""; llvm::Type *ArgTy = I->getType(); -#if defined(LLVM_3_1) - if (PAL.paramHasAttr(Idx, llvm::Attribute::ByVal)) { -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) { #else - if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { + if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { #endif ArgTy = llvm::cast(ArgTy)->getElementType(); ByValParams.insert(I); } printType(FunctionInnards, ArgTy, -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(Idx, llvm::Attribute::SExt), -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt), @@ -2858,9 +2819,7 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { for (; I != E; ++I) { if (PrintedArg) FunctionInnards << ", "; llvm::Type *ArgTy = *I; -#if defined(LLVM_3_1) - if (PAL.paramHasAttr(Idx, llvm::Attribute::ByVal)) { -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) { #else if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) { @@ -2869,9 +2828,7 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { ArgTy = llvm::cast(ArgTy)->getElementType(); } printType(FunctionInnards, ArgTy, -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(Idx, llvm::Attribute::SExt) -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt) #else PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt) @@ -2908,9 +2865,7 @@ void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) { // Print out the return type and the signature built above. printType(Out, RetTy, -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(0, llvm::Attribute::SExt), -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(0).hasAttribute(llvm::Attributes::SExt), #else PAL.getParamAttributes(0).hasAttribute(llvm::AttributeSet::ReturnIndex, llvm::Attribute::SExt), @@ -3712,7 +3667,7 @@ void CWriter::lowerIntrinsics(llvm::Function &F) { const char *BuiltinName = ""; #define GET_GCC_BUILTIN_NAME #define Intrinsic llvm::Intrinsic -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include "llvm/Intrinsics.gen" #else #include "llvm/IR/Intrinsics.gen" @@ -3777,7 +3732,7 @@ void CWriter::visitCallInst(llvm::CallInst &I) { // If this is a call to a struct-return function, assign to the first // parameter instead of passing it to the call. 
-#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) const llvm::AttrListPtr &PAL = I.getAttributes(); #else const llvm::AttributeSet &PAL = I.getAttributes(); @@ -3865,9 +3820,7 @@ void CWriter::visitCallInst(llvm::CallInst &I) { (*AI)->getType() != FTy->getParamType(ArgNo)) { Out << '('; printType(Out, FTy->getParamType(ArgNo), -#if defined(LLVM_3_1) - /*isSigned=*/PAL.paramHasAttr(ArgNo+1, llvm::Attribute::SExt) -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) PAL.getParamAttributes(ArgNo+1).hasAttribute(llvm::Attributes::SExt) #else PAL.getParamAttributes(ArgNo+1).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt) @@ -3905,7 +3858,7 @@ bool CWriter::visitBuiltinCall(llvm::CallInst &I, llvm::Intrinsic::ID ID, const char *BuiltinName = ""; #define GET_GCC_BUILTIN_NAME #define Intrinsic llvm::Intrinsic -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include "llvm/Intrinsics.gen" #else #include "llvm/IR/Intrinsics.gen" @@ -4555,13 +4508,8 @@ SmearCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) { smearType, NULL); smearFunc = llvm::dyn_cast(sf); assert(smearFunc != NULL); -#if defined(LLVM_3_1) - smearFunc->setDoesNotThrow(true); - smearFunc->setDoesNotAccessMemory(true); -#else smearFunc->setDoesNotThrow(); smearFunc->setDoesNotAccessMemory(); -#endif } assert(smearFunc != NULL); @@ -4703,13 +4651,8 @@ AndCmpCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) { LLVMTypes::MaskType, NULL); andCmpFunc = llvm::dyn_cast(acf); Assert(andCmpFunc != NULL); -#if defined(LLVM_3_1) - andCmpFunc->setDoesNotThrow(true); - andCmpFunc->setDoesNotAccessMemory(true); -#else andCmpFunc->setDoesNotThrow(); andCmpFunc->setDoesNotAccessMemory(); -#endif } // Set up the function call to the *_and_mask function; the @@ -4914,7 +4857,7 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth, pm.add(new llvm::TargetData(module)); #endif -#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3) +#if defined(LLVM_3_2) || defined(LLVM_3_3) int flags = 0; #else llvm::sys::fs::OpenFlags flags = llvm::sys::fs::F_None; @@ -4939,7 +4882,7 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth, pm.add(llvm::createDeadCodeEliminationPass()); // clean up after smear pass //CO pm.add(llvm::createPrintModulePass(&fos)); pm.add(new CWriter(fos, includeName, vectorWidth)); -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) // This interface is depricated for 3.3+ pm.add(llvm::createGCInfoDeleter()); #endif diff --git a/ctx.cpp b/ctx.cpp index c50d22f9..32ba0ad9 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -46,7 +46,7 @@ #include "sym.h" #include #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include @@ -3312,7 +3312,7 @@ FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType, // alias analysis. // TODO: what other attributes needs to be copied? // TODO: do the same for varing path. 
-#if !defined (LLVM_3_1) && !defined (LLVM_3_2) // LLVM 3.3+ +#if !defined (LLVM_3_2) // LLVM 3.3+ llvm::CallInst *cc = llvm::dyn_cast(ci); if (cc && cc->getCalledFunction() && diff --git a/ctx.h b/ctx.h index 58f9aae3..4b27e6e5 100644 --- a/ctx.h +++ b/ctx.h @@ -40,20 +40,15 @@ #include "ispc.h" #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #else #include #include #endif -#if defined(LLVM_3_1) - #include - #include -#else - #include - #include -#endif +#include +#include struct CFInfo; diff --git a/expr.cpp b/expr.cpp index 856d363c..eb8c0951 100644 --- a/expr.cpp +++ b/expr.cpp @@ -56,7 +56,7 @@ #include #include #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include diff --git a/func.cpp b/func.cpp index b975049b..3097f86d 100644 --- a/func.cpp +++ b/func.cpp @@ -46,7 +46,7 @@ #include "util.h" #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include @@ -310,9 +310,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, // isn't worth the code bloat / overhead. bool checkMask = (type->isTask == true) || ( -#if defined(LLVM_3_1) - (function->hasFnAttr(llvm::Attribute::AlwaysInline) == false) -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) (function->getFnAttributes().hasAttribute(llvm::Attributes::AlwaysInline) == false) #else // LLVM 3.3+ (function->getAttributes().getFnAttributes().hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::AlwaysInline) == false) @@ -453,11 +451,7 @@ Function::GenerateIR() { functionName += std::string("_") + g->target->GetISAString(); llvm::Function *appFunction = llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module); -#if defined(LLVM_3_1) - appFunction->setDoesNotThrow(true); -#else appFunction->setDoesNotThrow(); -#endif g->target->markFuncWithTargetAttr(appFunction); diff --git a/ispc.cpp b/ispc.cpp index de8fba4d..b25527c4 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -48,7 +48,7 @@ #include #include #endif -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include @@ -57,19 +57,12 @@ #include #include #endif -#if defined(LLVM_3_1) - #include - #include -#else - #include - #include -#endif +#include +#include #include #include #include -#if defined(LLVM_3_1) - #include -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #else // LLVM 3.3+ #include @@ -145,27 +138,20 @@ static const char *supportedCPUs[] = { // cortex-a9 and a15. We should be able to handle any of them that also // have NEON support. 
"cortex-a9", "cortex-a15", - "atom", "penryn", "core2", "corei7", "corei7-avx" -#if !defined(LLVM_3_1) - , "core-avx-i", "core-avx2" -#endif // LLVM 3.2+ + "atom", "penryn", "core2", "corei7", "corei7-avx", "core-avx-i", "core-avx2" }; Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : m_target(NULL), m_targetMachine(NULL), -#if defined(LLVM_3_1) - m_targetData(NULL), -#else m_dataLayout(NULL), -#endif m_valid(false), m_isa(SSE2), m_arch(""), m_is32Bit(true), m_cpu(""), m_attributes(""), -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) +#if !defined(LLVM_3_2) m_tf_attributes(NULL), #endif m_nativeVectorWidth(-1), @@ -407,10 +393,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; -#if !defined(LLVM_3_1) - // LLVM 3.2+ only this->m_hasRand = true; -#endif } else if (!strcasecmp(isa, "avx1.1-x2")) { this->m_isa = Target::AVX11; @@ -420,46 +403,29 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; -#if !defined(LLVM_3_1) - // LLVM 3.2+ only this->m_hasRand = true; -#endif } else if (!strcasecmp(isa, "avx2")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; - this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand" -#ifndef LLVM_3_1 - ",+fma" -#endif // !LLVM_3_1 - ; + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand,+fma"; this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; -#if !defined(LLVM_3_1) - // LLVM 3.2+ only this->m_hasRand = true; this->m_hasGather = true; -#endif } else if (!strcasecmp(isa, "avx2-x2")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; - this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand" -#ifndef LLVM_3_1 - ",+fma" -#endif // !LLVM_3_1 - ; + this->m_attributes = "+avx2,+popcnt,+cmov,+f16c,+rdrand,+fma"; this->m_maskingIsFree = false; this->m_maskBitCount = 32; this->m_hasHalf = true; -#if !defined(LLVM_3_1) - // LLVM 3.2+ only this->m_hasRand = true; this->m_hasGather = true; -#endif } else if (!strcasecmp(isa, "neon-8")) { this->m_isa = Target::NEON8; @@ -505,10 +471,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : if (m_isa == Target::NEON8 || m_isa == Target::NEON16 || m_isa == Target::NEON32) options.FloatABIType = llvm::FloatABI::Hard; -#if !defined(LLVM_3_1) if (g->opt.disableFMA == false) options.AllowFPOpFusion = llvm::FPOpFusion::Fast; -#endif // !LLVM_3_1 #ifdef ISPC_IS_WINDOWS if (strcmp("x86", arch) == 0) { @@ -526,12 +490,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // Initialize TargetData/DataLayout in 3 steps. // 1. Get default data layout first - std::string dl_string; -#if defined(LLVM_3_1) - dl_string = m_targetMachine->getTargetData()->getStringRepresentation(); -#else - dl_string = m_targetMachine->getDataLayout()->getStringRepresentation(); -#endif + std::string dl_string = + m_targetMachine->getDataLayout()->getStringRepresentation(); // 2. Adjust for generic if (m_isa == Target::GENERIC) { @@ -546,11 +506,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } // 3. 
Finally set member data -#if defined(LLVM_3_1) - m_targetData = new llvm::TargetData(dl_string); -#else m_dataLayout = new llvm::DataLayout(dl_string); -#endif // Set is32Bit // This indicates if we are compiling for 32 bit platform @@ -558,7 +514,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // FIXME: all generic targets are handled as 64 bit, which is incorrect. this->m_is32Bit = (getDataLayout()->getPointerSize() == 4); -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) +#if !defined(LLVM_3_2) // This is LLVM 3.3+ feature. // Initialize target-specific "target-feature" attribute. if (!m_attributes.empty()) { @@ -772,7 +728,7 @@ Target::StructOffset(llvm::Type *type, int element, } void Target::markFuncWithTargetAttr(llvm::Function* func) { -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) +#if !defined(LLVM_3_2) if (m_tf_attributes) { func->addAttributes(llvm::AttributeSet::FunctionIndex, *m_tf_attributes); } diff --git a/ispc.h b/ispc.h index bf6d2642..d68f9034 100644 --- a/ispc.h +++ b/ispc.h @@ -40,8 +40,8 @@ #define ISPC_VERSION "1.4.5dev" -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) -#error "Only LLVM 3.1, 3.2, 3.3 and the 3.4 development branch are supported" +#if !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) +#error "Only LLVM 3.2, 3.3 and the 3.4 development branch are supported" #endif #if defined(_WIN32) || defined(_WIN64) @@ -72,11 +72,7 @@ namespace llvm { class BasicBlock; class Constant; class ConstantValue; -#if defined(LLVM_3_1) - class TargetData; -#else class DataLayout; -#endif class DIBuilder; class DIDescriptor; class DIFile; @@ -226,11 +222,7 @@ public: // Note the same name of method for 3.1 and 3.2+, this allows // to reduce number ifdefs on client side. -#if defined(LLVM_3_1) - llvm::TargetData *getDataLayout() const {return m_targetData;} -#else llvm::DataLayout *getDataLayout() const {return m_dataLayout;} -#endif /** Reports if Target object has valid state. */ bool isValid() const {return m_valid;} @@ -278,11 +270,7 @@ private: */ llvm::TargetMachine *m_targetMachine; -#if defined(LLVM_3_1) - llvm::TargetData *m_targetData; -#else llvm::DataLayout *m_dataLayout; -#endif /** flag to report invalid state after construction (due to bad parameters passed to constructor). */ @@ -303,7 +291,7 @@ private: /** Target-specific attribute string to pass along to the LLVM backend */ std::string m_attributes; -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) +#if !defined(LLVM_3_2) /** Target-specific LLVM attribute, which has to be attached to every function to ensure that it is generated for correct target architecture. 
This is requirement was introduced in LLVM 3.3 */ diff --git a/llvmutil.cpp b/llvmutil.cpp index 180c8676..2f54a2fe 100644 --- a/llvmutil.cpp +++ b/llvmutil.cpp @@ -38,7 +38,7 @@ #include "llvmutil.h" #include "ispc.h" #include "type.h" -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #else diff --git a/llvmutil.h b/llvmutil.h index d6c5ede0..d1803f32 100644 --- a/llvmutil.h +++ b/llvmutil.h @@ -38,7 +38,7 @@ #ifndef ISPC_LLVMUTIL_H #define ISPC_LLVMUTIL_H 1 -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include diff --git a/main.cpp b/main.cpp index de2bb620..4c4b4575 100644 --- a/main.cpp +++ b/main.cpp @@ -62,9 +62,7 @@ static void lPrintVersion() { printf("Intel(r) SPMD Program Compiler (ispc), %s (build %s @ %s, LLVM %s)\n", ISPC_VERSION, BUILD_VERSION, BUILD_DATE, -#if defined(LLVM_3_1) - "3.1" -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) "3.2" #elif defined(LLVM_3_3) "3.3" diff --git a/module.cpp b/module.cpp index 755a5dc4..eba5eb3b 100644 --- a/module.cpp +++ b/module.cpp @@ -64,7 +64,7 @@ #define strcasecmp stricmp #endif -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include @@ -86,9 +86,7 @@ #include #include #include -#if defined(LLVM_3_1) - #include -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #else // LLVM 3.3+ @@ -202,7 +200,7 @@ lStripUnusedDebugInfo(llvm::Module *module) { // stuff and remove it later on. Removing it is useful, as it // reduces size of the binary significantly (manyfold for small // programs). -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) llvm::MDNode *nodeSPMD = llvm::dyn_cast(cuNode->getOperand(12)); Assert(nodeSPMD != NULL); @@ -797,11 +795,7 @@ Module::AddFunctionDeclaration(const std::string &name, #endif if (functionType->isTask) // This also applies transitively to members I think? -#if defined(LLVM_3_1) - function->setDoesNotAlias(1, true); -#else // LLVM 3.2+ function->setDoesNotAlias(1); -#endif g->target->markFuncWithTargetAttr(function); @@ -850,12 +844,7 @@ Module::AddFunctionDeclaration(const std::string &name, // NOTE: LLVM indexes function parameters starting from 1. // This is unintuitive. -#if defined(LLVM_3_1) - function->setDoesNotAlias(i+1, true); -#else function->setDoesNotAlias(i+1); -#endif - #if 0 int align = 4 * RoundUpPow2(g->target->nativeVectorWidth); function->addAttribute(i+1, llvm::Attribute::constructAlignmentFromInt(align)); @@ -1067,7 +1056,7 @@ Module::writeObjectFileOrAssembly(llvm::TargetMachine *targetMachine, llvm::TargetMachine::CodeGenFileType fileType = (outputType == Object) ? llvm::TargetMachine::CGFT_ObjectFile : llvm::TargetMachine::CGFT_AssemblyFile; bool binary = (fileType == llvm::TargetMachine::CGFT_ObjectFile); -#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3) +#if defined(LLVM_3_2) || defined(LLVM_3_3) unsigned int flags = binary ? llvm::raw_fd_ostream::F_Binary : 0; #else llvm::sys::fs::OpenFlags flags = binary ? 
llvm::sys::fs::F_Binary : @@ -1082,11 +1071,7 @@ Module::writeObjectFileOrAssembly(llvm::TargetMachine *targetMachine, } llvm::PassManager pm; -#if defined(LLVM_3_1) - pm.add(new llvm::TargetData(*g->target->getDataLayout())); -#else pm.add(new llvm::DataLayout(*g->target->getDataLayout())); -#endif llvm::formatted_raw_ostream fos(of->os()); @@ -1800,22 +1785,12 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre llvm::raw_fd_ostream stderrRaw(2, false); -#if defined(LLVM_3_1) - clang::TextDiagnosticPrinter *diagPrinter = - new clang::TextDiagnosticPrinter(stderrRaw, clang::DiagnosticOptions()); -#else clang::DiagnosticOptions *diagOptions = new clang::DiagnosticOptions(); clang::TextDiagnosticPrinter *diagPrinter = new clang::TextDiagnosticPrinter(stderrRaw, diagOptions); -#endif llvm::IntrusiveRefCntPtr diagIDs(new clang::DiagnosticIDs); -#if defined(LLVM_3_1) - clang::DiagnosticsEngine *diagEngine = - new clang::DiagnosticsEngine(diagIDs, diagPrinter); -#else clang::DiagnosticsEngine *diagEngine = new clang::DiagnosticsEngine(diagIDs, diagOptions, diagPrinter); -#endif inst.setDiagnostics(diagEngine); clang::TargetOptions &options = inst.getTargetOpts(); @@ -1825,7 +1800,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre } options.Triple = triple.getTriple(); -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) clang::TargetInfo *target = clang::TargetInfo::CreateTargetInfo(inst.getDiagnostics(), options); #else // LLVM 3.3+ @@ -1835,18 +1810,14 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre inst.setTarget(target); inst.createSourceManager(inst.getFileManager()); -#if defined(LLVM_3_1) - inst.InitializeSourceManager(infilename); -#else clang::FrontendInputFile inputFile(infilename, clang::IK_None); inst.InitializeSourceManager(inputFile); -#endif // Don't remove comments in the preprocessor, so that we can accurately // track the source file position by handling them ourselves. 
inst.getPreprocessorOutputOpts().ShowComments = 1; -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) // LLVM 3.3+ +#if !defined(LLVM_3_2) // LLVM 3.3+ inst.getPreprocessorOutputOpts().ShowCPP = 1; #endif @@ -1858,7 +1829,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre headerOpts.Verbose = 1; for (int i = 0; i < (int)g->includePath.size(); ++i) { headerOpts.AddPath(g->includePath[i], clang::frontend::Angled, -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) true /* is user supplied */, #endif false /* not a framework */, @@ -1913,11 +1884,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre } } -#if defined(LLVM_3_1) - inst.getLangOpts().BCPLComment = 1; -#else inst.getLangOpts().LineComment = 1; -#endif inst.createPreprocessor(); diagPrinter->BeginSourceFile(inst.getLangOpts(), &inst.getPreprocessor()); diff --git a/opt.cpp b/opt.cpp index 8efdbc67..8c86368e 100644 --- a/opt.cpp +++ b/opt.cpp @@ -48,7 +48,7 @@ #include #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include @@ -73,9 +73,7 @@ #include #include #include -#if defined(LLVM_3_1) - #include -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #else // LLVM 3.3+ #include @@ -85,11 +83,7 @@ #include #include #include -#if defined(LLVM_3_1) - #include -#else - #include -#endif +#include #include #ifdef ISPC_IS_LINUX #include @@ -415,18 +409,14 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(targetLibraryInfo); -#if defined(LLVM_3_1) - optPM.add(new llvm::TargetData(*g->target->getDataLayout())); -#else optPM.add(new llvm::DataLayout(*g->target->getDataLayout())); llvm::TargetMachine *targetMachine = g->target->GetTargetMachine(); - #ifdef LLVM_3_2 +#ifdef LLVM_3_2 optPM.add(new llvm::TargetTransformInfo(targetMachine->getScalarTargetTransformInfo(), targetMachine->getVectorTargetTransformInfo())); - #else // LLVM 3.3+ +#else // LLVM 3.3+ targetMachine->addAnalysisPasses(optPM); - #endif #endif optPM.add(llvm::createIndVarSimplifyPass()); @@ -505,7 +495,7 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createCFGSimplificationPass()); optPM.add(llvm::createArgumentPromotionPass()); -#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3) +#if defined(LLVM_3_2) || defined(LLVM_3_3) // Starting from 3.4 this functionality was moved to // InstructionCombiningPass. See r184459 for details. 
optPM.add(llvm::createSimplifyLibCallsPass()); diff --git a/stmt.cpp b/stmt.cpp index 4ec63d35..412b0dd9 100644 --- a/stmt.cpp +++ b/stmt.cpp @@ -48,7 +48,7 @@ #include #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #include diff --git a/type.cpp b/type.cpp index 5fa1845b..11a165f5 100644 --- a/type.cpp +++ b/type.cpp @@ -43,20 +43,15 @@ #include #include -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #else #include #include #endif -#if defined(LLVM_3_1) - #include - #include -#else - #include - #include -#endif +#include +#include #include @@ -819,11 +814,8 @@ EnumType::GetDIType(llvm::DIDescriptor scope) const { m->diBuilder->createEnumerationType(scope, name, diFile, pos.first_line, 32 /* size in bits */, 32 /* align in bits */, - elementArray -#if !defined(LLVM_3_1) - , llvm::DIType() -#endif - ); + elementArray, + llvm::DIType()); switch (variability.type) { @@ -2139,7 +2131,7 @@ StructType::GetDIType(llvm::DIDescriptor scope) const { currentSize, // Size in bits align, // Alignment in bits 0, // Flags -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) +#if !defined(LLVM_3_2) llvm::DIType(), // DerivedFrom #endif elements); @@ -2382,7 +2374,7 @@ UndefinedStructType::GetDIType(llvm::DIDescriptor scope) const { 0, // Size 0, // Align 0, // Flags -#if !defined(LLVM_3_1) && !defined(LLVM_3_2) +#if !defined(LLVM_3_2) llvm::DIType(), // DerivedFrom #endif elements); @@ -2645,12 +2637,8 @@ ReferenceType::GetDIType(llvm::DIDescriptor scope) const { } llvm::DIType diTargetType = targetType->GetDIType(scope); -#if defined(LLVM_3_1) - return m->diBuilder->createReferenceType(diTargetType); -#else return m->diBuilder->createReferenceType(llvm::dwarf::DW_TAG_reference_type, diTargetType); -#endif } diff --git a/type.h b/type.h index 880f8574..a6a52e10 100644 --- a/type.h +++ b/type.h @@ -40,7 +40,7 @@ #include "ispc.h" #include "util.h" -#if defined(LLVM_3_1) || defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #include #else diff --git a/util.cpp b/util.cpp index dbea9517..4be863bf 100644 --- a/util.cpp +++ b/util.cpp @@ -65,9 +65,7 @@ #include #include -#if defined(LLVM_3_1) - #include -#elif defined(LLVM_3_2) +#if defined(LLVM_3_2) #include #else // LLVM 3.3+ #include @@ -616,13 +614,8 @@ VerifyDataLayoutCompatibility(const std::string &module_dl, // which contradic: f80:128:128 followed by f80:32:32. This is a bug, but // correct thing to do is to interpret this exactly how LLVM would treat it, // so we create a DataLayout class and take its string representation. -#if defined(LLVM_3_1) - llvm::TargetData d1(module_dl); - llvm::TargetData d2(lib_dl); -#else // LLVM 3.2+ llvm::DataLayout d1(module_dl); llvm::DataLayout d2(lib_dl); -#endif std::string module_dl_canonic = d1.getStringRepresentation(); std::string lib_dl_canonic = d2.getStringRepresentation(); From d9c38b5c1f6c1ccb4920465789b9e3d451e302a8 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 18 Jul 2013 09:24:23 -0700 Subject: [PATCH 17/34] Remove support for using SVML for math lib routines. This path was poorly maintained and wasn't actually available on most targets. 
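Anyone who was selecting SVML through the math-library option should switch to one of the remaining choices; for example (the file names here are only illustrative):

    ispc --math-lib=fast   foo.ispc -o foo.o
    ispc --math-lib=system foo.ispc -o foo.o

or omit the flag entirely to keep the default ispc implementations. The option names match the updated list in docs/ispc.rst below.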
--- builtins.cpp | 11 ---- builtins/target-avx-x2.ll | 17 ------ builtins/target-avx.ll | 17 ------ builtins/target-generic-1.ll | 98 ------------------------------- builtins/target-generic-common.ll | 16 ----- builtins/target-neon-common.ll | 13 ---- builtins/target-sse2-x2.ll | 86 --------------------------- builtins/target-sse2.ll | 60 ------------------- builtins/target-sse4-16.ll | 15 ----- builtins/target-sse4-8.ll | 15 ----- builtins/target-sse4-x2.ll | 86 --------------------------- builtins/target-sse4.ll | 60 ------------------- docs/ispc.rst | 3 - ispc.h | 2 +- main.cpp | 3 - stdlib.ispc | 72 ++++++----------------- 16 files changed, 18 insertions(+), 556 deletions(-) diff --git a/builtins.cpp b/builtins.cpp index 17582d68..d75db43e 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -579,15 +579,6 @@ lSetInternalFunctions(llvm::Module *module) { "__stdlib_sinf", "__stdlib_tan", "__stdlib_tanf", - "__svml_sin", - "__svml_cos", - "__svml_sincos", - "__svml_tan", - "__svml_atan", - "__svml_atan2", - "__svml_exp", - "__svml_log", - "__svml_pow", "__undef_uniform", "__undef_varying", "__vec4_add_float", @@ -1054,8 +1045,6 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod symbolTable); lDefineConstantInt("__math_lib_ispc_fast", (int)Globals::Math_ISPCFast, module, symbolTable); - lDefineConstantInt("__math_lib_svml", (int)Globals::Math_SVML, module, - symbolTable); lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module, symbolTable); lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index d9e0322b..8fb2e427 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -134,23 +134,6 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly always ret <16 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; svml - -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones 4x with our 16-wide -; vectors... - -declare <16 x float> @__svml_sin(<16 x float>) -declare <16 x float> @__svml_cos(<16 x float>) -declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) -declare <16 x float> @__svml_tan(<16 x float>) -declare <16 x float> @__svml_atan(<16 x float>) -declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) -declare <16 x float> @__svml_exp(<16 x float>) -declare <16 x float> @__svml_log(<16 x float>) -declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index 90e2f3ac..adaed9ba 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -134,23 +134,6 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; svml - -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones twice with our 8-wide -; vectors... 
- -declare <8 x float> @__svml_sin(<8 x float>) -declare <8 x float> @__svml_cos(<8 x float>) -declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) -declare <8 x float> @__svml_tan(<8 x float>) -declare <8 x float> @__svml_atan(<8 x float>) -declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) -declare <8 x float> @__svml_exp(<8 x float>) -declare <8 x float> @__svml_log(<8 x float>) -declare <8 x float> @__svml_pow(<8 x float>, <8 x float>) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 3dec76b0..238de444 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -647,104 +647,6 @@ define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alw } - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0) - ;ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm.sin.f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - unary1to1(float,@llvm.sin.f32) - -} - -define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0) - ;ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm.cos.f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - unary1to1(float, @llvm.cos.f32) - -} - -define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline { -; %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0) -; store <1 x float> %s, <1 x float> * %1 -; ret void - %sin = call <1 x float> @__svml_sin (<1 x float> %0) - %cos = call <1 x float> @__svml_cos (<1 x float> %0) - store <1 x float> %sin, <1 x float> * %1 - store <1 x float> %cos, <1 x float> * %2 - ret void -} - -define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0) - ;ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm_tan_f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - ;unasry1to1(float, @llvm.tan.f32) - ; UNSUPPORTED! - ret <1 x float > %0 -} - -define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline { -; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0) -; ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm_atan_f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - ;unsary1to1(float,@llvm.atan.f32) - ;UNSUPPORTED! - ret <1 x float > %0 - -} - -define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1) - ;ret <1 x float> %ret - ;%y = extractelement <1 x float> %0, i32 0 - ;%x = extractelement <1 x float> %1, i32 0 - ;%q = fdiv float %y, %x - ;%a = call float @llvm.atan.f32 (float %q) - ;%rv = insertelement <1 x float> undef, float %a, i32 0 - ;ret <1 x float> %rv - ; UNSUPPORTED! 
- ret <1 x float > %0 -} - -define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_expf4(<1 x float> %0) - ;ret <1 x float> %ret - unary1to1(float, @llvm.exp.f32) -} - -define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_logf4(<1 x float> %0) - ;ret <1 x float> %ret - unary1to1(float, @llvm.log.f32) -} - -define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1) - ;ret <1 x float> %ret - %r = extractelement <1 x float> %0, i32 0 - %e = extractelement <1 x float> %1, i32 0 - %s = call float @llvm.pow.f32(float %r,float %e) - %rv = insertelement <1 x float> undef, float %s, i32 0 - ret <1 x float> %rv - -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 76d1faf3..b581e0a7 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -202,22 +202,6 @@ declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone declare i32 @__count_leading_zeros_i32(i32) nounwind readnone declare i64 @__count_leading_zeros_i64(i64) nounwind readnone -;; svml - -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones twice with our 8-wide -; vectors... - -declare @__svml_sin() -declare @__svml_cos() -declare void @__svml_sincos(, *, *) -declare @__svml_tan() -declare @__svml_atan() -declare @__svml_atan2(, ) -declare @__svml_exp() -declare @__svml_log() -declare @__svml_pow(, ) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions diff --git a/builtins/target-neon-common.ll b/builtins/target-neon-common.ll index 696b0748..f892a0a1 100644 --- a/builtins/target-neon-common.ll +++ b/builtins/target-neon-common.ll @@ -313,19 +313,6 @@ define void @__masked_store_blend_i64(* nocapture %ptr, ret void } -;; yuck. We need declarations of these, even though we shouldnt ever -;; actually generate calls to them for the NEON target... 
- -declare @__svml_sin() -declare @__svml_cos() -declare void @__svml_sincos(, *, *) -declare @__svml_tan() -declare @__svml_atan() -declare @__svml_atan2(, ) -declare @__svml_exp() -declare @__svml_log() -declare @__svml_pow(, ) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index da22a66c..057ea98f 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -102,92 +102,6 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_sinf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_cosf4, %0) - ret <8 x float> %ret -} - -define void @__svml_sincos(<8 x float>, <8 x float> *, - <8 x float> *) nounwind readnone alwaysinline { - ; call svml_sincosf4 two times with the two 4-wide sub-vectors - %a = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - %b = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - - %cospa = alloca <4 x float> - %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) - - %cospb = alloca <4 x float> - %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) - - %sin = shufflevector <4 x float> %sa, <4 x float> %sb, - <8 x i32> - store <8 x float> %sin, <8 x float> * %1 - - %cosa = load <4 x float> * %cospa - %cosb = load <4 x float> * %cospb - %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, - <8 x i32> - store <8 x float> %cos, <8 x float> * %2 - - ret void -} - -define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_tanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_atanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan2(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_atan2f4, %0, %1) - ret <8 x float> %ret -} - -define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_expf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_logf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_pow(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_powf4, %0, %1) - ret <8 x float> %ret -} - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git 
a/builtins/target-sse2.ll b/builtins/target-sse2.ll index a6b206b6..e0a5c3d5 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -493,66 +493,6 @@ define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysin ret <4 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) - ret <4 x float> %ret -} - -define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { - %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) - store <4 x float> %s, <4 x float> * %1 - ret void -} - -define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - -define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_expf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_logf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index d1563988..50f0848d 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -205,21 +205,6 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r ret <8 x double> %ret } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; svml - -; FIXME - -declare <8 x float> @__svml_sin(<8 x float>) -declare <8 x float> @__svml_cos(<8 x float>) -declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) -declare <8 x float> @__svml_tan(<8 x float>) -declare <8 x float> @__svml_atan(<8 x float>) -declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) -declare <8 x float> @__svml_exp(<8 x float>) -declare <8 x float> @__svml_log(<8 x float>) -declare <8 x float> @__svml_pow(<8 x float>, <8 x 
float>) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index 85b7bbe7..7fa9075b 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -217,21 +217,6 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin ret <16 x double> %ret } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; svml - -; FIXME - -declare <16 x float> @__svml_sin(<16 x float>) -declare <16 x float> @__svml_cos(<16 x float>) -declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) -declare <16 x float> @__svml_tan(<16 x float>) -declare <16 x float> @__svml_atan(<16 x float>) -declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) -declare <16 x float> @__svml_exp(<16 x float>) -declare <16 x float> @__svml_log(<16 x float>) -declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index e2debbc2..4a447ba6 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -102,92 +102,6 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_sinf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_cosf4, %0) - ret <8 x float> %ret -} - -define void @__svml_sincos(<8 x float>, <8 x float> *, - <8 x float> *) nounwind readnone alwaysinline { - ; call svml_sincosf4 two times with the two 4-wide sub-vectors - %a = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - %b = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - - %cospa = alloca <4 x float> - %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) - - %cospb = alloca <4 x float> - %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) - - %sin = shufflevector <4 x float> %sa, <4 x float> %sb, - <8 x i32> - store <8 x float> %sin, <8 x float> * %1 - - %cosa = load <4 x float> * %cospa - %cosb = load <4 x float> * %cospb - %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, - <8 x i32> - store <8 x float> %cos, <8 x float> * %2 - - ret void -} - -define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_tanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_atanf4, 
%0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan2(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_atan2f4, %0, %1) - ret <8 x float> %ret -} - -define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_expf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_logf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_pow(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_powf4, %0, %1) - ret <8 x float> %ret -} - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 98a7ef69..7f9a9185 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -206,66 +206,6 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r ret <4 x double> %ret } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) - ret <4 x float> %ret -} - -define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { - %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) - store <4 x float> %s, <4 x float> * %1 - ret void -} - -define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - -define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_expf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_logf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/docs/ispc.rst b/docs/ispc.rst index 
39d3a5c8..af59714a 100755 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -3278,9 +3278,6 @@ for this argument. approximately 1.45e-6 over the range -10pi to 10pi.) * ``fast``: more efficient but lower accuracy versions of the default ``ispc`` implementations. -* ``svml``: use Intel "Short Vector Math Library". Use - ``icc`` to link your final executable so that the appropriate libraries - are linked. * ``system``: use the system's math library. On many systems, these functions are more accurate than both of ``ispc``'s implementations. Using these functions may be quite diff --git a/ispc.h b/ispc.h index d68f9034..8653553e 100644 --- a/ispc.h +++ b/ispc.h @@ -468,7 +468,7 @@ struct Globals { /** There are a number of math libraries that can be used for transcendentals and the like during program compilation. */ - enum MathLib { Math_ISPC, Math_ISPCFast, Math_SVML, Math_System }; + enum MathLib { Math_ISPC, Math_ISPCFast, Math_System }; MathLib mathLib; /** Records whether the ispc standard library should be made available diff --git a/main.cpp b/main.cpp index 4c4b4575..c21e7f88 100644 --- a/main.cpp +++ b/main.cpp @@ -107,7 +107,6 @@ usage(int ret) { printf(" [--math-lib= - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp - $(Configuration)/gen-bitcode-avx1-32bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx1-32bit.cpp - - - - - Document - m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > $(Configuration)/gen-bitcode-avx1-64bit.cpp - $(Configuration)/gen-bitcode-avx1-64bit.cpp - builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll - Building gen-bitcode-avx1-64bit.cpp - - -======= Document @@ -222,7 +203,6 @@ Building gen-bitcode-avx1-64bit.cpp ->>>>>>> master Document From ed017c42f1933ea1c57242f52cecb45507d9e324 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Sun, 11 Aug 2013 07:47:20 -0700 Subject: [PATCH 28/34] Fix ispc.vcxproj for Windows builds --- ispc.vcxproj | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/ispc.vcxproj b/ispc.vcxproj index 36fbad5d..74186ac0 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -53,8 +53,10 @@ - - + + + + 4146;4800;4996;4355;4624;4005;4003;4018 @@ -97,11 +99,13 @@ Document - %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py x86 > $(Configuration)/gen-stdlib-x86.cpp; -%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DISPC_TARGET_GENERIC=1 -DPI=3.1415926535 | python stdlib2cpp.py generic > $(Configuration)/gen-stdlib-generic.cpp; + %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask1 > $(Configuration)/gen-stdlib-mask1.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask8 > $(Configuration)/gen-stdlib-mask8.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask16 > $(Configuration)/gen-stdlib-mask16.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask32 > $(Configuration)/gen-stdlib-mask32.cpp; - 
$(Configuration)/gen-stdlib-generic.cpp;$(Configuration)/gen-stdlib-x86.cpp - Building gen-stdlib-{generic,x86}.cpp + $(Configuration)/gen-stdlib-mask1.cpp;$(Configuration)/gen-stdlib-mask8.cpp;$(Configuration)/gen-stdlib-mask16.cpp;$(Configuration)/gen-stdlib-mask32.cpp + Building gen-stdlib-{mask1,8,16,32}.cpp From 42f31aed6901f131cf20eb7606db498f43192012 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 14 Aug 2013 11:02:45 -0700 Subject: [PATCH 29/34] Another attempt at fixing the Windows build (added sse4-8/sse4-16 targets). --- ispc.vcxproj | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/ispc.vcxproj b/ispc.vcxproj index 74186ac0..b4a8b764 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -51,6 +51,10 @@ + + + + @@ -135,6 +139,42 @@ Building gen-bitcode-sse4-64bit.cpp + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 32bit > $(Configuration)/gen-bitcode-sse4-8-32bit.cpp + $(Configuration)/gen-bitcode-sse4-8-32bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-8-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 64bit > $(Configuration)/gen-bitcode-sse4-8-64bit.cpp + $(Configuration)/gen-bitcode-sse4-8-64bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-8-64bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 32bit > $(Configuration)/gen-bitcode-sse4-16-32bit.cpp + $(Configuration)/gen-bitcode-sse4-16-32bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-16-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 64bit > $(Configuration)/gen-bitcode-sse4-16-64bit.cpp + $(Configuration)/gen-bitcode-sse4-16-64bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-16-64bit.cpp + + Document From d976da7559089fa9bdc033ad764c73793ad34598 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 20 Aug 2013 08:49:51 -0700 Subject: [PATCH 30/34] Speed up idiv test (dont test int32 as thoroughly) --- tests/idiv.ispc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/idiv.ispc b/tests/idiv.ispc index b7bd78dc..8738740b 100644 --- a/tests/idiv.ispc +++ b/tests/idiv.ispc @@ -44,7 +44,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { // randomly sample int32s... uniform RNGState state; seed_rng(&state, 1234); - for (uniform int i = 0; i < 1M; ++i) { + for (uniform int i = 0; i < 64k; ++i) { unsigned int32 num = random(&state); for (uniform unsigned int32 div = 2; div < 256; ++div) { if (__fast_idiv(num, div) != num/div) { @@ -54,7 +54,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { } } - for (uniform int64 i = 0; i < 1M; ++i) { + for (uniform int64 i = 0; i < 64k; ++i) { int32 num = random(&state); if (num < 0) continue; From e7f067d70cf03415fc350272daf0506b7184fa84 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 20 Aug 2013 09:04:52 -0700 Subject: [PATCH 31/34] Fix handling of __clock() builtin for "generic" targets. 
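For the generic targets, code generation goes through the C++ backend, which did not handle the llvm.readcyclecounter intrinsic. The backend now lowers that intrinsic to a call to __clock(), and each of the intrinsics headers used with these targets (generic-16/32/64.h, knc.h, knc2x.h, sse4.h) gains a matching implementation: __rdtsc() on Windows, and otherwise a small inline-asm routine that issues cpuid first (to serialize the instruction stream so the timestamp read can't be reordered ahead of earlier work) followed by rdtsc. Roughly (the x86-64 variant is shown; the 32-bit build differs only in the cpuid clobber list):

    static FORCEINLINE uint64_t __clock() {
        uint32_t low, high;
        // cpuid is used here purely as a serializing instruction
        __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid"
                              ::: "%rax", "%rbx", "%rcx", "%rdx");
        __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high));
        return (uint64_t)high << 32 | low;
    }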
--- cbackend.cpp | 4 ++++ examples/intrinsics/generic-16.h | 20 ++++++++++++++++++++ examples/intrinsics/generic-32.h | 20 ++++++++++++++++++++ examples/intrinsics/generic-64.h | 20 ++++++++++++++++++++ examples/intrinsics/knc.h | 21 ++++++++++++++++++--- examples/intrinsics/knc2x.h | 19 ++++++++++++++++++- examples/intrinsics/sse4.h | 20 ++++++++++++++++++-- 7 files changed, 118 insertions(+), 6 deletions(-) diff --git a/cbackend.cpp b/cbackend.cpp index d23bcc20..7d4b4cfc 100644 --- a/cbackend.cpp +++ b/cbackend.cpp @@ -3704,6 +3704,7 @@ void CWriter::lowerIntrinsics(llvm::Function &F) { case llvm::Intrinsic::sadd_with_overflow: case llvm::Intrinsic::trap: case llvm::Intrinsic::objectsize: + case llvm::Intrinsic::readcyclecounter: // We directly implement these intrinsics break; default: @@ -4056,6 +4057,9 @@ bool CWriter::visitBuiltinCall(llvm::CallInst &I, llvm::Intrinsic::ID ID, return true; case llvm::Intrinsic::objectsize: return true; + case llvm::Intrinsic::readcyclecounter: + Out << "__clock()"; + return true; } } diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index 6d4fe1f4..d81101f7 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1759,3 +1759,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} + +#endif // !WIN32 + diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 12c4f84e..7e6c69d4 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -1827,3 +1827,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // WIN32 + +#undef FORCEINLINE diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index a3648f42..39124186 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -1960,3 +1960,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif + +#undef FORCEINLINE diff --git 
a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index 41c4cbc0..8baef8cb 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -2121,9 +2121,24 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, #endif } +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE #undef PRE_ALIGN #undef POST_ALIGN - - - diff --git a/examples/intrinsics/knc2x.h b/examples/intrinsics/knc2x.h index 5b6e5295..a1b1fc9d 100644 --- a/examples/intrinsics/knc2x.h +++ b/examples/intrinsics/knc2x.h @@ -2055,7 +2055,24 @@ static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec32_f *out0, __vec32 } */ +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE #undef PRE_ALIGN #undef POST_ALIGN - diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index 44dedf33..ff00d920 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -4000,6 +4000,22 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, #endif } +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE - - From 2b2905b567fec1725beff5064d6b0ffe21d93c38 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 20 Aug 2013 09:05:50 -0700 Subject: [PATCH 32/34] Fix (preexisting) bugs in generic-32/64.h with type of "__any", etc. This should be a bool, not a one-wide vector of bools. The equivalent fix was previously made in generic-16.h, but not made here. (Note that many tests are still failing with these targets, but at least they compile properly now.) 
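For reference, the corrected reductions in generic-32.h now read (generic-64.h is analogous, with the 64-wide mask type and a ull-suffixed all-ones constant):

    static FORCEINLINE bool __any(__vec32_i1 mask)  { return (mask.v != 0); }
    static FORCEINLINE bool __all(__vec32_i1 mask)  { return (mask.v == 0xFFFFFFFFul); }
    static FORCEINLINE bool __none(__vec32_i1 mask) { return (mask.v == 0); }

The stray int8/int16 REDUCE_ADD instantiations in these two headers, which still referenced the 16-wide vector types, are commented out as well.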
--- examples/intrinsics/generic-32.h | 12 ++++++------ examples/intrinsics/generic-64.h | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 7e6c69d4..531ed215 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -408,15 +408,15 @@ static FORCEINLINE uint64_t __movmsk(__vec32_i1 mask) { return (uint64_t)mask.v; } -static FORCEINLINE __vec32_i1 __any(__vec32_i1 mask) { +static FORCEINLINE bool __any(__vec32_i1 mask) { return (mask.v!=0); } -static FORCEINLINE __vec32_i1 __all(__vec32_i1 mask) { - return (mask.v==0xFFFFFFFF); +static FORCEINLINE bool __all(__vec32_i1 mask) { + return (mask.v==0xFFFFFFFFul); } -static FORCEINLINE __vec32_i1 __none(__vec32_i1 mask) { +static FORCEINLINE bool __none(__vec32_i1 mask) { return (mask.v==0); } @@ -1231,8 +1231,8 @@ REDUCE_ADD(double, __vec32_d, __reduce_add_double) REDUCE_MINMAX(double, __vec32_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec32_d, __reduce_max_double, >) -REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) -REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) +//REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +//REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) REDUCE_ADD(int64_t, __vec32_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_min_int32, <) diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index 39124186..bbeb007a 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -533,15 +533,15 @@ static FORCEINLINE uint64_t __movmsk(__vec64_i1 mask) { return (uint64_t)mask.v; } -static FORCEINLINE __vec64_i1 __any(__vec64_i1 mask) { +static FORCEINLINE bool __any(__vec64_i1 mask) { return (mask.v!=0); } -static FORCEINLINE __vec64_i1 __all(__vec64_i1 mask) { - return (mask.v==0xFFFFFFFFFFFFFFFF); +static FORCEINLINE bool __all(__vec64_i1 mask) { + return (mask.v==0xFFFFFFFFFFFFFFFFull); } -static FORCEINLINE __vec64_i1 __none(__vec64_i1 mask) { +static FORCEINLINE bool __none(__vec64_i1 mask) { return (mask.v==0); } @@ -1364,8 +1364,8 @@ REDUCE_ADD(double, __vec64_d, __reduce_add_double) REDUCE_MINMAX(double, __vec64_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec64_d, __reduce_max_double, >) -REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) -REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) +//REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +//REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) REDUCE_ADD(int64_t, __vec64_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec64_i32, __reduce_min_int32, <) From 502f8fd76b9cf88cd260106b546494c1facc28b4 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Tue, 20 Aug 2013 09:22:09 -0700 Subject: [PATCH 33/34] Reduce debug spew on failing idiv.ispc tests --- tests/idiv.ispc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/idiv.ispc b/tests/idiv.ispc index 8738740b..bd0766da 100644 --- a/tests/idiv.ispc +++ b/tests/idiv.ispc @@ -4,12 +4,13 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { uniform int errorCount = 0; - + for (unsigned int8 num = 0; num < 255; ++num) { for (uniform unsigned int8 div = 2; div < 255; ++div) { if (__fast_idiv(num, div) != num/div) { ++errorCount; print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 32) break; } } } @@ -19,6 +20,7 @@ export void f_f(uniform float RET[], uniform float 
aFOO[]) { if (__fast_idiv(num, div) != num/div) { ++errorCount; print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 64) break; } } } @@ -28,6 +30,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { if (__fast_idiv(num, div) != num/div) { ++errorCount; print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 96) break; } } } @@ -37,6 +40,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { if (__fast_idiv(num, div) != num/div) { ++errorCount; print("error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 128) break; } } } @@ -50,6 +54,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { if (__fast_idiv(num, div) != num/div) { ++errorCount; print("ui32 error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 160) break; } } } @@ -62,6 +67,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { if (__fast_idiv(num, div) != num/div) { ++errorCount; print("si32 error %/% = %, got %\n", num, div, num/div, __fast_idiv(num,div)); + if (errorCount > 192) break; } } } From 611477e214f19e89657cd85252bb44e801573240 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 22 Aug 2013 07:50:25 -0700 Subject: [PATCH 34/34] Revert change to lEmitVaryingSelect(). Using vector select versus a store and masked load for varying vector selects seems to give worse code. This may be related to http://llvm.org/bugs/show_bug.cgi?id=16941. --- expr.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/expr.cpp b/expr.cpp index 856d363c..614cb5e5 100644 --- a/expr.cpp +++ b/expr.cpp @@ -3123,7 +3123,10 @@ static llvm::Value * lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test, llvm::Value *expr1, llvm::Value *expr2, const Type *type) { -#if !defined(LLVM_3_1) +#if 0 // !defined(LLVM_3_1) + // Though it should be equivalent, this seems to cause non-trivial + // performance regressions versus the below. This may be related to + // http://llvm.org/bugs/show_bug.cgi?id=16941. if (test->getType() != LLVMTypes::Int1VectorType) test = ctx->TruncInst(test, LLVMTypes::Int1VectorType); return ctx->SelectInst(test, expr1, expr2, "select");