diff --git a/cbackend.cpp b/cbackend.cpp index 25ea0f76..f409d88f 100644 --- a/cbackend.cpp +++ b/cbackend.cpp @@ -3200,11 +3200,37 @@ lPredicateToString(llvm::CmpInst::Predicate p) { } +static const char * +lTypeToSuffix(llvm::Type *t) { + llvm::VectorType *vt = llvm::dyn_cast<llvm::VectorType>(t); + Assert(vt != NULL); + t = vt->getElementType(); + + switch (t->getTypeID()) { + case llvm::Type::FloatTyID: return "float"; + case llvm::Type::DoubleTyID: return "double"; + case llvm::Type::IntegerTyID: { + switch (llvm::cast<llvm::IntegerType>(t)->getBitWidth()) { + case 1: return "i1"; + case 8: return "i8"; + case 16: return "i16"; + case 32: return "i32"; + case 64: return "i64"; + } + } + default: llvm_unreachable(0); return NULL; + } + return NULL; +} + + void CWriter::visitICmpInst(llvm::ICmpInst &I) { bool isVector = llvm::isa<llvm::VectorType>(I.getOperand(0)->getType()); if (isVector) { Out << lPredicateToString(I.getPredicate()); + Out << "_"; + Out << lTypeToSuffix(I.getOperand(0)->getType()); Out << "("; writeOperand(I.getOperand(0)); Out << ", "; @@ -3270,6 +3296,8 @@ void CWriter::visitFCmpInst(llvm::FCmpInst &I) { if (isVector) { Out << lPredicateToString(I.getPredicate()); + Out << "_"; + Out << lTypeToSuffix(I.getOperand(0)->getType()); Out << "("; } else { diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index 6f4e5ed9..bb5b0dc3 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -193,8 +193,8 @@ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ return ret; \ } -#define CMP_OP(TYPE, CAST, NAME, OP) \ -static FORCEINLINE __vec16_i1 NAME(TYPE a, TYPE b) { \ +#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP) \ +static FORCEINLINE __vec16_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \ __vec16_i1 ret; \ ret.v = 0; \ for (int i = 0; i < 16; ++i) \ @@ -317,7 +317,7 @@ static FORCEINLINE uint64_t __movmsk(__vec16_i1 mask) { return (uint64_t)mask.v; } -static FORCEINLINE __vec16_i1 __equal(__vec16_i1 a, __vec16_i1 b) { +static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { __vec16_i1 r; r.v = (a.v & b.v) | (~a.v & ~b.v); return r; } @@ -424,16 +424,16 @@ SHIFT_UNIFORM(__vec16_i8, uint8_t, __lshr, >>) SHIFT_UNIFORM(__vec16_i8, int8_t, __ashr, >>) SHIFT_UNIFORM(__vec16_i8, int8_t, __shl, <<) -CMP_OP(__vec16_i8, int8_t, __equal, ==) -CMP_OP(__vec16_i8, int8_t, __not_equal, !=) -CMP_OP(__vec16_i8, uint8_t, __unsigned_less_equal, <=) -CMP_OP(__vec16_i8, int8_t, __signed_less_equal, <=) -CMP_OP(__vec16_i8, uint8_t, __unsigned_greater_equal, >=) -CMP_OP(__vec16_i8, int8_t, __signed_greater_equal, >=) -CMP_OP(__vec16_i8, uint8_t, __unsigned_less_than, <) -CMP_OP(__vec16_i8, int8_t, __signed_less_than, <) -CMP_OP(__vec16_i8, uint8_t, __unsigned_greater_than, >) -CMP_OP(__vec16_i8, int8_t, __signed_greater_than, >) +CMP_OP(__vec16_i8, i8, int8_t, __equal, ==) +CMP_OP(__vec16_i8, i8, int8_t, __not_equal, !=) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i8, i8, int8_t, __signed_less_equal, <=) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_than, <) +CMP_OP(__vec16_i8, i8, int8_t, __signed_less_than, <) +CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_than, >) SELECT(__vec16_i8) INSERT_EXTRACT(__vec16_i8, int8_t) @@ -467,16 +467,16 @@ SHIFT_UNIFORM(__vec16_i16, uint16_t, __lshr, >>) SHIFT_UNIFORM(__vec16_i16, int16_t, __ashr, >>) SHIFT_UNIFORM(__vec16_i16, int16_t,
__shl, <<) -CMP_OP(__vec16_i16, int16_t, __equal, ==) -CMP_OP(__vec16_i16, int16_t, __not_equal, !=) -CMP_OP(__vec16_i16, uint16_t, __unsigned_less_equal, <=) -CMP_OP(__vec16_i16, int16_t, __signed_less_equal, <=) -CMP_OP(__vec16_i16, uint16_t, __unsigned_greater_equal, >=) -CMP_OP(__vec16_i16, int16_t, __signed_greater_equal, >=) -CMP_OP(__vec16_i16, uint16_t, __unsigned_less_than, <) -CMP_OP(__vec16_i16, int16_t, __signed_less_than, <) -CMP_OP(__vec16_i16, uint16_t, __unsigned_greater_than, >) -CMP_OP(__vec16_i16, int16_t, __signed_greater_than, >) +CMP_OP(__vec16_i16, i16, int16_t, __equal, ==) +CMP_OP(__vec16_i16, i16, int16_t, __not_equal, !=) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i16, i16, int16_t, __signed_less_equal, <=) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_than, <) +CMP_OP(__vec16_i16, i16, int16_t, __signed_less_than, <) +CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_than, >) SELECT(__vec16_i16) INSERT_EXTRACT(__vec16_i16, int16_t) @@ -510,16 +510,16 @@ SHIFT_UNIFORM(__vec16_i32, uint32_t, __lshr, >>) SHIFT_UNIFORM(__vec16_i32, int32_t, __ashr, >>) SHIFT_UNIFORM(__vec16_i32, int32_t, __shl, <<) -CMP_OP(__vec16_i32, int32_t, __equal, ==) -CMP_OP(__vec16_i32, int32_t, __not_equal, !=) -CMP_OP(__vec16_i32, uint32_t, __unsigned_less_equal, <=) -CMP_OP(__vec16_i32, int32_t, __signed_less_equal, <=) -CMP_OP(__vec16_i32, uint32_t, __unsigned_greater_equal, >=) -CMP_OP(__vec16_i32, int32_t, __signed_greater_equal, >=) -CMP_OP(__vec16_i32, uint32_t, __unsigned_less_than, <) -CMP_OP(__vec16_i32, int32_t, __signed_less_than, <) -CMP_OP(__vec16_i32, uint32_t, __unsigned_greater_than, >) -CMP_OP(__vec16_i32, int32_t, __signed_greater_than, >) +CMP_OP(__vec16_i32, i32, int32_t, __equal, ==) +CMP_OP(__vec16_i32, i32, int32_t, __not_equal, !=) +CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i32, i32, int32_t, __signed_less_equal, <=) +CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_than, <) +CMP_OP(__vec16_i32, i32, int32_t, __signed_less_than, <) +CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_than, >) SELECT(__vec16_i32) INSERT_EXTRACT(__vec16_i32, int32_t) @@ -553,16 +553,16 @@ SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>) SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>) SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<) -CMP_OP(__vec16_i64, int64_t, __equal, ==) -CMP_OP(__vec16_i64, int64_t, __not_equal, !=) -CMP_OP(__vec16_i64, uint64_t, __unsigned_less_equal, <=) -CMP_OP(__vec16_i64, int64_t, __signed_less_equal, <=) -CMP_OP(__vec16_i64, uint64_t, __unsigned_greater_equal, >=) -CMP_OP(__vec16_i64, int64_t, __signed_greater_equal, >=) -CMP_OP(__vec16_i64, uint64_t, __unsigned_less_than, <) -CMP_OP(__vec16_i64, int64_t, __signed_less_than, <) -CMP_OP(__vec16_i64, uint64_t, __unsigned_greater_than, >) -CMP_OP(__vec16_i64, int64_t, __signed_greater_than, >) +CMP_OP(__vec16_i64, i64, int64_t, __equal, ==) +CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=) +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i64, i64, int64_t, __signed_less_equal, <=) +CMP_OP(__vec16_i64, i64, uint64_t, 
__unsigned_greater_equal, >=) +CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_than, <) +CMP_OP(__vec16_i64, i64, int64_t, __signed_less_than, <) +CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >) SELECT(__vec16_i64) INSERT_EXTRACT(__vec16_i64, int64_t) @@ -580,12 +580,12 @@ BINARY_OP(__vec16_f, __sub, -) BINARY_OP(__vec16_f, __mul, *) BINARY_OP(__vec16_f, __div, /) -CMP_OP(__vec16_f, float, __equal, ==) -CMP_OP(__vec16_f, float, __not_equal, !=) -CMP_OP(__vec16_f, float, __less_than, <) -CMP_OP(__vec16_f, float, __less_equal, <=) -CMP_OP(__vec16_f, float, __greater_than, >) -CMP_OP(__vec16_f, float, __greater_equal, >=) +CMP_OP(__vec16_f, float, float, __equal, ==) +CMP_OP(__vec16_f, float, float, __not_equal, !=) +CMP_OP(__vec16_f, float, float, __less_than, <) +CMP_OP(__vec16_f, float, float, __less_equal, <=) +CMP_OP(__vec16_f, float, float, __greater_than, >) +CMP_OP(__vec16_f, float, float, __greater_equal, >=) static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) { __vec16_i1 ret; @@ -730,12 +730,12 @@ BINARY_OP(__vec16_d, __sub, -) BINARY_OP(__vec16_d, __mul, *) BINARY_OP(__vec16_d, __div, /) -CMP_OP(__vec16_d, double, __equal, ==) -CMP_OP(__vec16_d, double, __not_equal, !=) -CMP_OP(__vec16_d, double, __less_than, <) -CMP_OP(__vec16_d, double, __less_equal, <=) -CMP_OP(__vec16_d, double, __greater_than, >) -CMP_OP(__vec16_d, double, __greater_equal, >=) +CMP_OP(__vec16_d, double, double, __equal, ==) +CMP_OP(__vec16_d, double, double, __not_equal, !=) +CMP_OP(__vec16_d, double, double, __less_than, <) +CMP_OP(__vec16_d, double, double, __less_equal, <=) +CMP_OP(__vec16_d, double, double, __greater_than, >) +CMP_OP(__vec16_d, double, double, __greater_equal, >=) static FORCEINLINE __vec16_i1 __ordered(__vec16_d a, __vec16_d b) { __vec16_i1 ret; diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 86a1ea9b..b593ac87 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -258,8 +258,8 @@ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ return ret; \ } -#define CMP_OP(TYPE, CAST, NAME, OP) \ -static FORCEINLINE __vec32_i1 NAME(TYPE a, TYPE b) { \ +#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP) \ +static FORCEINLINE __vec32_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \ __vec32_i1 ret; \ ret.v = 0; \ for (int i = 0; i < 32; ++i) \ @@ -382,7 +382,7 @@ static FORCEINLINE uint64_t __movmsk(__vec32_i1 mask) { return (uint64_t)mask.v; } -static FORCEINLINE __vec32_i1 __equal(__vec32_i1 a, __vec32_i1 b) { +static FORCEINLINE __vec32_i1 __equal_i1(__vec32_i1 a, __vec32_i1 b) { __vec32_i1 r; r.v = (a.v & b.v) | (~a.v & ~b.v); return r; @@ -491,16 +491,16 @@ SHIFT_UNIFORM(__vec32_i8, uint8_t, __lshr, >>) SHIFT_UNIFORM(__vec32_i8, int8_t, __ashr, >>) SHIFT_UNIFORM(__vec32_i8, int8_t, __shl, <<) -CMP_OP(__vec32_i8, int8_t, __equal, ==) -CMP_OP(__vec32_i8, int8_t, __not_equal, !=) -CMP_OP(__vec32_i8, uint8_t, __unsigned_less_equal, <=) -CMP_OP(__vec32_i8, int8_t, __signed_less_equal, <=) -CMP_OP(__vec32_i8, uint8_t, __unsigned_greater_equal, >=) -CMP_OP(__vec32_i8, int8_t, __signed_greater_equal, >=) -CMP_OP(__vec32_i8, uint8_t, __unsigned_less_than, <) -CMP_OP(__vec32_i8, int8_t, __signed_less_than, <) -CMP_OP(__vec32_i8, uint8_t, __unsigned_greater_than, >) -CMP_OP(__vec32_i8, int8_t, __signed_greater_than, >) +CMP_OP(__vec32_i8, i8, int8_t, __equal, ==) +CMP_OP(__vec32_i8, i8, int8_t, 
__not_equal, !=) +CMP_OP(__vec32_i8, i8, uint8_t, __unsigned_less_equal, <=) +CMP_OP(__vec32_i8, i8, int8_t, __signed_less_equal, <=) +CMP_OP(__vec32_i8, i8, uint8_t, __unsigned_greater_equal, >=) +CMP_OP(__vec32_i8, i8, int8_t, __signed_greater_equal, >=) +CMP_OP(__vec32_i8, i8, uint8_t, __unsigned_less_than, <) +CMP_OP(__vec32_i8, i8, int8_t, __signed_less_than, <) +CMP_OP(__vec32_i8, i8, uint8_t, __unsigned_greater_than, >) +CMP_OP(__vec32_i8, i8, int8_t, __signed_greater_than, >) SELECT(__vec32_i8) INSERT_EXTRACT(__vec32_i8, int8_t) @@ -534,16 +534,16 @@ SHIFT_UNIFORM(__vec32_i16, uint16_t, __lshr, >>) SHIFT_UNIFORM(__vec32_i16, int16_t, __ashr, >>) SHIFT_UNIFORM(__vec32_i16, int16_t, __shl, <<) -CMP_OP(__vec32_i16, int16_t, __equal, ==) -CMP_OP(__vec32_i16, int16_t, __not_equal, !=) -CMP_OP(__vec32_i16, uint16_t, __unsigned_less_equal, <=) -CMP_OP(__vec32_i16, int16_t, __signed_less_equal, <=) -CMP_OP(__vec32_i16, uint16_t, __unsigned_greater_equal, >=) -CMP_OP(__vec32_i16, int16_t, __signed_greater_equal, >=) -CMP_OP(__vec32_i16, uint16_t, __unsigned_less_than, <) -CMP_OP(__vec32_i16, int16_t, __signed_less_than, <) -CMP_OP(__vec32_i16, uint16_t, __unsigned_greater_than, >) -CMP_OP(__vec32_i16, int16_t, __signed_greater_than, >) +CMP_OP(__vec32_i16, i16, int16_t, __equal, ==) +CMP_OP(__vec32_i16, i16, int16_t, __not_equal, !=) +CMP_OP(__vec32_i16, i16, uint16_t, __unsigned_less_equal, <=) +CMP_OP(__vec32_i16, i16, int16_t, __signed_less_equal, <=) +CMP_OP(__vec32_i16, i16, uint16_t, __unsigned_greater_equal, >=) +CMP_OP(__vec32_i16, i16, int16_t, __signed_greater_equal, >=) +CMP_OP(__vec32_i16, i16, uint16_t, __unsigned_less_than, <) +CMP_OP(__vec32_i16, i16, int16_t, __signed_less_than, <) +CMP_OP(__vec32_i16, i16, uint16_t, __unsigned_greater_than, >) +CMP_OP(__vec32_i16, i16, int16_t, __signed_greater_than, >) SELECT(__vec32_i16) INSERT_EXTRACT(__vec32_i16, int16_t) @@ -577,16 +577,16 @@ SHIFT_UNIFORM(__vec32_i32, uint32_t, __lshr, >>) SHIFT_UNIFORM(__vec32_i32, int32_t, __ashr, >>) SHIFT_UNIFORM(__vec32_i32, int32_t, __shl, <<) -CMP_OP(__vec32_i32, int32_t, __equal, ==) -CMP_OP(__vec32_i32, int32_t, __not_equal, !=) -CMP_OP(__vec32_i32, uint32_t, __unsigned_less_equal, <=) -CMP_OP(__vec32_i32, int32_t, __signed_less_equal, <=) -CMP_OP(__vec32_i32, uint32_t, __unsigned_greater_equal, >=) -CMP_OP(__vec32_i32, int32_t, __signed_greater_equal, >=) -CMP_OP(__vec32_i32, uint32_t, __unsigned_less_than, <) -CMP_OP(__vec32_i32, int32_t, __signed_less_than, <) -CMP_OP(__vec32_i32, uint32_t, __unsigned_greater_than, >) -CMP_OP(__vec32_i32, int32_t, __signed_greater_than, >) +CMP_OP(__vec32_i32, i32, int32_t, __equal, ==) +CMP_OP(__vec32_i32, i32, int32_t, __not_equal, !=) +CMP_OP(__vec32_i32, i32, uint32_t, __unsigned_less_equal, <=) +CMP_OP(__vec32_i32, i32, int32_t, __signed_less_equal, <=) +CMP_OP(__vec32_i32, i32, uint32_t, __unsigned_greater_equal, >=) +CMP_OP(__vec32_i32, i32, int32_t, __signed_greater_equal, >=) +CMP_OP(__vec32_i32, i32, uint32_t, __unsigned_less_than, <) +CMP_OP(__vec32_i32, i32, int32_t, __signed_less_than, <) +CMP_OP(__vec32_i32, i32, uint32_t, __unsigned_greater_than, >) +CMP_OP(__vec32_i32, i32, int32_t, __signed_greater_than, >) SELECT(__vec32_i32) INSERT_EXTRACT(__vec32_i32, int32_t) @@ -620,16 +620,16 @@ SHIFT_UNIFORM(__vec32_i64, uint64_t, __lshr, >>) SHIFT_UNIFORM(__vec32_i64, int64_t, __ashr, >>) SHIFT_UNIFORM(__vec32_i64, int64_t, __shl, <<) -CMP_OP(__vec32_i64, int64_t, __equal, ==) -CMP_OP(__vec32_i64, int64_t, __not_equal, !=) -CMP_OP(__vec32_i64, 
uint64_t, __unsigned_less_equal, <=) -CMP_OP(__vec32_i64, int64_t, __signed_less_equal, <=) -CMP_OP(__vec32_i64, uint64_t, __unsigned_greater_equal, >=) -CMP_OP(__vec32_i64, int64_t, __signed_greater_equal, >=) -CMP_OP(__vec32_i64, uint64_t, __unsigned_less_than, <) -CMP_OP(__vec32_i64, int64_t, __signed_less_than, <) -CMP_OP(__vec32_i64, uint64_t, __unsigned_greater_than, >) -CMP_OP(__vec32_i64, int64_t, __signed_greater_than, >) +CMP_OP(__vec32_i64, i64, int64_t, __equal, ==) +CMP_OP(__vec32_i64, i64, int64_t, __not_equal, !=) +CMP_OP(__vec32_i64, i64, uint64_t, __unsigned_less_equal, <=) +CMP_OP(__vec32_i64, i64, int64_t, __signed_less_equal, <=) +CMP_OP(__vec32_i64, i64, uint64_t, __unsigned_greater_equal, >=) +CMP_OP(__vec32_i64, i64, int64_t, __signed_greater_equal, >=) +CMP_OP(__vec32_i64, i64, uint64_t, __unsigned_less_than, <) +CMP_OP(__vec32_i64, i64, int64_t, __signed_less_than, <) +CMP_OP(__vec32_i64, i64, uint64_t, __unsigned_greater_than, >) +CMP_OP(__vec32_i64, i64, int64_t, __signed_greater_than, >) SELECT(__vec32_i64) INSERT_EXTRACT(__vec32_i64, int64_t) @@ -647,12 +647,12 @@ BINARY_OP(__vec32_f, __sub, -) BINARY_OP(__vec32_f, __mul, *) BINARY_OP(__vec32_f, __div, /) -CMP_OP(__vec32_f, float, __equal, ==) -CMP_OP(__vec32_f, float, __not_equal, !=) -CMP_OP(__vec32_f, float, __less_than, <) -CMP_OP(__vec32_f, float, __less_equal, <=) -CMP_OP(__vec32_f, float, __greater_than, >) -CMP_OP(__vec32_f, float, __greater_equal, >=) +CMP_OP(__vec32_f, float, float, __equal, ==) +CMP_OP(__vec32_f, float, float, __not_equal, !=) +CMP_OP(__vec32_f, float, float, __less_than, <) +CMP_OP(__vec32_f, float, float, __less_equal, <=) +CMP_OP(__vec32_f, float, float, __greater_than, >) +CMP_OP(__vec32_f, float, float, __greater_equal, >=) static FORCEINLINE __vec32_i1 __ordered(__vec32_f a, __vec32_f b) { __vec32_i1 ret; @@ -797,12 +797,12 @@ BINARY_OP(__vec32_d, __sub, -) BINARY_OP(__vec32_d, __mul, *) BINARY_OP(__vec32_d, __div, /) -CMP_OP(__vec32_d, double, __equal, ==) -CMP_OP(__vec32_d, double, __not_equal, !=) -CMP_OP(__vec32_d, double, __less_than, <) -CMP_OP(__vec32_d, double, __less_equal, <=) -CMP_OP(__vec32_d, double, __greater_than, >) -CMP_OP(__vec32_d, double, __greater_equal, >=) +CMP_OP(__vec32_d, double, double, __equal, ==) +CMP_OP(__vec32_d, double, double, __not_equal, !=) +CMP_OP(__vec32_d, double, double, __less_than, <) +CMP_OP(__vec32_d, double, double, __less_equal, <=) +CMP_OP(__vec32_d, double, double, __greater_than, >) +CMP_OP(__vec32_d, double, double, __greater_equal, >=) static FORCEINLINE __vec32_i1 __ordered(__vec32_d a, __vec32_d b) { __vec32_i1 ret; diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index 59908700..a0456e49 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -383,8 +383,8 @@ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ return ret; \ } -#define CMP_OP(TYPE, CAST, NAME, OP) \ -static FORCEINLINE __vec64_i1 NAME(TYPE a, TYPE b) { \ +#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP) \ +static FORCEINLINE __vec64_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \ __vec64_i1 ret; \ ret.v = 0; \ for (int i = 0; i < 64; ++i) \ @@ -507,7 +507,7 @@ static FORCEINLINE uint64_t __movmsk(__vec64_i1 mask) { return (uint64_t)mask.v; } -static FORCEINLINE __vec64_i1 __equal(__vec64_i1 a, __vec64_i1 b) { +static FORCEINLINE __vec64_i1 __equal_i1(__vec64_i1 a, __vec64_i1 b) { __vec64_i1 r; r.v = (a.v & b.v) | (~a.v & ~b.v); return r; @@ -620,16 +620,16 @@ SHIFT_UNIFORM(__vec64_i8, uint8_t, __lshr, >>) 
SHIFT_UNIFORM(__vec64_i8, int8_t, __ashr, >>) SHIFT_UNIFORM(__vec64_i8, int8_t, __shl, <<) -CMP_OP(__vec64_i8, int8_t, __equal, ==) -CMP_OP(__vec64_i8, int8_t, __not_equal, !=) -CMP_OP(__vec64_i8, uint8_t, __unsigned_less_equal, <=) -CMP_OP(__vec64_i8, int8_t, __signed_less_equal, <=) -CMP_OP(__vec64_i8, uint8_t, __unsigned_greater_equal, >=) -CMP_OP(__vec64_i8, int8_t, __signed_greater_equal, >=) -CMP_OP(__vec64_i8, uint8_t, __unsigned_less_than, <) -CMP_OP(__vec64_i8, int8_t, __signed_less_than, <) -CMP_OP(__vec64_i8, uint8_t, __unsigned_greater_than, >) -CMP_OP(__vec64_i8, int8_t, __signed_greater_than, >) +CMP_OP(__vec64_i8, i8, int8_t, __equal, ==) +CMP_OP(__vec64_i8, i8, int8_t, __not_equal, !=) +CMP_OP(__vec64_i8, i8, uint8_t, __unsigned_less_equal, <=) +CMP_OP(__vec64_i8, i8, int8_t, __signed_less_equal, <=) +CMP_OP(__vec64_i8, i8, uint8_t, __unsigned_greater_equal, >=) +CMP_OP(__vec64_i8, i8, int8_t, __signed_greater_equal, >=) +CMP_OP(__vec64_i8, i8, uint8_t, __unsigned_less_than, <) +CMP_OP(__vec64_i8, i8, int8_t, __signed_less_than, <) +CMP_OP(__vec64_i8, i8, uint8_t, __unsigned_greater_than, >) +CMP_OP(__vec64_i8, i8, int8_t, __signed_greater_than, >) SELECT(__vec64_i8) INSERT_EXTRACT(__vec64_i8, int8_t) @@ -663,16 +663,16 @@ SHIFT_UNIFORM(__vec64_i16, uint16_t, __lshr, >>) SHIFT_UNIFORM(__vec64_i16, int16_t, __ashr, >>) SHIFT_UNIFORM(__vec64_i16, int16_t, __shl, <<) -CMP_OP(__vec64_i16, int16_t, __equal, ==) -CMP_OP(__vec64_i16, int16_t, __not_equal, !=) -CMP_OP(__vec64_i16, uint16_t, __unsigned_less_equal, <=) -CMP_OP(__vec64_i16, int16_t, __signed_less_equal, <=) -CMP_OP(__vec64_i16, uint16_t, __unsigned_greater_equal, >=) -CMP_OP(__vec64_i16, int16_t, __signed_greater_equal, >=) -CMP_OP(__vec64_i16, uint16_t, __unsigned_less_than, <) -CMP_OP(__vec64_i16, int16_t, __signed_less_than, <) -CMP_OP(__vec64_i16, uint16_t, __unsigned_greater_than, >) -CMP_OP(__vec64_i16, int16_t, __signed_greater_than, >) +CMP_OP(__vec64_i16, i16, int16_t, __equal, ==) +CMP_OP(__vec64_i16, i16, int16_t, __not_equal, !=) +CMP_OP(__vec64_i16, i16, uint16_t, __unsigned_less_equal, <=) +CMP_OP(__vec64_i16, i16, int16_t, __signed_less_equal, <=) +CMP_OP(__vec64_i16, i16, uint16_t, __unsigned_greater_equal, >=) +CMP_OP(__vec64_i16, i16, int16_t, __signed_greater_equal, >=) +CMP_OP(__vec64_i16, i16, uint16_t, __unsigned_less_than, <) +CMP_OP(__vec64_i16, i16, int16_t, __signed_less_than, <) +CMP_OP(__vec64_i16, i16, uint16_t, __unsigned_greater_than, >) +CMP_OP(__vec64_i16, i16, int16_t, __signed_greater_than, >) SELECT(__vec64_i16) INSERT_EXTRACT(__vec64_i16, int16_t) @@ -706,16 +706,16 @@ SHIFT_UNIFORM(__vec64_i32, uint32_t, __lshr, >>) SHIFT_UNIFORM(__vec64_i32, int32_t, __ashr, >>) SHIFT_UNIFORM(__vec64_i32, int32_t, __shl, <<) -CMP_OP(__vec64_i32, int32_t, __equal, ==) -CMP_OP(__vec64_i32, int32_t, __not_equal, !=) -CMP_OP(__vec64_i32, uint32_t, __unsigned_less_equal, <=) -CMP_OP(__vec64_i32, int32_t, __signed_less_equal, <=) -CMP_OP(__vec64_i32, uint32_t, __unsigned_greater_equal, >=) -CMP_OP(__vec64_i32, int32_t, __signed_greater_equal, >=) -CMP_OP(__vec64_i32, uint32_t, __unsigned_less_than, <) -CMP_OP(__vec64_i32, int32_t, __signed_less_than, <) -CMP_OP(__vec64_i32, uint32_t, __unsigned_greater_than, >) -CMP_OP(__vec64_i32, int32_t, __signed_greater_than, >) +CMP_OP(__vec64_i32, i32, int32_t, __equal, ==) +CMP_OP(__vec64_i32, i32, int32_t, __not_equal, !=) +CMP_OP(__vec64_i32, i32, uint32_t, __unsigned_less_equal, <=) +CMP_OP(__vec64_i32, i32, int32_t, __signed_less_equal, <=) 
+CMP_OP(__vec64_i32, i32, uint32_t, __unsigned_greater_equal, >=) +CMP_OP(__vec64_i32, i32, int32_t, __signed_greater_equal, >=) +CMP_OP(__vec64_i32, i32, uint32_t, __unsigned_less_than, <) +CMP_OP(__vec64_i32, i32, int32_t, __signed_less_than, <) +CMP_OP(__vec64_i32, i32, uint32_t, __unsigned_greater_than, >) +CMP_OP(__vec64_i32, i32, int32_t, __signed_greater_than, >) SELECT(__vec64_i32) INSERT_EXTRACT(__vec64_i32, int32_t) @@ -749,16 +749,16 @@ SHIFT_UNIFORM(__vec64_i64, uint64_t, __lshr, >>) SHIFT_UNIFORM(__vec64_i64, int64_t, __ashr, >>) SHIFT_UNIFORM(__vec64_i64, int64_t, __shl, <<) -CMP_OP(__vec64_i64, int64_t, __equal, ==) -CMP_OP(__vec64_i64, int64_t, __not_equal, !=) -CMP_OP(__vec64_i64, uint64_t, __unsigned_less_equal, <=) -CMP_OP(__vec64_i64, int64_t, __signed_less_equal, <=) -CMP_OP(__vec64_i64, uint64_t, __unsigned_greater_equal, >=) -CMP_OP(__vec64_i64, int64_t, __signed_greater_equal, >=) -CMP_OP(__vec64_i64, uint64_t, __unsigned_less_than, <) -CMP_OP(__vec64_i64, int64_t, __signed_less_than, <) -CMP_OP(__vec64_i64, uint64_t, __unsigned_greater_than, >) -CMP_OP(__vec64_i64, int64_t, __signed_greater_than, >) +CMP_OP(__vec64_i64, i64, int64_t, __equal, ==) +CMP_OP(__vec64_i64, i64, int64_t, __not_equal, !=) +CMP_OP(__vec64_i64, i64, uint64_t, __unsigned_less_equal, <=) +CMP_OP(__vec64_i64, i64, int64_t, __signed_less_equal, <=) +CMP_OP(__vec64_i64, i64, uint64_t, __unsigned_greater_equal, >=) +CMP_OP(__vec64_i64, i64, int64_t, __signed_greater_equal, >=) +CMP_OP(__vec64_i64, i64, uint64_t, __unsigned_less_than, <) +CMP_OP(__vec64_i64, i64, int64_t, __signed_less_than, <) +CMP_OP(__vec64_i64, i64, uint64_t, __unsigned_greater_than, >) +CMP_OP(__vec64_i64, i64, int64_t, __signed_greater_than, >) SELECT(__vec64_i64) INSERT_EXTRACT(__vec64_i64, int64_t) @@ -776,12 +776,12 @@ BINARY_OP(__vec64_f, __sub, -) BINARY_OP(__vec64_f, __mul, *) BINARY_OP(__vec64_f, __div, /) -CMP_OP(__vec64_f, float, __equal, ==) -CMP_OP(__vec64_f, float, __not_equal, !=) -CMP_OP(__vec64_f, float, __less_than, <) -CMP_OP(__vec64_f, float, __less_equal, <=) -CMP_OP(__vec64_f, float, __greater_than, >) -CMP_OP(__vec64_f, float, __greater_equal, >=) +CMP_OP(__vec64_f, float, float, __equal, ==) +CMP_OP(__vec64_f, float, float, __not_equal, !=) +CMP_OP(__vec64_f, float, float, __less_than, <) +CMP_OP(__vec64_f, float, float, __less_equal, <=) +CMP_OP(__vec64_f, float, float, __greater_than, >) +CMP_OP(__vec64_f, float, float, __greater_equal, >=) static FORCEINLINE __vec64_i1 __ordered(__vec64_f a, __vec64_f b) { __vec64_i1 ret; @@ -926,12 +926,12 @@ BINARY_OP(__vec64_d, __sub, -) BINARY_OP(__vec64_d, __mul, *) BINARY_OP(__vec64_d, __div, /) -CMP_OP(__vec64_d, double, __equal, ==) -CMP_OP(__vec64_d, double, __not_equal, !=) -CMP_OP(__vec64_d, double, __less_than, <) -CMP_OP(__vec64_d, double, __less_equal, <=) -CMP_OP(__vec64_d, double, __greater_than, >) -CMP_OP(__vec64_d, double, __greater_equal, >=) +CMP_OP(__vec64_d, double, double, __equal, ==) +CMP_OP(__vec64_d, double, double, __not_equal, !=) +CMP_OP(__vec64_d, double, double, __less_than, <) +CMP_OP(__vec64_d, double, double, __less_equal, <=) +CMP_OP(__vec64_d, double, double, __greater_than, >) +CMP_OP(__vec64_d, double, double, __greater_equal, >=) static FORCEINLINE __vec64_i1 __ordered(__vec64_d a, __vec64_d b) { __vec64_i1 ret; diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index 9cc6ef22..243dc539 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -409,7 +409,7 @@ static FORCEINLINE __vec16_i1 
__movmsk(__vec16_i1 mask) { return _mm512_kmov(mask); } -static FORCEINLINE __vec16_i1 __equal(__vec16_i1 a, __vec16_i1 b) { +static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { return _mm512_knot( _mm512_kandn(a, b)); } @@ -634,43 +634,43 @@ static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, int32_t n) { return _mm512_srai_epi32((__m512i)a, n); } -static FORCEINLINE __vec16_i1 __equal(const __vec16_i32 &a, const __vec16_i32 &b) { +static FORCEINLINE __vec16_i1 __equal_i32(const __vec16_i32 &a, const __vec16_i32 &b) { return _mm512_cmpeq_epi32_mask((__m512i)a, (__m512i)b); } -static FORCEINLINE __vec16_i1 __not_equal(__vec16_i32 a, __vec16_i32 b) { +static FORCEINLINE __vec16_i1 __not_equal_i32(__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpneq_epi32_mask((__m512i)a, (__m512i)b); } -static FORCEINLINE __vec16_i1 __unsigned_less_equal(__vec16_i32 a, __vec16_i32 b) { +static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32(__vec16_i32 a, __vec16_i32 b) { return _mm512_cmple_epu32_mask((__m512i)a, (__m512i)b); } -static FORCEINLINE __vec16_i1 __signed_less_equal(__vec16_i32 a, __vec16_i32 b) { +static FORCEINLINE __vec16_i1 __signed_less_equal_i32(__vec16_i32 a, __vec16_i32 b) { return _mm512_cmple_epi32_mask((__m512i)a, (__m512i)b); } -static FORCEINLINE __vec16_i1 __unsigned_greater_equal(__vec16_i32 a, __vec16_i32 b) { +static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpge_epu32_mask((__m512i)a, (__m512i)b); } -static FORCEINLINE __vec16_i1 __signed_greater_equal(__vec16_i32 a, __vec16_i32 b) { +static FORCEINLINE __vec16_i1 __signed_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpge_epi32_mask((__m512i)a, (__m512i)b); } -static FORCEINLINE __vec16_i1 __unsigned_less_than(__vec16_i32 a, __vec16_i32 b) { +static FORCEINLINE __vec16_i1 __unsigned_less_than_i32(__vec16_i32 a, __vec16_i32 b) { return _mm512_cmplt_epu32_mask((__m512i)a, (__m512i)b); } -static FORCEINLINE __vec16_i1 __signed_less_than(__vec16_i32 a, __vec16_i32 b) { +static FORCEINLINE __vec16_i1 __signed_less_than_i32(__vec16_i32 a, __vec16_i32 b) { return _mm512_cmplt_epi32_mask((__m512i)a, (__m512i)b); } -static FORCEINLINE __vec16_i1 __unsigned_greater_than(__vec16_i32 a, __vec16_i32 b) { +static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32(__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpgt_epu32_mask((__m512i)a, (__m512i)b); } -static FORCEINLINE __vec16_i1 __signed_greater_than(__vec16_i32 a, __vec16_i32 b) { +static FORCEINLINE __vec16_i1 __signed_greater_than_i32(__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpgt_epi32_mask((__m512i)a, (__m512i)b); } @@ -773,12 +773,12 @@ SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>) SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>) SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<) -static FORCEINLINE __vec16_i1 __equal(const __vec16_i64 &a, const __vec16_i64 &b) { +static FORCEINLINE __vec16_i1 __equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) { const __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo); return _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); } -static FORCEINLINE __vec16_i1 __not_equal(const __vec16_i64 &a, const __vec16_i64 &b) { +static FORCEINLINE __vec16_i1 __not_equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) { return __not(__equal(a,b)); } @@ -915,27 +915,27 @@ static FORCEINLINE __vec16_f __div(__vec16_f a, __vec16_f b) { return _mm512_div_ps(a, b); } -static FORCEINLINE __vec16_i1 __equal(__vec16_f a, __vec16_f b) { +static 
FORCEINLINE __vec16_i1 __equal_float(__vec16_f a, __vec16_f b) { return _mm512_cmpeq_ps_mask(a, b); } -static FORCEINLINE __vec16_i1 __not_equal(__vec16_f a, __vec16_f b) { +static FORCEINLINE __vec16_i1 __not_equal_float(__vec16_f a, __vec16_f b) { return _mm512_cmpneq_ps_mask(a, b); } -static FORCEINLINE __vec16_i1 __less_than(__vec16_f a, __vec16_f b) { +static FORCEINLINE __vec16_i1 __less_than_float(__vec16_f a, __vec16_f b) { return _mm512_cmplt_ps_mask(a, b); } -static FORCEINLINE __vec16_i1 __less_equal(__vec16_f a, __vec16_f b) { +static FORCEINLINE __vec16_i1 __less_equal_float(__vec16_f a, __vec16_f b) { return _mm512_cmple_ps_mask(a, b); } -static FORCEINLINE __vec16_i1 __greater_than(__vec16_f a, __vec16_f b) { +static FORCEINLINE __vec16_i1 __greater_than_float(__vec16_f a, __vec16_f b) { return _mm512_cmpnle_ps_mask(a, b); } -static FORCEINLINE __vec16_i1 __greater_equal(__vec16_f a, __vec16_f b) { +static FORCEINLINE __vec16_i1 __greater_equal_float(__vec16_f a, __vec16_f b) { return _mm512_cmpnlt_ps_mask(a, b); } @@ -1043,42 +1043,42 @@ static FORCEINLINE __vec16_d __div(__vec16_d a, __vec16_d b) { return ret; } -static FORCEINLINE __vec16_i1 __equal(__vec16_d a, __vec16_d b) { +static FORCEINLINE __vec16_i1 __equal_double(__vec16_d a, __vec16_d b) { __vec16_i1 ret; ret.m8.m1 = _mm512_cmpeq_pd_mask(a.v1, b.v1); ret.m8.m2 = _mm512_cmpeq_pd_mask(a.v2, b.v2); return ret; } -static FORCEINLINE __vec16_i1 __not_equal(__vec16_d a, __vec16_d b) { +static FORCEINLINE __vec16_i1 __not_equal_double(__vec16_d a, __vec16_d b) { __vec16_i1 ret; ret.m8.m1 = _mm512_cmpneq_pd_mask(a.v1, b.v1); ret.m8.m2 = _mm512_cmpneq_pd_mask(a.v2, b.v2); return ret; } -static FORCEINLINE __vec16_i1 __less_than(__vec16_d a, __vec16_d b) { +static FORCEINLINE __vec16_i1 __less_than_double(__vec16_d a, __vec16_d b) { __vec16_i1 ret; ret.m8.m1 = _mm512_cmplt_pd_mask(a.v1, b.v1); ret.m8.m2 = _mm512_cmplt_pd_mask(a.v2, b.v2); return ret; } -static FORCEINLINE __vec16_i1 __less_equal(__vec16_d a, __vec16_d b) { +static FORCEINLINE __vec16_i1 __less_equal_double(__vec16_d a, __vec16_d b) { __vec16_i1 ret; ret.m8.m1 = _mm512_cmple_pd_mask(a.v1, b.v1); ret.m8.m2 = _mm512_cmple_pd_mask(a.v2, b.v2); return ret; } -static FORCEINLINE __vec16_i1 __greater_than(__vec16_d a, __vec16_d b) { +static FORCEINLINE __vec16_i1 __greater_than_double(__vec16_d a, __vec16_d b) { __vec16_i1 ret; ret.m8.m1 = _mm512_cmpnle_pd_mask(a.v1, b.v1); ret.m8.m2 = _mm512_cmpnle_pd_mask(a.v2, b.v2); return ret; } -static FORCEINLINE __vec16_i1 __greater_equal(__vec16_d a, __vec16_d b) { +static FORCEINLINE __vec16_i1 __greater_equal_double(__vec16_d a, __vec16_d b) { __vec16_i1 ret; ret.m8.m1 = _mm512_cmpnlt_pd_mask(a.v1, b.v1); ret.m8.m2 = _mm512_cmpnlt_pd_mask(a.v2, b.v2); diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index 736e5d2d..4fb946c3 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -244,7 +244,7 @@ static FORCEINLINE uint64_t __movmsk(__vec4_i1 mask) { return (uint64_t)_mm_movemask_ps(mask.v); } -static FORCEINLINE __vec4_i1 __equal(__vec4_i1 a, __vec4_i1 b) { +static FORCEINLINE __vec4_i1 __equal_i1(__vec4_i1 a, __vec4_i1 b) { return _mm_cmpeq_epi32(_mm_castps_si128(a.v), _mm_castps_si128(b.v)); } @@ -425,7 +425,7 @@ static FORCEINLINE __vec4_i8 __ashr(__vec4_i8 a, int32_t b) { (int8_t)_mm_extract_epi8(a.v, 3) >> b); } -static FORCEINLINE __vec4_i1 __equal(__vec4_i8 a, __vec4_i8 b) { +static FORCEINLINE __vec4_i1 __equal_i8(__vec4_i8 a, __vec4_i8 b) { __m128i cmp = 
_mm_cmpeq_epi8(a.v, b.v); return __vec4_i1(_mm_extract_epi8(cmp, 0), _mm_extract_epi8(cmp, 1), @@ -433,11 +433,12 @@ static FORCEINLINE __vec4_i1 __equal(__vec4_i8 a, __vec4_i8 b) { _mm_extract_epi8(cmp, 3)); } -static FORCEINLINE __vec4_i1 __not_equal(__vec4_i8 a, __vec4_i8 b) { - return __xor(__equal(a, b), __vec4_i1(1, 1, 1, 1)); + +static FORCEINLINE __vec4_i1 __not_equal_i8(__vec4_i8 a, __vec4_i8 b) { + return __xor(__equal_i8(a, b), __vec4_i1(1, 1, 1, 1)); } -static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i8 a, __vec4_i8 b) { +static FORCEINLINE __vec4_i1 __unsigned_less_equal_i8(__vec4_i8 a, __vec4_i8 b) { return __vec4_i1((uint8_t)_mm_extract_epi8(a.v, 0) <= (uint8_t)_mm_extract_epi8(b.v, 0), (uint8_t)_mm_extract_epi8(a.v, 1) <= @@ -448,7 +449,7 @@ static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i8 a, __vec4_i8 b) { (uint8_t)_mm_extract_epi8(b.v, 3)); } -static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i8 a, __vec4_i8 b) { +static FORCEINLINE __vec4_i1 __unsigned_greater_equal_i8(__vec4_i8 a, __vec4_i8 b) { return __vec4_i1((uint8_t)_mm_extract_epi8(a.v, 0) >= (uint8_t)_mm_extract_epi8(b.v, 0), (uint8_t)_mm_extract_epi8(a.v, 1) >= @@ -459,7 +460,7 @@ static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i8 a, __vec4_i8 b) (uint8_t)_mm_extract_epi8(b.v, 3)); } -static FORCEINLINE __vec4_i1 __unsigned_less_than(__vec4_i8 a, __vec4_i8 b) { +static FORCEINLINE __vec4_i1 __unsigned_less_than_i8(__vec4_i8 a, __vec4_i8 b) { return __vec4_i1((uint8_t)_mm_extract_epi8(a.v, 0) < (uint8_t)_mm_extract_epi8(b.v, 0), (uint8_t)_mm_extract_epi8(a.v, 1) < @@ -470,7 +471,7 @@ static FORCEINLINE __vec4_i1 __unsigned_less_than(__vec4_i8 a, __vec4_i8 b) { (uint8_t)_mm_extract_epi8(b.v, 3)); } -static FORCEINLINE __vec4_i1 __unsigned_greater_than(__vec4_i8 a, __vec4_i8 b) { +static FORCEINLINE __vec4_i1 __unsigned_greater_than_i8(__vec4_i8 a, __vec4_i8 b) { return __vec4_i1((uint8_t)_mm_extract_epi8(a.v, 0) > (uint8_t)_mm_extract_epi8(b.v, 0), (uint8_t)_mm_extract_epi8(a.v, 1) > @@ -481,7 +482,7 @@ static FORCEINLINE __vec4_i1 __unsigned_greater_than(__vec4_i8 a, __vec4_i8 b) { (uint8_t)_mm_extract_epi8(b.v, 3)); } -static FORCEINLINE __vec4_i1 __signed_less_than(__vec4_i8 a, __vec4_i8 b) { +static FORCEINLINE __vec4_i1 __signed_less_than_i8(__vec4_i8 a, __vec4_i8 b) { __m128i cmp = _mm_cmplt_epi8(a.v, b.v); return __vec4_i1(_mm_extract_epi8(cmp, 0), _mm_extract_epi8(cmp, 1), @@ -489,11 +490,11 @@ static FORCEINLINE __vec4_i1 __signed_less_than(__vec4_i8 a, __vec4_i8 b) { _mm_extract_epi8(cmp, 3)); } -static FORCEINLINE __vec4_i1 __signed_less_equal(__vec4_i8 a, __vec4_i8 b) { - return __or(__signed_less_than(a, b), __equal(a, b)); +static FORCEINLINE __vec4_i1 __signed_less_equal_i8(__vec4_i8 a, __vec4_i8 b) { + return __or(__signed_less_than_i8(a, b), __equal_i8(a, b)); } -static FORCEINLINE __vec4_i1 __signed_greater_than(__vec4_i8 a, __vec4_i8 b) { +static FORCEINLINE __vec4_i1 __signed_greater_than_i8(__vec4_i8 a, __vec4_i8 b) { __m128i cmp = _mm_cmpgt_epi8(a.v, b.v); return __vec4_i1(_mm_extract_epi8(cmp, 0), _mm_extract_epi8(cmp, 1), @@ -501,8 +502,8 @@ static FORCEINLINE __vec4_i1 __signed_greater_than(__vec4_i8 a, __vec4_i8 b) { _mm_extract_epi8(cmp, 3)); } -static FORCEINLINE __vec4_i1 __signed_greater_equal(__vec4_i8 a, __vec4_i8 b) { - return __or(__signed_greater_than(a, b), __equal(a, b)); +static FORCEINLINE __vec4_i1 __signed_greater_equal_i8(__vec4_i8 a, __vec4_i8 b) { + return __or(__signed_greater_than_i8(a, b), __equal_i8(a, b)); } static 
FORCEINLINE __vec4_i8 __select(__vec4_i1 mask, __vec4_i8 a, __vec4_i8 b) { @@ -681,7 +682,7 @@ static FORCEINLINE __vec4_i16 __ashr(__vec4_i16 a, int32_t b) { return _mm_sra_epi16(a.v, _mm_set_epi32(0, 0, 0, b)); } -static FORCEINLINE __vec4_i1 __equal(__vec4_i16 a, __vec4_i16 b) { +static FORCEINLINE __vec4_i1 __equal_i16(__vec4_i16 a, __vec4_i16 b) { __m128i cmp = _mm_cmpeq_epi16(a.v, b.v); return __vec4_i1(_mm_extract_epi16(cmp, 0), _mm_extract_epi16(cmp, 1), @@ -689,11 +690,11 @@ static FORCEINLINE __vec4_i1 __equal(__vec4_i16 a, __vec4_i16 b) { _mm_extract_epi16(cmp, 3)); } -static FORCEINLINE __vec4_i1 __not_equal(__vec4_i16 a, __vec4_i16 b) { - return __xor(__equal(a, b), __vec4_i1(1, 1, 1, 1)); +static FORCEINLINE __vec4_i1 __not_equal_i16(__vec4_i16 a, __vec4_i16 b) { + return __xor(__equal_i16(a, b), __vec4_i1(1, 1, 1, 1)); } -static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i16 a, __vec4_i16 b) { +static FORCEINLINE __vec4_i1 __unsigned_less_equal_i16(__vec4_i16 a, __vec4_i16 b) { // FIXME: could use the trick that int32 does for the unsigned // comparisons so that we don't need to scalarie them. (This also // applies to i8s...) @@ -707,7 +708,7 @@ static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i16 a, __vec4_i16 b) { (uint16_t)_mm_extract_epi16(b.v, 3)); } -static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i16 a, __vec4_i16 b) { +static FORCEINLINE __vec4_i1 __unsigned_greater_equal_i16(__vec4_i16 a, __vec4_i16 b) { return __vec4_i1((uint16_t)_mm_extract_epi16(a.v, 0) >= (uint16_t)_mm_extract_epi16(b.v, 0), (uint16_t)_mm_extract_epi16(a.v, 1) >= @@ -718,7 +719,7 @@ static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i16 a, __vec4_i16 b (uint16_t)_mm_extract_epi16(b.v, 3)); } -static FORCEINLINE __vec4_i1 __unsigned_less_than(__vec4_i16 a, __vec4_i16 b) { +static FORCEINLINE __vec4_i1 __unsigned_less_than_i16(__vec4_i16 a, __vec4_i16 b) { return __vec4_i1((uint16_t)_mm_extract_epi16(a.v, 0) < (uint16_t)_mm_extract_epi16(b.v, 0), (uint16_t)_mm_extract_epi16(a.v, 1) < @@ -729,7 +730,7 @@ static FORCEINLINE __vec4_i1 __unsigned_less_than(__vec4_i16 a, __vec4_i16 b) { (uint16_t)_mm_extract_epi16(b.v, 3)); } -static FORCEINLINE __vec4_i1 __unsigned_greater_than(__vec4_i16 a, __vec4_i16 b) { +static FORCEINLINE __vec4_i1 __unsigned_greater_than_i16(__vec4_i16 a, __vec4_i16 b) { return __vec4_i1((uint16_t)_mm_extract_epi16(a.v, 0) > (uint16_t)_mm_extract_epi16(b.v, 0), (uint16_t)_mm_extract_epi16(a.v, 1) > @@ -740,7 +741,7 @@ static FORCEINLINE __vec4_i1 __unsigned_greater_than(__vec4_i16 a, __vec4_i16 b) (uint16_t)_mm_extract_epi16(b.v, 3)); } -static FORCEINLINE __vec4_i1 __signed_less_than(__vec4_i16 a, __vec4_i16 b) { +static FORCEINLINE __vec4_i1 __signed_less_than_i16(__vec4_i16 a, __vec4_i16 b) { __m128i cmp = _mm_cmplt_epi16(a.v, b.v); return __vec4_i1(_mm_extract_epi16(cmp, 0), _mm_extract_epi16(cmp, 1), @@ -748,11 +749,11 @@ static FORCEINLINE __vec4_i1 __signed_less_than(__vec4_i16 a, __vec4_i16 b) { _mm_extract_epi16(cmp, 3)); } -static FORCEINLINE __vec4_i1 __signed_less_equal(__vec4_i16 a, __vec4_i16 b) { - return __or(__signed_less_than(a, b), __equal(a, b)); +static FORCEINLINE __vec4_i1 __signed_less_equal_i16(__vec4_i16 a, __vec4_i16 b) { + return __or(__signed_less_than_i16(a, b), __equal_i16(a, b)); } -static FORCEINLINE __vec4_i1 __signed_greater_than(__vec4_i16 a, __vec4_i16 b) { +static FORCEINLINE __vec4_i1 __signed_greater_than_i16(__vec4_i16 a, __vec4_i16 b) { __m128i cmp = _mm_cmpgt_epi16(a.v, b.v); return 
__vec4_i1(_mm_extract_epi16(cmp, 0), _mm_extract_epi16(cmp, 1), @@ -760,8 +761,8 @@ static FORCEINLINE __vec4_i1 __signed_greater_than(__vec4_i16 a, __vec4_i16 b) _mm_extract_epi16(cmp, 3)); } -static FORCEINLINE __vec4_i1 __signed_greater_equal(__vec4_i16 a, __vec4_i16 b) { - return __or(__signed_greater_than(a, b), __equal(a, b)); +static FORCEINLINE __vec4_i1 __signed_greater_equal_i16(__vec4_i16 a, __vec4_i16 b) { + return __or(__signed_greater_than_i16(a, b), __equal_i16(a, b)); } static FORCEINLINE __vec4_i16 __select(__vec4_i1 mask, __vec4_i16 a, __vec4_i16 b) { @@ -966,52 +967,52 @@ static FORCEINLINE __vec4_i32 __ashr(__vec4_i32 a, int32_t b) { return _mm_sra_epi32(a.v, _mm_set_epi32(0, 0, 0, b)); } -static FORCEINLINE __vec4_i1 __equal(__vec4_i32 a, __vec4_i32 b) { +static FORCEINLINE __vec4_i1 __equal_i32(__vec4_i32 a, __vec4_i32 b) { return _mm_cmpeq_epi32(a.v, b.v); } -static FORCEINLINE __vec4_i1 __not_equal(__vec4_i32 a, __vec4_i32 b) { +static FORCEINLINE __vec4_i1 __not_equal_i32(__vec4_i32 a, __vec4_i32 b) { return _mm_xor_si128(_mm_cmpeq_epi32(a.v, b.v), _mm_cmpeq_epi32(a.v, a.v)); } -static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i32 a, __vec4_i32 b) { +static FORCEINLINE __vec4_i1 __unsigned_less_equal_i32(__vec4_i32 a, __vec4_i32 b) { // a<=b == (min(a,b) == a) return _mm_cmpeq_epi32(_mm_min_epu32(a.v, b.v), a.v); } -static FORCEINLINE __vec4_i1 __signed_less_equal(__vec4_i32 a, __vec4_i32 b) { +static FORCEINLINE __vec4_i1 __signed_less_equal_i32(__vec4_i32 a, __vec4_i32 b) { return _mm_or_si128(_mm_cmplt_epi32(a.v, b.v), _mm_cmpeq_epi32(a.v, b.v)); } -static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i32 a, __vec4_i32 b) { +static FORCEINLINE __vec4_i1 __unsigned_greater_equal_i32(__vec4_i32 a, __vec4_i32 b) { // a>=b == (max(a,b) == a) return _mm_cmpeq_epi32(_mm_max_epu32(a.v, b.v), a.v); } -static FORCEINLINE __vec4_i1 __signed_greater_equal(__vec4_i32 a, __vec4_i32 b) { +static FORCEINLINE __vec4_i1 __signed_greater_equal_i32(__vec4_i32 a, __vec4_i32 b) { return _mm_or_si128(_mm_cmpgt_epi32(a.v, b.v), _mm_cmpeq_epi32(a.v, b.v)); } -static FORCEINLINE __vec4_i1 __unsigned_less_than(__vec4_i32 a, __vec4_i32 b) { +static FORCEINLINE __vec4_i1 __unsigned_less_than_i32(__vec4_i32 a, __vec4_i32 b) { a.v = _mm_xor_si128(a.v, _mm_set1_epi32(0x80000000)); b.v = _mm_xor_si128(b.v, _mm_set1_epi32(0x80000000)); return _mm_cmplt_epi32(a.v, b.v); } -static FORCEINLINE __vec4_i1 __signed_less_than(__vec4_i32 a, __vec4_i32 b) { +static FORCEINLINE __vec4_i1 __signed_less_than_i32(__vec4_i32 a, __vec4_i32 b) { return _mm_cmplt_epi32(a.v, b.v); } -static FORCEINLINE __vec4_i1 __unsigned_greater_than(__vec4_i32 a, __vec4_i32 b) { +static FORCEINLINE __vec4_i1 __unsigned_greater_than_i32(__vec4_i32 a, __vec4_i32 b) { a.v = _mm_xor_si128(a.v, _mm_set1_epi32(0x80000000)); b.v = _mm_xor_si128(b.v, _mm_set1_epi32(0x80000000)); return _mm_cmpgt_epi32(a.v, b.v); } -static FORCEINLINE __vec4_i1 __signed_greater_than(__vec4_i32 a, __vec4_i32 b) { +static FORCEINLINE __vec4_i1 __signed_greater_than_i32(__vec4_i32 a, __vec4_i32 b) { return _mm_cmpgt_epi32(a.v, b.v); } @@ -1197,18 +1198,18 @@ static FORCEINLINE __vec4_i64 __ashr(__vec4_i64 a, int32_t b) { (int64_t)_mm_extract_epi64(a.v[1], 1) >> b); } -static FORCEINLINE __vec4_i1 __equal(__vec4_i64 a, __vec4_i64 b) { +static FORCEINLINE __vec4_i1 __equal_i64(__vec4_i64 a, __vec4_i64 b) { __m128i cmp0 = _mm_cmpeq_epi64(a.v[0], b.v[0]); __m128i cmp1 = _mm_cmpeq_epi64(a.v[1], b.v[1]); return 
_mm_shuffle_ps(_mm_castsi128_ps(cmp0), _mm_castsi128_ps(cmp1), _MM_SHUFFLE(2, 0, 2, 0)); } -static FORCEINLINE __vec4_i1 __not_equal(__vec4_i64 a, __vec4_i64 b) { - return __xor(__equal(a, b), __vec4_i1(1, 1, 1, 1)); +static FORCEINLINE __vec4_i1 __not_equal_i64(__vec4_i64 a, __vec4_i64 b) { + return __xor(__equal_i64(a, b), __vec4_i1(1, 1, 1, 1)); } -static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i64 a, __vec4_i64 b) { +static FORCEINLINE __vec4_i1 __unsigned_less_equal_i64(__vec4_i64 a, __vec4_i64 b) { return __vec4_i1((uint64_t)_mm_extract_epi64(a.v[0], 0) <= (uint64_t)_mm_extract_epi64(b.v[0], 0), (uint64_t)_mm_extract_epi64(a.v[0], 1) <= @@ -1219,7 +1220,7 @@ static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i64 a, __vec4_i64 b) { (uint64_t)_mm_extract_epi64(b.v[1], 1)); } -static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i64 a, __vec4_i64 b) { +static FORCEINLINE __vec4_i1 __unsigned_greater_equal_i64(__vec4_i64 a, __vec4_i64 b) { return __vec4_i1((uint64_t)_mm_extract_epi64(a.v[0], 0) >= (uint64_t)_mm_extract_epi64(b.v[0], 0), (uint64_t)_mm_extract_epi64(a.v[0], 1) >= @@ -1230,7 +1231,7 @@ static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i64 a, __vec4_i64 b (uint64_t)_mm_extract_epi64(b.v[1], 1)); } -static FORCEINLINE __vec4_i1 __unsigned_less_than(__vec4_i64 a, __vec4_i64 b) { +static FORCEINLINE __vec4_i1 __unsigned_less_than_i64(__vec4_i64 a, __vec4_i64 b) { return __vec4_i1((uint64_t)_mm_extract_epi64(a.v[0], 0) < (uint64_t)_mm_extract_epi64(b.v[0], 0), (uint64_t)_mm_extract_epi64(a.v[0], 1) < @@ -1241,7 +1242,7 @@ static FORCEINLINE __vec4_i1 __unsigned_less_than(__vec4_i64 a, __vec4_i64 b) { (uint64_t)_mm_extract_epi64(b.v[1], 1)); } -static FORCEINLINE __vec4_i1 __unsigned_greater_than(__vec4_i64 a, __vec4_i64 b) { +static FORCEINLINE __vec4_i1 __unsigned_greater_than_i64(__vec4_i64 a, __vec4_i64 b) { return __vec4_i1((uint64_t)_mm_extract_epi64(a.v[0], 0) > (uint64_t)_mm_extract_epi64(b.v[0], 0), (uint64_t)_mm_extract_epi64(a.v[0], 1) > @@ -1252,23 +1253,23 @@ static FORCEINLINE __vec4_i1 __unsigned_greater_than(__vec4_i64 a, __vec4_i64 b) (uint64_t)_mm_extract_epi64(b.v[1], 1)); } -static FORCEINLINE __vec4_i1 __signed_greater_than(__vec4_i64 a, __vec4_i64 b) { +static FORCEINLINE __vec4_i1 __signed_greater_than_i64(__vec4_i64 a, __vec4_i64 b) { __m128i cmp0 = _mm_cmpgt_epi64(a.v[0], b.v[0]); __m128i cmp1 = _mm_cmpgt_epi64(a.v[1], b.v[1]); return _mm_shuffle_ps(_mm_castsi128_ps(cmp0), _mm_castsi128_ps(cmp1), _MM_SHUFFLE(2, 0, 2, 0)); } -static FORCEINLINE __vec4_i1 __signed_greater_equal(__vec4_i64 a, __vec4_i64 b) { - return __or(__signed_greater_than(a, b), __equal(a, b)); +static FORCEINLINE __vec4_i1 __signed_greater_equal_i64(__vec4_i64 a, __vec4_i64 b) { + return __or(__signed_greater_than_i64(a, b), __equal_i64(a, b)); } -static FORCEINLINE __vec4_i1 __signed_less_than(__vec4_i64 a, __vec4_i64 b) { - return __xor(__signed_greater_equal(a, b), __vec4_i1(1, 1, 1, 1)); +static FORCEINLINE __vec4_i1 __signed_less_than_i64(__vec4_i64 a, __vec4_i64 b) { + return __xor(__signed_greater_equal_i64(a, b), __vec4_i1(1, 1, 1, 1)); } -static FORCEINLINE __vec4_i1 __signed_less_equal(__vec4_i64 a, __vec4_i64 b) { - return __xor(__signed_greater_than(a, b), __vec4_i1(1, 1, 1, 1)); +static FORCEINLINE __vec4_i1 __signed_less_equal_i64(__vec4_i64 a, __vec4_i64 b) { + return __xor(__signed_greater_than_i64(a, b), __vec4_i1(1, 1, 1, 1)); } static FORCEINLINE __vec4_i64 __select(__vec4_i1 mask, __vec4_i64 a, __vec4_i64 b) { @@ -1353,31 
+1354,31 @@ static FORCEINLINE __vec4_f __div(__vec4_f a, __vec4_f b) { return _mm_div_ps(a.v, b.v); } -static FORCEINLINE __vec4_i1 __equal(__vec4_f a, __vec4_f b) { +static FORCEINLINE __vec4_i1 __equal_float(__vec4_f a, __vec4_f b) { return _mm_cmpeq_ps(a.v, b.v); } -static FORCEINLINE __vec4_i1 __not_equal(__vec4_f a, __vec4_f b) { +static FORCEINLINE __vec4_i1 __not_equal_float(__vec4_f a, __vec4_f b) { return _mm_cmpneq_ps(a.v, b.v); } -static FORCEINLINE __vec4_i1 __less_than(__vec4_f a, __vec4_f b) { +static FORCEINLINE __vec4_i1 __less_than_float(__vec4_f a, __vec4_f b) { return _mm_cmplt_ps(a.v, b.v); } -static FORCEINLINE __vec4_i1 __less_equal(__vec4_f a, __vec4_f b) { +static FORCEINLINE __vec4_i1 __less_equal_float(__vec4_f a, __vec4_f b) { return _mm_cmple_ps(a.v, b.v); } -static FORCEINLINE __vec4_i1 __greater_than(__vec4_f a, __vec4_f b) { +static FORCEINLINE __vec4_i1 __greater_than_float(__vec4_f a, __vec4_f b) { return _mm_cmpgt_ps(a.v, b.v); } -static FORCEINLINE __vec4_i1 __greater_equal(__vec4_f a, __vec4_f b) { +static FORCEINLINE __vec4_i1 __greater_equal_float(__vec4_f a, __vec4_f b) { return _mm_cmpge_ps(a.v, b.v); } -static FORCEINLINE __vec4_i1 __ordered(__vec4_f a, __vec4_f b) { +static FORCEINLINE __vec4_i1 __ordered_float(__vec4_f a, __vec4_f b) { return _mm_cmpord_ps(a.v, b.v); } @@ -1458,49 +1459,49 @@ static FORCEINLINE __vec4_d __div(__vec4_d a, __vec4_d b) { _mm_div_pd(a.v[1], b.v[1])); } -static FORCEINLINE __vec4_i1 __equal(__vec4_d a, __vec4_d b) { +static FORCEINLINE __vec4_i1 __equal_double(__vec4_d a, __vec4_d b) { __m128d cmp0 = _mm_cmpeq_pd(a.v[0], b.v[0]); __m128d cmp1 = _mm_cmpeq_pd(a.v[1], b.v[1]); return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1), _MM_SHUFFLE(2, 0, 2, 0)); } -static FORCEINLINE __vec4_i1 __not_equal(__vec4_d a, __vec4_d b) { +static FORCEINLINE __vec4_i1 __not_equal_double(__vec4_d a, __vec4_d b) { __m128d cmp0 = _mm_cmpneq_pd(a.v[0], b.v[0]); __m128d cmp1 = _mm_cmpneq_pd(a.v[1], b.v[1]); return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1), _MM_SHUFFLE(2, 0, 2, 0)); } -static FORCEINLINE __vec4_i1 __less_than(__vec4_d a, __vec4_d b) { +static FORCEINLINE __vec4_i1 __less_than_double(__vec4_d a, __vec4_d b) { __m128d cmp0 = _mm_cmplt_pd(a.v[0], b.v[0]); __m128d cmp1 = _mm_cmplt_pd(a.v[1], b.v[1]); return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1), _MM_SHUFFLE(2, 0, 2, 0)); } -static FORCEINLINE __vec4_i1 __less_equal(__vec4_d a, __vec4_d b) { +static FORCEINLINE __vec4_i1 __less_equal_double(__vec4_d a, __vec4_d b) { __m128d cmp0 = _mm_cmple_pd(a.v[0], b.v[0]); __m128d cmp1 = _mm_cmple_pd(a.v[1], b.v[1]); return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1), _MM_SHUFFLE(2, 0, 2, 0)); } -static FORCEINLINE __vec4_i1 __greater_than(__vec4_d a, __vec4_d b) { +static FORCEINLINE __vec4_i1 __greater_than_double(__vec4_d a, __vec4_d b) { __m128d cmp0 = _mm_cmpgt_pd(a.v[0], b.v[0]); __m128d cmp1 = _mm_cmpgt_pd(a.v[1], b.v[1]); return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1), _MM_SHUFFLE(2, 0, 2, 0)); } -static FORCEINLINE __vec4_i1 __greater_equal(__vec4_d a, __vec4_d b) { +static FORCEINLINE __vec4_i1 __greater_equal_double(__vec4_d a, __vec4_d b) { __m128d cmp0 = _mm_cmpge_pd(a.v[0], b.v[0]); __m128d cmp1 = _mm_cmpge_pd(a.v[1], b.v[1]); return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1), _MM_SHUFFLE(2, 0, 2, 0)); } -static FORCEINLINE __vec4_i1 __ordered(__vec4_d a, __vec4_d b) { +static FORCEINLINE __vec4_i1 __ordered_double(__vec4_d a, __vec4_d b) {
__m128d cmp0 = _mm_cmpord_pd(a.v[0], b.v[0]); __m128d cmp1 = _mm_cmpord_pd(a.v[1], b.v[1]); return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),