Add type suffix to comparison ops in C++ output.

e.g. "__equal()" -> "__equal_float()", etc.

No functional change; this is necessary groundwork for a forthcoming
peephole optimization that eliminates ANDs of masks in some cases.
Matt Pharr
2012-07-07 07:50:59 -07:00
parent 45e9e0be0b
commit 974b40c8af
6 changed files with 284 additions and 255 deletions
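As a sketch of the effect on the emitted code (operand names here are made up for illustration): where the backend used to print

    __equal(a, b)

for both an fcmp oeq on <16 x float> and an icmp eq on <16 x i32>, it now prints

    __equal_float(a, b)
    __equal_i32(a, b)

respectively, so every comparison call site names the element type of its operands.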

View File

@@ -3200,11 +3200,37 @@ lPredicateToString(llvm::CmpInst::Predicate p) {
 }
+static const char *
+lTypeToSuffix(llvm::Type *t) {
+    llvm::VectorType *vt = llvm::dyn_cast<llvm::VectorType>(t);
+    Assert(vt != NULL);
+    t = vt->getElementType();
+    switch (t->getTypeID()) {
+    case llvm::Type::FloatTyID:  return "float";
+    case llvm::Type::DoubleTyID: return "double";
+    case llvm::Type::IntegerTyID: {
+        switch (llvm::cast<llvm::IntegerType>(t)->getBitWidth()) {
+        case 1:  return "i1";
+        case 8:  return "i8";
+        case 16: return "i16";
+        case 32: return "i32";
+        case 64: return "i64";
+        }
+    }
+    default: llvm_unreachable(0); return NULL;
+    }
+    return NULL;
+}
 void CWriter::visitICmpInst(llvm::ICmpInst &I) {
     bool isVector = llvm::isa<llvm::VectorType>(I.getOperand(0)->getType());
     if (isVector) {
         Out << lPredicateToString(I.getPredicate());
+        Out << "_";
+        Out << lTypeToSuffix(I.getOperand(0)->getType());
         Out << "(";
         writeOperand(I.getOperand(0));
         Out << ", ";
@@ -3270,6 +3296,8 @@ void CWriter::visitFCmpInst(llvm::FCmpInst &I) {
     if (isVector) {
         Out << lPredicateToString(I.getPredicate());
+        Out << "_";
+        Out << lTypeToSuffix(I.getOperand(0)->getType());
         Out << "(";
     }
     else {
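A minimal before/after sketch of what visitICmpInst() now prints for <16 x i32> operands, assuming lPredicateToString() maps ICMP_SLT to "__signed_less_than" (that mapping is outside the hunk above; lhs/rhs are made-up operand names):

    // before
    __signed_less_than(lhs, rhs)
    // after
    __signed_less_than_i32(lhs, rhs)

Note that lTypeToSuffix() is only reached on the isVector path, which is what justifies its Assert(vt != NULL).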

View File

@@ -193,8 +193,8 @@ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) {  \
     return ret;                                       \
 }
-#define CMP_OP(TYPE, CAST, NAME, OP)                   \
-static FORCEINLINE __vec16_i1 NAME(TYPE a, TYPE b) {   \
+#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP)           \
+static FORCEINLINE __vec16_i1 NAME##_##SUFFIX(TYPE a, TYPE b) {  \
     __vec16_i1 ret;                                    \
     ret.v = 0;                                         \
     for (int i = 0; i < 16; ++i)                       \
@@ -317,7 +317,7 @@ static FORCEINLINE uint64_t __movmsk(__vec16_i1 mask) {
     return (uint64_t)mask.v;
 }
-static FORCEINLINE __vec16_i1 __equal(__vec16_i1 a, __vec16_i1 b) {
+static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) {
     __vec16_i1 r;
     r.v = (a.v & b.v) | (~a.v & ~b.v);
     return r;
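For reference, a sketch of how the reworked macro expands: the NAME##_##SUFFIX token pasting turns an invocation such as

    CMP_OP(__vec16_i32, i32, int32_t, __equal, ==)

(one of the instantiations below) into a definition whose signature is

    static FORCEINLINE __vec16_i1 __equal_i32(__vec16_i32 a, __vec16_i32 b)

while the per-lane comparison loop in the macro body is unchanged.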
@@ -424,16 +424,16 @@ SHIFT_UNIFORM(__vec16_i8, uint8_t, __lshr, >>)
 SHIFT_UNIFORM(__vec16_i8, int8_t, __ashr, >>)
 SHIFT_UNIFORM(__vec16_i8, int8_t, __shl, <<)
-CMP_OP(__vec16_i8, int8_t, __equal, ==)
-CMP_OP(__vec16_i8, int8_t, __not_equal, !=)
-CMP_OP(__vec16_i8, uint8_t, __unsigned_less_equal, <=)
-CMP_OP(__vec16_i8, int8_t, __signed_less_equal, <=)
-CMP_OP(__vec16_i8, uint8_t, __unsigned_greater_equal, >=)
-CMP_OP(__vec16_i8, int8_t, __signed_greater_equal, >=)
-CMP_OP(__vec16_i8, uint8_t, __unsigned_less_than, <)
-CMP_OP(__vec16_i8, int8_t, __signed_less_than, <)
-CMP_OP(__vec16_i8, uint8_t, __unsigned_greater_than, >)
-CMP_OP(__vec16_i8, int8_t, __signed_greater_than, >)
+CMP_OP(__vec16_i8, i8, int8_t, __equal, ==)
+CMP_OP(__vec16_i8, i8, int8_t, __not_equal, !=)
+CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_equal, <=)
+CMP_OP(__vec16_i8, i8, int8_t, __signed_less_equal, <=)
+CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_equal, >=)
+CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_than, <)
+CMP_OP(__vec16_i8, i8, int8_t, __signed_less_than, <)
+CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_than, >)
+CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_than, >)
 SELECT(__vec16_i8)
 INSERT_EXTRACT(__vec16_i8, int8_t)
@@ -467,16 +467,16 @@ SHIFT_UNIFORM(__vec16_i16, uint16_t, __lshr, >>)
 SHIFT_UNIFORM(__vec16_i16, int16_t, __ashr, >>)
 SHIFT_UNIFORM(__vec16_i16, int16_t, __shl, <<)
-CMP_OP(__vec16_i16, int16_t, __equal, ==)
-CMP_OP(__vec16_i16, int16_t, __not_equal, !=)
-CMP_OP(__vec16_i16, uint16_t, __unsigned_less_equal, <=)
-CMP_OP(__vec16_i16, int16_t, __signed_less_equal, <=)
-CMP_OP(__vec16_i16, uint16_t, __unsigned_greater_equal, >=)
-CMP_OP(__vec16_i16, int16_t, __signed_greater_equal, >=)
-CMP_OP(__vec16_i16, uint16_t, __unsigned_less_than, <)
-CMP_OP(__vec16_i16, int16_t, __signed_less_than, <)
-CMP_OP(__vec16_i16, uint16_t, __unsigned_greater_than, >)
-CMP_OP(__vec16_i16, int16_t, __signed_greater_than, >)
+CMP_OP(__vec16_i16, i16, int16_t, __equal, ==)
+CMP_OP(__vec16_i16, i16, int16_t, __not_equal, !=)
+CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_equal, <=)
+CMP_OP(__vec16_i16, i16, int16_t, __signed_less_equal, <=)
+CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_equal, >=)
+CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_than, <)
+CMP_OP(__vec16_i16, i16, int16_t, __signed_less_than, <)
+CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_than, >)
+CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_than, >)
 SELECT(__vec16_i16)
 INSERT_EXTRACT(__vec16_i16, int16_t)
@@ -510,16 +510,16 @@ SHIFT_UNIFORM(__vec16_i32, uint32_t, __lshr, >>)
 SHIFT_UNIFORM(__vec16_i32, int32_t, __ashr, >>)
 SHIFT_UNIFORM(__vec16_i32, int32_t, __shl, <<)
-CMP_OP(__vec16_i32, int32_t, __equal, ==)
-CMP_OP(__vec16_i32, int32_t, __not_equal, !=)
-CMP_OP(__vec16_i32, uint32_t, __unsigned_less_equal, <=)
-CMP_OP(__vec16_i32, int32_t, __signed_less_equal, <=)
-CMP_OP(__vec16_i32, uint32_t, __unsigned_greater_equal, >=)
-CMP_OP(__vec16_i32, int32_t, __signed_greater_equal, >=)
-CMP_OP(__vec16_i32, uint32_t, __unsigned_less_than, <)
-CMP_OP(__vec16_i32, int32_t, __signed_less_than, <)
-CMP_OP(__vec16_i32, uint32_t, __unsigned_greater_than, >)
-CMP_OP(__vec16_i32, int32_t, __signed_greater_than, >)
+CMP_OP(__vec16_i32, i32, int32_t, __equal, ==)
+CMP_OP(__vec16_i32, i32, int32_t, __not_equal, !=)
+CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_equal, <=)
+CMP_OP(__vec16_i32, i32, int32_t, __signed_less_equal, <=)
+CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_equal, >=)
+CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_than, <)
+CMP_OP(__vec16_i32, i32, int32_t, __signed_less_than, <)
+CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_than, >)
+CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_than, >)
 SELECT(__vec16_i32)
 INSERT_EXTRACT(__vec16_i32, int32_t)
@@ -553,16 +553,16 @@ SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>)
 SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>)
 SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<)
-CMP_OP(__vec16_i64, int64_t, __equal, ==)
-CMP_OP(__vec16_i64, int64_t, __not_equal, !=)
-CMP_OP(__vec16_i64, uint64_t, __unsigned_less_equal, <=)
-CMP_OP(__vec16_i64, int64_t, __signed_less_equal, <=)
-CMP_OP(__vec16_i64, uint64_t, __unsigned_greater_equal, >=)
-CMP_OP(__vec16_i64, int64_t, __signed_greater_equal, >=)
-CMP_OP(__vec16_i64, uint64_t, __unsigned_less_than, <)
-CMP_OP(__vec16_i64, int64_t, __signed_less_than, <)
-CMP_OP(__vec16_i64, uint64_t, __unsigned_greater_than, >)
-CMP_OP(__vec16_i64, int64_t, __signed_greater_than, >)
+CMP_OP(__vec16_i64, i64, int64_t, __equal, ==)
+CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=)
+CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_equal, <=)
+CMP_OP(__vec16_i64, i64, int64_t, __signed_less_equal, <=)
+CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_equal, >=)
+CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_than, <)
+CMP_OP(__vec16_i64, i64, int64_t, __signed_less_than, <)
+CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_than, >)
+CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >)
 SELECT(__vec16_i64)
 INSERT_EXTRACT(__vec16_i64, int64_t)
@@ -580,12 +580,12 @@ BINARY_OP(__vec16_f, __sub, -)
 BINARY_OP(__vec16_f, __mul, *)
 BINARY_OP(__vec16_f, __div, /)
-CMP_OP(__vec16_f, float, __equal, ==)
-CMP_OP(__vec16_f, float, __not_equal, !=)
-CMP_OP(__vec16_f, float, __less_than, <)
-CMP_OP(__vec16_f, float, __less_equal, <=)
-CMP_OP(__vec16_f, float, __greater_than, >)
-CMP_OP(__vec16_f, float, __greater_equal, >=)
+CMP_OP(__vec16_f, float, float, __equal, ==)
+CMP_OP(__vec16_f, float, float, __not_equal, !=)
+CMP_OP(__vec16_f, float, float, __less_than, <)
+CMP_OP(__vec16_f, float, float, __less_equal, <=)
+CMP_OP(__vec16_f, float, float, __greater_than, >)
+CMP_OP(__vec16_f, float, float, __greater_equal, >=)
 static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) {
     __vec16_i1 ret;
@@ -730,12 +730,12 @@ BINARY_OP(__vec16_d, __sub, -)
 BINARY_OP(__vec16_d, __mul, *)
 BINARY_OP(__vec16_d, __div, /)
-CMP_OP(__vec16_d, double, __equal, ==)
-CMP_OP(__vec16_d, double, __not_equal, !=)
-CMP_OP(__vec16_d, double, __less_than, <)
-CMP_OP(__vec16_d, double, __less_equal, <=)
-CMP_OP(__vec16_d, double, __greater_than, >)
-CMP_OP(__vec16_d, double, __greater_equal, >=)
+CMP_OP(__vec16_d, double, double, __equal, ==)
+CMP_OP(__vec16_d, double, double, __not_equal, !=)
+CMP_OP(__vec16_d, double, double, __less_than, <)
+CMP_OP(__vec16_d, double, double, __less_equal, <=)
+CMP_OP(__vec16_d, double, double, __greater_than, >)
+CMP_OP(__vec16_d, double, double, __greater_equal, >=)
 static FORCEINLINE __vec16_i1 __ordered(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;

View File

@@ -258,8 +258,8 @@ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) {  \
     return ret;                                       \
 }
-#define CMP_OP(TYPE, CAST, NAME, OP)                   \
-static FORCEINLINE __vec32_i1 NAME(TYPE a, TYPE b) {   \
+#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP)           \
+static FORCEINLINE __vec32_i1 NAME##_##SUFFIX(TYPE a, TYPE b) {  \
     __vec32_i1 ret;                                    \
     ret.v = 0;                                         \
     for (int i = 0; i < 32; ++i)                       \
@@ -382,7 +382,7 @@ static FORCEINLINE uint64_t __movmsk(__vec32_i1 mask) {
     return (uint64_t)mask.v;
 }
-static FORCEINLINE __vec32_i1 __equal(__vec32_i1 a, __vec32_i1 b) {
+static FORCEINLINE __vec32_i1 __equal_i1(__vec32_i1 a, __vec32_i1 b) {
     __vec32_i1 r;
     r.v = (a.v & b.v) | (~a.v & ~b.v);
     return r;
@@ -491,16 +491,16 @@ SHIFT_UNIFORM(__vec32_i8, uint8_t, __lshr, >>)
 SHIFT_UNIFORM(__vec32_i8, int8_t, __ashr, >>)
 SHIFT_UNIFORM(__vec32_i8, int8_t, __shl, <<)
-CMP_OP(__vec32_i8, int8_t, __equal, ==)
-CMP_OP(__vec32_i8, int8_t, __not_equal, !=)
-CMP_OP(__vec32_i8, uint8_t, __unsigned_less_equal, <=)
-CMP_OP(__vec32_i8, int8_t, __signed_less_equal, <=)
-CMP_OP(__vec32_i8, uint8_t, __unsigned_greater_equal, >=)
-CMP_OP(__vec32_i8, int8_t, __signed_greater_equal, >=)
-CMP_OP(__vec32_i8, uint8_t, __unsigned_less_than, <)
-CMP_OP(__vec32_i8, int8_t, __signed_less_than, <)
-CMP_OP(__vec32_i8, uint8_t, __unsigned_greater_than, >)
-CMP_OP(__vec32_i8, int8_t, __signed_greater_than, >)
+CMP_OP(__vec32_i8, i8, int8_t, __equal, ==)
+CMP_OP(__vec32_i8, i8, int8_t, __not_equal, !=)
+CMP_OP(__vec32_i8, i8, uint8_t, __unsigned_less_equal, <=)
+CMP_OP(__vec32_i8, i8, int8_t, __signed_less_equal, <=)
+CMP_OP(__vec32_i8, i8, uint8_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec32_i8, i8, int8_t, __signed_greater_equal, >=)
+CMP_OP(__vec32_i8, i8, uint8_t, __unsigned_less_than, <)
+CMP_OP(__vec32_i8, i8, int8_t, __signed_less_than, <)
+CMP_OP(__vec32_i8, i8, uint8_t, __unsigned_greater_than, >)
+CMP_OP(__vec32_i8, i8, int8_t, __signed_greater_than, >)
 SELECT(__vec32_i8)
 INSERT_EXTRACT(__vec32_i8, int8_t)
@@ -534,16 +534,16 @@ SHIFT_UNIFORM(__vec32_i16, uint16_t, __lshr, >>)
 SHIFT_UNIFORM(__vec32_i16, int16_t, __ashr, >>)
 SHIFT_UNIFORM(__vec32_i16, int16_t, __shl, <<)
-CMP_OP(__vec32_i16, int16_t, __equal, ==)
-CMP_OP(__vec32_i16, int16_t, __not_equal, !=)
-CMP_OP(__vec32_i16, uint16_t, __unsigned_less_equal, <=)
-CMP_OP(__vec32_i16, int16_t, __signed_less_equal, <=)
-CMP_OP(__vec32_i16, uint16_t, __unsigned_greater_equal, >=)
-CMP_OP(__vec32_i16, int16_t, __signed_greater_equal, >=)
-CMP_OP(__vec32_i16, uint16_t, __unsigned_less_than, <)
-CMP_OP(__vec32_i16, int16_t, __signed_less_than, <)
-CMP_OP(__vec32_i16, uint16_t, __unsigned_greater_than, >)
-CMP_OP(__vec32_i16, int16_t, __signed_greater_than, >)
+CMP_OP(__vec32_i16, i16, int16_t, __equal, ==)
+CMP_OP(__vec32_i16, i16, int16_t, __not_equal, !=)
+CMP_OP(__vec32_i16, i16, uint16_t, __unsigned_less_equal, <=)
+CMP_OP(__vec32_i16, i16, int16_t, __signed_less_equal, <=)
+CMP_OP(__vec32_i16, i16, uint16_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec32_i16, i16, int16_t, __signed_greater_equal, >=)
+CMP_OP(__vec32_i16, i16, uint16_t, __unsigned_less_than, <)
+CMP_OP(__vec32_i16, i16, int16_t, __signed_less_than, <)
+CMP_OP(__vec32_i16, i16, uint16_t, __unsigned_greater_than, >)
+CMP_OP(__vec32_i16, i16, int16_t, __signed_greater_than, >)
 SELECT(__vec32_i16)
 INSERT_EXTRACT(__vec32_i16, int16_t)
@@ -577,16 +577,16 @@ SHIFT_UNIFORM(__vec32_i32, uint32_t, __lshr, >>)
 SHIFT_UNIFORM(__vec32_i32, int32_t, __ashr, >>)
 SHIFT_UNIFORM(__vec32_i32, int32_t, __shl, <<)
-CMP_OP(__vec32_i32, int32_t, __equal, ==)
-CMP_OP(__vec32_i32, int32_t, __not_equal, !=)
-CMP_OP(__vec32_i32, uint32_t, __unsigned_less_equal, <=)
-CMP_OP(__vec32_i32, int32_t, __signed_less_equal, <=)
-CMP_OP(__vec32_i32, uint32_t, __unsigned_greater_equal, >=)
-CMP_OP(__vec32_i32, int32_t, __signed_greater_equal, >=)
-CMP_OP(__vec32_i32, uint32_t, __unsigned_less_than, <)
-CMP_OP(__vec32_i32, int32_t, __signed_less_than, <)
-CMP_OP(__vec32_i32, uint32_t, __unsigned_greater_than, >)
-CMP_OP(__vec32_i32, int32_t, __signed_greater_than, >)
+CMP_OP(__vec32_i32, i32, int32_t, __equal, ==)
+CMP_OP(__vec32_i32, i32, int32_t, __not_equal, !=)
+CMP_OP(__vec32_i32, i32, uint32_t, __unsigned_less_equal, <=)
+CMP_OP(__vec32_i32, i32, int32_t, __signed_less_equal, <=)
+CMP_OP(__vec32_i32, i32, uint32_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec32_i32, i32, int32_t, __signed_greater_equal, >=)
+CMP_OP(__vec32_i32, i32, uint32_t, __unsigned_less_than, <)
+CMP_OP(__vec32_i32, i32, int32_t, __signed_less_than, <)
+CMP_OP(__vec32_i32, i32, uint32_t, __unsigned_greater_than, >)
+CMP_OP(__vec32_i32, i32, int32_t, __signed_greater_than, >)
 SELECT(__vec32_i32)
 INSERT_EXTRACT(__vec32_i32, int32_t)
@@ -620,16 +620,16 @@ SHIFT_UNIFORM(__vec32_i64, uint64_t, __lshr, >>)
 SHIFT_UNIFORM(__vec32_i64, int64_t, __ashr, >>)
 SHIFT_UNIFORM(__vec32_i64, int64_t, __shl, <<)
-CMP_OP(__vec32_i64, int64_t, __equal, ==)
-CMP_OP(__vec32_i64, int64_t, __not_equal, !=)
-CMP_OP(__vec32_i64, uint64_t, __unsigned_less_equal, <=)
-CMP_OP(__vec32_i64, int64_t, __signed_less_equal, <=)
-CMP_OP(__vec32_i64, uint64_t, __unsigned_greater_equal, >=)
-CMP_OP(__vec32_i64, int64_t, __signed_greater_equal, >=)
-CMP_OP(__vec32_i64, uint64_t, __unsigned_less_than, <)
-CMP_OP(__vec32_i64, int64_t, __signed_less_than, <)
-CMP_OP(__vec32_i64, uint64_t, __unsigned_greater_than, >)
-CMP_OP(__vec32_i64, int64_t, __signed_greater_than, >)
+CMP_OP(__vec32_i64, i64, int64_t, __equal, ==)
+CMP_OP(__vec32_i64, i64, int64_t, __not_equal, !=)
+CMP_OP(__vec32_i64, i64, uint64_t, __unsigned_less_equal, <=)
+CMP_OP(__vec32_i64, i64, int64_t, __signed_less_equal, <=)
+CMP_OP(__vec32_i64, i64, uint64_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec32_i64, i64, int64_t, __signed_greater_equal, >=)
+CMP_OP(__vec32_i64, i64, uint64_t, __unsigned_less_than, <)
+CMP_OP(__vec32_i64, i64, int64_t, __signed_less_than, <)
+CMP_OP(__vec32_i64, i64, uint64_t, __unsigned_greater_than, >)
+CMP_OP(__vec32_i64, i64, int64_t, __signed_greater_than, >)
 SELECT(__vec32_i64)
 INSERT_EXTRACT(__vec32_i64, int64_t)
@@ -647,12 +647,12 @@ BINARY_OP(__vec32_f, __sub, -)
 BINARY_OP(__vec32_f, __mul, *)
 BINARY_OP(__vec32_f, __div, /)
-CMP_OP(__vec32_f, float, __equal, ==)
-CMP_OP(__vec32_f, float, __not_equal, !=)
-CMP_OP(__vec32_f, float, __less_than, <)
-CMP_OP(__vec32_f, float, __less_equal, <=)
-CMP_OP(__vec32_f, float, __greater_than, >)
-CMP_OP(__vec32_f, float, __greater_equal, >=)
+CMP_OP(__vec32_f, float, float, __equal, ==)
+CMP_OP(__vec32_f, float, float, __not_equal, !=)
+CMP_OP(__vec32_f, float, float, __less_than, <)
+CMP_OP(__vec32_f, float, float, __less_equal, <=)
+CMP_OP(__vec32_f, float, float, __greater_than, >)
+CMP_OP(__vec32_f, float, float, __greater_equal, >=)
 static FORCEINLINE __vec32_i1 __ordered(__vec32_f a, __vec32_f b) {
     __vec32_i1 ret;
@@ -797,12 +797,12 @@ BINARY_OP(__vec32_d, __sub, -)
 BINARY_OP(__vec32_d, __mul, *)
 BINARY_OP(__vec32_d, __div, /)
-CMP_OP(__vec32_d, double, __equal, ==)
-CMP_OP(__vec32_d, double, __not_equal, !=)
-CMP_OP(__vec32_d, double, __less_than, <)
-CMP_OP(__vec32_d, double, __less_equal, <=)
-CMP_OP(__vec32_d, double, __greater_than, >)
-CMP_OP(__vec32_d, double, __greater_equal, >=)
+CMP_OP(__vec32_d, double, double, __equal, ==)
+CMP_OP(__vec32_d, double, double, __not_equal, !=)
+CMP_OP(__vec32_d, double, double, __less_than, <)
+CMP_OP(__vec32_d, double, double, __less_equal, <=)
+CMP_OP(__vec32_d, double, double, __greater_than, >)
+CMP_OP(__vec32_d, double, double, __greater_equal, >=)
 static FORCEINLINE __vec32_i1 __ordered(__vec32_d a, __vec32_d b) {
     __vec32_i1 ret;

View File

@@ -383,8 +383,8 @@ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) {  \
     return ret;                                       \
 }
-#define CMP_OP(TYPE, CAST, NAME, OP)                   \
-static FORCEINLINE __vec64_i1 NAME(TYPE a, TYPE b) {   \
+#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP)           \
+static FORCEINLINE __vec64_i1 NAME##_##SUFFIX(TYPE a, TYPE b) {  \
     __vec64_i1 ret;                                    \
     ret.v = 0;                                         \
     for (int i = 0; i < 64; ++i)                       \
@@ -507,7 +507,7 @@ static FORCEINLINE uint64_t __movmsk(__vec64_i1 mask) {
     return (uint64_t)mask.v;
 }
-static FORCEINLINE __vec64_i1 __equal(__vec64_i1 a, __vec64_i1 b) {
+static FORCEINLINE __vec64_i1 __equal_i1(__vec64_i1 a, __vec64_i1 b) {
     __vec64_i1 r;
     r.v = (a.v & b.v) | (~a.v & ~b.v);
     return r;
@@ -620,16 +620,16 @@ SHIFT_UNIFORM(__vec64_i8, uint8_t, __lshr, >>)
 SHIFT_UNIFORM(__vec64_i8, int8_t, __ashr, >>)
 SHIFT_UNIFORM(__vec64_i8, int8_t, __shl, <<)
-CMP_OP(__vec64_i8, int8_t, __equal, ==)
-CMP_OP(__vec64_i8, int8_t, __not_equal, !=)
-CMP_OP(__vec64_i8, uint8_t, __unsigned_less_equal, <=)
-CMP_OP(__vec64_i8, int8_t, __signed_less_equal, <=)
-CMP_OP(__vec64_i8, uint8_t, __unsigned_greater_equal, >=)
-CMP_OP(__vec64_i8, int8_t, __signed_greater_equal, >=)
-CMP_OP(__vec64_i8, uint8_t, __unsigned_less_than, <)
-CMP_OP(__vec64_i8, int8_t, __signed_less_than, <)
-CMP_OP(__vec64_i8, uint8_t, __unsigned_greater_than, >)
-CMP_OP(__vec64_i8, int8_t, __signed_greater_than, >)
+CMP_OP(__vec64_i8, i8, int8_t, __equal, ==)
+CMP_OP(__vec64_i8, i8, int8_t, __not_equal, !=)
+CMP_OP(__vec64_i8, i8, uint8_t, __unsigned_less_equal, <=)
+CMP_OP(__vec64_i8, i8, int8_t, __signed_less_equal, <=)
+CMP_OP(__vec64_i8, i8, uint8_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec64_i8, i8, int8_t, __signed_greater_equal, >=)
+CMP_OP(__vec64_i8, i8, uint8_t, __unsigned_less_than, <)
+CMP_OP(__vec64_i8, i8, int8_t, __signed_less_than, <)
+CMP_OP(__vec64_i8, i8, uint8_t, __unsigned_greater_than, >)
+CMP_OP(__vec64_i8, i8, int8_t, __signed_greater_than, >)
 SELECT(__vec64_i8)
 INSERT_EXTRACT(__vec64_i8, int8_t)
@@ -663,16 +663,16 @@ SHIFT_UNIFORM(__vec64_i16, uint16_t, __lshr, >>)
 SHIFT_UNIFORM(__vec64_i16, int16_t, __ashr, >>)
 SHIFT_UNIFORM(__vec64_i16, int16_t, __shl, <<)
-CMP_OP(__vec64_i16, int16_t, __equal, ==)
-CMP_OP(__vec64_i16, int16_t, __not_equal, !=)
-CMP_OP(__vec64_i16, uint16_t, __unsigned_less_equal, <=)
-CMP_OP(__vec64_i16, int16_t, __signed_less_equal, <=)
-CMP_OP(__vec64_i16, uint16_t, __unsigned_greater_equal, >=)
-CMP_OP(__vec64_i16, int16_t, __signed_greater_equal, >=)
-CMP_OP(__vec64_i16, uint16_t, __unsigned_less_than, <)
-CMP_OP(__vec64_i16, int16_t, __signed_less_than, <)
-CMP_OP(__vec64_i16, uint16_t, __unsigned_greater_than, >)
-CMP_OP(__vec64_i16, int16_t, __signed_greater_than, >)
+CMP_OP(__vec64_i16, i16, int16_t, __equal, ==)
+CMP_OP(__vec64_i16, i16, int16_t, __not_equal, !=)
+CMP_OP(__vec64_i16, i16, uint16_t, __unsigned_less_equal, <=)
+CMP_OP(__vec64_i16, i16, int16_t, __signed_less_equal, <=)
+CMP_OP(__vec64_i16, i16, uint16_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec64_i16, i16, int16_t, __signed_greater_equal, >=)
+CMP_OP(__vec64_i16, i16, uint16_t, __unsigned_less_than, <)
+CMP_OP(__vec64_i16, i16, int16_t, __signed_less_than, <)
+CMP_OP(__vec64_i16, i16, uint16_t, __unsigned_greater_than, >)
+CMP_OP(__vec64_i16, i16, int16_t, __signed_greater_than, >)
 SELECT(__vec64_i16)
 INSERT_EXTRACT(__vec64_i16, int16_t)
@@ -706,16 +706,16 @@ SHIFT_UNIFORM(__vec64_i32, uint32_t, __lshr, >>)
 SHIFT_UNIFORM(__vec64_i32, int32_t, __ashr, >>)
 SHIFT_UNIFORM(__vec64_i32, int32_t, __shl, <<)
-CMP_OP(__vec64_i32, int32_t, __equal, ==)
-CMP_OP(__vec64_i32, int32_t, __not_equal, !=)
-CMP_OP(__vec64_i32, uint32_t, __unsigned_less_equal, <=)
-CMP_OP(__vec64_i32, int32_t, __signed_less_equal, <=)
-CMP_OP(__vec64_i32, uint32_t, __unsigned_greater_equal, >=)
-CMP_OP(__vec64_i32, int32_t, __signed_greater_equal, >=)
-CMP_OP(__vec64_i32, uint32_t, __unsigned_less_than, <)
-CMP_OP(__vec64_i32, int32_t, __signed_less_than, <)
-CMP_OP(__vec64_i32, uint32_t, __unsigned_greater_than, >)
-CMP_OP(__vec64_i32, int32_t, __signed_greater_than, >)
+CMP_OP(__vec64_i32, i32, int32_t, __equal, ==)
+CMP_OP(__vec64_i32, i32, int32_t, __not_equal, !=)
+CMP_OP(__vec64_i32, i32, uint32_t, __unsigned_less_equal, <=)
+CMP_OP(__vec64_i32, i32, int32_t, __signed_less_equal, <=)
+CMP_OP(__vec64_i32, i32, uint32_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec64_i32, i32, int32_t, __signed_greater_equal, >=)
+CMP_OP(__vec64_i32, i32, uint32_t, __unsigned_less_than, <)
+CMP_OP(__vec64_i32, i32, int32_t, __signed_less_than, <)
+CMP_OP(__vec64_i32, i32, uint32_t, __unsigned_greater_than, >)
+CMP_OP(__vec64_i32, i32, int32_t, __signed_greater_than, >)
 SELECT(__vec64_i32)
 INSERT_EXTRACT(__vec64_i32, int32_t)
@@ -749,16 +749,16 @@ SHIFT_UNIFORM(__vec64_i64, uint64_t, __lshr, >>)
 SHIFT_UNIFORM(__vec64_i64, int64_t, __ashr, >>)
 SHIFT_UNIFORM(__vec64_i64, int64_t, __shl, <<)
-CMP_OP(__vec64_i64, int64_t, __equal, ==)
-CMP_OP(__vec64_i64, int64_t, __not_equal, !=)
-CMP_OP(__vec64_i64, uint64_t, __unsigned_less_equal, <=)
-CMP_OP(__vec64_i64, int64_t, __signed_less_equal, <=)
-CMP_OP(__vec64_i64, uint64_t, __unsigned_greater_equal, >=)
-CMP_OP(__vec64_i64, int64_t, __signed_greater_equal, >=)
-CMP_OP(__vec64_i64, uint64_t, __unsigned_less_than, <)
-CMP_OP(__vec64_i64, int64_t, __signed_less_than, <)
-CMP_OP(__vec64_i64, uint64_t, __unsigned_greater_than, >)
-CMP_OP(__vec64_i64, int64_t, __signed_greater_than, >)
+CMP_OP(__vec64_i64, i64, int64_t, __equal, ==)
+CMP_OP(__vec64_i64, i64, int64_t, __not_equal, !=)
+CMP_OP(__vec64_i64, i64, uint64_t, __unsigned_less_equal, <=)
+CMP_OP(__vec64_i64, i64, int64_t, __signed_less_equal, <=)
+CMP_OP(__vec64_i64, i64, uint64_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec64_i64, i64, int64_t, __signed_greater_equal, >=)
+CMP_OP(__vec64_i64, i64, uint64_t, __unsigned_less_than, <)
+CMP_OP(__vec64_i64, i64, int64_t, __signed_less_than, <)
+CMP_OP(__vec64_i64, i64, uint64_t, __unsigned_greater_than, >)
+CMP_OP(__vec64_i64, i64, int64_t, __signed_greater_than, >)
 SELECT(__vec64_i64)
 INSERT_EXTRACT(__vec64_i64, int64_t)
@@ -776,12 +776,12 @@ BINARY_OP(__vec64_f, __sub, -)
 BINARY_OP(__vec64_f, __mul, *)
 BINARY_OP(__vec64_f, __div, /)
-CMP_OP(__vec64_f, float, __equal, ==)
-CMP_OP(__vec64_f, float, __not_equal, !=)
-CMP_OP(__vec64_f, float, __less_than, <)
-CMP_OP(__vec64_f, float, __less_equal, <=)
-CMP_OP(__vec64_f, float, __greater_than, >)
-CMP_OP(__vec64_f, float, __greater_equal, >=)
+CMP_OP(__vec64_f, float, float, __equal, ==)
+CMP_OP(__vec64_f, float, float, __not_equal, !=)
+CMP_OP(__vec64_f, float, float, __less_than, <)
+CMP_OP(__vec64_f, float, float, __less_equal, <=)
+CMP_OP(__vec64_f, float, float, __greater_than, >)
+CMP_OP(__vec64_f, float, float, __greater_equal, >=)
 static FORCEINLINE __vec64_i1 __ordered(__vec64_f a, __vec64_f b) {
     __vec64_i1 ret;
@@ -926,12 +926,12 @@ BINARY_OP(__vec64_d, __sub, -)
 BINARY_OP(__vec64_d, __mul, *)
 BINARY_OP(__vec64_d, __div, /)
-CMP_OP(__vec64_d, double, __equal, ==)
-CMP_OP(__vec64_d, double, __not_equal, !=)
-CMP_OP(__vec64_d, double, __less_than, <)
-CMP_OP(__vec64_d, double, __less_equal, <=)
-CMP_OP(__vec64_d, double, __greater_than, >)
-CMP_OP(__vec64_d, double, __greater_equal, >=)
+CMP_OP(__vec64_d, double, double, __equal, ==)
+CMP_OP(__vec64_d, double, double, __not_equal, !=)
+CMP_OP(__vec64_d, double, double, __less_than, <)
+CMP_OP(__vec64_d, double, double, __less_equal, <=)
+CMP_OP(__vec64_d, double, double, __greater_than, >)
+CMP_OP(__vec64_d, double, double, __greater_equal, >=)
 static FORCEINLINE __vec64_i1 __ordered(__vec64_d a, __vec64_d b) {
     __vec64_i1 ret;

View File

@@ -409,7 +409,7 @@ static FORCEINLINE __vec16_i1 __movmsk(__vec16_i1 mask) {
     return _mm512_kmov(mask);
 }
-static FORCEINLINE __vec16_i1 __equal(__vec16_i1 a, __vec16_i1 b) {
+static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) {
     return _mm512_knot(_mm512_kandn(a, b));
 }
@@ -634,43 +634,43 @@ static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, int32_t n) {
     return _mm512_srai_epi32((__m512i)a, n);
 }
-static FORCEINLINE __vec16_i1 __equal(const __vec16_i32 &a, const __vec16_i32 &b) {
+static FORCEINLINE __vec16_i1 __equal_i32(const __vec16_i32 &a, const __vec16_i32 &b) {
     return _mm512_cmpeq_epi32_mask((__m512i)a, (__m512i)b);
 }
-static FORCEINLINE __vec16_i1 __not_equal(__vec16_i32 a, __vec16_i32 b) {
+static FORCEINLINE __vec16_i1 __not_equal_i32(__vec16_i32 a, __vec16_i32 b) {
     return _mm512_cmpneq_epi32_mask((__m512i)a, (__m512i)b);
 }
-static FORCEINLINE __vec16_i1 __unsigned_less_equal(__vec16_i32 a, __vec16_i32 b) {
+static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32(__vec16_i32 a, __vec16_i32 b) {
    return _mm512_cmple_epu32_mask((__m512i)a, (__m512i)b);
 }
-static FORCEINLINE __vec16_i1 __signed_less_equal(__vec16_i32 a, __vec16_i32 b) {
+static FORCEINLINE __vec16_i1 __signed_less_equal_i32(__vec16_i32 a, __vec16_i32 b) {
     return _mm512_cmple_epi32_mask((__m512i)a, (__m512i)b);
 }
-static FORCEINLINE __vec16_i1 __unsigned_greater_equal(__vec16_i32 a, __vec16_i32 b) {
+static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) {
     return _mm512_cmpge_epu32_mask((__m512i)a, (__m512i)b);
 }
-static FORCEINLINE __vec16_i1 __signed_greater_equal(__vec16_i32 a, __vec16_i32 b) {
+static FORCEINLINE __vec16_i1 __signed_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) {
     return _mm512_cmpge_epi32_mask((__m512i)a, (__m512i)b);
 }
-static FORCEINLINE __vec16_i1 __unsigned_less_than(__vec16_i32 a, __vec16_i32 b) {
+static FORCEINLINE __vec16_i1 __unsigned_less_than_i32(__vec16_i32 a, __vec16_i32 b) {
     return _mm512_cmplt_epu32_mask((__m512i)a, (__m512i)b);
 }
-static FORCEINLINE __vec16_i1 __signed_less_than(__vec16_i32 a, __vec16_i32 b) {
+static FORCEINLINE __vec16_i1 __signed_less_than_i32(__vec16_i32 a, __vec16_i32 b) {
     return _mm512_cmplt_epi32_mask((__m512i)a, (__m512i)b);
 }
-static FORCEINLINE __vec16_i1 __unsigned_greater_than(__vec16_i32 a, __vec16_i32 b) {
+static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32(__vec16_i32 a, __vec16_i32 b) {
     return _mm512_cmpgt_epu32_mask((__m512i)a, (__m512i)b);
 }
-static FORCEINLINE __vec16_i1 __signed_greater_than(__vec16_i32 a, __vec16_i32 b) {
+static FORCEINLINE __vec16_i1 __signed_greater_than_i32(__vec16_i32 a, __vec16_i32 b) {
     return _mm512_cmpgt_epi32_mask((__m512i)a, (__m512i)b);
 }
@@ -773,12 +773,12 @@ SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>)
 SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>)
 SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<)
-static FORCEINLINE __vec16_i1 __equal(const __vec16_i64 &a, const __vec16_i64 &b) {
+static FORCEINLINE __vec16_i1 __equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) {
     const __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo, b.v_lo);
     return _mm512_mask_cmpeq_epi32_mask(lo_match, a.v_hi, b.v_hi);
 }
-static FORCEINLINE __vec16_i1 __not_equal(const __vec16_i64 &a, const __vec16_i64 &b) {
+static FORCEINLINE __vec16_i1 __not_equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) {
     return __not(__equal(a,b));
 }
@@ -915,27 +915,27 @@ static FORCEINLINE __vec16_f __div(__vec16_f a, __vec16_f b) {
     return _mm512_div_ps(a, b);
 }
-static FORCEINLINE __vec16_i1 __equal(__vec16_f a, __vec16_f b) {
+static FORCEINLINE __vec16_i1 __equal_float(__vec16_f a, __vec16_f b) {
     return _mm512_cmpeq_ps_mask(a, b);
 }
-static FORCEINLINE __vec16_i1 __not_equal(__vec16_f a, __vec16_f b) {
+static FORCEINLINE __vec16_i1 __not_equal_float(__vec16_f a, __vec16_f b) {
     return _mm512_cmpneq_ps_mask(a, b);
 }
-static FORCEINLINE __vec16_i1 __less_than(__vec16_f a, __vec16_f b) {
+static FORCEINLINE __vec16_i1 __less_than_float(__vec16_f a, __vec16_f b) {
     return _mm512_cmplt_ps_mask(a, b);
 }
-static FORCEINLINE __vec16_i1 __less_equal(__vec16_f a, __vec16_f b) {
+static FORCEINLINE __vec16_i1 __less_equal_float(__vec16_f a, __vec16_f b) {
     return _mm512_cmple_ps_mask(a, b);
 }
-static FORCEINLINE __vec16_i1 __greater_than(__vec16_f a, __vec16_f b) {
+static FORCEINLINE __vec16_i1 __greater_than_float(__vec16_f a, __vec16_f b) {
     return _mm512_cmpnle_ps_mask(a, b);
 }
-static FORCEINLINE __vec16_i1 __greater_equal(__vec16_f a, __vec16_f b) {
+static FORCEINLINE __vec16_i1 __greater_equal_float(__vec16_f a, __vec16_f b) {
     return _mm512_cmpnlt_ps_mask(a, b);
 }
@@ -1043,42 +1043,42 @@ static FORCEINLINE __vec16_d __div(__vec16_d a, __vec16_d b) {
     return ret;
 }
-static FORCEINLINE __vec16_i1 __equal(__vec16_d a, __vec16_d b) {
+static FORCEINLINE __vec16_i1 __equal_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
     ret.m8.m1 = _mm512_cmpeq_pd_mask(a.v1, b.v1);
     ret.m8.m2 = _mm512_cmpeq_pd_mask(a.v2, b.v2);
     return ret;
 }
-static FORCEINLINE __vec16_i1 __not_equal(__vec16_d a, __vec16_d b) {
+static FORCEINLINE __vec16_i1 __not_equal_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
     ret.m8.m1 = _mm512_cmpneq_pd_mask(a.v1, b.v1);
     ret.m8.m2 = _mm512_cmpneq_pd_mask(a.v2, b.v2);
     return ret;
 }
-static FORCEINLINE __vec16_i1 __less_than(__vec16_d a, __vec16_d b) {
+static FORCEINLINE __vec16_i1 __less_than_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
     ret.m8.m1 = _mm512_cmplt_pd_mask(a.v1, b.v1);
     ret.m8.m2 = _mm512_cmplt_pd_mask(a.v2, b.v2);
     return ret;
 }
-static FORCEINLINE __vec16_i1 __less_equal(__vec16_d a, __vec16_d b) {
+static FORCEINLINE __vec16_i1 __less_equal_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
     ret.m8.m1 = _mm512_cmple_pd_mask(a.v1, b.v1);
     ret.m8.m2 = _mm512_cmple_pd_mask(a.v2, b.v2);
     return ret;
 }
-static FORCEINLINE __vec16_i1 __greater_than(__vec16_d a, __vec16_d b) {
+static FORCEINLINE __vec16_i1 __greater_than_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
     ret.m8.m1 = _mm512_cmpnle_pd_mask(a.v1, b.v1);
     ret.m8.m2 = _mm512_cmpnle_pd_mask(a.v2, b.v2);
     return ret;
 }
-static FORCEINLINE __vec16_i1 __greater_equal(__vec16_d a, __vec16_d b) {
+static FORCEINLINE __vec16_i1 __greater_equal_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
     ret.m8.m1 = _mm512_cmpnlt_pd_mask(a.v1, b.v1);
     ret.m8.m2 = _mm512_cmpnlt_pd_mask(a.v2, b.v2);

View File

@@ -244,7 +244,7 @@ static FORCEINLINE uint64_t __movmsk(__vec4_i1 mask) {
     return (uint64_t)_mm_movemask_ps(mask.v);
 }
-static FORCEINLINE __vec4_i1 __equal(__vec4_i1 a, __vec4_i1 b) {
+static FORCEINLINE __vec4_i1 __equal_i1(__vec4_i1 a, __vec4_i1 b) {
     return _mm_cmpeq_epi32(_mm_castps_si128(a.v), _mm_castps_si128(b.v));
 }
@@ -425,7 +425,7 @@ static FORCEINLINE __vec4_i8 __ashr(__vec4_i8 a, int32_t b) {
                      (int8_t)_mm_extract_epi8(a.v, 3) >> b);
 }
-static FORCEINLINE __vec4_i1 __equal(__vec4_i8 a, __vec4_i8 b) {
+static FORCEINLINE __vec4_i1 __equal_i8(__vec4_i8 a, __vec4_i8 b) {
     __m128i cmp = _mm_cmpeq_epi8(a.v, b.v);
     return __vec4_i1(_mm_extract_epi8(cmp, 0),
                      _mm_extract_epi8(cmp, 1),
@@ -433,11 +433,12 @@ static FORCEINLINE __vec4_i1 __equal(__vec4_i8 a, __vec4_i8 b) {
                      _mm_extract_epi8(cmp, 3));
 }
-static FORCEINLINE __vec4_i1 __not_equal(__vec4_i8 a, __vec4_i8 b) {
-    return __xor(__equal(a, b), __vec4_i1(1, 1, 1, 1));
+static FORCEINLINE __vec4_i1 __not_equal_i8(__vec4_i8 a, __vec4_i8 b) {
+    return __xor(__equal_i8(a, b), __vec4_i1(1, 1, 1, 1));
 }
-static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i8 a, __vec4_i8 b) {
+static FORCEINLINE __vec4_i1 __unsigned_less_equal_i8(__vec4_i8 a, __vec4_i8 b) {
     return __vec4_i1((uint8_t)_mm_extract_epi8(a.v, 0) <=
                      (uint8_t)_mm_extract_epi8(b.v, 0),
                      (uint8_t)_mm_extract_epi8(a.v, 1) <=
@@ -448,7 +449,7 @@ static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i8 a, __vec4_i8 b) {
                      (uint8_t)_mm_extract_epi8(b.v, 3));
 }
-static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i8 a, __vec4_i8 b) {
+static FORCEINLINE __vec4_i1 __unsigned_greater_equal_i8(__vec4_i8 a, __vec4_i8 b) {
     return __vec4_i1((uint8_t)_mm_extract_epi8(a.v, 0) >=
                      (uint8_t)_mm_extract_epi8(b.v, 0),
                      (uint8_t)_mm_extract_epi8(a.v, 1) >=
@@ -459,7 +460,7 @@ static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i8 a, __vec4_i8 b)
                      (uint8_t)_mm_extract_epi8(b.v, 3));
 }
-static FORCEINLINE __vec4_i1 __unsigned_less_than(__vec4_i8 a, __vec4_i8 b) {
+static FORCEINLINE __vec4_i1 __unsigned_less_than_i8(__vec4_i8 a, __vec4_i8 b) {
     return __vec4_i1((uint8_t)_mm_extract_epi8(a.v, 0) <
                      (uint8_t)_mm_extract_epi8(b.v, 0),
                      (uint8_t)_mm_extract_epi8(a.v, 1) <
@@ -470,7 +471,7 @@ static FORCEINLINE __vec4_i1 __unsigned_less_than(__vec4_i8 a, __vec4_i8 b) {
                      (uint8_t)_mm_extract_epi8(b.v, 3));
 }
-static FORCEINLINE __vec4_i1 __unsigned_greater_than(__vec4_i8 a, __vec4_i8 b) {
+static FORCEINLINE __vec4_i1 __unsigned_greater_than_i8(__vec4_i8 a, __vec4_i8 b) {
     return __vec4_i1((uint8_t)_mm_extract_epi8(a.v, 0) >
                      (uint8_t)_mm_extract_epi8(b.v, 0),
                      (uint8_t)_mm_extract_epi8(a.v, 1) >
@@ -481,7 +482,7 @@ static FORCEINLINE __vec4_i1 __unsigned_greater_than(__vec4_i8 a, __vec4_i8 b) {
                      (uint8_t)_mm_extract_epi8(b.v, 3));
 }
-static FORCEINLINE __vec4_i1 __signed_less_than(__vec4_i8 a, __vec4_i8 b) {
+static FORCEINLINE __vec4_i1 __signed_less_than_i8(__vec4_i8 a, __vec4_i8 b) {
     __m128i cmp = _mm_cmplt_epi8(a.v, b.v);
     return __vec4_i1(_mm_extract_epi8(cmp, 0),
                      _mm_extract_epi8(cmp, 1),
@@ -489,11 +490,11 @@ static FORCEINLINE __vec4_i1 __signed_less_than(__vec4_i8 a, __vec4_i8 b) {
                      _mm_extract_epi8(cmp, 3));
 }
-static FORCEINLINE __vec4_i1 __signed_less_equal(__vec4_i8 a, __vec4_i8 b) {
-    return __or(__signed_less_than(a, b), __equal(a, b));
+static FORCEINLINE __vec4_i1 __signed_less_equal_i8(__vec4_i8 a, __vec4_i8 b) {
+    return __or(__signed_less_than_i8(a, b), __equal_i8(a, b));
 }
-static FORCEINLINE __vec4_i1 __signed_greater_than(__vec4_i8 a, __vec4_i8 b) {
+static FORCEINLINE __vec4_i1 __signed_greater_than_i8(__vec4_i8 a, __vec4_i8 b) {
     __m128i cmp = _mm_cmpgt_epi8(a.v, b.v);
     return __vec4_i1(_mm_extract_epi8(cmp, 0),
                      _mm_extract_epi8(cmp, 1),
@@ -501,8 +502,8 @@ static FORCEINLINE __vec4_i1 __signed_greater_than(__vec4_i8 a, __vec4_i8 b) {
                      _mm_extract_epi8(cmp, 3));
 }
-static FORCEINLINE __vec4_i1 __signed_greater_equal(__vec4_i8 a, __vec4_i8 b) {
-    return __or(__signed_greater_than(a, b), __equal(a, b));
+static FORCEINLINE __vec4_i1 __signed_greater_equal_i8(__vec4_i8 a, __vec4_i8 b) {
+    return __or(__signed_greater_than_i8(a, b), __equal_i8(a, b));
 }
 static FORCEINLINE __vec4_i8 __select(__vec4_i1 mask, __vec4_i8 a, __vec4_i8 b) {
@@ -681,7 +682,7 @@ static FORCEINLINE __vec4_i16 __ashr(__vec4_i16 a, int32_t b) {
     return _mm_sra_epi16(a.v, _mm_set_epi32(0, 0, 0, b));
 }
-static FORCEINLINE __vec4_i1 __equal(__vec4_i16 a, __vec4_i16 b) {
+static FORCEINLINE __vec4_i1 __equal_i16(__vec4_i16 a, __vec4_i16 b) {
     __m128i cmp = _mm_cmpeq_epi16(a.v, b.v);
     return __vec4_i1(_mm_extract_epi16(cmp, 0),
                      _mm_extract_epi16(cmp, 1),
@@ -689,11 +690,11 @@ static FORCEINLINE __vec4_i1 __equal(__vec4_i16 a, __vec4_i16 b) {
                      _mm_extract_epi16(cmp, 3));
 }
-static FORCEINLINE __vec4_i1 __not_equal(__vec4_i16 a, __vec4_i16 b) {
-    return __xor(__equal(a, b), __vec4_i1(1, 1, 1, 1));
+static FORCEINLINE __vec4_i1 __not_equal_i16(__vec4_i16 a, __vec4_i16 b) {
+    return __xor(__equal_i16(a, b), __vec4_i1(1, 1, 1, 1));
 }
-static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i16 a, __vec4_i16 b) {
+static FORCEINLINE __vec4_i1 __unsigned_less_equal_i16(__vec4_i16 a, __vec4_i16 b) {
     // FIXME: could use the trick that int32 does for the unsigned
     // comparisons so that we don't need to scalarize them. (This also
     // applies to i8s...)
@@ -707,7 +708,7 @@ static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i16 a, __vec4_i16 b) {
                      (uint16_t)_mm_extract_epi16(b.v, 3));
 }
-static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i16 a, __vec4_i16 b) {
+static FORCEINLINE __vec4_i1 __unsigned_greater_equal_i16(__vec4_i16 a, __vec4_i16 b) {
     return __vec4_i1((uint16_t)_mm_extract_epi16(a.v, 0) >=
                      (uint16_t)_mm_extract_epi16(b.v, 0),
                      (uint16_t)_mm_extract_epi16(a.v, 1) >=
@@ -718,7 +719,7 @@ static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i16 a, __vec4_i16 b
                      (uint16_t)_mm_extract_epi16(b.v, 3));
 }
-static FORCEINLINE __vec4_i1 __unsigned_less_than(__vec4_i16 a, __vec4_i16 b) {
+static FORCEINLINE __vec4_i1 __unsigned_less_than_i16(__vec4_i16 a, __vec4_i16 b) {
     return __vec4_i1((uint16_t)_mm_extract_epi16(a.v, 0) <
                      (uint16_t)_mm_extract_epi16(b.v, 0),
                      (uint16_t)_mm_extract_epi16(a.v, 1) <
@@ -729,7 +730,7 @@ static FORCEINLINE __vec4_i1 __unsigned_less_than(__vec4_i16 a, __vec4_i16 b) {
                      (uint16_t)_mm_extract_epi16(b.v, 3));
 }
-static FORCEINLINE __vec4_i1 __unsigned_greater_than(__vec4_i16 a, __vec4_i16 b) {
+static FORCEINLINE __vec4_i1 __unsigned_greater_than_i16(__vec4_i16 a, __vec4_i16 b) {
     return __vec4_i1((uint16_t)_mm_extract_epi16(a.v, 0) >
                      (uint16_t)_mm_extract_epi16(b.v, 0),
                      (uint16_t)_mm_extract_epi16(a.v, 1) >
@@ -740,7 +741,7 @@ static FORCEINLINE __vec4_i1 __unsigned_greater_than(__vec4_i16 a, __vec4_i16 b)
                      (uint16_t)_mm_extract_epi16(b.v, 3));
 }
-static FORCEINLINE __vec4_i1 __signed_less_than(__vec4_i16 a, __vec4_i16 b) {
+static FORCEINLINE __vec4_i1 __signed_less_than_i16(__vec4_i16 a, __vec4_i16 b) {
     __m128i cmp = _mm_cmplt_epi16(a.v, b.v);
     return __vec4_i1(_mm_extract_epi16(cmp, 0),
                      _mm_extract_epi16(cmp, 1),
@@ -748,11 +749,11 @@ static FORCEINLINE __vec4_i1 __signed_less_than(__vec4_i16 a, __vec4_i16 b) {
                      _mm_extract_epi16(cmp, 3));
 }
-static FORCEINLINE __vec4_i1 __signed_less_equal(__vec4_i16 a, __vec4_i16 b) {
-    return __or(__signed_less_than(a, b), __equal(a, b));
+static FORCEINLINE __vec4_i1 __signed_less_equal_i16(__vec4_i16 a, __vec4_i16 b) {
+    return __or(__signed_less_than_i16(a, b), __equal_i16(a, b));
 }
-static FORCEINLINE __vec4_i1 __signed_greater_than(__vec4_i16 a, __vec4_i16 b) {
+static FORCEINLINE __vec4_i1 __signed_greater_than_i16(__vec4_i16 a, __vec4_i16 b) {
     __m128i cmp = _mm_cmpgt_epi16(a.v, b.v);
     return __vec4_i1(_mm_extract_epi16(cmp, 0),
                      _mm_extract_epi16(cmp, 1),
@@ -760,8 +761,8 @@ static FORCEINLINE __vec4_i1 __signed_greater_than(__vec4_i16 a, __vec4_i16 b)
                      _mm_extract_epi16(cmp, 3));
 }
-static FORCEINLINE __vec4_i1 __signed_greater_equal(__vec4_i16 a, __vec4_i16 b) {
-    return __or(__signed_greater_than(a, b), __equal(a, b));
+static FORCEINLINE __vec4_i1 __signed_greater_equal_i16(__vec4_i16 a, __vec4_i16 b) {
+    return __or(__signed_greater_than_i16(a, b), __equal_i16(a, b));
 }
 static FORCEINLINE __vec4_i16 __select(__vec4_i1 mask, __vec4_i16 a, __vec4_i16 b) {
@@ -966,52 +967,52 @@ static FORCEINLINE __vec4_i32 __ashr(__vec4_i32 a, int32_t b) {
     return _mm_sra_epi32(a.v, _mm_set_epi32(0, 0, 0, b));
 }
-static FORCEINLINE __vec4_i1 __equal(__vec4_i32 a, __vec4_i32 b) {
+static FORCEINLINE __vec4_i1 __equal_i32(__vec4_i32 a, __vec4_i32 b) {
    return _mm_cmpeq_epi32(a.v, b.v);
 }
-static FORCEINLINE __vec4_i1 __not_equal(__vec4_i32 a, __vec4_i32 b) {
+static FORCEINLINE __vec4_i1 __not_equal_i32(__vec4_i32 a, __vec4_i32 b) {
     return _mm_xor_si128(_mm_cmpeq_epi32(a.v, b.v),
                          _mm_cmpeq_epi32(a.v, a.v));
 }
-static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i32 a, __vec4_i32 b) {
+static FORCEINLINE __vec4_i1 __unsigned_less_equal_i32(__vec4_i32 a, __vec4_i32 b) {
     // a<=b == (min(a,b) == a)
     return _mm_cmpeq_epi32(_mm_min_epu32(a.v, b.v), a.v);
 }
-static FORCEINLINE __vec4_i1 __signed_less_equal(__vec4_i32 a, __vec4_i32 b) {
+static FORCEINLINE __vec4_i1 __signed_less_equal_i32(__vec4_i32 a, __vec4_i32 b) {
     return _mm_or_si128(_mm_cmplt_epi32(a.v, b.v),
                         _mm_cmpeq_epi32(a.v, b.v));
 }
-static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i32 a, __vec4_i32 b) {
+static FORCEINLINE __vec4_i1 __unsigned_greater_equal_i32(__vec4_i32 a, __vec4_i32 b) {
     // a>=b == (max(a,b) == a)
     return _mm_cmpeq_epi32(_mm_max_epu32(a.v, b.v), a.v);
 }
-static FORCEINLINE __vec4_i1 __signed_greater_equal(__vec4_i32 a, __vec4_i32 b) {
+static FORCEINLINE __vec4_i1 __signed_greater_equal_i32(__vec4_i32 a, __vec4_i32 b) {
     return _mm_or_si128(_mm_cmpgt_epi32(a.v, b.v),
                         _mm_cmpeq_epi32(a.v, b.v));
 }
-static FORCEINLINE __vec4_i1 __unsigned_less_than(__vec4_i32 a, __vec4_i32 b) {
+static FORCEINLINE __vec4_i1 __unsigned_less_than_i32(__vec4_i32 a, __vec4_i32 b) {
     a.v = _mm_xor_si128(a.v, _mm_set1_epi32(0x80000000));
     b.v = _mm_xor_si128(b.v, _mm_set1_epi32(0x80000000));
     return _mm_cmplt_epi32(a.v, b.v);
 }
-static FORCEINLINE __vec4_i1 __signed_less_than(__vec4_i32 a, __vec4_i32 b) {
+static FORCEINLINE __vec4_i1 __signed_less_than_i32(__vec4_i32 a, __vec4_i32 b) {
     return _mm_cmplt_epi32(a.v, b.v);
 }
-static FORCEINLINE __vec4_i1 __unsigned_greater_than(__vec4_i32 a, __vec4_i32 b) {
+static FORCEINLINE __vec4_i1 __unsigned_greater_than_i32(__vec4_i32 a, __vec4_i32 b) {
     a.v = _mm_xor_si128(a.v, _mm_set1_epi32(0x80000000));
     b.v = _mm_xor_si128(b.v, _mm_set1_epi32(0x80000000));
     return _mm_cmpgt_epi32(a.v, b.v);
 }
-static FORCEINLINE __vec4_i1 __signed_greater_than(__vec4_i32 a, __vec4_i32 b) {
+static FORCEINLINE __vec4_i1 __signed_greater_than_i32(__vec4_i32 a, __vec4_i32 b) {
     return _mm_cmpgt_epi32(a.v, b.v);
 }
@@ -1197,18 +1198,18 @@ static FORCEINLINE __vec4_i64 __ashr(__vec4_i64 a, int32_t b) {
                      (int64_t)_mm_extract_epi64(a.v[1], 1) >> b);
 }
-static FORCEINLINE __vec4_i1 __equal(__vec4_i64 a, __vec4_i64 b) {
+static FORCEINLINE __vec4_i1 __equal_i64(__vec4_i64 a, __vec4_i64 b) {
     __m128i cmp0 = _mm_cmpeq_epi64(a.v[0], b.v[0]);
     __m128i cmp1 = _mm_cmpeq_epi64(a.v[1], b.v[1]);
     return _mm_shuffle_ps(_mm_castsi128_ps(cmp0), _mm_castsi128_ps(cmp1),
                           _MM_SHUFFLE(2, 0, 2, 0));
 }
-static FORCEINLINE __vec4_i1 __not_equal(__vec4_i64 a, __vec4_i64 b) {
-    return __xor(__equal(a, b), __vec4_i1(1, 1, 1, 1));
+static FORCEINLINE __vec4_i1 __not_equal_i64(__vec4_i64 a, __vec4_i64 b) {
+    return __xor(__equal_i64(a, b), __vec4_i1(1, 1, 1, 1));
 }
-static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i64 a, __vec4_i64 b) {
+static FORCEINLINE __vec4_i1 __unsigned_less_equal_i64(__vec4_i64 a, __vec4_i64 b) {
     return __vec4_i1((uint64_t)_mm_extract_epi64(a.v[0], 0) <=
                      (uint64_t)_mm_extract_epi64(b.v[0], 0),
                      (uint64_t)_mm_extract_epi64(a.v[0], 1) <=
@@ -1219,7 +1220,7 @@ static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i64 a, __vec4_i64 b) {
                      (uint64_t)_mm_extract_epi64(b.v[1], 1));
 }
-static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i64 a, __vec4_i64 b) {
+static FORCEINLINE __vec4_i1 __unsigned_greater_equal_i64(__vec4_i64 a, __vec4_i64 b) {
     return __vec4_i1((uint64_t)_mm_extract_epi64(a.v[0], 0) >=
                      (uint64_t)_mm_extract_epi64(b.v[0], 0),
                      (uint64_t)_mm_extract_epi64(a.v[0], 1) >=
@@ -1230,7 +1231,7 @@ static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i64 a, __vec4_i64 b
                      (uint64_t)_mm_extract_epi64(b.v[1], 1));
 }
-static FORCEINLINE __vec4_i1 __unsigned_less_than(__vec4_i64 a, __vec4_i64 b) {
+static FORCEINLINE __vec4_i1 __unsigned_less_than_i64(__vec4_i64 a, __vec4_i64 b) {
     return __vec4_i1((uint64_t)_mm_extract_epi64(a.v[0], 0) <
                      (uint64_t)_mm_extract_epi64(b.v[0], 0),
                      (uint64_t)_mm_extract_epi64(a.v[0], 1) <
@@ -1241,7 +1242,7 @@ static FORCEINLINE __vec4_i1 __unsigned_less_than(__vec4_i64 a, __vec4_i64 b) {
                      (uint64_t)_mm_extract_epi64(b.v[1], 1));
 }
-static FORCEINLINE __vec4_i1 __unsigned_greater_than(__vec4_i64 a, __vec4_i64 b) {
+static FORCEINLINE __vec4_i1 __unsigned_greater_than_i64(__vec4_i64 a, __vec4_i64 b) {
     return __vec4_i1((uint64_t)_mm_extract_epi64(a.v[0], 0) >
                      (uint64_t)_mm_extract_epi64(b.v[0], 0),
                      (uint64_t)_mm_extract_epi64(a.v[0], 1) >
@@ -1252,23 +1253,23 @@ static FORCEINLINE __vec4_i1 __unsigned_greater_than(__vec4_i64 a, __vec4_i64 b)
                      (uint64_t)_mm_extract_epi64(b.v[1], 1));
 }
-static FORCEINLINE __vec4_i1 __signed_greater_than(__vec4_i64 a, __vec4_i64 b) {
+static FORCEINLINE __vec4_i1 __signed_greater_than_i64(__vec4_i64 a, __vec4_i64 b) {
    __m128i cmp0 = _mm_cmpgt_epi64(a.v[0], b.v[0]);
     __m128i cmp1 = _mm_cmpgt_epi64(a.v[1], b.v[1]);
     return _mm_shuffle_ps(_mm_castsi128_ps(cmp0), _mm_castsi128_ps(cmp1),
                           _MM_SHUFFLE(2, 0, 2, 0));
 }
-static FORCEINLINE __vec4_i1 __signed_greater_equal(__vec4_i64 a, __vec4_i64 b) {
-    return __or(__signed_greater_than(a, b), __equal(a, b));
+static FORCEINLINE __vec4_i1 __signed_greater_equal_i64(__vec4_i64 a, __vec4_i64 b) {
+    return __or(__signed_greater_than_i64(a, b), __equal_i64(a, b));
 }
-static FORCEINLINE __vec4_i1 __signed_less_than(__vec4_i64 a, __vec4_i64 b) {
-    return __xor(__signed_greater_equal(a, b), __vec4_i1(1, 1, 1, 1));
+static FORCEINLINE __vec4_i1 __signed_less_than_i64(__vec4_i64 a, __vec4_i64 b) {
+    return __xor(__signed_greater_equal_i64(a, b), __vec4_i1(1, 1, 1, 1));
 }
-static FORCEINLINE __vec4_i1 __signed_less_equal(__vec4_i64 a, __vec4_i64 b) {
-    return __xor(__signed_greater_than(a, b), __vec4_i1(1, 1, 1, 1));
+static FORCEINLINE __vec4_i1 __signed_less_equal_i64(__vec4_i64 a, __vec4_i64 b) {
+    return __xor(__signed_greater_than_i64(a, b), __vec4_i1(1, 1, 1, 1));
 }
 static FORCEINLINE __vec4_i64 __select(__vec4_i1 mask, __vec4_i64 a, __vec4_i64 b) {
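A note on the three derived i64 forms above: __vec4_i1(1, 1, 1, 1) is the all-true mask, so each __xor(..., all-true) is a lane-wise NOT. In effect:

    __signed_greater_equal_i64(a, b) == __signed_greater_than_i64(a, b) OR __equal_i64(a, b)
    __signed_less_than_i64(a, b)     == NOT __signed_greater_equal_i64(a, b)
    __signed_less_equal_i64(a, b)    == NOT __signed_greater_than_i64(a, b)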
@@ -1353,31 +1354,31 @@ static FORCEINLINE __vec4_f __div(__vec4_f a, __vec4_f b) {
     return _mm_div_ps(a.v, b.v);
 }
-static FORCEINLINE __vec4_i1 __equal(__vec4_f a, __vec4_f b) {
+static FORCEINLINE __vec4_i1 __equal_float(__vec4_f a, __vec4_f b) {
     return _mm_cmpeq_ps(a.v, b.v);
 }
-static FORCEINLINE __vec4_i1 __not_equal(__vec4_f a, __vec4_f b) {
+static FORCEINLINE __vec4_i1 __not_equal_float(__vec4_f a, __vec4_f b) {
     return _mm_cmpneq_ps(a.v, b.v);
 }
-static FORCEINLINE __vec4_i1 __less_than(__vec4_f a, __vec4_f b) {
+static FORCEINLINE __vec4_i1 __less_than_float(__vec4_f a, __vec4_f b) {
     return _mm_cmplt_ps(a.v, b.v);
 }
-static FORCEINLINE __vec4_i1 __less_equal(__vec4_f a, __vec4_f b) {
+static FORCEINLINE __vec4_i1 __less_equal_float(__vec4_f a, __vec4_f b) {
     return _mm_cmple_ps(a.v, b.v);
 }
-static FORCEINLINE __vec4_i1 __greater_than(__vec4_f a, __vec4_f b) {
+static FORCEINLINE __vec4_i1 __greater_than_float(__vec4_f a, __vec4_f b) {
     return _mm_cmpgt_ps(a.v, b.v);
 }
-static FORCEINLINE __vec4_i1 __greater_equal(__vec4_f a, __vec4_f b) {
+static FORCEINLINE __vec4_i1 __greater_equal_float(__vec4_f a, __vec4_f b) {
     return _mm_cmpge_ps(a.v, b.v);
 }
-static FORCEINLINE __vec4_i1 __ordered(__vec4_f a, __vec4_f b) {
+static FORCEINLINE __vec4_i1 __ordered_float(__vec4_f a, __vec4_f b) {
     return _mm_cmpord_ps(a.v, b.v);
 }
@@ -1458,49 +1459,49 @@ static FORCEINLINE __vec4_d __div(__vec4_d a, __vec4_d b) {
                     _mm_div_pd(a.v[1], b.v[1]));
 }
-static FORCEINLINE __vec4_i1 __equal(__vec4_d a, __vec4_d b) {
+static FORCEINLINE __vec4_i1 __equal_double(__vec4_d a, __vec4_d b) {
     __m128d cmp0 = _mm_cmpeq_pd(a.v[0], b.v[0]);
     __m128d cmp1 = _mm_cmpeq_pd(a.v[1], b.v[1]);
     return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
                           _MM_SHUFFLE(2, 0, 2, 0));
 }
-static FORCEINLINE __vec4_i1 __not_equal(__vec4_d a, __vec4_d b) {
+static FORCEINLINE __vec4_i1 __not_equal_double(__vec4_d a, __vec4_d b) {
     __m128d cmp0 = _mm_cmpneq_pd(a.v[0], b.v[0]);
     __m128d cmp1 = _mm_cmpneq_pd(a.v[1], b.v[1]);
     return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
                           _MM_SHUFFLE(2, 0, 2, 0));
 }
-static FORCEINLINE __vec4_i1 __less_than(__vec4_d a, __vec4_d b) {
+static FORCEINLINE __vec4_i1 __less_than_double(__vec4_d a, __vec4_d b) {
     __m128d cmp0 = _mm_cmplt_pd(a.v[0], b.v[0]);
     __m128d cmp1 = _mm_cmplt_pd(a.v[1], b.v[1]);
     return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
                           _MM_SHUFFLE(2, 0, 2, 0));
 }
-static FORCEINLINE __vec4_i1 __less_equal(__vec4_d a, __vec4_d b) {
+static FORCEINLINE __vec4_i1 __less_equal_double(__vec4_d a, __vec4_d b) {
     __m128d cmp0 = _mm_cmple_pd(a.v[0], b.v[0]);
     __m128d cmp1 = _mm_cmple_pd(a.v[1], b.v[1]);
     return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
                           _MM_SHUFFLE(2, 0, 2, 0));
 }
-static FORCEINLINE __vec4_i1 __greater_than(__vec4_d a, __vec4_d b) {
+static FORCEINLINE __vec4_i1 __greater_than_double(__vec4_d a, __vec4_d b) {
     __m128d cmp0 = _mm_cmpgt_pd(a.v[0], b.v[0]);
     __m128d cmp1 = _mm_cmpgt_pd(a.v[1], b.v[1]);
     return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
                           _MM_SHUFFLE(2, 0, 0, 2));
 }
-static FORCEINLINE __vec4_i1 __greater_equal(__vec4_d a, __vec4_d b) {
+static FORCEINLINE __vec4_i1 __greater_equal_double(__vec4_d a, __vec4_d b) {
     __m128d cmp0 = _mm_cmpge_pd(a.v[0], b.v[0]);
     __m128d cmp1 = _mm_cmpge_pd(a.v[1], b.v[1]);
     return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
                           _MM_SHUFFLE(2, 0, 2, 0));
 }
-static FORCEINLINE __vec4_i1 __ordered(__vec4_d a, __vec4_d b) {
+static FORCEINLINE __vec4_i1 __ordered_double(__vec4_d a, __vec4_d b) {
     __m128d cmp0 = _mm_cmpord_pd(a.v[0], b.v[0]);
     __m128d cmp1 = _mm_cmpord_pd(a.v[1], b.v[1]);
     return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),