Add type suffix to comparison ops in C++ output.
e.g. "__equal()" -> "__equal_float()", etc. No functional change; this is necessary groundwork for a forthcoming peephole optimization that eliminates ANDs of masks in some cases.
cbackend.cpp | 28
@@ -3200,11 +3200,37 @@ lPredicateToString(llvm::CmpInst::Predicate p) {
 }


+static const char *
+lTypeToSuffix(llvm::Type *t) {
+    llvm::VectorType *vt = llvm::dyn_cast<llvm::VectorType>(t);
+    Assert(vt != NULL);
+    t = vt->getElementType();
+
+    switch (t->getTypeID()) {
+    case llvm::Type::FloatTyID: return "float";
+    case llvm::Type::DoubleTyID: return "double";
+    case llvm::Type::IntegerTyID: {
+        switch (llvm::cast<llvm::IntegerType>(t)->getBitWidth()) {
+        case 1: return "i1";
+        case 8: return "i8";
+        case 16: return "i16";
+        case 32: return "i32";
+        case 64: return "i64";
+        }
+    }
+    default: llvm_unreachable(0); return NULL;
+    }
+    return NULL;
+}
+
+
 void CWriter::visitICmpInst(llvm::ICmpInst &I) {
     bool isVector = llvm::isa<llvm::VectorType>(I.getOperand(0)->getType());

     if (isVector) {
         Out << lPredicateToString(I.getPredicate());
+        Out << "_";
+        Out << lTypeToSuffix(I.getOperand(0)->getType());
         Out << "(";
         writeOperand(I.getOperand(0));
         Out << ", ";
@@ -3270,6 +3296,8 @@ void CWriter::visitFCmpInst(llvm::FCmpInst &I) {

     if (isVector) {
         Out << lPredicateToString(I.getPredicate());
+        Out << "_";
+        Out << lTypeToSuffix(I.getOperand(0)->getType());
         Out << "(";
     }
     else {
@@ -193,8 +193,8 @@ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \
     return ret; \
 }

-#define CMP_OP(TYPE, CAST, NAME, OP) \
-static FORCEINLINE __vec16_i1 NAME(TYPE a, TYPE b) { \
+#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP) \
+static FORCEINLINE __vec16_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \
     __vec16_i1 ret; \
     ret.v = 0; \
     for (int i = 0; i < 16; ++i) \
@@ -317,7 +317,7 @@ static FORCEINLINE uint64_t __movmsk(__vec16_i1 mask) {
     return (uint64_t)mask.v;
 }

-static FORCEINLINE __vec16_i1 __equal(__vec16_i1 a, __vec16_i1 b) {
+static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) {
     __vec16_i1 r;
     r.v = (a.v & b.v) | (~a.v & ~b.v);
     return r;
@@ -424,16 +424,16 @@ SHIFT_UNIFORM(__vec16_i8, uint8_t, __lshr, >>)
 SHIFT_UNIFORM(__vec16_i8, int8_t, __ashr, >>)
 SHIFT_UNIFORM(__vec16_i8, int8_t, __shl, <<)

-CMP_OP(__vec16_i8, int8_t, __equal, ==)
-CMP_OP(__vec16_i8, int8_t, __not_equal, !=)
-CMP_OP(__vec16_i8, uint8_t, __unsigned_less_equal, <=)
-CMP_OP(__vec16_i8, int8_t, __signed_less_equal, <=)
-CMP_OP(__vec16_i8, uint8_t, __unsigned_greater_equal, >=)
-CMP_OP(__vec16_i8, int8_t, __signed_greater_equal, >=)
-CMP_OP(__vec16_i8, uint8_t, __unsigned_less_than, <)
-CMP_OP(__vec16_i8, int8_t, __signed_less_than, <)
-CMP_OP(__vec16_i8, uint8_t, __unsigned_greater_than, >)
-CMP_OP(__vec16_i8, int8_t, __signed_greater_than, >)
+CMP_OP(__vec16_i8, i8, int8_t, __equal, ==)
+CMP_OP(__vec16_i8, i8, int8_t, __not_equal, !=)
+CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_equal, <=)
+CMP_OP(__vec16_i8, i8, int8_t, __signed_less_equal, <=)
+CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_equal, >=)
+CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_than, <)
+CMP_OP(__vec16_i8, i8, int8_t, __signed_less_than, <)
+CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_than, >)
+CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_than, >)

 SELECT(__vec16_i8)
 INSERT_EXTRACT(__vec16_i8, int8_t)
@@ -467,16 +467,16 @@ SHIFT_UNIFORM(__vec16_i16, uint16_t, __lshr, >>)
 SHIFT_UNIFORM(__vec16_i16, int16_t, __ashr, >>)
 SHIFT_UNIFORM(__vec16_i16, int16_t, __shl, <<)

-CMP_OP(__vec16_i16, int16_t, __equal, ==)
-CMP_OP(__vec16_i16, int16_t, __not_equal, !=)
-CMP_OP(__vec16_i16, uint16_t, __unsigned_less_equal, <=)
-CMP_OP(__vec16_i16, int16_t, __signed_less_equal, <=)
-CMP_OP(__vec16_i16, uint16_t, __unsigned_greater_equal, >=)
-CMP_OP(__vec16_i16, int16_t, __signed_greater_equal, >=)
-CMP_OP(__vec16_i16, uint16_t, __unsigned_less_than, <)
-CMP_OP(__vec16_i16, int16_t, __signed_less_than, <)
-CMP_OP(__vec16_i16, uint16_t, __unsigned_greater_than, >)
-CMP_OP(__vec16_i16, int16_t, __signed_greater_than, >)
+CMP_OP(__vec16_i16, i16, int16_t, __equal, ==)
+CMP_OP(__vec16_i16, i16, int16_t, __not_equal, !=)
+CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_equal, <=)
+CMP_OP(__vec16_i16, i16, int16_t, __signed_less_equal, <=)
+CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_equal, >=)
+CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_than, <)
+CMP_OP(__vec16_i16, i16, int16_t, __signed_less_than, <)
+CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_than, >)
+CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_than, >)

 SELECT(__vec16_i16)
 INSERT_EXTRACT(__vec16_i16, int16_t)
@@ -510,16 +510,16 @@ SHIFT_UNIFORM(__vec16_i32, uint32_t, __lshr, >>)
 SHIFT_UNIFORM(__vec16_i32, int32_t, __ashr, >>)
 SHIFT_UNIFORM(__vec16_i32, int32_t, __shl, <<)

-CMP_OP(__vec16_i32, int32_t, __equal, ==)
-CMP_OP(__vec16_i32, int32_t, __not_equal, !=)
-CMP_OP(__vec16_i32, uint32_t, __unsigned_less_equal, <=)
-CMP_OP(__vec16_i32, int32_t, __signed_less_equal, <=)
-CMP_OP(__vec16_i32, uint32_t, __unsigned_greater_equal, >=)
-CMP_OP(__vec16_i32, int32_t, __signed_greater_equal, >=)
-CMP_OP(__vec16_i32, uint32_t, __unsigned_less_than, <)
-CMP_OP(__vec16_i32, int32_t, __signed_less_than, <)
-CMP_OP(__vec16_i32, uint32_t, __unsigned_greater_than, >)
-CMP_OP(__vec16_i32, int32_t, __signed_greater_than, >)
+CMP_OP(__vec16_i32, i32, int32_t, __equal, ==)
+CMP_OP(__vec16_i32, i32, int32_t, __not_equal, !=)
+CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_equal, <=)
+CMP_OP(__vec16_i32, i32, int32_t, __signed_less_equal, <=)
+CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_equal, >=)
+CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_than, <)
+CMP_OP(__vec16_i32, i32, int32_t, __signed_less_than, <)
+CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_than, >)
+CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_than, >)

 SELECT(__vec16_i32)
 INSERT_EXTRACT(__vec16_i32, int32_t)
@@ -553,16 +553,16 @@ SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>)
 SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>)
 SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<)

-CMP_OP(__vec16_i64, int64_t, __equal, ==)
-CMP_OP(__vec16_i64, int64_t, __not_equal, !=)
-CMP_OP(__vec16_i64, uint64_t, __unsigned_less_equal, <=)
-CMP_OP(__vec16_i64, int64_t, __signed_less_equal, <=)
-CMP_OP(__vec16_i64, uint64_t, __unsigned_greater_equal, >=)
-CMP_OP(__vec16_i64, int64_t, __signed_greater_equal, >=)
-CMP_OP(__vec16_i64, uint64_t, __unsigned_less_than, <)
-CMP_OP(__vec16_i64, int64_t, __signed_less_than, <)
-CMP_OP(__vec16_i64, uint64_t, __unsigned_greater_than, >)
-CMP_OP(__vec16_i64, int64_t, __signed_greater_than, >)
+CMP_OP(__vec16_i64, i64, int64_t, __equal, ==)
+CMP_OP(__vec16_i64, i64, int64_t, __not_equal, !=)
+CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_equal, <=)
+CMP_OP(__vec16_i64, i64, int64_t, __signed_less_equal, <=)
+CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_equal, >=)
+CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_than, <)
+CMP_OP(__vec16_i64, i64, int64_t, __signed_less_than, <)
+CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_than, >)
+CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >)

 SELECT(__vec16_i64)
 INSERT_EXTRACT(__vec16_i64, int64_t)
@@ -580,12 +580,12 @@ BINARY_OP(__vec16_f, __sub, -)
 BINARY_OP(__vec16_f, __mul, *)
 BINARY_OP(__vec16_f, __div, /)

-CMP_OP(__vec16_f, float, __equal, ==)
-CMP_OP(__vec16_f, float, __not_equal, !=)
-CMP_OP(__vec16_f, float, __less_than, <)
-CMP_OP(__vec16_f, float, __less_equal, <=)
-CMP_OP(__vec16_f, float, __greater_than, >)
-CMP_OP(__vec16_f, float, __greater_equal, >=)
+CMP_OP(__vec16_f, float, float, __equal, ==)
+CMP_OP(__vec16_f, float, float, __not_equal, !=)
+CMP_OP(__vec16_f, float, float, __less_than, <)
+CMP_OP(__vec16_f, float, float, __less_equal, <=)
+CMP_OP(__vec16_f, float, float, __greater_than, >)
+CMP_OP(__vec16_f, float, float, __greater_equal, >=)

 static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) {
     __vec16_i1 ret;
@@ -730,12 +730,12 @@ BINARY_OP(__vec16_d, __sub, -)
 BINARY_OP(__vec16_d, __mul, *)
 BINARY_OP(__vec16_d, __div, /)

-CMP_OP(__vec16_d, double, __equal, ==)
-CMP_OP(__vec16_d, double, __not_equal, !=)
-CMP_OP(__vec16_d, double, __less_than, <)
-CMP_OP(__vec16_d, double, __less_equal, <=)
-CMP_OP(__vec16_d, double, __greater_than, >)
-CMP_OP(__vec16_d, double, __greater_equal, >=)
+CMP_OP(__vec16_d, double, double, __equal, ==)
+CMP_OP(__vec16_d, double, double, __not_equal, !=)
+CMP_OP(__vec16_d, double, double, __less_than, <)
+CMP_OP(__vec16_d, double, double, __less_equal, <=)
+CMP_OP(__vec16_d, double, double, __greater_than, >)
+CMP_OP(__vec16_d, double, double, __greater_equal, >=)

 static FORCEINLINE __vec16_i1 __ordered(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
@@ -258,8 +258,8 @@ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \
     return ret; \
 }

-#define CMP_OP(TYPE, CAST, NAME, OP) \
-static FORCEINLINE __vec32_i1 NAME(TYPE a, TYPE b) { \
+#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP) \
+static FORCEINLINE __vec32_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \
     __vec32_i1 ret; \
     ret.v = 0; \
     for (int i = 0; i < 32; ++i) \
@@ -382,7 +382,7 @@ static FORCEINLINE uint64_t __movmsk(__vec32_i1 mask) {
     return (uint64_t)mask.v;
 }

-static FORCEINLINE __vec32_i1 __equal(__vec32_i1 a, __vec32_i1 b) {
+static FORCEINLINE __vec32_i1 __equal_i1(__vec32_i1 a, __vec32_i1 b) {
     __vec32_i1 r;
     r.v = (a.v & b.v) | (~a.v & ~b.v);
     return r;
@@ -491,16 +491,16 @@ SHIFT_UNIFORM(__vec32_i8, uint8_t, __lshr, >>)
 SHIFT_UNIFORM(__vec32_i8, int8_t, __ashr, >>)
 SHIFT_UNIFORM(__vec32_i8, int8_t, __shl, <<)

-CMP_OP(__vec32_i8, int8_t, __equal, ==)
-CMP_OP(__vec32_i8, int8_t, __not_equal, !=)
-CMP_OP(__vec32_i8, uint8_t, __unsigned_less_equal, <=)
-CMP_OP(__vec32_i8, int8_t, __signed_less_equal, <=)
-CMP_OP(__vec32_i8, uint8_t, __unsigned_greater_equal, >=)
-CMP_OP(__vec32_i8, int8_t, __signed_greater_equal, >=)
-CMP_OP(__vec32_i8, uint8_t, __unsigned_less_than, <)
-CMP_OP(__vec32_i8, int8_t, __signed_less_than, <)
-CMP_OP(__vec32_i8, uint8_t, __unsigned_greater_than, >)
-CMP_OP(__vec32_i8, int8_t, __signed_greater_than, >)
+CMP_OP(__vec32_i8, i8, int8_t, __equal, ==)
+CMP_OP(__vec32_i8, i8, int8_t, __not_equal, !=)
+CMP_OP(__vec32_i8, i8, uint8_t, __unsigned_less_equal, <=)
+CMP_OP(__vec32_i8, i8, int8_t, __signed_less_equal, <=)
+CMP_OP(__vec32_i8, i8, uint8_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec32_i8, i8, int8_t, __signed_greater_equal, >=)
+CMP_OP(__vec32_i8, i8, uint8_t, __unsigned_less_than, <)
+CMP_OP(__vec32_i8, i8, int8_t, __signed_less_than, <)
+CMP_OP(__vec32_i8, i8, uint8_t, __unsigned_greater_than, >)
+CMP_OP(__vec32_i8, i8, int8_t, __signed_greater_than, >)

 SELECT(__vec32_i8)
 INSERT_EXTRACT(__vec32_i8, int8_t)
@@ -534,16 +534,16 @@ SHIFT_UNIFORM(__vec32_i16, uint16_t, __lshr, >>)
 SHIFT_UNIFORM(__vec32_i16, int16_t, __ashr, >>)
 SHIFT_UNIFORM(__vec32_i16, int16_t, __shl, <<)

-CMP_OP(__vec32_i16, int16_t, __equal, ==)
-CMP_OP(__vec32_i16, int16_t, __not_equal, !=)
-CMP_OP(__vec32_i16, uint16_t, __unsigned_less_equal, <=)
-CMP_OP(__vec32_i16, int16_t, __signed_less_equal, <=)
-CMP_OP(__vec32_i16, uint16_t, __unsigned_greater_equal, >=)
-CMP_OP(__vec32_i16, int16_t, __signed_greater_equal, >=)
-CMP_OP(__vec32_i16, uint16_t, __unsigned_less_than, <)
-CMP_OP(__vec32_i16, int16_t, __signed_less_than, <)
-CMP_OP(__vec32_i16, uint16_t, __unsigned_greater_than, >)
-CMP_OP(__vec32_i16, int16_t, __signed_greater_than, >)
+CMP_OP(__vec32_i16, i16, int16_t, __equal, ==)
+CMP_OP(__vec32_i16, i16, int16_t, __not_equal, !=)
+CMP_OP(__vec32_i16, i16, uint16_t, __unsigned_less_equal, <=)
+CMP_OP(__vec32_i16, i16, int16_t, __signed_less_equal, <=)
+CMP_OP(__vec32_i16, i16, uint16_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec32_i16, i16, int16_t, __signed_greater_equal, >=)
+CMP_OP(__vec32_i16, i16, uint16_t, __unsigned_less_than, <)
+CMP_OP(__vec32_i16, i16, int16_t, __signed_less_than, <)
+CMP_OP(__vec32_i16, i16, uint16_t, __unsigned_greater_than, >)
+CMP_OP(__vec32_i16, i16, int16_t, __signed_greater_than, >)

 SELECT(__vec32_i16)
 INSERT_EXTRACT(__vec32_i16, int16_t)
@@ -577,16 +577,16 @@ SHIFT_UNIFORM(__vec32_i32, uint32_t, __lshr, >>)
 SHIFT_UNIFORM(__vec32_i32, int32_t, __ashr, >>)
 SHIFT_UNIFORM(__vec32_i32, int32_t, __shl, <<)

-CMP_OP(__vec32_i32, int32_t, __equal, ==)
-CMP_OP(__vec32_i32, int32_t, __not_equal, !=)
-CMP_OP(__vec32_i32, uint32_t, __unsigned_less_equal, <=)
-CMP_OP(__vec32_i32, int32_t, __signed_less_equal, <=)
-CMP_OP(__vec32_i32, uint32_t, __unsigned_greater_equal, >=)
-CMP_OP(__vec32_i32, int32_t, __signed_greater_equal, >=)
-CMP_OP(__vec32_i32, uint32_t, __unsigned_less_than, <)
-CMP_OP(__vec32_i32, int32_t, __signed_less_than, <)
-CMP_OP(__vec32_i32, uint32_t, __unsigned_greater_than, >)
-CMP_OP(__vec32_i32, int32_t, __signed_greater_than, >)
+CMP_OP(__vec32_i32, i32, int32_t, __equal, ==)
+CMP_OP(__vec32_i32, i32, int32_t, __not_equal, !=)
+CMP_OP(__vec32_i32, i32, uint32_t, __unsigned_less_equal, <=)
+CMP_OP(__vec32_i32, i32, int32_t, __signed_less_equal, <=)
+CMP_OP(__vec32_i32, i32, uint32_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec32_i32, i32, int32_t, __signed_greater_equal, >=)
+CMP_OP(__vec32_i32, i32, uint32_t, __unsigned_less_than, <)
+CMP_OP(__vec32_i32, i32, int32_t, __signed_less_than, <)
+CMP_OP(__vec32_i32, i32, uint32_t, __unsigned_greater_than, >)
+CMP_OP(__vec32_i32, i32, int32_t, __signed_greater_than, >)

 SELECT(__vec32_i32)
 INSERT_EXTRACT(__vec32_i32, int32_t)
@@ -620,16 +620,16 @@ SHIFT_UNIFORM(__vec32_i64, uint64_t, __lshr, >>)
 SHIFT_UNIFORM(__vec32_i64, int64_t, __ashr, >>)
 SHIFT_UNIFORM(__vec32_i64, int64_t, __shl, <<)

-CMP_OP(__vec32_i64, int64_t, __equal, ==)
-CMP_OP(__vec32_i64, int64_t, __not_equal, !=)
-CMP_OP(__vec32_i64, uint64_t, __unsigned_less_equal, <=)
-CMP_OP(__vec32_i64, int64_t, __signed_less_equal, <=)
-CMP_OP(__vec32_i64, uint64_t, __unsigned_greater_equal, >=)
-CMP_OP(__vec32_i64, int64_t, __signed_greater_equal, >=)
-CMP_OP(__vec32_i64, uint64_t, __unsigned_less_than, <)
-CMP_OP(__vec32_i64, int64_t, __signed_less_than, <)
-CMP_OP(__vec32_i64, uint64_t, __unsigned_greater_than, >)
-CMP_OP(__vec32_i64, int64_t, __signed_greater_than, >)
+CMP_OP(__vec32_i64, i64, int64_t, __equal, ==)
+CMP_OP(__vec32_i64, i64, int64_t, __not_equal, !=)
+CMP_OP(__vec32_i64, i64, uint64_t, __unsigned_less_equal, <=)
+CMP_OP(__vec32_i64, i64, int64_t, __signed_less_equal, <=)
+CMP_OP(__vec32_i64, i64, uint64_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec32_i64, i64, int64_t, __signed_greater_equal, >=)
+CMP_OP(__vec32_i64, i64, uint64_t, __unsigned_less_than, <)
+CMP_OP(__vec32_i64, i64, int64_t, __signed_less_than, <)
+CMP_OP(__vec32_i64, i64, uint64_t, __unsigned_greater_than, >)
+CMP_OP(__vec32_i64, i64, int64_t, __signed_greater_than, >)

 SELECT(__vec32_i64)
 INSERT_EXTRACT(__vec32_i64, int64_t)
@@ -647,12 +647,12 @@ BINARY_OP(__vec32_f, __sub, -)
 BINARY_OP(__vec32_f, __mul, *)
 BINARY_OP(__vec32_f, __div, /)

-CMP_OP(__vec32_f, float, __equal, ==)
-CMP_OP(__vec32_f, float, __not_equal, !=)
-CMP_OP(__vec32_f, float, __less_than, <)
-CMP_OP(__vec32_f, float, __less_equal, <=)
-CMP_OP(__vec32_f, float, __greater_than, >)
-CMP_OP(__vec32_f, float, __greater_equal, >=)
+CMP_OP(__vec32_f, float, float, __equal, ==)
+CMP_OP(__vec32_f, float, float, __not_equal, !=)
+CMP_OP(__vec32_f, float, float, __less_than, <)
+CMP_OP(__vec32_f, float, float, __less_equal, <=)
+CMP_OP(__vec32_f, float, float, __greater_than, >)
+CMP_OP(__vec32_f, float, float, __greater_equal, >=)

 static FORCEINLINE __vec32_i1 __ordered(__vec32_f a, __vec32_f b) {
     __vec32_i1 ret;
@@ -797,12 +797,12 @@ BINARY_OP(__vec32_d, __sub, -)
 BINARY_OP(__vec32_d, __mul, *)
 BINARY_OP(__vec32_d, __div, /)

-CMP_OP(__vec32_d, double, __equal, ==)
-CMP_OP(__vec32_d, double, __not_equal, !=)
-CMP_OP(__vec32_d, double, __less_than, <)
-CMP_OP(__vec32_d, double, __less_equal, <=)
-CMP_OP(__vec32_d, double, __greater_than, >)
-CMP_OP(__vec32_d, double, __greater_equal, >=)
+CMP_OP(__vec32_d, double, double, __equal, ==)
+CMP_OP(__vec32_d, double, double, __not_equal, !=)
+CMP_OP(__vec32_d, double, double, __less_than, <)
+CMP_OP(__vec32_d, double, double, __less_equal, <=)
+CMP_OP(__vec32_d, double, double, __greater_than, >)
+CMP_OP(__vec32_d, double, double, __greater_equal, >=)

 static FORCEINLINE __vec32_i1 __ordered(__vec32_d a, __vec32_d b) {
     __vec32_i1 ret;
@@ -383,8 +383,8 @@ static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \
     return ret; \
 }

-#define CMP_OP(TYPE, CAST, NAME, OP) \
-static FORCEINLINE __vec64_i1 NAME(TYPE a, TYPE b) { \
+#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP) \
+static FORCEINLINE __vec64_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \
     __vec64_i1 ret; \
     ret.v = 0; \
     for (int i = 0; i < 64; ++i) \
@@ -507,7 +507,7 @@ static FORCEINLINE uint64_t __movmsk(__vec64_i1 mask) {
     return (uint64_t)mask.v;
 }

-static FORCEINLINE __vec64_i1 __equal(__vec64_i1 a, __vec64_i1 b) {
+static FORCEINLINE __vec64_i1 __equal_i1(__vec64_i1 a, __vec64_i1 b) {
     __vec64_i1 r;
     r.v = (a.v & b.v) | (~a.v & ~b.v);
     return r;
@@ -620,16 +620,16 @@ SHIFT_UNIFORM(__vec64_i8, uint8_t, __lshr, >>)
 SHIFT_UNIFORM(__vec64_i8, int8_t, __ashr, >>)
 SHIFT_UNIFORM(__vec64_i8, int8_t, __shl, <<)

-CMP_OP(__vec64_i8, int8_t, __equal, ==)
-CMP_OP(__vec64_i8, int8_t, __not_equal, !=)
-CMP_OP(__vec64_i8, uint8_t, __unsigned_less_equal, <=)
-CMP_OP(__vec64_i8, int8_t, __signed_less_equal, <=)
-CMP_OP(__vec64_i8, uint8_t, __unsigned_greater_equal, >=)
-CMP_OP(__vec64_i8, int8_t, __signed_greater_equal, >=)
-CMP_OP(__vec64_i8, uint8_t, __unsigned_less_than, <)
-CMP_OP(__vec64_i8, int8_t, __signed_less_than, <)
-CMP_OP(__vec64_i8, uint8_t, __unsigned_greater_than, >)
-CMP_OP(__vec64_i8, int8_t, __signed_greater_than, >)
+CMP_OP(__vec64_i8, i8, int8_t, __equal, ==)
+CMP_OP(__vec64_i8, i8, int8_t, __not_equal, !=)
+CMP_OP(__vec64_i8, i8, uint8_t, __unsigned_less_equal, <=)
+CMP_OP(__vec64_i8, i8, int8_t, __signed_less_equal, <=)
+CMP_OP(__vec64_i8, i8, uint8_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec64_i8, i8, int8_t, __signed_greater_equal, >=)
+CMP_OP(__vec64_i8, i8, uint8_t, __unsigned_less_than, <)
+CMP_OP(__vec64_i8, i8, int8_t, __signed_less_than, <)
+CMP_OP(__vec64_i8, i8, uint8_t, __unsigned_greater_than, >)
+CMP_OP(__vec64_i8, i8, int8_t, __signed_greater_than, >)

 SELECT(__vec64_i8)
 INSERT_EXTRACT(__vec64_i8, int8_t)
@@ -663,16 +663,16 @@ SHIFT_UNIFORM(__vec64_i16, uint16_t, __lshr, >>)
 SHIFT_UNIFORM(__vec64_i16, int16_t, __ashr, >>)
 SHIFT_UNIFORM(__vec64_i16, int16_t, __shl, <<)

-CMP_OP(__vec64_i16, int16_t, __equal, ==)
-CMP_OP(__vec64_i16, int16_t, __not_equal, !=)
-CMP_OP(__vec64_i16, uint16_t, __unsigned_less_equal, <=)
-CMP_OP(__vec64_i16, int16_t, __signed_less_equal, <=)
-CMP_OP(__vec64_i16, uint16_t, __unsigned_greater_equal, >=)
-CMP_OP(__vec64_i16, int16_t, __signed_greater_equal, >=)
-CMP_OP(__vec64_i16, uint16_t, __unsigned_less_than, <)
-CMP_OP(__vec64_i16, int16_t, __signed_less_than, <)
-CMP_OP(__vec64_i16, uint16_t, __unsigned_greater_than, >)
-CMP_OP(__vec64_i16, int16_t, __signed_greater_than, >)
+CMP_OP(__vec64_i16, i16, int16_t, __equal, ==)
+CMP_OP(__vec64_i16, i16, int16_t, __not_equal, !=)
+CMP_OP(__vec64_i16, i16, uint16_t, __unsigned_less_equal, <=)
+CMP_OP(__vec64_i16, i16, int16_t, __signed_less_equal, <=)
+CMP_OP(__vec64_i16, i16, uint16_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec64_i16, i16, int16_t, __signed_greater_equal, >=)
+CMP_OP(__vec64_i16, i16, uint16_t, __unsigned_less_than, <)
+CMP_OP(__vec64_i16, i16, int16_t, __signed_less_than, <)
+CMP_OP(__vec64_i16, i16, uint16_t, __unsigned_greater_than, >)
+CMP_OP(__vec64_i16, i16, int16_t, __signed_greater_than, >)

 SELECT(__vec64_i16)
 INSERT_EXTRACT(__vec64_i16, int16_t)
@@ -706,16 +706,16 @@ SHIFT_UNIFORM(__vec64_i32, uint32_t, __lshr, >>)
 SHIFT_UNIFORM(__vec64_i32, int32_t, __ashr, >>)
 SHIFT_UNIFORM(__vec64_i32, int32_t, __shl, <<)

-CMP_OP(__vec64_i32, int32_t, __equal, ==)
-CMP_OP(__vec64_i32, int32_t, __not_equal, !=)
-CMP_OP(__vec64_i32, uint32_t, __unsigned_less_equal, <=)
-CMP_OP(__vec64_i32, int32_t, __signed_less_equal, <=)
-CMP_OP(__vec64_i32, uint32_t, __unsigned_greater_equal, >=)
-CMP_OP(__vec64_i32, int32_t, __signed_greater_equal, >=)
-CMP_OP(__vec64_i32, uint32_t, __unsigned_less_than, <)
-CMP_OP(__vec64_i32, int32_t, __signed_less_than, <)
-CMP_OP(__vec64_i32, uint32_t, __unsigned_greater_than, >)
-CMP_OP(__vec64_i32, int32_t, __signed_greater_than, >)
+CMP_OP(__vec64_i32, i32, int32_t, __equal, ==)
+CMP_OP(__vec64_i32, i32, int32_t, __not_equal, !=)
+CMP_OP(__vec64_i32, i32, uint32_t, __unsigned_less_equal, <=)
+CMP_OP(__vec64_i32, i32, int32_t, __signed_less_equal, <=)
+CMP_OP(__vec64_i32, i32, uint32_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec64_i32, i32, int32_t, __signed_greater_equal, >=)
+CMP_OP(__vec64_i32, i32, uint32_t, __unsigned_less_than, <)
+CMP_OP(__vec64_i32, i32, int32_t, __signed_less_than, <)
+CMP_OP(__vec64_i32, i32, uint32_t, __unsigned_greater_than, >)
+CMP_OP(__vec64_i32, i32, int32_t, __signed_greater_than, >)

 SELECT(__vec64_i32)
 INSERT_EXTRACT(__vec64_i32, int32_t)
@@ -749,16 +749,16 @@ SHIFT_UNIFORM(__vec64_i64, uint64_t, __lshr, >>)
 SHIFT_UNIFORM(__vec64_i64, int64_t, __ashr, >>)
 SHIFT_UNIFORM(__vec64_i64, int64_t, __shl, <<)

-CMP_OP(__vec64_i64, int64_t, __equal, ==)
-CMP_OP(__vec64_i64, int64_t, __not_equal, !=)
-CMP_OP(__vec64_i64, uint64_t, __unsigned_less_equal, <=)
-CMP_OP(__vec64_i64, int64_t, __signed_less_equal, <=)
-CMP_OP(__vec64_i64, uint64_t, __unsigned_greater_equal, >=)
-CMP_OP(__vec64_i64, int64_t, __signed_greater_equal, >=)
-CMP_OP(__vec64_i64, uint64_t, __unsigned_less_than, <)
-CMP_OP(__vec64_i64, int64_t, __signed_less_than, <)
-CMP_OP(__vec64_i64, uint64_t, __unsigned_greater_than, >)
-CMP_OP(__vec64_i64, int64_t, __signed_greater_than, >)
+CMP_OP(__vec64_i64, i64, int64_t, __equal, ==)
+CMP_OP(__vec64_i64, i64, int64_t, __not_equal, !=)
+CMP_OP(__vec64_i64, i64, uint64_t, __unsigned_less_equal, <=)
+CMP_OP(__vec64_i64, i64, int64_t, __signed_less_equal, <=)
+CMP_OP(__vec64_i64, i64, uint64_t, __unsigned_greater_equal, >=)
+CMP_OP(__vec64_i64, i64, int64_t, __signed_greater_equal, >=)
+CMP_OP(__vec64_i64, i64, uint64_t, __unsigned_less_than, <)
+CMP_OP(__vec64_i64, i64, int64_t, __signed_less_than, <)
+CMP_OP(__vec64_i64, i64, uint64_t, __unsigned_greater_than, >)
+CMP_OP(__vec64_i64, i64, int64_t, __signed_greater_than, >)

 SELECT(__vec64_i64)
 INSERT_EXTRACT(__vec64_i64, int64_t)
@@ -776,12 +776,12 @@ BINARY_OP(__vec64_f, __sub, -)
 BINARY_OP(__vec64_f, __mul, *)
 BINARY_OP(__vec64_f, __div, /)

-CMP_OP(__vec64_f, float, __equal, ==)
-CMP_OP(__vec64_f, float, __not_equal, !=)
-CMP_OP(__vec64_f, float, __less_than, <)
-CMP_OP(__vec64_f, float, __less_equal, <=)
-CMP_OP(__vec64_f, float, __greater_than, >)
-CMP_OP(__vec64_f, float, __greater_equal, >=)
+CMP_OP(__vec64_f, float, float, __equal, ==)
+CMP_OP(__vec64_f, float, float, __not_equal, !=)
+CMP_OP(__vec64_f, float, float, __less_than, <)
+CMP_OP(__vec64_f, float, float, __less_equal, <=)
+CMP_OP(__vec64_f, float, float, __greater_than, >)
+CMP_OP(__vec64_f, float, float, __greater_equal, >=)

 static FORCEINLINE __vec64_i1 __ordered(__vec64_f a, __vec64_f b) {
     __vec64_i1 ret;
@@ -926,12 +926,12 @@ BINARY_OP(__vec64_d, __sub, -)
 BINARY_OP(__vec64_d, __mul, *)
 BINARY_OP(__vec64_d, __div, /)

-CMP_OP(__vec64_d, double, __equal, ==)
-CMP_OP(__vec64_d, double, __not_equal, !=)
-CMP_OP(__vec64_d, double, __less_than, <)
-CMP_OP(__vec64_d, double, __less_equal, <=)
-CMP_OP(__vec64_d, double, __greater_than, >)
-CMP_OP(__vec64_d, double, __greater_equal, >=)
+CMP_OP(__vec64_d, double, double, __equal, ==)
+CMP_OP(__vec64_d, double, double, __not_equal, !=)
+CMP_OP(__vec64_d, double, double, __less_than, <)
+CMP_OP(__vec64_d, double, double, __less_equal, <=)
+CMP_OP(__vec64_d, double, double, __greater_than, >)
+CMP_OP(__vec64_d, double, double, __greater_equal, >=)

 static FORCEINLINE __vec64_i1 __ordered(__vec64_d a, __vec64_d b) {
     __vec64_i1 ret;
@@ -409,7 +409,7 @@ static FORCEINLINE __vec16_i1 __movmsk(__vec16_i1 mask) {
     return _mm512_kmov(mask);
 }

-static FORCEINLINE __vec16_i1 __equal(__vec16_i1 a, __vec16_i1 b) {
+static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) {
     return _mm512_knot( _mm512_kandn(a, b));
 }

@@ -634,43 +634,43 @@ static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, int32_t n) {
     return _mm512_srai_epi32((__m512i)a, n);
 }

-static FORCEINLINE __vec16_i1 __equal(const __vec16_i32 &a, const __vec16_i32 &b) {
+static FORCEINLINE __vec16_i1 __equal_i32(const __vec16_i32 &a, const __vec16_i32 &b) {
     return _mm512_cmpeq_epi32_mask((__m512i)a, (__m512i)b);
 }

-static FORCEINLINE __vec16_i1 __not_equal(__vec16_i32 a, __vec16_i32 b) {
+static FORCEINLINE __vec16_i1 __not_equal_i32(__vec16_i32 a, __vec16_i32 b) {
     return _mm512_cmpneq_epi32_mask((__m512i)a, (__m512i)b);
 }

-static FORCEINLINE __vec16_i1 __unsigned_less_equal(__vec16_i32 a, __vec16_i32 b) {
+static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32(__vec16_i32 a, __vec16_i32 b) {
     return _mm512_cmple_epu32_mask((__m512i)a, (__m512i)b);
 }

-static FORCEINLINE __vec16_i1 __signed_less_equal(__vec16_i32 a, __vec16_i32 b) {
+static FORCEINLINE __vec16_i1 __signed_less_equal_i32(__vec16_i32 a, __vec16_i32 b) {
     return _mm512_cmple_epi32_mask((__m512i)a, (__m512i)b);
 }

-static FORCEINLINE __vec16_i1 __unsigned_greater_equal(__vec16_i32 a, __vec16_i32 b) {
+static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) {
     return _mm512_cmpge_epu32_mask((__m512i)a, (__m512i)b);
 }

-static FORCEINLINE __vec16_i1 __signed_greater_equal(__vec16_i32 a, __vec16_i32 b) {
+static FORCEINLINE __vec16_i1 __signed_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) {
     return _mm512_cmpge_epi32_mask((__m512i)a, (__m512i)b);
 }

-static FORCEINLINE __vec16_i1 __unsigned_less_than(__vec16_i32 a, __vec16_i32 b) {
+static FORCEINLINE __vec16_i1 __unsigned_less_than_i32(__vec16_i32 a, __vec16_i32 b) {
     return _mm512_cmplt_epu32_mask((__m512i)a, (__m512i)b);
 }

-static FORCEINLINE __vec16_i1 __signed_less_than(__vec16_i32 a, __vec16_i32 b) {
+static FORCEINLINE __vec16_i1 __signed_less_than_i32(__vec16_i32 a, __vec16_i32 b) {
     return _mm512_cmplt_epi32_mask((__m512i)a, (__m512i)b);
 }

-static FORCEINLINE __vec16_i1 __unsigned_greater_than(__vec16_i32 a, __vec16_i32 b) {
+static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32(__vec16_i32 a, __vec16_i32 b) {
     return _mm512_cmpgt_epu32_mask((__m512i)a, (__m512i)b);
 }

-static FORCEINLINE __vec16_i1 __signed_greater_than(__vec16_i32 a, __vec16_i32 b) {
+static FORCEINLINE __vec16_i1 __signed_greater_than_i32(__vec16_i32 a, __vec16_i32 b) {
     return _mm512_cmpgt_epi32_mask((__m512i)a, (__m512i)b);
 }

@@ -773,12 +773,12 @@ SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>)
 SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>)
 SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<)

-static FORCEINLINE __vec16_i1 __equal(const __vec16_i64 &a, const __vec16_i64 &b) {
+static FORCEINLINE __vec16_i1 __equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) {
     const __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo);
     return _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi);
 }

-static FORCEINLINE __vec16_i1 __not_equal(const __vec16_i64 &a, const __vec16_i64 &b) {
+static FORCEINLINE __vec16_i1 __not_equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) {
     return __not(__equal(a,b));
 }

@@ -915,27 +915,27 @@ static FORCEINLINE __vec16_f __div(__vec16_f a, __vec16_f b) {
     return _mm512_div_ps(a, b);
 }

-static FORCEINLINE __vec16_i1 __equal(__vec16_f a, __vec16_f b) {
+static FORCEINLINE __vec16_i1 __equal_float(__vec16_f a, __vec16_f b) {
     return _mm512_cmpeq_ps_mask(a, b);
 }

-static FORCEINLINE __vec16_i1 __not_equal(__vec16_f a, __vec16_f b) {
+static FORCEINLINE __vec16_i1 __not_equal_float(__vec16_f a, __vec16_f b) {
     return _mm512_cmpneq_ps_mask(a, b);
 }

-static FORCEINLINE __vec16_i1 __less_than(__vec16_f a, __vec16_f b) {
+static FORCEINLINE __vec16_i1 __less_than_float(__vec16_f a, __vec16_f b) {
     return _mm512_cmplt_ps_mask(a, b);
 }

-static FORCEINLINE __vec16_i1 __less_equal(__vec16_f a, __vec16_f b) {
+static FORCEINLINE __vec16_i1 __less_equal_float(__vec16_f a, __vec16_f b) {
     return _mm512_cmple_ps_mask(a, b);
 }

-static FORCEINLINE __vec16_i1 __greater_than(__vec16_f a, __vec16_f b) {
+static FORCEINLINE __vec16_i1 __greater_than_float(__vec16_f a, __vec16_f b) {
     return _mm512_cmpnle_ps_mask(a, b);
 }

-static FORCEINLINE __vec16_i1 __greater_equal(__vec16_f a, __vec16_f b) {
+static FORCEINLINE __vec16_i1 __greater_equal_float(__vec16_f a, __vec16_f b) {
     return _mm512_cmpnlt_ps_mask(a, b);
 }

@@ -1043,42 +1043,42 @@ static FORCEINLINE __vec16_d __div(__vec16_d a, __vec16_d b) {
     return ret;
 }

-static FORCEINLINE __vec16_i1 __equal(__vec16_d a, __vec16_d b) {
+static FORCEINLINE __vec16_i1 __equal_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
     ret.m8.m1 = _mm512_cmpeq_pd_mask(a.v1, b.v1);
     ret.m8.m2 = _mm512_cmpeq_pd_mask(a.v2, b.v2);
     return ret;
 }

-static FORCEINLINE __vec16_i1 __not_equal(__vec16_d a, __vec16_d b) {
+static FORCEINLINE __vec16_i1 __not_equal_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
     ret.m8.m1 = _mm512_cmpneq_pd_mask(a.v1, b.v1);
     ret.m8.m2 = _mm512_cmpneq_pd_mask(a.v2, b.v2);
     return ret;
 }

-static FORCEINLINE __vec16_i1 __less_than(__vec16_d a, __vec16_d b) {
+static FORCEINLINE __vec16_i1 __less_than_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
     ret.m8.m1 = _mm512_cmplt_pd_mask(a.v1, b.v1);
     ret.m8.m2 = _mm512_cmplt_pd_mask(a.v2, b.v2);
     return ret;
 }

-static FORCEINLINE __vec16_i1 __less_equal(__vec16_d a, __vec16_d b) {
+static FORCEINLINE __vec16_i1 __less_equal_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
     ret.m8.m1 = _mm512_cmple_pd_mask(a.v1, b.v1);
     ret.m8.m2 = _mm512_cmple_pd_mask(a.v2, b.v2);
     return ret;
 }

-static FORCEINLINE __vec16_i1 __greater_than(__vec16_d a, __vec16_d b) {
+static FORCEINLINE __vec16_i1 __greater_than_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
     ret.m8.m1 = _mm512_cmpnle_pd_mask(a.v1, b.v1);
     ret.m8.m2 = _mm512_cmpnle_pd_mask(a.v2, b.v2);
     return ret;
 }

-static FORCEINLINE __vec16_i1 __greater_equal(__vec16_d a, __vec16_d b) {
+static FORCEINLINE __vec16_i1 __greater_equal_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
     ret.m8.m1 = _mm512_cmpnlt_pd_mask(a.v1, b.v1);
     ret.m8.m2 = _mm512_cmpnlt_pd_mask(a.v2, b.v2);
@@ -244,7 +244,7 @@ static FORCEINLINE uint64_t __movmsk(__vec4_i1 mask) {
|
|||||||
return (uint64_t)_mm_movemask_ps(mask.v);
|
return (uint64_t)_mm_movemask_ps(mask.v);
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i1 __equal(__vec4_i1 a, __vec4_i1 b) {
|
static FORCEINLINE __vec4_i1 __equal_i1(__vec4_i1 a, __vec4_i1 b) {
|
||||||
return _mm_cmpeq_epi32(_mm_castps_si128(a.v), _mm_castps_si128(b.v));
|
return _mm_cmpeq_epi32(_mm_castps_si128(a.v), _mm_castps_si128(b.v));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -425,7 +425,7 @@ static FORCEINLINE __vec4_i8 __ashr(__vec4_i8 a, int32_t b) {
|
|||||||
(int8_t)_mm_extract_epi8(a.v, 3) >> b);
|
(int8_t)_mm_extract_epi8(a.v, 3) >> b);
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i1 __equal(__vec4_i8 a, __vec4_i8 b) {
|
static FORCEINLINE __vec4_i1 __equal_i8(__vec4_i8 a, __vec4_i8 b) {
|
||||||
__m128i cmp = _mm_cmpeq_epi8(a.v, b.v);
|
__m128i cmp = _mm_cmpeq_epi8(a.v, b.v);
|
||||||
return __vec4_i1(_mm_extract_epi8(cmp, 0),
|
return __vec4_i1(_mm_extract_epi8(cmp, 0),
|
||||||
_mm_extract_epi8(cmp, 1),
|
_mm_extract_epi8(cmp, 1),
|
||||||
@@ -433,11 +433,12 @@ static FORCEINLINE __vec4_i1 __equal(__vec4_i8 a, __vec4_i8 b) {
|
|||||||
_mm_extract_epi8(cmp, 3));
|
_mm_extract_epi8(cmp, 3));
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i1 __not_equal(__vec4_i8 a, __vec4_i8 b) {
|
|
||||||
return __xor(__equal(a, b), __vec4_i1(1, 1, 1, 1));
|
static FORCEINLINE __vec4_i1 __not_equal_i8(__vec4_i8 a, __vec4_i8 b) {
|
||||||
|
return __xor(__equal_i8(a, b), __vec4_i1(1, 1, 1, 1));
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i8 a, __vec4_i8 b) {
|
static FORCEINLINE __vec4_i1 __unsigned_less_equal_i8(__vec4_i8 a, __vec4_i8 b) {
|
||||||
return __vec4_i1((uint8_t)_mm_extract_epi8(a.v, 0) <=
|
return __vec4_i1((uint8_t)_mm_extract_epi8(a.v, 0) <=
|
||||||
(uint8_t)_mm_extract_epi8(b.v, 0),
|
(uint8_t)_mm_extract_epi8(b.v, 0),
|
||||||
(uint8_t)_mm_extract_epi8(a.v, 1) <=
|
(uint8_t)_mm_extract_epi8(a.v, 1) <=
|
||||||
@@ -448,7 +449,7 @@ static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i8 a, __vec4_i8 b) {
|
|||||||
(uint8_t)_mm_extract_epi8(b.v, 3));
|
(uint8_t)_mm_extract_epi8(b.v, 3));
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i8 a, __vec4_i8 b) {
|
static FORCEINLINE __vec4_i1 __unsigned_greater_equal_i8(__vec4_i8 a, __vec4_i8 b) {
|
||||||
return __vec4_i1((uint8_t)_mm_extract_epi8(a.v, 0) >=
|
return __vec4_i1((uint8_t)_mm_extract_epi8(a.v, 0) >=
|
||||||
(uint8_t)_mm_extract_epi8(b.v, 0),
|
(uint8_t)_mm_extract_epi8(b.v, 0),
|
||||||
(uint8_t)_mm_extract_epi8(a.v, 1) >=
|
(uint8_t)_mm_extract_epi8(a.v, 1) >=
|
||||||
@@ -459,7 +460,7 @@ static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i8 a, __vec4_i8 b)
|
|||||||
(uint8_t)_mm_extract_epi8(b.v, 3));
|
(uint8_t)_mm_extract_epi8(b.v, 3));
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i1 __unsigned_less_than(__vec4_i8 a, __vec4_i8 b) {
|
static FORCEINLINE __vec4_i1 __unsigned_less_than_i8(__vec4_i8 a, __vec4_i8 b) {
|
||||||
return __vec4_i1((uint8_t)_mm_extract_epi8(a.v, 0) <
|
return __vec4_i1((uint8_t)_mm_extract_epi8(a.v, 0) <
|
||||||
(uint8_t)_mm_extract_epi8(b.v, 0),
|
(uint8_t)_mm_extract_epi8(b.v, 0),
|
||||||
(uint8_t)_mm_extract_epi8(a.v, 1) <
|
(uint8_t)_mm_extract_epi8(a.v, 1) <
|
||||||
@@ -470,7 +471,7 @@ static FORCEINLINE __vec4_i1 __unsigned_less_than(__vec4_i8 a, __vec4_i8 b) {
|
|||||||
(uint8_t)_mm_extract_epi8(b.v, 3));
|
(uint8_t)_mm_extract_epi8(b.v, 3));
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i1 __unsigned_greater_than(__vec4_i8 a, __vec4_i8 b) {
|
static FORCEINLINE __vec4_i1 __unsigned_greater_than_i8(__vec4_i8 a, __vec4_i8 b) {
|
||||||
return __vec4_i1((uint8_t)_mm_extract_epi8(a.v, 0) >
|
return __vec4_i1((uint8_t)_mm_extract_epi8(a.v, 0) >
|
||||||
(uint8_t)_mm_extract_epi8(b.v, 0),
|
(uint8_t)_mm_extract_epi8(b.v, 0),
|
||||||
(uint8_t)_mm_extract_epi8(a.v, 1) >
|
(uint8_t)_mm_extract_epi8(a.v, 1) >
|
||||||
@@ -481,7 +482,7 @@ static FORCEINLINE __vec4_i1 __unsigned_greater_than(__vec4_i8 a, __vec4_i8 b) {
|
|||||||
(uint8_t)_mm_extract_epi8(b.v, 3));
|
(uint8_t)_mm_extract_epi8(b.v, 3));
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i1 __signed_less_than(__vec4_i8 a, __vec4_i8 b) {
|
static FORCEINLINE __vec4_i1 __signed_less_than_i8(__vec4_i8 a, __vec4_i8 b) {
|
||||||
__m128i cmp = _mm_cmplt_epi8(a.v, b.v);
|
__m128i cmp = _mm_cmplt_epi8(a.v, b.v);
|
||||||
return __vec4_i1(_mm_extract_epi8(cmp, 0),
|
return __vec4_i1(_mm_extract_epi8(cmp, 0),
|
||||||
_mm_extract_epi8(cmp, 1),
|
_mm_extract_epi8(cmp, 1),
|
||||||
@@ -489,11 +490,11 @@ static FORCEINLINE __vec4_i1 __signed_less_than(__vec4_i8 a, __vec4_i8 b) {
|
|||||||
_mm_extract_epi8(cmp, 3));
|
_mm_extract_epi8(cmp, 3));
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i1 __signed_less_equal(__vec4_i8 a, __vec4_i8 b) {
|
static FORCEINLINE __vec4_i1 __signed_less_equal_i8(__vec4_i8 a, __vec4_i8 b) {
|
||||||
return __or(__signed_less_than(a, b), __equal(a, b));
|
return __or(__signed_less_than_i8(a, b), __equal_i8(a, b));
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i1 __signed_greater_than(__vec4_i8 a, __vec4_i8 b) {
|
static FORCEINLINE __vec4_i1 __signed_greater_than_i8(__vec4_i8 a, __vec4_i8 b) {
|
||||||
__m128i cmp = _mm_cmpgt_epi8(a.v, b.v);
|
__m128i cmp = _mm_cmpgt_epi8(a.v, b.v);
|
||||||
return __vec4_i1(_mm_extract_epi8(cmp, 0),
|
return __vec4_i1(_mm_extract_epi8(cmp, 0),
|
||||||
_mm_extract_epi8(cmp, 1),
|
_mm_extract_epi8(cmp, 1),
|
||||||
@@ -501,8 +502,8 @@ static FORCEINLINE __vec4_i1 __signed_greater_than(__vec4_i8 a, __vec4_i8 b) {
|
|||||||
_mm_extract_epi8(cmp, 3));
|
_mm_extract_epi8(cmp, 3));
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i1 __signed_greater_equal(__vec4_i8 a, __vec4_i8 b) {
|
static FORCEINLINE __vec4_i1 __signed_greater_equal_i8(__vec4_i8 a, __vec4_i8 b) {
|
||||||
return __or(__signed_greater_than(a, b), __equal(a, b));
|
return __or(__signed_greater_than_i8(a, b), __equal_i8(a, b));
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i8 __select(__vec4_i1 mask, __vec4_i8 a, __vec4_i8 b) {
|
static FORCEINLINE __vec4_i8 __select(__vec4_i1 mask, __vec4_i8 a, __vec4_i8 b) {
|
||||||
@@ -681,7 +682,7 @@ static FORCEINLINE __vec4_i16 __ashr(__vec4_i16 a, int32_t b) {
|
|||||||
return _mm_sra_epi16(a.v, _mm_set_epi32(0, 0, 0, b));
|
return _mm_sra_epi16(a.v, _mm_set_epi32(0, 0, 0, b));
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i1 __equal(__vec4_i16 a, __vec4_i16 b) {
|
static FORCEINLINE __vec4_i1 __equal_i16(__vec4_i16 a, __vec4_i16 b) {
|
||||||
__m128i cmp = _mm_cmpeq_epi16(a.v, b.v);
|
__m128i cmp = _mm_cmpeq_epi16(a.v, b.v);
|
||||||
return __vec4_i1(_mm_extract_epi16(cmp, 0),
|
return __vec4_i1(_mm_extract_epi16(cmp, 0),
|
||||||
_mm_extract_epi16(cmp, 1),
|
_mm_extract_epi16(cmp, 1),
|
||||||
@@ -689,11 +690,11 @@ static FORCEINLINE __vec4_i1 __equal(__vec4_i16 a, __vec4_i16 b) {
|
|||||||
_mm_extract_epi16(cmp, 3));
|
_mm_extract_epi16(cmp, 3));
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i1 __not_equal(__vec4_i16 a, __vec4_i16 b) {
|
static FORCEINLINE __vec4_i1 __not_equal_i16(__vec4_i16 a, __vec4_i16 b) {
|
||||||
return __xor(__equal(a, b), __vec4_i1(1, 1, 1, 1));
|
return __xor(__equal_i16(a, b), __vec4_i1(1, 1, 1, 1));
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i16 a, __vec4_i16 b) {
|
static FORCEINLINE __vec4_i1 __unsigned_less_equal_i16(__vec4_i16 a, __vec4_i16 b) {
|
||||||
// FIXME: could use the trick that int32 does for the unsigned
|
// FIXME: could use the trick that int32 does for the unsigned
|
||||||
// comparisons so that we don't need to scalarie them. (This also
|
// comparisons so that we don't need to scalarie them. (This also
|
||||||
// applies to i8s...)
|
// applies to i8s...)
|
||||||
@@ -707,7 +708,7 @@ static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i16 a, __vec4_i16 b) {
|
|||||||
(uint16_t)_mm_extract_epi16(b.v, 3));
|
(uint16_t)_mm_extract_epi16(b.v, 3));
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i16 a, __vec4_i16 b) {
|
static FORCEINLINE __vec4_i1 __unsigned_greater_equal_i16(__vec4_i16 a, __vec4_i16 b) {
|
||||||
return __vec4_i1((uint16_t)_mm_extract_epi16(a.v, 0) >=
|
return __vec4_i1((uint16_t)_mm_extract_epi16(a.v, 0) >=
|
||||||
(uint16_t)_mm_extract_epi16(b.v, 0),
|
(uint16_t)_mm_extract_epi16(b.v, 0),
|
||||||
(uint16_t)_mm_extract_epi16(a.v, 1) >=
|
(uint16_t)_mm_extract_epi16(a.v, 1) >=
|
||||||
@@ -718,7 +719,7 @@ static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i16 a, __vec4_i16 b
|
|||||||
(uint16_t)_mm_extract_epi16(b.v, 3));
|
(uint16_t)_mm_extract_epi16(b.v, 3));
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i1 __unsigned_less_than(__vec4_i16 a, __vec4_i16 b) {
|
static FORCEINLINE __vec4_i1 __unsigned_less_than_i16(__vec4_i16 a, __vec4_i16 b) {
|
||||||
return __vec4_i1((uint16_t)_mm_extract_epi16(a.v, 0) <
|
return __vec4_i1((uint16_t)_mm_extract_epi16(a.v, 0) <
|
||||||
(uint16_t)_mm_extract_epi16(b.v, 0),
|
(uint16_t)_mm_extract_epi16(b.v, 0),
|
||||||
(uint16_t)_mm_extract_epi16(a.v, 1) <
|
(uint16_t)_mm_extract_epi16(a.v, 1) <
|
||||||
@@ -729,7 +730,7 @@ static FORCEINLINE __vec4_i1 __unsigned_less_than(__vec4_i16 a, __vec4_i16 b) {
|
|||||||
(uint16_t)_mm_extract_epi16(b.v, 3));
|
(uint16_t)_mm_extract_epi16(b.v, 3));
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i1 __unsigned_greater_than(__vec4_i16 a, __vec4_i16 b) {
|
static FORCEINLINE __vec4_i1 __unsigned_greater_than_i16(__vec4_i16 a, __vec4_i16 b) {
|
||||||
return __vec4_i1((uint16_t)_mm_extract_epi16(a.v, 0) >
|
return __vec4_i1((uint16_t)_mm_extract_epi16(a.v, 0) >
|
||||||
(uint16_t)_mm_extract_epi16(b.v, 0),
|
(uint16_t)_mm_extract_epi16(b.v, 0),
|
||||||
(uint16_t)_mm_extract_epi16(a.v, 1) >
|
(uint16_t)_mm_extract_epi16(a.v, 1) >
|
||||||
@@ -740,7 +741,7 @@ static FORCEINLINE __vec4_i1 __unsigned_greater_than(__vec4_i16 a, __vec4_i16 b)
|
|||||||
(uint16_t)_mm_extract_epi16(b.v, 3));
|
(uint16_t)_mm_extract_epi16(b.v, 3));
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i1 __signed_less_than(__vec4_i16 a, __vec4_i16 b) {
|
static FORCEINLINE __vec4_i1 __signed_less_than_i16(__vec4_i16 a, __vec4_i16 b) {
|
||||||
__m128i cmp = _mm_cmplt_epi16(a.v, b.v);
|
__m128i cmp = _mm_cmplt_epi16(a.v, b.v);
|
||||||
return __vec4_i1(_mm_extract_epi16(cmp, 0),
|
return __vec4_i1(_mm_extract_epi16(cmp, 0),
|
||||||
_mm_extract_epi16(cmp, 1),
|
_mm_extract_epi16(cmp, 1),
|
||||||
@@ -748,11 +749,11 @@ static FORCEINLINE __vec4_i1 __signed_less_than(__vec4_i16 a, __vec4_i16 b) {
|
|||||||
_mm_extract_epi16(cmp, 3));
|
_mm_extract_epi16(cmp, 3));
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i1 __signed_less_equal(__vec4_i16 a, __vec4_i16 b) {
|
static FORCEINLINE __vec4_i1 __signed_less_equal_i16(__vec4_i16 a, __vec4_i16 b) {
|
||||||
return __or(__signed_less_than(a, b), __equal(a, b));
|
return __or(__signed_less_than_i16(a, b), __equal_i16(a, b));
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i1 __signed_greater_than(__vec4_i16 a, __vec4_i16 b) {
|
static FORCEINLINE __vec4_i1 __signed_greater_than_i16(__vec4_i16 a, __vec4_i16 b) {
|
||||||
__m128i cmp = _mm_cmpgt_epi16(a.v, b.v);
|
__m128i cmp = _mm_cmpgt_epi16(a.v, b.v);
|
||||||
return __vec4_i1(_mm_extract_epi16(cmp, 0),
|
return __vec4_i1(_mm_extract_epi16(cmp, 0),
|
||||||
_mm_extract_epi16(cmp, 1),
|
_mm_extract_epi16(cmp, 1),
|
||||||
@@ -760,8 +761,8 @@ static FORCEINLINE __vec4_i1 __signed_greater_than(__vec4_i16 a, __vec4_i16 b)
|
|||||||
_mm_extract_epi16(cmp, 3));
|
_mm_extract_epi16(cmp, 3));
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i1 __signed_greater_equal(__vec4_i16 a, __vec4_i16 b) {
|
static FORCEINLINE __vec4_i1 __signed_greater_equal_i16(__vec4_i16 a, __vec4_i16 b) {
|
||||||
return __or(__signed_greater_than(a, b), __equal(a, b));
|
return __or(__signed_greater_than_i16(a, b), __equal_i16(a, b));
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i16 __select(__vec4_i1 mask, __vec4_i16 a, __vec4_i16 b) {
|
static FORCEINLINE __vec4_i16 __select(__vec4_i1 mask, __vec4_i16 a, __vec4_i16 b) {
|
||||||
@@ -966,52 +967,52 @@ static FORCEINLINE __vec4_i32 __ashr(__vec4_i32 a, int32_t b) {
     return _mm_sra_epi32(a.v, _mm_set_epi32(0, 0, 0, b));
 }

-static FORCEINLINE __vec4_i1 __equal(__vec4_i32 a, __vec4_i32 b) {
+static FORCEINLINE __vec4_i1 __equal_i32(__vec4_i32 a, __vec4_i32 b) {
     return _mm_cmpeq_epi32(a.v, b.v);
 }

-static FORCEINLINE __vec4_i1 __not_equal(__vec4_i32 a, __vec4_i32 b) {
+static FORCEINLINE __vec4_i1 __not_equal_i32(__vec4_i32 a, __vec4_i32 b) {
     return _mm_xor_si128(_mm_cmpeq_epi32(a.v, b.v),
                          _mm_cmpeq_epi32(a.v, a.v));
 }

-static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i32 a, __vec4_i32 b) {
+static FORCEINLINE __vec4_i1 __unsigned_less_equal_i32(__vec4_i32 a, __vec4_i32 b) {
     // a<=b == (min(a,b) == a)
     return _mm_cmpeq_epi32(_mm_min_epu32(a.v, b.v), a.v);
 }

-static FORCEINLINE __vec4_i1 __signed_less_equal(__vec4_i32 a, __vec4_i32 b) {
+static FORCEINLINE __vec4_i1 __signed_less_equal_i32(__vec4_i32 a, __vec4_i32 b) {
     return _mm_or_si128(_mm_cmplt_epi32(a.v, b.v),
                         _mm_cmpeq_epi32(a.v, b.v));
 }

-static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i32 a, __vec4_i32 b) {
+static FORCEINLINE __vec4_i1 __unsigned_greater_equal_i32(__vec4_i32 a, __vec4_i32 b) {
     // a>=b == (max(a,b) == a)
     return _mm_cmpeq_epi32(_mm_max_epu32(a.v, b.v), a.v);
 }

-static FORCEINLINE __vec4_i1 __signed_greater_equal(__vec4_i32 a, __vec4_i32 b) {
+static FORCEINLINE __vec4_i1 __signed_greater_equal_i32(__vec4_i32 a, __vec4_i32 b) {
     return _mm_or_si128(_mm_cmpgt_epi32(a.v, b.v),
                         _mm_cmpeq_epi32(a.v, b.v));
 }

-static FORCEINLINE __vec4_i1 __unsigned_less_than(__vec4_i32 a, __vec4_i32 b) {
+static FORCEINLINE __vec4_i1 __unsigned_less_than_i32(__vec4_i32 a, __vec4_i32 b) {
     a.v = _mm_xor_si128(a.v, _mm_set1_epi32(0x80000000));
     b.v = _mm_xor_si128(b.v, _mm_set1_epi32(0x80000000));
     return _mm_cmplt_epi32(a.v, b.v);
 }

-static FORCEINLINE __vec4_i1 __signed_less_than(__vec4_i32 a, __vec4_i32 b) {
+static FORCEINLINE __vec4_i1 __signed_less_than_i32(__vec4_i32 a, __vec4_i32 b) {
     return _mm_cmplt_epi32(a.v, b.v);
 }

-static FORCEINLINE __vec4_i1 __unsigned_greater_than(__vec4_i32 a, __vec4_i32 b) {
+static FORCEINLINE __vec4_i1 __unsigned_greater_than_i32(__vec4_i32 a, __vec4_i32 b) {
     a.v = _mm_xor_si128(a.v, _mm_set1_epi32(0x80000000));
     b.v = _mm_xor_si128(b.v, _mm_set1_epi32(0x80000000));
     return _mm_cmpgt_epi32(a.v, b.v);
 }

-static FORCEINLINE __vec4_i1 __signed_greater_than(__vec4_i32 a, __vec4_i32 b) {
+static FORCEINLINE __vec4_i1 __signed_greater_than_i32(__vec4_i32 a, __vec4_i32 b) {
     return _mm_cmpgt_epi32(a.v, b.v);
 }
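Two details in the i32 hunk are easy to miss: SSE4 has no unsigned 32-bit compare, so __unsigned_less_equal_i32 and __unsigned_greater_equal_i32 lean on the identities min(a, b) == a and max(a, b) == a via _mm_min_epu32/_mm_max_epu32, while __unsigned_less_than_i32 and __unsigned_greater_than_i32 flip the sign bit of both operands so that a signed compare reproduces the unsigned ordering. A scalar sketch of that sign-flip trick, assuming the usual two's-complement representation (the helper name is illustrative):

#include <cstdint>
#include <cassert>

// XOR-ing 0x80000000 into both operands shifts the unsigned range
// [0, 2^32) onto the signed range [INT32_MIN, INT32_MAX] while keeping
// relative order, so a signed '<' gives the unsigned result.
static bool unsigned_less_than_via_signed(uint32_t a, uint32_t b) {
    int32_t sa = (int32_t)(a ^ 0x80000000u);
    int32_t sb = (int32_t)(b ^ 0x80000000u);
    return sa < sb;   // mirrors _mm_cmplt_epi32 after the _mm_xor_si128
}

int main() {
    assert(unsigned_less_than_via_signed(1u, 0xFFFFFFFFu));   // a plain signed compare would say false
    assert(!unsigned_less_than_via_signed(0xFFFFFFFFu, 1u));
    assert(unsigned_less_than_via_signed(0x7FFFFFFFu, 0x80000000u));
    return 0;
}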
@@ -1197,18 +1198,18 @@ static FORCEINLINE __vec4_i64 __ashr(__vec4_i64 a, int32_t b) {
                      (int64_t)_mm_extract_epi64(a.v[1], 1) >> b);
 }

-static FORCEINLINE __vec4_i1 __equal(__vec4_i64 a, __vec4_i64 b) {
+static FORCEINLINE __vec4_i1 __equal_i64(__vec4_i64 a, __vec4_i64 b) {
     __m128i cmp0 = _mm_cmpeq_epi64(a.v[0], b.v[0]);
     __m128i cmp1 = _mm_cmpeq_epi64(a.v[1], b.v[1]);
     return _mm_shuffle_ps(_mm_castsi128_ps(cmp0), _mm_castsi128_ps(cmp1),
                           _MM_SHUFFLE(2, 0, 2, 0));
 }

-static FORCEINLINE __vec4_i1 __not_equal(__vec4_i64 a, __vec4_i64 b) {
-    return __xor(__equal(a, b), __vec4_i1(1, 1, 1, 1));
+static FORCEINLINE __vec4_i1 __not_equal_i64(__vec4_i64 a, __vec4_i64 b) {
+    return __xor(__equal_i64(a, b), __vec4_i1(1, 1, 1, 1));
 }

-static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i64 a, __vec4_i64 b) {
+static FORCEINLINE __vec4_i1 __unsigned_less_equal_i64(__vec4_i64 a, __vec4_i64 b) {
     return __vec4_i1((uint64_t)_mm_extract_epi64(a.v[0], 0) <=
                      (uint64_t)_mm_extract_epi64(b.v[0], 0),
                      (uint64_t)_mm_extract_epi64(a.v[0], 1) <=
@@ -1219,7 +1220,7 @@ static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i64 a, __vec4_i64 b) {
                      (uint64_t)_mm_extract_epi64(b.v[1], 1));
 }

-static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i64 a, __vec4_i64 b) {
+static FORCEINLINE __vec4_i1 __unsigned_greater_equal_i64(__vec4_i64 a, __vec4_i64 b) {
     return __vec4_i1((uint64_t)_mm_extract_epi64(a.v[0], 0) >=
                      (uint64_t)_mm_extract_epi64(b.v[0], 0),
                      (uint64_t)_mm_extract_epi64(a.v[0], 1) >=
@@ -1230,7 +1231,7 @@ static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i64 a, __vec4_i64 b
                      (uint64_t)_mm_extract_epi64(b.v[1], 1));
 }

-static FORCEINLINE __vec4_i1 __unsigned_less_than(__vec4_i64 a, __vec4_i64 b) {
+static FORCEINLINE __vec4_i1 __unsigned_less_than_i64(__vec4_i64 a, __vec4_i64 b) {
     return __vec4_i1((uint64_t)_mm_extract_epi64(a.v[0], 0) <
                      (uint64_t)_mm_extract_epi64(b.v[0], 0),
                      (uint64_t)_mm_extract_epi64(a.v[0], 1) <
@@ -1241,7 +1242,7 @@ static FORCEINLINE __vec4_i1 __unsigned_less_than(__vec4_i64 a, __vec4_i64 b) {
                      (uint64_t)_mm_extract_epi64(b.v[1], 1));
 }

-static FORCEINLINE __vec4_i1 __unsigned_greater_than(__vec4_i64 a, __vec4_i64 b) {
+static FORCEINLINE __vec4_i1 __unsigned_greater_than_i64(__vec4_i64 a, __vec4_i64 b) {
     return __vec4_i1((uint64_t)_mm_extract_epi64(a.v[0], 0) >
                      (uint64_t)_mm_extract_epi64(b.v[0], 0),
                      (uint64_t)_mm_extract_epi64(a.v[0], 1) >
@@ -1252,23 +1253,23 @@ static FORCEINLINE __vec4_i1 __unsigned_greater_than(__vec4_i64 a, __vec4_i64 b)
                      (uint64_t)_mm_extract_epi64(b.v[1], 1));
 }

-static FORCEINLINE __vec4_i1 __signed_greater_than(__vec4_i64 a, __vec4_i64 b) {
+static FORCEINLINE __vec4_i1 __signed_greater_than_i64(__vec4_i64 a, __vec4_i64 b) {
     __m128i cmp0 = _mm_cmpgt_epi64(a.v[0], b.v[0]);
     __m128i cmp1 = _mm_cmpgt_epi64(a.v[1], b.v[1]);
     return _mm_shuffle_ps(_mm_castsi128_ps(cmp0), _mm_castsi128_ps(cmp1),
                           _MM_SHUFFLE(2, 0, 2, 0));
 }

-static FORCEINLINE __vec4_i1 __signed_greater_equal(__vec4_i64 a, __vec4_i64 b) {
-    return __or(__signed_greater_than(a, b), __equal(a, b));
+static FORCEINLINE __vec4_i1 __signed_greater_equal_i64(__vec4_i64 a, __vec4_i64 b) {
+    return __or(__signed_greater_than_i64(a, b), __equal_i64(a, b));
 }

-static FORCEINLINE __vec4_i1 __signed_less_than(__vec4_i64 a, __vec4_i64 b) {
-    return __xor(__signed_greater_equal(a, b), __vec4_i1(1, 1, 1, 1));
+static FORCEINLINE __vec4_i1 __signed_less_than_i64(__vec4_i64 a, __vec4_i64 b) {
+    return __xor(__signed_greater_equal_i64(a, b), __vec4_i1(1, 1, 1, 1));
 }

-static FORCEINLINE __vec4_i1 __signed_less_equal(__vec4_i64 a, __vec4_i64 b) {
-    return __xor(__signed_greater_than(a, b), __vec4_i1(1, 1, 1, 1));
+static FORCEINLINE __vec4_i1 __signed_less_equal_i64(__vec4_i64 a, __vec4_i64 b) {
+    return __xor(__signed_greater_than_i64(a, b), __vec4_i1(1, 1, 1, 1));
 }

 static FORCEINLINE __vec4_i64 __select(__vec4_i1 mask, __vec4_i64 a, __vec4_i64 b) {
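In the i64 hunk (and again for double further down), each compare is done per 128-bit half and yields two 64-bit lanes of all-ones or all-zeros; _mm_shuffle_ps(..., _MM_SHUFFLE(2, 0, 2, 0)) then keeps 32-bit elements 0 and 2 of each half, collapsing every 64-bit lane mask into one 32-bit lane of the 4-wide __vec4_i1. The unsigned i64 variants fall back to scalar _mm_extract_epi64 because SSE4 only provides a signed 64-bit compare. A rough scalar model of the lane-narrowing step (names and types here are illustrative, not part of the header):

#include <array>
#include <cstdint>
#include <cassert>

// Two 64-bit masks per half-vector, narrowed to four 32-bit masks by
// taking the low 32 bits of each 64-bit lane -- i.e. 32-bit elements
// 0 and 2 of each half, which is what _MM_SHUFFLE(2, 0, 2, 0) selects.
static std::array<uint32_t, 4> narrow_masks(std::array<uint64_t, 2> lo,
                                            std::array<uint64_t, 2> hi) {
    return { (uint32_t)lo[0], (uint32_t)lo[1],
             (uint32_t)hi[0], (uint32_t)hi[1] };
}

int main() {
    // Lanes 0 and 3 compared true, lanes 1 and 2 false.
    std::array<uint32_t, 4> m = narrow_masks({~0ull, 0ull}, {0ull, ~0ull});
    assert(m[0] == 0xFFFFFFFFu && m[1] == 0u && m[2] == 0u && m[3] == 0xFFFFFFFFu);
    return 0;
}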
@@ -1353,31 +1354,31 @@ static FORCEINLINE __vec4_f __div(__vec4_f a, __vec4_f b) {
     return _mm_div_ps(a.v, b.v);
 }

-static FORCEINLINE __vec4_i1 __equal(__vec4_f a, __vec4_f b) {
+static FORCEINLINE __vec4_i1 __equal_float(__vec4_f a, __vec4_f b) {
     return _mm_cmpeq_ps(a.v, b.v);
 }

-static FORCEINLINE __vec4_i1 __not_equal(__vec4_f a, __vec4_f b) {
+static FORCEINLINE __vec4_i1 __not_equal_float(__vec4_f a, __vec4_f b) {
     return _mm_cmpneq_ps(a.v, b.v);
 }

-static FORCEINLINE __vec4_i1 __less_than(__vec4_f a, __vec4_f b) {
+static FORCEINLINE __vec4_i1 __less_than_float(__vec4_f a, __vec4_f b) {
     return _mm_cmplt_ps(a.v, b.v);
 }

-static FORCEINLINE __vec4_i1 __less_equal(__vec4_f a, __vec4_f b) {
+static FORCEINLINE __vec4_i1 __less_equal_float(__vec4_f a, __vec4_f b) {
     return _mm_cmple_ps(a.v, b.v);
 }

-static FORCEINLINE __vec4_i1 __greater_than(__vec4_f a, __vec4_f b) {
+static FORCEINLINE __vec4_i1 __greater_than_float(__vec4_f a, __vec4_f b) {
     return _mm_cmpgt_ps(a.v, b.v);
 }

-static FORCEINLINE __vec4_i1 __greater_equal(__vec4_f a, __vec4_f b) {
+static FORCEINLINE __vec4_i1 __greater_equal_float(__vec4_f a, __vec4_f b) {
     return _mm_cmpge_ps(a.v, b.v);
 }

-static FORCEINLINE __vec4_i1 __ordered(__vec4_f a, __vec4_f b) {
+static FORCEINLINE __vec4_i1 __ordered_float(__vec4_f a, __vec4_f b) {
     return _mm_cmpord_ps(a.v, b.v);
 }
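The float comparisons map one-to-one onto single SSE intrinsics, so the only subtle case is __ordered_float: _mm_cmpord_ps sets a lane to all-ones exactly when neither input lane is NaN. A scalar sketch of that predicate (the helper below is illustrative only):

#include <cmath>
#include <limits>
#include <cassert>

// "Ordered" per lane: both operands are comparable, i.e. neither is NaN.
static bool ordered(float a, float b) {
    return !std::isnan(a) && !std::isnan(b);
}

int main() {
    const float nan = std::numeric_limits<float>::quiet_NaN();
    assert(ordered(1.0f, 2.0f));
    assert(!ordered(nan, 2.0f));
    assert(!ordered(1.0f, nan));
    return 0;
}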
@@ -1458,49 +1459,49 @@ static FORCEINLINE __vec4_d __div(__vec4_d a, __vec4_d b) {
                     _mm_div_pd(a.v[1], b.v[1]));
 }

-static FORCEINLINE __vec4_i1 __equal(__vec4_d a, __vec4_d b) {
+static FORCEINLINE __vec4_i1 __equal_double(__vec4_d a, __vec4_d b) {
     __m128d cmp0 = _mm_cmpeq_pd(a.v[0], b.v[0]);
     __m128d cmp1 = _mm_cmpeq_pd(a.v[1], b.v[1]);
     return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
                           _MM_SHUFFLE(2, 0, 2, 0));
 }

-static FORCEINLINE __vec4_i1 __not_equal(__vec4_d a, __vec4_d b) {
+static FORCEINLINE __vec4_i1 __not_equal_double(__vec4_d a, __vec4_d b) {
     __m128d cmp0 = _mm_cmpneq_pd(a.v[0], b.v[0]);
     __m128d cmp1 = _mm_cmpneq_pd(a.v[1], b.v[1]);
     return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
                           _MM_SHUFFLE(2, 0, 2, 0));
 }

-static FORCEINLINE __vec4_i1 __less_than(__vec4_d a, __vec4_d b) {
+static FORCEINLINE __vec4_i1 __less_than_double(__vec4_d a, __vec4_d b) {
     __m128d cmp0 = _mm_cmplt_pd(a.v[0], b.v[0]);
     __m128d cmp1 = _mm_cmplt_pd(a.v[1], b.v[1]);
     return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
                           _MM_SHUFFLE(2, 0, 2, 0));
 }

-static FORCEINLINE __vec4_i1 __less_equal(__vec4_d a, __vec4_d b) {
+static FORCEINLINE __vec4_i1 __less_equal_double(__vec4_d a, __vec4_d b) {
     __m128d cmp0 = _mm_cmple_pd(a.v[0], b.v[0]);
     __m128d cmp1 = _mm_cmple_pd(a.v[1], b.v[1]);
     return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
                           _MM_SHUFFLE(2, 0, 2, 0));
 }

-static FORCEINLINE __vec4_i1 __greater_than(__vec4_d a, __vec4_d b) {
+static FORCEINLINE __vec4_i1 __greater_than_double(__vec4_d a, __vec4_d b) {
     __m128d cmp0 = _mm_cmpgt_pd(a.v[0], b.v[0]);
     __m128d cmp1 = _mm_cmpgt_pd(a.v[1], b.v[1]);
     return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
                           _MM_SHUFFLE(2, 0, 0 ,2));
 }

-static FORCEINLINE __vec4_i1 __greater_equal(__vec4_d a, __vec4_d b) {
+static FORCEINLINE __vec4_i1 __greater_equal_double(__vec4_d a, __vec4_d b) {
     __m128d cmp0 = _mm_cmpge_pd(a.v[0], b.v[0]);
     __m128d cmp1 = _mm_cmpge_pd(a.v[1], b.v[1]);
     return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
                           _MM_SHUFFLE(2, 0, 2, 0));
 }

-static FORCEINLINE __vec4_i1 __ordered(__vec4_d a, __vec4_d b) {
+static FORCEINLINE __vec4_i1 __ordered_double(__vec4_d a, __vec4_d b) {
     __m128d cmp0 = _mm_cmpord_pd(a.v[0], b.v[0]);
     __m128d cmp1 = _mm_cmpord_pd(a.v[1], b.v[1]);
     return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),