diff --git a/cbackend.cpp b/cbackend.cpp
index 152caf6e..1505dae6 100644
--- a/cbackend.cpp
+++ b/cbackend.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 #include "ispc.h"
+#include "module.h"
 #include
@@ -4447,6 +4448,104 @@ BitcastCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) {
     return modifiedAny;
 }
 
+///////////////////////////////////////////////////////////////////////////
+// AndCmpCleanupPass
+
+class AndCmpCleanupPass : public llvm::BasicBlockPass {
+public:
+    AndCmpCleanupPass()
+        : BasicBlockPass(ID) { }
+
+    const char *getPassName() const { return "AndCmp Cleanup Pass"; }
+    bool runOnBasicBlock(llvm::BasicBlock &BB);
+
+    static char ID;
+};
+
+char AndCmpCleanupPass::ID = 0;
+
+// Look for ANDs of masks where one of the operands is a vector compare; we
+// can turn these into specialized calls to masked vector compares and
+// thence eliminate the AND.  For example, rather than emitting
+// __and(__less(a, b), c), we will emit __less_and_mask(a, b, c).
+bool
+AndCmpCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) {
+    bool modifiedAny = false;
+
+ restart:
+    for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
+        // See if we have an AND instruction
+        llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(&*iter);
+        if (bop == NULL || bop->getOpcode() != llvm::Instruction::And)
+            continue;
+
+        // Make sure it's a vector AND
+        if (llvm::isa<llvm::VectorType>(bop->getType()) == false)
+            continue;
+
+        // We only care about ANDs of the mask type, not, e.g., ANDs of
+        // int32 vectors.
+        if (bop->getType() != LLVMTypes::MaskType)
+            continue;
+
+        // Now see if either of the operands to the AND is a comparison
+        for (int i = 0; i < 2; ++i) {
+            llvm::Value *op = bop->getOperand(i);
+            llvm::CmpInst *opCmp = llvm::dyn_cast<llvm::CmpInst>(op);
+            if (opCmp == NULL)
+                continue;
+
+            // We have a comparison.  However, we also need to make sure
+            // that it's not comparing two mask values; those can't be
+            // simplified to something simpler.
+            if (opCmp->getOperand(0)->getType() == LLVMTypes::MaskType)
+                break;
+
+            // Success!  Go ahead and replace the AND with a call to the
+            // "__and_mask" variant of the comparison function for this
+            // operand.
+            std::string funcName = lPredicateToString(opCmp->getPredicate());
+            funcName += "_";
+            funcName += lTypeToSuffix(opCmp->getOperand(0)->getType());
+            funcName += "_and_mask";
+
+            llvm::Function *andCmpFunc = m->module->getFunction(funcName);
+            if (andCmpFunc == NULL) {
+                // Declare the function if needed; the first two arguments
+                // are the same as the two arguments to the compare we're
+                // replacing and the third argument is the mask type.
+                llvm::Type *cmpOpType = opCmp->getOperand(0)->getType();
+                llvm::Constant *acf =
+                    m->module->getOrInsertFunction(funcName, LLVMTypes::MaskType,
+                                                   cmpOpType, cmpOpType,
+                                                   LLVMTypes::MaskType, NULL);
+                andCmpFunc = llvm::dyn_cast<llvm::Function>(acf);
+                Assert(andCmpFunc != NULL);
+                andCmpFunc->setDoesNotThrow(true);
+                andCmpFunc->setDoesNotAccessMemory(true);
+            }
+
+            // Set up the function call to the *_and_mask function; the
+            // mask value passed in is the other operand to the AND.
+            llvm::Value *args[3] = { opCmp->getOperand(0), opCmp->getOperand(1),
+                                     bop->getOperand(i ^ 1) };
+            llvm::ArrayRef<llvm::Value *> argArray(&args[0], &args[3]);
+            llvm::Instruction *cmpCall =
+                llvm::CallInst::Create(andCmpFunc, argArray,
+                                       LLVMGetName(bop, "_and_mask"),
+                                       (llvm::Instruction *)NULL);
+
+            // And replace the original AND instruction with it.
+ llvm::ReplaceInstWithInst(iter, cmpCall); + + modifiedAny = true; + goto restart; + } + } + + return modifiedAny; +} + /////////////////////////////////////////////////////////////////////////// // MaskOpsCleanupPass @@ -4627,6 +4726,7 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth, pm.add(llvm::createCFGSimplificationPass()); // clean up after lower invoke. pm.add(new SmearCleanupPass(module, vectorWidth)); pm.add(new BitcastCleanupPass()); + pm.add(new AndCmpCleanupPass()); pm.add(new MaskOpsCleanupPass(module)); pm.add(llvm::createDeadCodeEliminationPass()); // clean up after smear pass //CO pm.add(llvm::createPrintModulePass(&fos)); diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index f1fba1c9..1851ff7e 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -200,6 +200,15 @@ static FORCEINLINE __vec16_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \ for (int i = 0; i < 16; ++i) \ ret.v |= ((CAST)(a.v[i]) OP (CAST)(b.v[i])) << i; \ return ret; \ +} \ +static FORCEINLINE __vec16_i1 NAME##_##SUFFIX##_and_mask(TYPE a, TYPE b, \ + __vec16_i1 mask) { \ + __vec16_i1 ret; \ + ret.v = 0; \ + for (int i = 0; i < 16; ++i) \ + ret.v |= ((CAST)(a.v[i]) OP (CAST)(b.v[i])) << i; \ + ret.v &= mask.v; \ + return ret; \ } #define INSERT_EXTRACT(VTYPE, STYPE) \ diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 57973ddc..628aab84 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -265,6 +265,15 @@ static FORCEINLINE __vec32_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \ for (int i = 0; i < 32; ++i) \ ret.v |= ((CAST)(a.v[i]) OP (CAST)(b.v[i])) << i; \ return ret; \ +} \ +static FORCEINLINE __vec32_i1 NAME##_##SUFFIX##_and_mask(TYPE a, TYPE b, \ + __vec32_i1 mask) { \ + __vec32_i1 ret; \ + ret.v = 0; \ + for (int i = 0; i < 32; ++i) \ + ret.v |= ((CAST)(a.v[i]) OP (CAST)(b.v[i])) << i; \ + ret.v &= mask.v; \ + return ret; \ } #define INSERT_EXTRACT(VTYPE, STYPE) \ diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index 268e0ab1..2630e306 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -390,6 +390,15 @@ static FORCEINLINE __vec64_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \ for (int i = 0; i < 64; ++i) \ ret.v |= uint64_t((CAST)(a.v[i]) OP (CAST)(b.v[i])) << i; \ return ret; \ +} \ +static FORCEINLINE __vec64_i1 NAME##_##SUFFIX##_and_mask(TYPE a, TYPE b, \ + __vec64_i1 mask) { \ + __vec64_i1 ret; \ + ret.v = 0; \ + for (int i = 0; i < 64; ++i) \ + ret.v |= uint64_t((CAST)(a.v[i]) OP (CAST)(b.v[i])) << i; \ + ret.v &= mask.v; \ + return ret; \ } #define INSERT_EXTRACT(VTYPE, STYPE) \ diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index 765c04ad..65678c3a 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -646,42 +646,92 @@ static FORCEINLINE __vec16_i1 __equal_i32(const __vec16_i32 &a, const __vec16_i3 return _mm512_cmpeq_epi32_mask((__m512i)a, (__m512i)b); } +static FORCEINLINE __vec16_i1 __equal_i32_and_mask(const __vec16_i32 &a, const __vec16_i32 &b, + __vec16_i1 m) { + return _mm512_mask_cmpeq_epi32_mask((__mmask16)m, (__m512i)a, (__m512i)b); +} + static FORCEINLINE __vec16_i1 __not_equal_i32(__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpneq_epi32_mask((__m512i)a, (__m512i)b); } +static FORCEINLINE __vec16_i1 __not_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmpneq_epi32_mask((__mmask16)m, (__m512i)a, 
(__m512i)b); +} + static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32(__vec16_i32 a, __vec16_i32 b) { return _mm512_cmple_epu32_mask((__m512i)a, (__m512i)b); } +static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmple_epu32_mask((__mmask16)m, (__m512i)a, (__m512i)b); +} + static FORCEINLINE __vec16_i1 __signed_less_equal_i32(__vec16_i32 a, __vec16_i32 b) { return _mm512_cmple_epi32_mask((__m512i)a, (__m512i)b); } +static FORCEINLINE __vec16_i1 __signed_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmple_epi32_mask((__mmask16)m, (__m512i)a, (__m512i)b); +} + static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpge_epu32_mask((__m512i)a, (__m512i)b); } +static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmpge_epu32_mask((__mmask16)m, (__m512i)a, (__m512i)b); +} + static FORCEINLINE __vec16_i1 __signed_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpge_epi32_mask((__m512i)a, (__m512i)b); } +static FORCEINLINE __vec16_i1 __signed_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmpge_epi32_mask((__mmask16)m, (__m512i)a, (__m512i)b); +} + static FORCEINLINE __vec16_i1 __unsigned_less_than_i32(__vec16_i32 a, __vec16_i32 b) { return _mm512_cmplt_epu32_mask((__m512i)a, (__m512i)b); } +static FORCEINLINE __vec16_i1 __unsigned_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmplt_epu32_mask((__mmask16)m, (__m512i)a, (__m512i)b); +} + static FORCEINLINE __vec16_i1 __signed_less_than_i32(__vec16_i32 a, __vec16_i32 b) { return _mm512_cmplt_epi32_mask((__m512i)a, (__m512i)b); } +static FORCEINLINE __vec16_i1 __signed_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmplt_epi32_mask((__mmask16)m, (__m512i)a, (__m512i)b); +} + static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32(__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpgt_epu32_mask((__m512i)a, (__m512i)b); } +static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmpgt_epu32_mask((__mmask16)m, (__m512i)a, (__m512i)b); +} + static FORCEINLINE __vec16_i1 __signed_greater_than_i32(__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpgt_epi32_mask((__m512i)a, (__m512i)b); } +static FORCEINLINE __vec16_i1 __signed_greater_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, + __vec16_i1 m) { + return _mm512_mask_cmpgt_epi32_mask((__mmask16)m, (__m512i)a, (__m512i)b); +} + static FORCEINLINE __vec16_i32 __select(__vec16_i1 mask, __vec16_i32 a, __vec16_i32 b) { return _mm512_mask_mov_epi32(b.v, mask.m, a.v); @@ -794,18 +844,30 @@ static FORCEINLINE __vec16_i1 __equal_i64(const __vec16_i64 &a, const __vec16_i6 return _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); } +static FORCEINLINE __vec16_i1 __equal_i64_and_mask(const __vec16_i64 &a, const __vec16_i64 &b, + __vec16_i1 mask) { + __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo); + __mmask16 full_match = _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi); + return _mm512_kand(full_match, (__mmask16)mask); +} + static FORCEINLINE __vec16_i1 __not_equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) { return __not(__equal_i64(a,b)); } -CMP_OP(__vec16_i64, i64, uint64_t, 
__unsigned_less_equal, <=) -CMP_OP(__vec16_i64, i64, int64_t, __signed_less_equal, <=) -CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_equal, >=) -CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_equal, >=) -CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_less_than, <) -CMP_OP(__vec16_i64, i64, int64_t, __signed_less_than, <) -CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_than, >) -CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >) +static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(const __vec16_i64 &a, const __vec16_i64 &b, + __vec16_i1 mask) { + return __and(__not(__equal(a,b)), mask); +} + +CMP_OP(__vec16_i64, uint64_t, __unsigned_less_equal, <=) +CMP_OP(__vec16_i64, int64_t, __signed_less_equal, <=) +CMP_OP(__vec16_i64, uint64_t, __unsigned_greater_equal, >=) +CMP_OP(__vec16_i64, int64_t, __signed_greater_equal, >=) +CMP_OP(__vec16_i64, uint64_t, __unsigned_less_than, <) +CMP_OP(__vec16_i64, int64_t, __signed_less_than, <) +CMP_OP(__vec16_i64, uint64_t, __unsigned_greater_than, >) +CMP_OP(__vec16_i64, int64_t, __signed_greater_than, >) SELECT(__vec16_i64) INSERT_EXTRACT(__vec16_i64, int64_t) @@ -935,30 +997,71 @@ static FORCEINLINE __vec16_i1 __equal_float(__vec16_f a, __vec16_f b) { return _mm512_cmpeq_ps_mask(a, b); } +static FORCEINLINE __vec16_i1 __equal_float_and_mask(__vec16_f a, __vec16_f b, + __vec16_i1 m) { + return _mm512_mask_cmpeq_ps_mask(m, a, b); +} + static FORCEINLINE __vec16_i1 __not_equal_float(__vec16_f a, __vec16_f b) { return _mm512_cmpneq_ps_mask(a, b); } +static FORCEINLINE __vec16_i1 __not_equal_float_and_mask(__vec16_f a, __vec16_f b, + __vec16_i1 m) { + return _mm512_mask_cmpneq_ps_mask(m, a, b); +} + static FORCEINLINE __vec16_i1 __less_than_float(__vec16_f a, __vec16_f b) { return _mm512_cmplt_ps_mask(a, b); } +static FORCEINLINE __vec16_i1 __less_than_float_and_mask(__vec16_f a, __vec16_f b, + __vec16_i1 m) { + return _mm512_mask_cmplt_ps_mask(m, a, b); +} + static FORCEINLINE __vec16_i1 __less_equal_float(__vec16_f a, __vec16_f b) { return _mm512_cmple_ps_mask(a, b); } +static FORCEINLINE __vec16_i1 __less_equal_float_and_mask(__vec16_f a, __vec16_f b, + __vec16_i1 m) { + return _mm512_mask_cmple_ps_mask(m, a, b); +} + static FORCEINLINE __vec16_i1 __greater_than_float(__vec16_f a, __vec16_f b) { return _mm512_cmpnle_ps_mask(a, b); } +static FORCEINLINE __vec16_i1 __greater_than_float_and_mask(__vec16_f a, __vec16_f b, + __vec16_i1 m) { + return _mm512_mask_cmpnle_ps_mask(m, a, b); +} + static FORCEINLINE __vec16_i1 __greater_equal_float(__vec16_f a, __vec16_f b) { return _mm512_cmpnlt_ps_mask(a, b); } +static FORCEINLINE __vec16_i1 __greater_equal_float_and_mask(__vec16_f a, __vec16_f b, + __vec16_i1 m) { + return _mm512_mask_cmpnlt_ps_mask(m, a, b); +} + static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) { return _mm512_cmpord_ps_mask(a, b); } + +/* +static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) { + __vec16_i1 ret; + ret.v = 0; + for (int i = 0; i < 16; ++i) + ret.v |= ((a.v[i] == a.v[i]) && (b.v[i] == b.v[i])) ? 
(1 << i) : 0;
+    return ret;
+}
+*/
+
 static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) {
     return _mm512_cmpunord_ps_mask(a, b);
 }
@@ -1071,6 +1174,14 @@ static FORCEINLINE __vec16_i1 __equal_double(__vec16_d a, __vec16_d b) {
     return ret;
 }
 
+static FORCEINLINE __vec16_i1 __equal_double_and_mask(__vec16_d a, __vec16_d b,
+                                                      __vec16_i1 m) {
+    __vec16_i1 ret;
+    ret.m8.m1 = _mm512_mask_cmpeq_pd_mask(m.m8.m1, a.v1, b.v1);
+    ret.m8.m2 = _mm512_mask_cmpeq_pd_mask(m.m8.m2, a.v2, b.v2);
+    return ret;
+}
+
 static FORCEINLINE __vec16_i1 __not_equal_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
     ret.m8.m1 = _mm512_cmpneq_pd_mask(a.v1, b.v1);
@@ -1078,6 +1189,14 @@ static FORCEINLINE __vec16_i1 __not_equal_double(__vec16_d a, __vec16_d b) {
     return ret;
 }
 
+static FORCEINLINE __vec16_i1 __not_equal_double_and_mask(__vec16_d a, __vec16_d b,
+                                                          __vec16_i1 m) {
+    __vec16_i1 ret;
+    ret.m8.m1 = _mm512_mask_cmpneq_pd_mask(m.m8.m1, a.v1, b.v1);
+    ret.m8.m2 = _mm512_mask_cmpneq_pd_mask(m.m8.m2, a.v2, b.v2);
+    return ret;
+}
+
 static FORCEINLINE __vec16_i1 __less_than_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
     ret.m8.m1 = _mm512_cmplt_pd_mask(a.v1, b.v1);
@@ -1085,6 +1204,14 @@ static FORCEINLINE __vec16_i1 __less_than_double(__vec16_d a, __vec16_d b) {
     return ret;
 }
 
+static FORCEINLINE __vec16_i1 __less_than_double_and_mask(__vec16_d a, __vec16_d b,
+                                                          __vec16_i1 m) {
+    __vec16_i1 ret;
+    ret.m8.m1 = _mm512_mask_cmplt_pd_mask(m.m8.m1, a.v1, b.v1);
+    ret.m8.m2 = _mm512_mask_cmplt_pd_mask(m.m8.m2, a.v2, b.v2);
+    return ret;
+}
+
 static FORCEINLINE __vec16_i1 __less_equal_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
     ret.m8.m1 = _mm512_cmple_pd_mask(a.v1, b.v1);
@@ -1092,6 +1219,14 @@ static FORCEINLINE __vec16_i1 __less_equal_double(__vec16_d a, __vec16_d b) {
     return ret;
 }
 
+static FORCEINLINE __vec16_i1 __less_equal_double_and_mask(__vec16_d a, __vec16_d b,
+                                                           __vec16_i1 m) {
+    __vec16_i1 ret;
+    ret.m8.m1 = _mm512_mask_cmple_pd_mask(m.m8.m1, a.v1, b.v1);
+    ret.m8.m2 = _mm512_mask_cmple_pd_mask(m.m8.m2, a.v2, b.v2);
+    return ret;
+}
+
 static FORCEINLINE __vec16_i1 __greater_than_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
     ret.m8.m1 = _mm512_cmpnle_pd_mask(a.v1, b.v1);
@@ -1099,6 +1234,14 @@ static FORCEINLINE __vec16_i1 __greater_than_double(__vec16_d a, __vec16_d b) {
     return ret;
 }
 
+static FORCEINLINE __vec16_i1 __greater_than_double_and_mask(__vec16_d a, __vec16_d b,
+                                                             __vec16_i1 m) {
+    __vec16_i1 ret;
+    ret.m8.m1 = _mm512_mask_cmpnle_pd_mask(m.m8.m1, a.v1, b.v1);
+    ret.m8.m2 = _mm512_mask_cmpnle_pd_mask(m.m8.m2, a.v2, b.v2);
+    return ret;
+}
+
 static FORCEINLINE __vec16_i1 __greater_equal_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
     ret.m8.m1 = _mm512_cmpnlt_pd_mask(a.v1, b.v1);
@@ -1106,6 +1249,14 @@ static FORCEINLINE __vec16_i1 __greater_equal_double(__vec16_d a, __vec16_d b) {
     return ret;
 }
 
+static FORCEINLINE __vec16_i1 __greater_equal_double_and_mask(__vec16_d a, __vec16_d b,
+                                                              __vec16_i1 m) {
+    __vec16_i1 ret;
+    ret.m8.m1 = _mm512_mask_cmpnlt_pd_mask(m.m8.m1, a.v1, b.v1);
+    ret.m8.m2 = _mm512_mask_cmpnlt_pd_mask(m.m8.m2, a.v2, b.v2);
+    return ret;
+}
+
 static FORCEINLINE __vec16_i1 __ordered_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
     ret.m8.m1 = _mm512_cmpord_pd_mask(a.v1, b.v1);
diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h
index 3e692604..fcc14618 100644
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -237,6 +237,31 @@ CAST_BITS_SCALAR(int64_t, double)
CAST_BITS_SCALAR(double, uint64_t) CAST_BITS_SCALAR(double, int64_t) +#define CMP_AND_MASK_ONE(FUNC, TYPE) \ +static FORCEINLINE __vec4_i1 FUNC##_and_mask(TYPE a, TYPE b, __vec4_i1 m) { \ + return __and(FUNC(a, b), m); \ +} + +#define CMP_AND_MASK_INT(TYPE, SUFFIX) \ +CMP_AND_MASK_ONE(__equal_##SUFFIX, TYPE) \ +CMP_AND_MASK_ONE(__not_equal_##SUFFIX, TYPE) \ +CMP_AND_MASK_ONE(__unsigned_less_equal_##SUFFIX, TYPE) \ +CMP_AND_MASK_ONE(__unsigned_greater_equal_##SUFFIX, TYPE) \ +CMP_AND_MASK_ONE(__unsigned_less_than_##SUFFIX, TYPE) \ +CMP_AND_MASK_ONE(__unsigned_greater_than_##SUFFIX, TYPE) \ +CMP_AND_MASK_ONE(__signed_less_equal_##SUFFIX, TYPE) \ +CMP_AND_MASK_ONE(__signed_greater_equal_##SUFFIX, TYPE) \ +CMP_AND_MASK_ONE(__signed_less_than_##SUFFIX, TYPE) \ +CMP_AND_MASK_ONE(__signed_greater_than_##SUFFIX, TYPE) + +#define CMP_AND_MASK_FLOAT(TYPE, SUFFIX) \ +CMP_AND_MASK_ONE(__equal_##SUFFIX, TYPE) \ +CMP_AND_MASK_ONE(__not_equal_##SUFFIX, TYPE) \ +CMP_AND_MASK_ONE(__less_equal_##SUFFIX, TYPE) \ +CMP_AND_MASK_ONE(__greater_equal_##SUFFIX, TYPE) \ +CMP_AND_MASK_ONE(__less_than_##SUFFIX, TYPE) \ +CMP_AND_MASK_ONE(__greater_than_##SUFFIX, TYPE) + /////////////////////////////////////////////////////////////////////////// // mask ops @@ -514,6 +539,8 @@ static FORCEINLINE __vec4_i1 __signed_greater_equal_i8(__vec4_i8 a, __vec4_i8 b return __or(__signed_greater_than_i8(a, b), __equal_i8(a, b)); } +CMP_AND_MASK_INT(__vec4_i8, i8) + static FORCEINLINE __vec4_i8 __select(__vec4_i1 mask, __vec4_i8 a, __vec4_i8 b) { return __vec4_i8((_mm_extract_ps(mask.v, 0) != 0) ? _mm_extract_epi8(a.v, 0) : _mm_extract_epi8(b.v, 0), @@ -781,6 +808,8 @@ static FORCEINLINE __vec4_i1 __signed_greater_equal_i16(__vec4_i16 a, __vec4_i1 return __or(__signed_greater_than_i16(a, b), __equal_i16(a, b)); } +CMP_AND_MASK_INT(__vec4_i16, i16) + static FORCEINLINE __vec4_i16 __select(__vec4_i1 mask, __vec4_i16 a, __vec4_i16 b) { return __vec4_i16((_mm_extract_ps(mask.v, 0) != 0) ? _mm_extract_epi16(a.v, 0) : _mm_extract_epi16(b.v, 0), @@ -1040,6 +1069,8 @@ static FORCEINLINE __vec4_i1 __signed_greater_than_i32(__vec4_i32 a, __vec4_i32 return _mm_cmpgt_epi32(a.v, b.v); } +CMP_AND_MASK_INT(__vec4_i32, i32) + static FORCEINLINE __vec4_i32 __select(__vec4_i1 mask, __vec4_i32 a, __vec4_i32 b) { return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b.v), _mm_castsi128_ps(a.v), mask.v)); @@ -1304,6 +1335,8 @@ static FORCEINLINE __vec4_i1 __signed_less_equal_i64(__vec4_i64 a, __vec4_i64 b) return __xor(__signed_greater_than_i64(a, b), __vec4_i1(1, 1, 1, 1)); } +CMP_AND_MASK_INT(__vec4_i64, i64) + static FORCEINLINE __vec4_i64 __select(__vec4_i1 mask, __vec4_i64 a, __vec4_i64 b) { __m128 m0 = _mm_shuffle_ps(mask.v, mask.v, _MM_SHUFFLE(1, 1, 0, 0)); __m128 m1 = _mm_shuffle_ps(mask.v, mask.v, _MM_SHUFFLE(3, 3, 2, 2)); @@ -1426,6 +1459,8 @@ static FORCEINLINE __vec4_i1 __unordered_float(__vec4_f a, __vec4_f b) { return _mm_cmpunord_ps(a.v, b.v); } +CMP_AND_MASK_FLOAT(__vec4_f, float) + static FORCEINLINE __vec4_f __select(__vec4_i1 mask, __vec4_f a, __vec4_f b) { return _mm_blendv_ps(b.v, a.v, mask.v); } @@ -1567,6 +1602,8 @@ static FORCEINLINE __vec4_i1 __unordered_double(__vec4_d a, __vec4_d b) { _MM_SHUFFLE(2, 0, 2, 0)); } +CMP_AND_MASK_FLOAT(__vec4_d, double) + static FORCEINLINE __vec4_d __select(__vec4_i1 mask, __vec4_d a, __vec4_d b) { __m128 m0 = _mm_shuffle_ps(mask.v, mask.v, _MM_SHUFFLE(1, 1, 0, 0)); __m128 m1 = _mm_shuffle_ps(mask.v, mask.v, _MM_SHUFFLE(3, 3, 2, 2));
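
For reference, the sse4.h macros above only wrap the existing unmasked comparisons. For example, CMP_AND_MASK_INT(__vec4_i32, i32) generates, among others, the function below; it is shown expanded here purely for illustration and is not an additional change to the patch.

// Expansion of CMP_AND_MASK_ONE(__equal_i32, __vec4_i32): compare as before,
// then AND the 4-wide result with the execution mask.
static FORCEINLINE __vec4_i1 __equal_i32_and_mask(__vec4_i32 a, __vec4_i32 b,
                                                  __vec4_i1 m) {
    return __and(__equal_i32(a, b), m);
}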
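Taken together, AndCmpCleanupPass and the new *_and_mask entry points change what the C++ backend emits roughly as sketched below. This is a minimal illustration that assumes one of the headers above (e.g. generic-16.h or knc.h) is included so the __vec16_* types and comparison helpers are in scope; the wrapper names before_pass and after_pass are hypothetical and not part of the patch.

static __vec16_i1 before_pass(__vec16_i32 a, __vec16_i32 b, __vec16_i1 mask) {
    // Previously emitted: a full-width compare followed by a separate mask AND.
    return __and(__signed_less_than_i32(a, b), mask);
}

static __vec16_i1 after_pass(__vec16_i32 a, __vec16_i32 b, __vec16_i1 mask) {
    // With the pass: the AND is folded into the compare, which knc.h can then
    // lower to a single masked intrinsic (_mm512_mask_cmplt_epi32_mask).
    return __signed_less_than_i32_and_mask(a, b, mask);
}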
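The contract every *_and_mask variant implements is simply "compare, then AND with the incoming mask", which is why the generic-*.h versions can be generated mechanically. Below is a self-contained scalar model of that contract; plain arrays and a uint16_t bitmask stand in for the real vector types, and all names here are illustrative only.

#include <cassert>
#include <cstdint>

// Scalar model of a 16-wide compare that packs one result bit per lane,
// mirroring the __vec16_i1 bitmask layout used by the generic headers.
static uint16_t less_than_i32(const int32_t a[16], const int32_t b[16]) {
    uint16_t ret = 0;
    for (int i = 0; i < 16; ++i)
        ret |= (a[i] < b[i]) ? (1 << i) : 0;
    return ret;
}

// Masked model: lanes whose mask bit is off never contribute a result bit,
// matching what the _mm512_mask_cmp*_mask intrinsics do in hardware.
static uint16_t less_than_i32_and_mask(const int32_t a[16], const int32_t b[16],
                                       uint16_t mask) {
    uint16_t ret = 0;
    for (int i = 0; i < 16; ++i)
        if (mask & (1 << i))
            ret |= (a[i] < b[i]) ? (1 << i) : 0;
    return ret;
}

int main() {
    int32_t a[16], b[16];
    for (int i = 0; i < 16; ++i) { a[i] = i; b[i] = 8; }
    const uint16_t mask = 0x00FF;   // only the low eight lanes are live
    // The identity AndCmpCleanupPass relies on: a masked compare equals the
    // unmasked compare ANDed with the execution mask.
    assert(less_than_i32_and_mask(a, b, mask) == (less_than_i32(a, b) & mask));
    return 0;
}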