diff --git a/cbackend.cpp b/cbackend.cpp
index f409d88f..ef1ee04e 100644
--- a/cbackend.cpp
+++ b/cbackend.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "ispc.h"
+#include "module.h"
 #include
@@ -4365,6 +4366,104 @@ BitcastCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) {
     return modifiedAny;
 }
 
+///////////////////////////////////////////////////////////////////////////
+// AndCmpCleanupPass
+
+class AndCmpCleanupPass : public llvm::BasicBlockPass {
+public:
+    AndCmpCleanupPass()
+        : BasicBlockPass(ID) { }
+
+    const char *getPassName() const { return "AndCmp Cleanup Pass"; }
+    bool runOnBasicBlock(llvm::BasicBlock &BB);
+
+    static char ID;
+};
+
+char AndCmpCleanupPass::ID = 0;
+
+// Look for ANDs of masks where one of the operands is a vector compare; we
+// can turn these into specialized calls to masked vector compares and
+// thence eliminate the AND.  For example, rather than emitting
+// __and(__less(a, b), c), we will emit __less_and_mask(a, b, c).
+bool
+AndCmpCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) {
+    bool modifiedAny = false;
+
+ restart:
+    for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
+        // See if we have an AND instruction
+        llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(&*iter);
+        if (bop == NULL || bop->getOpcode() != llvm::Instruction::And)
+            continue;
+
+        // Make sure it's a vector AND
+        if (llvm::isa<llvm::VectorType>(bop->getType()) == false)
+            continue;
+
+        // We only care about ANDs of the mask type, not, e.g., ANDs of
+        // int32 vectors.
+        if (bop->getType() != LLVMTypes::MaskType)
+            continue;
+
+        // Now see if either of the operands to the AND is a comparison
+        for (int i = 0; i < 2; ++i) {
+            llvm::Value *op = bop->getOperand(i);
+            llvm::CmpInst *opCmp = llvm::dyn_cast<llvm::CmpInst>(op);
+            if (opCmp == NULL)
+                continue;
+
+            // We have a comparison.  However, we also need to make sure
+            // that it's not comparing two mask values; those can't be
+            // simplified to something simpler.
+            if (opCmp->getOperand(0)->getType() == LLVMTypes::MaskType)
+                break;
+
+            // Success!  Go ahead and replace the AND with a call to the
+            // "__and_mask" variant of the comparison function for this
+            // operand.
+            std::string funcName = lPredicateToString(opCmp->getPredicate());
+            funcName += "_";
+            funcName += lTypeToSuffix(opCmp->getOperand(0)->getType());
+            funcName += "_and_mask";
+
+            llvm::Function *andCmpFunc = m->module->getFunction(funcName);
+            if (andCmpFunc == NULL) {
+                // Declare the function if needed; the first two arguments
+                // are the same as the two arguments to the compare we're
+                // replacing and the third argument is the mask type.
+                llvm::Type *cmpOpType = opCmp->getOperand(0)->getType();
+                llvm::Constant *acf =
+                    m->module->getOrInsertFunction(funcName, LLVMTypes::MaskType,
+                                                   cmpOpType, cmpOpType,
+                                                   LLVMTypes::MaskType, NULL);
+                andCmpFunc = llvm::dyn_cast<llvm::Function>(acf);
+                Assert(andCmpFunc != NULL);
+                andCmpFunc->setDoesNotThrow(true);
+                andCmpFunc->setDoesNotAccessMemory(true);
+            }
+
+            // Set up the function call to the *_and_mask function; the
+            // mask value passed in is the other operand to the AND.
+            llvm::Value *args[3] = { opCmp->getOperand(0), opCmp->getOperand(1),
+                                     bop->getOperand(i ^ 1) };
+            llvm::ArrayRef<llvm::Value *> argArray(&args[0], &args[3]);
+            llvm::Instruction *cmpCall =
+                llvm::CallInst::Create(andCmpFunc, argArray,
+                                       LLVMGetName(bop, "_and_mask"),
+                                       (llvm::Instruction *)NULL);
+
+            // And replace the original AND instruction with it.
+            llvm::ReplaceInstWithInst(iter, cmpCall);
+
+            modifiedAny = true;
+            goto restart;
+        }
+    }
+
+    return modifiedAny;
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // MaskOpsCleanupPass
 
@@ -4545,6 +4644,7 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth,
     pm.add(llvm::createCFGSimplificationPass());   // clean up after lower invoke.
     pm.add(new SmearCleanupPass(module, vectorWidth));
     pm.add(new BitcastCleanupPass());
+    pm.add(new AndCmpCleanupPass());
    pm.add(new MaskOpsCleanupPass(module));
     pm.add(llvm::createDeadCodeEliminationPass()); // clean up after smear pass
     //CO pm.add(llvm::createPrintModulePass(&fos));
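As a concrete illustration (not part of the patch itself), here is a minimal sketch of what the C++ backend's output looks like before and after AndCmpCleanupPass runs, assuming a 16-wide target and the __signed_less_than_i32 / __and helpers that the intrinsics headers below already provide; the variable names are invented for illustration only:

    // Before the pass: a vector compare followed by an explicit mask AND.
    __vec16_i1 cmp  = __signed_less_than_i32(a, b);
    __vec16_i1 live = __and(cmp, mask);

    // After the pass: one fused call, declared on demand by getOrInsertFunction
    // in the pass above, with the AND's other operand passed as the mask.
    __vec16_i1 live = __signed_less_than_i32_and_mask(a, b, mask);
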
diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h
index bb5b0dc3..744eb649 100644
--- a/examples/intrinsics/generic-16.h
+++ b/examples/intrinsics/generic-16.h
@@ -200,6 +200,15 @@ static FORCEINLINE __vec16_i1 NAME##_##SUFFIX(TYPE a, TYPE b) {  \
     for (int i = 0; i < 16; ++i)                                  \
         ret.v |= ((CAST)(a.v[i]) OP (CAST)(b.v[i])) << i;         \
     return ret;                                                   \
+}                                                                 \
+static FORCEINLINE __vec16_i1 NAME##_##SUFFIX##_and_mask(TYPE a, TYPE b, \
+                                                         __vec16_i1 mask) { \
+    __vec16_i1 ret;                                               \
+    ret.v = 0;                                                    \
+    for (int i = 0; i < 16; ++i)                                  \
+        ret.v |= ((CAST)(a.v[i]) OP (CAST)(b.v[i])) << i;         \
+    ret.v &= mask.v;                                              \
+    return ret;                                                   \
 }
 
 #define INSERT_EXTRACT(VTYPE, STYPE)  \
diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h
index b593ac87..937708e2 100644
--- a/examples/intrinsics/generic-32.h
+++ b/examples/intrinsics/generic-32.h
@@ -265,6 +265,15 @@ static FORCEINLINE __vec32_i1 NAME##_##SUFFIX(TYPE a, TYPE b) {  \
     for (int i = 0; i < 32; ++i)                                  \
         ret.v |= ((CAST)(a.v[i]) OP (CAST)(b.v[i])) << i;         \
     return ret;                                                   \
+}                                                                 \
+static FORCEINLINE __vec32_i1 NAME##_##SUFFIX##_and_mask(TYPE a, TYPE b, \
+                                                         __vec32_i1 mask) { \
+    __vec32_i1 ret;                                               \
+    ret.v = 0;                                                    \
+    for (int i = 0; i < 32; ++i)                                  \
+        ret.v |= ((CAST)(a.v[i]) OP (CAST)(b.v[i])) << i;         \
+    ret.v &= mask.v;                                              \
+    return ret;                                                   \
 }
 
 #define INSERT_EXTRACT(VTYPE, STYPE)  \
diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h
index a0456e49..4a130f83 100644
--- a/examples/intrinsics/generic-64.h
+++ b/examples/intrinsics/generic-64.h
@@ -390,6 +390,15 @@ static FORCEINLINE __vec64_i1 NAME##_##SUFFIX(TYPE a, TYPE b) {  \
     for (int i = 0; i < 64; ++i)                                  \
         ret.v |= uint64_t((CAST)(a.v[i]) OP (CAST)(b.v[i])) << i; \
     return ret;                                                   \
+}                                                                 \
+static FORCEINLINE __vec64_i1 NAME##_##SUFFIX##_and_mask(TYPE a, TYPE b, \
+                                                         __vec64_i1 mask) { \
+    __vec64_i1 ret;                                               \
+    ret.v = 0;                                                    \
+    for (int i = 0; i < 64; ++i)                                  \
+        ret.v |= uint64_t((CAST)(a.v[i]) OP (CAST)(b.v[i])) << i; \
+    ret.v &= mask.v;                                              \
+    return ret;                                                   \
 }
 
 #define INSERT_EXTRACT(VTYPE, STYPE)  \
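For reference, if the comparison macro extended above were instantiated in generic-16.h with, say, NAME = __signed_less_than, SUFFIX = i32, TYPE = __vec16_i32, CAST = int32_t and OP = < (an assumed instantiation, shown only to make the expansion visible), the added lines would expand to roughly:

    static FORCEINLINE __vec16_i1 __signed_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
                                                                  __vec16_i1 mask) {
        __vec16_i1 ret;
        ret.v = 0;
        for (int i = 0; i < 16; ++i)
            ret.v |= ((int32_t)(a.v[i]) < (int32_t)(b.v[i])) << i;
        ret.v &= mask.v;   // fold the execution mask into the compare result
        return ret;
    }

The generic targets simply compute the full compare and then AND in the mask, so they are drop-in correct; targets with real masked compare instructions (knc.h below) can do better.
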
diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h
index 243dc539..180454cf 100644
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -638,42 +638,92 @@ static FORCEINLINE __vec16_i1 __equal_i32(const __vec16_i32 &a, const __vec16_i3
     return _mm512_cmpeq_epi32_mask((__m512i)a, (__m512i)b);
 }
 
+static FORCEINLINE __vec16_i1 __equal_i32_and_mask(const __vec16_i32 &a, const __vec16_i32 &b,
+                                                   __vec16_i1 m) {
+    return _mm512_mask_cmpeq_epi32_mask((__mmask16)m, (__m512i)a, (__m512i)b);
+}
+
 static FORCEINLINE __vec16_i1 __not_equal_i32(__vec16_i32 a, __vec16_i32 b) {
     return _mm512_cmpneq_epi32_mask((__m512i)a, (__m512i)b);
 }
 
+static FORCEINLINE __vec16_i1 __not_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
+                                                       __vec16_i1 m) {
+    return _mm512_mask_cmpneq_epi32_mask((__mmask16)m, (__m512i)a,
+                                         (__m512i)b);
+}
+
 static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32(__vec16_i32 a, __vec16_i32 b) {
     return _mm512_cmple_epu32_mask((__m512i)a, (__m512i)b);
 }
 
+static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
+                                                                 __vec16_i1 m) {
+    return _mm512_mask_cmple_epu32_mask((__mmask16)m, (__m512i)a, (__m512i)b);
+}
+
 static FORCEINLINE __vec16_i1 __signed_less_equal_i32(__vec16_i32 a, __vec16_i32 b) {
     return _mm512_cmple_epi32_mask((__m512i)a, (__m512i)b);
 }
 
+static FORCEINLINE __vec16_i1 __signed_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
+                                                               __vec16_i1 m) {
+    return _mm512_mask_cmple_epi32_mask((__mmask16)m, (__m512i)a, (__m512i)b);
+}
+
 static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) {
     return _mm512_cmpge_epu32_mask((__m512i)a, (__m512i)b);
 }
 
+static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
+                                                                    __vec16_i1 m) {
+    return _mm512_mask_cmpge_epu32_mask((__mmask16)m, (__m512i)a, (__m512i)b);
+}
+
 static FORCEINLINE __vec16_i1 __signed_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) {
     return _mm512_cmpge_epi32_mask((__m512i)a, (__m512i)b);
 }
 
+static FORCEINLINE __vec16_i1 __signed_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
+                                                                  __vec16_i1 m) {
+    return _mm512_mask_cmpge_epi32_mask((__mmask16)m, (__m512i)a, (__m512i)b);
+}
+
 static FORCEINLINE __vec16_i1 __unsigned_less_than_i32(__vec16_i32 a, __vec16_i32 b) {
     return _mm512_cmplt_epu32_mask((__m512i)a, (__m512i)b);
 }
 
+static FORCEINLINE __vec16_i1 __unsigned_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
+                                                                __vec16_i1 m) {
+    return _mm512_mask_cmplt_epu32_mask((__mmask16)m, (__m512i)a, (__m512i)b);
+}
+
 static FORCEINLINE __vec16_i1 __signed_less_than_i32(__vec16_i32 a, __vec16_i32 b) {
     return _mm512_cmplt_epi32_mask((__m512i)a, (__m512i)b);
 }
 
+static FORCEINLINE __vec16_i1 __signed_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
+                                                              __vec16_i1 m) {
+    return _mm512_mask_cmplt_epi32_mask((__mmask16)m, (__m512i)a, (__m512i)b);
+}
+
 static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32(__vec16_i32 a, __vec16_i32 b) {
     return _mm512_cmpgt_epu32_mask((__m512i)a, (__m512i)b);
 }
 
+static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
+                                                                   __vec16_i1 m) {
+    return _mm512_mask_cmpgt_epu32_mask((__mmask16)m, (__m512i)a, (__m512i)b);
+}
+
 static FORCEINLINE __vec16_i1 __signed_greater_than_i32(__vec16_i32 a, __vec16_i32 b) {
     return _mm512_cmpgt_epi32_mask((__m512i)a, (__m512i)b);
 }
 
+static FORCEINLINE __vec16_i1 __signed_greater_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
+                                                                 __vec16_i1 m) {
+    return _mm512_mask_cmpgt_epi32_mask((__mmask16)m, (__m512i)a, (__m512i)b);
+}
+
 static FORCEINLINE __vec16_i32 __select(__vec16_i1 mask,
                                         __vec16_i32 a, __vec16_i32 b) {
     return _mm512_mask_mov_epi32(b.v, mask.m, a.v);
@@ -778,10 +828,22 @@ static FORCEINLINE __vec16_i1 __equal_i64(const __vec16_i64 &a, const __vec16_i6
     return _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi);
 }
 
+static FORCEINLINE __vec16_i1 __equal_i64_and_mask(const __vec16_i64 &a, const __vec16_i64 &b,
+                                                   __vec16_i1 mask) {
+    __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo);
+    __mmask16 full_match = _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi);
+    return _mm512_kand(full_match, (__mmask16)mask);
+}
+
 static FORCEINLINE __vec16_i1 __not_equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) {
     return __not(__equal(a,b));
 }
 
+static FORCEINLINE __vec16_i1
+__not_equal_i64_and_mask(const __vec16_i64 &a, const __vec16_i64 &b,
+                         __vec16_i1 mask) {
+    return __and(__not(__equal(a,b)), mask);
+}
+
 CMP_OP(__vec16_i64, uint64_t, __unsigned_less_equal, <=)
 CMP_OP(__vec16_i64, int64_t, __signed_less_equal, <=)
 CMP_OP(__vec16_i64, uint64_t, __unsigned_greater_equal, >=)
@@ -919,26 +981,56 @@ static FORCEINLINE __vec16_i1 __equal_float(__vec16_f a, __vec16_f b) {
     return _mm512_cmpeq_ps_mask(a, b);
 }
 
+static FORCEINLINE __vec16_i1 __equal_float_and_mask(__vec16_f a, __vec16_f b,
+                                                     __vec16_i1 m) {
+    return _mm512_mask_cmpeq_ps_mask(m, a, b);
+}
+
 static FORCEINLINE __vec16_i1 __not_equal_float(__vec16_f a, __vec16_f b) {
     return _mm512_cmpneq_ps_mask(a, b);
 }
 
+static FORCEINLINE __vec16_i1 __not_equal_float_and_mask(__vec16_f a, __vec16_f b,
+                                                         __vec16_i1 m) {
+    return _mm512_mask_cmpneq_ps_mask(m, a, b);
+}
+
 static FORCEINLINE __vec16_i1 __less_than_float(__vec16_f a, __vec16_f b) {
     return _mm512_cmplt_ps_mask(a, b);
 }
 
+static FORCEINLINE __vec16_i1 __less_than_float_and_mask(__vec16_f a, __vec16_f b,
+                                                         __vec16_i1 m) {
+    return _mm512_mask_cmplt_ps_mask(m, a, b);
+}
+
 static FORCEINLINE __vec16_i1 __less_equal_float(__vec16_f a, __vec16_f b) {
     return _mm512_cmple_ps_mask(a, b);
 }
 
+static FORCEINLINE __vec16_i1 __less_equal_float_and_mask(__vec16_f a, __vec16_f b,
+                                                          __vec16_i1 m) {
+    return _mm512_mask_cmple_ps_mask(m, a, b);
+}
+
 static FORCEINLINE __vec16_i1 __greater_than_float(__vec16_f a, __vec16_f b) {
     return _mm512_cmpnle_ps_mask(a, b);
 }
 
+static FORCEINLINE __vec16_i1 __greater_than_float_and_mask(__vec16_f a, __vec16_f b,
+                                                            __vec16_i1 m) {
+    return _mm512_mask_cmpnle_ps_mask(m, a, b);
+}
+
 static FORCEINLINE __vec16_i1 __greater_equal_float(__vec16_f a, __vec16_f b) {
     return _mm512_cmpnlt_ps_mask(a, b);
 }
 
+static FORCEINLINE __vec16_i1 __greater_equal_float_and_mask(__vec16_f a, __vec16_f b,
+                                                             __vec16_i1 m) {
+    return _mm512_mask_cmpnlt_ps_mask(m, a, b);
+}
+
 /*
 static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) {
     __vec16_i1 ret;
@@ -1050,6 +1142,14 @@ static FORCEINLINE __vec16_i1 __equal_double(__vec16_d a, __vec16_d b) {
     return ret;
 }
 
+static FORCEINLINE __vec16_i1 __equal_double_and_mask(__vec16_d a, __vec16_d b,
+                                                      __vec16_i1 m) {
+    __vec16_i1 ret;
+    ret.m8.m1 = _mm512_mask_cmpeq_pd_mask(m.m8.m1, a.v1, b.v1);
+    ret.m8.m2 = _mm512_mask_cmpeq_pd_mask(m.m8.m2, a.v2, b.v2);
+    return ret;
+}
+
 static FORCEINLINE __vec16_i1 __not_equal_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
     ret.m8.m1 = _mm512_cmpneq_pd_mask(a.v1, b.v1);
@@ -1057,6 +1157,14 @@ static FORCEINLINE __vec16_i1 __not_equal_double(__vec16_d a, __vec16_d b) {
     return ret;
 }
 
+static FORCEINLINE __vec16_i1 __not_equal_double_and_mask(__vec16_d a, __vec16_d b,
+                                                          __vec16_i1 m) {
+    __vec16_i1 ret;
+    ret.m8.m1 = _mm512_mask_cmpneq_pd_mask(m.m8.m1, a.v1, b.v1);
+    ret.m8.m2 = _mm512_mask_cmpneq_pd_mask(m.m8.m2, a.v2, b.v2);
+    return ret;
+}
+
 static FORCEINLINE __vec16_i1 __less_than_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
     ret.m8.m1 = _mm512_cmplt_pd_mask(a.v1, b.v1);
@@ -1064,6 +1172,14 @@ static FORCEINLINE __vec16_i1 __less_than_double(__vec16_d a, __vec16_d b) {
     return ret;
 }
 
+static FORCEINLINE __vec16_i1 __less_than_double_and_mask(__vec16_d a, __vec16_d b,
+                                                          __vec16_i1 m) {
+    __vec16_i1 ret;
+    ret.m8.m1 = _mm512_mask_cmplt_pd_mask(m.m8.m1, a.v1, b.v1);
+    ret.m8.m2 = _mm512_mask_cmplt_pd_mask(m.m8.m2, a.v2, b.v2);
+    return ret;
+}
+
 static FORCEINLINE __vec16_i1 __less_equal_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
     ret.m8.m1 = _mm512_cmple_pd_mask(a.v1, b.v1);
@@ -1071,6 +1187,14 @@ static FORCEINLINE __vec16_i1 __less_equal_double(__vec16_d a, __vec16_d b) {
     return ret;
 }
 
+static FORCEINLINE __vec16_i1 __less_equal_double_and_mask(__vec16_d a, __vec16_d b,
+                                                           __vec16_i1 m) {
+    __vec16_i1 ret;
+    ret.m8.m1 = _mm512_mask_cmple_pd_mask(m.m8.m1, a.v1, b.v1);
+    ret.m8.m2 = _mm512_mask_cmple_pd_mask(m.m8.m2, a.v2, b.v2);
+    return ret;
+}
+
 static FORCEINLINE __vec16_i1 __greater_than_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
     ret.m8.m1 = _mm512_cmpnle_pd_mask(a.v1, b.v1);
@@ -1078,6 +1202,14 @@ static FORCEINLINE __vec16_i1 __greater_than_double(__vec16_d a, __vec16_d b) {
     return ret;
 }
 
+static FORCEINLINE __vec16_i1 __greater_than_double_and_mask(__vec16_d a, __vec16_d b,
+                                                             __vec16_i1 m) {
+    __vec16_i1 ret;
+    ret.m8.m1 = _mm512_mask_cmpnle_pd_mask(m.m8.m1, a.v1, b.v1);
+    ret.m8.m2 = _mm512_mask_cmpnle_pd_mask(m.m8.m2, a.v2, b.v2);
+    return ret;
+}
+
 static FORCEINLINE __vec16_i1 __greater_equal_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
     ret.m8.m1 = _mm512_cmpnlt_pd_mask(a.v1, b.v1);
@@ -1085,6 +1217,14 @@ static FORCEINLINE __vec16_i1 __greater_equal_double(__vec16_d a, __vec16_d b) {
     return ret;
 }
 
+static FORCEINLINE __vec16_i1 __greater_equal_double_and_mask(__vec16_d a, __vec16_d b,
+                                                              __vec16_i1 m) {
+    __vec16_i1 ret;
+    ret.m8.m1 = _mm512_mask_cmpnlt_pd_mask(m.m8.m1, a.v1, b.v1);
+    ret.m8.m2 = _mm512_mask_cmpnlt_pd_mask(m.m8.m2, a.v2, b.v2);
+    return ret;
+}
+
 /*
 static FORCEINLINE __vec16_i1 __ordered(__vec16_d a, __vec16_d b) {
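The KNC versions map each *_and_mask helper onto the corresponding _mm512_mask_cmp*_mask intrinsic, which evaluates the compare only in lanes whose mask bit is set and leaves the remaining result bits zero, i.e. exactly __and(cmp(a, b), m); the __vec16_i64 cases, which have no single masked intrinsic, instead fall back to __and / _mm512_kand on the unmasked result. Below is a small scalar model of that masked-compare behaviour, written only to spell out the intended semantics (none of these names exist in the headers):

    #include <stdint.h>

    // Bit i of the result is set only when mask bit i is set AND a[i] == b[i];
    // inactive lanes stay zero, matching __and(__equal_i32(a, b), m).
    static inline uint16_t masked_equal_model(const int32_t a[16], const int32_t b[16],
                                              uint16_t m) {
        uint16_t r = 0;
        for (int i = 0; i < 16; ++i)
            if ((m >> i) & 1)
                r |= (uint16_t)(a[i] == b[i]) << i;
        return r;
    }
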
diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h
index 4fb946c3..84b6e65f 100644
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -237,6 +237,31 @@ CAST_BITS_SCALAR(int64_t, double)
 CAST_BITS_SCALAR(double, uint64_t)
 CAST_BITS_SCALAR(double, int64_t)
 
+#define CMP_AND_MASK_ONE(FUNC, TYPE)                                        \
+static FORCEINLINE __vec4_i1 FUNC##_and_mask(TYPE a, TYPE b, __vec4_i1 m) { \
+    return __and(FUNC(a, b), m);                                            \
+}
+
+#define CMP_AND_MASK_INT(TYPE, SUFFIX)                     \
+CMP_AND_MASK_ONE(__equal_##SUFFIX, TYPE)                   \
+CMP_AND_MASK_ONE(__not_equal_##SUFFIX, TYPE)               \
+CMP_AND_MASK_ONE(__unsigned_less_equal_##SUFFIX, TYPE)     \
+CMP_AND_MASK_ONE(__unsigned_greater_equal_##SUFFIX, TYPE)  \
+CMP_AND_MASK_ONE(__unsigned_less_than_##SUFFIX, TYPE)      \
+CMP_AND_MASK_ONE(__unsigned_greater_than_##SUFFIX, TYPE)   \
+CMP_AND_MASK_ONE(__signed_less_equal_##SUFFIX, TYPE)       \
+CMP_AND_MASK_ONE(__signed_greater_equal_##SUFFIX, TYPE)    \
+CMP_AND_MASK_ONE(__signed_less_than_##SUFFIX, TYPE)        \
+CMP_AND_MASK_ONE(__signed_greater_than_##SUFFIX, TYPE)
+
+#define CMP_AND_MASK_FLOAT(TYPE, SUFFIX)                   \
+CMP_AND_MASK_ONE(__equal_##SUFFIX, TYPE)                   \
+CMP_AND_MASK_ONE(__not_equal_##SUFFIX, TYPE)               \
+CMP_AND_MASK_ONE(__less_equal_##SUFFIX, TYPE)              \
+CMP_AND_MASK_ONE(__greater_equal_##SUFFIX, TYPE)           \
+CMP_AND_MASK_ONE(__less_than_##SUFFIX, TYPE)               \
+CMP_AND_MASK_ONE(__greater_than_##SUFFIX, TYPE)
+
 ///////////////////////////////////////////////////////////////////////////
 // mask ops
 
@@ -506,6 +531,8 @@ static FORCEINLINE __vec4_i1 __signed_greater_equal_i8(__vec4_i8 a, __vec4_i8 b
     return __or(__signed_greater_than_i8(a, b), __equal_i8(a, b));
 }
 
+CMP_AND_MASK_INT(__vec4_i8, i8)
+
 static FORCEINLINE __vec4_i8 __select(__vec4_i1 mask, __vec4_i8 a, __vec4_i8 b) {
     return __vec4_i8((_mm_extract_ps(mask.v, 0) != 0) ?
                      _mm_extract_epi8(a.v, 0) : _mm_extract_epi8(b.v, 0),
@@ -765,6 +792,8 @@ static FORCEINLINE __vec4_i1 __signed_greater_equal_i16(__vec4_i16 a, __vec4_i1
     return __or(__signed_greater_than_i16(a, b), __equal_i16(a, b));
 }
 
+CMP_AND_MASK_INT(__vec4_i16, i16)
+
 static FORCEINLINE __vec4_i16 __select(__vec4_i1 mask, __vec4_i16 a, __vec4_i16 b) {
     return __vec4_i16((_mm_extract_ps(mask.v, 0) != 0) ?
                       _mm_extract_epi16(a.v, 0) : _mm_extract_epi16(b.v, 0),
@@ -1016,6 +1045,8 @@ static FORCEINLINE __vec4_i1 __signed_greater_than_i32(__vec4_i32 a, __vec4_i32
     return _mm_cmpgt_epi32(a.v, b.v);
 }
 
+CMP_AND_MASK_INT(__vec4_i32, i32)
+
 static FORCEINLINE __vec4_i32 __select(__vec4_i1 mask, __vec4_i32 a, __vec4_i32 b) {
     return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b.v),
                                           _mm_castsi128_ps(a.v), mask.v));
@@ -1272,6 +1303,8 @@ static FORCEINLINE __vec4_i1 __signed_less_equal_i64(__vec4_i64 a, __vec4_i64 b)
     return __xor(__signed_greater_than_i64(a, b), __vec4_i1(1, 1, 1, 1));
 }
 
+CMP_AND_MASK_INT(__vec4_i64, i64)
+
 static FORCEINLINE __vec4_i64 __select(__vec4_i1 mask, __vec4_i64 a, __vec4_i64 b) {
     __m128 m0 = _mm_shuffle_ps(mask.v, mask.v, _MM_SHUFFLE(1, 1, 0, 0));
     __m128 m1 = _mm_shuffle_ps(mask.v, mask.v, _MM_SHUFFLE(3, 3, 2, 2));
@@ -1382,6 +1415,8 @@ static FORCEINLINE __vec4_i1 __ordered_float(__vec4_f a, __vec4_f b) {
     return _mm_cmpord_ps(a.v, b.v);
 }
 
+CMP_AND_MASK_FLOAT(__vec4_f, float)
+
 static FORCEINLINE __vec4_f __select(__vec4_i1 mask, __vec4_f a, __vec4_f b) {
     return _mm_blendv_ps(b.v, a.v, mask.v);
 }
@@ -1508,6 +1543,8 @@ static FORCEINLINE __vec4_i1 __ordered_double(__vec4_d a, __vec4_d b) {
                          _MM_SHUFFLE(2, 0, 2, 0));
 }
 
+CMP_AND_MASK_FLOAT(__vec4_d, double)
+
 static FORCEINLINE __vec4_d __select(__vec4_i1 mask, __vec4_d a, __vec4_d b) {
     __m128 m0 = _mm_shuffle_ps(mask.v, mask.v, _MM_SHUFFLE(1, 1, 0, 0));
     __m128 m1 = _mm_shuffle_ps(mask.v, mask.v, _MM_SHUFFLE(3, 3, 2, 2));
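The sse4.h target has no masked compare instruction to exploit, so CMP_AND_MASK_ONE simply wraps the existing compare and ANDs in the mask afterwards. As an example of the expansion (a sketch; the actual functions come from the CMP_AND_MASK_INT / CMP_AND_MASK_FLOAT invocations added above), CMP_AND_MASK_ONE(__equal_i32, __vec4_i32) becomes:

    static FORCEINLINE __vec4_i1 __equal_i32_and_mask(__vec4_i32 a, __vec4_i32 b, __vec4_i1 m) {
        return __and(__equal_i32(a, b), m);
    }

The work done is the same as the unfused __and(__equal_i32(a, b), m), so on this target the rewrite is neutral at the intrinsics level; its purpose is to give targets such as knc.h a single hook where the AND can be folded into the hardware compare.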