diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll
index 7f737626..55a1c802 100644
--- a/builtins/target-avx-x2.ll
+++ b/builtins/target-avx-x2.ll
@@ -190,6 +190,53 @@ define i64 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
   ret i64 %v64
 }
 
+define i1 @__any(<16 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <16 x i32> %0 to <16 x float>
+  %mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
+  %mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
+           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
+
+  %v1shift = shl i32 %v1, 8
+  %v = or i32 %v1shift, %v0
+  %cmp = icmp ne i32 %v, 0
+  ret i1 %cmp
+}
+
+define i1 @__all(<16 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <16 x i32> %0 to <16 x float>
+  %mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
+  %mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
+           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
+
+  %v1shift = shl i32 %v1, 8
+  %v = or i32 %v1shift, %v0
+  %cmp = icmp eq i32 %v, 65535
+  ret i1 %cmp
+}
+
+define i1 @__none(<16 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <16 x i32> %0 to <16 x float>
+  %mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
+  %mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
+           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
+
+  %v1shift = shl i32 %v1, 8
+  %v = or i32 %v1shift, %v0
+  %cmp = icmp eq i32 %v, 0
+  ret i1 %cmp
+}
+
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal float ops
 
diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll
index 3cd76516..3283376a 100644
--- a/builtins/target-avx.ll
+++ b/builtins/target-avx.ll
@@ -182,6 +182,27 @@ define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
   ret i64 %v64
 }
 
+define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <8 x i32> %0 to <8 x float>
+  %v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
+  %cmp = icmp ne i32 %v, 0
+  ret i1 %cmp
+}
+
+define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <8 x i32> %0 to <8 x float>
+  %v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
+  %cmp = icmp eq i32 %v, 255
+  ret i1 %cmp
+}
+
+define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <8 x i32> %0 to <8 x float>
+  %v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
+  %cmp = icmp eq i32 %v, 0
+  ret i1 %cmp
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal float ops
 
diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll
index c5937c8e..e18bc0ff 100644
--- a/builtins/target-generic-1.ll
+++ b/builtins/target-generic-1.ll
@@ -198,6 +198,27 @@ define i64 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
   ret i64 %v64
 }
 
+define i1 @__any(<1 x i32>) nounwind readnone alwaysinline {
+  %item = extractelement <1 x i32> %0, i32 0
+  %v = lshr i32 %item, 31
+  %cmp = icmp ne i32 %v, 0
+  ret i1 %cmp
+}
+
+define i1 @__all(<1 x i32>) nounwind readnone alwaysinline {
+  %item = extractelement <1 x i32> %0, i32 0
+  %v = lshr i32 %item, 31
+  %cmp = icmp eq i32 %v, 1
+  ret i1 %cmp
+}
+
+define i1 @__none(<1 x i32>) nounwind readnone alwaysinline {
+  %item = extractelement <1 x i32> %0, i32 0
+  %v = lshr i32 %item, 31
+  %cmp = icmp eq i32 %v, 0
+  ret i1 %cmp
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding
 
diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll
index 7b4cfd9c..fbce2531 100644
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -222,6 +222,9 @@ declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
 ;; reductions
 
 declare i64 @__movmsk(<WIDTH x MASK>) nounwind readnone
+declare i1 @__any(<WIDTH x MASK>) nounwind readnone
+declare i1 @__all(<WIDTH x MASK>) nounwind readnone
+declare i1 @__none(<WIDTH x MASK>) nounwind readnone
 
 declare float @__reduce_add_float(<WIDTH x float>) nounwind readnone
 declare float @__reduce_min_float(<WIDTH x float>) nounwind readnone
 
diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll
index ad19f899..3910dfdb 100644
--- a/builtins/target-sse2-x2.ll
+++ b/builtins/target-sse2-x2.ll
@@ -313,6 +313,60 @@ define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
   ret i64 %v64
 }
 
+define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
+  ; first do two 4-wide movmsk calls
+  %floatmask = bitcast <8 x i32> %0 to <8 x float>
+  %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
+        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
+  %m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
+        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
+
+  ; and shift the second value over by 4 before ORing it with the
+  ; value of the first one
+  %v1s = shl i32 %v1, 4
+  %v = or i32 %v0, %v1s
+  %cmp = icmp ne i32 %v, 0
+  ret i1 %cmp
+}
+
+define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
+  ; first do two 4-wide movmsk calls
+  %floatmask = bitcast <8 x i32> %0 to <8 x float>
+  %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
+        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
+  %m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
+        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
+
+  ; and shift the second value over by 4 before ORing it with the
+  ; value of the first one
+  %v1s = shl i32 %v1, 4
+  %v = or i32 %v0, %v1s
+  %cmp = icmp eq i32 %v, 255
+  ret i1 %cmp
+}
+
+define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
+  ; first do two 4-wide movmsk calls
+  %floatmask = bitcast <8 x i32> %0 to <8 x float>
+  %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
+        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
+  %m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
+        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
+
+  ; and shift the second value over by 4 before ORing it with the
+  ; value of the first one
+  %v1s = shl i32 %v1, 4
+  %v = or i32 %v0, %v1s
+  %cmp = icmp eq i32 %v, 0
+  ret i1 %cmp
+}
+
 define <4 x float> @__vec4_add_float(<4 x float> %v0,
                                      <4 x float> %v1) nounwind readnone alwaysinline {
   %v = fadd <4 x float> %v0, %v1
 
diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll
index 6558adc8..618a61d9 100644
--- a/builtins/target-sse2.ll
+++ b/builtins/target-sse2.ll
@@ -246,6 +246,27 @@ define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
   ret i64 %v64
 }
 
+define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %cmp = icmp ne i32 %v, 0
+  ret i1 %cmp
+}
+
+define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %cmp = icmp eq i32 %v, 15
+  ret i1 %cmp
+}
+
+define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %cmp = icmp eq i32 %v, 0
+  ret i1 %cmp
+}
+
 define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
   %v1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32>
 
diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll
index 0f7cb355..81efc5cb 100644
--- a/builtins/target-sse4-x2.ll
+++ b/builtins/target-sse4-x2.ll
@@ -255,6 +255,60 @@ define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
   ret i64 %v64
 }
 
+define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
+  ; first do two 4-wide movmsk calls
+  %floatmask = bitcast <8 x i32> %0 to <8 x float>
+  %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
+        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
+  %m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
+        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
+
+  ; and shift the second value over by 4 before ORing it with the
+  ; value of the first one
+  %v1s = shl i32 %v1, 4
+  %v = or i32 %v0, %v1s
+  %cmp = icmp ne i32 %v, 0
+  ret i1 %cmp
+}
+
+define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
+  ; first do two 4-wide movmsk calls
+  %floatmask = bitcast <8 x i32> %0 to <8 x float>
+  %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
+        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
+  %m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
+        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
+
+  ; and shift the second value over by 4 before ORing it with the
+  ; value of the first one
+  %v1s = shl i32 %v1, 4
+  %v = or i32 %v0, %v1s
+  %cmp = icmp eq i32 %v, 255
+  ret i1 %cmp
+}
+
+define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
+  ; first do two 4-wide movmsk calls
+  %floatmask = bitcast <8 x i32> %0 to <8 x float>
+  %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
+        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
+  %m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
+        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
+
+  ; and shift the second value over by 4 before ORing it with the
+  ; value of the first one
+  %v1s = shl i32 %v1, 4
+  %v = or i32 %v0, %v1s
+  %cmp = icmp eq i32 %v, 0
+  ret i1 %cmp
+}
+
 define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
   reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float)
 }
 
diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll
index b00bcfd6..f638d220 100644
--- a/builtins/target-sse4.ll
+++ b/builtins/target-sse4.ll
@@ -278,6 +278,27 @@ define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
   ret i64 %v64
 }
 
+define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %cmp = icmp ne i32 %v, 0
+  ret i1 %cmp
+}
+
+define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %cmp = icmp eq i32 %v, 15
+  ret i1 %cmp
+}
+
+define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %cmp = icmp eq i32 %v, 0
+  ret i1 %cmp
+}
+
 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
 
 define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
 
diff --git a/ctx.cpp b/ctx.cpp
index f608185a..fec38065 100644
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -1287,29 +1287,54 @@ FunctionEmitContext::CurrentLanesReturned(Expr *expr, bool doCoherenceCheck) {
 
 llvm::Value *
 FunctionEmitContext::Any(llvm::Value *mask) {
-    llvm::Value *mmval = LaneMask(mask);
-    return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, mmval,
-                   LLVMInt64(0), LLVMGetName(mask, "_any"));
+    // Call the target-dependent any function to test that the mask is non-zero
+    std::vector<Symbol *> mm;
+    m->symbolTable->LookupFunction("__any", &mm);
+    if (g->target.maskBitCount == 1)
+        AssertPos(currentPos, mm.size() == 1);
+    else
+        // There should be one with signed int signature, one unsigned int.
+        AssertPos(currentPos, mm.size() == 2);
+    // We can actually call either one, since both are i32s as far as
+    // LLVM's type system is concerned...
+    llvm::Function *fmm = mm[0]->function;
+    return CallInst(fmm, NULL, mask, LLVMGetName(mask, "_any"));
 }
 
 
 llvm::Value *
 FunctionEmitContext::All(llvm::Value *mask) {
-    llvm::Value *mmval = LaneMask(mask);
-    llvm::Value *allOnMaskValue = (g->target.vectorWidth == 64) ?
-        LLVMInt64(~0ull) :
-        LLVMInt64((1ull << g->target.vectorWidth) - 1);
-
-    return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mmval,
-                   allOnMaskValue, LLVMGetName(mask, "_all"));
+    // Call the target-dependent all function to test that all mask lanes are on
+    std::vector<Symbol *> mm;
+    m->symbolTable->LookupFunction("__all", &mm);
+    if (g->target.maskBitCount == 1)
+        AssertPos(currentPos, mm.size() == 1);
+    else
+        // There should be one with signed int signature, one unsigned int.
+        AssertPos(currentPos, mm.size() == 2);
+    // We can actually call either one, since both are i32s as far as
+    // LLVM's type system is concerned...
+    llvm::Function *fmm = mm[0]->function;
+    return CallInst(fmm, NULL, mask, LLVMGetName(mask, "_all"));
 }
 
 
 llvm::Value *
 FunctionEmitContext::None(llvm::Value *mask) {
-    llvm::Value *mmval = LaneMask(mask);
-    return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mmval,
-                   LLVMInt64(0), LLVMGetName(mask, "_none"));
+    // Call the target-dependent none function to test that all mask lanes are off
+    std::vector<Symbol *> mm;
+    m->symbolTable->LookupFunction("__none", &mm);
+    if (g->target.maskBitCount == 1)
+        AssertPos(currentPos, mm.size() == 1);
+    else
+        // There should be one with signed int signature, one unsigned int.
+        AssertPos(currentPos, mm.size() == 2);
+    // We can actually call either one, since both are i32s as far as
+    // LLVM's type system is concerned...
+    llvm::Function *fmm = mm[0]->function;
+    return CallInst(fmm, NULL, mask, LLVMGetName(mask, "_none"));
 }
 
diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h
index c4bff793..0b300f34 100644
--- a/examples/intrinsics/generic-16.h
+++ b/examples/intrinsics/generic-16.h
@@ -55,6 +55,7 @@ typedef int64_t __vec1_i64;
 
 struct __vec16_i1 {
     __vec16_i1() { }
+    __vec16_i1(const uint16_t &vv) : v(vv) { }
     __vec16_i1(uint32_t v0, uint32_t v1, uint32_t v2, uint32_t v3,
                uint32_t v4, uint32_t v5, uint32_t v6, uint32_t v7,
                uint32_t v8, uint32_t v9, uint32_t v10, uint32_t v11,
@@ -342,6 +343,18 @@ static FORCEINLINE uint64_t __movmsk(__vec16_i1 mask) {
     return (uint64_t)mask.v;
 }
 
+static FORCEINLINE __vec16_i1 __any(__vec16_i1 mask) {
+    return (mask.v!=0);
+}
+
+static FORCEINLINE __vec16_i1 __all(__vec16_i1 mask) {
+    return (mask.v==0xFFFF);
+}
+
+static FORCEINLINE __vec16_i1 __none(__vec16_i1 mask) {
+    return (mask.v==0);
+}
+
 static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) {
     __vec16_i1 r;
     r.v = (a.v & b.v) | (~a.v & ~b.v);
 
diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h
index ffd5b478..64b82cb1 100644
--- a/examples/intrinsics/generic-32.h
+++ b/examples/intrinsics/generic-32.h
@@ -55,6 +55,7 @@ typedef int64_t __vec1_i64;
 
 struct __vec32_i1 {
     __vec32_i1() { }
+    __vec32_i1(const uint32_t &vv) : v(vv) { }
    __vec32_i1(uint32_t v0, uint32_t v1, uint32_t v2, uint32_t v3,
               uint32_t v4, uint32_t v5, uint32_t v6, uint32_t v7,
               uint32_t v8, uint32_t v9, uint32_t v10, uint32_t v11,
@@ -407,6 +408,18 @@ static FORCEINLINE uint64_t __movmsk(__vec32_i1 mask) {
     return (uint64_t)mask.v;
 }
 
+static FORCEINLINE __vec32_i1 __any(__vec32_i1 mask) {
+    return (mask.v!=0);
+}
+
+static FORCEINLINE __vec32_i1 __all(__vec32_i1 mask) {
+    return (mask.v==0xFFFFFFFF);
+}
+
+static FORCEINLINE __vec32_i1 __none(__vec32_i1 mask) {
+    return (mask.v==0);
+}
+
 static FORCEINLINE __vec32_i1 __equal_i1(__vec32_i1 a, __vec32_i1 b) {
     __vec32_i1 r;
     r.v = (a.v & b.v) | (~a.v & ~b.v);
 
diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h
index a33e1d15..7869faa5 100644
--- a/examples/intrinsics/generic-64.h
+++ b/examples/intrinsics/generic-64.h
@@ -55,6 +55,7 @@ typedef int64_t __vec1_i64;
 
 struct __vec64_i1 {
     __vec64_i1() { }
+    __vec64_i1(const uint64_t &vv) : v(vv) { }
     __vec64_i1(uint64_t v0, uint64_t v1, uint64_t v2, uint64_t v3,
                uint64_t v4, uint64_t v5, uint64_t v6, uint64_t v7,
                uint64_t v8, uint64_t v9, uint64_t v10, uint64_t v11,
@@ -532,6 +533,18 @@ static FORCEINLINE uint64_t __movmsk(__vec64_i1 mask) {
     return (uint64_t)mask.v;
 }
 
+static FORCEINLINE __vec64_i1 __any(__vec64_i1 mask) {
+    return (mask.v!=0);
+}
+
+static FORCEINLINE __vec64_i1 __all(__vec64_i1 mask) {
+    return (mask.v==0xFFFFFFFFFFFFFFFFull);
+}
+
+static FORCEINLINE __vec64_i1 __none(__vec64_i1 mask) {
+    return (mask.v==0);
+}
+
 static FORCEINLINE __vec64_i1 __equal_i1(__vec64_i1 a, __vec64_i1 b) {
     __vec64_i1 r;
     r.v = (a.v & b.v) | (~a.v & ~b.v);
 
diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h
index a4dcf270..02222115 100644
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -428,6 +428,18 @@ static FORCEINLINE __vec16_i1 __movmsk(__vec16_i1 mask) {
     return _mm512_kmov(mask);
 }
 
+static FORCEINLINE __vec16_i1 __any(__vec16_i1 mask) {
+    return (mask!=0);
+}
+
+static FORCEINLINE __vec16_i1 __all(__vec16_i1 mask) {
+    return (mask==0xFFFF);
+}
+
+static FORCEINLINE __vec16_i1 __none(__vec16_i1 mask) {
+    return (mask==0);
+}
+
 static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) {
     return _mm512_knot( _mm512_kandn(a, b));
 }
 
diff --git a/examples/intrinsics/knc2x.h b/examples/intrinsics/knc2x.h
index 75c9aa62..28fcf3ad 100644
--- a/examples/intrinsics/knc2x.h
+++ b/examples/intrinsics/knc2x.h
@@ -297,6 +297,18 @@ static FORCEINLINE uint32_t __movmsk(__vec32_i1 mask) {
     return ((m1<<16)|m2);
 }
 
+static FORCEINLINE uint32_t __any(__vec32_i1 mask) {
+    return (mask.m!=0);
+}
+
+static FORCEINLINE uint32_t __all(__vec32_i1 mask) {
+    return (mask.m==0xFFFFFFFF);
+}
+
+static FORCEINLINE uint32_t __none(__vec32_i1 mask) {
+    return (mask.m==0x0);
+}
+
 static FORCEINLINE __vec32_i1 __equal(__vec32_i1 a, __vec32_i1 b) {
     __vec32_i1 ret;
     ret.m16.m1 = _mm512_knot(_mm512_kandn(a.m16.m1, b.m16.m1));
 
diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h
index b894cb29..8d94bc49 100644
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -269,6 +269,18 @@ static FORCEINLINE uint64_t __movmsk(__vec4_i1 mask) {
     return (uint64_t)_mm_movemask_ps(mask.v);
 }
 
+static FORCEINLINE __vec4_i1 __any(__vec4_i1 mask) {
+    return (_mm_movemask_ps(mask.v)!=0);
+}
+
+static FORCEINLINE __vec4_i1 __all(__vec4_i1 mask) {
+    return (_mm_movemask_ps(mask.v)==0xF);
+}
+
+static FORCEINLINE __vec4_i1 __none(__vec4_i1 mask) {
+    return (_mm_movemask_ps(mask.v)==0);
+}
+
 static FORCEINLINE __vec4_i1 __equal_i1(__vec4_i1 a, __vec4_i1 b) {
     return _mm_cmpeq_epi32(_mm_castps_si128(a.v), _mm_castps_si128(b.v));
 }
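
Note: every target above implements the same reduction idiom: collapse the
per-lane vector mask to a scalar bitmask (movmsk or its equivalent), then
compare that scalar against zero (__any/__none) or against the all-lanes-on
constant for the target's width (__all). A minimal standalone C++ sketch of
the idiom, using plain SSE intrinsics for a 4-wide mask whose active lanes
have their sign bits set (the mask_* names are illustrative only, not part
of this patch):

    #include <xmmintrin.h>

    // _mm_movemask_ps packs the sign bit of each of the four float lanes
    // into the low 4 bits of an int, so each whole-mask query reduces to
    // one integer comparison.
    static inline bool mask_any(__m128 m)  { return _mm_movemask_ps(m) != 0; }
    static inline bool mask_all(__m128 m)  { return _mm_movemask_ps(m) == 0xF; }
    static inline bool mask_none(__m128 m) { return _mm_movemask_ps(m) == 0; }

This is also why the __all variants compare against 15, 255, or 65535
depending on the target: each is (1 << vectorWidth) - 1, the value the
movmsk step produces when every lane is on.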