Added the following mask tests: __any(), __all(), __none() for all supported targets.

This allows for more efficient code generation for KNC.
Jean-Luc Duprat
2012-09-14 11:06:18 -07:00
parent 4ecdbe4bd9
commit f0b0618484
15 changed files with 355 additions and 13 deletions

View File

@@ -190,6 +190,53 @@ define i64 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
ret i64 %v64
}
define i1 @__any(<16 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <16 x i32> %0 to <16 x float>
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
%mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
%v1shift = shl i32 %v1, 8
%v = or i32 %v1shift, %v0
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<16 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <16 x i32> %0 to <16 x float>
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
%mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
%v1shift = shl i32 %v1, 8
%v = or i32 %v1shift, %v0
%cmp = icmp eq i32 %v, 65535
ret i1 %cmp
}
define i1 @__none(<16 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <16 x i32> %0 to <16 x float>
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
%mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
%v1shift = shl i32 %v1, 8
%v = or i32 %v1shift, %v0
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
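For reference, here is roughly the same split-and-merge logic written with AVX intrinsics in C++, as a minimal sketch rather than code from this commit (the helper names are hypothetical): each 8-wide half is reduced to an 8-bit lane mask with _mm256_movemask_ps, the high half is shifted up by 8, and a single scalar compare finishes the test.

#include <immintrin.h>

// Merge the per-lane sign bits of both 8-wide halves into one 16-bit value.
static inline int mask16_bits(__m256 lo, __m256 hi) {
    return _mm256_movemask_ps(lo) | (_mm256_movemask_ps(hi) << 8);
}

static inline bool any16(__m256 lo, __m256 hi)  { return mask16_bits(lo, hi) != 0; }
static inline bool all16(__m256 lo, __m256 hi)  { return mask16_bits(lo, hi) == 0xFFFF; }
static inline bool none16(__m256 lo, __m256 hi) { return mask16_bits(lo, hi) == 0; }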
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal float ops

View File

@@ -182,6 +182,27 @@ define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
ret i64 %v64
}
define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 255
ret i1 %cmp
}
define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal float ops

View File

@@ -198,6 +198,27 @@ define i64 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
ret i64 %v64
}
define i1 @__any(<1 x i32>) nounwind readnone alwaysinline {
%item = extractelement <1 x i32> %0, i32 0
%v = lshr i32 %item, 31
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<1 x i32>) nounwind readnone alwaysinline {
%item = extractelement <1 x i32> %0, i32 0
%v = lshr i32 %item, 31
%cmp = icmp eq i32 %v, 1
ret i1 %cmp
}
define i1 @__none(<1 x i32>) nounwind readnone alwaysinline {
%item = extractelement <1 x i32> %0, i32 0
%v = lshr i32 %item, 31
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
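The 1-wide generic variant reduces to a sign-bit test on the single lane. A C++ sketch of the equivalent scalar logic (hypothetical names; assumes the convention used above that a lane is on when the high bit of its i32 mask word is set):

#include <cstdint>

// The lone lane is on iff the sign bit of its 32-bit mask word is set.
static inline bool any1(int32_t m)  { return (uint32_t(m) >> 31) != 0; }
static inline bool all1(int32_t m)  { return (uint32_t(m) >> 31) == 1; }
static inline bool none1(int32_t m) { return (uint32_t(m) >> 31) == 0; }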
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding

View File

@@ -222,6 +222,9 @@ declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
;; reductions
declare i64 @__movmsk(<WIDTH x i1>) nounwind readnone
declare i1 @__any(<WIDTH x i1>) nounwind readnone
declare i1 @__all(<WIDTH x i1>) nounwind readnone
declare i1 @__none(<WIDTH x i1>) nounwind readnone
declare float @__reduce_add_float(<WIDTH x float>) nounwind readnone
declare float @__reduce_min_float(<WIDTH x float>) nounwind readnone

View File

@@ -313,6 +313,60 @@ define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
ret i64 %v64
}
define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp eq i32 %v, 255
ret i1 %cmp
}
define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
define <4 x float> @__vec4_add_float(<4 x float> %v0,
<4 x float> %v1) nounwind readnone alwaysinline {
%v = fadd <4 x float> %v0, %v1

View File

@@ -246,6 +246,27 @@ define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
ret i64 %v64
}
define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 15
ret i1 %cmp
}
define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
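The 4-wide tests map directly onto _mm_movemask_ps, which packs each float lane's sign bit into the low four bits of an int. A small C++ usage sketch (the comparison producing the mask is illustrative, not part of this commit):

#include <xmmintrin.h>

static inline void movmsk_demo() {
    __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
    __m128 b = _mm_set1_ps(2.5f);
    // One bit per lane; here the lanes holding 3.0 and 4.0 compare true.
    int bits = _mm_movemask_ps(_mm_cmpgt_ps(a, b));
    bool any  = (bits != 0);   // true: some lane is on
    bool all  = (bits == 0xF); // false: not all four lanes are on
    bool none = (bits == 0);   // false: at least one lane is on
    (void)any; (void)all; (void)none;
}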
define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
%v1 = shufflevector <4 x float> %v, <4 x float> undef,
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>

View File

@@ -255,6 +255,60 @@ define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
ret i64 %v64
}
define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp eq i32 %v, 255
ret i1 %cmp
}
define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float)
}

View File

@@ -278,6 +278,27 @@ define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
ret i64 %v64
}
define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 15
ret i1 %cmp
}
define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {

ctx.cpp
View File

@@ -1287,29 +1287,54 @@ FunctionEmitContext::CurrentLanesReturned(Expr *expr, bool doCoherenceCheck) {
llvm::Value *
FunctionEmitContext::Any(llvm::Value *mask) {
-    llvm::Value *mmval = LaneMask(mask);
-    return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, mmval,
-                   LLVMInt64(0), LLVMGetName(mask, "_any"));
+    // Call the target-dependent __any function to test whether the mask
+    // is non-zero.
+    std::vector<Symbol *> mm;
+    m->symbolTable->LookupFunction("__any", &mm);
+    if (g->target.maskBitCount == 1)
+        AssertPos(currentPos, mm.size() == 1);
+    else
+        // There should be one with signed int signature, one unsigned int.
+        AssertPos(currentPos, mm.size() == 2);
+    // We can actually call either one, since both are i32s as far as
+    // LLVM's type system is concerned...
+    llvm::Function *fmm = mm[0]->function;
+    return CallInst(fmm, NULL, mask, LLVMGetName(mask, "_any"));
}

llvm::Value *
FunctionEmitContext::All(llvm::Value *mask) {
-    llvm::Value *mmval = LaneMask(mask);
-    llvm::Value *allOnMaskValue = (g->target.vectorWidth == 64) ?
-        LLVMInt64(~0ull) :
-        LLVMInt64((1ull << g->target.vectorWidth) - 1);
-    return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mmval,
-                   allOnMaskValue, LLVMGetName(mask, "_all"));
+    // Call the target-dependent __all function to test whether every lane
+    // of the mask is on.
+    std::vector<Symbol *> mm;
+    m->symbolTable->LookupFunction("__all", &mm);
+    if (g->target.maskBitCount == 1)
+        AssertPos(currentPos, mm.size() == 1);
+    else
+        // There should be one with signed int signature, one unsigned int.
+        AssertPos(currentPos, mm.size() == 2);
+    // We can actually call either one, since both are i32s as far as
+    // LLVM's type system is concerned...
+    llvm::Function *fmm = mm[0]->function;
+    return CallInst(fmm, NULL, mask, LLVMGetName(mask, "_all"));
}

llvm::Value *
FunctionEmitContext::None(llvm::Value *mask) {
-    llvm::Value *mmval = LaneMask(mask);
-    return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mmval,
-                   LLVMInt64(0), LLVMGetName(mask, "_none"));
+    // Call the target-dependent __none function to test whether the mask
+    // is all zero.
+    std::vector<Symbol *> mm;
+    m->symbolTable->LookupFunction("__none", &mm);
+    if (g->target.maskBitCount == 1)
+        AssertPos(currentPos, mm.size() == 1);
+    else
+        // There should be one with signed int signature, one unsigned int.
+        AssertPos(currentPos, mm.size() == 2);
+    // We can actually call either one, since both are i32s as far as
+    // LLVM's type system is concerned...
+    llvm::Function *fmm = mm[0]->function;
+    return CallInst(fmm, NULL, mask, LLVMGetName(mask, "_none"));
}
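Since Any, All, and None now differ only in the builtin they look up and the name suffix they attach, the lookup-and-call sequence could be factored into a single helper. A hypothetical simplification, not part of this commit (emitMaskTest is an invented name):

// Hypothetical helper; the types and fields follow the code above.
llvm::Value *
FunctionEmitContext::emitMaskTest(const char *fname, const char *suffix,
                                  llvm::Value *mask) {
    std::vector<Symbol *> mm;
    m->symbolTable->LookupFunction(fname, &mm);
    // One overload for i1 masks; a signed/unsigned pair otherwise.
    AssertPos(currentPos, mm.size() == (g->target.maskBitCount == 1 ? 1u : 2u));
    return CallInst(mm[0]->function, NULL, mask, LLVMGetName(mask, suffix));
}
// Any() would then reduce to: return emitMaskTest("__any", "_any", mask);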

View File

@@ -55,6 +55,7 @@ typedef int64_t __vec1_i64;
struct __vec16_i1 {
    __vec16_i1() { }
    __vec16_i1(const uint16_t &vv) : v(vv) { }
    __vec16_i1(uint32_t v0, uint32_t v1, uint32_t v2, uint32_t v3,
               uint32_t v4, uint32_t v5, uint32_t v6, uint32_t v7,
               uint32_t v8, uint32_t v9, uint32_t v10, uint32_t v11,
@@ -342,6 +343,18 @@ static FORCEINLINE uint64_t __movmsk(__vec16_i1 mask) {
    return (uint64_t)mask.v;
}
static FORCEINLINE __vec16_i1 __any(__vec16_i1 mask) {
    return (mask.v != 0);
}
static FORCEINLINE __vec16_i1 __all(__vec16_i1 mask) {
    return (mask.v == 0xFFFF);
}
static FORCEINLINE __vec16_i1 __none(__vec16_i1 mask) {
    return (mask.v == 0);
}
static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) {
    __vec16_i1 r;
    r.v = (a.v & b.v) | (~a.v & ~b.v);
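A quick sanity-check sketch for the scalar 16-wide mask (a hypothetical test; it relies on the new uint16_t constructor above converting the bool comparison result into a __vec16_i1 whose v field holds 0 or 1):

#include <cassert>

static void test_vec16_mask() {
    __vec16_i1 some(0x00FF), full(0xFFFF), empty(0x0000);
    assert(__any(some).v == 1 && __any(empty).v == 0);
    assert(__all(full).v == 1 && __all(some).v == 0);
    assert(__none(empty).v == 1 && __none(some).v == 0);
}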

View File

@@ -55,6 +55,7 @@ typedef int64_t __vec1_i64;
struct __vec32_i1 {
    __vec32_i1() { }
    __vec32_i1(const uint32_t &vv) : v(vv) { }
    __vec32_i1(uint32_t v0, uint32_t v1, uint32_t v2, uint32_t v3,
               uint32_t v4, uint32_t v5, uint32_t v6, uint32_t v7,
               uint32_t v8, uint32_t v9, uint32_t v10, uint32_t v11,
@@ -407,6 +408,18 @@ static FORCEINLINE uint64_t __movmsk(__vec32_i1 mask) {
    return (uint64_t)mask.v;
}
static FORCEINLINE __vec32_i1 __any(__vec32_i1 mask) {
    return (mask.v != 0);
}
static FORCEINLINE __vec32_i1 __all(__vec32_i1 mask) {
    return (mask.v == 0xFFFFFFFF);
}
static FORCEINLINE __vec32_i1 __none(__vec32_i1 mask) {
    return (mask.v == 0);
}
static FORCEINLINE __vec32_i1 __equal_i1(__vec32_i1 a, __vec32_i1 b) {
    __vec32_i1 r;
    r.v = (a.v & b.v) | (~a.v & ~b.v);

View File

@@ -55,6 +55,7 @@ typedef int64_t __vec1_i64;
struct __vec64_i1 {
    __vec64_i1() { }
    __vec64_i1(const uint64_t &vv) : v(vv) { }
    __vec64_i1(uint64_t v0, uint64_t v1, uint64_t v2, uint64_t v3,
               uint64_t v4, uint64_t v5, uint64_t v6, uint64_t v7,
               uint64_t v8, uint64_t v9, uint64_t v10, uint64_t v11,
@@ -532,6 +533,18 @@ static FORCEINLINE uint64_t __movmsk(__vec64_i1 mask) {
    return (uint64_t)mask.v;
}
static FORCEINLINE __vec64_i1 __any(__vec64_i1 mask) {
    return (mask.v != 0);
}
static FORCEINLINE __vec64_i1 __all(__vec64_i1 mask) {
    return (mask.v == 0xFFFFFFFFFFFFFFFFull);
}
static FORCEINLINE __vec64_i1 __none(__vec64_i1 mask) {
    return (mask.v == 0);
}
static FORCEINLINE __vec64_i1 __equal_i1(__vec64_i1 a, __vec64_i1 b) {
    __vec64_i1 r;
    r.v = (a.v & b.v) | (~a.v & ~b.v);

View File

@@ -428,6 +428,18 @@ static FORCEINLINE __vec16_i1 __movmsk(__vec16_i1 mask) {
    return _mm512_kmov(mask);
}
static FORCEINLINE __vec16_i1 __any(__vec16_i1 mask) {
    return (mask != 0);
}
static FORCEINLINE __vec16_i1 __all(__vec16_i1 mask) {
    return (mask == 0xFFFF);
}
static FORCEINLINE __vec16_i1 __none(__vec16_i1 mask) {
    return (mask == 0);
}
static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) {
    return _mm512_knot(_mm512_kandn(a, b));
}

View File

@@ -297,6 +297,18 @@ static FORCEINLINE uint32_t __movmsk(__vec32_i1 mask) {
    return ((m1<<16)|m2);
}
static FORCEINLINE uint32_t __any(__vec32_i1 mask) {
    return (mask.m != 0);
}
static FORCEINLINE uint32_t __all(__vec32_i1 mask) {
    return (mask.m == 0xFFFFFFFF);
}
static FORCEINLINE uint32_t __none(__vec32_i1 mask) {
    return (mask.m == 0x0);
}
static FORCEINLINE __vec32_i1 __equal(__vec32_i1 a, __vec32_i1 b) {
    __vec32_i1 ret;
    ret.m16.m1 = _mm512_knot(_mm512_kandn(a.m16.m1, b.m16.m1));
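Per the __movmsk context above, the 32-wide KNC mask keeps two 16-bit hardware masks (m16.m1, m16.m2) overlaid with the 32-bit view m that the new tests read directly. A scalar C++ model of testing such a split mask (hypothetical names):

#include <cstdint>

// Scalar model of a 32-wide mask stored as two 16-bit halves.
static inline uint32_t merge_halves(uint16_t m1, uint16_t m2) {
    return ((uint32_t)m1 << 16) | m2;
}
static inline bool any32(uint16_t m1, uint16_t m2)  { return merge_halves(m1, m2) != 0; }
static inline bool all32(uint16_t m1, uint16_t m2)  { return merge_halves(m1, m2) == 0xFFFFFFFFu; }
static inline bool none32(uint16_t m1, uint16_t m2) { return merge_halves(m1, m2) == 0; }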

View File

@@ -269,6 +269,18 @@ static FORCEINLINE uint64_t __movmsk(__vec4_i1 mask) {
    return (uint64_t)_mm_movemask_ps(mask.v);
}
static FORCEINLINE __vec4_i1 __any(__vec4_i1 mask) {
    return (_mm_movemask_ps(mask.v) != 0);
}
static FORCEINLINE __vec4_i1 __all(__vec4_i1 mask) {
    return (_mm_movemask_ps(mask.v) == 0xF);
}
static FORCEINLINE __vec4_i1 __none(__vec4_i1 mask) {
    return (_mm_movemask_ps(mask.v) == 0);
}
static FORCEINLINE __vec4_i1 __equal_i1(__vec4_i1 a, __vec4_i1 b) {
    return _mm_cmpeq_epi32(_mm_castps_si128(a.v), _mm_castps_si128(b.v));
}