diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll
index 7f737626..55a1c802 100644
--- a/builtins/target-avx-x2.ll
+++ b/builtins/target-avx-x2.ll
@@ -190,6 +190,53 @@ define i64 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
   ret i64 %v64
 }
 
+define i1 @__any(<16 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <16 x i32> %0 to <16 x float>
+  %mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
+  %mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
+           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
+
+  %v1shift = shl i32 %v1, 8
+  %v = or i32 %v1shift, %v0
+  %cmp = icmp ne i32 %v, 0
+  ret i1 %cmp
+}
+
+define i1 @__all(<16 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <16 x i32> %0 to <16 x float>
+  %mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
+  %mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
+           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
+
+  %v1shift = shl i32 %v1, 8
+  %v = or i32 %v1shift, %v0
+  %cmp = icmp eq i32 %v, 65535
+  ret i1 %cmp
+}
+
+define i1 @__none(<16 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <16 x i32> %0 to <16 x float>
+  %mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
+           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
+  %mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
+           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
+
+  %v1shift = shl i32 %v1, 8
+  %v = or i32 %v1shift, %v0
+  %cmp = icmp eq i32 %v, 0
+  ret i1 %cmp
+}
+
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal float ops
 
diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll
index 3cd76516..3283376a 100644
--- a/builtins/target-avx.ll
+++ b/builtins/target-avx.ll
@@ -182,6 +182,27 @@ define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
   ret i64 %v64
 }
 
+define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <8 x i32> %0 to <8 x float>
+  %v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
+  %cmp = icmp ne i32 %v, 0
+  ret i1 %cmp
+}
+
+define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <8 x i32> %0 to <8 x float>
+  %v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
+  %cmp = icmp eq i32 %v, 255
+  ret i1 %cmp
+}
+
+define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <8 x i32> %0 to <8 x float>
+  %v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
+  %cmp = icmp eq i32 %v, 0
+  ret i1 %cmp
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal float ops
 
diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll
index c5937c8e..e18bc0ff 100644
--- a/builtins/target-generic-1.ll
+++ b/builtins/target-generic-1.ll
@@ -198,6 +198,27 @@ define i64 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
   ret i64 %v64
 }
 
+define i1 @__any(<1 x i32>) nounwind readnone alwaysinline {
+  %item = extractelement <1 x i32> %0, i32 0
+  %v = lshr i32 %item, 31
+  %cmp = icmp ne i32 %v, 0
+  ret i1 %cmp
+}
+
+define i1 @__all(<1 x i32>) nounwind readnone alwaysinline {
+  %item = extractelement <1 x i32> %0, i32 0
+  %v = lshr i32 %item, 31
+  %cmp = icmp eq i32 %v, 1
+  ret i1 %cmp
+}
+
+define i1 @__none(<1 x i32>) nounwind readnone alwaysinline {
+  %item = extractelement <1 x i32> %0, i32 0
+  %v = lshr i32 %item, 31
+  %cmp = icmp eq i32 %v, 0
+  ret i1 %cmp
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding
 
diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll
index 7b4cfd9c..fbce2531 100644
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -222,6 +222,9 @@ declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
 ;; reductions
 
 declare i64 @__movmsk(<WIDTH x MASK>) nounwind readnone
+declare i1 @__any(<WIDTH x MASK>) nounwind readnone
+declare i1 @__all(<WIDTH x MASK>) nounwind readnone
+declare i1 @__none(<WIDTH x MASK>) nounwind readnone
 
 declare float @__reduce_add_float(<WIDTH x float>) nounwind readnone
 declare float @__reduce_min_float(<WIDTH x float>) nounwind readnone
 
diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll
index ad19f899..3910dfdb 100644
--- a/builtins/target-sse2-x2.ll
+++ b/builtins/target-sse2-x2.ll
@@ -313,6 +313,60 @@ define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
   ret i64 %v64
 }
 
+define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
+  ; first do two 4-wide movmsk calls
+  %floatmask = bitcast <8 x i32> %0 to <8 x float>
+  %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
+        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
+  %m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
+        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
+
+  ; and shift the second value over by 4 before ORing it with the
+  ; value of the first one
+  %v1s = shl i32 %v1, 4
+  %v = or i32 %v0, %v1s
+  %cmp = icmp ne i32 %v, 0
+  ret i1 %cmp
+}
+
+define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
+  ; first do two 4-wide movmsk calls
+  %floatmask = bitcast <8 x i32> %0 to <8 x float>
+  %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
+        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
+  %m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
+        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
+
+  ; and shift the second value over by 4 before ORing it with the
+  ; value of the first one
+  %v1s = shl i32 %v1, 4
+  %v = or i32 %v0, %v1s
+  %cmp = icmp eq i32 %v, 255
+  ret i1 %cmp
+}
+
+define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
+  ; first do two 4-wide movmsk calls
+  %floatmask = bitcast <8 x i32> %0 to <8 x float>
+  %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
+        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
+  %m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
+        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
+
+  ; and shift the second value over by 4 before ORing it with the
+  ; value of the first one
+  %v1s = shl i32 %v1, 4
+  %v = or i32 %v0, %v1s
+  %cmp = icmp eq i32 %v, 0
+  ret i1 %cmp
+}
+
 define <4 x float> @__vec4_add_float(<4 x float> %v0,
                                      <4 x float> %v1) nounwind readnone alwaysinline {
   %v = fadd <4 x float> %v0, %v1
 
diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll
index 6558adc8..618a61d9 100644
--- a/builtins/target-sse2.ll
+++ b/builtins/target-sse2.ll
@@ -246,6 +246,27 @@ define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
   ret i64 %v64
 }
 
+define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %cmp = icmp ne i32 %v, 0
+  ret i1 %cmp
+}
+
+define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %cmp = icmp eq i32 %v, 15
+  ret i1 %cmp
+}
+
+define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %cmp = icmp eq i32 %v, 0
+  ret i1 %cmp
+}
+
 define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
   %v1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32>
 
diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll
index 0f7cb355..81efc5cb 100644
--- a/builtins/target-sse4-x2.ll
+++ b/builtins/target-sse4-x2.ll
@@ -255,6 +255,60 @@ define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
   ret i64 %v64
 }
 
+define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
+  ; first do two 4-wide movmsk calls
+  %floatmask = bitcast <8 x i32> %0 to <8 x float>
+  %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
+        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
+  %m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
+        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
+
+  ; and shift the second value over by 4 before ORing it with the
+  ; value of the first one
+  %v1s = shl i32 %v1, 4
+  %v = or i32 %v0, %v1s
+  %cmp = icmp ne i32 %v, 0
+  ret i1 %cmp
+}
+
+define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
+  ; first do two 4-wide movmsk calls
+  %floatmask = bitcast <8 x i32> %0 to <8 x float>
+  %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
+        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
+  %m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
+        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
+
+  ; and shift the second value over by 4 before ORing it with the
+  ; value of the first one
+  %v1s = shl i32 %v1, 4
+  %v = or i32 %v0, %v1s
+  %cmp = icmp eq i32 %v, 255
+  ret i1 %cmp
+}
+
+define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
+  ; first do two 4-wide movmsk calls
+  %floatmask = bitcast <8 x i32> %0 to <8 x float>
+  %m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
+        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
+  %m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
+        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
+
+  ; and shift the second value over by 4 before ORing it with the
+  ; value of the first one
+  %v1s = shl i32 %v1, 4
+  %v = or i32 %v0, %v1s
+  %cmp = icmp eq i32 %v, 0
+  ret i1 %cmp
+}
+
 define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
   reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float)
 }
 
diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll
index b00bcfd6..f638d220 100644
--- a/builtins/target-sse4.ll
+++ b/builtins/target-sse4.ll
@@ -278,6 +278,27 @@ define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
   ret i64 %v64
 }
 
+define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %cmp = icmp ne i32 %v, 0
+  ret i1 %cmp
+}
+
+define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %cmp = icmp eq i32 %v, 15
+  ret i1 %cmp
+}
+
+define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %cmp = icmp eq i32 %v, 0
+  ret i1 %cmp
+}
+
 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
 
 define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
 
diff --git a/ctx.cpp b/ctx.cpp
index f608185a..fec38065 100644
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -1287,29 +1287,54 @@ FunctionEmitContext::CurrentLanesReturned(Expr *expr, bool doCoherenceCheck) {
 
 llvm::Value *
 FunctionEmitContext::Any(llvm::Value *mask) {
-    llvm::Value *mmval = LaneMask(mask);
-    return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, mmval,
-                   LLVMInt64(0), LLVMGetName(mask, "_any"));
+    // Call the target-dependent any function to test that the mask is non-zero
+    std::vector<Symbol *> mm;
+    m->symbolTable->LookupFunction("__any", &mm);
+    if (g->target.maskBitCount == 1)
+        AssertPos(currentPos, mm.size() == 1);
+    else
+        // There should be one with signed int signature, one unsigned int.
+        AssertPos(currentPos, mm.size() == 2);
+    // We can actually call either one, since both are i32s as far as
+    // LLVM's type system is concerned...
+    llvm::Function *fmm = mm[0]->function;
+    return CallInst(fmm, NULL, mask, LLVMGetName(mask, "_any"));
 }
 
 
 llvm::Value *
 FunctionEmitContext::All(llvm::Value *mask) {
-    llvm::Value *mmval = LaneMask(mask);
-    llvm::Value *allOnMaskValue = (g->target.vectorWidth == 64) ?
-        LLVMInt64(~0ull) :
-        LLVMInt64((1ull << g->target.vectorWidth) - 1);
-
-    return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mmval,
-                   allOnMaskValue, LLVMGetName(mask, "_all"));
+    // Call the target-dependent all function to test that all mask lanes are on
+    std::vector<Symbol *> mm;
+    m->symbolTable->LookupFunction("__all", &mm);
+    if (g->target.maskBitCount == 1)
+        AssertPos(currentPos, mm.size() == 1);
+    else
+        // There should be one with signed int signature, one unsigned int.
+        AssertPos(currentPos, mm.size() == 2);
+    // We can actually call either one, since both are i32s as far as
+    // LLVM's type system is concerned...
+    llvm::Function *fmm = mm[0]->function;
+    return CallInst(fmm, NULL, mask, LLVMGetName(mask, "_all"));
 }
 
 
 llvm::Value *
 FunctionEmitContext::None(llvm::Value *mask) {
-    llvm::Value *mmval = LaneMask(mask);
-    return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mmval,
-                   LLVMInt64(0), LLVMGetName(mask, "_none"));
+    // Call the target-dependent none function to test that all mask lanes are off
+    std::vector<Symbol *> mm;
+    m->symbolTable->LookupFunction("__none", &mm);
+    if (g->target.maskBitCount == 1)
+        AssertPos(currentPos, mm.size() == 1);
+    else
+        // There should be one with signed int signature, one unsigned int.
+        AssertPos(currentPos, mm.size() == 2);
+    // We can actually call either one, since both are i32s as far as
+    // LLVM's type system is concerned...
+    llvm::Function *fmm = mm[0]->function;
+    return CallInst(fmm, NULL, mask, LLVMGetName(mask, "_none"));
 }
 
diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h
index c4bff793..0b300f34 100644
--- a/examples/intrinsics/generic-16.h
+++ b/examples/intrinsics/generic-16.h
@@ -55,6 +55,7 @@ typedef int64_t __vec1_i64;
 
 struct __vec16_i1 {
     __vec16_i1() { }
+    __vec16_i1(const uint16_t &vv) : v(vv) { }
     __vec16_i1(uint32_t v0, uint32_t v1, uint32_t v2, uint32_t v3,
                uint32_t v4, uint32_t v5, uint32_t v6, uint32_t v7,
                uint32_t v8, uint32_t v9, uint32_t v10, uint32_t v11,
@@ -342,6 +343,18 @@ static FORCEINLINE uint64_t __movmsk(__vec16_i1 mask) {
     return (uint64_t)mask.v;
 }
 
+static FORCEINLINE __vec16_i1 __any(__vec16_i1 mask) {
+    return (mask.v!=0);
+}
+
+static FORCEINLINE __vec16_i1 __all(__vec16_i1 mask) {
+    return (mask.v==0xFFFF);
+}
+
+static FORCEINLINE __vec16_i1 __none(__vec16_i1 mask) {
+    return (mask.v==0);
+}
+
 static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) {
     __vec16_i1 r;
     r.v = (a.v & b.v) | (~a.v & ~b.v);
 
diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h
index ffd5b478..64b82cb1 100644
--- a/examples/intrinsics/generic-32.h
+++ b/examples/intrinsics/generic-32.h
@@ -55,6 +55,7 @@ typedef int64_t __vec1_i64;
 
 struct __vec32_i1 {
     __vec32_i1() { }
+    __vec32_i1(const uint32_t &vv) : v(vv) { }
    __vec32_i1(uint32_t v0, uint32_t v1, uint32_t v2, uint32_t v3,
               uint32_t v4, uint32_t v5, uint32_t v6, uint32_t v7,
               uint32_t v8, uint32_t v9, uint32_t v10, uint32_t v11,
@@ -407,6 +408,18 @@ static FORCEINLINE uint64_t __movmsk(__vec32_i1 mask) {
     return (uint64_t)mask.v;
 }
 
+static FORCEINLINE __vec32_i1 __any(__vec32_i1 mask) {
+    return (mask.v!=0);
+}
+
+static FORCEINLINE __vec32_i1 __all(__vec32_i1 mask) {
+    return (mask.v==0xFFFFFFFF);
+}
+
+static FORCEINLINE __vec32_i1 __none(__vec32_i1 mask) {
+    return (mask.v==0);
+}
+
 static FORCEINLINE __vec32_i1 __equal_i1(__vec32_i1 a, __vec32_i1 b) {
     __vec32_i1 r;
     r.v = (a.v & b.v) | (~a.v & ~b.v);
 
diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h
index a33e1d15..7869faa5 100644
--- a/examples/intrinsics/generic-64.h
+++ b/examples/intrinsics/generic-64.h
@@ -55,6 +55,7 @@ typedef int64_t __vec1_i64;
 
 struct __vec64_i1 {
     __vec64_i1() { }
+    __vec64_i1(const uint64_t &vv) : v(vv) { }
     __vec64_i1(uint64_t v0, uint64_t v1, uint64_t v2, uint64_t v3,
                uint64_t v4, uint64_t v5, uint64_t v6, uint64_t v7,
                uint64_t v8, uint64_t v9, uint64_t v10, uint64_t v11,
@@ -532,6 +533,18 @@ static FORCEINLINE uint64_t __movmsk(__vec64_i1 mask) {
     return (uint64_t)mask.v;
 }
 
+static FORCEINLINE __vec64_i1 __any(__vec64_i1 mask) {
+    return (mask.v!=0);
+}
+
+static FORCEINLINE __vec64_i1 __all(__vec64_i1 mask) {
+    return (mask.v==0xFFFFFFFFFFFFFFFFull);
+}
+
+static FORCEINLINE __vec64_i1 __none(__vec64_i1 mask) {
+    return (mask.v==0);
+}
+
 static FORCEINLINE __vec64_i1 __equal_i1(__vec64_i1 a, __vec64_i1 b) {
     __vec64_i1 r;
     r.v = (a.v & b.v) | (~a.v & ~b.v);
 
diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h
index a4dcf270..02222115 100644
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -428,6 +428,18 @@ static FORCEINLINE __vec16_i1 __movmsk(__vec16_i1 mask) {
     return _mm512_kmov(mask);
 }
 
+static FORCEINLINE __vec16_i1 __any(__vec16_i1 mask) {
+    return (mask!=0);
+}
+
+static FORCEINLINE __vec16_i1 __all(__vec16_i1 mask) {
+    return (mask==0xFFFF);
+}
+
+static FORCEINLINE __vec16_i1 __none(__vec16_i1 mask) {
+    return (mask==0);
+}
+
 static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) {
     return _mm512_knot( _mm512_kandn(a, b));
 }
 
diff --git a/examples/intrinsics/knc2x.h b/examples/intrinsics/knc2x.h
index 75c9aa62..28fcf3ad 100644
--- a/examples/intrinsics/knc2x.h
+++ b/examples/intrinsics/knc2x.h
@@ -297,6 +297,18 @@ static FORCEINLINE uint32_t __movmsk(__vec32_i1 mask) {
     return ((m1<<16)|m2);
 }
 
+static FORCEINLINE uint32_t __any(__vec32_i1 mask) {
+    return (mask.m!=0);
+}
+
+static FORCEINLINE uint32_t __all(__vec32_i1 mask) {
+    return (mask.m==0xFFFFFFFF);
+}
+
+static FORCEINLINE uint32_t __none(__vec32_i1 mask) {
+    return (mask.m==0x0);
+}
+
 static FORCEINLINE __vec32_i1 __equal(__vec32_i1 a, __vec32_i1 b) {
     __vec32_i1 ret;
     ret.m16.m1 = _mm512_knot(_mm512_kandn(a.m16.m1, b.m16.m1));
 
diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h
index b894cb29..8d94bc49 100644
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -269,6 +269,18 @@ static FORCEINLINE uint64_t __movmsk(__vec4_i1 mask) {
     return (uint64_t)_mm_movemask_ps(mask.v);
 }
 
+static FORCEINLINE __vec4_i1 __any(__vec4_i1 mask) {
+    return (_mm_movemask_ps(mask.v)!=0);
+}
+
+static FORCEINLINE __vec4_i1 __all(__vec4_i1 mask) {
+    return (_mm_movemask_ps(mask.v)==0xF);
+}
+
+static FORCEINLINE __vec4_i1 __none(__vec4_i1 mask) {
+    return (_mm_movemask_ps(mask.v)==0);
+}
+
 static FORCEINLINE __vec4_i1 __equal_i1(__vec4_i1 a, __vec4_i1 b) {
     return _mm_cmpeq_epi32(_mm_castps_si128(a.v), _mm_castps_si128(b.v));
 }
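
Note: every target above implements the same reduction idiom: collapse the
per-lane vector mask to a scalar bitmask (movmsk or its equivalent), then
compare that scalar against zero (__any/__none) or against the all-lanes-on
constant for the target's width (__all). A minimal standalone C++ sketch of
the idiom, using plain SSE intrinsics for a 4-wide mask whose active lanes
have their sign bits set (the mask_* names are illustrative only, not part
of this patch):

    #include <xmmintrin.h>

    // _mm_movemask_ps packs the sign bit of each of the four float lanes
    // into the low 4 bits of an int, so each whole-mask query reduces to
    // one integer comparison.
    static inline bool mask_any(__m128 m)  { return _mm_movemask_ps(m) != 0; }
    static inline bool mask_all(__m128 m)  { return _mm_movemask_ps(m) == 0xF; }
    static inline bool mask_none(__m128 m) { return _mm_movemask_ps(m) == 0; }

This is also why the __all variants compare against 15, 255, or 65535
depending on the target: each is (1 << vectorWidth) - 1, the value the
movmsk step produces when every lane is on.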