Added the following mask tests: __any(), __all(), __none() for all supported targets.

This allows for more efficient code generation for KNC.
Jean-Luc Duprat
2012-09-14 11:06:18 -07:00
parent 4ecdbe4bd9
commit f0b0618484
15 changed files with 355 additions and 13 deletions

View File

@@ -190,6 +190,53 @@ define i64 @__movmsk(<16 x i32>) nounwind readnone alwaysinline {
ret i64 %v64
}
define i1 @__any(<16 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <16 x i32> %0 to <16 x float>
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
%mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
%v1shift = shl i32 %v1, 8
%v = or i32 %v1shift, %v0
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<16 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <16 x i32> %0 to <16 x float>
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
%mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
%v1shift = shl i32 %v1, 8
%v = or i32 %v1shift, %v0
%cmp = icmp eq i32 %v, 65535
ret i1 %cmp
}
define i1 @__none(<16 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <16 x i32> %0 to <16 x float>
%mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%v0 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask0) nounwind readnone
%mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%v1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %mask1) nounwind readnone
%v1shift = shl i32 %v1, 8
%v = or i32 %v1shift, %v0
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
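For reference, here is roughly the same split-and-merge logic written with AVX intrinsics in C++, as a minimal sketch rather than code from this commit (the helper names are hypothetical): each 8-wide half is reduced to an 8-bit lane mask with _mm256_movemask_ps, the high half is shifted up by 8, and a single scalar compare finishes the test.

#include <immintrin.h>

// Merge the per-lane sign bits of both 8-wide halves into one 16-bit value.
static inline int mask16_bits(__m256 lo, __m256 hi) {
    return _mm256_movemask_ps(lo) | (_mm256_movemask_ps(hi) << 8);
}

static inline bool any16(__m256 lo, __m256 hi)  { return mask16_bits(lo, hi) != 0; }
static inline bool all16(__m256 lo, __m256 hi)  { return mask16_bits(lo, hi) == 0xFFFF; }
static inline bool none16(__m256 lo, __m256 hi) { return mask16_bits(lo, hi) == 0; }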
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal float ops

View File

@@ -182,6 +182,27 @@ define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
ret i64 %v64
}
define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 255
ret i1 %cmp
}
define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%v = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal float ops

View File

@@ -198,6 +198,27 @@ define i64 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
ret i64 %v64
}
define i1 @__any(<1 x i32>) nounwind readnone alwaysinline {
%item = extractelement <1 x i32> %0, i32 0
%v = lshr i32 %item, 31
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<1 x i32>) nounwind readnone alwaysinline {
%item = extractelement <1 x i32> %0, i32 0
%v = lshr i32 %item, 31
%cmp = icmp eq i32 %v, 1
ret i1 %cmp
}
define i1 @__none(<1 x i32>) nounwind readnone alwaysinline {
%item = extractelement <1 x i32> %0, i32 0
%v = lshr i32 %item, 31
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
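The 1-wide generic variant reduces to a sign-bit test on the single lane. A C++ sketch of the equivalent scalar logic (hypothetical names; assumes the convention used above that a lane is on when the high bit of its i32 mask word is set):

#include <cstdint>

// The lone lane is on iff the sign bit of its 32-bit mask word is set.
static inline bool any1(int32_t m)  { return (uint32_t(m) >> 31) != 0; }
static inline bool all1(int32_t m)  { return (uint32_t(m) >> 31) == 1; }
static inline bool none1(int32_t m) { return (uint32_t(m) >> 31) == 0; }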
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding

View File

@@ -222,6 +222,9 @@ declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
;; reductions
declare i64 @__movmsk(<WIDTH x i1>) nounwind readnone
declare i1 @__any(<WIDTH x i1>) nounwind readnone
declare i1 @__all(<WIDTH x i1>) nounwind readnone
declare i1 @__none(<WIDTH x i1>) nounwind readnone
declare float @__reduce_add_float(<WIDTH x float>) nounwind readnone
declare float @__reduce_min_float(<WIDTH x float>) nounwind readnone

View File

@@ -313,6 +313,60 @@ define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
ret i64 %v64
}
define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp eq i32 %v, 255
ret i1 %cmp
}
define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
define <4 x float> @__vec4_add_float(<4 x float> %v0,
<4 x float> %v1) nounwind readnone alwaysinline {
%v = fadd <4 x float> %v0, %v1

View File

@@ -246,6 +246,27 @@ define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
ret i64 %v64
}
define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 15
ret i1 %cmp
}
define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
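The 4-wide tests map directly onto _mm_movemask_ps, which packs each float lane's sign bit into the low four bits of an int. A small C++ usage sketch (the comparison producing the mask is illustrative, not part of this commit):

#include <xmmintrin.h>

static inline void movmsk_demo() {
    __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
    __m128 b = _mm_set1_ps(2.5f);
    // One bit per lane; here the lanes holding 3.0 and 4.0 compare true.
    int bits = _mm_movemask_ps(_mm_cmpgt_ps(a, b));
    bool any  = (bits != 0);   // true: some lane is on
    bool all  = (bits == 0xF); // false: not all four lanes are on
    bool none = (bits == 0);   // false: at least one lane is on
    (void)any; (void)all; (void)none;
}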
define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
%v1 = shufflevector <4 x float> %v, <4 x float> undef,
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>

View File

@@ -255,6 +255,60 @@ define i64 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
ret i64 %v64
}
define i1 @__any(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp eq i32 %v, 255
ret i1 %cmp
}
define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
; first do two 4-wide movmsk calls
%floatmask = bitcast <8 x i32> %0 to <8 x float>
%m0 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v0 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m0) nounwind readnone
%m1 = shufflevector <8 x float> %floatmask, <8 x float> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
%v1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %m1) nounwind readnone
; and shift the first one over by 4 before ORing it with the value
; of the second one
%v1s = shl i32 %v1, 4
%v = or i32 %v0, %v1s
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float)
}

View File

@@ -278,6 +278,27 @@ define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
ret i64 %v64
}
define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp ne i32 %v, 0
ret i1 %cmp
}
define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 15
ret i1 %cmp
}
define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
%floatmask = bitcast <4 x i32> %0 to <4 x float>
%v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
%cmp = icmp eq i32 %v, 0
ret i1 %cmp
}
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {

ctx.cpp
View File

@@ -1287,29 +1287,54 @@ FunctionEmitContext::CurrentLanesReturned(Expr *expr, bool doCoherenceCheck) {
llvm::Value *
FunctionEmitContext::Any(llvm::Value *mask) {
-    llvm::Value *mmval = LaneMask(mask);
-    return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, mmval,
-                   LLVMInt64(0), LLVMGetName(mask, "_any"));
+    // Call the target-dependent __any function to test whether the mask
+    // is non-zero.
+    std::vector<Symbol *> mm;
+    m->symbolTable->LookupFunction("__any", &mm);
+    if (g->target.maskBitCount == 1)
+        AssertPos(currentPos, mm.size() == 1);
+    else
+        // There should be one with signed int signature, one unsigned int.
+        AssertPos(currentPos, mm.size() == 2);
+    // We can actually call either one, since both are i32s as far as
+    // LLVM's type system is concerned...
+    llvm::Function *fmm = mm[0]->function;
+    return CallInst(fmm, NULL, mask, LLVMGetName(mask, "_any"));
}

llvm::Value *
FunctionEmitContext::All(llvm::Value *mask) {
-    llvm::Value *mmval = LaneMask(mask);
-    llvm::Value *allOnMaskValue = (g->target.vectorWidth == 64) ?
-        LLVMInt64(~0ull) :
-        LLVMInt64((1ull << g->target.vectorWidth) - 1);
-    return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mmval,
-                   allOnMaskValue, LLVMGetName(mask, "_all"));
+    // Call the target-dependent __all function to test whether every lane
+    // of the mask is on.
+    std::vector<Symbol *> mm;
+    m->symbolTable->LookupFunction("__all", &mm);
+    if (g->target.maskBitCount == 1)
+        AssertPos(currentPos, mm.size() == 1);
+    else
+        // There should be one with signed int signature, one unsigned int.
+        AssertPos(currentPos, mm.size() == 2);
+    // We can actually call either one, since both are i32s as far as
+    // LLVM's type system is concerned...
+    llvm::Function *fmm = mm[0]->function;
+    return CallInst(fmm, NULL, mask, LLVMGetName(mask, "_all"));
}

llvm::Value *
FunctionEmitContext::None(llvm::Value *mask) {
-    llvm::Value *mmval = LaneMask(mask);
-    return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mmval,
-                   LLVMInt64(0), LLVMGetName(mask, "_none"));
+    // Call the target-dependent __none function to test whether the mask
+    // is all zero.
+    std::vector<Symbol *> mm;
+    m->symbolTable->LookupFunction("__none", &mm);
+    if (g->target.maskBitCount == 1)
+        AssertPos(currentPos, mm.size() == 1);
+    else
+        // There should be one with signed int signature, one unsigned int.
+        AssertPos(currentPos, mm.size() == 2);
+    // We can actually call either one, since both are i32s as far as
+    // LLVM's type system is concerned...
+    llvm::Function *fmm = mm[0]->function;
+    return CallInst(fmm, NULL, mask, LLVMGetName(mask, "_none"));
}
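Since Any, All, and None now differ only in the builtin they look up and the name suffix they attach, the lookup-and-call sequence could be factored into a single helper. A hypothetical simplification, not part of this commit (emitMaskTest is an invented name):

// Hypothetical helper; the types and fields follow the code above.
llvm::Value *
FunctionEmitContext::emitMaskTest(const char *fname, const char *suffix,
                                  llvm::Value *mask) {
    std::vector<Symbol *> mm;
    m->symbolTable->LookupFunction(fname, &mm);
    // One overload for i1 masks; a signed/unsigned pair otherwise.
    AssertPos(currentPos, mm.size() == (g->target.maskBitCount == 1 ? 1u : 2u));
    return CallInst(mm[0]->function, NULL, mask, LLVMGetName(mask, suffix));
}
// Any() would then reduce to: return emitMaskTest("__any", "_any", mask);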

View File

@@ -55,6 +55,7 @@ typedef int64_t __vec1_i64;
struct __vec16_i1 {
    __vec16_i1() { }
    __vec16_i1(const uint16_t &vv) : v(vv) { }
    __vec16_i1(uint32_t v0, uint32_t v1, uint32_t v2, uint32_t v3,
               uint32_t v4, uint32_t v5, uint32_t v6, uint32_t v7,
               uint32_t v8, uint32_t v9, uint32_t v10, uint32_t v11,
@@ -342,6 +343,18 @@ static FORCEINLINE uint64_t __movmsk(__vec16_i1 mask) {
    return (uint64_t)mask.v;
}
static FORCEINLINE __vec16_i1 __any(__vec16_i1 mask) {
    return (mask.v != 0);
}
static FORCEINLINE __vec16_i1 __all(__vec16_i1 mask) {
    return (mask.v == 0xFFFF);
}
static FORCEINLINE __vec16_i1 __none(__vec16_i1 mask) {
    return (mask.v == 0);
}
static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) {
    __vec16_i1 r;
    r.v = (a.v & b.v) | (~a.v & ~b.v);
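A quick sanity-check sketch for the scalar 16-wide mask (a hypothetical test; it relies on the new uint16_t constructor above converting the bool comparison result into a __vec16_i1 whose v field holds 0 or 1):

#include <cassert>

static void test_vec16_mask() {
    __vec16_i1 some(0x00FF), full(0xFFFF), empty(0x0000);
    assert(__any(some).v == 1 && __any(empty).v == 0);
    assert(__all(full).v == 1 && __all(some).v == 0);
    assert(__none(empty).v == 1 && __none(some).v == 0);
}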

View File

@@ -55,6 +55,7 @@ typedef int64_t __vec1_i64;
struct __vec32_i1 {
    __vec32_i1() { }
    __vec32_i1(const uint32_t &vv) : v(vv) { }
    __vec32_i1(uint32_t v0, uint32_t v1, uint32_t v2, uint32_t v3,
               uint32_t v4, uint32_t v5, uint32_t v6, uint32_t v7,
               uint32_t v8, uint32_t v9, uint32_t v10, uint32_t v11,
@@ -407,6 +408,18 @@ static FORCEINLINE uint64_t __movmsk(__vec32_i1 mask) {
    return (uint64_t)mask.v;
}
static FORCEINLINE __vec32_i1 __any(__vec32_i1 mask) {
    return (mask.v != 0);
}
static FORCEINLINE __vec32_i1 __all(__vec32_i1 mask) {
    return (mask.v == 0xFFFFFFFF);
}
static FORCEINLINE __vec32_i1 __none(__vec32_i1 mask) {
    return (mask.v == 0);
}
static FORCEINLINE __vec32_i1 __equal_i1(__vec32_i1 a, __vec32_i1 b) {
    __vec32_i1 r;
    r.v = (a.v & b.v) | (~a.v & ~b.v);

View File

@@ -55,6 +55,7 @@ typedef int64_t __vec1_i64;
struct __vec64_i1 {
    __vec64_i1() { }
    __vec64_i1(const uint64_t &vv) : v(vv) { }
    __vec64_i1(uint64_t v0, uint64_t v1, uint64_t v2, uint64_t v3,
               uint64_t v4, uint64_t v5, uint64_t v6, uint64_t v7,
               uint64_t v8, uint64_t v9, uint64_t v10, uint64_t v11,
@@ -532,6 +533,18 @@ static FORCEINLINE uint64_t __movmsk(__vec64_i1 mask) {
    return (uint64_t)mask.v;
}
static FORCEINLINE __vec64_i1 __any(__vec64_i1 mask) {
    return (mask.v != 0);
}
static FORCEINLINE __vec64_i1 __all(__vec64_i1 mask) {
    return (mask.v == 0xFFFFFFFFFFFFFFFFull);
}
static FORCEINLINE __vec64_i1 __none(__vec64_i1 mask) {
    return (mask.v == 0);
}
static FORCEINLINE __vec64_i1 __equal_i1(__vec64_i1 a, __vec64_i1 b) {
    __vec64_i1 r;
    r.v = (a.v & b.v) | (~a.v & ~b.v);

View File

@@ -428,6 +428,18 @@ static FORCEINLINE __vec16_i1 __movmsk(__vec16_i1 mask) {
    return _mm512_kmov(mask);
}
static FORCEINLINE __vec16_i1 __any(__vec16_i1 mask) {
    return (mask != 0);
}
static FORCEINLINE __vec16_i1 __all(__vec16_i1 mask) {
    return (mask == 0xFFFF);
}
static FORCEINLINE __vec16_i1 __none(__vec16_i1 mask) {
    return (mask == 0);
}
static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) {
    return _mm512_knot(_mm512_kandn(a, b));
}

View File

@@ -297,6 +297,18 @@ static FORCEINLINE uint32_t __movmsk(__vec32_i1 mask) {
    return ((m1<<16)|m2);
}
static FORCEINLINE uint32_t __any(__vec32_i1 mask) {
    return (mask.m != 0);
}
static FORCEINLINE uint32_t __all(__vec32_i1 mask) {
    return (mask.m == 0xFFFFFFFF);
}
static FORCEINLINE uint32_t __none(__vec32_i1 mask) {
    return (mask.m == 0x0);
}
static FORCEINLINE __vec32_i1 __equal(__vec32_i1 a, __vec32_i1 b) {
    __vec32_i1 ret;
    ret.m16.m1 = _mm512_knot(_mm512_kandn(a.m16.m1, b.m16.m1));
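Per the __movmsk context above, the 32-wide KNC mask keeps two 16-bit hardware masks (m16.m1, m16.m2) overlaid with the 32-bit view m that the new tests read directly. A scalar C++ model of testing such a split mask (hypothetical names):

#include <cstdint>

// Scalar model of a 32-wide mask stored as two 16-bit halves.
static inline uint32_t merge_halves(uint16_t m1, uint16_t m2) {
    return ((uint32_t)m1 << 16) | m2;
}
static inline bool any32(uint16_t m1, uint16_t m2)  { return merge_halves(m1, m2) != 0; }
static inline bool all32(uint16_t m1, uint16_t m2)  { return merge_halves(m1, m2) == 0xFFFFFFFFu; }
static inline bool none32(uint16_t m1, uint16_t m2) { return merge_halves(m1, m2) == 0; }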

View File

@@ -269,6 +269,18 @@ static FORCEINLINE uint64_t __movmsk(__vec4_i1 mask) {
    return (uint64_t)_mm_movemask_ps(mask.v);
}
static FORCEINLINE __vec4_i1 __any(__vec4_i1 mask) {
    return (_mm_movemask_ps(mask.v) != 0);
}
static FORCEINLINE __vec4_i1 __all(__vec4_i1 mask) {
    return (_mm_movemask_ps(mask.v) == 0xF);
}
static FORCEINLINE __vec4_i1 __none(__vec4_i1 mask) {
    return (_mm_movemask_ps(mask.v) == 0);
}
static FORCEINLINE __vec4_i1 __equal_i1(__vec4_i1 a, __vec4_i1 b) {
    return _mm_cmpeq_epi32(_mm_castps_si128(a.v), _mm_castps_si128(b.v));
}