cleaned mask & int32, only test141 fails

commit 32c77be2f3 (parent 2741e3c1d0)
Author: evghenii
Date: 2013-10-04 11:42:52 +03:00


@@ -64,69 +64,48 @@ extern "C"
}
#endif
typedef float __vec1_f;
typedef double __vec1_d;
typedef int8_t __vec1_i8;
typedef int16_t __vec1_i16;
typedef int32_t __vec1_i32;
typedef int64_t __vec1_i64;
struct __vec16_i1 {
__vec16_i1() { }
__vec16_i1(const __mmask16 &vv) : v(vv) { }
__vec16_i1(bool v0, bool v1, bool v2, bool v3,
bool v4, bool v5, bool v6, bool v7,
bool v8, bool v9, bool v10, bool v11,
bool v12, bool v13, bool v14, bool v15) {
v = ((v0 & 1) |
((v1 & 1) << 1) |
((v2 & 1) << 2) |
((v3 & 1) << 3) |
((v4 & 1) << 4) |
((v5 & 1) << 5) |
((v6 & 1) << 6) |
((v7 & 1) << 7) |
((v8 & 1) << 8) |
((v9 & 1) << 9) |
((v10 & 1) << 10) |
((v11 & 1) << 11) |
((v12 & 1) << 12) |
((v13 & 1) << 13) |
((v14 & 1) << 14) |
((v15 & 1) << 15));
}
__mmask16 v;
FORCEINLINE operator __mmask16() const { return v; }
/************ mask **************/
struct __vec16_i1
{
__mmask16 v;
FORCEINLINE __vec16_i1() { }
FORCEINLINE __vec16_i1(const __mmask16 &vv) : v(vv) { }
FORCEINLINE __vec16_i1(bool v0, bool v1, bool v2, bool v3,
bool v4, bool v5, bool v6, bool v7,
bool v8, bool v9, bool v10, bool v11,
bool v12, bool v13, bool v14, bool v15) {
v = ((v0 & 1) |
((v1 & 1) << 1) |
((v2 & 1) << 2) |
((v3 & 1) << 3) |
((v4 & 1) << 4) |
((v5 & 1) << 5) |
((v6 & 1) << 6) |
((v7 & 1) << 7) |
((v8 & 1) << 8) |
((v9 & 1) << 9) |
((v10 & 1) << 10) |
((v11 & 1) << 11) |
((v12 & 1) << 12) |
((v13 & 1) << 13) |
((v14 & 1) << 14) |
((v15 & 1) << 15));
}
FORCEINLINE operator __mmask16() const { return v; }
};
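// Note: program-instance lane i maps to bit i of the __mmask16 (e.g. the constructor
// call __vec16_i1(true, false, ..., false) produces the mask value 0x0001), and the
// implicit __mmask16 conversion lets a __vec16_i1 be passed straight to the
// _mm512_k* mask intrinsics used below.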
/************ vector **************/
template <typename T>
struct vec16 {
vec16() { }
vec16(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) {
data[0] = v0; data[1] = v1; data[2] = v2; data[3] = v3;
data[4] = v4; data[5] = v5; data[6] = v6; data[7] = v7;
data[8] = v8; data[9] = v9; data[10] = v10; data[11] = v11;
data[12] = v12; data[13] = v13; data[14] = v14; data[15] = v15;
}
T data[16];
FORCEINLINE const T& operator[](const int i) const { return data[i]; }
FORCEINLINE T& operator[](const int i) { return data[i]; }
};
#if 0 /* evghenii:i32 */
struct PRE_ALIGN(64) __vec16_i32 : public vec16<int32_t> {
__vec16_i32() { }
__vec16_i32(int32_t v0, int32_t v1, int32_t v2, int32_t v3,
int32_t v4, int32_t v5, int32_t v6, int32_t v7,
int32_t v8, int32_t v9, int32_t v10, int32_t v11,
int32_t v12, int32_t v13, int32_t v14, int32_t v15)
: vec16<int32_t>(v0, v1, v2, v3, v4, v5, v6, v7,
v8, v9, v10, v11, v12, v13, v14, v15) { }
} POST_ALIGN(64);
#else /* evghenii:i32 */
struct PRE_ALIGN(64) __vec16_i32
{
__m512i v;
@@ -144,81 +123,43 @@ struct PRE_ALIGN(64) __vec16_i32
FORCEINLINE const int32_t& operator[](const int i) const { return ((int32_t*)this)[i]; }
FORCEINLINE int32_t& operator[](const int i) { return ((int32_t*)this)[i]; }
} POST_ALIGN(64);
#endif /* evghenii:i32 */
#if 0 /* evghenii::f */
PRE_ALIGN(64) struct __vec16_f : public vec16<float> {
__vec16_f() { }
__vec16_f(float v0, float v1, float v2, float v3,
float v4, float v5, float v6, float v7,
float v8, float v9, float v10, float v11,
float v12, float v13, float v14, float v15)
: vec16<float>(v0, v1, v2, v3, v4, v5, v6, v7,
v8, v9, v10, v11, v12, v13, v14, v15) { }
} POST_ALIGN(64);
#else /* evghenii::f */
PRE_ALIGN(64) struct __vec16_f
{
__m512 v;
FORCEINLINE operator __m512() const { return v; }
FORCEINLINE __vec16_f() : v(_mm512_undefined_ps()) { }
FORCEINLINE __vec16_f(const __m512 &in) : v(in) {}
FORCEINLINE __vec16_f(const __vec16_f &o) : v(o.v) {}
FORCEINLINE __vec16_f& operator =(const __vec16_f &o) { v=o.v; return *this; }
FORCEINLINE __vec16_f(float v00, float v01, float v02, float v03,
float v04, float v05, float v06, float v07,
float v08, float v09, float v10, float v11,
float v12, float v13, float v14, float v15) :
v ( _mm512_set_16to16_ps(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00) ) {}
FORCEINLINE const float& operator[](const int i) const { return ((float*)this)[i]; }
FORCEINLINE float& operator[](const int i) { return ((float*)this)[i]; }
} POST_ALIGN(64);
#endif /* evghenii::f */
#if 0 /* evghenii::d */
PRE_ALIGN(128) struct __vec16_d : public vec16<double> {
__vec16_d() { }
__vec16_d(double v0, double v1, double v2, double v3,
double v4, double v5, double v6, double v7,
double v8, double v9, double v10, double v11,
double v12, double v13, double v14, double v15)
: vec16<double>(v0, v1, v2, v3, v4, v5, v6, v7,
v8, v9, v10, v11, v12, v13, v14, v15) { }
} POST_ALIGN(128);
#else /* evghenii::d */
struct PRE_ALIGN(128) __vec16_d
{
__m512d v1;
__m512d v2;
FORCEINLINE __vec16_d() : v1(_mm512_undefined_pd()), v2(_mm512_undefined_pd()) {}
FORCEINLINE __vec16_d(const __m512d _v1, const __m512d _v2) : v1(_v1), v2(_v2) {}
FORCEINLINE __vec16_d(const __vec16_d &o) : v1(o.v1), v2(o.v2) {}
FORCEINLINE __vec16_d& operator =(const __vec16_d &o) { v1=o.v1; v2=o.v2; return *this; }
FORCEINLINE __vec16_d(double v00, double v01, double v02, double v03,
double v04, double v05, double v06, double v07,
double v08, double v09, double v10, double v11,
double v12, double v13, double v14, double v15) {
v1 = _mm512_set_8to8_pd(v15, v14, v13, v12, v11, v10, v09, v08);
v2 = _mm512_set_8to8_pd(v07, v06, v05, v04, v03, v02, v01, v00);
}
FORCEINLINE const double& operator[](const int i) const { return ((double*)this)[i]; }
FORCEINLINE double& operator[](const int i) { return ((double*)this)[i]; }
} POST_ALIGN(128);
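// __vec16_d is split across two 512-bit registers (v1, v2), eight doubles each;
// operator[] simply reinterprets the pair as a flat array of 16 lanes.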
#endif /* evghenii::d */
#if 0 /* evghenii::i64 */
PRE_ALIGN(128) struct __vec16_i64 : public vec16<int64_t> {
__vec16_i64() { }
__vec16_i64(int64_t v0, int64_t v1, int64_t v2, int64_t v3,
int64_t v4, int64_t v5, int64_t v6, int64_t v7,
int64_t v8, int64_t v9, int64_t v10, int64_t v11,
int64_t v12, int64_t v13, int64_t v14, int64_t v15)
: vec16<int64_t>(v0, v1, v2, v3, v4, v5, v6, v7,
v8, v9, v10, v11, v12, v13, v14, v15) { }
} POST_ALIGN(128);
#else /* evghenii::i64 */
struct PRE_ALIGN(128) __vec16_i64
{
union {
@@ -279,7 +220,24 @@ struct PRE_ALIGN(128) __vec16_i64
return __vec16_i64(_v1, _v2);
}
} POST_ALIGN(128);
#endif /* evghenii::i64 */
/************ scalar **************/
template <typename T>
struct vec16
{
FORCEINLINE vec16() { }
FORCEINLINE vec16(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) {
data[0] = v0; data[1] = v1; data[2] = v2; data[3] = v3;
data[4] = v4; data[5] = v5; data[6] = v6; data[7] = v7;
data[8] = v8; data[9] = v9; data[10] = v10; data[11] = v11;
data[12] = v12; data[13] = v13; data[14] = v14; data[15] = v15;
}
T data[16];
FORCEINLINE const T& operator[](const int i) const { return data[i]; }
FORCEINLINE T& operator[](const int i) { return data[i]; }
};
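// vec16<T> is the generic array-backed storage for widths without a native 512-bit
// representation; __vec16_i8 below inherits from it instead of wrapping a ZMM register.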
PRE_ALIGN(16) struct __vec16_i8 : public vec16<int8_t> {
__vec16_i8() { }
@@ -510,104 +468,54 @@ INSERT_EXTRACT(__vec1_f, float)
INSERT_EXTRACT(__vec1_d, double)
///////////////////////////////////////////////////////////////////////////
// mask ops
// mask
///////////////////////////////////////////////////////////////////////////
static FORCEINLINE __vec16_i1 __movmsk(__vec16_i1 mask) {
return _mm512_kmov(mask);
static FORCEINLINE __vec16_i1 __movmsk(__vec16_i1 mask) { return _mm512_kmov (mask); }
static FORCEINLINE bool __any (__vec16_i1 mask) { return !_mm512_kortestz(mask, mask); }
static FORCEINLINE bool __all (__vec16_i1 mask) { return _mm512_kortestc(mask, mask); }
static FORCEINLINE bool __none (__vec16_i1 mask) { return _mm512_kortestz(mask, mask); }
static FORCEINLINE __vec16_i1 __not (__vec16_i1 mask) { return _mm512_knot (mask); }
static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) { return _mm512_kxnor (a,b); }
static FORCEINLINE __vec16_i1 __and (__vec16_i1 a, __vec16_i1 b) { return _mm512_kand (a,b); }
static FORCEINLINE __vec16_i1 __xor (__vec16_i1 a, __vec16_i1 b) { return _mm512_kxor (a,b); }
static FORCEINLINE __vec16_i1 __or (__vec16_i1 a, __vec16_i1 b) { return _mm512_kor (a,b); }
static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) { return _mm512_kandn (a,b); }
static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) { return _mm512_kandnr(a,b); }
static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a, __vec16_i1 b) { return __or(__and(a, mask), __and_not2(b, mask)); }
static FORCEINLINE __vec16_i1 __select( bool cond, __vec16_i1 a, __vec16_i1 b) { return cond ? a : b; }
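// The masked select computes (a & mask) | (b & ~mask): _mm512_kandnr(b, mask)
// gives b & ~mask, so lanes set in mask come from a and cleared lanes from b.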
static FORCEINLINE bool __extract_element(__vec16_i1 vec, int index) { return (vec.v & (1 << index)) ? true : false; }
static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index, bool val)
{
if (val == false) vec->v &= ~(1 << index);
else vec->v |= (1 << index);
}
static FORCEINLINE bool __any(__vec16_i1 mask) {
return !_mm512_kortestz(mask, mask);
template <int ALIGN> static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p)
{
return *p;
}
static FORCEINLINE bool __all(__vec16_i1 mask) {
return _mm512_kortestc(mask, mask);
}
static FORCEINLINE bool __none(__vec16_i1 mask) {
return _mm512_kortestz(mask, mask);
}
static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) {
return _mm512_kxnor(a,b);
}
static FORCEINLINE __vec16_i1 __and(__vec16_i1 a, __vec16_i1 b) {
return _mm512_kand(a, b);
}
static FORCEINLINE __vec16_i1 __xor(__vec16_i1 a, __vec16_i1 b) {
return _mm512_kxor(a, b);
}
static FORCEINLINE __vec16_i1 __or(__vec16_i1 a, __vec16_i1 b) {
return _mm512_kor(a, b);
}
static FORCEINLINE __vec16_i1 __not(__vec16_i1 a) {
return _mm512_knot(a);
}
static FORCEINLINE __vec16_i1 __and_not1(__vec16_i1 a, __vec16_i1 b) {
return _mm512_kandn(a, b);
}
static FORCEINLINE __vec16_i1 __and_not2(__vec16_i1 a, __vec16_i1 b) {
return _mm512_kandnr(a, b);
}
static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a,
__vec16_i1 b) {
// return ((a & mask) | (b & ~mask));
return __or(__and(a, mask), __and_not2(b, mask));
}
static FORCEINLINE __vec16_i1 __select(bool cond, __vec16_i1 a, __vec16_i1 b) {
return cond ? a : b;
}
static FORCEINLINE bool __extract_element(__vec16_i1 vec, int index) {
return (vec.v & (1 << index)) ? true : false;
}
static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index,
bool val) {
if (val == false)
vec->v &= ~(1 << index);
else
vec->v |= (1 << index);
}
template <int ALIGN> static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) {
uint16_t *ptr = (uint16_t *)p;
__vec16_i1 r;
r.v = *ptr;
return r;
}
template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v) {
uint16_t *ptr = (uint16_t *)p;
*ptr = v.v;
template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v)
{
*p = v;
}
template <class RetVecType> RetVecType __smear_i1(int i);
template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) {
return i?0xFFFF:0x0;
}
template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { return i?0xFFFF:0x0; }
template <class RetVecType> RetVecType __setzero_i1();
template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() {
return 0;
}
template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { return 0; }
template <class RetVecType> __vec16_i1 __undef_i1();
template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() {
return __vec16_i1();
}
template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { return __vec16_i1(); }
///////////////////////////////////////////////////////////////////////////
// int8
///////////////////////////////////////////////////////////////////////////
BINARY_OP(__vec16_i8, __add, +)
BINARY_OP(__vec16_i8, __sub, -)
@@ -653,6 +561,7 @@ LOAD_STORE(__vec16_i8, int8_t)
///////////////////////////////////////////////////////////////////////////
// int16
///////////////////////////////////////////////////////////////////////////
BINARY_OP(__vec16_i16, __add, +)
BINARY_OP(__vec16_i16, __sub, -)
@@ -696,232 +605,57 @@ ROTATE(__vec16_i16, i16, int16_t)
SHUFFLES(__vec16_i16, i16, int16_t)
LOAD_STORE(__vec16_i16, int16_t)
#if 0 /* evghenii::int32 */
///////////////////////////////////////////////////////////////////////////
// int32
BINARY_OP(__vec16_i32, __add, +)
BINARY_OP(__vec16_i32, __sub, -)
BINARY_OP(__vec16_i32, __mul, *)
BINARY_OP(__vec16_i32, __or, |)
BINARY_OP(__vec16_i32, __and, &)
BINARY_OP(__vec16_i32, __xor, ^)
BINARY_OP(__vec16_i32, __shl, <<)
BINARY_OP_CAST(__vec16_i32, uint32_t, __udiv, /)
BINARY_OP_CAST(__vec16_i32, int32_t, __sdiv, /)
BINARY_OP_CAST(__vec16_i32, uint32_t, __urem, %)
BINARY_OP_CAST(__vec16_i32, int32_t, __srem, %)
BINARY_OP_CAST(__vec16_i32, uint32_t, __lshr, >>)
BINARY_OP_CAST(__vec16_i32, int32_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i32, uint32_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i32, int32_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i32, int32_t, __shl, <<)
CMP_OP(__vec16_i32, i32, int32_t, __equal, ==)
CMP_OP(__vec16_i32, i32, int32_t, __not_equal, !=)
CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i32, i32, int32_t, __signed_less_equal, <=)
CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_less_than, <)
CMP_OP(__vec16_i32, i32, int32_t, __signed_less_than, <)
CMP_OP(__vec16_i32, i32, uint32_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_than, >)
SELECT(__vec16_i32)
INSERT_EXTRACT(__vec16_i32, int32_t)
SMEAR(__vec16_i32, i32, int32_t)
SETZERO(__vec16_i32, i32)
UNDEF(__vec16_i32, i32)
BROADCAST(__vec16_i32, i32, int32_t)
ROTATE(__vec16_i32, i32, int32_t)
SHUFFLES(__vec16_i32, i32, int32_t)
LOAD_STORE(__vec16_i32, int32_t)
#else /* evghenii::int32 */
///////////////////////////////////////////////////////////////////////////
// int32
///////////////////////////////////////////////////////////////////////////
static FORCEINLINE __vec16_i32 __add(__vec16_i32 a, __vec16_i32 b) {
return _mm512_add_epi32(a, b);
}
static FORCEINLINE __vec16_i32 __add (__vec16_i32 a, __vec16_i32 b) { return _mm512_add_epi32 (a,b); }
static FORCEINLINE __vec16_i32 __sub (__vec16_i32 a, __vec16_i32 b) { return _mm512_sub_epi32 (a,b); }
static FORCEINLINE __vec16_i32 __mul (__vec16_i32 a, __vec16_i32 b) { return _mm512_mullo_epi32(a,b); }
static FORCEINLINE __vec16_i32 __udiv(__vec16_i32 a, __vec16_i32 b) { return _mm512_div_epu32 (a,b); }
static FORCEINLINE __vec16_i32 __sdiv(__vec16_i32 a, __vec16_i32 b) { return _mm512_div_epi32 (a,b); }
static FORCEINLINE __vec16_i32 __urem(__vec16_i32 a, __vec16_i32 b) { return _mm512_rem_epu32 (a,b); }
static FORCEINLINE __vec16_i32 __srem(__vec16_i32 a, __vec16_i32 b) { return _mm512_rem_epi32 (a,b); }
static FORCEINLINE __vec16_i32 __or (__vec16_i32 a, __vec16_i32 b) { return _mm512_or_epi32 (a,b); }
static FORCEINLINE __vec16_i32 __and (__vec16_i32 a, __vec16_i32 b) { return _mm512_and_epi32 (a,b); }
static FORCEINLINE __vec16_i32 __xor (__vec16_i32 a, __vec16_i32 b) { return _mm512_xor_epi32 (a,b); }
static FORCEINLINE __vec16_i32 __shl (__vec16_i32 a, __vec16_i32 b) { return _mm512_sllv_epi32 (a,b); }
static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, __vec16_i32 b) { return _mm512_srlv_epi32 (a,b); }
static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, __vec16_i32 b) { return _mm512_srav_epi32 (a,b); }
static FORCEINLINE __vec16_i32 __shl (__vec16_i32 a, int32_t n) { return _mm512_slli_epi32 (a,n); }
static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, int32_t n) { return _mm512_srli_epi32 (a,n); }
static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, int32_t n) { return _mm512_srai_epi32 (a,n); }
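// A vector shift count uses the per-lane variable forms (sllv/srlv/srav); a uniform
// int32_t count uses the single-count forms (slli/srli/srai).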
static FORCEINLINE __vec16_i32 __sub(__vec16_i32 a, __vec16_i32 b) {
return _mm512_sub_epi32(a, b);
}
static FORCEINLINE __vec16_i1 __equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpeq_epi32_mask (a,b); }
static FORCEINLINE __vec16_i1 __not_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpneq_epi32_mask(a,b); }
static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmple_epu32_mask (a,b); }
static FORCEINLINE __vec16_i1 __signed_less_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmple_epi32_mask (a,b); }
static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpge_epu32_mask (a,b); }
static FORCEINLINE __vec16_i1 __signed_greater_equal_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpge_epi32_mask (a,b); }
static FORCEINLINE __vec16_i1 __unsigned_less_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmplt_epu32_mask (a,b); }
static FORCEINLINE __vec16_i1 __signed_less_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmplt_epi32_mask (a,b); }
static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpgt_epu32_mask (a,b); }
static FORCEINLINE __vec16_i1 __signed_greater_than_i32 (__vec16_i32 a, __vec16_i32 b) { return _mm512_cmpgt_epi32_mask (a,b); }
static FORCEINLINE __vec16_i32 __mul(__vec16_i32 a, __vec16_i32 b) {
return _mm512_mullo_epi32(a, b);
}
static FORCEINLINE __vec16_i1 __equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpeq_epi32_mask (m,a,b); }
static FORCEINLINE __vec16_i1 __not_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpneq_epi32_mask(m,a,b); }
static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmple_epu32_mask (m,a,b); }
static FORCEINLINE __vec16_i1 __signed_less_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmple_epi32_mask (m,a,b); }
static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpge_epu32_mask (m,a,b); }
static FORCEINLINE __vec16_i1 __signed_greater_equal_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpge_epi32_mask (m,a,b); }
static FORCEINLINE __vec16_i1 __unsigned_less_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmplt_epu32_mask (m,a,b); }
static FORCEINLINE __vec16_i1 __signed_less_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmplt_epi32_mask (m,a,b); }
static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpgt_epu32_mask (m,a,b); }
static FORCEINLINE __vec16_i1 __signed_greater_than_i32_and_mask (__vec16_i32 a, __vec16_i32 b, __vec16_i1 m) { return _mm512_mask_cmpgt_epi32_mask (m,a,b); }
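// The *_and_mask variants forward to the _mm512_mask_cmp*_mask forms: lanes that are
// clear in m skip the comparison and come back as 0 in the result mask.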
static FORCEINLINE __vec16_i32 __udiv(__vec16_i32 a, __vec16_i32 b) {
return _mm512_div_epu32(a, b);
}
static FORCEINLINE __vec16_i32 __select(__vec16_i1 mask, __vec16_i32 a, __vec16_i32 b) { return _mm512_mask_mov_epi32(b, mask, a); }
static FORCEINLINE __vec16_i32 __select( bool cond, __vec16_i32 a, __vec16_i32 b) { return cond ? a : b; }
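// _mm512_mask_mov_epi32(b, mask, a) keeps b's element wherever the mask bit is clear
// and takes a's element where it is set, i.e. a branch-free per-lane select.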
static FORCEINLINE __vec16_i32 __sdiv(__vec16_i32 a, __vec16_i32 b) {
return _mm512_div_epi32(a, b);
}
static FORCEINLINE __vec16_i32 __urem(__vec16_i32 a, __vec16_i32 b) {
return _mm512_rem_epu32(a, b);
}
static FORCEINLINE __vec16_i32 __srem(__vec16_i32 a, __vec16_i32 b) {
return _mm512_rem_epi32(a, b);
}
static FORCEINLINE __vec16_i32 __or(__vec16_i32 a, __vec16_i32 b) {
return _mm512_or_epi32(a, b);
}
static FORCEINLINE __vec16_i32 __and(__vec16_i32 a, __vec16_i32 b) {
return _mm512_and_epi32(a, b);
}
static FORCEINLINE __vec16_i32 __xor(__vec16_i32 a, __vec16_i32 b) {
return _mm512_xor_epi32(a, b);
}
static FORCEINLINE __vec16_i32 __shl(__vec16_i32 a, __vec16_i32 b) {
return _mm512_sllv_epi32(a, b);
}
static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, __vec16_i32 b) {
return _mm512_srlv_epi32(a, b);
}
static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, __vec16_i32 b) {
return _mm512_srav_epi32(a, b);
}
static FORCEINLINE __vec16_i32 __shl(__vec16_i32 a, int32_t n) {
return _mm512_slli_epi32(a, n);
}
static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, int32_t n) {
return _mm512_srli_epi32(a, n);
}
static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, int32_t n) {
return _mm512_srai_epi32(a, n);
}
static FORCEINLINE __vec16_i1 __equal_i32(const __vec16_i32 &a, const __vec16_i32 &b) {
return _mm512_cmpeq_epi32_mask(a, b);
}
static FORCEINLINE __vec16_i1 __equal_i32_and_mask(const __vec16_i32 &a, const __vec16_i32 &b,
__vec16_i1 m) {
return _mm512_mask_cmpeq_epi32_mask(m, a, b);
}
static FORCEINLINE __vec16_i1 __not_equal_i32(__vec16_i32 a, __vec16_i32 b) {
return _mm512_cmpneq_epi32_mask(a, b);
}
static FORCEINLINE __vec16_i1 __not_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
__vec16_i1 m) {
return _mm512_mask_cmpneq_epi32_mask(m, a, b);
}
static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32(__vec16_i32 a, __vec16_i32 b) {
return _mm512_cmple_epu32_mask(a, b);
}
static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
__vec16_i1 m) {
return _mm512_mask_cmple_epu32_mask(m, a, b);
}
static FORCEINLINE __vec16_i1 __signed_less_equal_i32(__vec16_i32 a, __vec16_i32 b) {
return _mm512_cmple_epi32_mask(a, b);
}
static FORCEINLINE __vec16_i1 __signed_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
__vec16_i1 m) {
return _mm512_mask_cmple_epi32_mask(m, a, b);
}
static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) {
return _mm512_cmpge_epu32_mask(a, b);
}
static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
__vec16_i1 m) {
return _mm512_mask_cmpge_epu32_mask(m, a, b);
}
static FORCEINLINE __vec16_i1 __signed_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) {
return _mm512_cmpge_epi32_mask(a, b);
}
static FORCEINLINE __vec16_i1 __signed_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
__vec16_i1 m) {
return _mm512_mask_cmpge_epi32_mask(m, a, b);
}
static FORCEINLINE __vec16_i1 __unsigned_less_than_i32(__vec16_i32 a, __vec16_i32 b) {
return _mm512_cmplt_epu32_mask(a, b);
}
static FORCEINLINE __vec16_i1 __unsigned_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
__vec16_i1 m) {
return _mm512_mask_cmplt_epu32_mask(m, a, b);
}
static FORCEINLINE __vec16_i1 __signed_less_than_i32(__vec16_i32 a, __vec16_i32 b) {
return _mm512_cmplt_epi32_mask(a, b);
}
static FORCEINLINE __vec16_i1 __signed_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
__vec16_i1 m) {
return _mm512_mask_cmplt_epi32_mask(m, a, b);
}
static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32(__vec16_i32 a, __vec16_i32 b) {
return _mm512_cmpgt_epu32_mask(a, b);
}
static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
__vec16_i1 m) {
return _mm512_mask_cmpgt_epu32_mask(m, a, b);
}
static FORCEINLINE __vec16_i1 __signed_greater_than_i32(__vec16_i32 a, __vec16_i32 b) {
return _mm512_cmpgt_epi32_mask(a, b);
}
static FORCEINLINE __vec16_i1 __signed_greater_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
__vec16_i1 m) {
return _mm512_mask_cmpgt_epi32_mask(m, a, b);
}
static FORCEINLINE __vec16_i32 __select(__vec16_i1 mask,
__vec16_i32 a, __vec16_i32 b) {
return _mm512_mask_mov_epi32(b.v, mask, a.v);
}
static FORCEINLINE __vec16_i32 __select(bool cond, __vec16_i32 a, __vec16_i32 b) {
return cond ? a : b;
}
static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int index) { //uint32_t index) {
return ((int32_t *)&v)[index];
}
static FORCEINLINE void __insert_element(__vec16_i32 *v, uint32_t index, int32_t val) {
((int32_t *)v)[index] = val;
}
static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int32_t index) { return v[index]; }
static FORCEINLINE void __insert_element (__vec16_i32 *v, uint32_t index, int32_t val) { (*v)[index] = val; }
template <class RetVecType> RetVecType __smear_i32(int32_t i);
template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) {
return _mm512_set1_epi32(i);
}
template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); }
static const __vec16_i32 __ispc_one = __smear_i32<__vec16_i32>(1);
static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32);
@@ -929,66 +663,56 @@ static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1);
static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
template <class RetVecType> RetVecType __setzero_i32();
template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() {
return _mm512_setzero_epi32();
}
template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { return _mm512_setzero_epi32(); }
template <class RetVecType> RetVecType __undef_i32();
template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() {
return __vec16_i32();
template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { return __vec16_i32(); }
static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) { return _mm512_mask_permutevar_epi32(v, 0xFFFF, _mm512_set1_epi32(index), v); }
static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index)
{
__vec16_i32 idx = __smear_i32<__vec16_i32>(index);
__vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0xF));
return _mm512_mask_permutevar_epi32(v, 0xFFFF, shuffle, v);
}
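// Rotation is a permute: output lane i reads source lane (i + index) & 0xF, so e.g.
// __rotate_i32(v, 1) yields { v[1], v[2], ..., v[15], v[0] }.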
static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) {
int32_t val = __extract_element(v, index & 0xf);
return _mm512_set1_epi32(val);
static FORCEINLINE __vec16_i32 __shuffle_i32 (__vec16_i32 v, __vec16_i32 index)
{
return _mm512_mask_permutevar_epi32(v, 0xFFFF, __and(index, __smear_i32<__vec16_i32>(0xF)), v);
}
static FORCEINLINE __vec16_i32 __shuffle2_i32(__vec16_i32 v0, __vec16_i32 v1, __vec16_i32 index)
{
const __vec16_i1 mask = __signed_less_than_i32(index, __smear_i32<__vec16_i32>(0x10));
index = __and(index, __smear_i32<__vec16_i32>(0xF));
__vec16_i32 ret = __undef_i32<__vec16_i32>();
ret = _mm512_mask_permutevar_epi32(ret, mask, index, v0);
ret = _mm512_mask_permutevar_epi32(ret, __not(mask), index, v1);
return ret;
}
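// Two-source shuffle: per-lane indices 0..15 select from v0 and 16..31 select from v1;
// the index is masked to 4 bits before each permute.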
#if 0 /* evghenii::doesn't work */
static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index) {
__vec16_i32 idx = __smear_i32<__vec16_i32>(index);
__vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0x7));
return _mm512_mask_permutevar_epi32(v, 0xffff, shuffle, v);
}
#else
ROTATE(__vec16_i32, i32, int32_t)
#endif
static FORCEINLINE __vec16_i32 __shuffle_i32(__vec16_i32 v, __vec16_i32 index) {
return _mm512_mask_permutevar_epi32(v, 0xffff, index, v);
}
SHUFFLE2(__vec16_i32, i32, int32_t) /* evghenii::to implement */
template <int ALIGN> static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) {
template <int ALIGN> static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p)
{
#ifdef ISPC_FORCE_ALIGNED_MEMORY
return _mm512_load_epi32(p);
return __load<64>(p);
#else
__vec16_i32 v;
v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
v = _mm512_extloadunpackhi_epi32(v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
return v;
__vec16_i32 v;
v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
v = _mm512_extloadunpackhi_epi32(v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
return v;
#endif
}
template <int ALIGN> static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 v) {
template <int ALIGN> static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 v)
{
#ifdef ISPC_FORCE_ALIGNED_MEMORY
_mm512_store_epi32(p, v);
__store<64>(p,v);
#else
_mm512_extpackstorelo_epi32(p, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
_mm512_extpackstorehi_epi32((uint8_t*)p+64, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
_mm512_extpackstorelo_epi32( p, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
_mm512_extpackstorehi_epi32((uint8_t*)p+64, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
#endif
}
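// Without ISPC_FORCE_ALIGNED_MEMORY, loads and stores go through the KNC
// unpacklo/unpackhi and packstorelo/packstorehi pairs, the usual two-instruction
// idiom for accesses that may straddle a 64-byte boundary.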
#if 0
template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) {
return _mm512_load_epi32(p);
}
template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) {
_mm512_store_epi32(p, v);
}
#endif
#endif /* evghenii::int32 */
///////////////////////////////////////////////////////////////////////////
// int64
// evghenii::int64