- added 'cast_trunc' and 'cast_zext' sets of functions

- commented out the faulty [] overload for vec16_i64
- added loop-based function macros for the vec16_i8/16 data types
Anton Mitrokhin
2014-11-09 11:31:14 +04:00
parent 83f3ee7cfa
commit e7717e58b5
2 changed files with 187 additions and 168 deletions


@@ -201,8 +201,13 @@ typedef struct PRE_ALIGN(64) __vec16_i64 {
_mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0),
v2);
}
FORCEINLINE const int64_t& operator[](const int i) const { return ((int64_t*)this)[i]; }
FORCEINLINE int64_t& operator[](const int i) { return ((int64_t*)this)[i]; }
// TODO: The previous implementation was faulty as it assumed a different data layout.
// Here the integers in v_hi and v_lo are not stored sequentially (as in vec16_d)
// but split across two registers: the high 32 bits in v_hi and the low 32 bits in v_lo.
//FORCEINLINE const int64_t& operator[](const int i) const {
// return ((uint64_t(((int32_t*)this)[i])<<32)+((int32_t*)this)[i+16]); }
//FORCEINLINE int64_t& operator[](const int i) {
// return ((uint64_t(((int32_t*)this)[i])<<32)+((int32_t*)this)[i+16]); }
__m512i v_hi;
__m512i v_lo;
} POST_ALIGN(64) __vec16_i64;
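// For illustration only: a minimal sketch of how a lane of the split hi/lo layout
// described in the TODO above would be reassembled. Because a 64-bit lane only exists
// once its two 32-bit halves are combined, a reference-returning operator[] cannot
// work for this layout. __extract_element_i64_split is a hypothetical helper name.
static FORCEINLINE int64_t __extract_element_i64_split(const __vec16_i64 &v, int i) {
    uint32_t hi = ((const uint32_t *)&v.v_hi)[i]; // upper 32 bits of lane i
    uint32_t lo = ((const uint32_t *)&v.v_lo)[i]; // lower 32 bits of lane i
    return (int64_t)(((uint64_t)hi << 32) | lo);  // the value exists only as a temporary
}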
@@ -217,8 +222,8 @@ struct vec16 {
v[8] = v8; v[9] = v9; v[10] = v10; v[11] = v11;
v[12] = v12; v[13] = v13; v[14] = v14; v[15] = v15;
}
FORCEINLINE const T& operator[](const int i) const { return v[i]; }
FORCEINLINE T& operator[](const int i) { return v[i]; }
T v[16];
};
@@ -451,6 +456,7 @@ template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() {
return __vec16_i1();
}
///////////////////////////////////////////////////////////////////////////
// int32
///////////////////////////////////////////////////////////////////////////
@@ -651,10 +657,37 @@ static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) {
return _mm512_set1_epi32(val);
}
static FORCEINLINE __vec16_i8 __cast_trunc(__vec16_i8, const __vec16_i16 i16) {
return __vec16_i8((uint8_t)i16[0], (uint8_t)i16[1], (uint8_t)i16[2], (uint8_t)i16[3],
(uint8_t)i16[4], (uint8_t)i16[5], (uint8_t)i16[6], (uint8_t)i16[7],
(uint8_t)i16[8], (uint8_t)i16[9], (uint8_t)i16[10], (uint8_t)i16[11],
(uint8_t)i16[12], (uint8_t)i16[13], (uint8_t)i16[14], (uint8_t)i16[15]);
}
static FORCEINLINE __vec16_i16 __cast_trunc(__vec16_i16, const __vec16_i32 i32) {
__vec16_i16 ret;
_mm512_extstore_epi32(ret.v, i32, _MM_DOWNCONV_EPI32_UINT16, _MM_HINT_NONE);
return ret;
}
static FORCEINLINE __vec16_i8 __cast_trunc(__vec16_i8, const __vec16_i32 i32) {
__vec16_i8 ret;
_mm512_extstore_epi32(ret.v, i32, _MM_DOWNCONV_EPI32_UINT8, _MM_HINT_NONE);
return ret;
}
static FORCEINLINE __vec16_i32 __cast_trunc(__vec16_i32, const __vec16_i64 i64) {
return __vec16_i32(i64.v_lo);
}
static FORCEINLINE __vec16_i16 __cast_trunc(__vec16_i16, const __vec16_i64 i64) {
return __cast_trunc(__vec16_i16(), i64.v_lo);
}
static FORCEINLINE __vec16_i8 __cast_trunc(__vec16_i8, const __vec16_i64 i64) {
return __cast_trunc(__vec16_i8(), i64.v_lo);
}
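// For illustration only: a loop-based reference for the i64 truncations above,
// assuming the low 32 bits of every lane live in v_lo, so truncation only reads v_lo.
// __cast_trunc_i64_to_i8_ref is a hypothetical name used purely for comparison.
static FORCEINLINE __vec16_i8 __cast_trunc_i64_to_i8_ref(const __vec16_i64 &i64) {
    __vec16_i8 ret;
    for (int i = 0; i < 16; ++i)
        ret[i] = (uint8_t)(((const uint32_t *)&i64.v_lo)[i]); // keep the lowest 8 bits of lane i
    return ret;
}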
static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index) {
__vec16_i32 idx = __smear_i32<__vec16_i32>(index);
__vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0xf));
@@ -709,7 +742,7 @@ template <> FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) {
///////////////////////////////////////////////////////////////////////////
// int64
///////////////////////////////////////////////////////////////////////////
static FORCEINLINE
void __masked_store_i64(void *p, const __vec16_i64 &v, __vec16_i1 mask)
{
__m512i v1;
@@ -1465,12 +1498,7 @@ template <> FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) {
static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val)
{
return __vec16_i64(val.v,_mm512_srai_epi32(val.v,31));
}
static FORCEINLINE __vec16_i64 __cast_zext(const __vec16_i64 &, const __vec16_i32 &val)
{
return __vec16_i64(val.v, _mm512_setzero_epi32());
}
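// For illustration only: a per-lane view of the sext/zext pair above.
// _mm512_srai_epi32(val, 31) smears the sign bit across a full 32-bit word,
// i.e. it produces the high half of the sign-extended 64-bit value; zero
// extension uses an all-zero high half instead. The helper names are hypothetical.
static FORCEINLINE int64_t __sext_lane_ref(int32_t x) {
    int32_t hi = x >> 31; // 0 for non-negative x, all ones for negative x
    return (int64_t)(((uint64_t)(uint32_t)hi << 32) | (uint32_t)x);
}
static FORCEINLINE int64_t __zext_lane_ref(int32_t x) {
    return (int64_t)(uint32_t)x; // high 32 bits are zero
}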
static FORCEINLINE __vec16_i32 __cast_sext(const __vec16_i32 &, const __vec16_i1 &val)
@@ -1480,6 +1508,12 @@ static FORCEINLINE __vec16_i32 __cast_sext(const __vec16_i32 &, const __vec16_i1
return _mm512_mask_mov_epi32(ret, val, one);
}
static FORCEINLINE __vec16_i16 __cast_zext(const __vec16_i16 &, const __vec16_i8 &val)
{
return __vec16_i16(val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7],
val[8], val[9], val[10], val[11], val[12], val[13], val[14], val[15]);
}
static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1 &val)
{
__vec16_i32 ret = _mm512_setzero_epi32();
@@ -1487,6 +1521,31 @@ static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1
return _mm512_mask_mov_epi32(ret, val, one);
}
static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i8 &val)
{
return _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE);
}
static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i16 &val)
{
return _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_UINT16, _MM_BROADCAST32_NONE, _MM_HINT_NONE);
}
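// For illustration only: the two extload-based widenings above up-convert each
// 8- or 16-bit element into its own zero-extended 32-bit lane. A scalar reference
// for the i8 case (the i16 case is analogous); the helper name is hypothetical
// and it writes plain int32_t values.
static FORCEINLINE void __cast_zext_i8_to_i32_ref(const __vec16_i8 &val, int32_t out[16]) {
    for (int i = 0; i < 16; ++i)
        out[i] = (int32_t)(uint8_t)val[i]; // zero-extend: the high 24 bits are 0
}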
static FORCEINLINE __vec16_i64 __cast_zext(const __vec16_i64 &, const __vec16_i8 &val)
{
return __vec16_i64(__cast_zext(__vec16_i32(), val), _mm512_setzero_epi32());
}
static FORCEINLINE __vec16_i64 __cast_zext(const __vec16_i64 &, const __vec16_i16 &val)
{
return __vec16_i64(__cast_zext(__vec16_i32(), val), _mm512_setzero_epi32());
}
static FORCEINLINE __vec16_i64 __cast_zext(const __vec16_i64 &, const __vec16_i32 &val)
{
return __vec16_i64(val.v, _mm512_setzero_epi32());
}
static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i8 val) {
return _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE);
}
@@ -1728,15 +1787,85 @@ static FORCEINLINE __vec16_d __cast_bits(__vec16_d, __vec16_i64 val) {
return ret;
}
///////////////////////////////////////////////////////////////////////////
// templates for int8/16 operations
///////////////////////////////////////////////////////////////////////////
#define BINARY_OP(TYPE, NAME, OP) \
static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \
TYPE ret; \
for (int i = 0; i < 16; ++i) \
ret[i] = a[i] OP b[i]; \
return ret; \
}
/* knc::macro::used */
#define BINARY_OP_CAST(TYPE, CAST, NAME, OP) \
static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \
TYPE ret; \
for (int i = 0; i < 16; ++i) \
ret[i] = (CAST)(a[i]) OP (CAST)(b[i]); \
return ret; \
}
#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP) \
static FORCEINLINE __vec16_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \
__vec16_i1 ret; \
ret.v = 0; \
for (int i = 0; i < 16; ++i) \
ret.v |= ((CAST)(a[i]) OP (CAST)(b[i])) << i; \
return ret; \
}
#define SHIFT_UNIFORM(TYPE, CAST, NAME, OP) \
static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \
TYPE ret; \
for (int i = 0; i < 16; ++i) \
ret[i] = (CAST)(a[i]) OP b; \
return ret; \
}
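// For reference, BINARY_OP(__vec16_i8, __add, +) from the block above expands to
// a plain per-lane loop (shown here as a comment to avoid redefinition):
//
//   static FORCEINLINE __vec16_i8 __add(__vec16_i8 a, __vec16_i8 b) {
//       __vec16_i8 ret;
//       for (int i = 0; i < 16; ++i)
//           ret[i] = a[i] + b[i];
//       return ret;
//   }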
///////////////////////////////////////////////////////////////////////////
// int8
///////////////////////////////////////////////////////////////////////////
template <class RetVecType> static RetVecType __setzero_i8();
template <> FORCEINLINE __vec16_i8 __setzero_i8<__vec16_i8>() {
return __vec16_i8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
BINARY_OP(__vec16_i8, __add, +)
BINARY_OP(__vec16_i8, __sub, -)
BINARY_OP(__vec16_i8, __mul, *)
BINARY_OP(__vec16_i8, __or, |)
BINARY_OP(__vec16_i8, __and, &)
BINARY_OP(__vec16_i8, __xor, ^)
BINARY_OP(__vec16_i8, __shl, <<)
BINARY_OP_CAST(__vec16_i8, uint8_t, __udiv, /)
BINARY_OP_CAST(__vec16_i8, int8_t, __sdiv, /)
BINARY_OP_CAST(__vec16_i8, uint8_t, __urem, %)
BINARY_OP_CAST(__vec16_i8, int8_t, __srem, %)
BINARY_OP_CAST(__vec16_i8, uint8_t, __lshr, >>)
BINARY_OP_CAST(__vec16_i8, int8_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i8, uint8_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i8, int8_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i8, int8_t, __shl, <<)
CMP_OP(__vec16_i8, i8, int8_t, __equal, ==)
CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i8, i8, int8_t, __signed_less_equal, <=)
CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_than, <)
CMP_OP(__vec16_i8, i8, int8_t, __signed_less_than, <)
CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_than, >)
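// For reference, CMP_OP(__vec16_i8, i8, uint8_t, __unsigned_less_than, <) above
// expands to a function that packs one compare result per bit of the 16-lane mask
// (shown here as a comment to avoid redefinition):
//
//   static FORCEINLINE __vec16_i1 __unsigned_less_than_i8(__vec16_i8 a, __vec16_i8 b) {
//       __vec16_i1 ret;
//       ret.v = 0;
//       for (int i = 0; i < 16; ++i)
//           ret.v |= ((uint8_t)(a[i]) < (uint8_t)(b[i])) << i;
//       return ret;
//   }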
static FORCEINLINE __vec16_i1 __not_equal_i8(__vec16_i8 a, __vec16_i8 b) {
__vec16_i32 tmp_a = _mm512_extload_epi32(&a, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE);
__vec16_i32 tmp_b = _mm512_extload_epi32(&b, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE);
@@ -1780,11 +1909,45 @@ static FORCEINLINE __vec16_i8 __shuffle2_i8(__vec16_i8 v0, __vec16_i8 v1, __vec1
// int16
///////////////////////////////////////////////////////////////////////////
template <class RetVecType> static RetVecType __setzero_i16();
template <> FORCEINLINE __vec16_i16 __setzero_i16<__vec16_i16>() {
return __vec16_i16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
BINARY_OP(__vec16_i16, __add, +)
BINARY_OP(__vec16_i16, __sub, -)
BINARY_OP(__vec16_i16, __mul, *)
BINARY_OP(__vec16_i16, __or, |)
BINARY_OP(__vec16_i16, __and, &)
BINARY_OP(__vec16_i16, __xor, ^)
BINARY_OP(__vec16_i16, __shl, <<)
BINARY_OP_CAST(__vec16_i16, uint16_t, __udiv, /)
BINARY_OP_CAST(__vec16_i16, int16_t, __sdiv, /)
BINARY_OP_CAST(__vec16_i16, uint16_t, __urem, %)
BINARY_OP_CAST(__vec16_i16, int16_t, __srem, %)
BINARY_OP_CAST(__vec16_i16, uint16_t, __lshr, >>)
BINARY_OP_CAST(__vec16_i16, int16_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i16, uint16_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i16, int16_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i16, int16_t, __shl, <<)
CMP_OP(__vec16_i16, i16, int16_t, __equal, ==)
CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i16, i16, int16_t, __signed_less_equal, <=)
CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_equal, >=)
CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_equal, >=)
CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_less_than, <)
CMP_OP(__vec16_i16, i16, int16_t, __signed_less_than, <)
CMP_OP(__vec16_i16, i16, uint16_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_than, >)
static FORCEINLINE __vec16_i1 __not_equal_i16(__vec16_i16 a, __vec16_i16 b) {
__vec16_i32 tmp_a = _mm512_extload_epi32(&a, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NONE);
__vec16_i32 tmp_b = _mm512_extload_epi32(&b, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NONE);