Added a few functions: __setzero_i64() __cast_sext(__vec16_i64, __vec16_i32), __cast_zext(__vec16_i32)
__min_varying_int32(), __min_varying_uint32(), __max_varying_int32(), __max_varying_uint32(). Fixed the signature of __smear_i64() to match current codegen.
This commit is contained in:
@@ -803,6 +803,13 @@ template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) {
|
||||
// int64
|
||||
|
||||
|
||||
// Produce a 16-wide i64 vector with every lane cleared to zero.
// The 64-bit lanes are represented as two 512-bit registers holding the
// low and high 32-bit halves respectively; both are zeroed here.
static FORCEINLINE __vec16_i64 __setzero_i64() {
    const __m512i zero = _mm512_setzero_epi32();
    return __vec16_i64(zero, zero);
}
|
||||
|
||||
static FORCEINLINE __vec16_i64 __add(const __vec16_i64 &a, const __vec16_i64 &b)
|
||||
{
|
||||
__mmask16 carry = 0;
|
||||
@@ -878,7 +885,7 @@ static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, int index)
|
||||
return src[index+16] | (int64_t(src[index]) << 32);
|
||||
}
|
||||
|
||||
// Broadcast the 64-bit scalar l into all 16 lanes of a __vec16_i64.
// The stale overload __smear_i64(__vec16_i64, const int64_t &) no longer
// matches current codegen and has been removed; keeping both signatures
// would leave a dead, conflicting declaration in the header.
static FORCEINLINE __vec16_i64 __smear_i64(const int64_t &l) {
    // View the scalar as two 32-bit halves and broadcast each half into
    // the corresponding 512-bit register of the split i64 representation.
    // NOTE(review): assumes little-endian host layout (i[0] = low word,
    // i[1] = high word), consistent with the other i64 helpers here --
    // confirm if this header is ever built for a big-endian target.
    const int *i = (const int *)&l;
    return __vec16_i64(_mm512_set_1to16_epi32(i[0]),
                       _mm512_set_1to16_epi32(i[1]));
}
|
||||
@@ -1373,6 +1380,11 @@ CAST(__vec16_i32, int32_t, __vec16_i16, int16_t, __cast_sext)
|
||||
CAST(__vec16_i32, int32_t, __vec16_i8, int8_t, __cast_sext)
|
||||
CAST(__vec16_i16, int16_t, __vec16_i8, int8_t, __cast_sext)
|
||||
|
||||
// Sign-extend each signed 32-bit lane to 64 bits. The low half of the
// result keeps the source lanes; the high half is filled with each lane's
// sign bit replicated across all 32 positions. The first (dummy) argument
// only selects this overload.
static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val)
{
    const __m512i lo = val.v;
    // Arithmetic right shift by 31 replicates the sign bit into every bit.
    const __m512i hi = _mm512_srai_epi32(val.v, 31);
    return __vec16_i64(lo, hi);
}
|
||||
|
||||
#define CAST_SEXT_I1(TYPE)
|
||||
/*
|
||||
static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) { \
|
||||
@@ -1389,11 +1401,6 @@ CAST_SEXT_I1(__vec16_i8)
|
||||
CAST_SEXT_I1(__vec16_i16)
|
||||
CAST_SEXT_I1(__vec16_i32)
|
||||
|
||||
// Sign extension: widen each signed 32-bit lane to a 64-bit lane pair
// (low half = source lanes, high half = replicated sign bits).
// NOTE(review): this definition is byte-for-byte identical to the
// __cast_sext(const __vec16_i64 &, const __vec16_i32 &) overload defined
// earlier in this file; two identical definitions will not compile --
// one of the two copies should be deleted.
static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val)
{
    return __vec16_i64(val.v,_mm512_srai_epi32(val.v,31));
}
|
||||
|
||||
// zero extension
|
||||
CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext)
|
||||
CAST(__vec16_i64, uint64_t, __vec16_i16, uint16_t, __cast_zext)
|
||||
@@ -1421,6 +1428,14 @@ CAST_ZEXT_I1(__vec16_i16)
|
||||
CAST_ZEXT_I1(__vec16_i32)
|
||||
CAST_ZEXT_I1(__vec16_i64)
|
||||
|
||||
// Zero-extend a 16-wide predicate mask to 32-bit integer lanes:
// lanes whose mask bit is set become 1, all other lanes become 0.
// The first (dummy) argument only selects this overload.
static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1 &val)
{
    // Merge a broadcast 1 into an all-zero vector under the mask.
    return _mm512_mask_mov_epi32(_mm512_setzero_epi32(), val.m,
                                 _mm512_set1_epi32(1));
}
|
||||
|
||||
|
||||
// truncations
|
||||
CAST(__vec16_i32, int32_t, __vec16_i64, int64_t, __cast_trunc)
|
||||
CAST(__vec16_i16, int16_t, __vec16_i64, int64_t, __cast_trunc)
|
||||
@@ -1654,14 +1669,25 @@ static FORCEINLINE __vec16_f __min_varying_float(__vec16_f v1, __vec16_f v2) {
|
||||
return _mm512_gmin_ps(v1, v2);
|
||||
}
|
||||
|
||||
// Lane-wise signed 32-bit maximum of two 16-wide vectors.
static FORCEINLINE __vec16_i32 __max_varying_int32(__vec16_i32 a, __vec16_i32 b) {
    return _mm512_max_epi32(a, b);
}
|
||||
|
||||
// Lane-wise signed 32-bit minimum of two 16-wide vectors.
static FORCEINLINE __vec16_i32 __min_varying_int32(__vec16_i32 a, __vec16_i32 b) {
    return _mm512_min_epi32(a, b);
}
|
||||
|
||||
// Lane-wise unsigned 32-bit maximum of two 16-wide vectors.
static FORCEINLINE __vec16_i32 __max_varying_uint32(__vec16_i32 a, __vec16_i32 b) {
    return _mm512_max_epu32(a, b);
}
|
||||
|
||||
// Lane-wise unsigned 32-bit minimum of two 16-wide vectors.
static FORCEINLINE __vec16_i32 __min_varying_uint32(__vec16_i32 a, __vec16_i32 b) {
    return _mm512_min_epu32(a, b);
}
|
||||
|
||||
// Generate the remaining varying min/max operations by applying the
// corresponding uniform (scalar) op lane-by-lane via the BINARY_OP_FUNC
// helper macro (defined earlier in this file).
BINARY_OP_FUNC(__vec16_d, __max_varying_double, __max_uniform_double)
BINARY_OP_FUNC(__vec16_d, __min_varying_double, __min_uniform_double)

// NOTE(review): the four i32 variants below appear to be superseded by the
// intrinsic-based __min/__max_varying_(u)int32 definitions added earlier in
// this file; keeping both sets would produce duplicate definitions --
// confirm which set should remain.
BINARY_OP_FUNC(__vec16_i32, __max_varying_int32, __max_uniform_int32)
BINARY_OP_FUNC(__vec16_i32, __min_varying_int32, __min_uniform_int32)
BINARY_OP_FUNC(__vec16_i32, __max_varying_uint32, __max_uniform_uint32)
BINARY_OP_FUNC(__vec16_i32, __min_varying_uint32, __min_uniform_uint32)

BINARY_OP_FUNC(__vec16_i64, __max_varying_int64, __max_uniform_int64)
BINARY_OP_FUNC(__vec16_i64, __min_varying_int64, __min_uniform_int64)
BINARY_OP_FUNC(__vec16_i64, __max_varying_uint64, __max_uniform_uint64)
||||
@@ -2033,6 +2059,17 @@ __scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets,
|
||||
_MM_HINT_NONE);
|
||||
}
|
||||
|
||||
/*
|
||||
static FORCEINLINE void
|
||||
__scatter_base_offsets64_float(void *base, const __vec16_i64 &varyingOffset,
|
||||
uint32_t scale, const __vec16_i64 &constOffset,
|
||||
const __vec16_f &val, const __vec16_i1 mask)
|
||||
{
|
||||
__vec16_i64 offsets = __add(__mul(varyingOffset,__vec16_i64(scale)), constOffset);
|
||||
_mm512_mask_i64extscatter_ps(base, mask, offsets, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_1, _MM_HINT_NONE);
|
||||
}
|
||||
*/
|
||||
|
||||
#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC)
|
||||
/*
|
||||
static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \
|
||||
|
||||
Reference in New Issue
Block a user