Added a few functions: __setzero_i64(), __cast_sext(__vec16_i64, __vec16_i32), __cast_zext(__vec16_i32, __vec16_i1)

__min_varying_int32(), __min_varying_uint32(), __max_varying_int32(), __max_varying_uint32()
Fixed the signature of __smear_i64() to match current codegen
Jean-Luc Duprat
2012-07-12 10:32:38 -07:00
parent 2c640f7e52
commit e09e953bbb


@@ -803,6 +803,13 @@ template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) {
 // int64
+static FORCEINLINE __vec16_i64 __setzero_i64() {
+    __vec16_i64 ret;
+    ret.v_lo = _mm512_setzero_epi32();
+    ret.v_hi = _mm512_setzero_epi32();
+    return ret;
+}
+
 static FORCEINLINE __vec16_i64 __add(const __vec16_i64 &a, const __vec16_i64 &b)
 {
     __mmask16 carry = 0;
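
For reference (an illustrative sketch, not part of this commit): KNC has no native 64-bit integer vector, so __vec16_i64 evidently carries sixteen 64-bit lanes as two 512-bit halves, v_lo holding the low 32 bits of each lane and v_hi the high 32 bits. A hypothetical scalar model of that layout:

#include <cstdint>

// Hypothetical scalar model of the split __vec16_i64 layout: lane i is
// (uint64_t(hi[i]) << 32) | lo[i]. Zeroing both halves zeroes every lane,
// which is all __setzero_i64() does with its two _mm512_setzero_epi32() calls.
struct Vec16I64Model {
    uint32_t lo[16];  // low 32 bits of each 64-bit lane
    uint32_t hi[16];  // high 32 bits of each 64-bit lane
};

static Vec16I64Model setzero_model() {
    return Vec16I64Model{};  // value-initialization zeroes both halves
}
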
@@ -878,7 +885,7 @@ static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, int index)
     return src[index+16] | (int64_t(src[index]) << 32);
 }
-static FORCEINLINE __vec16_i64 __smear_i64(__vec16_i64, const int64_t &l) {
+static FORCEINLINE __vec16_i64 __smear_i64(const int64_t &l) {
     const int *i = (const int*)&l;
     return __vec16_i64(_mm512_set_1to16_epi32(i[0]), _mm512_set_1to16_epi32(i[1]));
 }
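
The corrected __smear_i64() drops the unused vector argument and broadcasts one scalar to all lanes. The (const int*)&l cast assumes a little-endian target, where i[0] is the low 32-bit word of l and i[1] the high word. A scalar sketch of the same split (illustration only), using shifts instead of the cast:

#include <cstdint>

// Illustrative split of a 64-bit scalar into the two 32-bit words that
// __smear_i64() broadcasts; shifts make the low/high extraction explicit
// rather than relying on little-endian memory layout.
static void split_smear_words(int64_t l, uint32_t &lo, uint32_t &hi) {
    lo = (uint32_t)l;          // what i[0] holds on a little-endian target
    hi = (uint32_t)(l >> 32);  // what i[1] holds on a little-endian target
}
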
@@ -1373,6 +1380,11 @@ CAST(__vec16_i32, int32_t, __vec16_i16, int16_t, __cast_sext)
 CAST(__vec16_i32, int32_t, __vec16_i8, int8_t, __cast_sext)
 CAST(__vec16_i16, int16_t, __vec16_i8, int8_t, __cast_sext)
+static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val)
+{
+    return __vec16_i64(val.v,_mm512_srai_epi32(val.v,31));
+}
+
 #define CAST_SEXT_I1(TYPE)
 /*
 static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) { \
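
In the new __cast_sext(), the high half of each widened lane is the source lane shifted right arithmetically by 31, i.e. the sign bit replicated across all 32 bits. A one-lane scalar sketch (not part of the diff):

#include <cstdint>

// One-lane analogue of the vector sign extension above: the high word is
// 0x00000000 for non-negative inputs and 0xFFFFFFFF for negative ones.
static int64_t sext_lane(int32_t x) {
    uint32_t hi = (uint32_t)(x >> 31);  // arithmetic shift (as on x86) smears the sign bit
    uint64_t bits = ((uint64_t)hi << 32) | (uint32_t)x;
    return (int64_t)bits;               // e.g. -5 -> 0xFFFFFFFFFFFFFFFB
}
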
@@ -1389,11 +1401,6 @@ CAST_SEXT_I1(__vec16_i8)
 CAST_SEXT_I1(__vec16_i16)
 CAST_SEXT_I1(__vec16_i32)
-static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val)
-{
-    return __vec16_i64(val.v,_mm512_srai_epi32(val.v,31));
-}
-
 // zero extension
 CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext)
 CAST(__vec16_i64, uint64_t, __vec16_i16, uint16_t, __cast_zext)
@@ -1421,6 +1428,14 @@ CAST_ZEXT_I1(__vec16_i16)
 CAST_ZEXT_I1(__vec16_i32)
 CAST_ZEXT_I1(__vec16_i64)
+static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1 &val)
+{
+    __vec16_i32 ret = _mm512_setzero_epi32();
+    __vec16_i32 one = _mm512_set1_epi32(1);
+    return _mm512_mask_mov_epi32(ret, val.m, one);
+}
+
 // truncations
 CAST(__vec16_i32, int32_t, __vec16_i64, int64_t, __cast_trunc)
 CAST(__vec16_i16, int16_t, __vec16_i64, int64_t, __cast_trunc)
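
The new __cast_zext() turns the 16-bit execution mask into a vector of 0/1 integers: it starts from zero and uses a masked move to write 1 only into lanes whose mask bit is set. A scalar sketch of that behavior (illustration only):

#include <cstdint>

// Scalar analogue of __cast_zext(__vec16_i32, __vec16_i1): lane i becomes 1
// when bit i of the mask is set and keeps the zeroed value otherwise,
// mirroring _mm512_mask_mov_epi32(zero, mask, ones).
static void mask_to_i32(uint16_t mask, int32_t out[16]) {
    for (int i = 0; i < 16; ++i)
        out[i] = (mask >> i) & 1;
}
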
@@ -1654,14 +1669,25 @@ static FORCEINLINE __vec16_f __min_varying_float(__vec16_f v1, __vec16_f v2) {
     return _mm512_gmin_ps(v1, v2);
 }
+static FORCEINLINE __vec16_i32 __max_varying_int32(__vec16_i32 v1, __vec16_i32 v2) {
+    return _mm512_max_epi32(v1, v2);
+}
+
+static FORCEINLINE __vec16_i32 __min_varying_int32(__vec16_i32 v1, __vec16_i32 v2) {
+    return _mm512_min_epi32(v1, v2);
+}
+
+static FORCEINLINE __vec16_i32 __max_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) {
+    return _mm512_max_epu32(v1, v2);
+}
+
+static FORCEINLINE __vec16_i32 __min_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) {
+    return _mm512_min_epu32(v1, v2);
+}
 BINARY_OP_FUNC(__vec16_d, __max_varying_double, __max_uniform_double)
 BINARY_OP_FUNC(__vec16_d, __min_varying_double, __min_uniform_double)
-BINARY_OP_FUNC(__vec16_i32, __max_varying_int32, __max_uniform_int32)
-BINARY_OP_FUNC(__vec16_i32, __min_varying_int32, __min_uniform_int32)
-BINARY_OP_FUNC(__vec16_i32, __max_varying_uint32, __max_uniform_uint32)
-BINARY_OP_FUNC(__vec16_i32, __min_varying_uint32, __min_uniform_uint32)
 BINARY_OP_FUNC(__vec16_i64, __max_varying_int64, __max_uniform_int64)
 BINARY_OP_FUNC(__vec16_i64, __min_varying_int64, __min_uniform_int64)
 BINARY_OP_FUNC(__vec16_i64, __max_varying_uint64, __max_uniform_uint64)
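
Replacing the generic BINARY_OP_FUNC fallbacks with _mm512_{min,max}_epi32/epu32 needs both variants because the same bit pattern orders differently under signed and unsigned comparison. A small demonstration (illustrative only, not part of the file):

#include <cstdint>
#include <cassert>

// Why separate signed/unsigned min intrinsics exist: 0xFFFFFFFF is -1 when
// compared as int32_t but 4294967295 when compared as uint32_t.
static void min_signedness_demo() {
    uint32_t bits = 0xFFFFFFFFu;
    assert(int32_t(bits) < 1);  // signed:   min(bits, 1) picks bits (-1)
    assert(bits > 1u);          // unsigned: min(bits, 1) picks 1
}
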
@@ -2033,6 +2059,17 @@ __scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets,
                                _MM_HINT_NONE);
 }
+
+/*
+static FORCEINLINE void
+__scatter_base_offsets64_float(void *base, const __vec16_i64 &varyingOffset,
+                               uint32_t scale, const __vec16_i64 &constOffset,
+                               const __vec16_f &val, const __vec16_i1 mask)
+{
+    __vec16_i64 offsets = __add(__mul(varyingOffset,__vec16_i64(scale)), constOffset);
+    _mm512_mask_i64extscatter_ps(base, mask, offsets, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_1, _MM_HINT_NONE);
+}
+*/
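
The commented-out scatter computes each lane's byte address as base + varyingOffset*scale + constOffset and stores only where the mask bit is set. A scalar sketch of that addressing (names hypothetical, not part of the file):

#include <cstdint>
#include <cstring>

// Scalar model of the masked 64-bit-offset float scatter: lane i writes
// val[i] to base + varyingOffset[i]*scale + constOffset[i] when mask bit i
// is set; inactive lanes store nothing.
static void scatter64_float_model(void *base, const int64_t varyingOffset[16],
                                  uint32_t scale, const int64_t constOffset[16],
                                  const float val[16], uint16_t mask) {
    for (int i = 0; i < 16; ++i) {
        if ((mask >> i) & 1) {
            char *p = (char *)base + varyingOffset[i] * (int64_t)scale + constOffset[i];
            std::memcpy(p, &val[i], sizeof(float));
        }
    }
}
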
 #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC)
 /*
 static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \