diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h
index 0cfb3d31..404cd24f 100644
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -803,6 +803,13 @@ template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) {
 
 // int64
 
+static FORCEINLINE __vec16_i64 __setzero_i64() {
+    __vec16_i64 ret;
+    ret.v_lo = _mm512_setzero_epi32();
+    ret.v_hi = _mm512_setzero_epi32();
+    return ret;
+}
+
 static FORCEINLINE __vec16_i64 __add(const __vec16_i64 &a, const __vec16_i64 &b)
 {
     __mmask16 carry = 0;
@@ -878,7 +885,7 @@ static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, int index)
     return src[index+16] | (int64_t(src[index]) << 32);
 }
 
-static FORCEINLINE __vec16_i64 __smear_i64(__vec16_i64, const int64_t &l) {
+static FORCEINLINE __vec16_i64 __smear_i64(const int64_t &l) {
     const int *i = (const int*)&l;
     return __vec16_i64(_mm512_set_1to16_epi32(i[0]), _mm512_set_1to16_epi32(i[1]));
 }
@@ -1373,6 +1380,11 @@ CAST(__vec16_i32, int32_t, __vec16_i16, int16_t, __cast_sext)
 CAST(__vec16_i32, int32_t, __vec16_i8, int8_t, __cast_sext)
 CAST(__vec16_i16, int16_t, __vec16_i8, int8_t, __cast_sext)
 
+static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val)
+{
+    return __vec16_i64(val.v,_mm512_srai_epi32(val.v,31));
+}
+
 #define CAST_SEXT_I1(TYPE) /*
 static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) {  \
 
@@ -1389,11 +1401,6 @@ CAST_SEXT_I1(__vec16_i8)
 CAST_SEXT_I1(__vec16_i16)
 CAST_SEXT_I1(__vec16_i32)
 
-static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val)
-{
-    return __vec16_i64(val.v,_mm512_srai_epi32(val.v,31));
-}
-
 // zero extension
 CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext)
 CAST(__vec16_i64, uint64_t, __vec16_i16, uint16_t, __cast_zext)
@@ -1421,6 +1428,14 @@ CAST_ZEXT_I1(__vec16_i16)
 CAST_ZEXT_I1(__vec16_i32)
 CAST_ZEXT_I1(__vec16_i64)
 
+static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1 &val)
+{
+    __vec16_i32 ret = _mm512_setzero_epi32();
+    __vec16_i32 one = _mm512_set1_epi32(1);
+    return _mm512_mask_mov_epi32(ret, val.m, one);
+}
+
+
 // truncations
 CAST(__vec16_i32, int32_t, __vec16_i64, int64_t, __cast_trunc)
 CAST(__vec16_i16, int16_t, __vec16_i64, int64_t, __cast_trunc)
@@ -1654,14 +1669,25 @@ static FORCEINLINE __vec16_f __min_varying_float(__vec16_f v1, __vec16_f v2) {
     return _mm512_gmin_ps(v1, v2);
 }
 
+static FORCEINLINE __vec16_i32 __max_varying_int32(__vec16_i32 v1, __vec16_i32 v2) {
+    return _mm512_max_epi32(v1, v2);
+}
+
+static FORCEINLINE __vec16_i32 __min_varying_int32(__vec16_i32 v1, __vec16_i32 v2) {
+    return _mm512_min_epi32(v1, v2);
+}
+
+static FORCEINLINE __vec16_i32 __max_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) {
+    return _mm512_max_epu32(v1, v2);
+}
+
+static FORCEINLINE __vec16_i32 __min_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) {
+    return _mm512_min_epu32(v1, v2);
+}
+
 BINARY_OP_FUNC(__vec16_d, __max_varying_double, __max_uniform_double)
 BINARY_OP_FUNC(__vec16_d, __min_varying_double, __min_uniform_double)
 
-BINARY_OP_FUNC(__vec16_i32, __max_varying_int32, __max_uniform_int32)
-BINARY_OP_FUNC(__vec16_i32, __min_varying_int32, __min_uniform_int32)
-BINARY_OP_FUNC(__vec16_i32, __max_varying_uint32, __max_uniform_uint32)
-BINARY_OP_FUNC(__vec16_i32, __min_varying_uint32, __min_uniform_uint32)
-
 BINARY_OP_FUNC(__vec16_i64, __max_varying_int64, __max_uniform_int64)
 BINARY_OP_FUNC(__vec16_i64, __min_varying_int64, __min_uniform_int64)
 BINARY_OP_FUNC(__vec16_i64, __max_varying_uint64, __max_uniform_uint64)
@@ -2033,6 +2059,17 @@ __scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets,
                               _MM_HINT_NONE);
 }
 
+/*
+static FORCEINLINE void
+__scatter_base_offsets64_float(void *base, const __vec16_i64 &varyingOffset,
+                               uint32_t scale, const __vec16_i64 &constOffset,
+                               const __vec16_f &val, const __vec16_i1 mask)
+{
+    __vec16_i64 offsets = __add(__mul(varyingOffset,__vec16_i64(scale)), constOffset);
+    _mm512_mask_i64extscatter_ps(base, mask, offsets, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_1, _MM_HINT_NONE);
+}
+*/
+
 #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) /*
 static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) {  \
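
Note for reviewers: KNC has no native 64-bit integer vector arithmetic, so __vec16_i64 in this patch is carried as a pair of 512-bit registers holding the low and high 32 bits of each lane (v_lo/v_hi); the 32-bit-carry __add above is a consequence of that split. The standalone scalar sketch below models what the relocated i64 __cast_sext and the new i1-to-i32 __cast_zext compute per lane. It is illustrative only, compiles without any KNC intrinsics, and every name in it is hypothetical rather than part of the patch.

// Hypothetical scalar model of the KNC lo/hi int64 layout -- illustration only.
#include <cassert>
#include <cstdint>

struct Vec16I64Model {   // stands in for __vec16_i64: two 32-bit halves per lane
    int32_t v_lo[16];    // low 32 bits of each lane
    int32_t v_hi[16];    // high 32 bits of each lane
};

// Models __cast_sext(__vec16_i64, __vec16_i32): the high half of each lane is
// filled with copies of the low half's sign bit, which is what
// _mm512_srai_epi32(val.v, 31) produces (0 for non-negative, -1 for negative).
static Vec16I64Model cast_sext_model(const int32_t val[16]) {
    Vec16I64Model r;
    for (int i = 0; i < 16; ++i) {
        r.v_lo[i] = val[i];
        r.v_hi[i] = val[i] < 0 ? -1 : 0;
    }
    return r;
}

// Models the new __cast_zext(__vec16_i32, __vec16_i1): starting from a zeroed
// register, lanes selected by the mask are overwritten with 1, so each lane
// ends up holding its mask bit as a 0/1 integer.
static void cast_zext_mask_model(uint16_t mask, int32_t out[16]) {
    for (int i = 0; i < 16; ++i)
        out[i] = (mask >> i) & 1;
}

int main() {
    int32_t v[16] = { -2, 3 };                   // remaining lanes default to 0
    Vec16I64Model r = cast_sext_model(v);
    assert(r.v_lo[0] == -2 && r.v_hi[0] == -1);  // lane 0: -2 sign-extends to 64 bits
    assert(r.v_lo[1] ==  3 && r.v_hi[1] ==  0);  // lane 1: 3 zero-fills the high half

    int32_t z[16];
    cast_zext_mask_model(0x0005, z);             // mask bits 0 and 2 set
    assert(z[0] == 1 && z[1] == 0 && z[2] == 1);
    return 0;
}

The same lo/hi convention explains the fixed __smear_i64 signature: on little-endian x86, i[0] is the low 32 bits of the 64-bit scalar, so each half can be splatted independently with a plain 32-bit _mm512_set_1to16_epi32.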