diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h
index 622f10e0..9cc6ef22 100644
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -34,6 +34,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -52,7 +53,13 @@
 #define KNC 1
 
 extern "C" {
-    int printf(const unsigned char *, ...);
+    int printf(const unsigned char *, ...);
+    int puts(unsigned char *);
+    unsigned int putchar(unsigned int);
+    int fflush(void *);
+    uint8_t *memcpy(uint8_t *, uint8_t *, uint64_t );
+    uint8_t *memset(uint8_t *, uint8_t, uint64_t );
+    void memset_pattern16(void *, const void *, uint64_t );
 }
 
@@ -130,9 +137,10 @@ typedef struct PRE_ALIGN(64) __vec16_d {
 
 typedef struct PRE_ALIGN(64) __vec16_i32 {
     operator __m512i() const { return v; }
-    __vec16_i32() { }
-    __vec16_i32(const __m512i& in) { v = in; }
-    __vec16_i32(const __vec16_i32& in) { v = in.v; }
+    __vec16_i32() {}
+    __vec16_i32(const int32_t &in) : v(_mm512_set_1to16_epi32(in)) {};
+    __vec16_i32(const __m512i &in) { v = in; }
+    __vec16_i32(const __vec16_i32 &in) { v = in.v; }
     __vec16_i32(int32_t v00, int32_t v01, int32_t v02, int32_t v03,
                 int32_t v04, int32_t v05, int32_t v06, int32_t v07,
                 int32_t v08, int32_t v09, int32_t v10, int32_t v11,
@@ -142,8 +150,14 @@ typedef struct PRE_ALIGN(64) __vec16_i32 {
     __m512i v;
 } POST_ALIGN(64) __vec16_i32;
 
+FORCEINLINE __vec16_i1::__vec16_i1(const __vec16_i32& in) {
+    m = _mm512_test_epi32_mask(in, in);
+}
+
 typedef struct PRE_ALIGN(64) __vec16_i64 {
-    __vec16_i64() { }
+    __forceinline __vec16_i64();
+    __forceinline __vec16_i64(const __vec16_i64 &o) : v_lo(o.v_lo), v_hi(o.v_hi) {};
+    __forceinline __vec16_i64(__m512i l, __m512i h): v_lo(l), v_hi(h) {};
     __vec16_i64(int64_t v00, int64_t v01, int64_t v02, int64_t v03,
                 int64_t v04, int64_t v05, int64_t v06, int64_t v07,
                 int64_t v08, int64_t v09, int64_t v10, int64_t v11,
@@ -167,9 +181,10 @@ typedef struct PRE_ALIGN(64) __vec16_i64 {
     __m512i v_lo;
 } POST_ALIGN(64) __vec16_i64;
 
-FORCEINLINE __vec16_i1::__vec16_i1(const __vec16_i32& in) {
-    m = _mm512_test_epi32_mask(in, in);
-}
+FORCEINLINE __vec16_i64::__vec16_i64()
+    : v_lo(_mm512_undefined_epi32()),
+      v_hi(_mm512_undefined_epi32())
+{}
 
 template <typename T>
 struct vec16 {
@@ -619,7 +634,7 @@ static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, int32_t n) {
     return _mm512_srai_epi32((__m512i)a, n);
 }
 
-static FORCEINLINE __vec16_i1 __equal(__vec16_i32 a, __vec16_i32 b) {
+static FORCEINLINE __vec16_i1 __equal(const __vec16_i32 &a, const __vec16_i32 &b) {
     return _mm512_cmpeq_epi32_mask((__m512i)a, (__m512i)b);
 }
 
@@ -721,10 +736,26 @@ template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) {
 
 ///////////////////////////////////////////////////////////////////////////
 // int64
 
-BINARY_OP(__vec16_i64, __add, +)
+
+static FORCEINLINE __vec16_i64 __add(const __vec16_i64 &a, const __vec16_i64 &b)
+{
+    __mmask16 carry = 0;
+    __m512i lo = _mm512_addsetc_epi32(a.v_lo, b.v_lo, &carry);
+    __m512i hi = _mm512_adc_epi32(a.v_hi, carry, b.v_hi, &carry);
+    return __vec16_i64(lo, hi);
+}
+
 BINARY_OP(__vec16_i64, __sub, -)
 BINARY_OP(__vec16_i64, __mul, *)
+
+/*! 64x32 bit mul -- address computations often use a scale that we
+    know is 32 bits; and 32x64 is faster than 64x64 */
+static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &b)
+{
+    return __vec16_i64(_mm512_mullo_epi32(a.v,b.v_lo),
+                       _mm512_add_epi32(b.v_hi, _mm512_mulhi_epi32(a.v, b.v_lo)));
+}
+
 BINARY_OP(__vec16_i64, __or, |)
 BINARY_OP(__vec16_i64, __and, &)
 BINARY_OP(__vec16_i64, __xor, ^)
@@ -742,8 +773,15 @@ SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>)
 SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>)
 SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<)
 
-CMP_OP(__vec16_i64, int64_t, __equal, ==)
-CMP_OP(__vec16_i64, int64_t, __not_equal, !=)
+static FORCEINLINE __vec16_i1 __equal(const __vec16_i64 &a, const __vec16_i64 &b) {
+    const __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo);
+    return _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi);
+}
+
+static FORCEINLINE __vec16_i1 __not_equal(const __vec16_i64 &a, const __vec16_i64 &b) {
+    return __not(__equal(a,b));
+}
+
 CMP_OP(__vec16_i64, uint64_t, __unsigned_less_equal, <=)
 CMP_OP(__vec16_i64, int64_t, __signed_less_equal, <=)
 CMP_OP(__vec16_i64, uint64_t, __unsigned_greater_equal, >=)
@@ -755,7 +793,18 @@ CMP_OP(__vec16_i64, int64_t, __signed_greater_than, >)
 
 SELECT(__vec16_i64)
 INSERT_EXTRACT(__vec16_i64, int64_t)
-SMEAR(__vec16_i64, i64, int64_t)
+
+static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, int index)
+{
+    uint *src = (uint *)&v;
+    return src[index+16] | (int64_t(src[index]) << 32);
+}
+
+static FORCEINLINE __vec16_i64 __smear_i64(__vec16_i64, const int64_t &l) {
+    const int *i = (const int*)&l;
+    return __vec16_i64(_mm512_set_1to16_epi32(i[0]), _mm512_set_1to16_epi32(i[1]));
+}
+
 BROADCAST(__vec16_i64, i64, int64_t)
 ROTATE(__vec16_i64, i64, int64_t)
 SHUFFLES(__vec16_i64, i64, int64_t)
@@ -765,10 +814,10 @@ LOAD_STORE(__vec16_i64, int64_t)
 template <int ALIGN> static FORCEINLINE __vec16_i64 __load(__vec16_i64 *p) {
     __m512i v1;
     __m512i v2;
-    v1 = _mm512_extloadunpackhi_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    v1 = _mm512_extloadunpacklo_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    v2 = _mm512_extloadunpackhi_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    v2 = _mm512_extloadunpacklo_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    v2 = _mm512_extloadunpackhi_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    v2 = _mm512_extloadunpacklo_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    v1 = _mm512_extloadunpackhi_epi32(v2, ((uint8_t*)p)+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    v1 = _mm512_extloadunpacklo_epi32(v2, ((uint8_t*)p)+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
     __vec16_i64 ret;
     ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0xFF00,
                                             _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0),
@@ -787,8 +836,8 @@ template <int ALIGN> static FORCEINLINE __vec16_i64 __load(__vec16_i64 *p) {
 }
 
 template <> static FORCEINLINE __vec16_i64 __load<64>(__vec16_i64 *p) {
-    __m512i v1 = _mm512_load_epi32(p);
-    __m512i v2 = _mm512_load_epi32(((uint8_t*)p)+64);
+    __m512i v2 = _mm512_load_epi32(p);
+    __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64);
     __vec16_i64 ret;
     ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0xFF00,
                                             _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0),
@@ -820,10 +869,10 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64
     v2 = _mm512_mask_permutevar_epi32(v2, 0x5555,
                                       _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
                                       v.v_lo);
-    _mm512_extpackstorehi_epi32(p, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
-    _mm512_extpackstorelo_epi32(p, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
-    _mm512_extpackstorehi_epi32(((uint8_t*)p)+64, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
-    _mm512_extpackstorelo_epi32(((uint8_t*)p)+64, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
+    _mm512_extpackstorehi_epi32(p, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
+    _mm512_extpackstorelo_epi32(p, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
+    _mm512_extpackstorehi_epi32(((uint8_t*)p)+64, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
+    _mm512_extpackstorelo_epi32(((uint8_t*)p)+64, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
 }
 
 template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) {
@@ -841,8 +890,8 @@
     v2 = _mm512_mask_permutevar_epi32(v2, 0x5555,
                                       _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
                                       v.v_lo);
-    _mm512_store_epi64(p, v1);
-    _mm512_store_epi64(((uint8_t*)p)+64, v2);
+    _mm512_store_epi64(p, v2);
+    _mm512_store_epi64(((uint8_t*)p)+64, v1);
 }
 
@@ -1161,7 +1210,11 @@ static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) {    \
 CAST_SEXT_I1(__vec16_i8)
 CAST_SEXT_I1(__vec16_i16)
 CAST_SEXT_I1(__vec16_i32)
-CAST_SEXT_I1(__vec16_i64)
+
+static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val)
+{
+    return __vec16_i64(val.v,_mm512_srai_epi32(val.v,31));
+}
 
 // zero extension
 CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext)
@@ -1171,6 +1224,11 @@ CAST(__vec16_i32, uint32_t, __vec16_i16, uint16_t, __cast_zext)
 CAST(__vec16_i32, uint32_t, __vec16_i8, uint8_t, __cast_zext)
 CAST(__vec16_i16, uint16_t, __vec16_i8, uint8_t, __cast_zext)
 
+static FORCEINLINE __vec16_i64 __cast_zext(const __vec16_i64 &, const __vec16_i32 &val)
+{
+    return __vec16_i64(val.v, _mm512_setzero_epi32());
+}
+
 #define CAST_ZEXT_I1(TYPE)
 /*
 static FORCEINLINE TYPE __cast_zext(TYPE, __vec16_i1 v) {    \
@@ -1459,8 +1517,11 @@ static FORCEINLINE __vec16_f __sqrt_varying_float(__vec16_f v) {
 }
 
 static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) {
+#ifdef ISPC_FAST_MATH
     return _mm512_recip_ps(v);
-    //return _mm512_rcp23_ps(v); // 23-bits of accuracy
+#else
+    return _mm512_rcp23_ps(v); // 23-bits of accuracy
+#endif
 }
 
 static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) {
@@ -1752,7 +1813,6 @@ GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
 GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8)
 GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16)
 GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16)
-//GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32)
 GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32)
 GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64)
 GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64)
@@ -1777,7 +1837,7 @@ GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __gather64_i16)
 GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i32, __gather32_i32)
 GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __gather64_i32)
 GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i32, __gather32_i64)
-GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64)
+// GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64)
 
 /*
 static __forceinline __vec16_i32 __gather64_i32(__vec16_i64 ptrs, __vec16_i1 mask) {
@@ -1820,7 +1880,6 @@ SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i
 SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8)
 SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16)
 SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16)
-//SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32)
 SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32)
 SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64)
 SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64)
@@ -1828,14 +1887,21 @@ SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64
 static FORCEINLINE void
 __scatter_base_offsets32_i32(uint8_t *b, __vec16_i32 varyingOffset,
                              uint32_t scale, __vec16_i32 constOffset,
-                             __vec16_i32 val, __vec16_i1 mask) {
-    __vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
-    __vec16_i32 offsets = __add(__mul(vscale, varyingOffset), constOffset);
-
-    // Loop is generated by the intrinsic
+                             __vec16_i32 val, __vec16_i1 mask)
+{
+    __vec16_i32 offsets = __add(__mul(__vec16_i32(scale), varyingOffset), constOffset);
     _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, _MM_DOWNCONV_EPI32_NONE, 1, _MM_HINT_NONE);
 }
 
+static FORCEINLINE void
+__scatter_base_offsets32_float(void *base, const __vec16_i32 &varyingOffset,
+                               uint32_t scale, const __vec16_i32 &constOffset,
+                               const __vec16_f &val, const __vec16_i1 mask)
+{
+    __vec16_i32 offsets = __add(__mul(varyingOffset,__vec16_i32(scale)), constOffset);
+    _mm512_mask_i32extscatter_ps(base, mask, offsets, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_1, _MM_HINT_NONE);
+}
+
 #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC)
 /*
 static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) {    \
@@ -1846,8 +1912,7 @@ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) {    \
             *ptr = val.v[i];                                                \
     }                                                                       \
 }
-*/
-SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8)
+*/ SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8)
 SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __scatter64_i8)
 SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i32, __scatter32_i16)
 SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __scatter64_i16)
@@ -2158,3 +2223,4 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval,
 
 #undef PRE_ALIGN
 #undef POST_ALIGN
+
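Note (not part of the patch): the new __add for __vec16_i64 above builds a 16-lane 64-bit add out of the two 32-bit registers v_lo/v_hi, propagating the carry from the low-half add into the high-half add via _mm512_addsetc_epi32 / _mm512_adc_epi32. A minimal scalar sketch of that split-register scheme, with hypothetical names (i64x16, LANES, add_i64x16) and no KNC intrinsics, would be:

#include <stdint.h>

/* Scalar model of the patch's split lo/hi int64 representation:
   lane i of the vector is (hi[i] << 32) | lo[i]. */
enum { LANES = 16 };

typedef struct { uint32_t lo[LANES], hi[LANES]; } i64x16;

/* Per-lane 64-bit add: add the low halves, feed the carry into the
   high halves, the same idea as _mm512_addsetc_epi32 followed by
   _mm512_adc_epi32 in the patch. */
static i64x16 add_i64x16(i64x16 a, i64x16 b) {
    i64x16 r;
    for (int i = 0; i < LANES; ++i) {
        uint64_t lo_sum = (uint64_t)a.lo[i] + b.lo[i];
        uint32_t carry  = (uint32_t)(lo_sum >> 32);
        r.lo[i] = (uint32_t)lo_sum;
        r.hi[i] = a.hi[i] + b.hi[i] + carry;
    }
    return r;
}

The same lane layout appears to be what the patch's new __extract_element and __smear_i64 assume when they reassemble a 64-bit value from the two 32-bit halves.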