From 39b1e4a204932de24d62aab2b6d24c958cbd1c37 Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskiy Date: Thu, 6 Nov 2014 13:34:22 +0400 Subject: [PATCH 1/7] masked_load/store_i16 was added --- examples/intrinsics/knc.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index 7a4a282c..3bc86e6e 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -2020,13 +2020,30 @@ __scatter_base_offsets32_i8(uint8_t *b, uint32_t scale, __vec16_i32 offsets, _MM_HINT_NONE); } + +static FORCEINLINE void __masked_store_i16(void *p, const __vec16_i16 &val, __vec16_i1 mask) { + __vec16_i32 tmp = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + _mm512_mask_extstore_epi32(p, mask, tmp, _MM_DOWNCONV_EPI32_SINT16,_MM_HINT_NONE); +} + +static FORCEINLINE __vec16_i16 __masked_load_i16(void *p, __vec16_i1 mask) { + __vec16_i16 ret; + __vec16_i32 tmp = _mm512_mask_extload_epi32(_mm512_undefined_epi32(),mask,p, + _MM_UPCONV_EPI32_SINT16, + _MM_BROADCAST32_NONE, _MM_HINT_NONE); + _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT16,_MM_HINT_NONE); + return ret; +} + template static FORCEINLINE __vec16_i16 __load(const __vec16_i16 *p) { return *p; } + template static FORCEINLINE void __store(__vec16_i16 *p, __vec16_i16 v) { *p = v; } + static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1 mask) { #ifdef ISPC_FORCE_ALIGNED_MEMORY _mm512_mask_store_epi32(p, mask, val.v); From 3b1445c660b5a5c126a93a25e55fedf0c4cfa704 Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskiy Date: Thu, 6 Nov 2014 13:48:34 +0400 Subject: [PATCH 2/7] __smear_i8/16 was added --- examples/intrinsics/knc.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index 3bc86e6e..4ea9c3f4 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -460,6 +460,14 @@ template <> FORCEINLINE __vec16_i8 __setzero_i8<__vec16_i8>() { return __vec16_i8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } +template static RetVecType __smear_i8(int8_t i); +template <> FORCEINLINE __vec16_i8 __smear_i8<__vec16_i8>(int8_t i) { + __vec16_i32 tmp = _mm512_set1_epi32(i); + __vec16_i8 ret; + _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; +} + /////////////////////////////////////////////////////////////////////////// // int16 @@ -470,6 +478,14 @@ template <> FORCEINLINE __vec16_i16 __setzero_i16<__vec16_i16>() { return __vec16_i16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } +template static RetVecType __smear_i16(int16_t i); +template <> FORCEINLINE __vec16_i16 __smear_i16<__vec16_i16>(int16_t i) { + __vec16_i32 tmp = _mm512_set1_epi32(i); + __vec16_i16 ret; + _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT16,_MM_HINT_NONE); + return ret; +} + /////////////////////////////////////////////////////////////////////////// // int32 /////////////////////////////////////////////////////////////////////////// From 9316dd65c00eb8eb7e6396f2ca98c855e25bdddd Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskiy Date: Thu, 6 Nov 2014 15:02:28 +0400 Subject: [PATCH 3/7] __not_equal_i8/16 was added --- examples/intrinsics/knc.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index 4ea9c3f4..ecb69787 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -460,6 +460,12 @@ template <> FORCEINLINE __vec16_i8 __setzero_i8<__vec16_i8>() { return __vec16_i8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } +static FORCEINLINE __vec16_i1 __not_equal_i8(__vec16_i8 a, __vec16_i8 b) { + __vec16_i32 tmp_a = _mm512_extload_epi32(&a, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + __vec16_i32 tmp_b = _mm512_extload_epi32(&b, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + return _mm512_cmpneq_epi32_mask(tmp_a, tmp_b); +} + template static RetVecType __smear_i8(int8_t i); template <> FORCEINLINE __vec16_i8 __smear_i8<__vec16_i8>(int8_t i) { __vec16_i32 tmp = _mm512_set1_epi32(i); @@ -478,6 +484,12 @@ template <> FORCEINLINE __vec16_i16 __setzero_i16<__vec16_i16>() { return __vec16_i16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } +static FORCEINLINE __vec16_i1 __not_equal_i16(__vec16_i16 a, __vec16_i16 b) { + __vec16_i32 tmp_a = _mm512_extload_epi32(&a, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + __vec16_i32 tmp_b = _mm512_extload_epi32(&b, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + return _mm512_cmpneq_epi32_mask(tmp_a, tmp_b); +} + template static RetVecType __smear_i16(int16_t i); template <> FORCEINLINE __vec16_i16 __smear_i16<__vec16_i16>(int16_t i) { __vec16_i32 tmp = _mm512_set1_epi32(i); From 12376e6a0c064dc92d1daeb563d844fcdf580c58 Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskiy Date: Thu, 6 Nov 2014 16:10:51 +0400 Subject: [PATCH 4/7] __shuffle_i8/16 was added. __reduce* functions were fixed. --- examples/intrinsics/knc.h | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index ecb69787..d36cf420 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -466,6 +466,15 @@ static FORCEINLINE __vec16_i1 __not_equal_i8(__vec16_i8 a, __vec16_i8 b) { return _mm512_cmpneq_epi32_mask(tmp_a, tmp_b); } +static FORCEINLINE __vec16_i8 __shuffle_i8(__vec16_i8 v, __vec16_i32 index) { + __vec16_i32 tmp_v = _mm512_extload_epi32(&v, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + __vec16_i32 tmp = _mm512_permutevar_epi32(tmp_v, index); + __vec16_i8 ret; + _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; + +} + template static RetVecType __smear_i8(int8_t i); template <> FORCEINLINE __vec16_i8 __smear_i8<__vec16_i8>(int8_t i) { __vec16_i32 tmp = _mm512_set1_epi32(i); @@ -490,6 +499,15 @@ static FORCEINLINE __vec16_i1 __not_equal_i16(__vec16_i16 a, __vec16_i16 b) { return _mm512_cmpneq_epi32_mask(tmp_a, tmp_b); } +static FORCEINLINE __vec16_i16 __shuffle_i16(__vec16_i16 v, __vec16_i32 index) { + __vec16_i32 tmp_v = _mm512_extload_epi32(&v, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + __vec16_i32 tmp = _mm512_permutevar_epi32(tmp_v, index); + __vec16_i16 ret; + _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT16,_MM_HINT_NONE); + return ret; + +} + template static RetVecType __smear_i16(int16_t i); template <> FORCEINLINE __vec16_i16 __smear_i16<__vec16_i16>(int16_t i) { __vec16_i32 tmp = _mm512_set1_epi32(i); @@ -1929,18 +1947,26 @@ static FORCEINLINE int32_t __reduce_add_int16(__vec16_i16 v) { return ret; } -static FORCEINLINE uint32_t __reduce_add_int32(__vec16_i32 v) { +static FORCEINLINE int32_t __reduce_add_int32(__vec16_i32 v) { return _mm512_reduce_add_epi32(v); } -static FORCEINLINE uint32_t __reduce_min_int32(__vec16_i32 v) { +static FORCEINLINE int32_t __reduce_min_int32(__vec16_i32 v) { return _mm512_reduce_min_epi32(v); } -static FORCEINLINE uint32_t __reduce_max_int32(__vec16_i32 v) { +static FORCEINLINE int32_t __reduce_max_int32(__vec16_i32 v) { return _mm512_reduce_max_epi32(v); } +static FORCEINLINE uint32_t __reduce_min_uint32(__vec16_i32 v) { + return _mm512_reduce_min_epu32(v); +} + +static FORCEINLINE uint32_t __reduce_max_uint32(__vec16_i32 v) { + return _mm512_reduce_max_epu32(v); +} + static FORCEINLINE float __reduce_add_float(__vec16_f v) { return _mm512_reduce_add_ps(v); } From 67cc62d619b8aea56c3f64d8a95da0a9531f57e7 Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskiy Date: Thu, 6 Nov 2014 16:35:23 +0400 Subject: [PATCH 5/7] Minor fixes to remove copy-paste --- examples/intrinsics/knc.h | 129 +++++++++++++++++++------------------- 1 file changed, 64 insertions(+), 65 deletions(-) diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index d36cf420..2aa7646c 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -451,71 +451,6 @@ template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { return __vec16_i1(); } -/////////////////////////////////////////////////////////////////////////// -// int8 -/////////////////////////////////////////////////////////////////////////// - -template static RetVecType __setzero_i8(); -template <> FORCEINLINE __vec16_i8 __setzero_i8<__vec16_i8>() { - return __vec16_i8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); -} - -static FORCEINLINE __vec16_i1 __not_equal_i8(__vec16_i8 a, __vec16_i8 b) { - __vec16_i32 tmp_a = _mm512_extload_epi32(&a, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); - __vec16_i32 tmp_b = _mm512_extload_epi32(&b, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); - return _mm512_cmpneq_epi32_mask(tmp_a, tmp_b); -} - -static FORCEINLINE __vec16_i8 __shuffle_i8(__vec16_i8 v, __vec16_i32 index) { - __vec16_i32 tmp_v = _mm512_extload_epi32(&v, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); - __vec16_i32 tmp = _mm512_permutevar_epi32(tmp_v, index); - __vec16_i8 ret; - _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); - return ret; - -} - -template static RetVecType __smear_i8(int8_t i); -template <> FORCEINLINE __vec16_i8 __smear_i8<__vec16_i8>(int8_t i) { - __vec16_i32 tmp = _mm512_set1_epi32(i); - __vec16_i8 ret; - _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); - return ret; -} - - -/////////////////////////////////////////////////////////////////////////// -// int16 -/////////////////////////////////////////////////////////////////////////// - -template static RetVecType __setzero_i16(); -template <> FORCEINLINE __vec16_i16 __setzero_i16<__vec16_i16>() { - return __vec16_i16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); -} - -static FORCEINLINE __vec16_i1 __not_equal_i16(__vec16_i16 a, __vec16_i16 b) { - __vec16_i32 tmp_a = _mm512_extload_epi32(&a, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NONE); - __vec16_i32 tmp_b = _mm512_extload_epi32(&b, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NONE); - return _mm512_cmpneq_epi32_mask(tmp_a, tmp_b); -} - -static FORCEINLINE __vec16_i16 __shuffle_i16(__vec16_i16 v, __vec16_i32 index) { - __vec16_i32 tmp_v = _mm512_extload_epi32(&v, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NONE); - __vec16_i32 tmp = _mm512_permutevar_epi32(tmp_v, index); - __vec16_i16 ret; - _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT16,_MM_HINT_NONE); - return ret; - -} - -template static RetVecType __smear_i16(int16_t i); -template <> FORCEINLINE __vec16_i16 __smear_i16<__vec16_i16>(int16_t i) { - __vec16_i32 tmp = _mm512_set1_epi32(i); - __vec16_i16 ret; - _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT16,_MM_HINT_NONE); - return ret; -} - /////////////////////////////////////////////////////////////////////////// // int32 /////////////////////////////////////////////////////////////////////////// @@ -1775,6 +1710,70 @@ static FORCEINLINE __vec16_d __cast_bits(__vec16_d, __vec16_i64 val) { return ret; } +/////////////////////////////////////////////////////////////////////////// +// int8 +/////////////////////////////////////////////////////////////////////////// + +template static RetVecType __setzero_i8(); +template <> FORCEINLINE __vec16_i8 __setzero_i8<__vec16_i8>() { + return __vec16_i8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} + +static FORCEINLINE __vec16_i1 __not_equal_i8(__vec16_i8 a, __vec16_i8 b) { + __vec16_i32 tmp_a = _mm512_extload_epi32(&a, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + __vec16_i32 tmp_b = _mm512_extload_epi32(&b, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + return __not_equal_i32(tmp_a, tmp_b); +} + +static FORCEINLINE __vec16_i8 __shuffle_i8(__vec16_i8 v, __vec16_i32 index) { + __vec16_i32 tmp_v = _mm512_extload_epi32(&v, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + __vec16_i32 tmp = __shuffle_i32(tmp_v, index); + __vec16_i8 ret; + _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; + +} + +template static RetVecType __smear_i8(int8_t i); +template <> FORCEINLINE __vec16_i8 __smear_i8<__vec16_i8>(int8_t i) { + __vec16_i32 tmp = __smear_i32<__vec16_i32>(i); + __vec16_i8 ret; + _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; +} + +/////////////////////////////////////////////////////////////////////////// +// int16 +/////////////////////////////////////////////////////////////////////////// + +template static RetVecType __setzero_i16(); +template <> FORCEINLINE __vec16_i16 __setzero_i16<__vec16_i16>() { + return __vec16_i16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} + +static FORCEINLINE __vec16_i1 __not_equal_i16(__vec16_i16 a, __vec16_i16 b) { + __vec16_i32 tmp_a = _mm512_extload_epi32(&a, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + __vec16_i32 tmp_b = _mm512_extload_epi32(&b, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + return __not_equal_i32(tmp_a, tmp_b); +} + +static FORCEINLINE __vec16_i16 __shuffle_i16(__vec16_i16 v, __vec16_i32 index) { + __vec16_i32 tmp_v = _mm512_extload_epi32(&v, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + __vec16_i32 tmp = __shuffle_i32(tmp_v, index); + __vec16_i16 ret; + _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT16,_MM_HINT_NONE); + return ret; + +} + +template static RetVecType __smear_i16(int16_t i); +template <> FORCEINLINE __vec16_i16 __smear_i16<__vec16_i16>(int16_t i) { + __vec16_i32 tmp = __smear_i32<__vec16_i32>(i); + __vec16_i16 ret; + _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT16,_MM_HINT_NONE); + return ret; +} + /////////////////////////////////////////////////////////////////////////// // various math functions /////////////////////////////////////////////////////////////////////////// From 1b8afb73ad4364d153c2763d32bb7b83e1e2a511 Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskiy Date: Thu, 6 Nov 2014 16:48:26 +0400 Subject: [PATCH 6/7] __rotate_i8/16/64 was added --- examples/intrinsics/knc.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index 2aa7646c..d75146b0 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -917,6 +917,11 @@ template static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) return ret; } +static FORCEINLINE __vec16_i64 __rotate_i64(__vec16_i64 v, int index) { + return __vec16_i64(__shuffle_i32(v.v_lo, index), __shuffle_i32(v.v_hi, index)); +} + + #if 0 template <> FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) { __m512i v2 = _mm512_load_epi32(p); @@ -1725,6 +1730,14 @@ static FORCEINLINE __vec16_i1 __not_equal_i8(__vec16_i8 a, __vec16_i8 b) { return __not_equal_i32(tmp_a, tmp_b); } +static FORCEINLINE __vec16_i8 __rotate_i8(__vec16_i8 v, int index) { + __vec16_i32 tmp_v = _mm512_extload_epi32(&v, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + __vec16_i32 tmp = __rotate_i32(tmp_v, index); + __vec16_i8 ret; + _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; +} + static FORCEINLINE __vec16_i8 __shuffle_i8(__vec16_i8 v, __vec16_i32 index) { __vec16_i32 tmp_v = _mm512_extload_epi32(&v, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); __vec16_i32 tmp = __shuffle_i32(tmp_v, index); @@ -1757,13 +1770,20 @@ static FORCEINLINE __vec16_i1 __not_equal_i16(__vec16_i16 a, __vec16_i16 b) { return __not_equal_i32(tmp_a, tmp_b); } +static FORCEINLINE __vec16_i16 __rotate_i16(__vec16_i16 v, int index) { + __vec16_i32 tmp_v = _mm512_extload_epi32(&v, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + __vec16_i32 tmp = __rotate_i32(tmp_v, index); + __vec16_i16 ret; + _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT16,_MM_HINT_NONE); + return ret; +} + static FORCEINLINE __vec16_i16 __shuffle_i16(__vec16_i16 v, __vec16_i32 index) { __vec16_i32 tmp_v = _mm512_extload_epi32(&v, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NONE); __vec16_i32 tmp = __shuffle_i32(tmp_v, index); __vec16_i16 ret; _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT16,_MM_HINT_NONE); return ret; - } template static RetVecType __smear_i16(int16_t i); From 83f3ee7cfa34d7547a5d56bd2d415c3c1918d708 Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskiy Date: Thu, 6 Nov 2014 17:29:13 +0400 Subject: [PATCH 7/7] __shuffle2_i8/16/32/64 was added --- examples/intrinsics/knc.h | 40 ++++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index d75146b0..eb42408f 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -665,6 +665,15 @@ static FORCEINLINE __vec16_i32 __shuffle_i32(__vec16_i32 v, __vec16_i32 index) { return _mm512_mask_permutevar_epi32(v, 0xffff, index, v); } +static FORCEINLINE __vec16_i32 __shuffle2_i32(__vec16_i32 v0, __vec16_i32 v1, __vec16_i32 index) { + const __vec16_i1 mask = __signed_less_than_i32(index, __smear_i32<__vec16_i32>(0x10)); + index = __and(index, __smear_i32<__vec16_i32>(0xF)); + __vec16_i32 ret = __undef_i32<__vec16_i32>(); + ret = _mm512_mask_permutevar_epi32(ret, mask, index, v0); + ret = _mm512_mask_permutevar_epi32(ret, __not(mask), index, v1); + return ret; +} + template static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) { #ifdef ISPC_FORCE_ALIGNED_MEMORY return _mm512_load_epi32(p); @@ -892,6 +901,15 @@ template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) return __vec16_i64(_mm512_set1_epi32(i[0]), _mm512_set1_epi32(i[1])); } +static FORCEINLINE __vec16_i64 __rotate_i64(__vec16_i64 v, int index) { + return __vec16_i64(__shuffle_i32(v.v_lo, index), __shuffle_i32(v.v_hi, index)); +} + +static FORCEINLINE __vec16_i64 __shuffle2_i64(__vec16_i64 v0, __vec16_i64 v1, __vec16_i32 index) { + return __vec16_i64(__shuffle2_i32(v0.v_lo, v1.v_lo, index), __shuffle2_i32(v0.v_hi, v1.v_hi, index)); +} + + template static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) { __vec16_i32 v1; __vec16_i32 v2; @@ -917,11 +935,6 @@ template static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) return ret; } -static FORCEINLINE __vec16_i64 __rotate_i64(__vec16_i64 v, int index) { - return __vec16_i64(__shuffle_i32(v.v_lo, index), __shuffle_i32(v.v_hi, index)); -} - - #if 0 template <> FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) { __m512i v2 = _mm512_load_epi32(p); @@ -1755,6 +1768,14 @@ template <> FORCEINLINE __vec16_i8 __smear_i8<__vec16_i8>(int8_t i) { return ret; } +static FORCEINLINE __vec16_i8 __shuffle2_i8(__vec16_i8 v0, __vec16_i8 v1, __vec16_i32 index) { + __vec16_i32 tmp_v0 = _mm512_extload_epi32(&v0, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + __vec16_i32 tmp_v1 = _mm512_extload_epi32(&v1, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + __vec16_i32 tmp = __shuffle2_i32(tmp_v0, tmp_v1, index); + __vec16_i8 ret; + _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; +} /////////////////////////////////////////////////////////////////////////// // int16 /////////////////////////////////////////////////////////////////////////// @@ -1794,6 +1815,15 @@ template <> FORCEINLINE __vec16_i16 __smear_i16<__vec16_i16>(int16_t i) { return ret; } +static FORCEINLINE __vec16_i16 __shuffle2_i16(__vec16_i16 v0, __vec16_i16 v1, __vec16_i32 index) { + __vec16_i32 tmp_v0 = _mm512_extload_epi32(&v0, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + __vec16_i32 tmp_v1 = _mm512_extload_epi32(&v1, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + __vec16_i32 tmp = __shuffle2_i32(tmp_v0, tmp_v1, index); + __vec16_i16 ret; + _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT16,_MM_HINT_NONE); + return ret; +} + /////////////////////////////////////////////////////////////////////////// // various math functions ///////////////////////////////////////////////////////////////////////////