From 8a22c6388970a1f6d4f1bae4588d8cce847cc8d1 Mon Sep 17 00:00:00 2001
From: Jean-Luc Duprat
Date: Wed, 15 Aug 2012 17:41:10 -0700
Subject: [PATCH] knc2x.h

Introduced knc2x.h, which supports 2x interleaved code generation for KNC
(use the target generic-32). This implementation is even more experimental
and incomplete than knc.h, but it is already useful (mandelbrot works, for
example).

knc.h:
Switched to the new intrinsic names: _mm512_set_1to16_epi32() ->
_mm512_set1_epi32(), etc.
Fixed the declarations of the unspecialized templates for __smear_*(),
__setzero_*(), __undef_*().
Explicitly marked a few vectors in __load<>() as _mm512_undefined_*().
Fixed some implementations of __smear_*(), __setzero_*(), __undef_*() to
remove unnecessary dependent instructions.
Implemented the ISPC reductions by simply calling the existing intrinsic
reductions, which are slightly more efficient than our previous
implementation. Also added reductions for double types.
---
 examples/intrinsics/knc.h   |  137 +--
 examples/intrinsics/knc2x.h | 1991 +++++++++++++++++++++++++++++++++++
 2 files changed, 2041 insertions(+), 87 deletions(-)
 create mode 100644 examples/intrinsics/knc2x.h

diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h
index 583543bf..9a25512a 100644
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -35,6 +35,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -144,7 +145,7 @@ typedef struct PRE_ALIGN(64) __vec16_d {
 typedef struct PRE_ALIGN(64) __vec16_i32 {
     FORCEINLINE operator __m512i() const { return v; }
     FORCEINLINE __vec16_i32() : v(_mm512_undefined_epi32()) {}
-    FORCEINLINE __vec16_i32(const int32_t &in) : v(_mm512_set_1to16_epi32(in)) {}
+    FORCEINLINE __vec16_i32(const int32_t &in) : v(_mm512_set1_epi32(in)) {}
     FORCEINLINE __vec16_i32(const __m512i &in) : v(in) {}
     FORCEINLINE __vec16_i32(const __vec16_i32 &o) : v(o.v) {}
     FORCEINLINE __vec16_i32& operator =(const __vec16_i32 &o) { v=o.v; return *this; }
@@ -495,17 +496,17 @@ template static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v
     *ptr = v.m;
 }

-template __vec16_i1 __smear_i1(int i);
+template RetVecType __smear_i1(int i);
 template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) {
     return i?0xFFFF:0x0;
 }

-template __vec16_i1 __setzero_i1();
+template RetVecType __setzero_i1();
 template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() {
     return 0;
 }

-template __vec16_i1 __undef_i1();
+template RetVecType __undef_i1();
 template <> static FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() {
     return __vec16_i1(); // FIXME?
__mm512_undef_mask(); } @@ -770,9 +771,9 @@ static FORCEINLINE void __insert_element(__vec16_i32 *v, unsigned int index, int ((int32_t *)v)[index] = val; } -template __vec16_i32 __smear_i32(int32_t i); +template RetVecType __smear_i32(int32_t i); template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { - return _mm512_set_1to16_epi32(i); + return _mm512_set1_epi32(i); } static const __vec16_i32 __ispc_one = __smear_i32<__vec16_i32>(1); @@ -780,19 +781,19 @@ static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32); static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1); static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); -template __vec16_i32 __setzero_i32(); +template RetVecType __setzero_i32(); template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { return _mm512_setzero_epi32(); } -template __vec16_i32 __undef_i32(); +template RetVecType __undef_i32(); template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { return _mm512_undefined_epi32(); } static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) { int32_t val = __extract_element(v, index & 0xf); - return _mm512_set_1to16_epi32(val); + return _mm512_set1_epi32(val); } static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index) { @@ -812,7 +813,7 @@ static FORCEINLINE __vec16_i32 __shuffle2_i32(__vec16_i32 v0, __vec16_i32 v1, __ */ template static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) { - __vec16_i32 v; + __vec16_i32 v = _mm512_undefined_epi32(); v = _mm512_extloadunpackhi_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); return v; @@ -845,7 +846,7 @@ static FORCEINLINE void __insert_element(__vec16_i64 *v, unsigned int index, int } -template __vec16_i64 __setzero_i64(); +template RetVecType __setzero_i64(); template <> static FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { __vec16_i64 ret; ret.v_lo = _mm512_setzero_epi32(); @@ -853,7 +854,7 @@ template <> static FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { return ret; } -template __vec16_i64 __undef_i64(); +template RetVecType __undef_i64(); template <> static FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { __vec16_i64 ret; ret.v_lo = _mm512_undefined_epi32(); @@ -1017,10 +1018,10 @@ static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, int index) return src[index+16] | (int64_t(src[index]) << 32); } -template __vec16_i64 __smear_i64(const int64_t &l); +template RetVecType __smear_i64(const int64_t &l); template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) { const int *i = (const int*)&l; - return __vec16_i64(_mm512_set_1to16_epi32(i[0]), _mm512_set_1to16_epi32(i[1])); + return __vec16_i64(_mm512_set1_epi32(i[0]), _mm512_set1_epi32(i[1])); } BROADCAST(__vec16_i64, i64, int64_t) @@ -1217,24 +1218,24 @@ static FORCEINLINE void __insert_element(__vec16_f *v, int index, float val) { ((float *)v)[index] = val; } -template __vec16_f __smear_float(float f); +template RetVecType __smear_float(float f); template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { - return _mm512_set_1to16_ps(f); + return _mm512_set1_ps(f); } -template __vec16_f __setzero_float(); +template RetVecType __setzero_float(); template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { return _mm512_setzero_ps(); } -template __vec16_f __undef_float(); +template RetVecType 
__undef_float(); template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() { return _mm512_undefined_ps(); } static FORCEINLINE __vec16_f __broadcast_float(__vec16_f v, int index) { int32_t val = __extract_element(v, index & 0xf); - return _mm512_set_1to16_ps(val); + return _mm512_set1_ps(val); } /* @@ -1254,7 +1255,7 @@ static FORCEINLINE __vec16_f __shuffle2_float(__vec16_f v0, __vec16_f v1, __vec1 */ template static FORCEINLINE __vec16_f __load(const __vec16_f *p) { - __vec16_f v; + __vec16_f v = _mm512_undefined_ps(); v = _mm512_extloadunpackhi_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); return v; @@ -1429,35 +1430,35 @@ static FORCEINLINE void __insert_element(__vec16_d *v, int index, double val) { ((double *)v)[index] = val; } -template __vec16_d __smear_double(double d); +template RetVecType __smear_double(double d); template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { __vec16_d ret; - ret.v1 = _mm512_extload_pd(&d, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE); - ret.v2 = ret.v1; + ret.v1 = _mm512_set1_pd(d); + ret.v2 = _mm512_set1_pd(d); return ret; } -template __vec16_d __setzero_double(); +template RetVecType __setzero_double(); template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { __vec16_d ret; ret.v1 = _mm512_setzero_pd(); - ret.v2 = ret.v1; + ret.v2 = _mm512_setzero_pd(); return ret; } -template __vec16_d __undef_double(); +template RetVecType __undef_double(); template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() { __vec16_d ret; ret.v1 = _mm512_undefined_pd(); - ret.v2 = ret.v1; + ret.v2 = _mm512_undefined_pd(); return ret; } static FORCEINLINE __vec16_d __broadcast_double(__vec16_d v, int index) { __vec16_d ret; - int32_t val = __extract_element(v, index & 0xf); - ret.v1 = _mm512_extload_pd(&val, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE); - ret.v2 = ret.v1; + double val = __extract_element(v, index & 0xf); + ret.v1 = _mm512_set1_pd(val); + ret.v2 = _mm512_set1_pd(val); return ret; } @@ -1478,6 +1479,8 @@ static FORCEINLINE __vec16_f __shuffle2_float(__vec16_d v0, __vec16_d v1, __vec1 template static FORCEINLINE __vec16_d __load(const __vec16_d *p) { __vec16_d ret; + ret.v1 = _mm512_undefined_pd(); + ret.v2 = _mm512_undefined_pd(); ret.v1 = _mm512_extloadunpackhi_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); ret.v1 = _mm512_extloadunpacklo_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); ret.v2 = _mm512_extloadunpackhi_pd(ret.v2, ((uint8_t*)p)+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE); @@ -1948,80 +1951,40 @@ static const __vec16_i32 __ispc_s3(2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, static const __vec16_i32 __ispc_s4(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); static FORCEINLINE uint32_t __reduce_add_i32(__vec16_i32 v) { - __vec16_i32 v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, __ispc_s1, v); - __vec16_i32 v2 = _mm512_mask_add_epi32(_mm512_undefined_epi32(), 0xFF00, v, v1); - __vec16_i32 v3 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xF000, __ispc_s2, v2); - __vec16_i32 v4 = _mm512_mask_add_epi32(_mm512_undefined_epi32(), 0xF000, v2, v3); - __vec16_i32 v5 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xC000, __ispc_s3, v4); - __vec16_i32 v6 = _mm512_mask_add_epi32(_mm512_undefined_epi32(), 0xC000, v4, v5); - __vec16_i32 v7 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0x8000, __ispc_s4, v6); - __vec16_i32 v8 = 
_mm512_mask_add_epi32(_mm512_undefined_epi32(), 0x8000, v6, v7); - return __extract_element(v8, 15); + return _mm512_reduce_add_epi32(v); } static FORCEINLINE uint32_t __reduce_min_i32(__vec16_i32 v) { - __vec16_i32 v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, __ispc_s1, v); - __vec16_i32 v2 = _mm512_mask_min_epi32(_mm512_undefined_epi32(), 0xFF00, v, v1); - __vec16_i32 v3 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xF000, __ispc_s2, v2); - __vec16_i32 v4 = _mm512_mask_min_epi32(_mm512_undefined_epi32(), 0xF000, v2, v3); - __vec16_i32 v5 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xC000, __ispc_s3, v4); - __vec16_i32 v6 = _mm512_mask_min_epi32(_mm512_undefined_epi32(), 0xC000, v4, v5); - __vec16_i32 v7 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0x8000, __ispc_s4, v6); - __vec16_i32 v8 = _mm512_mask_min_epi32(_mm512_undefined_epi32(), 0x8000, v6, v7); - return __extract_element(v8, 15); + return _mm512_reduce_min_epi32(v); } static FORCEINLINE uint32_t __reduce_max_i32(__vec16_i32 v) { - __vec16_i32 v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, __ispc_s1, v); - __vec16_i32 v2 = _mm512_mask_max_epi32(_mm512_undefined_epi32(), 0xFF00, v, v1); - __vec16_i32 v3 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xF000, __ispc_s2, v2); - __vec16_i32 v4 = _mm512_mask_max_epi32(_mm512_undefined_epi32(), 0xF000, v2, v3); - __vec16_i32 v5 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xC000, __ispc_s3, v4); - __vec16_i32 v6 = _mm512_mask_max_epi32(_mm512_undefined_epi32(), 0xC000, v4, v5); - __vec16_i32 v7 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0x8000, __ispc_s4, v6); - __vec16_i32 v8 = _mm512_mask_max_epi32(_mm512_undefined_epi32(), 0x8000, v6, v7); - return __extract_element(v8, 15); + return _mm512_reduce_max_epi32(v); } static FORCEINLINE float __reduce_add_float(__vec16_f v) { - __vec16_f v1 = _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, __ispc_s1, _mm512_castps_si512(v))); - __vec16_f v2 = _mm512_mask_add_ps(_mm512_undefined_ps(), 0xFF00, v, v1); - __vec16_f v3 = _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xF000, __ispc_s2, _mm512_castps_si512(v2))); - __vec16_f v4 = _mm512_mask_add_ps(_mm512_undefined_ps(), 0xF000, v2, v3); - __vec16_f v5 = _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xC000, __ispc_s3, _mm512_castps_si512(v4))); - __vec16_f v6 = _mm512_mask_add_ps(_mm512_undefined_ps(), 0xC000, v4, v5); - __vec16_f v7 = _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0x8000, __ispc_s4, _mm512_castps_si512(v6))); - __vec16_f v8 = _mm512_mask_add_ps(_mm512_undefined_ps(), 0x8000, v6, v7); - return __extract_element(v8, 15); + return _mm512_reduce_add_ps(v); } static FORCEINLINE float __reduce_min_float(__vec16_f v) { - __vec16_f v1 = _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, __ispc_s1, _mm512_castps_si512(v))); - __vec16_f v2 = _mm512_mask_min_ps(_mm512_undefined_ps(), 0xFF00, v, v1); - __vec16_f v3 = _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xF000, __ispc_s2, _mm512_castps_si512(v2))); - __vec16_f v4 = _mm512_mask_min_ps(_mm512_undefined_ps(), 0xF000, v2, v3); - __vec16_f v5 = _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xC000, __ispc_s3, _mm512_castps_si512(v4))); - __vec16_f v6 = _mm512_mask_min_ps(_mm512_undefined_ps(), 0xC000, v4, v5); - 
__vec16_f v7 = _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0x8000, __ispc_s4, _mm512_castps_si512(v6))); - __vec16_f v8 = _mm512_mask_min_ps(_mm512_undefined_ps(), 0x8000, v6, v7); - return __extract_element(v8, 15); + return _mm512_reduce_min_ps(v); } static FORCEINLINE float __reduce_max_float(__vec16_f v) { - __vec16_f v1 = _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, __ispc_s1, _mm512_castps_si512(v))); - __vec16_f v2 = _mm512_mask_max_ps(_mm512_undefined_ps(), 0xFF00, v, v1); - __vec16_f v3 = _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xF000, __ispc_s2, _mm512_castps_si512(v2))); - __vec16_f v4 = _mm512_mask_max_ps(_mm512_undefined_ps(), 0xF000, v2, v3); - __vec16_f v5 = _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xC000, __ispc_s3, _mm512_castps_si512(v4))); - __vec16_f v6 = _mm512_mask_max_ps(_mm512_undefined_ps(), 0xC000, v4, v5); - __vec16_f v7 = _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0x8000, __ispc_s4, _mm512_castps_si512(v6))); - __vec16_f v8 = _mm512_mask_max_ps(_mm512_undefined_ps(), 0x8000, v6, v7); - return __extract_element(v8, 15); + return _mm512_reduce_max_ps(v); } -REDUCE_ADD(double, __vec16_d, __reduce_add_double) -REDUCE_MINMAX(double, __vec16_d, __reduce_min_double, <) -REDUCE_MINMAX(double, __vec16_d, __reduce_max_double, >) +static FORCEINLINE float __reduce_add_double(__vec16_d v) { + return _mm512_reduce_add_pd(v.v1) + _mm512_reduce_add_pd(v.v2); +} + +static FORCEINLINE float __reduce_min_double(__vec16_d v) { + return std::min(_mm512_reduce_min_pd(v.v1), _mm512_reduce_min_pd(v.v2)); +} + +static FORCEINLINE float __reduce_max_double(__vec16_d v) { + return std::max(_mm512_reduce_max_pd(v.v1), _mm512_reduce_max_pd(v.v2)); +} REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_min_int64, <) diff --git a/examples/intrinsics/knc2x.h b/examples/intrinsics/knc2x.h new file mode 100644 index 00000000..75c9aa62 --- /dev/null +++ b/examples/intrinsics/knc2x.h @@ -0,0 +1,1991 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include + +#include +#include + +#include "knc.h" // TODO: this should be possible + +#ifdef _MSC_VER +#define FORCEINLINE __forceinline +#define PRE_ALIGN(x) /*__declspec(align(x))*/ +#define POST_ALIGN(x) +#define roundf(x) (floorf(x + .5f)) +#define round(x) (floor(x + .5)) +#else +#define FORCEINLINE __attribute__((always_inline)) +#define PRE_ALIGN(x) +#define POST_ALIGN(x) __attribute__ ((aligned(x))) +#endif + +#define KNC 1 +extern "C" { + int printf(const unsigned char *, ...); +} + +typedef float __vec1_f; +typedef double __vec1_d; +typedef int8_t __vec1_i8; +typedef int16_t __vec1_i16; +typedef int32_t __vec1_i32; +typedef int64_t __vec1_i64; + +struct __vec32_f; +struct __vec32_i32; + +typedef struct PRE_ALIGN(4) __vec32_i1 { + __vec32_i1() { } + __vec32_i1(const uint32_t& in) { m = in; } + __vec32_i1(const __vec32_i32& in); + __vec32_i1(uint32_t v00, uint32_t v01, uint32_t v02, uint32_t v03, + uint32_t v04, uint32_t v05, uint32_t v06, uint32_t v07, + uint32_t v08, uint32_t v09, uint32_t v10, uint32_t v11, + uint32_t v12, uint32_t v13, uint32_t v14, uint32_t v15, + uint32_t v16, uint32_t v17, uint32_t v18, uint32_t v19, + uint32_t v20, uint32_t v21, uint32_t v22, uint32_t v23, + uint32_t v24, uint32_t v25, uint32_t v26, uint32_t v27, + uint32_t v28, uint32_t v29, uint32_t v30, uint32_t v31) { + m16.m1 = (v00) | + ((v01) << 1) | + ((v02) << 2) | + ((v03) << 3) | + ((v04) << 4) | + ((v05) << 5) | + ((v06) << 6) | + ((v07) << 7) | + ((v08) << 8) | + ((v09) << 9) | + ((v10) << 10) | + ((v11) << 11) | + ((v12) << 12) | + ((v13) << 13) | + ((v14) << 14) | + ((v15) << 15); + m16.m2 = (v16) | + ((v17) << 1) | + ((v18) << 2) | + ((v19) << 3) | + ((v20) << 4) | + ((v21) << 5) | + ((v22) << 6) | + ((v23) << 7) | + ((v24) << 8) | + ((v25) << 9) | + ((v26) << 10) | + ((v27) << 11) | + ((v28) << 12) | + ((v29) << 13) | + ((v30) << 14) | + ((v31) << 15); + } + + union { + uint32_t m; + struct { + __mmask16 m1; + __mmask16 m2; + } m16; + }; +} POST_ALIGN(4) __vec32_i1; + +typedef struct PRE_ALIGN(64) __vec32_f { + __vec32_f() { } + __vec32_f(float v00, float v01, float v02, float v03, + float v04, float v05, float v06, float v07, + float v08, float v09, float v10, float v11, + float v12, float v13, float v14, float v15, + float v16, float v17, float v18, float v19, + float v20, float v21, float v22, float v23, + float v24, float v25, float v26, float v27, + float v28, float v29, float v30, float v31) { + v2 = _mm512_set_16to16_ps(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00); + v1 = _mm512_set_16to16_ps(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16); + } + __m512 v1; + __m512 v2; +} POST_ALIGN(64) __vec32_f; + +typedef struct PRE_ALIGN(64) __vec32_i32 { + __vec32_i32() { } + __vec32_i32(const __vec32_i1& in) { + __mmask16 m; + v1 = _mm512_setzero_epi32(); // _mm512_xor_epi32(zero, zero); + v1 = _mm512_sbb_epi32(v1, in.m16.m1, v1, &m); + v2 = _mm512_setzero_epi32(); + v2 = 
_mm512_sbb_epi32(v2, in.m16.m2, v2, &m); + // ON KNL vpsbbd does not exist, do a masked vload instead + } + __vec32_i32(int32_t v00, int32_t v01, int32_t v02, int32_t v03, + int32_t v04, int32_t v05, int32_t v06, int32_t v07, + int32_t v08, int32_t v09, int32_t v10, int32_t v11, + int32_t v12, int32_t v13, int32_t v14, int32_t v15, + int32_t v16, int32_t v17, int32_t v18, int32_t v19, + int32_t v20, int32_t v21, int32_t v22, int32_t v23, + int32_t v24, int32_t v25, int32_t v26, int32_t v27, + int32_t v28, int32_t v29, int32_t v30, int32_t v31) { + v1 = _mm512_set_16to16_pi(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00); + v2 = _mm512_set_16to16_pi(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16); + } + __m512i v1; + __m512i v2; +} POST_ALIGN(64) __vec32_i32; + +FORCEINLINE __vec32_i1::__vec32_i1(const __vec32_i32& in) { + m16.m1 = _mm512_test_epi32_mask(in.v1, in.v1); + m16.m2 = _mm512_test_epi32_mask(in.v2, in.v2); +} + +// This does not map directly to an intrinsic type +typedef struct PRE_ALIGN(64) __vec32_d { + double v[32]; +} POST_ALIGN(64) __vec32_d; + + + +template +struct vec32 { + vec32() { } + vec32(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, + T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, + T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) { + v[0] = v0; v[1] = v1; v[2] = v2; v[3] = v3; + v[4] = v4; v[5] = v5; v[6] = v6; v[7] = v7; + v[8] = v8; v[9] = v9; v[10] = v10; v[11] = v11; + v[12] = v12; v[13] = v13; v[14] = v14; v[15] = v15; + v[16] = v16; v[17] = v17; v[18] = v18; v[19] = v19; + v[20] = v20; v[21] = v21; v[22] = v22; v[23] = v23; + v[24] = v24; v[25] = v25; v[26] = v26; v[27] = v27; + v[28] = v28; v[29] = v29; v[30] = v30; v[31] = v31; + } + T v[32]; +}; + +/* +PRE_ALIGN(64) struct __vec32_f : public vec16 { + __vec32_f() { } + __vec32_f(float v0, float v1, float v2, float v3, + float v4, float v5, float v6, float v7, + float v8, float v9, float v10, float v11, + float v12, float v13, float v14, float v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } + +} POST_ALIGN(64); + +PRE_ALIGN(128) struct __vec32_d : public vec16 { + __vec32_d() { } + __vec32_d(double v0, double v1, double v2, double v3, + double v4, double v5, double v6, double v7, + double v8, double v9, double v10, double v11, + double v12, double v13, double v14, double v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } + +} POST_ALIGN(128); +*/ + +PRE_ALIGN(32) struct __vec32_i8 : public vec32 { + __vec32_i8() { } + __vec32_i8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, + int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15, + int8_t v16, int8_t v17, int8_t v18, int8_t v19, + int8_t v20, int8_t v21, int8_t v22, int8_t v23, + int8_t v24, int8_t v25, int8_t v26, int8_t v27, + int8_t v28, int8_t v29, int8_t v30, int8_t v31) + : vec32(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15, + v17, v18, v19, v20, v21, v22, v23, v24, + v25, v25, v26, v27, v28, v29, v30, v31) { } +} POST_ALIGN(32); + +PRE_ALIGN(64) struct __vec32_i16 : public vec32 { + __vec32_i16() { } + __vec32_i16(int16_t v0, int16_t v1, int16_t v2, int16_t v3, + int16_t v4, int16_t v5, int16_t v6, int16_t v7, + int16_t v8, int16_t v9, int16_t v10, int16_t v11, + int16_t v12, int16_t v13, int16_t v14, int16_t v15, + int16_t 
v16, int16_t v17, int16_t v18, int16_t v19, + int16_t v20, int16_t v21, int16_t v22, int16_t v23, + int16_t v24, int16_t v25, int16_t v26, int16_t v27, + int16_t v28, int16_t v29, int16_t v30, int16_t v31) + : vec32(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, + v24, v25, v26, v27, v28, v29, v30, v31) { } +} POST_ALIGN(64); + +/* +PRE_ALIGN(64) struct __vec32_i32 : public vec16 { + __vec32_i32() { } + __vec32_i32(int32_t v0, int32_t v1, int32_t v2, int32_t v3, + int32_t v4, int32_t v5, int32_t v6, int32_t v7, + int32_t v8, int32_t v9, int32_t v10, int32_t v11, + int32_t v12, int32_t v13, int32_t v14, int32_t v15) + : v.m512i_i32[0](v0), v.m512i_i32[1](v1), v.m512i_i32[2](v2), v.m512i_i32[3](v3), + v.m512i_i32[4](v4), v.m512i_i32[5](v5), v.m512i_i32[6](v6), v.m512i_i32[7](v7), + v.m512i_i32[8](v8), v.m512i_i32[9](v9), v.m512i_i32[10](v10), v.m512i_i32[11](v11), + v.m512i_i32[12](v12), v.m512i_i32[13](v13), v.m512i_i32[14](v14), v.m512i_i32[15](v15), { } + _#512i v; +} POST_ALIGN(64); + +static inline int32_t __extract_element(__vec32_i32, int); + +PRE_ALIGN(128) struct __vec32_i64 : public vec16 { + __vec32_i64() { } + __vec32_i64(int64_t v0, int64_t v1, int64_t v2, int64_t v3, + int64_t v4, int64_t v5, int64_t v6, int64_t v7, + int64_t v8, int64_t v9, int64_t v10, int64_t v11, + int64_t v12, int64_t v13, int64_t v14, int64_t v15) + : vec16(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15) { } +} POST_ALIGN(128); +*/ + +/////////////////////////////////////////////////////////////////////////// + +INSERT_EXTRACT(__vec1_i8, int8_t) +INSERT_EXTRACT(__vec1_i16, int16_t) +INSERT_EXTRACT(__vec1_i32, int32_t) +INSERT_EXTRACT(__vec1_i64, int64_t) +INSERT_EXTRACT(__vec1_f, float) +INSERT_EXTRACT(__vec1_d, double) + +/////////////////////////////////////////////////////////////////////////// +// mask ops + + +static FORCEINLINE uint32_t __movmsk(__vec32_i1 mask) { + uint32_t m1 = _mm512_kmov(mask.m16.m1); + uint32_t m2 = _mm512_kmov(mask.m16.m2); + return ((m1<<16)|m2); +} + +static FORCEINLINE __vec32_i1 __equal(__vec32_i1 a, __vec32_i1 b) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_knot(_mm512_kandn(a.m16.m1, b.m16.m1)); + ret.m16.m2 = _mm512_knot(_mm512_kandn(a.m16.m2, b.m16.m2)); + return ret; +} + +static FORCEINLINE __vec32_i1 __not(__vec32_i1 a) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_knot(a.m16.m1); + ret.m16.m2 = _mm512_knot(a.m16.m2); + return ret; +} + +static FORCEINLINE __vec32_i1 __and(__vec32_i1 a, __vec32_i1 b) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_kand(a.m16.m1, b.m16.m1); + ret.m16.m2 = _mm512_kand(a.m16.m2, b.m16.m2); + return ret; +} + +static FORCEINLINE __vec32_i1 __and_not1(__vec32_i1 a, __vec32_i1 b) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_kandn(a.m16.m1, b.m16.m1); + ret.m16.m2 = _mm512_kandn(a.m16.m2, b.m16.m2); + return ret; +} + +static FORCEINLINE __vec32_i1 __and_not2(__vec32_i1 a, __vec32_i1 b) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_kandnr(a.m16.m1, b.m16.m1); + ret.m16.m2 = _mm512_kandnr(a.m16.m2, b.m16.m2); + return ret; +} + +static FORCEINLINE __vec32_i1 __xor(__vec32_i1 a, __vec32_i1 b) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_kxor(a.m16.m1, b.m16.m1); + ret.m16.m2 = _mm512_kxor(a.m16.m2, b.m16.m2); + return ret; +} + +static FORCEINLINE __vec32_i1 __xnor(__vec32_i1 a, __vec32_i1 b) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_kxnor(a.m16.m1, b.m16.m1); + ret.m16.m2 = _mm512_kxnor(a.m16.m2, b.m16.m2); + return ret; +} + +static FORCEINLINE __vec32_i1 __or(__vec32_i1 
a, __vec32_i1 b) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_kor(a.m16.m1, b.m16.m1); + ret.m16.m2 = _mm512_kor(a.m16.m2, b.m16.m2); + return ret; +} + + +static FORCEINLINE __vec32_i1 __select(__vec32_i1 mask, __vec32_i1 a, + __vec32_i1 b) { + return (a.m & mask.m) | (b.m & ~mask.m); +} + + +static FORCEINLINE __vec32_i1 __select(bool cond, __vec32_i1 a, __vec32_i1 b) { + return cond ? a : b; +} + +/* +static FORCEINLINE bool __extract_element(__vec32_i1 vec, int index) { + return (vec.v & (1 << index)) ? true : false; +} + +static FORCEINLINE void __insert_element(__vec32_i1 *vec, int index, + bool val) { + if (val == false) + vec->v &= ~(1 << index); + else + vec->v |= (1 << index); +} +*/ + +template static FORCEINLINE __vec32_i1 __load(const __vec32_i1 *p) { + uint16_t *ptr = (uint16_t *)p; + __vec32_i1 ret; + ret.m16.m1 = *ptr; + ptr++; + ret.m16.m2 = *ptr; + return ret; +} + +template static FORCEINLINE void __store(__vec32_i1 *p, __vec32_i1 v) { + uint16_t *ptr = (uint16_t *)p; + *ptr = v.m16.m1; + ptr++; + *ptr = v.m16.m2; +} + +template <> static FORCEINLINE __vec32_i1 __smear_i1<__vec32_i1>(int i) { + return i?0xFFFF:0x0; +} + +template <> static FORCEINLINE __vec32_i1 __setzero_i1<__vec32_i1>() { + return 0; +} + +template <> static FORCEINLINE __vec32_i1 __undef_i1<__vec32_i1>() { + return __vec32_i1(); // FIXME? __mm512_undef_mask(); +} + +/////////////////////////////////////////////////////////////////////////// +// int8 + +BINARY_OP(__vec32_i8, __add, +) +BINARY_OP(__vec32_i8, __sub, -) +BINARY_OP(__vec32_i8, __mul, *) + +BINARY_OP(__vec32_i8, __or, |) +BINARY_OP(__vec32_i8, __and, &) +BINARY_OP(__vec32_i8, __xor, ^) +BINARY_OP(__vec32_i8, __shl, <<) + +BINARY_OP_CAST(__vec32_i8, uint8_t, __udiv, /) +BINARY_OP_CAST(__vec32_i8, int8_t, __sdiv, /) + +BINARY_OP_CAST(__vec32_i8, uint8_t, __urem, %) +BINARY_OP_CAST(__vec32_i8, int8_t, __srem, %) +BINARY_OP_CAST(__vec32_i8, uint8_t, __lshr, >>) +BINARY_OP_CAST(__vec32_i8, int8_t, __ashr, >>) + +SHIFT_UNIFORM(__vec32_i8, uint8_t, __lshr, >>) +SHIFT_UNIFORM(__vec32_i8, int8_t, __ashr, >>) +SHIFT_UNIFORM(__vec32_i8, int8_t, __shl, <<) + +CMP_OP(__vec32_i8, i8, int8_t, __equal, ==) +CMP_OP(__vec32_i8, i8, int8_t, __not_equal, !=) +CMP_OP(__vec32_i8, i8, uint8_t, __unsigned_less_equal, <=) +CMP_OP(__vec32_i8, i8, int8_t, __signed_less_equal, <=) +CMP_OP(__vec32_i8, i8, uint8_t, __unsigned_greater_equal, >=) +CMP_OP(__vec32_i8, i8, int8_t, __signed_greater_equal, >=) +CMP_OP(__vec32_i8, i8, uint8_t, __unsigned_less_than, <) +CMP_OP(__vec32_i8, i8, int8_t, __signed_less_than, <) +CMP_OP(__vec32_i8, i8, uint8_t, __unsigned_greater_than, >) +CMP_OP(__vec32_i8, i8, int8_t, __signed_greater_than, >) + +SELECT(__vec32_i8) +INSERT_EXTRACT(__vec32_i8, int8_t) +SMEAR(__vec32_i8, i8, int8_t) +BROADCAST(__vec32_i8, i8, int8_t) +ROTATE(__vec32_i8, i8, int8_t) +SHUFFLES(__vec32_i8, i8, int8_t) +LOAD_STORE(__vec32_i8, int8_t) + +/////////////////////////////////////////////////////////////////////////// +// int16 + +BINARY_OP(__vec32_i16, __add, +) +BINARY_OP(__vec32_i16, __sub, -) +BINARY_OP(__vec32_i16, __mul, *) + +BINARY_OP(__vec32_i16, __or, |) +BINARY_OP(__vec32_i16, __and, &) +BINARY_OP(__vec32_i16, __xor, ^) +BINARY_OP(__vec32_i16, __shl, <<) + +BINARY_OP_CAST(__vec32_i16, uint16_t, __udiv, /) +BINARY_OP_CAST(__vec32_i16, int16_t, __sdiv, /) + +BINARY_OP_CAST(__vec32_i16, uint16_t, __urem, %) +BINARY_OP_CAST(__vec32_i16, int16_t, __srem, %) +BINARY_OP_CAST(__vec32_i16, uint16_t, __lshr, >>) +BINARY_OP_CAST(__vec32_i16, int16_t, __ashr, >>) + 
+SHIFT_UNIFORM(__vec32_i16, uint16_t, __lshr, >>) +SHIFT_UNIFORM(__vec32_i16, int16_t, __ashr, >>) +SHIFT_UNIFORM(__vec32_i16, int16_t, __shl, <<) + +CMP_OP(__vec32_i16, i16, int16_t, __equal, ==) +CMP_OP(__vec32_i16, i16, int16_t, __not_equal, !=) +CMP_OP(__vec32_i16, i16, uint16_t, __unsigned_less_equal, <=) +CMP_OP(__vec32_i16, i16, int16_t, __signed_less_equal, <=) +CMP_OP(__vec32_i16, i16, uint16_t, __unsigned_greater_equal, >=) +CMP_OP(__vec32_i16, i16, int16_t, __signed_greater_equal, >=) +CMP_OP(__vec32_i16, i16, uint16_t, __unsigned_less_than, <) +CMP_OP(__vec32_i16, i16, int16_t, __signed_less_than, <) +CMP_OP(__vec32_i16, i16, uint16_t, __unsigned_greater_than, >) +CMP_OP(__vec32_i16, i16, int16_t, __signed_greater_than, >) + +SELECT(__vec32_i16) +INSERT_EXTRACT(__vec32_i16, int16_t) +SMEAR(__vec32_i16, i16, int16_t) +BROADCAST(__vec32_i16, i16, int16_t) +ROTATE(__vec32_i16, i16, int16_t) +SHUFFLES(__vec32_i16, i16, int16_t) +LOAD_STORE(__vec32_i16, int16_t) + +/////////////////////////////////////////////////////////////////////////// +// int32 + +static FORCEINLINE __vec32_i32 __add(__vec32_i32 a, __vec32_i32 b) { + __vec32_i32 ret; + ret.v1 = _mm512_add_epi32(a.v1, b.v1); + ret.v2 = _mm512_add_epi32(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i32 __sub(__vec32_i32 a, __vec32_i32 b) { + __vec32_i32 ret; + ret.v1 = _mm512_sub_epi32(a.v1, b.v1); + ret.v2 = _mm512_sub_epi32(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i32 __mul(__vec32_i32 a, __vec32_i32 b) { + __vec32_i32 ret; + ret.v1 = _mm512_mullo_epi32(a.v1, b.v1); + ret.v2 = _mm512_mullo_epi32(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i32 __udiv(__vec32_i32 a, __vec32_i32 b) { + __vec32_i32 ret; + ret.v1 = _mm512_div_epu32(a.v1, b.v1); + ret.v2 = _mm512_div_epu32(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i32 __sdiv(__vec32_i32 a, __vec32_i32 b) { + __vec32_i32 ret; + ret.v1 = _mm512_div_epi32(a.v1, b.v1); + ret.v2 = _mm512_div_epi32(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i32 __urem(__vec32_i32 a, __vec32_i32 b) { + __vec32_i32 ret; + ret.v1 = _mm512_rem_epu32(a.v1, b.v1); + ret.v2 = _mm512_rem_epu32(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i32 __srem(__vec32_i32 a, __vec32_i32 b) { + __vec32_i32 ret; + ret.v1 = _mm512_rem_epi32(a.v1, b.v1); + ret.v2 = _mm512_rem_epi32(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i32 __or(__vec32_i32 a, __vec32_i32 b) { + __vec32_i32 ret; + ret.v1 = _mm512_or_epi32(a.v1, b.v1); + ret.v2 = _mm512_or_epi32(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i32 __and(__vec32_i32 a, __vec32_i32 b) { + __vec32_i32 ret; + ret.v1 = _mm512_and_epi32(a.v1, b.v1); + ret.v2 = _mm512_and_epi32(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i32 __xor(__vec32_i32 a, __vec32_i32 b) { + __vec32_i32 ret; + ret.v1 = _mm512_xor_epi32(a.v1, b.v1); + ret.v2 = _mm512_xor_epi32(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i32 __shl(__vec32_i32 a, __vec32_i32 b) { + __vec32_i32 ret; + ret.v1 = _mm512_sllv_epi32(a.v1, b.v1); + ret.v2 = _mm512_sllv_epi32(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i32 __lshr(__vec32_i32 a, __vec32_i32 b) { + __vec32_i32 ret; + ret.v1 = _mm512_srlv_epi32(a.v1, b.v1); + ret.v2 = _mm512_srlv_epi32(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i32 __ashr(__vec32_i32 a, __vec32_i32 b) { + __vec32_i32 ret; + ret.v1 = _mm512_srav_epi32(a.v1, b.v1); + ret.v2 = _mm512_srav_epi32(a.v2, b.v2); + return ret; 
+} + +static FORCEINLINE __vec32_i32 __shl(__vec32_i32 a, int32_t n) { + __vec32_i32 ret; + ret.v1 = _mm512_slli_epi32(a.v1, n); + ret.v2 = _mm512_slli_epi32(a.v2, n); + return ret; +} + +static FORCEINLINE __vec32_i32 __lshr(__vec32_i32 a, int32_t n) { + __vec32_i32 ret; + ret.v1 = _mm512_srli_epi32(a.v1, n); + ret.v2 = _mm512_srli_epi32(a.v2, n); + return ret; +} + +static FORCEINLINE __vec32_i32 __ashr(__vec32_i32 a, int32_t n) { + __vec32_i32 ret; + ret.v1 = _mm512_srai_epi32(a.v1, n); + ret.v2 = _mm512_srai_epi32(a.v2, n); + return ret; +} + +static FORCEINLINE __vec32_i1 __equal_i32(__vec32_i32 a, __vec32_i32 b) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_cmpeq_epi32_mask(a.v1, b.v1); + ret.m16.m2 = _mm512_cmpeq_epi32_mask(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i1 __equal_i32_and_mask(__vec32_i32 a, __vec32_i32 b, + __vec32_i1 m) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_mask_cmpeq_epi32_mask(m.m16.m1, a.v1, b.v1); + ret.m16.m2 = _mm512_mask_cmpeq_epi32_mask(m.m16.m2, a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i1 __not_equal_i32(__vec32_i32 a, __vec32_i32 b) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_cmpneq_epi32_mask(a.v1, b.v1); + ret.m16.m2 = _mm512_cmpneq_epi32_mask(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i1 __not_equal_i32_and_mask(__vec32_i32 a, __vec32_i32 b, + __vec32_i1 m) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_mask_cmpneq_epi32_mask(m.m16.m1, a.v1, b.v1); + ret.m16.m2 = _mm512_mask_cmpneq_epi32_mask(m.m16.m2, a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i1 __unsigned_less_equal_i32(__vec32_i32 a, __vec32_i32 b) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_cmple_epu32_mask(a.v1, b.v1); + ret.m16.m2 = _mm512_cmple_epu32_mask(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i1 __unsigned_less_equal_i32_and_mask(__vec32_i32 a, __vec32_i32 b, + __vec32_i1 m) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_mask_cmple_epu32_mask(m.m16.m1, a.v1, b.v1); + ret.m16.m2 = _mm512_mask_cmple_epu32_mask(m.m16.m2, a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i1 __signed_less_equal_i32(__vec32_i32 a, __vec32_i32 b) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_cmple_epi32_mask(a.v1, b.v1); + ret.m16.m2 = _mm512_cmple_epi32_mask(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i1 __signed_less_equal_i32_and_mask(__vec32_i32 a, __vec32_i32 b, + __vec32_i1 m) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_mask_cmple_epi32_mask(m.m16.m1, a.v1, b.v1); + ret.m16.m2 = _mm512_mask_cmple_epi32_mask(m.m16.m2, a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i1 __unsigned_greater_equal_i32(__vec32_i32 a, __vec32_i32 b) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_cmpge_epu32_mask(a.v1, b.v1); + ret.m16.m2 = _mm512_cmpge_epu32_mask(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i1 __unsigned_greater_equal_i32_and_mask(__vec32_i32 a, __vec32_i32 b, + __vec32_i1 m) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_mask_cmpge_epu32_mask(m.m16.m1, a.v1, b.v1); + ret.m16.m2 = _mm512_mask_cmpge_epu32_mask(m.m16.m2, a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i1 __signed_greater_equal_i32(__vec32_i32 a, __vec32_i32 b) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_cmpge_epi32_mask(a.v1, b.v1); + ret.m16.m2 = _mm512_cmpge_epi32_mask(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i1 __signed_greater_equal_i32_and_mask(__vec32_i32 a, __vec32_i32 b, + __vec32_i1 m) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_mask_cmpge_epi32_mask(m.m16.m1, a.v1, b.v1); + ret.m16.m2 = 
_mm512_mask_cmpge_epi32_mask(m.m16.m2, a.v2, b.v2);
+    return ret;
+}
+
+static FORCEINLINE __vec32_i1 __unsigned_less_than_i32(__vec32_i32 a, __vec32_i32 b) {
+    __vec32_i1 ret;
+    ret.m16.m1 = _mm512_cmplt_epu32_mask(a.v1, b.v1);
+    ret.m16.m2 = _mm512_cmplt_epu32_mask(a.v2, b.v2);
+    return ret;
+}
+
+static FORCEINLINE __vec32_i1 __unsigned_less_than_i32_and_mask(__vec32_i32 a, __vec32_i32 b,
+                                                                __vec32_i1 m) {
+    __vec32_i1 ret;
+    ret.m16.m1 = _mm512_mask_cmplt_epu32_mask(m.m16.m1, a.v1, b.v1);
+    ret.m16.m2 = _mm512_mask_cmplt_epu32_mask(m.m16.m2, a.v2, b.v2);
+    return ret;
+}
+
+static FORCEINLINE __vec32_i1 __signed_less_than_i32(__vec32_i32 a, __vec32_i32 b) {
+    __vec32_i1 ret;
+    ret.m16.m1 = _mm512_cmplt_epi32_mask(a.v1, b.v1);
+    ret.m16.m2 = _mm512_cmplt_epi32_mask(a.v2, b.v2);
+    return ret;
+}
+
+static FORCEINLINE __vec32_i1 __signed_less_than_i32_and_mask(__vec32_i32 a, __vec32_i32 b,
+                                                              __vec32_i1 m) {
+    __vec32_i1 ret;
+    ret.m16.m1 = _mm512_mask_cmplt_epi32_mask(m.m16.m1, a.v1, b.v1);
+    ret.m16.m2 = _mm512_mask_cmplt_epi32_mask(m.m16.m2, a.v2, b.v2);
+    return ret;
+}
+
+static FORCEINLINE __vec32_i1 __unsigned_greater_than_i32(__vec32_i32 a, __vec32_i32 b) {
+    __vec32_i1 ret;
+    ret.m16.m1 = _mm512_cmpgt_epu32_mask(a.v1, b.v1);
+    ret.m16.m2 = _mm512_cmpgt_epu32_mask(a.v2, b.v2);
+    return ret;
+}
+
+static FORCEINLINE __vec32_i1 __unsigned_greater_than_i32_and_mask(__vec32_i32 a, __vec32_i32 b,
+                                                                   __vec32_i1 m) {
+    __vec32_i1 ret;
+    ret.m16.m1 = _mm512_mask_cmpgt_epu32_mask(m.m16.m1, a.v1, b.v1);
+    ret.m16.m2 = _mm512_mask_cmpgt_epu32_mask(m.m16.m2, a.v2, b.v2);
+    return ret;
+}
+
+static FORCEINLINE __vec32_i1 __signed_greater_than_i32(__vec32_i32 a, __vec32_i32 b) {
+    __vec32_i1 ret;
+    ret.m16.m1 = _mm512_cmpgt_epi32_mask(a.v1, b.v1);
+    ret.m16.m2 = _mm512_cmpgt_epi32_mask(a.v2, b.v2);
+    return ret;
+}
+
+static FORCEINLINE __vec32_i1 __signed_greater_than_i32_and_mask(__vec32_i32 a, __vec32_i32 b,
+                                                                 __vec32_i1 m) {
+    __vec32_i1 ret;
+    ret.m16.m1 = _mm512_mask_cmpgt_epi32_mask(m.m16.m1, a.v1, b.v1);
+    ret.m16.m2 = _mm512_mask_cmpgt_epi32_mask(m.m16.m2, a.v2, b.v2);
+    return ret;
+}
+
+static FORCEINLINE __vec32_i32 __select(__vec32_i1 mask,
+                                        __vec32_i32 a, __vec32_i32 b) {
+    __vec32_i32 ret;
+    ret.v1 = _mm512_mask_mov_epi32(b.v1, mask.m16.m1, a.v1);
+    ret.v2 = _mm512_mask_mov_epi32(b.v2, mask.m16.m2, a.v2);
+    return ret;
+}
+
+static FORCEINLINE __vec32_i32 __select(bool cond, __vec32_i32 a, __vec32_i32 b) {
+    return cond ?
a : b; +} + +static FORCEINLINE int32_t __extract_element(__vec32_i32 v, int index) { + return ((int32_t *)&v.v1)[index]; +} + +static FORCEINLINE void __insert_element(__vec32_i32 *v, int index, int32_t val) { + ((int32_t *)v)[index] = val; +} + +template <> static FORCEINLINE __vec32_i32 __smear_i32<__vec32_i32>(int32_t i) { + __vec32_i32 ret; + ret.v1 = _mm512_set1_epi32(i); + ret.v2 = _mm512_set1_epi32(i); + return ret; +} + +template <> static FORCEINLINE __vec32_i32 __setzero_i32<__vec32_i32>() { + __vec32_i32 ret; + ret.v1 = _mm512_setzero_epi32(); + ret.v2 = _mm512_setzero_epi32(); + return ret; +} + +template <> static FORCEINLINE __vec32_i32 __undef_i32<__vec32_i32>() { + __vec32_i32 ret; + ret.v1 = _mm512_undefined_epi32(); + ret.v2 = _mm512_undefined_epi32(); + return ret; +} + +static FORCEINLINE __vec32_i32 __broadcast_i32(__vec32_i32 v, int index) { + __vec32_i32 ret; + int32_t val = __extract_element(v, index & 0xf); + ret.v1 = _mm512_set1_epi32(val); + ret.v2 = _mm512_set1_epi32(val); + return ret; +} + +/* +static FORCEINLINE __vec32_i32 __rotate_i32(__vec32_i32 v, int index) { + + __vec32_i32 ret1 = v; + __vec32_i32 ret2 = v; + __vec32_i32 ret; // unaligned load from ((uint8_t*)&ret1)+index + + //for (int i = 0; i < 16; ++i) ret.v[i] = v.v[(i+index) & 0xf]; return ret; +} + +static FORCEINLINE __vec32_i32 __shuffle_i32(__vec32_i32 v, __vec32_i32 index) { + __vec32_i32 ret; for (int i = 0; i < 16; ++i) ret.v[i] = v.v[__extract_element(index, i) & 0xf]; return ret; +} + +static FORCEINLINE __vec32_i32 __shuffle2_i32(__vec32_i32 v0, __vec32_i32 v1, __vec32_i32 index) { + __vec32_i32 ret; for (int i = 0; i < 16; ++i) { int ii = __extract_element(index, i) & 0x1f; ret.v[i] = (ii < 16) ? v0.v[ii] : v1.v[ii-16]; } return ret; +} +*/ + +template static FORCEINLINE __vec32_i32 __load(const __vec32_i32 *p) { + __vec32_i32 ret; + ret.v1 = _mm512_undefined_epi32(); + ret.v2 = _mm512_undefined_epi32(); + ret.v1 = _mm512_extloadunpackhi_epi32(ret.v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + ret.v1 = _mm512_extloadunpacklo_epi32(ret.v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + ret.v2 = _mm512_extloadunpackhi_epi32(ret.v2, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + ret.v2 = _mm512_extloadunpacklo_epi32(ret.v2, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); + return ret; +} + +template <> static FORCEINLINE __vec32_i32 __load<64>(const __vec32_i32 *p) { + __vec32_i32 ret; + ret.v1 = _mm512_load_epi32(p); + ret.v2 = _mm512_load_epi32((uint8_t*)p+64); + return ret; +} + +template <> static FORCEINLINE __vec32_i32 __load<128>(const __vec32_i32 *p) { + return __load<64>(p); +} + + +template static FORCEINLINE void __store(__vec32_i32 *p, __vec32_i32 v) { + _mm512_extpackstorehi_epi32(p, v.v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_epi32(p, v.v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_epi32((uint8_t*)p+64, v.v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_epi32((uint8_t*)p+64, v.v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); +} + +template <> static FORCEINLINE void __store<64>(__vec32_i32 *p, __vec32_i32 v) { + _mm512_store_epi32(p, v.v1); + _mm512_store_epi32((uint8_t*)p+64, v.v2); +} + +template <> static FORCEINLINE void __store<128>(__vec32_i32 *p, __vec32_i32 v) { + __store<64>(p, v); +} + +/////////////////////////////////////////////////////////////////////////// +// int64 + +BINARY_OP(__vec32_i64, __add, +) +BINARY_OP(__vec32_i64, __sub, -) +BINARY_OP(__vec32_i64, __mul, *) + 
+BINARY_OP(__vec32_i64, __or, |) +BINARY_OP(__vec32_i64, __and, &) +BINARY_OP(__vec32_i64, __xor, ^) +BINARY_OP(__vec32_i64, __shl, <<) + +BINARY_OP_CAST(__vec32_i64, uint64_t, __udiv, /) +BINARY_OP_CAST(__vec32_i64, int64_t, __sdiv, /) + +BINARY_OP_CAST(__vec32_i64, uint64_t, __urem, %) +BINARY_OP_CAST(__vec32_i64, int64_t, __srem, %) +BINARY_OP_CAST(__vec32_i64, uint64_t, __lshr, >>) +BINARY_OP_CAST(__vec32_i64, int64_t, __ashr, >>) + +SHIFT_UNIFORM(__vec32_i64, uint64_t, __lshr, >>) +SHIFT_UNIFORM(__vec32_i64, int64_t, __ashr, >>) +SHIFT_UNIFORM(__vec32_i64, int64_t, __shl, <<) + +CMP_OP(__vec32_i64, i64, int64_t, __equal, ==) +CMP_OP(__vec32_i64, i64, int64_t, __not_equal, !=) +CMP_OP(__vec32_i64, i64, uint64_t, __unsigned_less_equal, <=) +CMP_OP(__vec32_i64, i64, int64_t, __signed_less_equal, <=) +CMP_OP(__vec32_i64, i64, uint64_t, __unsigned_greater_equal, >=) +CMP_OP(__vec32_i64, i64, int64_t, __signed_greater_equal, >=) +CMP_OP(__vec32_i64, i64, uint64_t, __unsigned_less_than, <) +CMP_OP(__vec32_i64, i64, int64_t, __signed_less_than, <) +CMP_OP(__vec32_i64, i64, uint64_t, __unsigned_greater_than, >) +CMP_OP(__vec32_i64, i64, int64_t, __signed_greater_than, >) + +SELECT(__vec32_i64) +INSERT_EXTRACT(__vec32_i64, int64_t) +SMEAR(__vec32_i64, i64, int64_t) +BROADCAST(__vec32_i64, i64, int64_t) +ROTATE(__vec32_i64, i64, int64_t) +SHUFFLES(__vec32_i64, i64, int64_t) +LOAD_STORE(__vec32_i64, int64_t) + + +/////////////////////////////////////////////////////////////////////////// +// float + +static FORCEINLINE __vec32_f __add(__vec32_f a, __vec32_f b) { + __vec32_f ret; + ret.v1 = _mm512_add_ps(a.v1, b.v1); + ret.v2 = _mm512_add_ps(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_f __sub(__vec32_f a, __vec32_f b) { + __vec32_f ret; + ret.v1 = _mm512_sub_ps(a.v1, b.v1); + ret.v2 = _mm512_sub_ps(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_f __mul(__vec32_f a, __vec32_f b) { + __vec32_f ret; + ret.v1 = _mm512_mul_ps(a.v1, b.v1); + ret.v2 = _mm512_mul_ps(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_f __div(__vec32_f a, __vec32_f b) { + __vec32_f ret; +/* + __vec32_f rcpb; + rcpb.v1 = _mm512_rcp23_ps(b.v1); + rcpb.v2 = _mm512_rcp23_ps(b.v2); + ret.v1 = _mm512_mul_ps(a.v1, rcpb.v1); + ret.v2 = _mm512_mul_ps(a.v2, rcpb.v2); +*/ + ret.v1 = _mm512_div_ps(a.v1, b.v1); + ret.v2 = _mm512_div_ps(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i1 __equal_float(__vec32_f a, __vec32_f b) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_cmpeq_ps_mask(a.v1, b.v1); + ret.m16.m2 = _mm512_cmpeq_ps_mask(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i1 __equal_float_and_mask(__vec32_f a, __vec32_f b, + __vec32_i1 m) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_mask_cmpeq_ps_mask(m.m16.m1, a.v1, b.v1); + ret.m16.m2 = _mm512_mask_cmpeq_ps_mask(m.m16.m2, a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i1 __not_equal_float(__vec32_f a, __vec32_f b) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_cmpneq_ps_mask(a.v1, b.v1); + ret.m16.m2 = _mm512_cmpneq_ps_mask(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i1 __not_equal_float_and_mask(__vec32_f a, __vec32_f b, + __vec32_i1 m) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_mask_cmpneq_ps_mask(m.m16.m1, a.v1, b.v1); + ret.m16.m2 = _mm512_mask_cmpneq_ps_mask(m.m16.m2, a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i1 __less_than_float(__vec32_f a, __vec32_f b) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_cmplt_ps_mask(a.v1, b.v1); + ret.m16.m2 = _mm512_cmplt_ps_mask(a.v2, b.v2); + 
return ret; +} + +static FORCEINLINE __vec32_i1 __less_than_float_and_mask(__vec32_f a, __vec32_f b, + __vec32_i1 m) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_mask_cmplt_ps_mask(m.m16.m1, a.v1, b.v1); + ret.m16.m2 = _mm512_mask_cmplt_ps_mask(m.m16.m2, a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i1 __less_equal_float(__vec32_f a, __vec32_f b) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_cmple_ps_mask(a.v1, b.v1); + ret.m16.m2 = _mm512_cmple_ps_mask(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i1 __less_equal_float_and_mask(__vec32_f a, __vec32_f b, + __vec32_i1 m) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_mask_cmple_ps_mask(m.m16.m1, a.v1, b.v1); + ret.m16.m2 = _mm512_mask_cmple_ps_mask(m.m16.m2, a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i1 __greater_than_float(__vec32_f a, __vec32_f b) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_cmpnle_ps_mask(a.v1, b.v1); + ret.m16.m2 = _mm512_cmpnle_ps_mask(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i1 __greater_than_float_and_mask(__vec32_f a, __vec32_f b, + __vec32_i1 m) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_mask_cmpnle_ps_mask(m.m16.m1, a.v1, b.v1); + ret.m16.m2 = _mm512_mask_cmpnle_ps_mask(m.m16.m2, a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i1 __greater_equal_float(__vec32_f a, __vec32_f b) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_cmpnlt_ps_mask(a.v1, b.v1); + ret.m16.m2 = _mm512_cmpnlt_ps_mask(a.v2, b.v2); + return ret; +} + +static FORCEINLINE __vec32_i1 __greater_equal_float_and_mask(__vec32_f a, __vec32_f b, + __vec32_i1 m) { + __vec32_i1 ret; + ret.m16.m1 = _mm512_mask_cmpnlt_ps_mask(m.m16.m1, a.v1, b.v1); + ret.m16.m2 = _mm512_mask_cmpnlt_ps_mask(m.m16.m2, a.v2, b.v2); + return ret; +} + +/* +static FORCEINLINE __vec32_i1 __ordered(__vec32_f a, __vec32_f b) { + __vec32_i1 ret; + ret.v = 0; + for (int i = 0; i < 16; ++i) + ret.v |= ((a.v[i] == a.v[i]) && (b.v[i] == b.v[i])) ? (1 << i) : 0; + return ret; +} +*/ + + +static FORCEINLINE __vec32_f __select(__vec32_i1 mask, __vec32_f a, __vec32_f b) { + __vec32_f ret; + ret.v1 = _mm512_mask_mov_ps(b.v1, mask.m16.m1, a.v1); + ret.v2 = _mm512_mask_mov_ps(b.v2, mask.m16.m2, a.v2); + return ret; +} + +static FORCEINLINE __vec32_f __select(bool cond, __vec32_f a, __vec32_f b) { + return cond ? 
a : b; +} + +static FORCEINLINE float __extract_element(__vec32_f v, int index) { + return ((float *)&v.v1)[index]; +} + +static FORCEINLINE void __insert_element(__vec32_f *v, int index, float val) { + ((float *)v)[index] = val; +} + +template <> static FORCEINLINE __vec32_f __smear_float<__vec32_f>(float f) { + __vec32_f ret; + ret.v1 = _mm512_extload_ps(&f, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE); + ret.v2 = ret.v1; + return ret; +} + +template <> static FORCEINLINE __vec32_f __setzero_float<__vec32_f>() { + __vec32_f ret; + ret.v1 = _mm512_setzero_ps(); + ret.v2 = ret.v1; + return ret; +} + +template <> static FORCEINLINE __vec32_f __undef_float<__vec32_f>() { + __vec32_f ret; + ret.v1 = _mm512_undefined_ps(); + ret.v2 = ret.v1; + return ret; +} + +static FORCEINLINE __vec32_f __broadcast_float(__vec32_f v, int index) { + __vec32_f ret; + int32_t val = __extract_element(v, index & 0xf); + ret.v1 = _mm512_extload_ps(&val, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE); + ret.v2 = ret.v1; + return ret; +} + +/* +static FORCEINLINE __vec32_f __rotate_float(__vec32_f v, int index) { + __vec32_f ret; for (int i = 0; i < 16; ++i) ret.v[i] = v.v[(i+index) & 0xf]; return ret; +} +*/ + +static FORCEINLINE __vec32_f __shuffle_float(__vec32_f v, __vec32_i32 index) { + __vec32_f ret; + ret.v1 = _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v.v1), 0xffff, index.v1, _mm512_castps_si512(v.v1))); + ret.v2 = _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v.v2), 0xffff, index.v2, _mm512_castps_si512(v.v2))); + return ret; +} + +/* +static FORCEINLINE __vec32_f __shuffle2_float(__vec32_f v0, __vec32_f v1, __vec32_i32 index) { + __vec32_f ret; for (int i = 0; i < 16; ++i) { int ii = __extract_element(index, i) & 0x1f; ret.v[i] = (ii < 16) ? 
v0.v[ii] : v1.v[ii-16]; } return ret; +} +*/ + +template static FORCEINLINE __vec32_f __load(const __vec32_f *p) { + __vec32_f ret; + ret.v1 = _mm512_undefined_ps(); + ret.v2 = _mm512_undefined_ps(); + ret.v1 = _mm512_extloadunpackhi_ps(ret.v1, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + ret.v1 = _mm512_extloadunpacklo_ps(ret.v1, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + ret.v2 = _mm512_extloadunpackhi_ps(ret.v2, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + ret.v2 = _mm512_extloadunpacklo_ps(ret.v2, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE); + return ret; +} + +template <> static FORCEINLINE __vec32_f __load<64>(const __vec32_f *p) { + __vec32_f ret; + ret.v1 = _mm512_load_ps(p); + ret.v2 = _mm512_load_ps((uint8_t*)p+64); + return ret; +} + +template <> static FORCEINLINE __vec32_f __load<128>(const __vec32_f *p) { + return __load<64>(p); +} + +template static FORCEINLINE void __store(__vec32_f *p, __vec32_f v) { + _mm512_extpackstorehi_ps(p, v.v1, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_ps(p, v.v1, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorehi_ps((uint8_t*)p+64, v.v2, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); + _mm512_extpackstorelo_ps((uint8_t*)p+64, v.v2, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE); +} + +template <> static FORCEINLINE void __store<64>(__vec32_f *p, __vec32_f v) { + _mm512_store_ps((uint8_t*)p, v.v1); + _mm512_store_ps((uint8_t*)p+64, v.v2); +} + +template <> static FORCEINLINE void __store<128>(__vec32_f *p, __vec32_f v) { + __store<64>(p, v); +} + +/////////////////////////////////////////////////////////////////////////// +// double + +BINARY_OP(__vec32_d, __add, +) +BINARY_OP(__vec32_d, __sub, -) +BINARY_OP(__vec32_d, __mul, *) +BINARY_OP(__vec32_d, __div, /) + +CMP_OP(__vec32_d, d, double, __equal, ==) +CMP_OP(__vec32_d, d, double, __not_equal, !=) +CMP_OP(__vec32_d, d, double, __less_than, <) +CMP_OP(__vec32_d, d, double, __less_equal, <=) +CMP_OP(__vec32_d, d, double, __greater_than, >) +CMP_OP(__vec32_d, d, double, __greater_equal, >=) + +/* +static FORCEINLINE __vec32_i1 __ordered(__vec32_d a, __vec32_d b) { + __vec32_i1 ret; + ret.v = 0; + for (int i = 0; i < 16; ++i) + ret.v |= ((a.v[i] == a.v[i]) && (b.v[i] == b.v[i])) ? 
(1 << i) : 0; + return ret; +} +*/ + +#if 0 + case Instruction::FRem: intrinsic = "__frem"; break; +#endif + +SELECT(__vec32_d) +INSERT_EXTRACT(__vec32_d, double) +SMEAR(__vec32_d, double, double) +BROADCAST(__vec32_d, double, double) +ROTATE(__vec32_d, double, double) +SHUFFLES(__vec32_d, double, double) +LOAD_STORE(__vec32_d, double) + +/////////////////////////////////////////////////////////////////////////// +// casts + + +#define CAST(TO, STO, FROM, SFROM, FUNC) +/* +static FORCEINLINE TO FUNC(TO, FROM val) { \ + TO ret; \ + for (int i = 0; i < 16; ++i) \ + ret.v[i] = (STO)((SFROM)(val.v[i])); \ + return ret; \ +} +*/ +// sign extension conversions +CAST(__vec32_i64, int64_t, __vec32_i32, int32_t, __cast_sext) +CAST(__vec32_i64, int64_t, __vec32_i16, int16_t, __cast_sext) +CAST(__vec32_i64, int64_t, __vec32_i8, int8_t, __cast_sext) +CAST(__vec32_i32, int32_t, __vec32_i16, int16_t, __cast_sext) +CAST(__vec32_i32, int32_t, __vec32_i8, int8_t, __cast_sext) +CAST(__vec32_i16, int16_t, __vec32_i8, int8_t, __cast_sext) + +#define CAST_SEXT_I1(TYPE) +/* +static FORCEINLINE TYPE __cast_sext(TYPE, __vec32_i1 v) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) { \ + ret.v[i] = 0; \ + if (v.v & (1 << i)) \ + ret.v[i] = ~ret.v[i]; \ + } \ + return ret; \ +} +*/ +CAST_SEXT_I1(__vec32_i8) +CAST_SEXT_I1(__vec32_i16) +CAST_SEXT_I1(__vec32_i32) +CAST_SEXT_I1(__vec32_i64) + +// zero extension +CAST(__vec32_i64, uint64_t, __vec32_i32, uint32_t, __cast_zext) +CAST(__vec32_i64, uint64_t, __vec32_i16, uint16_t, __cast_zext) +CAST(__vec32_i64, uint64_t, __vec32_i8, uint8_t, __cast_zext) +CAST(__vec32_i32, uint32_t, __vec32_i16, uint16_t, __cast_zext) +CAST(__vec32_i32, uint32_t, __vec32_i8, uint8_t, __cast_zext) +CAST(__vec32_i16, uint16_t, __vec32_i8, uint8_t, __cast_zext) + +#define CAST_ZEXT_I1(TYPE) +/* +static FORCEINLINE TYPE __cast_zext(TYPE, __vec32_i1 v) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret.v[i] = (v.v & (1 << i)) ? 
1 : 0; \ + return ret; \ +} +*/ +CAST_ZEXT_I1(__vec32_i8) +CAST_ZEXT_I1(__vec32_i16) +CAST_ZEXT_I1(__vec32_i32) +CAST_ZEXT_I1(__vec32_i64) + +// truncations +CAST(__vec32_i32, int32_t, __vec32_i64, int64_t, __cast_trunc) +CAST(__vec32_i16, int16_t, __vec32_i64, int64_t, __cast_trunc) +CAST(__vec32_i8, int8_t, __vec32_i64, int64_t, __cast_trunc) +CAST(__vec32_i16, int16_t, __vec32_i32, int32_t, __cast_trunc) +CAST(__vec32_i8, int8_t, __vec32_i32, int32_t, __cast_trunc) +CAST(__vec32_i8, int8_t, __vec32_i16, int16_t, __cast_trunc) + +// signed int to float/double +CAST(__vec32_f, float, __vec32_i8, int8_t, __cast_sitofp) +CAST(__vec32_f, float, __vec32_i16, int16_t, __cast_sitofp) +CAST(__vec32_f, float, __vec32_i32, int32_t, __cast_sitofp) +CAST(__vec32_f, float, __vec32_i64, int64_t, __cast_sitofp) +CAST(__vec32_d, double, __vec32_i8, int8_t, __cast_sitofp) +CAST(__vec32_d, double, __vec32_i16, int16_t, __cast_sitofp) +CAST(__vec32_d, double, __vec32_i32, int32_t, __cast_sitofp) +CAST(__vec32_d, double, __vec32_i64, int64_t, __cast_sitofp) + + +static FORCEINLINE __vec32_f __cast_sitofp(__vec32_f, __vec32_i8 val) { + __vec32_f ret; + ret.v1 = _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + ret.v2 = _mm512_extload_ps((uint8_t*)&val+16, _MM_UPCONV_PS_SINT8, _MM_BROADCAST_16X16, _MM_HINT_NONE); + return ret; +} + +static FORCEINLINE __vec32_f __cast_sitofp(__vec32_f, __vec32_i16 val) { + __vec32_f ret; + ret.v1 = _mm512_extload_ps(&val, _MM_UPCONV_PS_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + ret.v2 = _mm512_extload_ps((uint16_t*)&val+32, _MM_UPCONV_PS_SINT16, _MM_BROADCAST_16X16, _MM_HINT_NONE); + return ret; +} + +static FORCEINLINE __vec32_f __cast_sitofp(__vec32_f, __vec32_i32 val) { + __vec32_f ret; + ret.v1 = _mm512_cvtfxpnt_round_adjustepi32_ps(val.v1, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE); + ret.v2 = _mm512_cvtfxpnt_round_adjustepi32_ps(val.v2, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE); + return ret; +} + +/* +static FORCEINLINE __vec32_f __cast_sitofp(__vec32_f, __vec32_i64 val) { + __vec32_f ret; for (int i = 0; i < 16; ++i) ret.v[i] = (float)((int64_t)(val.v[i])); return ret; +} + +static FORCEINLINE __vec32_d __cast_sitofp(__vec32_d, __vec32_i8 val) { + __vec32_d ret; for (int i = 0; i < 16; ++i) ret.v[i] = (double)((int8_t)(val.v[i])); return ret; +} + +static FORCEINLINE __vec32_d __cast_sitofp(__vec32_d, __vec32_i16 val) { + __vec32_d ret; for (int i = 0; i < 16; ++i) ret.v[i] = (double)((int16_t)(val.v[i])); return ret; +} + +static FORCEINLINE __vec32_d __cast_sitofp(__vec32_d, __vec32_i32 val) { + __vec32_d ret; for (int i = 0; i < 16; ++i) ret.v[i] = (double)((int32_t)(val.v[i])); return ret; +} + +static FORCEINLINE __vec32_d __cast_sitofp(__vec32_d, __vec32_i64 val) { + __vec32_d ret; for (int i = 0; i < 16; ++i) ret.v[i] = (double)((int64_t)(val.v[i])); return ret; +} +*/ + +// unsigned int to float/double +CAST(__vec32_f, float, __vec32_i8, uint8_t, __cast_uitofp) +CAST(__vec32_f, float, __vec32_i16, uint16_t, __cast_uitofp) +CAST(__vec32_f, float, __vec32_i32, uint32_t, __cast_uitofp) +CAST(__vec32_f, float, __vec32_i64, uint64_t, __cast_uitofp) +CAST(__vec32_d, double, __vec32_i8, uint8_t, __cast_uitofp) +CAST(__vec32_d, double, __vec32_i16, uint16_t, __cast_uitofp) +CAST(__vec32_d, double, __vec32_i32, uint32_t, __cast_uitofp) +CAST(__vec32_d, double, __vec32_i64, uint64_t, __cast_uitofp) +/* +static FORCEINLINE __vec32_f __cast_uitofp(__vec32_f, __vec32_i1 v) { + __vec32_f ret; + for (int i = 0; i < 16; ++i) + ret.v[i] = 
(v.v & (1 << i)) ? 1. : 0.; + return ret; +} +*/ +// float/double to signed int + +static FORCEINLINE __vec32_i32 __cast_fptosi(__vec32_i32, __vec32_f val) { + __vec32_i32 ret; + ret.v1 = _mm512_cvtfxpnt_round_adjustps_epi32(val.v1, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); + ret.v2 = _mm512_cvtfxpnt_round_adjustps_epi32(val.v2, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE); + return ret; +} + +CAST(__vec32_i8, int8_t, __vec32_f, float, __cast_fptosi) +CAST(__vec32_i16, int16_t, __vec32_f, float, __cast_fptosi) +CAST(__vec32_i32, int32_t, __vec32_f, float, __cast_fptosi) +CAST(__vec32_i64, int64_t, __vec32_f, float, __cast_fptosi) +CAST(__vec32_i8, int8_t, __vec32_d, double, __cast_fptosi) +CAST(__vec32_i16, int16_t, __vec32_d, double, __cast_fptosi) +CAST(__vec32_i32, int32_t, __vec32_d, double, __cast_fptosi) +CAST(__vec32_i64, int64_t, __vec32_d, double, __cast_fptosi) + +// float/double to unsigned int +CAST(__vec32_i8, uint8_t, __vec32_f, float, __cast_fptoui) +CAST(__vec32_i16, uint16_t, __vec32_f, float, __cast_fptoui) +CAST(__vec32_i32, uint32_t, __vec32_f, float, __cast_fptoui) +CAST(__vec32_i64, uint64_t, __vec32_f, float, __cast_fptoui) +CAST(__vec32_i8, uint8_t, __vec32_d, double, __cast_fptoui) +CAST(__vec32_i16, uint16_t, __vec32_d, double, __cast_fptoui) +CAST(__vec32_i32, uint32_t, __vec32_d, double, __cast_fptoui) +CAST(__vec32_i64, uint64_t, __vec32_d, double, __cast_fptoui) + +// float/double conversions +CAST(__vec32_f, float, __vec32_d, double, __cast_fptrunc) +CAST(__vec32_d, double, __vec32_f, float, __cast_fpext) + +/* +typedef union { + int32_t i32; + float f; + int64_t i64; + double d; +} BitcastUnion; +*/ +#define CAST_BITS(TO, TO_TYPE, TO_ELT, FROM, FROM_ELT) +/* +static FORCEINLINE TO __cast_bits(TO, FROM val) { \ + BitcastUnion u; \ + u.FROM_ELT = val; \ + return u.TO_ELT; \ +} +*/ + +static FORCEINLINE __vec32_f __cast_bits(__vec32_f, __vec32_i32 val) { + // TODO: This should be doable without the reads... + __vec32_f ret; + ret.v1 = _mm512_castsi512_ps(val.v1); + ret.v2 = _mm512_castsi512_ps(val.v2); + return ret; +} + +static FORCEINLINE __vec32_i32 __cast_bits(__vec32_i32, __vec32_f val) { + // TODO: This should be doable without the reads... 
+    __vec32_i32 ret;
+    ret.v1 = _mm512_castps_si512(val.v1);
+    ret.v2 = _mm512_castps_si512(val.v2);
+    return ret;
+}
+
+CAST_BITS(__vec32_d, double, d, __vec32_i64, i64)
+CAST_BITS(__vec32_i64, int64_t, i64, __vec32_d, d)
+
+#define CAST_BITS_SCALAR(TO, FROM)
+/*
+static FORCEINLINE TO __cast_bits(TO, FROM v) { \
+    union { \
+        TO to; \
+        FROM from; \
+    } u; \
+    u.from = v; \
+    return u.to; \
+}
+*/
+CAST_BITS_SCALAR(uint32_t, float)
+CAST_BITS_SCALAR(int32_t, float)
+CAST_BITS_SCALAR(float, uint32_t)
+CAST_BITS_SCALAR(float, int32_t)
+CAST_BITS_SCALAR(uint64_t, double)
+CAST_BITS_SCALAR(int64_t, double)
+CAST_BITS_SCALAR(double, uint64_t)
+CAST_BITS_SCALAR(double, int64_t)
+
+
+///////////////////////////////////////////////////////////////////////////
+// various math functions
+
+static FORCEINLINE __vec32_f __round_varying_float(__vec32_f v) {
+    __vec32_f ret;
+    ret.v1 = _mm512_round_ps(v.v1, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);
+    ret.v2 = _mm512_round_ps(v.v2, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_NONE);
+    return ret;
+}
+
+static FORCEINLINE __vec32_f __floor_varying_float(__vec32_f v) {
+    __vec32_f ret;
+    ret.v1 = _mm512_floor_ps(v.v1);
+    ret.v2 = _mm512_floor_ps(v.v2);
+    return ret;
+}
+
+static FORCEINLINE __vec32_f __ceil_varying_float(__vec32_f v) {
+    __vec32_f ret;
+    ret.v1 = _mm512_ceil_ps(v.v1);
+    ret.v2 = _mm512_ceil_ps(v.v2);
+    return ret;
+}
+
+UNARY_OP(__vec32_d, __round_varying_double, round)
+UNARY_OP(__vec32_d, __floor_varying_double, floor)
+UNARY_OP(__vec32_d, __ceil_varying_double, ceil)
+
+// min/max
+
+static FORCEINLINE __vec32_f __max_varying_float(__vec32_f a, __vec32_f b) {
+    __vec32_f ret;
+    ret.v1 = _mm512_max_ps(a.v1, b.v1);
+    ret.v2 = _mm512_max_ps(a.v2, b.v2);
+    return ret;
+}
+
+static FORCEINLINE __vec32_f __min_varying_float(__vec32_f a, __vec32_f b) {
+    __vec32_f ret;
+    ret.v1 = _mm512_min_ps(a.v1, b.v1);
+    ret.v2 = _mm512_min_ps(a.v2, b.v2);
+    return ret;
+}
+
+BINARY_OP_FUNC(__vec32_d, __max_varying_double, __max_uniform_double)
+BINARY_OP_FUNC(__vec32_d, __min_varying_double, __min_uniform_double)
+
+BINARY_OP_FUNC(__vec32_i32, __max_varying_int32, __max_uniform_int32)
+BINARY_OP_FUNC(__vec32_i32, __min_varying_int32, __min_uniform_int32)
+BINARY_OP_FUNC(__vec32_i32, __max_varying_uint32, __max_uniform_uint32)
+BINARY_OP_FUNC(__vec32_i32, __min_varying_uint32, __min_uniform_uint32)
+
+BINARY_OP_FUNC(__vec32_i64, __max_varying_int64, __max_uniform_int64)
+BINARY_OP_FUNC(__vec32_i64, __min_varying_int64, __min_uniform_int64)
+BINARY_OP_FUNC(__vec32_i64, __max_varying_uint64, __max_uniform_uint64)
+BINARY_OP_FUNC(__vec32_i64, __min_varying_uint64, __min_uniform_uint64)
+
+// sqrt/rsqrt/rcp
+
+static FORCEINLINE __vec32_f __sqrt_varying_float(__vec32_f v) {
+    __vec32_f ret;
+    ret.v1 = _mm512_sqrt_ps(v.v1);
+    ret.v2 = _mm512_sqrt_ps(v.v2);
+    return ret;
+}
+
+static FORCEINLINE __vec32_f __rcp_varying_float(__vec32_f v) {
+    __vec32_f ret;
+    ret.v1 = _mm512_recip_ps(v.v1);
+    ret.v2 = _mm512_recip_ps(v.v2);
+    return ret;
+    //return _mm512_rcp23_ps(v); // 23-bits of accuracy
+}
+
+static FORCEINLINE __vec32_f __rsqrt_varying_float(__vec32_f v) {
+    __vec32_f ret;
+    ret.v1 = _mm512_rsqrt23_ps(v.v1); // to 0.775ULP accuracy
+    ret.v2 = _mm512_rsqrt23_ps(v.v2); // to 0.775ULP accuracy
+    return ret;
+}
+
+static FORCEINLINE __vec32_f __exp_varying_float(__vec32_f v) {
+    __vec32_f ret;
+    ret.v1 = _mm512_exp_ps(v.v1);
+    ret.v2 = _mm512_exp_ps(v.v2);
+    return ret;
+}
+
+static FORCEINLINE __vec32_f __log_varying_float(__vec32_f v) {
+    __vec32_f ret;
+    ret.v1 =
_mm512_log_ps(v.v1); + ret.v2 = _mm512_log_ps(v.v2); + return ret; +} + +static FORCEINLINE __vec32_f __pow_varying_float(__vec32_f a, __vec32_f b) { + __vec32_f ret; + ret.v1 = _mm512_pow_ps(a.v1, b.v1); + ret.v2 = _mm512_pow_ps(a.v2, b.v2); + return ret; +} + +UNARY_OP(__vec32_f, __rcp_varying_float, __rcp_uniform_float) +UNARY_OP(__vec32_d, __sqrt_varying_double, __sqrt_uniform_double) + +/////////////////////////////////////////////////////////////////////////// +// bit ops +/* +static FORCEINLINE int32_t __popcnt_int32(uint32_t v) { + int count = 0; + for (; v != 0; v >>= 1) + count += (v & 1); + return count; +} + +static FORCEINLINE int32_t __popcnt_int64(uint64_t v) { + int count = 0; + for (; v != 0; v >>= 1) + count += (v & 1); + return count; +} + +static FORCEINLINE int32_t __count_trailing_zeros_i32(uint32_t v) { + if (v == 0) + return 32; + + int count = 0; + while ((v & 1) == 0) { + ++count; + v >>= 1; + } + return count; +} + +static FORCEINLINE int64_t __count_trailing_zeros_i64(uint64_t v) { + if (v == 0) + return 64; + + int count = 0; + while ((v & 1) == 0) { + ++count; + v >>= 1; + } + return count; +} + +static FORCEINLINE int32_t __count_leading_zeros_i32(uint32_t v) { + if (v == 0) + return 32; + + int count = 0; + while ((v & (1<<31)) == 0) { + ++count; + v <<= 1; + } + return count; +} + +static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { + if (v == 0) + return 64; + + int count = 0; + while ((v & (1ull<<63)) == 0) { + ++count; + v <<= 1; + } + return count; +} +*/ +/////////////////////////////////////////////////////////////////////////// +// reductions + +static FORCEINLINE float __reduce_add_float(__vec32_f v) { + return _mm512_reduce_add_ps(v.v1) + _mm512_reduce_add_ps(v.v2); +} + +REDUCE_MINMAX(float, __vec32_f, __reduce_min_float, <) +REDUCE_MINMAX(float, __vec32_f, __reduce_max_float, >) + +REDUCE_ADD(double, __vec32_d, __reduce_add_double) +REDUCE_MINMAX(double, __vec32_d, __reduce_min_double, <) +REDUCE_MINMAX(double, __vec32_d, __reduce_max_double, >) + +REDUCE_ADD(uint32_t, __vec32_i32, __reduce_add_int32) +REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_min_int32, <) +REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_max_int32, >) + +REDUCE_ADD(uint32_t, __vec32_i32, __reduce_add_uint32) +REDUCE_MINMAX(uint32_t, __vec32_i32, __reduce_min_uint32, <) +REDUCE_MINMAX(uint32_t, __vec32_i32, __reduce_max_uint32, >) + +REDUCE_ADD(uint64_t, __vec32_i64, __reduce_add_int64) +REDUCE_MINMAX(int64_t, __vec32_i64, __reduce_min_int64, <) +REDUCE_MINMAX(int64_t, __vec32_i64, __reduce_max_int64, >) + +REDUCE_ADD(uint64_t, __vec32_i64, __reduce_add_uint64) +REDUCE_MINMAX(uint64_t, __vec32_i64, __reduce_min_uint64, <) +REDUCE_MINMAX(uint64_t, __vec32_i64, __reduce_max_uint64, >) + +/////////////////////////////////////////////////////////////////////////// +// masked load/store +/* +static FORCEINLINE __vec32_i8 __masked_load_i8(void *p, + __vec32_i1 mask) { + __vec32_i8 ret; + int8_t *ptr = (int8_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret.v[i] = ptr[i]; + return ret; +} + +static FORCEINLINE __vec32_i16 __masked_load_i16(void *p, + __vec32_i1 mask) { + __vec32_i16 ret; + int16_t *ptr = (int16_t *)p; + for (int i = 0; i < 16; ++i) + if ((mask.v & (1 << i)) != 0) + ret.v[i] = ptr[i]; + return ret; +} +*/ +static FORCEINLINE __vec32_i32 __masked_load_i32(void *p, + __vec32_i1 mask) { + __vec32_i32 ret; + //ret.v1 = _mm512_undefined_epi32(); + //ret.v2 = _mm512_undefined_epi32(); + ret.v1 = _mm512_mask_load_epi32(ret.v1, 
mask.m16.m1, p);
+    ret.v2 = _mm512_mask_load_epi32(ret.v2, mask.m16.m2, (uint8_t*)p+64);
+    return ret;
+}
+
+/*
+static FORCEINLINE __vec32_i64 __masked_load_i64(void *p,
+                                                 __vec32_i1 mask) {
+    union {
+        __vec32_i64 v64;
+        __vec32_i32 v32[2];
+    } ret;
+
+    ret.v32[0] = _mm512_undefined_epi32();
+    ret.v32[1] = _mm512_undefined_epi32();
+    ret.v32[0] = _mm512_mask_loadq(ret, mask, p, _MM_FULLUPC64_NONE, _MM_BROADCAST_8X8, _MM_HINT_NONE);
+    ret.v32[1] = _mm512_mask_loadq(ret, mask, p, _MM_FULLUPC64_NONE, _MM_BROADCAST_8X8, _MM_HINT_NONE);
+
+    return ret.v64;
+}
+*/
+
+static FORCEINLINE __vec32_f __masked_load_float(void *p, __vec32_i1 mask) {
+    __vec32_f ret;
+    ret.v1 = _mm512_mask_load_ps(ret.v1, mask.m16.m1, p);
+    // lanes 16..31 live 64 bytes further on, as in __masked_load_i32
+    ret.v2 = _mm512_mask_load_ps(ret.v2, mask.m16.m2, (uint8_t*)p+64);
+    return ret;
+}
+
+/*
+static FORCEINLINE void __masked_store_i8(void *p, __vec32_i8 val,
+                                          __vec32_i1 mask) {
+    int8_t *ptr = (int8_t *)p;
+    for (int i = 0; i < 16; ++i)
+        if ((mask.v & (1 << i)) != 0)
+            ptr[i] = val.v[i];
+}
+
+static FORCEINLINE void __masked_store_i16(void *p, __vec32_i16 val,
+                                           __vec32_i1 mask) {
+    int16_t *ptr = (int16_t *)p;
+    for (int i = 0; i < 16; ++i)
+        if ((mask.v & (1 << i)) != 0)
+            ptr[i] = val.v[i];
+}
+*/
+static FORCEINLINE void __masked_store_i32(void *p, __vec32_i32 val,
+                                           __vec32_i1 mask) {
+    _mm512_mask_store_epi32((uint8_t*)p, mask.m16.m1, val.v1);
+    _mm512_mask_store_epi32((uint8_t*)p+64, mask.m16.m2, val.v2);
+}
+/*
+static FORCEINLINE void __masked_store_i64(void *p, __vec32_i64 val,
+                                           __vec32_i1 mask) {
+    int64_t *ptr = (int64_t *)p;
+    for (int i = 0; i < 16; ++i)
+        if ((mask.v & (1 << i)) != 0)
+            ptr[i] = val.v[i];
+}
+*/
+
+static FORCEINLINE void __masked_store_float(void *p, __vec32_f val,
+                                             __vec32_i1 mask) {
+    _mm512_mask_store_ps(p, mask.m16.m1, val.v1);
+    _mm512_mask_store_ps(((uint8_t*)p)+64, mask.m16.m2, val.v2);
+}
+
+/*
+static FORCEINLINE void __masked_store_blend_i8(void *p, __vec32_i8 val,
+                                                __vec32_i1 mask) {
+    __masked_store_i8(p, val, mask);
+}
+
+static FORCEINLINE void __masked_store_blend_i16(void *p, __vec32_i16 val,
+                                                 __vec32_i1 mask) {
+    __masked_store_i16(p, val, mask);
+}
+*/
+
+static FORCEINLINE void __masked_store_blend_i32(void *p, __vec32_i32 val,
+                                                 __vec32_i1 mask) {
+    __masked_store_i32(p, val, mask);
+}
+
+/*
+static FORCEINLINE void __masked_store_blend_i64(void *p, __vec32_i64 val,
+                                                 __vec32_i1 mask) {
+    __masked_store_i64(p, val, mask);
+}
+*/
+///////////////////////////////////////////////////////////////////////////
+// gather/scatter
+
+// offsets * offsetScale is in bytes (for all of these)
+
+#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)
+/*
+static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
+                              uint32_t scale, OTYPE constOffset, \
+                              __vec32_i1 mask) { \
+    VTYPE ret; \
+    int8_t *base = (int8_t *)b; \
+    for (int i = 0; i < 16; ++i) \
+        if ((mask.v & (1 << i)) != 0) { \
+            STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
+                                   constOffset.v[i]); \
+            ret.v[i] = *ptr; \
+        } \
+    return ret; \
+}
+*/
+
+
+GATHER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __gather_base_offsets32_i8)
+GATHER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __gather_base_offsets64_i8)
+GATHER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __gather_base_offsets32_i16)
+GATHER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __gather_base_offsets64_i16)
+//GATHER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __gather_base_offsets32_i32)
+GATHER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __gather_base_offsets64_i32)
+GATHER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __gather_base_offsets32_i64)
+GATHER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __gather_base_offsets64_i64)
+
+static FORCEINLINE __vec32_i32
+__gather_base_offsets32_i32(uint8_t *b, __vec32_i32 varyingOffset,
+                            uint32_t scale, __vec32_i32 constOffset,
+                            __vec32_i1 mask) {
+    __vec32_i32 ret;
+    __vec32_i32 offsets;
+    __vec32_i32 vscale = __smear_i32<__vec32_i32>(scale);
+
+    // Loop generated by the intrinsic
+    offsets = __add(__mul(vscale, varyingOffset), constOffset);
+    ret.v1 = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask.m16.m1, offsets.v1, b,
+                                            _MM_UPCONV_EPI32_NONE, 1,
+                                            _MM_HINT_NONE);
+    // offsets.v2 already holds the per-lane byte offsets for lanes 16..31,
+    // so the second gather uses the same base (cf. __scatter_base_offsets32_i32)
+    ret.v2 = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask.m16.m2, offsets.v2, b,
+                                            _MM_UPCONV_EPI32_NONE, 1,
+                                            _MM_HINT_NONE);
+    return ret;
+}
+
+static FORCEINLINE __vec32_f
+__gather_base_offsets32_float(uint8_t *b, __vec32_i32 varyingOffset,
+                              uint32_t scale, __vec32_i32 constOffset,
+                              __vec32_i1 mask) {
+    __vec32_f ret;
+    __vec32_i32 offsets;
+    __vec32_i32 vscale = __smear_i32<__vec32_i32>(scale);
+
+    // Loop generated by the intrinsic
+    offsets = __add(__mul(vscale, varyingOffset), constOffset);
+    ret.v1 = _mm512_mask_i32extgather_ps(_mm512_undefined_ps(), mask.m16.m1, offsets.v1, b,
+                                         _MM_UPCONV_PS_NONE, 1,
+                                         _MM_HINT_NONE);
+    ret.v2 = _mm512_mask_i32extgather_ps(_mm512_undefined_ps(), mask.m16.m2, offsets.v2, b,
+                                         _MM_UPCONV_PS_NONE, 1,
+                                         _MM_HINT_NONE);
+    return ret;
+}
+
+#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC)
+/*
+static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec32_i1 mask) { \
+    VTYPE ret; \
+    for (int i = 0; i < 16; ++i) \
+        if ((mask.v & (1 << i)) != 0) { \
+            STYPE *ptr = (STYPE *)ptrs.v[i]; \
+            ret.v[i] = *ptr; \
+        } \
+    return ret; \
+}
+*/
+
+GATHER_GENERAL(__vec32_i8, int8_t, __vec32_i32, __gather32_i8)
+GATHER_GENERAL(__vec32_i8, int8_t, __vec32_i64, __gather64_i8)
+GATHER_GENERAL(__vec32_i16, int16_t, __vec32_i32, __gather32_i16)
+GATHER_GENERAL(__vec32_i16, int16_t, __vec32_i64, __gather64_i16)
+GATHER_GENERAL(__vec32_i32, int32_t, __vec32_i32, __gather32_i32)
+GATHER_GENERAL(__vec32_i32, int32_t, __vec32_i64, __gather64_i32)
+GATHER_GENERAL(__vec32_i64, int64_t, __vec32_i32, __gather32_i64)
+GATHER_GENERAL(__vec32_i64, int64_t, __vec32_i64, __gather64_i64)
+
+// scatter
+
+#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)
+/*
+static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
+                             uint32_t scale, OTYPE constOffset, \
+                             VTYPE val, __vec32_i1 mask) { \
+    int8_t *base = (int8_t *)b; \
+    for (int i = 0; i < 16; ++i) \
+        if ((mask.v & (1 << i)) != 0) { \
+            STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
+                                   constOffset.v[i]); \
+            *ptr = val.v[i]; \
+        } \
+}
+*/
+
+SCATTER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __scatter_base_offsets32_i8)
+SCATTER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __scatter_base_offsets64_i8)
+SCATTER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __scatter_base_offsets32_i16)
+SCATTER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __scatter_base_offsets64_i16)
+//SCATTER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __scatter_base_offsets32_i32)
+SCATTER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __scatter_base_offsets64_i32)
+SCATTER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __scatter_base_offsets32_i64)
+SCATTER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __scatter_base_offsets64_i64)
+
+static FORCEINLINE void
+__scatter_base_offsets32_i32(uint8_t *b, __vec32_i32 varyingOffset,
+
uint32_t scale, __vec32_i32 constOffset, + __vec32_i32 val, __vec32_i1 mask) { + __vec32_i32 offsets; + __vec32_i32 vscale = __smear_i32<__vec32_i32>(scale); + + // Loop generated by the intrinsic + offsets = __add(__mul(vscale, varyingOffset), constOffset); + _mm512_mask_i32extscatter_epi32(b, mask.m16.m1, offsets.v1, val.v1, _MM_DOWNCONV_EPI32_NONE, 1, _MM_HINT_NONE); + _mm512_mask_i32extscatter_epi32(b, mask.m16.m2, offsets.v2, val.v2, _MM_DOWNCONV_EPI32_NONE, 1, _MM_HINT_NONE); +} + +#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) +/* +static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec32_i1 mask) { \ + VTYPE ret; \ + for (int i = 0; i < 16; ++i) \ + if ((mask.v & (1 << i)) != 0) { \ + STYPE *ptr = (STYPE *)ptrs.v[i]; \ + *ptr = val.v[i]; \ + } \ +} +*/ +SCATTER_GENERAL(__vec32_i8, int8_t, __vec32_i32, __scatter32_i8) +SCATTER_GENERAL(__vec32_i8, int8_t, __vec32_i64, __scatter64_i8) +SCATTER_GENERAL(__vec32_i16, int16_t, __vec32_i32, __scatter32_i16) +SCATTER_GENERAL(__vec32_i16, int16_t, __vec32_i64, __scatter64_i16) +SCATTER_GENERAL(__vec32_i32, int32_t, __vec32_i32, __scatter32_i32) +SCATTER_GENERAL(__vec32_i32, int32_t, __vec32_i64, __scatter64_i32) +SCATTER_GENERAL(__vec32_i64, int64_t, __vec32_i32, __scatter32_i64) +SCATTER_GENERAL(__vec32_i64, int64_t, __vec32_i64, __scatter64_i64) + +/////////////////////////////////////////////////////////////////////////// +// packed load/store +/* +static FORCEINLINE int32_t __packed_load_active(int32_t *ptr, __vec32_i32 *val, + __vec32_i1 mask) { + int count = 0; + for (int i = 0; i < 16; ++i) { + if ((mask.v & (1 << i)) != 0) { + val->v[i] = *ptr++; + ++count; + } + } + return count; +} + + +static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec32_i32 val, + __vec32_i1 mask) { + int count = 0; + for (int i = 0; i < 16; ++i) { + if ((mask.v & (1 << i)) != 0) { + *ptr++ = val.v[i]; + ++count; + } + } + return count; +} + +static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, + __vec32_i32 *val, + __vec32_i1 mask) { + int count = 0; + for (int i = 0; i < 16; ++i) { + if ((mask.v & (1 << i)) != 0) { + val->v[i] = *ptr++; + ++count; + } + } + return count; +} + + +static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, + __vec32_i32 val, + __vec32_i1 mask) { + int count = 0; + for (int i = 0; i < 16; ++i) { + if ((mask.v & (1 << i)) != 0) { + *ptr++ = val.v[i]; + ++count; + } + } + return count; +} +*/ + +/////////////////////////////////////////////////////////////////////////// +// aos/soa + +/* +static FORCEINLINE void __soa_to_aos3_float(__vec32_f v0, __vec32_f v1, __vec32_f v2, + float *ptr) { + for (int i = 0; i < 16; ++i) { + *ptr++ = __extract_element(v0, i); + *ptr++ = __extract_element(v1, i); + *ptr++ = __extract_element(v2, i); + } +} + +static FORCEINLINE void __aos_to_soa3_float(float *ptr, __vec32_f *out0, __vec32_f *out1, + __vec32_f *out2) { + for (int i = 0; i < 16; ++i) { + __insert_element(out0, i, *ptr++); + __insert_element(out1, i, *ptr++); + __insert_element(out2, i, *ptr++); + } +} + +static FORCEINLINE void __soa_to_aos4_float(__vec32_f v0, __vec32_f v1, __vec32_f v2, + __vec32_f v3, float *ptr) { + for (int i = 0; i < 16; ++i) { + *ptr++ = __extract_element(v0, i); + *ptr++ = __extract_element(v1, i); + *ptr++ = __extract_element(v2, i); + *ptr++ = __extract_element(v3, i); + } +} + +static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec32_f *out0, __vec32_f *out1, + __vec32_f *out2, __vec32_f *out3) { + for (int i = 0; i < 16; ++i) { + __insert_element(out0, i, 
*ptr++); + __insert_element(out1, i, *ptr++); + __insert_element(out2, i, *ptr++); + __insert_element(out3, i, *ptr++); + } +} +*/ + +#undef FORCEINLINE +#undef PRE_ALIGN +#undef POST_ALIGN +
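
Note on the 2x-interleaved layout used throughout knc2x.h: each __vec32_* type is a pair of 16-wide halves (v1/v2, with the 32-bit mask split as m16.m1/m16.m2), every operation is issued once per half, and when the halves map to adjacent memory the second half sits 16 elements (64 bytes for 32-bit types) further on. The sketch below illustrates that split-and-dispatch pattern with plain scalar stand-ins so it compiles anywhere; the names half16, vec32f, mask32 and the helper functions are illustrative assumptions, not the real KNC types or intrinsics.

// Minimal, hypothetical sketch of the 2x-interleaved pattern (scalar stand-ins).
#include <cstdint>
#include <cstdio>

struct half16 { float v[16]; };      // stands in for one __m512 half
struct vec32f { half16 v1, v2; };    // stands in for __vec32_f
struct mask32 { uint16_t m1, m2; };  // stands in for the split __vec32_i1 mask

// One 16-wide "intrinsic": masked load of 16 floats (scalar reference).
static half16 load16_masked(const float *p, uint16_t m, half16 old) {
    for (int i = 0; i < 16; ++i)
        if (m & (1 << i))
            old.v[i] = p[i];
    return old;
}

// 32-wide wrapper in the knc2x.h style: lanes 0..15 come from p,
// lanes 16..31 from p + 16 floats (64 bytes further on).
static vec32f masked_load_float32(const float *p, mask32 m, vec32f old) {
    old.v1 = load16_masked(p,      m.m1, old.v1);
    old.v2 = load16_masked(p + 16, m.m2, old.v2);
    return old;
}

// Reductions combine the two 16-wide partial results, mirroring how
// __reduce_add_float sums the results of the per-half reductions.
static float reduce_add_float32(const vec32f &v) {
    float s = 0.f;
    for (int i = 0; i < 16; ++i)
        s += v.v1.v[i] + v.v2.v[i];
    return s;
}

int main() {
    float data[32];
    for (int i = 0; i < 32; ++i)
        data[i] = float(i);
    vec32f z = {};                    // zero-initialized halves
    mask32 all = { 0xFFFF, 0xFFFF };  // all 32 lanes active
    vec32f v = masked_load_float32(data, all, z);
    std::printf("sum = %g\n", reduce_add_float32(v));  // 0+1+...+31 = 496
    return 0;
}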