Ongoing int64 support for KNC:
Fixes to the __vec16_i64 __load and __store. Added __add, __mul, __equal, __not_equal, __extract_element, __smear_i64, __cast_sext, __cast_zext, and __scatter_base_offsets32_float. __rcp_varying_float now has both a fast-math and a full-precision implementation.
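Background for the diff below: __vec16_i64 on KNC is modeled as two 512-bit registers of 32-bit lanes, one holding the low words (v_lo) and one the high words (v_hi). The new 64-bit __add therefore adds the low words, captures the per-lane carry, and folds it into the high-word add. A minimal scalar sketch of that semantics (illustrative model code, not part of the commit):

// --- illustrative sketch, not part of the commit ---
// Scalar model of the split lo/hi representation and the carry-propagating add.
#include <cstdint>
#include <cstdio>

struct Vec16I64Model {              // hypothetical stand-in for __vec16_i64
    uint32_t lo[16];                // low 32 bits of each lane (v_lo)
    uint32_t hi[16];                // high 32 bits of each lane (v_hi)
};

// Per-lane equivalent of the new __add: add the low halves, record the carry,
// and fold it into the high-half add (the role of _mm512_addsetc_epi32/_mm512_adc_epi32).
static Vec16I64Model add64(const Vec16I64Model &a, const Vec16I64Model &b) {
    Vec16I64Model r;
    for (int i = 0; i < 16; ++i) {
        uint64_t lo = uint64_t(a.lo[i]) + b.lo[i];
        uint32_t carry = uint32_t(lo >> 32);      // per-lane carry out of the low add
        r.lo[i] = uint32_t(lo);
        r.hi[i] = a.hi[i] + b.hi[i] + carry;      // high add with carry in
    }
    return r;
}

int main() {
    Vec16I64Model a = {}, b = {};
    a.lo[0] = 0xFFFFFFFFu; a.hi[0] = 1;           // lane 0: 0x1FFFFFFFF
    b.lo[0] = 2;           b.hi[0] = 0;           // lane 0: 2
    Vec16I64Model r = add64(a, b);
    uint64_t lane0 = (uint64_t(r.hi[0]) << 32) | r.lo[0];
    printf("lane 0: 0x%llx\n", (unsigned long long)lane0);   // expect 0x200000001
    return 0;
}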
@@ -34,6 +34,7 @@
 #include <stdint.h>
 #include <math.h>
 #include <assert.h>
+#include <unistd.h>
 
 #include <immintrin.h>
 #include <zmmintrin.h>
@@ -52,7 +53,13 @@
 
 #define KNC 1
 extern "C" {
 int printf(const unsigned char *, ...);
+int puts(unsigned char *);
+unsigned int putchar(unsigned int);
+int fflush(void *);
+uint8_t *memcpy(uint8_t *, uint8_t *, uint64_t );
+uint8_t *memset(uint8_t *, uint8_t, uint64_t );
+void memset_pattern16(void *, const void *, uint64_t );
 }
 
 
@@ -130,9 +137,10 @@ typedef struct PRE_ALIGN(64) __vec16_d {
 
 typedef struct PRE_ALIGN(64) __vec16_i32 {
 operator __m512i() const { return v; }
-__vec16_i32() { }
-__vec16_i32(const __m512i& in) { v = in; }
-__vec16_i32(const __vec16_i32& in) { v = in.v; }
+__vec16_i32() {}
+__vec16_i32(const int32_t &in) : v(_mm512_set_1to16_epi32(in)) {};
+__vec16_i32(const __m512i &in) { v = in; }
+__vec16_i32(const __vec16_i32 &in) { v = in.v; }
 __vec16_i32(int32_t v00, int32_t v01, int32_t v02, int32_t v03,
 int32_t v04, int32_t v05, int32_t v06, int32_t v07,
 int32_t v08, int32_t v09, int32_t v10, int32_t v11,
@@ -142,8 +150,14 @@ typedef struct PRE_ALIGN(64) __vec16_i32 {
 __m512i v;
 } POST_ALIGN(64) __vec16_i32;
 
+FORCEINLINE __vec16_i1::__vec16_i1(const __vec16_i32& in) {
+m = _mm512_test_epi32_mask(in, in);
+}
+
 typedef struct PRE_ALIGN(64) __vec16_i64 {
-__vec16_i64() { }
+__forceinline __vec16_i64();
+__forceinline __vec16_i64(const __vec16_i64 &o) : v_lo(o.v_lo), v_hi(o.v_hi) {};
+__forceinline __vec16_i64(__m512i l, __m512i h): v_lo(l), v_hi(h) {};
 __vec16_i64(int64_t v00, int64_t v01, int64_t v02, int64_t v03,
 int64_t v04, int64_t v05, int64_t v06, int64_t v07,
 int64_t v08, int64_t v09, int64_t v10, int64_t v11,
@@ -167,9 +181,10 @@ typedef struct PRE_ALIGN(64) __vec16_i64 {
 __m512i v_lo;
 } POST_ALIGN(64) __vec16_i64;
 
-FORCEINLINE __vec16_i1::__vec16_i1(const __vec16_i32& in) {
-m = _mm512_test_epi32_mask(in, in);
-}
+FORCEINLINE __vec16_i64::__vec16_i64()
+: v_lo(_mm512_undefined_epi32()),
+v_hi(_mm512_undefined_epi32())
+{}
 
 template <typename T>
 struct vec16 {
@@ -619,7 +634,7 @@ static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, int32_t n) {
 return _mm512_srai_epi32((__m512i)a, n);
 }
 
-static FORCEINLINE __vec16_i1 __equal(__vec16_i32 a, __vec16_i32 b) {
+static FORCEINLINE __vec16_i1 __equal(const __vec16_i32 &a, const __vec16_i32 &b) {
 return _mm512_cmpeq_epi32_mask((__m512i)a, (__m512i)b);
 }
 
@@ -721,10 +736,26 @@ template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) {
 ///////////////////////////////////////////////////////////////////////////
 // int64
 
-BINARY_OP(__vec16_i64, __add, +)
+static FORCEINLINE __vec16_i64 __add(const __vec16_i64 &a, const __vec16_i64 &b)
+{
+__mmask16 carry = 0;
+__m512i lo = _mm512_addsetc_epi32(a.v_lo, b.v_lo, &carry);
+__m512i hi = _mm512_adc_epi32(a.v_hi, carry, b.v_hi, &carry);
+return __vec16_i64(lo, hi);
+}
+
 BINARY_OP(__vec16_i64, __sub, -)
 BINARY_OP(__vec16_i64, __mul, *)
+
+/*! 64x32 bit mul -- address computations often use a scale that we
+know is 32 bits; and 32x64 is faster than 64x64 */
+static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &b)
+{
+return __vec16_i64(_mm512_mullo_epi32(a.v,b.v_lo),
+_mm512_add_epi32(b.v_hi, _mm512_mulhi_epi32(a.v, b.v_lo)));
+}
+
 BINARY_OP(__vec16_i64, __or, |)
 BINARY_OP(__vec16_i64, __and, &)
 BINARY_OP(__vec16_i64, __xor, ^)
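The 64x32 __mul added above relies on the split-word decomposition the comment alludes to: when the scale is known to fit in 32 bits, the low 64 bits of the product come from one 32x32-to-64 multiply of the low words plus a truncated multiply of the high words. A scalar sketch of that general decomposition (model arithmetic only, with illustrative names; it is not the committed intrinsic sequence):

// --- illustrative sketch, not part of the commit ---
// General 64x32 decomposition: for a 32-bit scale s and a 64-bit value hi:lo,
// the low 64 bits of the product are (s*lo) plus (s*hi mod 2^32) shifted into the upper half.
#include <cstdint>
#include <cassert>

static uint64_t mul64x32(uint32_t hi, uint32_t lo, uint32_t s) {
    uint64_t lo_prod = uint64_t(s) * lo;              // full 64-bit product of the low words
    uint32_t result_lo = uint32_t(lo_prod);           // low word of the result (like a per-lane mullo)
    uint32_t result_hi = uint32_t(lo_prod >> 32)      // carry from the low product (like a per-lane mulhi)
                       + uint32_t(uint64_t(s) * hi);  // high-word contribution, truncated to 32 bits
    return (uint64_t(result_hi) << 32) | result_lo;
}

int main() {
    uint64_t v = 0x0000000300000007ull;               // hi = 3, lo = 7
    uint32_t s = 10;
    assert(mul64x32(uint32_t(v >> 32), uint32_t(v), s) == v * s);
    return 0;
}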
@@ -742,8 +773,15 @@ SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>)
 SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>)
 SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<)
 
-CMP_OP(__vec16_i64, int64_t, __equal, ==)
-CMP_OP(__vec16_i64, int64_t, __not_equal, !=)
+static FORCEINLINE __vec16_i1 __equal(const __vec16_i64 &a, const __vec16_i64 &b) {
+const __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo);
+return _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi);
+}
+
+static FORCEINLINE __vec16_i1 __not_equal(const __vec16_i64 &a, const __vec16_i64 &b) {
+return __not(__equal(a,b));
+}
+
 CMP_OP(__vec16_i64, uint64_t, __unsigned_less_equal, <=)
 CMP_OP(__vec16_i64, int64_t, __signed_less_equal, <=)
 CMP_OP(__vec16_i64, uint64_t, __unsigned_greater_equal, >=)
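The __equal added above combines two 16-lane compares: a lane compares equal only if both its low and high 32-bit words match, which the code expresses by using the low-word compare mask as the write mask of the high-word compare; __not_equal is simply the complement. A scalar mask-model sketch (illustrative names, not part of the commit):

// --- illustrative sketch, not part of the commit ---
// Mask model of 64-bit equality on the split lo/hi representation:
// bit i of the result is set only when both 32-bit halves of lane i match.
#include <cstdint>

struct Vec16I64Model {              // hypothetical stand-in for __vec16_i64
    uint32_t lo[16];
    uint32_t hi[16];
};

static uint16_t equal64(const Vec16I64Model &a, const Vec16I64Model &b) {
    uint16_t lo_match = 0;
    for (int i = 0; i < 16; ++i)                           // first compare: low words
        if (a.lo[i] == b.lo[i]) lo_match |= uint16_t(1u << i);
    uint16_t result = 0;
    for (int i = 0; i < 16; ++i)                           // second compare: high words,
        if ((lo_match & (1u << i)) && a.hi[i] == b.hi[i])  // but only in lanes that already matched low
            result |= uint16_t(1u << i);
    return result;
}

static uint16_t not_equal64(const Vec16I64Model &a, const Vec16I64Model &b) {
    return uint16_t(~equal64(a, b));                       // complement, as in __not(__equal(a,b))
}

int main() {
    Vec16I64Model a = {}, b = {};
    a.lo[3] = 5; b.lo[3] = 5; a.hi[3] = 1; b.hi[3] = 2;    // lane 3: low words match, high words differ
    return (equal64(a, b) & (1u << 3)) ? 1 : 0;            // expect 0: lane 3 is not equal
}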
@@ -755,7 +793,18 @@ CMP_OP(__vec16_i64, int64_t, __signed_greater_than, >)
 
 SELECT(__vec16_i64)
 INSERT_EXTRACT(__vec16_i64, int64_t)
-SMEAR(__vec16_i64, i64, int64_t)
+
+static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, int index)
+{
+uint *src = (uint *)&v;
+return src[index+16] | (int64_t(src[index]) << 32);
+}
+
+static FORCEINLINE __vec16_i64 __smear_i64(__vec16_i64, const int64_t &l) {
+const int *i = (const int*)&l;
+return __vec16_i64(_mm512_set_1to16_epi32(i[0]), _mm512_set_1to16_epi32(i[1]));
+}
+
 BROADCAST(__vec16_i64, i64, int64_t)
 ROTATE(__vec16_i64, i64, int64_t)
 SHUFFLES(__vec16_i64, i64, int64_t)
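__smear_i64 above broadcasts the two 32-bit words of the scalar into the lo and hi registers respectively, and __extract_element recombines one lane from the two halves. A scalar sketch of that round trip, assuming the little-endian word order the pointer casts rely on (illustrative model, not part of the commit):

// --- illustrative sketch, not part of the commit ---
// Round trip between a 64-bit scalar and the split lo/hi lane representation.
#include <cstdint>
#include <cstring>
#include <cassert>

struct Vec16I64Model {              // hypothetical stand-in for __vec16_i64
    uint32_t lo[16];
    uint32_t hi[16];
};

static Vec16I64Model smear64(int64_t value) {
    uint32_t words[2];
    std::memcpy(words, &value, sizeof(words));    // words[0] = low half, words[1] = high half (little-endian)
    Vec16I64Model r;
    for (int i = 0; i < 16; ++i) {                // like the two _mm512_set_1to16_epi32 broadcasts
        r.lo[i] = words[0];
        r.hi[i] = words[1];
    }
    return r;
}

static int64_t extract64(const Vec16I64Model &v, int index) {
    return int64_t((uint64_t(v.hi[index]) << 32) | v.lo[index]);  // recombine the halves of one lane
}

int main() {
    Vec16I64Model v = smear64(-42);
    assert(extract64(v, 7) == -42);
    return 0;
}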
@@ -765,10 +814,10 @@ LOAD_STORE(__vec16_i64, int64_t)
 template <int ALIGN> static FORCEINLINE __vec16_i64 __load(__vec16_i64 *p) {
 __m512i v1;
 __m512i v2;
-v1 = _mm512_extloadunpackhi_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-v1 = _mm512_extloadunpacklo_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-v2 = _mm512_extloadunpackhi_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-v2 = _mm512_extloadunpacklo_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+v2 = _mm512_extloadunpackhi_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+v2 = _mm512_extloadunpacklo_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+v1 = _mm512_extloadunpackhi_epi32(v2, ((uint8_t*)p)+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+v1 = _mm512_extloadunpacklo_epi32(v2, ((uint8_t*)p)+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
 
 __vec16_i64 ret;
 ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0xFF00,
@@ -787,8 +836,8 @@ template <int ALIGN> static FORCEINLINE __vec16_i64 __load(__vec16_i64 *p) {
 }
 
 template <> static FORCEINLINE __vec16_i64 __load<64>(__vec16_i64 *p) {
-__m512i v1 = _mm512_load_epi32(p);
-__m512i v2 = _mm512_load_epi32(((uint8_t*)p)+64);
+__m512i v2 = _mm512_load_epi32(p);
+__m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64);
 __vec16_i64 ret;
 ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0xFF00,
 _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0),
@@ -820,10 +869,10 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64
 v2 = _mm512_mask_permutevar_epi32(v2, 0x5555,
 _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
 v.v_lo);
-_mm512_extpackstorehi_epi32(p, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
-_mm512_extpackstorelo_epi32(p, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
-_mm512_extpackstorehi_epi32(((uint8_t*)p)+64, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
-_mm512_extpackstorelo_epi32(((uint8_t*)p)+64, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
+_mm512_extpackstorehi_epi32(p, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
+_mm512_extpackstorelo_epi32(p, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
+_mm512_extpackstorehi_epi32(((uint8_t*)p)+64, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
+_mm512_extpackstorelo_epi32(((uint8_t*)p)+64, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
 }
 
 template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) {
@@ -841,8 +890,8 @@ template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) {
 v2 = _mm512_mask_permutevar_epi32(v2, 0x5555,
 _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
 v.v_lo);
-_mm512_store_epi64(p, v1);
-_mm512_store_epi64(((uint8_t*)p)+64, v2);
+_mm512_store_epi64(p, v2);
+_mm512_store_epi64(((uint8_t*)p)+64, v1);
 }
 
 
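The __load/__store fixes above deal with the layout mismatch: in memory a __vec16_i64 is sixteen consecutive 64-bit values, i.e. interleaved low/high 32-bit word pairs, while in registers the low words and high words live in separate __m512i vectors. Loading therefore de-interleaves, storing re-interleaves, and the corrected code reads and writes the second 64-byte half of the object at p+64. A scalar sketch of the de-interleave/re-interleave (illustrative model, not part of the commit):

// --- illustrative sketch, not part of the commit ---
// Memory image: 16 elements x 2 words; element i occupies words[2*i] (low) and words[2*i+1] (high).
// Register form: all low words in one 16-lane vector, all high words in the other.
#include <cstdint>
#include <cassert>

struct Vec16I64Model { uint32_t lo[16]; uint32_t hi[16]; };  // hypothetical stand-in for __vec16_i64

static Vec16I64Model load_model(const uint32_t (&words)[32]) {
    Vec16I64Model r;
    for (int i = 0; i < 16; ++i) {
        r.lo[i] = words[2 * i];        // de-interleave: even words gather into the lo vector
        r.hi[i] = words[2 * i + 1];    // odd words gather into the hi vector
    }
    return r;
}

static void store_model(uint32_t (&words)[32], const Vec16I64Model &v) {
    for (int i = 0; i < 16; ++i) {
        words[2 * i] = v.lo[i];        // re-interleave on the way back to memory
        words[2 * i + 1] = v.hi[i];
    }
}

int main() {
    uint32_t mem[32];
    for (int i = 0; i < 16; ++i) { mem[2 * i] = 100u + i; mem[2 * i + 1] = uint32_t(i); }
    Vec16I64Model v = load_model(mem);
    uint32_t out[32] = {};
    store_model(out, v);
    for (int i = 0; i < 32; ++i) assert(out[i] == mem[i]);   // lossless round trip
    return 0;
}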
@@ -1161,7 +1210,11 @@ static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) { \
 CAST_SEXT_I1(__vec16_i8)
 CAST_SEXT_I1(__vec16_i16)
 CAST_SEXT_I1(__vec16_i32)
-CAST_SEXT_I1(__vec16_i64)
+
+static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val)
+{
+return __vec16_i64(val.v,_mm512_srai_epi32(val.v,31));
+}
 
 // zero extension
 CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext)
@@ -1171,6 +1224,11 @@ CAST(__vec16_i32, uint32_t, __vec16_i16, uint16_t, __cast_zext)
 CAST(__vec16_i32, uint32_t, __vec16_i8, uint8_t, __cast_zext)
 CAST(__vec16_i16, uint16_t, __vec16_i8, uint8_t, __cast_zext)
 
+static FORCEINLINE __vec16_i64 __cast_zext(const __vec16_i64 &, const __vec16_i32 &val)
+{
+return __vec16_i64(val.v, _mm512_setzero_epi32());
+}
+
 #define CAST_ZEXT_I1(TYPE)
 /*
 static FORCEINLINE TYPE __cast_zext(TYPE, __vec16_i1 v) { \
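The two casts above widen 32-bit lanes into the split 64-bit form: __cast_sext fills the high half with copies of each lane's sign bit (an arithmetic shift right by 31), while __cast_zext pairs each lane with zero. A scalar sketch (illustrative model, not part of the commit):

// --- illustrative sketch, not part of the commit ---
// Widening a 32-bit lane into the split lo/hi 64-bit form.
#include <cstdint>
#include <cassert>

struct Vec16I64Model { uint32_t lo[16]; uint32_t hi[16]; };  // hypothetical stand-in for __vec16_i64

static Vec16I64Model sext_model(const int32_t (&val)[16]) {
    Vec16I64Model r;
    for (int i = 0; i < 16; ++i) {
        r.lo[i] = uint32_t(val[i]);
        r.hi[i] = (val[i] < 0) ? 0xFFFFFFFFu : 0u;  // smeared sign bit, the effect of _mm512_srai_epi32(val, 31)
    }
    return r;
}

static Vec16I64Model zext_model(const uint32_t (&val)[16]) {
    Vec16I64Model r;
    for (int i = 0; i < 16; ++i) {
        r.lo[i] = val[i];
        r.hi[i] = 0;                                // high half is simply zero, like _mm512_setzero_epi32()
    }
    return r;
}

int main() {
    int32_t s[16] = { -1, 5 };                      // remaining lanes default to 0
    Vec16I64Model v = sext_model(s);
    assert(v.hi[0] == 0xFFFFFFFFu && v.hi[1] == 0);
    return 0;
}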
@@ -1459,8 +1517,11 @@ static FORCEINLINE __vec16_f __sqrt_varying_float(__vec16_f v) {
 }
 
 static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) {
+#ifdef ISPC_FAST_MATH
 return _mm512_recip_ps(v);
-//return _mm512_rcp23_ps(v); // 23-bits of accuracy
+#else
+return _mm512_rcp23_ps(v); // 23-bits of accuracy
+#endif
 }
 
 static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) {
@@ -1752,7 +1813,6 @@ GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
 GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8)
 GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16)
 GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16)
-//GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32)
 GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32)
 GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64)
 GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64)
@@ -1777,7 +1837,7 @@ GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __gather64_i16)
 GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i32, __gather32_i32)
 GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __gather64_i32)
 GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i32, __gather32_i64)
-GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64)
+// GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64)
 
 /*
 static __forceinline __vec16_i32 __gather64_i32(__vec16_i64 ptrs, __vec16_i1 mask) {
@@ -1820,7 +1880,6 @@ SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i
 SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8)
 SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16)
 SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16)
-//SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32)
 SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32)
 SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64)
 SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64)
@@ -1828,14 +1887,21 @@ SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64
 static FORCEINLINE void
 __scatter_base_offsets32_i32(uint8_t *b, __vec16_i32 varyingOffset,
 uint32_t scale, __vec16_i32 constOffset,
-__vec16_i32 val, __vec16_i1 mask) {
-__vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
-__vec16_i32 offsets = __add(__mul(vscale, varyingOffset), constOffset);
+__vec16_i32 val, __vec16_i1 mask)
+{
+__vec16_i32 offsets = __add(__mul(__vec16_i32(scale), varyingOffset), constOffset);
 
-// Loop is generated by the intrinsic
 _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, _MM_DOWNCONV_EPI32_NONE, 1, _MM_HINT_NONE);
 }
 
+static FORCEINLINE void
+__scatter_base_offsets32_float(void *base, const __vec16_i32 &varyingOffset,
+uint32_t scale, const __vec16_i32 &constOffset,
+const __vec16_f &val, const __vec16_i1 mask)
+{
+__vec16_i32 offsets = __add(__mul(varyingOffset,__vec16_i32(scale)), constOffset);
+_mm512_mask_i32extscatter_ps(base, mask, offsets, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_1, _MM_HINT_NONE);
+}
+
 #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC)
 /*
 static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \
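__scatter_base_offsets32_float above folds the uniform scale into the 32-bit offsets (varyingOffset * scale + constOffset) and then issues a single masked scatter with byte granularity from the base pointer. A scalar sketch of the addressing it performs (illustrative model, not part of the commit):

// --- illustrative sketch, not part of the commit ---
// Scalar model of the masked base+offsets float scatter:
// each active lane stores to base + varyingOffset[i]*scale + constOffset[i] (byte addressing).
#include <cstdint>
#include <cstring>
#include <cstdio>

static void scatter_base_offsets32_float_model(void *base,
                                               const int32_t (&varyingOffset)[16],
                                               uint32_t scale,
                                               const int32_t (&constOffset)[16],
                                               const float (&val)[16],
                                               uint16_t mask) {
    for (int i = 0; i < 16; ++i) {
        if (!(mask & (1u << i)))
            continue;                                   // inactive lanes store nothing
        int32_t offset = varyingOffset[i] * int32_t(scale) + constOffset[i];
        std::memcpy(static_cast<uint8_t *>(base) + offset, &val[i], sizeof(float));
    }
}

int main() {
    float dst[16] = {};
    int32_t varying[16], constOff[16] = {};
    float val[16];
    for (int i = 0; i < 16; ++i) { varying[i] = i; val[i] = float(i) * 0.5f; }
    scatter_base_offsets32_float_model(dst, varying, sizeof(float), constOff, val, 0x00FF);
    printf("dst[3] = %g, dst[12] = %g\n", dst[3], dst[12]);   // expect 1.5 and 0 (lane 12 masked off)
    return 0;
}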
@@ -1846,8 +1912,7 @@ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \
 *ptr = val.v[i]; \
 } \
 }
-*/
-SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8)
+*/ SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8)
 SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __scatter64_i8)
 SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i32, __scatter32_i16)
 SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __scatter64_i16)
@@ -2158,3 +2223,4 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval,
 #undef PRE_ALIGN
 #undef POST_ALIGN
 
+