Ongoing support for int64 on KNC:
Fixes to __load and __store. Added __add, __mul, __equal, __not_equal, __extract_element, __smear_i64, __cast_sext, __cast_zext, and __scatter_base_offsets32_float. __rcp_varying_float now has both a fast-math and a full-precision implementation.
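(For orientation only, not part of the commit: a minimal scalar sketch of the split lo/hi representation the new __vec16_i64 routines below operate on, and of the carry-propagating add that the _mm512_addsetc_epi32/_mm512_adc_epi32 pair performs across all 16 lanes at once. Names here are illustrative.)

#include <cstdint>
#include <cstdio>

// One lane of __vec16_i64 modeled in scalar code: the 64-bit value is kept
// as two 32-bit halves, matching the v_lo/v_hi pair used throughout the diff.
struct lane_i64 { uint32_t lo, hi; };

// Per-lane equivalent of the new __add(): add the low halves, capture the
// carry, then add the high halves plus that carry.
static lane_i64 add_lane(lane_i64 a, lane_i64 b) {
    uint64_t lo_sum = (uint64_t)a.lo + b.lo;
    uint32_t carry  = (uint32_t)(lo_sum >> 32);
    return { (uint32_t)lo_sum, a.hi + b.hi + carry };
}

int main() {
    lane_i64 a = { 0xFFFFFFFFu, 0u };         // 0x00000000FFFFFFFF
    lane_i64 b = { 1u, 0u };                  // 1
    lane_i64 c = add_lane(a, b);
    printf("%08x%08x\n", c.hi, c.lo);         // prints 0000000100000000
    return 0;
}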
@@ -34,6 +34,7 @@
#include <stdint.h>
#include <math.h>
#include <assert.h>
#include <unistd.h>

#include <immintrin.h>
#include <zmmintrin.h>
@@ -52,7 +53,13 @@

#define KNC 1
extern "C" {
int printf(const unsigned char *, ...);
int puts(unsigned char *);
unsigned int putchar(unsigned int);
int fflush(void *);
uint8_t *memcpy(uint8_t *, uint8_t *, uint64_t );
uint8_t *memset(uint8_t *, uint8_t, uint64_t );
void memset_pattern16(void *, const void *, uint64_t );
}
@@ -130,9 +137,10 @@ typedef struct PRE_ALIGN(64) __vec16_d {

typedef struct PRE_ALIGN(64) __vec16_i32 {
    operator __m512i() const { return v; }
    __vec16_i32() { }
    __vec16_i32(const __m512i& in) { v = in; }
    __vec16_i32(const __vec16_i32& in) { v = in.v; }
    __vec16_i32() {}
    __vec16_i32(const int32_t &in) : v(_mm512_set_1to16_epi32(in)) {};
    __vec16_i32(const __m512i &in) { v = in; }
    __vec16_i32(const __vec16_i32 &in) { v = in.v; }
    __vec16_i32(int32_t v00, int32_t v01, int32_t v02, int32_t v03,
                int32_t v04, int32_t v05, int32_t v06, int32_t v07,
                int32_t v08, int32_t v09, int32_t v10, int32_t v11,
@@ -142,8 +150,14 @@ typedef struct PRE_ALIGN(64) __vec16_i32 {
    __m512i v;
} POST_ALIGN(64) __vec16_i32;

FORCEINLINE __vec16_i1::__vec16_i1(const __vec16_i32& in) {
    m = _mm512_test_epi32_mask(in, in);
}

typedef struct PRE_ALIGN(64) __vec16_i64 {
    __vec16_i64() { }
    __forceinline __vec16_i64();
    __forceinline __vec16_i64(const __vec16_i64 &o) : v_lo(o.v_lo), v_hi(o.v_hi) {};
    __forceinline __vec16_i64(__m512i l, __m512i h): v_lo(l), v_hi(h) {};
    __vec16_i64(int64_t v00, int64_t v01, int64_t v02, int64_t v03,
                int64_t v04, int64_t v05, int64_t v06, int64_t v07,
                int64_t v08, int64_t v09, int64_t v10, int64_t v11,
@@ -167,9 +181,10 @@ typedef struct PRE_ALIGN(64) __vec16_i64 {
    __m512i v_lo;
} POST_ALIGN(64) __vec16_i64;

FORCEINLINE __vec16_i1::__vec16_i1(const __vec16_i32& in) {
    m = _mm512_test_epi32_mask(in, in);
}
FORCEINLINE __vec16_i64::__vec16_i64()
    : v_lo(_mm512_undefined_epi32()),
      v_hi(_mm512_undefined_epi32())
{}

template <typename T>
struct vec16 {
@@ -619,7 +634,7 @@ static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, int32_t n) {
    return _mm512_srai_epi32((__m512i)a, n);
}

static FORCEINLINE __vec16_i1 __equal(__vec16_i32 a, __vec16_i32 b) {
static FORCEINLINE __vec16_i1 __equal(const __vec16_i32 &a, const __vec16_i32 &b) {
    return _mm512_cmpeq_epi32_mask((__m512i)a, (__m512i)b);
}
@@ -721,10 +736,26 @@ template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) {
///////////////////////////////////////////////////////////////////////////
// int64

BINARY_OP(__vec16_i64, __add, +)

static FORCEINLINE __vec16_i64 __add(const __vec16_i64 &a, const __vec16_i64 &b)
{
    __mmask16 carry = 0;
    __m512i lo = _mm512_addsetc_epi32(a.v_lo, b.v_lo, &carry);
    __m512i hi = _mm512_adc_epi32(a.v_hi, carry, b.v_hi, &carry);
    return __vec16_i64(lo, hi);
}

BINARY_OP(__vec16_i64, __sub, -)
BINARY_OP(__vec16_i64, __mul, *)

/*! 64x32 bit mul -- address computations often use a scale that we
    know is 32 bits; and 32x64 is faster than 64x64 */
static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &b)
{
    return __vec16_i64(_mm512_mullo_epi32(a.v,b.v_lo),
                       _mm512_add_epi32(b.v_hi, _mm512_mulhi_epi32(a.v, b.v_lo)));
}

BINARY_OP(__vec16_i64, __or, |)
BINARY_OP(__vec16_i64, __and, &)
BINARY_OP(__vec16_i64, __xor, ^)
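(Illustrative aside, not from the commit: the comment above motivates preferring a 32x64-bit product. One way such a product can be decomposed into 32x32-bit pieces in scalar code; in the intrinsic version the low partial product is what _mm512_mullo_epi32/_mm512_mulhi_epi32 supply. Names here are illustrative.)

#include <cstdint>

// Hypothetical scalar decomposition of a 32-bit scale times a 64-bit offset,
// result truncated to 64 bits, using only 32x32 multiplies.
struct u64_split { uint32_t lo, hi; };

static u64_split mul_32x64(uint32_t a, u64_split b) {
    uint64_t p_lo = (uint64_t)a * b.lo;       // full 64-bit product with the low half
    uint32_t hi   = (uint32_t)(p_lo >> 32)    // carry out of the low partial product
                  + a * b.hi;                 // low 32 bits of a * high half
    return { (uint32_t)p_lo, hi };
}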
@@ -742,8 +773,15 @@ SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<)

CMP_OP(__vec16_i64, int64_t, __equal, ==)
CMP_OP(__vec16_i64, int64_t, __not_equal, !=)
static FORCEINLINE __vec16_i1 __equal(const __vec16_i64 &a, const __vec16_i64 &b) {
    const __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo);
    return _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi);
}

static FORCEINLINE __vec16_i1 __not_equal(const __vec16_i64 &a, const __vec16_i64 &b) {
    return __not(__equal(a,b));
}

CMP_OP(__vec16_i64, uint64_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i64, int64_t, __signed_less_equal, <=)
CMP_OP(__vec16_i64, uint64_t, __unsigned_greater_equal, >=)
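(Illustrative aside, not from the commit: a plain-C++ stand-in for the two-step 64-bit equality above -- compare the low halves to build a lane mask, then compare the high halves only on lanes that already matched, which is what the masked compare does.)

#include <cstdint>

static uint16_t equal_i64_lanes(const uint32_t lo_a[16], const uint32_t hi_a[16],
                                const uint32_t lo_b[16], const uint32_t hi_b[16]) {
    uint16_t lo_match = 0, match = 0;
    for (int i = 0; i < 16; ++i)            // stand-in for _mm512_cmpeq_epi32_mask on the low halves
        if (lo_a[i] == lo_b[i]) lo_match |= (uint16_t)(1u << i);
    for (int i = 0; i < 16; ++i)            // stand-in for the masked compare on the high halves
        if ((lo_match & (1u << i)) && hi_a[i] == hi_b[i]) match |= (uint16_t)(1u << i);
    return match;                           // __not_equal is the bitwise complement of this mask
}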
@@ -755,7 +793,18 @@ CMP_OP(__vec16_i64, int64_t, __signed_greater_than, >)

SELECT(__vec16_i64)
INSERT_EXTRACT(__vec16_i64, int64_t)
SMEAR(__vec16_i64, i64, int64_t)

static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, int index)
{
    uint *src = (uint *)&v;
    return src[index+16] | (int64_t(src[index]) << 32);
}

static FORCEINLINE __vec16_i64 __smear_i64(__vec16_i64, const int64_t &l) {
    const int *i = (const int*)&l;
    return __vec16_i64(_mm512_set_1to16_epi32(i[0]), _mm512_set_1to16_epi32(i[1]));
}

BROADCAST(__vec16_i64, i64, int64_t)
ROTATE(__vec16_i64, i64, int64_t)
SHUFFLES(__vec16_i64, i64, int64_t)
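(Illustrative aside, not from the commit: scalar equivalents of the two routines above, assuming -- as the pointer arithmetic in __extract_element implies -- that the 16 high halves are stored before the 16 low halves, and a little-endian host for the __smear_i64 split.)

#include <cstdint>
#include <cstring>

// Rebuild lane `index` from word `index` (high half) and word `index+16` (low half).
static int64_t extract_lane(const uint32_t words[32], int index) {
    return (int64_t)words[index + 16] | ((int64_t)words[index] << 32);
}

// Split a scalar into its two 32-bit halves and broadcast each half to all 16 lanes.
static void smear_lane(int64_t value, uint32_t lo_out[16], uint32_t hi_out[16]) {
    uint32_t halves[2];
    std::memcpy(halves, &value, sizeof(halves));  // halves[0] = low, halves[1] = high (little-endian)
    for (int i = 0; i < 16; ++i) { lo_out[i] = halves[0]; hi_out[i] = halves[1]; }
}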
@@ -765,10 +814,10 @@ LOAD_STORE(__vec16_i64, int64_t)
template <int ALIGN> static FORCEINLINE __vec16_i64 __load(__vec16_i64 *p) {
    __m512i v1;
    __m512i v2;
    v1 = _mm512_extloadunpackhi_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
    v1 = _mm512_extloadunpacklo_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
    v2 = _mm512_extloadunpackhi_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
    v2 = _mm512_extloadunpacklo_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
    v2 = _mm512_extloadunpackhi_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
    v2 = _mm512_extloadunpacklo_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
    v1 = _mm512_extloadunpackhi_epi32(v2, ((uint8_t*)p)+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
    v1 = _mm512_extloadunpacklo_epi32(v2, ((uint8_t*)p)+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);

    __vec16_i64 ret;
    ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0xFF00,
@@ -787,8 +836,8 @@ template <int ALIGN> static FORCEINLINE __vec16_i64 __load(__vec16_i64 *p) {
}

template <> static FORCEINLINE __vec16_i64 __load<64>(__vec16_i64 *p) {
    __m512i v1 = _mm512_load_epi32(p);
    __m512i v2 = _mm512_load_epi32(((uint8_t*)p)+64);
    __m512i v2 = _mm512_load_epi32(p);
    __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64);
    __vec16_i64 ret;
    ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0xFF00,
        _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0),
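(Illustrative aside, not from the commit: the end result the __load fixes above are computing. Memory holds 16 consecutive int64 values; on a little-endian target each one is a low dword followed by a high dword, so the 32 loaded dwords must be de-interleaved into an all-low-halves register and an all-high-halves register -- the job of the permute steps. A scalar sketch:)

#include <cstdint>

static void load_split(const int64_t *p, uint32_t lo[16], uint32_t hi[16]) {
    const uint32_t *words = (const uint32_t *)p;
    for (int i = 0; i < 16; ++i) {
        lo[i] = words[2 * i];        // even dwords: low halves
        hi[i] = words[2 * i + 1];    // odd dwords: high halves
    }
}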
@@ -820,10 +869,10 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64
    v2 = _mm512_mask_permutevar_epi32(v2, 0x5555,
        _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
        v.v_lo);
    _mm512_extpackstorehi_epi32(p, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
    _mm512_extpackstorelo_epi32(p, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
    _mm512_extpackstorehi_epi32(((uint8_t*)p)+64, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
    _mm512_extpackstorelo_epi32(((uint8_t*)p)+64, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
    _mm512_extpackstorehi_epi32(p, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
    _mm512_extpackstorelo_epi32(p, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
    _mm512_extpackstorehi_epi32(((uint8_t*)p)+64, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
    _mm512_extpackstorelo_epi32(((uint8_t*)p)+64, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
}

template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) {
@@ -841,8 +890,8 @@ template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) {
    v2 = _mm512_mask_permutevar_epi32(v2, 0x5555,
        _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
        v.v_lo);
    _mm512_store_epi64(p, v1);
    _mm512_store_epi64(((uint8_t*)p)+64, v2);
    _mm512_store_epi64(p, v2);
    _mm512_store_epi64(((uint8_t*)p)+64, v1);
}
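(Companion sketch to the load one above, equally illustrative and not from the commit: __store is the inverse -- re-interleave the low and high halves back into 16 consecutive int64 slots before writing the two 64-byte blocks.)

#include <cstdint>

static void store_interleave(int64_t *p, const uint32_t lo[16], const uint32_t hi[16]) {
    uint32_t *words = (uint32_t *)p;
    for (int i = 0; i < 16; ++i) {
        words[2 * i]     = lo[i];    // low dword of element i
        words[2 * i + 1] = hi[i];    // high dword of element i
    }
}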
@@ -1161,7 +1210,11 @@ static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) { \
CAST_SEXT_I1(__vec16_i8)
CAST_SEXT_I1(__vec16_i16)
CAST_SEXT_I1(__vec16_i32)
CAST_SEXT_I1(__vec16_i64)

static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val)
{
    return __vec16_i64(val.v,_mm512_srai_epi32(val.v,31));
}

// zero extension
CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext)
@@ -1171,6 +1224,11 @@ CAST(__vec16_i32, uint32_t, __vec16_i16, uint16_t, __cast_zext)
CAST(__vec16_i32, uint32_t, __vec16_i8, uint8_t, __cast_zext)
CAST(__vec16_i16, uint16_t, __vec16_i8, uint8_t, __cast_zext)

static FORCEINLINE __vec16_i64 __cast_zext(const __vec16_i64 &, const __vec16_i32 &val)
{
    return __vec16_i64(val.v, _mm512_setzero_epi32());
}

#define CAST_ZEXT_I1(TYPE)
/*
static FORCEINLINE TYPE __cast_zext(TYPE, __vec16_i1 v) { \
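(Illustrative aside, not from the commit: the per-lane effect of the two new widening casts. Sign extension fills the high half with copies of the sign bit -- the arithmetic shift by 31 that _mm512_srai_epi32 performs -- while zero extension fills it with zeros.)

#include <cstdint>

static void sext_lane(int32_t v, uint32_t *lo, uint32_t *hi) {
    *lo = (uint32_t)v;
    *hi = (uint32_t)(v >> 31);   // 0xFFFFFFFF for negative v, 0 otherwise (arithmetic shift on mainstream compilers)
}

static void zext_lane(uint32_t v, uint32_t *lo, uint32_t *hi) {
    *lo = v;
    *hi = 0;
}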
@@ -1459,8 +1517,11 @@ static FORCEINLINE __vec16_f __sqrt_varying_float(__vec16_f v) {
}

static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) {
#ifdef ISPC_FAST_MATH
    return _mm512_recip_ps(v);
    //return _mm512_rcp23_ps(v); // 23-bits of accuracy
#else
    return _mm512_rcp23_ps(v); // 23-bits of accuracy
#endif
}

static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) {
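(Illustrative aside, not from the commit and not tied to the KNC intrinsics: the general trade-off the ISPC_FAST_MATH switch above selects between. A fast reciprocal estimate can be sharpened toward full precision with one Newton-Raphson step, r' = r * (2 - x * r); the full-precision path here simply calls the more accurate 23-bit intrinsic instead.)

// Hypothetical scalar illustration: refine a crude reciprocal estimate of 1/x.
static float refine_rcp(float x, float r) {
    return r * (2.0f - x * r);   // one Newton-Raphson iteration
}
// e.g. refine_rcp(3.0f, 0.3f) returns 0.33f, closer to 1/3 than the 0.3f estimate.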
@@ -1752,7 +1813,6 @@ GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8)
GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16)
GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16)
//GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32)
GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32)
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64)
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64)
@@ -1777,7 +1837,7 @@ GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __gather64_i16)
GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i32, __gather32_i32)
GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __gather64_i32)
GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i32, __gather32_i64)
GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64)
// GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64)

/*
static __forceinline __vec16_i32 __gather64_i32(__vec16_i64 ptrs, __vec16_i1 mask) {
@@ -1820,7 +1880,6 @@ SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8)
SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16)
SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16)
//SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32)
SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32)
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64)
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64)
@@ -1828,14 +1887,21 @@ SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64
static FORCEINLINE void
__scatter_base_offsets32_i32(uint8_t *b, __vec16_i32 varyingOffset,
                             uint32_t scale, __vec16_i32 constOffset,
                             __vec16_i32 val, __vec16_i1 mask) {
    __vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
    __vec16_i32 offsets = __add(__mul(vscale, varyingOffset), constOffset);

    // Loop is generated by the intrinsic
                             __vec16_i32 val, __vec16_i1 mask)
{
    __vec16_i32 offsets = __add(__mul(__vec16_i32(scale), varyingOffset), constOffset);
    _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, _MM_DOWNCONV_EPI32_NONE, 1, _MM_HINT_NONE);
}

static FORCEINLINE void
__scatter_base_offsets32_float(void *base, const __vec16_i32 &varyingOffset,
                               uint32_t scale, const __vec16_i32 &constOffset,
                               const __vec16_f &val, const __vec16_i1 mask)
{
    __vec16_i32 offsets = __add(__mul(varyingOffset,__vec16_i32(scale)), constOffset);
    _mm512_mask_i32extscatter_ps(base, mask, offsets, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_1, _MM_HINT_NONE);
}

#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC)
/*
static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \
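(Illustrative aside, not from the commit: a scalar stand-in for the masked scatter the new float variant above performs -- for each active lane, write val[i] to byte address base + scale * varyingOffset[i] + constOffset[i], skipping lanes whose mask bit is clear.)

#include <cstdint>

static void scatter32_float_ref(uint8_t *base, const int32_t varyingOffset[16],
                                uint32_t scale, const int32_t constOffset[16],
                                const float val[16], uint16_t mask) {
    for (int i = 0; i < 16; ++i) {
        if (!(mask & (1u << i)))
            continue;                                          // inactive lane: no write
        int32_t byteOffset = (int32_t)scale * varyingOffset[i] + constOffset[i];
        *(float *)(base + byteOffset) = val[i];
    }
}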
@@ -1846,8 +1912,7 @@ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \
        *ptr = val.v[i]; \
    } \
}
*/
SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8)
*/ SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8)
SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __scatter64_i8)
SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i32, __scatter32_i16)
SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __scatter64_i16)
@@ -2158,3 +2223,4 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval,
#undef PRE_ALIGN
#undef POST_ALIGN