Ongoing int64 support for KNC:

Fixes to __load and __store.
Added __add, __mul, __equal, __not_equal, __extract_element, __smear_i64, __cast_sext, __cast_zext,
and __scatter_base_offsets32_float.

__rcp_varying_float now has both a fast-math and a full-precision implementation.
(A scalar sketch of the split lo/hi int64 representation these operations share follows the commit metadata.)
Author: Jean-Luc Duprat
Date:   2012-07-05 16:56:13 -07:00
parent 6aad4c7a39
commit b9d1f0db18
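
The new int64 operations below all rely on the same split representation: a __vec16_i64 holds the sixteen low 32-bit halves in one __m512i (v_lo) and the sixteen high halves in another (v_hi). A minimal scalar sketch of that convention and of the carry-chain add introduced in this commit; the names lane_i64 and add_lane are hypothetical, chosen here purely for illustration:

#include <stdint.h>

// One lane of the split representation: lo lands in v_lo, hi in v_hi.
struct lane_i64 {
    uint32_t lo;
    uint32_t hi;
};

// Scalar model of the vectorized 64-bit __add below: add the low halves,
// capture the carry-out, then fold it into the sum of the high halves.
// Per lane, this is what _mm512_addsetc_epi32 / _mm512_adc_epi32 do in the
// SIMD version.
static inline lane_i64 add_lane(lane_i64 a, lane_i64 b) {
    lane_i64 r;
    r.lo = a.lo + b.lo;
    uint32_t carry = (r.lo < a.lo) ? 1u : 0u;   // unsigned wrap signals a carry
    r.hi = a.hi + b.hi + carry;
    return r;
}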


@@ -34,6 +34,7 @@
#include <stdint.h>
#include <math.h>
#include <assert.h>
#include <unistd.h>
#include <immintrin.h>
#include <zmmintrin.h>
@@ -52,7 +53,13 @@
#define KNC 1
extern "C" {
int printf(const unsigned char *, ...);
int puts(unsigned char *);
unsigned int putchar(unsigned int);
int fflush(void *);
uint8_t *memcpy(uint8_t *, uint8_t *, uint64_t );
uint8_t *memset(uint8_t *, uint8_t, uint64_t );
void memset_pattern16(void *, const void *, uint64_t );
}
@@ -130,9 +137,10 @@ typedef struct PRE_ALIGN(64) __vec16_d {
typedef struct PRE_ALIGN(64) __vec16_i32 {
operator __m512i() const { return v; }
__vec16_i32() { }
__vec16_i32(const __m512i& in) { v = in; }
__vec16_i32(const __vec16_i32& in) { v = in.v; }
__vec16_i32() {}
__vec16_i32(const int32_t &in) : v(_mm512_set_1to16_epi32(in)) {};
__vec16_i32(const __m512i &in) { v = in; }
__vec16_i32(const __vec16_i32 &in) { v = in.v; }
__vec16_i32(int32_t v00, int32_t v01, int32_t v02, int32_t v03,
int32_t v04, int32_t v05, int32_t v06, int32_t v07,
int32_t v08, int32_t v09, int32_t v10, int32_t v11,
@@ -142,8 +150,14 @@ typedef struct PRE_ALIGN(64) __vec16_i32 {
__m512i v;
} POST_ALIGN(64) __vec16_i32;
FORCEINLINE __vec16_i1::__vec16_i1(const __vec16_i32& in) {
m = _mm512_test_epi32_mask(in, in);
}
typedef struct PRE_ALIGN(64) __vec16_i64 {
__vec16_i64() { }
__forceinline __vec16_i64();
__forceinline __vec16_i64(const __vec16_i64 &o) : v_lo(o.v_lo), v_hi(o.v_hi) {};
__forceinline __vec16_i64(__m512i l, __m512i h): v_lo(l), v_hi(h) {};
__vec16_i64(int64_t v00, int64_t v01, int64_t v02, int64_t v03,
int64_t v04, int64_t v05, int64_t v06, int64_t v07,
int64_t v08, int64_t v09, int64_t v10, int64_t v11,
@@ -167,9 +181,10 @@ typedef struct PRE_ALIGN(64) __vec16_i64 {
__m512i v_lo;
} POST_ALIGN(64) __vec16_i64;
FORCEINLINE __vec16_i1::__vec16_i1(const __vec16_i32& in) {
m = _mm512_test_epi32_mask(in, in);
}
FORCEINLINE __vec16_i64::__vec16_i64()
: v_lo(_mm512_undefined_epi32()),
v_hi(_mm512_undefined_epi32())
{}
template <typename T>
struct vec16 {
@@ -619,7 +634,7 @@ static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, int32_t n) {
return _mm512_srai_epi32((__m512i)a, n);
}
static FORCEINLINE __vec16_i1 __equal(__vec16_i32 a, __vec16_i32 b) {
static FORCEINLINE __vec16_i1 __equal(const __vec16_i32 &a, const __vec16_i32 &b) {
return _mm512_cmpeq_epi32_mask((__m512i)a, (__m512i)b);
}
@@ -721,10 +736,26 @@ template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) {
///////////////////////////////////////////////////////////////////////////
// int64
BINARY_OP(__vec16_i64, __add, +)
static FORCEINLINE __vec16_i64 __add(const __vec16_i64 &a, const __vec16_i64 &b)
{
__mmask16 carry = 0;
__m512i lo = _mm512_addsetc_epi32(a.v_lo, b.v_lo, &carry);
__m512i hi = _mm512_adc_epi32(a.v_hi, carry, b.v_hi, &carry);
return __vec16_i64(lo, hi);
}
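// The carry chain above: _mm512_addsetc_epi32 writes the per-lane sums of the
// low halves and sets a bit in `carry` for every lane that overflowed;
// _mm512_adc_epi32 then adds the high halves plus that carry-in (its own
// carry-out is written back to `carry` but not used here).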
BINARY_OP(__vec16_i64, __sub, -)
BINARY_OP(__vec16_i64, __mul, *)
/*! 64x32 bit mul -- address computations often use a scale that we
know is 32 bits; and 32x64 is faster than 64x64 */
static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &b)
{
return __vec16_i64(_mm512_mullo_epi32(a.v,b.v_lo),
_mm512_add_epi32(b.v_hi, _mm512_mulhi_epi32(a.v, b.v_lo)));
}
BINARY_OP(__vec16_i64, __or, |)
BINARY_OP(__vec16_i64, __and, &)
BINARY_OP(__vec16_i64, __xor, ^)
@@ -742,8 +773,15 @@ SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<)
CMP_OP(__vec16_i64, int64_t, __equal, ==)
CMP_OP(__vec16_i64, int64_t, __not_equal, !=)
static FORCEINLINE __vec16_i1 __equal(const __vec16_i64 &a, const __vec16_i64 &b) {
const __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo);
return _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi);
}
static FORCEINLINE __vec16_i1 __not_equal(const __vec16_i64 &a, const __vec16_i64 &b) {
return __not(__equal(a,b));
}
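// Equality on the split representation: compare the low halves first, then
// compare the high halves only in lanes whose lows already matched (the
// masked compare); __not_equal is simply the complement of that mask.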
CMP_OP(__vec16_i64, uint64_t, __unsigned_less_equal, <=)
CMP_OP(__vec16_i64, int64_t, __signed_less_equal, <=)
CMP_OP(__vec16_i64, uint64_t, __unsigned_greater_equal, >=)
@@ -755,7 +793,18 @@ CMP_OP(__vec16_i64, int64_t, __signed_greater_than, >)
SELECT(__vec16_i64)
INSERT_EXTRACT(__vec16_i64, int64_t)
SMEAR(__vec16_i64, i64, int64_t)
static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, int index)
{
uint *src = (uint *)&v;
return src[index+16] | (int64_t(src[index]) << 32);
}
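// Given the member order (v_hi before v_lo), viewing the struct as 32
// consecutive uint32 lanes puts the high halves at indices 0..15 and the low
// halves at 16..31, so src[index+16] and src[index] are stitched back into
// one int64.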
static FORCEINLINE __vec16_i64 __smear_i64(__vec16_i64, const int64_t &l) {
const int *i = (const int*)&l;
return __vec16_i64(_mm512_set_1to16_epi32(i[0]), _mm512_set_1to16_epi32(i[1]));
}
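// __smear_i64 reinterprets the scalar as two 32-bit words -- on the
// little-endian KNC target i[0] is the low half and i[1] the high half --
// and broadcasts each word to all 16 lanes of the corresponding register.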
BROADCAST(__vec16_i64, i64, int64_t)
ROTATE(__vec16_i64, i64, int64_t)
SHUFFLES(__vec16_i64, i64, int64_t)
@@ -765,10 +814,10 @@ LOAD_STORE(__vec16_i64, int64_t)
template <int ALIGN> static FORCEINLINE __vec16_i64 __load(__vec16_i64 *p) {
__m512i v1;
__m512i v2;
v1 = _mm512_extloadunpackhi_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
v1 = _mm512_extloadunpacklo_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
v2 = _mm512_extloadunpackhi_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
v2 = _mm512_extloadunpacklo_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
v2 = _mm512_extloadunpackhi_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
v2 = _mm512_extloadunpacklo_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
v1 = _mm512_extloadunpackhi_epi32(v2, ((uint8_t*)p)+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
v1 = _mm512_extloadunpacklo_epi32(v2, ((uint8_t*)p)+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
__vec16_i64 ret;
ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0xFF00,
@@ -787,8 +836,8 @@ template <int ALIGN> static FORCEINLINE __vec16_i64 __load(__vec16_i64 *p) {
}
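// In memory a __vec16_i64 is sixteen consecutive int64s, i.e. interleaved
// 32-bit (lo, hi) pairs. The unaligned __load above pulls in the two 64-byte
// halves (elements 0..7 and 8..15) with the unpack intrinsics and then uses
// the masked permutes to separate the interleaved halves into v_lo and v_hi.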
template <> static FORCEINLINE __vec16_i64 __load<64>(__vec16_i64 *p) {
__m512i v1 = _mm512_load_epi32(p);
__m512i v2 = _mm512_load_epi32(((uint8_t*)p)+64);
__m512i v2 = _mm512_load_epi32(p);
__m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64);
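// Part of the __load fix: the first 64 bytes (elements 0..7) now land in v2
// and the second 64 bytes in v1; the permutes below then build v_lo and v_hi
// from them.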
__vec16_i64 ret;
ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0xFF00,
_mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0),
@@ -820,10 +869,10 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64
v2 = _mm512_mask_permutevar_epi32(v2, 0x5555,
_mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
v.v_lo);
_mm512_extpackstorehi_epi32(p, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
_mm512_extpackstorelo_epi32(p, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
_mm512_extpackstorehi_epi32(((uint8_t*)p)+64, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
_mm512_extpackstorelo_epi32(((uint8_t*)p)+64, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
_mm512_extpackstorehi_epi32(p, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
_mm512_extpackstorelo_epi32(p, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
_mm512_extpackstorehi_epi32(((uint8_t*)p)+64, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
_mm512_extpackstorelo_epi32(((uint8_t*)p)+64, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
}
template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) {
@@ -841,8 +890,8 @@ template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) {
v2 = _mm512_mask_permutevar_epi32(v2, 0x5555,
_mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
v.v_lo);
_mm512_store_epi64(p, v1);
_mm512_store_epi64(((uint8_t*)p)+64, v2);
_mm512_store_epi64(p, v2);
_mm512_store_epi64(((uint8_t*)p)+64, v1);
}
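// The stores mirror the load fix: after re-interleaving v_lo/v_hi into v1 and
// v2, v2 is now written to the first 64 bytes and v1 to the second 64 bytes,
// in both the packing (unaligned) and the aligned store paths.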
@@ -1161,7 +1210,11 @@ static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) { \
CAST_SEXT_I1(__vec16_i8)
CAST_SEXT_I1(__vec16_i16)
CAST_SEXT_I1(__vec16_i32)
CAST_SEXT_I1(__vec16_i64)
static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val)
{
return __vec16_i64(val.v,_mm512_srai_epi32(val.v,31));
}
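// Sign extension from __vec16_i32: the value becomes the low half and the
// high half is its arithmetic right shift by 31, i.e. all ones for negative
// lanes and zero otherwise.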
// zero extension
CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext)
@@ -1171,6 +1224,11 @@ CAST(__vec16_i32, uint32_t, __vec16_i16, uint16_t, __cast_zext)
CAST(__vec16_i32, uint32_t, __vec16_i8, uint8_t, __cast_zext)
CAST(__vec16_i16, uint16_t, __vec16_i8, uint8_t, __cast_zext)
static FORCEINLINE __vec16_i64 __cast_zext(const __vec16_i64 &, const __vec16_i32 &val)
{
return __vec16_i64(val.v, _mm512_setzero_epi32());
}
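// Zero extension is the same idea with the high half simply set to zero.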
#define CAST_ZEXT_I1(TYPE)
/*
static FORCEINLINE TYPE __cast_zext(TYPE, __vec16_i1 v) { \
@@ -1459,8 +1517,11 @@ static FORCEINLINE __vec16_f __sqrt_varying_float(__vec16_f v) {
}
static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) {
#ifdef ISPC_FAST_MATH
return _mm512_recip_ps(v);
//return _mm512_rcp23_ps(v); // 23-bits of accuracy
#else
return _mm512_rcp23_ps(v); // 23-bits of accuracy
#endif
}
static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) {
@@ -1752,7 +1813,6 @@ GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8)
GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16)
GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16)
//GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32)
GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32)
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64)
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64)
@@ -1777,7 +1837,7 @@ GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __gather64_i16)
GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i32, __gather32_i32)
GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __gather64_i32)
GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i32, __gather32_i64)
GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64)
// GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64)
/*
static __forceinline __vec16_i32 __gather64_i32(__vec16_i64 ptrs, __vec16_i1 mask) {
@@ -1820,7 +1880,6 @@ SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8)
SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16)
SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16)
//SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32)
SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32)
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64)
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64)
@@ -1828,14 +1887,21 @@ SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64
static FORCEINLINE void
__scatter_base_offsets32_i32(uint8_t *b, __vec16_i32 varyingOffset,
uint32_t scale, __vec16_i32 constOffset,
__vec16_i32 val, __vec16_i1 mask) {
__vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
__vec16_i32 offsets = __add(__mul(vscale, varyingOffset), constOffset);
// Loop is generated by the intrinsic
__vec16_i32 val, __vec16_i1 mask)
{
__vec16_i32 offsets = __add(__mul(__vec16_i32(scale), varyingOffset), constOffset);
_mm512_mask_i32extscatter_epi32(b, mask, offsets, val, _MM_DOWNCONV_EPI32_NONE, 1, _MM_HINT_NONE);
}
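// The 32-bit scale is now broadcast with the new __vec16_i32(int32_t)
// constructor instead of an explicit _mm512_extload_epi32 broadcast; the
// effective byte offsets are scale * varyingOffset + constOffset, and the
// masked extscatter writes only the active lanes.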
static FORCEINLINE void
__scatter_base_offsets32_float(void *base, const __vec16_i32 &varyingOffset,
uint32_t scale, const __vec16_i32 &constOffset,
const __vec16_f &val, const __vec16_i1 mask)
{
__vec16_i32 offsets = __add(__mul(varyingOffset,__vec16_i32(scale)), constOffset);
_mm512_mask_i32extscatter_ps(base, mask, offsets, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_1, _MM_HINT_NONE);
}
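// New float counterpart: identical offset computation, then
// _mm512_mask_i32extscatter_ps stores the active lanes with no
// down-conversion (_MM_DOWNCONV_PS_NONE) and a byte scale of 1 (_MM_SCALE_1).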
#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC)
/*
static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \
@@ -1846,8 +1912,7 @@ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \
*ptr = val.v[i]; \
} \
}
*/
SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8)
*/ SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8)
SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __scatter64_i8)
SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i32, __scatter32_i16)
SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __scatter64_i16)
@@ -2158,3 +2223,4 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval,
#undef PRE_ALIGN
#undef POST_ALIGN