Merge pull request #862 from ncos/knc-backend-merge
Modification of 'knc.h'
@@ -526,11 +526,11 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v
     *p = v;
 }
 
-template <class RetVecType> RetVecType __smear_i1(int i);
-template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { return i?0xFFFF:0x0; }
+template <class RetVecType> static RetVecType __smear_i1(int i);
+template <> FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { return i?0xFFFF:0x0; }
 
-template <class RetVecType> RetVecType __setzero_i1();
-template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { return 0; }
+template <class RetVecType> static RetVecType __setzero_i1();
+template <> FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { return 0; }
 
 template <class RetVecType> __vec16_i1 __undef_i1();
 template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { return __vec16_i1(); }
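
The pattern this hunk establishes, and which the rest of the commit repeats for every vector type, moves `static` from the explicit specialization onto the primary template declaration. That is a conformance fix: C++ forbids storage-class specifiers on explicit specializations ([dcl.stc]), so conforming compilers reject the old form. A minimal illustration (not from the patch):

    // The primary template may carry 'static'; an explicit specialization must not.
    template <class RetVecType> static RetVecType smear(int i);            // OK
    template <> inline int smear<int>(int i) { return i ? 0xFFFF : 0x0; }  // OK
    // template <> static int smear<int>(int i);  // ill-formed: storage class
    //                                            // on an explicit specialization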
@@ -678,8 +678,8 @@ static FORCEINLINE __vec16_i32 __select( bool cond, __vec16_i32 a, __vec16_
 static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int32_t index) { return v[index]; }
 static FORCEINLINE void __insert_element (__vec16_i32 *v, uint32_t index, int32_t val) { (*v)[index] = val; }
 
-template <class RetVecType> RetVecType __smear_i32(int32_t i);
-template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); }
+template <class RetVecType> RetVecType static __smear_i32(int32_t i);
+template <> FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); }
 
 static const __vec16_i32 __ispc_one = __smear_i32<__vec16_i32>(1);
 static const __vec16_i32 __ispc_zero = __smear_i32<__vec16_i32>(0);
@@ -687,11 +687,11 @@ static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32);
 static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1);
 static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
 
-template <class RetVecType> RetVecType __setzero_i32();
-template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { return _mm512_setzero_epi32(); }
+template <class RetVecType> static RetVecType __setzero_i32();
+template <> FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { return _mm512_setzero_epi32(); }
 
-template <class RetVecType> RetVecType __undef_i32();
-template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { return __vec16_i32(); }
+template <class RetVecType> static RetVecType __undef_i32();
+template <> FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { return __vec16_i32(); }
 
 static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) { return _mm512_mask_permutevar_epi32(v, 0xFFFF, _mm512_set1_epi32(index), v); }
 
@@ -743,11 +743,11 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32
 }
 
 #if 0 /* knc::fails ./tests/foreach-25.ispc ./tests/foreach-26.ispc ./tests/foreach-27.ispc */
-template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p)
+template <> FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p)
 {
     return _mm512_load_epi32(p);
 }
-template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v)
+template <> FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v)
 {
     _mm512_store_epi32(p, v);
 }
@@ -1018,21 +1018,21 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64
 }
 
 #if 0 /* knc::fails as with _i32 this may generate fails ... so commenting it out */
-template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p)
+template <> FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p)
 {
     __m512i v2 = _mm512_load_epi32(p);
     __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64);
     return __vec16_i64(v2,v1);
 }
-template <> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { return __load<64>(p); }
-template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v)
+template <> FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { return __load<64>(p); }
+template <> FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v)
 {
     __m512i v1 = v.v2;
     __m512i v2 = v.v1;
     _mm512_store_epi64(p, v2);
     _mm512_store_epi64(((uint8_t*)p)+64, v1);
 }
-template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); }
+template <> FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); }
 #endif
 
 
@@ -1068,14 +1068,14 @@ static FORCEINLINE __vec16_f __select( bool cond, __vec16_f a, __vec16_f b)
 static FORCEINLINE float __extract_element(__vec16_f v, uint32_t index) { return v[index]; }
 static FORCEINLINE void __insert_element(__vec16_f *v, uint32_t index, float val) { (*v)[index] = val; }
 
-template <class RetVecType> RetVecType __smear_float(float f);
-template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { return _mm512_set_1to16_ps(f); }
+template <class RetVecType> static RetVecType __smear_float(float f);
+template <> FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { return _mm512_set_1to16_ps(f); }
 
-template <class RetVecType> RetVecType __setzero_float();
-template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { return _mm512_setzero_ps(); }
+template <class RetVecType> static RetVecType __setzero_float();
+template <> FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { return _mm512_setzero_ps(); }
 
-template <class RetVecType> RetVecType __undef_float();
-template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() { return __vec16_f(); }
+template <class RetVecType> static RetVecType __undef_float();
+template <> FORCEINLINE __vec16_f __undef_float<__vec16_f>() { return __vec16_f(); }
 
 static FORCEINLINE __vec16_f __broadcast_float(__vec16_f _v, int index)
 {
@@ -1132,12 +1132,12 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_f *p, __vec16_f v)
 }
 
 #if 0 /* knc::fails ./tests/gs-improve-progindex.ispc with segfault */
-template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p)
+template <> FORCEINLINE __vec16_f __load<64>(const __vec16_f *p)
 {
     return _mm512_load_ps(p);
 }
 /* this one doesn't fail but it is commented out for completeness, no aligned load/stores */
-template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v)
+template <> FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v)
 {
     _mm512_store_ps(p, v);
 }
@@ -1310,14 +1310,14 @@ static FORCEINLINE __vec16_d __select(bool cond, __vec16_d a, __vec16_d b)
 static FORCEINLINE double __extract_element(__vec16_d v, uint32_t index) { return v[index]; }
 static FORCEINLINE void __insert_element(__vec16_d *v, uint32_t index, double val) { (*v)[index] = val; }
 
-template <class RetVecType> RetVecType __smear_double(double d);
-template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { return __vec16_d(_mm512_set1_pd(d), _mm512_set1_pd(d)); }
+template <class RetVecType> static RetVecType __smear_double(double d);
+template <> FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { return __vec16_d(_mm512_set1_pd(d), _mm512_set1_pd(d)); }
 
-template <class RetVecType> RetVecType __setzero_double();
-template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { return __vec16_d(_mm512_setzero_pd(), _mm512_setzero_pd()); }
+template <class RetVecType> static RetVecType __setzero_double();
+template <> FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { return __vec16_d(_mm512_setzero_pd(), _mm512_setzero_pd()); }
 
-template <class RetVecType> RetVecType __undef_double();
-template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() { return __vec16_d(); }
+template <class RetVecType> static RetVecType __undef_double();
+template <> FORCEINLINE __vec16_d __undef_double<__vec16_d>() { return __vec16_d(); }
 
 #define CASTD2F(_v_, _v_hi_, _v_lo_) \
     __vec16_f _v_hi_, _v_lo_; \
@@ -1391,17 +1391,17 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_d *p, __vec16_d v)
 
 
 #if 0 /* knc::fails as with _f this may generate fails ... so commenting it out */
-template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p)
+template <> FORCEINLINE __vec16_d __load<64>(const __vec16_d *p)
 {
     return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64));
 }
-template <> static FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v)
+template <> FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v)
 {
     _mm512_store_pd(p, v.v1);
     _mm512_store_pd(((uint8_t*)p)+64, v.v2);
 }
-template <> static FORCEINLINE __vec16_d __load <128>(const __vec16_d *p) { return __load<64>(p); }
-template <> static FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { __store<64>(p, v); }
+template <> FORCEINLINE __vec16_d __load <128>(const __vec16_d *p) { return __load<64>(p); }
+template <> FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { __store<64>(p, v); }
 #endif
 
 ///////////////////////////////////////////////////////////////////////////
@@ -1,5 +1,5 @@
-/*
-  Copyright (c) 2012-2014, Intel Corporation
+/**
+  Copyright (c) 2010-2014, Intel Corporation
   All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
@@ -44,6 +44,16 @@
 #include <iostream> // for operator<<(m512[i])
 #include <iomanip>  // for operator<<(m512[i])
 
+#if 0
+#define STRING(x) #x
+#define TOSTRING(x) STRING(x)
+#define PING std::cout << __FILE__ << " (" << __LINE__ << "): " << __FUNCTION__ << std::endl
+#define PRINT(x) std::cout << STRING(x) << " = " << (x) << std::endl
+#define PRINT2(x,y) std::cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << std::endl
+#define PRINT3(x,y,z) std::cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << std::endl
+#define PRINT4(x,y,z,w) std::cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << std::endl
+#endif
+
 #define FORCEINLINE __forceinline
 #ifdef _MSC_VER
 #define PRE_ALIGN(x)  /*__declspec(align(x))*/
@@ -75,7 +85,44 @@ typedef int64_t __vec1_i64;
 
 struct __vec16_i32;
 
+#if 0
+/* (iw) actually, this *SHOULD* be the right implementation for a
+   vec16_i1: this one is a class that can have a constructor (which
+   ISPC sometimes emits for these vectors...) This version might
+   not be working with embree's ISPC bindings, probably because
+   embree still uses the 'wrong' implementation */
+typedef struct PRE_ALIGN(2) __vec16_i1
+{
+    FORCEINLINE operator __mmask16() const { return v; }
+    FORCEINLINE __vec16_i1() { }
+    FORCEINLINE __vec16_i1(const __mmask16 &vv) : v(vv) { }
+    FORCEINLINE __vec16_i1(bool v0, bool v1, bool v2, bool v3,
+                           bool v4, bool v5, bool v6, bool v7,
+                           bool v8, bool v9, bool v10, bool v11,
+                           bool v12, bool v13, bool v14, bool v15) {
+        v = ((v0 & 1) |
+             ((v1 & 1) << 1) |
+             ((v2 & 1) << 2) |
+             ((v3 & 1) << 3) |
+             ((v4 & 1) << 4) |
+             ((v5 & 1) << 5) |
+             ((v6 & 1) << 6) |
+             ((v7 & 1) << 7) |
+             ((v8 & 1) << 8) |
+             ((v9 & 1) << 9) |
+             ((v10 & 1) << 10) |
+             ((v11 & 1) << 11) |
+             ((v12 & 1) << 12) |
+             ((v13 & 1) << 13) |
+             ((v14 & 1) << 14) |
+             ((v15 & 1) << 15));
+    }
+    __mmask16 v;
+} POST_ALIGN(2) __vec16_i1;
+
+#else
 typedef __mmask16 POST_ALIGN(2) __vec16_i1;
+#endif
 
 typedef struct PRE_ALIGN(64) __vec16_f {
     FORCEINLINE operator __m512() const { return v; }
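
The disabled class above is a thin wrapper around __mmask16 whose 16-argument constructor puts lane i into bit i of the mask. A compact loop-based equivalent of that constructor (a hypothetical sketch, not part of the patch):

    // Pack 16 bools into a 16-bit lane mask, lane i -> bit i.
    static inline unsigned short pack_mask16(const bool (&b)[16]) {
        unsigned short m = 0;
        for (int i = 0; i < 16; ++i)
            m |= (unsigned short)(b[i] ? 1 : 0) << i;
        return m;
    }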
@@ -167,14 +214,14 @@ struct vec16 {
 
 PRE_ALIGN(16) struct __vec16_i8 : public vec16<int8_t> {
     FORCEINLINE __vec16_i8() { }
-    FORCEINLINE __vec16_i8(const __vec16_i8 &o);
-    FORCEINLINE __vec16_i8& operator =(const __vec16_i8 &o);
-    FORCEINLINE __vec16_i8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
-                           int8_t v4, int8_t v5, int8_t v6, int8_t v7,
-                           int8_t v8, int8_t v9, int8_t v10, int8_t v11,
-                           int8_t v12, int8_t v13, int8_t v14, int8_t v15)
+    FORCEINLINE __vec16_i8(const int8_t v0, const int8_t v1, const int8_t v2, const int8_t v3,
+                           const int8_t v4, const int8_t v5, const int8_t v6, const int8_t v7,
+                           const int8_t v8, const int8_t v9, const int8_t v10, const int8_t v11,
+                           const int8_t v12, const int8_t v13, const int8_t v14, const int8_t v15)
         : vec16<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7,
                         v8, v9, v10, v11, v12, v13, v14, v15) { }
+    FORCEINLINE __vec16_i8(const __vec16_i8 &o);
+    FORCEINLINE __vec16_i8& operator =(const __vec16_i8 &o);
 } POST_ALIGN(16);
 
 PRE_ALIGN(32) struct __vec16_i16 : public vec16<int16_t> {
@@ -215,6 +262,28 @@ inline std::ostream &operator<<(std::ostream &out, const __m512 &v)
     return out;
 }
 
+inline std::ostream &operator<<(std::ostream &out, const __vec16_i8 &v)
+{
+    out << "[";
+    for (int i=0;i<16;i++)
+        out << (i?",":"") << std::dec << std::setw(8) << (int)((unsigned char*)&v)[i] << std::dec;
+    //  out << (i?",":"") << std::hex << std::setw(8) << ((int*)&v)[i] << std::dec;
+
+    out << "]" << std::flush;
+    return out;
+}
+
+inline std::ostream &operator<<(std::ostream &out, const __vec16_i64 &v)
+{
+    out << "[";
+    uint32_t *ptr = (uint32_t*)&v;
+    for (int i=0;i<16;i++) {
+        uint64_t val = (uint64_t(ptr[i])<<32)+ptr[i+16];
+        out << (i?",":"") << ((int*)val);
+    }
+    out << "]" << std::flush;
+    return out;
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // macros...
@@ -299,6 +368,20 @@ static FORCEINLINE bool __extract_element(__vec16_i1 mask, uint32_t index) {
     return (mask & (1 << index)) ? true : false;
 }
 
+
+static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, uint32_t index)
+{
+    //uint *src = (uint *)&v;
+    const uint *src = (const uint *)&v;
+    return src[index+16] | (uint64_t(src[index]) << 32);
+}
+
+
+
+
+
+
+
 /*
 static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index,
                                          bool val) {
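
The new __extract_element documents the in-memory layout of __vec16_i64 on this target: a structure of arrays with the 16 high words first and the 16 low words after them, rather than 16 contiguous 64-bit lanes. A scalar model of the recombination (a sketch):

    // Lane i of a __vec16_i64 is split across two 16-wide word arrays:
    // words[0..15] hold the high 32 bits, words[16..31] the low 32 bits.
    static inline int64_t extract_i64_lane(const uint32_t *words, uint32_t i) {
        return (int64_t)(((uint64_t)words[i] << 32) | words[i + 16]);
    }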
@@ -557,9 +640,13 @@ static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) {
     return _mm512_set1_epi32(val);
 }
 
+static FORCEINLINE __vec16_i32 __cast_trunc(__vec16_i32, const __vec16_i64 i64) {
+    return __vec16_i32(i64.v_lo);
+}
+
 static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index) {
     __vec16_i32 idx = __smear_i32<__vec16_i32>(index);
-    __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0x7));
+    __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0xf));
     return _mm512_mask_permutevar_epi32(v, 0xffff, shuffle, v);
 }
 
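
The one-character change in __rotate_i32 is a genuine bug fix: the shuffle index must wrap modulo the 16-lane vector width, but masking with 0x7 wrapped it modulo 8, so any rotation crossing an 8-lane boundary read the wrong source lane. A scalar model of the intended semantics (a sketch):

    // Lane i of the rotated result reads source lane (i + index) mod 16;
    // '& 0xf' is the mod-16 wrap that '& 0x7' got wrong.
    static inline int32_t rotate_i32_lane(const int32_t v[16], int index, int i) {
        return v[(i + index) & 0xf];
    }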
@@ -598,11 +685,25 @@ template <> FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) {
 ///////////////////////////////////////////////////////////////////////////
 // int64
 ///////////////////////////////////////////////////////////////////////////
-static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, uint32_t index)
+static FORCEINLINE
+void __masked_store_i64(void *p, const __vec16_i64 &v, __vec16_i1 mask)
 {
-    uint *src = (uint *)&v;
-    return src[index+16] | (int64_t(src[index]) << 32);
+    __m512i v1;
+    __m512i v2;
+    v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA,
+                                      _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8),
+                                      v.v_hi);
+    v1 = _mm512_mask_permutevar_epi32(v1, 0x5555,
+                                      _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8),
+                                      v.v_lo);
+    v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA,
+                                      _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
+                                      v.v_hi);
+    v2 = _mm512_mask_permutevar_epi32(v2, 0x5555,
+                                      _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
+                                      v.v_lo);
+    _mm512_mask_store_epi64(p, mask, v2);
+    _mm512_mask_store_epi64(((uint8_t*)p)+64, mask>>8, v1);
 }
 
 static FORCEINLINE void __insert_element(__vec16_i64 *v, uint32_t index, int64_t val) {
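
The permutes in the new __masked_store_i64 convert the structure-of-arrays register pair (v_lo, v_hi) into the array-of-structures form memory expects: low and high words interleaved per lane, with lanes 0-7 going to the first 64 bytes (v2) and lanes 8-15 to the second (v1, stored with mask>>8). A scalar model of the data movement (a sketch):

    // Interleave split 64-bit lanes back into memory order (little-endian,
    // low word first), honoring the per-lane mask.
    static inline void masked_store_i64_model(uint32_t *dst, const uint32_t lo[16],
                                              const uint32_t hi[16], unsigned short mask) {
        for (int i = 0; i < 16; ++i)
            if ((mask >> i) & 1) {
                dst[2*i]     = lo[i];   // even word slot: low half of lane i
                dst[2*i + 1] = hi[i];   // odd word slot: high half of lane i
            }
    }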
@@ -704,6 +805,13 @@ static FORCEINLINE __vec16_i64 __shl(__vec16_i64 a, __vec16_i64 b) {
     return __vec16_i64(lo, hi);
 }
 
+static FORCEINLINE __vec16_i64 __shl(__vec16_i64 a, unsigned long long b) {
+    __vec16_i32 hi = _mm512_or_epi32(_mm512_slli_epi32(a.v_hi, b),
+                                     _mm512_srli_epi32(a.v_lo, 32-b));
+    __vec16_i32 lo = _mm512_slli_epi32(a.v_lo, b);
+    return __vec16_i64(lo, hi);
+}
+
 static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 a, __vec16_i64 b) {
     __vec16_i32 shift = _mm512_sub_epi32(__ispc_thirty_two, b.v_lo);
     __vec16_i32 xfer = _mm512_and_epi32(_mm512_sllv_epi32(__ispc_ffffffff, shift), _mm512_sllv_epi32(a.v_hi, shift));
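
The new scalar-shift overload of __shl assembles a 64-bit left shift from 32-bit halves: the bits shifted off the top of each low word are OR-ed into the bottom of the corresponding high word. A per-lane scalar model (a sketch; like the intrinsic version it assumes 0 < b < 32, since lo >> 32 would be undefined):

    static inline uint64_t shl64_model(uint32_t lo, uint32_t hi, unsigned b) {
        uint32_t new_hi = (hi << b) | (lo >> (32 - b));  // carry bits cross the halves
        uint32_t new_lo = lo << b;
        return ((uint64_t)new_hi << 32) | new_lo;
    }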
@@ -724,6 +832,16 @@ static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 a, __vec16_i64 b) {
     return __vec16_i64(lo, hi);
 }
 
+static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 a, unsigned long long b) {
+    __vec16_i32 xfer
+        = _mm512_slli_epi32(_mm512_and_epi32(a.v_hi,
+                                             _mm512_set1_epi32((1<<b)-1)),
+                            32-b);
+    __vec16_i32 hi = _mm512_srai_epi32(a.v_hi, b);
+    __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srli_epi32(a.v_lo, b));
+    return __vec16_i64(lo, hi);
+}
+
 static FORCEINLINE __vec16_i1 __equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) {
     const __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo);
     return _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi);
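
The arithmetic-shift counterpart works the same way in the other direction: the b low bits of the high word are hoisted into the top of the low word, while the high word itself shifts with sign extension. Per-lane scalar model (a sketch, again assuming 0 < b < 32):

    static inline int64_t ashr64_model(uint32_t lo, int32_t hi, unsigned b) {
        uint32_t xfer   = (uint32_t)(hi & (int32_t)((1u << b) - 1)) << (32 - b);
        int32_t  new_hi = hi >> b;             // arithmetic shift preserves the sign
        uint32_t new_lo = xfer | (lo >> b);
        return ((int64_t)new_hi << 32) | new_lo;
    }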
@@ -731,9 +849,9 @@ static FORCEINLINE __vec16_i1 __equal_i64(const __vec16_i64 &a, const __vec16_i6
 
 static FORCEINLINE __vec16_i1 __equal_i64_and_mask(const __vec16_i64 &a, const __vec16_i64 &b,
                                                    __vec16_i1 mask) {
-    __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo);
+    __mmask16 lo_match = _mm512_mask_cmpeq_epi32_mask((__mmask16)mask, a.v_lo,b.v_lo);
     __mmask16 full_match = _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi);
-    return _mm512_kand(full_match, (__mmask16)mask);
+    return full_match;
 }
 
 static FORCEINLINE __vec16_i1 __not_equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) {
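
Folding the caller's mask into the first compare makes the trailing kand redundant: a lane outside the mask can never set a bit in lo_match, and therefore never in full_match. A scalar model of the new data flow (a sketch):

    static inline unsigned short equal_i64_and_mask_model(
            const uint32_t alo[16], const uint32_t ahi[16],
            const uint32_t blo[16], const uint32_t bhi[16], unsigned short mask) {
        unsigned short m = 0;
        for (int i = 0; i < 16; ++i)
            if (((mask >> i) & 1) && alo[i] == blo[i] && ahi[i] == bhi[i])
                m |= 1u << i;   // inactive lanes stay 0, so no final AND is needed
        return m;
    }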
@@ -762,10 +880,11 @@ template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l)
 template <int ALIGN> static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) {
     __vec16_i32 v1;
     __vec16_i32 v2;
-    v2 = _mm512_extloadunpacklo_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    v2 = _mm512_extloadunpackhi_epi32(v2, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    v1 = _mm512_extloadunpacklo_epi32(v1, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    v1 = _mm512_extloadunpackhi_epi32(v1, (uint8_t*)p+128, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    const uint8_t*ptr = (const uint8_t*)p;
+    v2 = _mm512_extloadunpacklo_epi32(v2, ptr, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    v2 = _mm512_extloadunpackhi_epi32(v2, ptr+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    v1 = _mm512_extloadunpacklo_epi32(v1, ptr+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    v1 = _mm512_extloadunpackhi_epi32(v1, ptr+128, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
 
     __vec16_i64 ret;
     ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0xFF00,
@@ -850,6 +969,68 @@ template <> FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) {
     __store<64>(p, v);
 }
 
+
+/*! gather vector of 64-bit ints from addresses pointing to uniform ints
+
+  (iw) WARNING: THIS CODE ONLY WORKS FOR GATHERS FROM ARRAYS OF
+  ***UNIFORM*** INT64's/POINTERS. (problem is that ispc doesn't
+  expose whether it's from array of uniform or array of varying
+  pointers, so in here there's no way to tell - only thing we can do
+  is pick one...
+*/
+static FORCEINLINE __vec16_i64
+__gather_base_offsets32_i64(uint8_t *base, uint32_t scale, __vec16_i32 offsets,
+                            __vec16_i1 mask) {
+    __vec16_i64 ret;
+    ret.v_lo = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets,
+                                              base, _MM_UPCONV_EPI32_NONE, scale,
+                                              _MM_HINT_NONE);
+    ret.v_hi = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets,
+                                              base+4, _MM_UPCONV_EPI32_NONE, scale,
+                                              _MM_HINT_NONE);
+    return ret;
+}
+
+/*! gather vector of 64-bit ints from addresses pointing to uniform ints
+
+  (iw) WARNING: THIS CODE ONLY WORKS FOR GATHERS FROM ARRAYS OF
+  ***UNIFORM*** INT64's/POINTERS. (problem is that ispc doesn't
+  expose whether it's from array of uniform or array of varying
+  pointers, so in here there's no way to tell - only thing we can do
+  is pick one...
+*/
+static FORCEINLINE __vec16_i64
+__gather64_i64(__vec16_i64 addr, __vec16_i1 mask)
+{
+    __vec16_i64 ret;
+
+    // There is no gather instruction with 64-bit offsets in KNC.
+    // We have to manually iterate over the upper 32 bits ;-)
+    __vec16_i1 still_to_do = mask;
+    const __vec16_i32 signed_offsets = _mm512_add_epi32(addr.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
+    while (still_to_do) {
+        int first_active_lane = _mm_tzcnt_32((int)still_to_do);
+        const uint32_t &hi32 = ((uint*)&addr.v_hi)[first_active_lane];
+        __vec16_i1 match = _mm512_mask_cmp_epi32_mask(still_to_do,addr.v_hi,
+                                                      __smear_i32<__vec16_i32>((int32_t)hi32),
+                                                      _MM_CMPINT_EQ);
+
+        void * base = (void*)((((unsigned long)hi32) << 32) + (unsigned long)(-(long)INT_MIN));
+        ret.v_lo = _mm512_mask_i32extgather_epi32(ret.v_lo, match, signed_offsets,
+                                                  base, _MM_UPCONV_EPI32_NONE, 1,
+                                                  _MM_HINT_NONE);
+        ret.v_hi = _mm512_mask_i32extgather_epi32(ret.v_hi, match, signed_offsets,
+                                                  base+4, _MM_UPCONV_EPI32_NONE, 1,
+                                                  _MM_HINT_NONE);
+
+        still_to_do = _mm512_kxor(match, still_to_do);
+    }
+
+    return ret;
+}
+
+
+
 ///////////////////////////////////////////////////////////////////////////
 // float
 ///////////////////////////////////////////////////////////////////////////
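
Because KNC has no gather with 64-bit offsets, __gather64_i64 loops over the distinct upper-32-bit address words among the active lanes and retires each group that shares the leader's high word with one 32-bit-offset gather. The INT_MIN bias maps the unsigned low words onto the signed offset range the hardware expects, and the per-group base pointer is biased back by the same amount. A scalar model (a sketch; __builtin_ctz stands in for _mm_tzcnt_32):

    static inline void gather64_model(const uint32_t alo[16], const uint32_t ahi[16],
                                      unsigned short mask, uint64_t out[16]) {
        unsigned short todo = mask;
        while (todo) {
            int lead = __builtin_ctz(todo);               // first still-active lane
            uint32_t hi = ahi[lead];
            for (int i = 0; i < 16; ++i)
                if (((todo >> i) & 1) && ahi[i] == hi) {  // lanes sharing the high word
                    out[i] = *(const uint64_t *)(uintptr_t)(((uint64_t)hi << 32) | alo[i]);
                    todo = (unsigned short)(todo & ~(1u << i));   // retire the lane
                }
        }
    }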
@@ -1329,16 +1510,15 @@ static FORCEINLINE __vec16_i32 __cast_fptoui(__vec16_i32, __vec16_f val) {
 
 static FORCEINLINE __vec16_d __cast_fpext(__vec16_d, __vec16_f val) {
     __vec16_d ret;
-    ret.v2 = _mm512_cvtpslo_pd(val.v);
+    ret.v1 = _mm512_cvtpslo_pd(val.v);
     __vec16_f other8 = _mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC);
-    ret.v1 = _mm512_cvtpslo_pd(other8);
+    ret.v2 = _mm512_cvtpslo_pd(other8);
     return ret;
 }
 
 static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) {
-    __m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1));
-    __m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2));
+    __m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2));
+    __m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1));
 
     return _mm512_mask_permute4f128_epi32(r1i, 0xFF00, r0i, _MM_PERM_BABA);
 }
 
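
Both swaps enforce one lane convention for __vec16_d: v1 carries lanes 0-7 and v2 carries lanes 8-15 (the double-gather fix further down follows the same convention). Scalar model of the corrected __cast_fpext (a sketch; the layout assignment is inferred from the surrounding fixes):

    static inline void fpext_model(const float in[16], double v1[8], double v2[8]) {
        for (int i = 0; i < 8; ++i) {
            v1[i] = in[i];      // low half of the float vector -> lanes 0-7
            v2[i] = in[i + 8];  // high half -> lanes 8-15
        }
    }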
@@ -1352,11 +1532,37 @@ static FORCEINLINE __vec16_i32 __cast_bits(__vec16_i32, __vec16_f val) {
 
 
 static FORCEINLINE __vec16_i64 __cast_bits(__vec16_i64, __vec16_d val) {
-    return *(__vec16_i64*)&val;
+    __vec16_i64 ret;
+    ret.v_hi = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00,
+                                            _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0),
+                                            val.v2);
+    ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0x00FF,
+                                            _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1),
+                                            val.v1);
+    ret.v_lo = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00,
+                                            _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1),
+                                            val.v2);
+    ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0x00FF,
+                                            _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0),
+                                            val.v1);
+    return ret;
 }
 
 static FORCEINLINE __vec16_d __cast_bits(__vec16_d, __vec16_i64 val) {
-    return *(__vec16_d*)&val;
+    __vec16_d ret;
+    ret.v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA,
+                                          _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8),
+                                          val.v_hi);
+    ret.v2 = _mm512_mask_permutevar_epi32(ret.v2, 0x5555,
+                                          _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8),
+                                          val.v_lo);
+    ret.v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA,
+                                          _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
+                                          val.v_hi);
+    ret.v1 = _mm512_mask_permutevar_epi32(ret.v1, 0x5555,
+                                          _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
+                                          val.v_lo);
+    return ret;
 }
 
 ///////////////////////////////////////////////////////////////////////////
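
The raw reinterpret-casts these hunks remove were wrong because the two types use different layouts: __vec16_d keeps whole lanes in order (v1 = lanes 0-7, v2 = lanes 8-15), while __vec16_i64 splits every lane into separate low- and high-word vectors. A bit-preserving cast therefore has to (de)interleave, which is what the permutes implement. Scalar model of the double-to-i64 direction (a sketch):

    // De-interleave 16 little-endian word pairs into split low/high vectors.
    static inline void cast_bits_d_to_i64_model(const uint32_t d_words[32],
                                                uint32_t lo[16], uint32_t hi[16]) {
        for (int i = 0; i < 16; ++i) {
            lo[i] = d_words[2*i];       // low word of lane i
            hi[i] = d_words[2*i + 1];   // high word of lane i
        }
    }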
@@ -1488,12 +1694,14 @@ static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) {
     return _mm512_invsqrt_ps(v);
 #endif
 }
 
 static FORCEINLINE __vec16_d __rsqrt_varying_double(__vec16_d x) {
     __vec16_d y;
     for (int i = 0; i < 16; i++)
         __insert_element(&y, i, 1.0/sqrt(__extract_element(x,i)));
     return y;
 }
 
 static FORCEINLINE double __rsqrt_uniform_double(double v)
 {
     return 1.0/v;
@@ -1629,6 +1837,38 @@ static FORCEINLINE __vec16_d __masked_load_double(void *p, __vec16_i1 mask) {
 #endif
 }
 
+static FORCEINLINE void __masked_store_i8(void *p, const __vec16_i8 &val, __vec16_i1 mask) {
+    __vec16_i32 tmp = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE);
+    _mm512_mask_extstore_epi32(p, mask, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE);
+}
+
+static FORCEINLINE __vec16_i8 __masked_load_i8(void *p, __vec16_i1 mask) {
+    __vec16_i8 ret;
+    __vec16_i32 tmp = _mm512_mask_extload_epi32(_mm512_undefined_epi32(),mask,p,
+                                                _MM_UPCONV_EPI32_SINT8,
+                                                _MM_BROADCAST32_NONE, _MM_HINT_NONE);
+    _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE);
+    return ret;
+}
+template <int ALIGN> static FORCEINLINE __vec16_i8 __load(const __vec16_i8 *p) {
+    return *p;
+}
+template <int ALIGN> static FORCEINLINE void __store(__vec16_i8 *p, __vec16_i8 v) {
+    *p = v;
+}
+
+static FORCEINLINE void
+__scatter_base_offsets32_i8(uint8_t *b, uint32_t scale, __vec16_i32 offsets,
+                            __vec16_i8 val, __vec16_i1 mask)
+{
+    __vec16_i32 tmp = _mm512_extload_epi32(&val,_MM_UPCONV_EPI32_SINT8,
+                                           _MM_BROADCAST32_NONE, _MM_HINT_NONE);
+    printf("__scatter_base_offsets32_i8\n");
+    _mm512_mask_i32extscatter_epi32(b, mask, offsets, tmp,
+                                    _MM_DOWNCONV_EPI32_SINT8, scale,
+                                    _MM_HINT_NONE);
+}
+
 static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1 mask) {
 #ifdef ISPC_FORCE_ALIGNED_MEMORY
     _mm512_mask_store_epi32(p, mask, val.v);
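
The new i8 paths work by widening: _MM_UPCONV_EPI32_SINT8 reads 16 bytes as 16 sign-extended int32 lanes so the 16-bit lane mask applies per element, and _MM_DOWNCONV_EPI32_SINT8 narrows back to bytes on the way out. Scalar model of __masked_store_i8 (a sketch):

    static inline void masked_store_i8_model(int8_t *p, const int8_t val[16],
                                             unsigned short mask) {
        for (int i = 0; i < 16; ++i)
            if ((mask >> i) & 1) {
                int32_t widened = val[i];   // upconvert: sign-extend the byte to 32 bits
                p[i] = (int8_t)widened;     // downconvert: truncate back to a byte
            }
    }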
@@ -1729,16 +1969,44 @@ static FORCEINLINE __vec16_d
 __gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec16_i32 offsets,
                                __vec16_i1 mask) {
     __vec16_d ret;
-    ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets,
+    ret.v1 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets,
                                            base, _MM_UPCONV_PD_NONE, scale,
                                            _MM_HINT_NONE);
     __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC);
-    ret.v1 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, shuffled_offsets,
+    ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, shuffled_offsets,
                                            base, _MM_UPCONV_PD_NONE, scale,
                                            _MM_HINT_NONE);
     return ret;
 }
 
+static FORCEINLINE __vec16_f
+__gather64_float(__vec16_i64 addr, __vec16_i1 mask)
+{
+    __vec16_f ret;
+
+    // There is no gather instruction with 64-bit offsets in KNC.
+    // We have to manually iterate over the upper 32 bits ;-)
+    __vec16_i1 still_to_do = mask;
+    const __vec16_i32 signed_offsets = _mm512_add_epi32(addr.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
+    while (still_to_do) {
+        int first_active_lane = _mm_tzcnt_32((int)still_to_do);
+        const uint &hi32 = ((uint*)&addr.v_hi)[first_active_lane];
+        __vec16_i1 match = _mm512_mask_cmp_epi32_mask(still_to_do,addr.v_hi,
+                                                      __smear_i32<__vec16_i32>((int32_t)hi32),
+                                                      _MM_CMPINT_EQ);
+
+        void * base = (void*)((((unsigned long)hi32) << 32) + (unsigned long)(-(long)INT_MIN));
+
+        ret.v = _mm512_mask_i32extgather_ps(ret.v, match, signed_offsets,
+                                            base, _MM_UPCONV_PS_NONE, 1,
+                                            _MM_HINT_NONE);
+        still_to_do = _mm512_kxor(match, still_to_do);
+    }
+
+    return ret;
+}
+
+
 /*! gather with 64-bit offsets.
 
   \todo add optimization that falls back to 32-bit offset gather if
@@ -1850,6 +2118,32 @@ __scatter_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 offsets
     }
 }
 
+static FORCEINLINE void // TODO
+__scatter_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 offsets,
+                            __vec16_i8 value,
+                            __vec16_i1 mask) {
+    __vec16_i1 still_to_do = mask;
+
+    __vec16_i32 tmp = _mm512_extload_epi32(&value, _MM_UPCONV_EPI32_SINT8,
+                                           _MM_BROADCAST32_NONE, _MM_HINT_NONE);
+    // _mm512_mask_extstore_epi32(p, mask, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE);
+
+    while (still_to_do) {
+        int first_active_lane = _mm_tzcnt_32((int)still_to_do);
+        const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane];
+        __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi,
+                                                      __smear_i32<__vec16_i32>((int32_t)hi32),
+                                                      _MM_CMPINT_EQ);
+
+        void * base = (void*)((unsigned long)_base +
+                              ((scale*(unsigned long)hi32) << 32));
+        _mm512_mask_i32extscatter_epi32(base, match, offsets.v_lo,
+                                        tmp,
+                                        _MM_DOWNCONV_EPI32_SINT8, scale,
+                                        _MM_HINT_NONE);
+        still_to_do = _mm512_kxor(match,still_to_do);
+    }
+}
+
+
 static FORCEINLINE __vec16_i32
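
The 64-bit-offset scatter splits each offset as off = (hi << 32) | lo, so that base + scale*off = (base + ((scale*hi) << 32)) + scale*lo; every lane sharing the same high word is then handled by one hardware scatter with 32-bit offsets. A scalar model of the address computation (a sketch; like the patch, it ignores any carry from scale*lo into the high half):

    static inline int8_t *scatter64_i8_addr(uint8_t *base, uint32_t scale,
                                            uint32_t hi, uint32_t lo) {
        uint8_t *group_base = base + (((uint64_t)scale * hi) << 32);  // per-group bias
        return (int8_t *)(group_base + (uint64_t)scale * lo);  // the scatter adds this part
    }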
@@ -1884,17 +2178,15 @@ __scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets,
 // packed load/store
 ///////////////////////////////////////////////////////////////////////////
 
-static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec16_i32 *val,
-                                                __vec16_i1 mask) {
-    __vec16_i32 v;
+static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec16_i32 *val, __vec16_i1 mask) {
+    __vec16_i32 v = __load<64>(val);
     v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    v = _mm512_mask_extloadunpackhi_epi32(_mm512_undefined_epi32(), mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
     __store<64>(val, v);
     return _mm_countbits_32(uint32_t(mask));
 }
 
-static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val,
-                                                 __vec16_i1 mask) {
+static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, __vec16_i1 mask) {
     _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
     _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
     return _mm_countbits_32(uint32_t(mask));
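
The fix in __packed_load_active matters: the second unpack previously started from _mm512_undefined_epi32(), discarding everything the first unpack had produced. Threading v through both steps, and seeding it from the current contents of *val so inactive lanes keep their old values, restores the intended semantics, modeled here as a scalar sketch:

    static inline int32_t packed_load_active_model(const uint32_t *p, uint32_t val[16],
                                                   unsigned short mask) {
        int n = 0;
        for (int i = 0; i < 16; ++i)
            if ((mask >> i) & 1)
                val[i] = p[n++];   // packed: next consecutive source element per active lane
        return n;                  // == popcount(mask), what the function returns
    }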