Fix typo in cast_uitofp; add printf_v helpers for vector types
This commit is contained in:
@@ -308,13 +308,68 @@ inline std::ostream &operator<<(std::ostream &out, const __vec16_i64 &v)
|
|||||||
out << "[";
|
out << "[";
|
||||||
uint32_t *ptr = (uint32_t*)&v;
|
uint32_t *ptr = (uint32_t*)&v;
|
||||||
for (int i=0;i<16;i++) {
|
for (int i=0;i<16;i++) {
|
||||||
uint64_t val = (uint64_t(ptr[i])<<32)+ptr[i+16];
|
out << (i!=0?",":"") << std::dec << std::setw(8) << ((uint64_t)v[i]) << std::dec;
|
||||||
out << (i!=0?",":"") << std::dec << std::setw(8) << ((uint64_t)val) << std::dec;
|
|
||||||
}
|
}
|
||||||
out << "]" << std::flush;
|
out << "]" << std::flush;
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// C-style debugging helpers
|
||||||
|
/// Debug helper: dumps a __m512i to stdout.
/// The storage of `v` is reinterpreted as an array of int and the first 16
/// entries are printed with "%d", each followed by a space, wrapped in
/// "[" ... "]" and terminated by a newline.
inline void printf_v(const __m512i &v)
{
    const int *lanes = (const int*)&v;
    printf("[");
    for (int lane = 0; lane < 16; ++lane) {
        printf("%d ", lanes[lane]);
    }
    printf("]\n");
}
|
||||||
|
|
||||||
|
/// Debug helper: dumps a __m512 to stdout.
/// The storage of `v` is reinterpreted as an array of float and the first 16
/// entries are printed with "%f", each followed by a space, wrapped in
/// "[" ... "]" and terminated by a newline.
inline void printf_v(const __m512 &v)
{
    const float *lanes = (const float*)&v;
    printf("[");
    for (int lane = 0; lane < 16; ++lane) {
        printf("%f ", lanes[lane]);
    }
    printf("]\n");
}
|
||||||
|
|
||||||
|
inline void printf_v(const __vec16_i1 &v)
|
||||||
|
{
|
||||||
|
printf("[");
|
||||||
|
for (int i=0;i<16;i++)
|
||||||
|
printf("%d ", (int)v[i]);
|
||||||
|
printf("]\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Debug helper: dumps a __vec16_i8 to stdout.
/// The first 16 bytes of the object's storage are read as unsigned char and
/// printed with "%d" (so values are shown as non-negative numbers), each
/// followed by a space, wrapped in "[" ... "]" and terminated by a newline.
inline void printf_v(const __vec16_i8 &v)
{
    const unsigned char *bytes = (const unsigned char*)&v;
    printf("[");
    for (int lane = 0; lane < 16; ++lane) {
        printf("%d ", (int)bytes[lane]);
    }
    printf("]\n");
}
|
||||||
|
|
||||||
|
/// Debug helper: dumps a __vec16_i16 to stdout.
/// The object's storage is reinterpreted as an array of uint16_t and the
/// first 16 entries are printed with "%d" (as non-negative numbers), each
/// followed by a space, wrapped in "[" ... "]" and terminated by a newline.
/// NOTE(review): assumes the 16 elements are stored contiguously at the start
/// of the object -- confirm against the __vec16_i16 definition.
inline void printf_v(const __vec16_i16 &v)
{
    const uint16_t *halves = (const uint16_t*)&v;
    printf("[");
    for (int lane = 0; lane < 16; ++lane) {
        printf("%d ", (int)halves[lane]);
    }
    printf("]\n");
}
|
||||||
|
|
||||||
|
inline void printf_v(const __vec16_d &v)
|
||||||
|
{
|
||||||
|
printf("[");
|
||||||
|
for (int i=0;i<16;i++)
|
||||||
|
printf("%f ", v[i]);
|
||||||
|
printf("]\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void printf_v(const __vec16_i64 &v)
|
||||||
|
{
|
||||||
|
printf("[");
|
||||||
|
for (int i=0;i<16;i++)
|
||||||
|
printf("%llu ", ((uint64_t)v[i]));
|
||||||
|
printf("]\n");
|
||||||
|
}
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// macros...
|
// macros...
|
||||||
|
|
||||||
@@ -784,8 +839,8 @@ static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask,
|
|||||||
ret.v_lo = _mm512_mask_blend_epi64(mask.lo(), b.v_lo, a.v_lo);
|
ret.v_lo = _mm512_mask_blend_epi64(mask.lo(), b.v_lo, a.v_lo);
|
||||||
ret.v_hi = _mm512_mask_blend_epi64(mask.hi(), b.v_hi, a.v_hi);
|
ret.v_hi = _mm512_mask_blend_epi64(mask.hi(), b.v_hi, a.v_hi);
|
||||||
// TODO: Check if this works better:
|
// TODO: Check if this works better:
|
||||||
ret.v_lo = _mm512_mask_mov_epi32(b.v_lo, mask, a.v_lo);
|
//ret.v_lo = _mm512_mask_mov_epi32(b.v_lo, mask, a.v_lo);
|
||||||
ret.v_hi = _mm512_mask_mov_epi32(b.v_hi, mask, a.v_hi);
|
//ret.v_hi = _mm512_mask_mov_epi32(b.v_hi, mask, a.v_hi);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -831,6 +886,7 @@ static FORCEINLINE __vec16_i64 __sub(const __vec16_i64 &a, const __vec16_i64 &b)
|
|||||||
|
|
||||||
/*! 64x32 bit mul -- address computations often use a scale that we
|
/*! 64x32 bit mul -- address computations often use a scale that we
|
||||||
know is 32 bits; and 32x64 is faster than 64x64 */
|
know is 32 bits; and 32x64 is faster than 64x64 */
|
||||||
|
/*
|
||||||
static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &b)
|
static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &b)
|
||||||
{
|
{
|
||||||
// TODO
|
// TODO
|
||||||
@@ -838,7 +894,7 @@ static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &b)
|
|||||||
_mm512_add_epi32(_mm512_mullo_epi32(a.v, b.v_hi),
|
_mm512_add_epi32(_mm512_mullo_epi32(a.v, b.v_hi),
|
||||||
_mm512_mulhi_epi32(a.v, b.v_lo)));
|
_mm512_mulhi_epi32(a.v, b.v_lo)));
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
static FORCEINLINE void __abs_i32i64(__m512i &_hi, __m512i &_lo)
|
static FORCEINLINE void __abs_i32i64(__m512i &_hi, __m512i &_lo)
|
||||||
{
|
{
|
||||||
/* abs(x) :
|
/* abs(x) :
|
||||||
@@ -1916,7 +1972,7 @@ static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i16 val) {
|
|||||||
|
|
||||||
static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i64 val) {
|
static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i64 val) {
|
||||||
__vec16_f ret;
|
__vec16_f ret;
|
||||||
// Cycles don't work. It seems that it is icc bug.
|
// Loops don't work here; this seems to be an icc bug.
|
||||||
/*
|
/*
|
||||||
for (int i = 0; i < 8; i++) {
|
for (int i = 0; i < 8; i++) {
|
||||||
((float*)&ret)[i] = ((float)(((uint64_t*)&tmp1)[i]));
|
((float*)&ret)[i] = ((float)(((uint64_t*)&tmp1)[i]));
|
||||||
@@ -1925,14 +1981,14 @@ static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i64 val) {
|
|||||||
((float*)&ret)[i + 8] = ((float)(((uint64_t*)&tmp2)[i]));
|
((float*)&ret)[i + 8] = ((float)(((uint64_t*)&tmp2)[i]));
|
||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
ret[0] = ((float)(((uint64_t*)&val.v_lo)[0]));
|
ret[0] = ((float)(val[0]));
|
||||||
ret[1] = ((float)(((uint64_t*)&val.v_lo)[1]));
|
ret[1] = ((float)(((uint64_t*)&val.v_lo)[1]));
|
||||||
ret[2] = ((float)(((uint64_t*)&val.v_lo)[2]));
|
ret[2] = ((float)(((uint64_t*)&val.v_lo)[2]));
|
||||||
ret[3] = ((float)(((uint64_t*)&val.v_lo)[3]));
|
ret[3] = ((float)(((uint64_t*)&val.v_lo)[3]));
|
||||||
ret[4] = ((float)(((uint64_t*)&val.v_lo)[4]));
|
ret[4] = ((float)(((uint64_t*)&val.v_lo)[4]));
|
||||||
ret[5] = ((float)(((uint64_t*)&val.v_lo)[5]));
|
ret[5] = ((float)(((uint64_t*)&val.v_lo)[5]));
|
||||||
ret[6] = ((float)(((uint64_t*)&val.v_lo)[6]));
|
ret[6] = ((float)(((uint64_t*)&val.v_lo)[6]));
|
||||||
ret[7] = ((float)(((uint64_t*)&val.v_hi)[7]));
|
ret[7] = ((float)(((uint64_t*)&val.v_lo)[7]));
|
||||||
ret[8] = ((float)(((uint64_t*)&val.v_hi)[0]));
|
ret[8] = ((float)(((uint64_t*)&val.v_hi)[0]));
|
||||||
ret[9] = ((float)(((uint64_t*)&val.v_hi)[1]));
|
ret[9] = ((float)(((uint64_t*)&val.v_hi)[1]));
|
||||||
ret[10] = ((float)(((uint64_t*)&val.v_hi)[2]));
|
ret[10] = ((float)(((uint64_t*)&val.v_hi)[2]));
|
||||||
@@ -1940,7 +1996,7 @@ static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i64 val) {
|
|||||||
ret[12] = ((float)(((uint64_t*)&val.v_hi)[4]));
|
ret[12] = ((float)(((uint64_t*)&val.v_hi)[4]));
|
||||||
ret[13] = ((float)(((uint64_t*)&val.v_hi)[5]));
|
ret[13] = ((float)(((uint64_t*)&val.v_hi)[5]));
|
||||||
ret[14] = ((float)(((uint64_t*)&val.v_hi)[6]));
|
ret[14] = ((float)(((uint64_t*)&val.v_hi)[6]));
|
||||||
ret[15] = ((float)(((uint64_t*)&val.v_hi)[7]));
|
ret[15] = ((float)(((uint64_t*)&val.v_hi)[7]));
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user