Fix cast_uitofp typo; add printf_v helpers for vector types
This commit is contained in:
@@ -308,13 +308,68 @@ inline std::ostream &operator<<(std::ostream &out, const __vec16_i64 &v)
|
||||
out << "[";
|
||||
uint32_t *ptr = (uint32_t*)&v;
|
||||
for (int i=0;i<16;i++) {
|
||||
uint64_t val = (uint64_t(ptr[i])<<32)+ptr[i+16];
|
||||
out << (i!=0?",":"") << std::dec << std::setw(8) << ((uint64_t)val) << std::dec;
|
||||
out << (i!=0?",":"") << std::dec << std::setw(8) << ((uint64_t)v[i]) << std::dec;
|
||||
}
|
||||
out << "]" << std::flush;
|
||||
return out;
|
||||
}
|
||||
|
||||
// C-style debugging helpers
|
||||
// Debug helper: reinterprets the storage of a __m512i as 16 ints and prints
// them as signed decimals, each followed by a space, wrapped in "[" ... "]"
// and terminated by a newline.
inline void printf_v(const __m512i &v)
{
    int *lanes = (int*)&v;
    int lane = 0;

    printf("[");
    while (lane < 16) {
        printf("%d ", lanes[lane]);
        ++lane;
    }
    printf("]\n");
}
|
||||
|
||||
// Debug helper: reinterprets the storage of a __m512 as 16 floats and prints
// each with "%f" followed by a space, wrapped in "[" ... "]" and terminated
// by a newline.
inline void printf_v(const __m512 &v)
{
    float *lanes = (float*)&v;
    int lane = 0;

    printf("[");
    while (lane < 16) {
        printf("%f ", lanes[lane]);
        ++lane;
    }
    printf("]\n");
}
|
||||
|
||||
inline void printf_v(const __vec16_i1 &v)
|
||||
{
|
||||
printf("[");
|
||||
for (int i=0;i<16;i++)
|
||||
printf("%d ", (int)v[i]);
|
||||
printf("]\n");
|
||||
}
|
||||
|
||||
// Debug helper: reads the first 16 bytes of a __vec16_i8 as unsigned chars
// and prints them as decimals (so values appear in the range 0..255), each
// followed by a space, wrapped in "[" ... "]" and terminated by a newline.
inline void printf_v(const __vec16_i8 &v)
{
    unsigned char *bytes = (unsigned char*)&v;
    int lane = 0;

    printf("[");
    while (lane < 16) {
        printf("%d ", (int)bytes[lane]);
        ++lane;
    }
    printf("]\n");
}
|
||||
|
||||
// Debug helper: reinterprets the storage of a __vec16_i16 as 16 uint16_t
// values and prints them as decimals (unsigned interpretation), each followed
// by a space, wrapped in "[" ... "]" and terminated by a newline.
inline void printf_v(const __vec16_i16 &v)
{
    uint16_t *halves = (uint16_t*)&v;
    int lane = 0;

    printf("[");
    while (lane < 16) {
        printf("%d ", (int)halves[lane]);
        ++lane;
    }
    printf("]\n");
}
|
||||
|
||||
inline void printf_v(const __vec16_d &v)
|
||||
{
|
||||
printf("[");
|
||||
for (int i=0;i<16;i++)
|
||||
printf("%f ", v[i]);
|
||||
printf("]\n");
|
||||
}
|
||||
|
||||
inline void printf_v(const __vec16_i64 &v)
|
||||
{
|
||||
printf("[");
|
||||
for (int i=0;i<16;i++)
|
||||
printf("%llu ", ((uint64_t)v[i]));
|
||||
printf("]\n");
|
||||
}
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// macros...
|
||||
|
||||
@@ -784,8 +839,8 @@ static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask,
|
||||
ret.v_lo = _mm512_mask_blend_epi64(mask.lo(), b.v_lo, a.v_lo);
|
||||
ret.v_hi = _mm512_mask_blend_epi64(mask.hi(), b.v_hi, a.v_hi);
|
||||
// TODO: Check if this works better:
|
||||
ret.v_lo = _mm512_mask_mov_epi32(b.v_lo, mask, a.v_lo);
|
||||
ret.v_hi = _mm512_mask_mov_epi32(b.v_hi, mask, a.v_hi);
|
||||
//ret.v_lo = _mm512_mask_mov_epi32(b.v_lo, mask, a.v_lo);
|
||||
//ret.v_hi = _mm512_mask_mov_epi32(b.v_hi, mask, a.v_hi);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -831,6 +886,7 @@ static FORCEINLINE __vec16_i64 __sub(const __vec16_i64 &a, const __vec16_i64 &b)
|
||||
|
||||
/*! 64x32 bit mul -- address computations often use a scale that we
|
||||
know is 32 bits; and 32x64 is faster than 64x64 */
|
||||
/*
|
||||
static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &b)
|
||||
{
|
||||
// TODO
|
||||
@@ -838,7 +894,7 @@ static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &b)
|
||||
_mm512_add_epi32(_mm512_mullo_epi32(a.v, b.v_hi),
|
||||
_mm512_mulhi_epi32(a.v, b.v_lo)));
|
||||
}
|
||||
|
||||
*/
|
||||
static FORCEINLINE void __abs_i32i64(__m512i &_hi, __m512i &_lo)
|
||||
{
|
||||
/* abs(x) :
|
||||
@@ -1916,7 +1972,7 @@ static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i16 val) {
|
||||
|
||||
static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i64 val) {
|
||||
__vec16_f ret;
|
||||
// Cycles don't work. It seems that it is icc bug.
|
||||
// Loops don't work here. It seems to be an icc bug.
|
||||
/*
|
||||
for (int i = 0; i < 8; i++) {
|
||||
((float*)&ret)[i] = ((float)(((uint64_t*)&tmp1)[i]));
|
||||
@@ -1925,14 +1981,14 @@ static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i64 val) {
|
||||
((float*)&ret)[i + 8] = ((float)(((uint64_t*)&tmp2)[i]));
|
||||
}
|
||||
*/
|
||||
ret[0] = ((float)(((uint64_t*)&val.v_lo)[0]));
|
||||
ret[0] = ((float)(val[0]));
|
||||
ret[1] = ((float)(((uint64_t*)&val.v_lo)[1]));
|
||||
ret[2] = ((float)(((uint64_t*)&val.v_lo)[2]));
|
||||
ret[3] = ((float)(((uint64_t*)&val.v_lo)[3]));
|
||||
ret[4] = ((float)(((uint64_t*)&val.v_lo)[4]));
|
||||
ret[5] = ((float)(((uint64_t*)&val.v_lo)[5]));
|
||||
ret[6] = ((float)(((uint64_t*)&val.v_lo)[6]));
|
||||
ret[7] = ((float)(((uint64_t*)&val.v_hi)[7]));
|
||||
ret[7] = ((float)(((uint64_t*)&val.v_lo)[7]));
|
||||
ret[8] = ((float)(((uint64_t*)&val.v_hi)[0]));
|
||||
ret[9] = ((float)(((uint64_t*)&val.v_hi)[1]));
|
||||
ret[10] = ((float)(((uint64_t*)&val.v_hi)[2]));
|
||||
@@ -1940,7 +1996,7 @@ static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i64 val) {
|
||||
ret[12] = ((float)(((uint64_t*)&val.v_hi)[4]));
|
||||
ret[13] = ((float)(((uint64_t*)&val.v_hi)[5]));
|
||||
ret[14] = ((float)(((uint64_t*)&val.v_hi)[6]));
|
||||
ret[15] = ((float)(((uint64_t*)&val.v_hi)[7]));
|
||||
ret[15] = ((float)(((uint64_t*)&val.v_hi)[7]));
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user