fixed lshr/ashr/shl shifts. The i64 vector __mul for icc < 14.0.0 fails when one or both operands are negative, so it is commented out in favour of the sequential version.

evghenii
2013-10-07 14:24:27 +03:00
parent 1b196520f6
commit 4222605f87
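
The sequential fallback the commit message refers to is outside this diff. As a rough, illustrative sketch (plain C++, made-up names, not the actual knc.h code), a lane-by-lane 64-bit multiply of the kind the code falls back to looks like this:

    #include <cstdint>

    // Illustrative only: per-lane ("sequential") 64-bit multiply.
    // The struct and function names are invented for this sketch and are
    // not the knc.h definitions.
    struct lanes_i64 { int64_t v[16]; };

    static inline lanes_i64 mul_i64_sequential(const lanes_i64 &a, const lanes_i64 &b) {
        lanes_i64 r;
        for (int i = 0; i < 16; ++i)
            // wrap-around 64-bit product, correct regardless of operand signs
            r.v[i] = (int64_t)((uint64_t)a.v[i] * (uint64_t)b.v[i]);
        return r;
    }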


@@ -668,6 +668,7 @@ template <class RetVecType> RetVecType __smear_i32(int32_t i);
template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); }
static const __vec16_i32 __ispc_one = __smear_i32<__vec16_i32>(1);
static const __vec16_i32 __ispc_zero = __smear_i32<__vec16_i32>(0);
static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32);
static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1);
static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
@@ -772,18 +773,18 @@ static FORCEINLINE __vec16_i64 __mul(__vec16_i64 a, __vec16_i64 b)
return __vec16_i64(_mm512_mullox_epi64(a.v1,b.v1), _mm512_mullox_epi64(a.v2,b.v2));
}
#else /* __ICC >= 1400 */
#if 0 /* knc::fails ./tests/int64-min-1.ispc ./tests/idiv.ispc */
#if 0 /* knc::fails ./tests/int64-min-1.ispc ./tests/idiv.ispc cause: if one or both numbers are negative multiplication fails */
static FORCEINLINE __vec16_i64 __mul(__vec16_i64 _a, __vec16_i64 _b)
{
__vec16_i64 a = _a.cvt2hilo();
__vec16_i64 b = _b.cvt2hilo();
__vec16_i32 lo = _mm512_mullo_epi32(a.v_lo,b.v_lo);
__vec16_i32 hi_m1 = _mm512_mulhi_epi32(a.v_lo, b.v_lo);
__vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo);
__vec16_i32 hi_m3 = _mm512_mullo_epi32(a.v_lo, b.v_hi);
__mmask16 carry = 0;
__vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m1, &carry);
__vec16_i32 hi = _mm512_adc_epi32(hi_m3, carry, hi_p23, &carry);
const __vec16_i64 a = _a.cvt2hilo();
const __vec16_i64 b = _b.cvt2hilo();
const __vec16_i32 lo = _mm512_mullo_epi32(a.v_lo, b.v_lo);
const __vec16_i32 hi_m1 = _mm512_mulhi_epi32(a.v_lo, b.v_lo);
const __vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo);
const __vec16_i32 hi_m3 = _mm512_mullo_epi32(a.v_lo, b.v_hi);
__mmask16 carry;
const __vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m3, &carry);
const __vec16_i32 hi = _mm512_adc_epi32(hi_p23, carry, hi_m1, &carry);
return __vec16_i64(hi,lo).cvt2zmm();
}
#else
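
For reference, the disabled block above implements the usual partial-product scheme; a scalar sketch in plain C++ (illustrative names only) is below. The key point is that the low 32x32 product must be treated as unsigned; _mm512_mulhi_epi32 computes a signed high half, which presumably goes wrong whenever a low 32-bit half has its sign bit set (as it does for most negative values), matching the failing int64-min-1/idiv tests noted in the comment.

    #include <cstdint>

    // Illustrative only: 64x64 -> low-64 multiply from 32-bit halves,
    // mirroring the lo / hi_m1 / hi_m2 / hi_m3 structure of the vector code.
    static inline int64_t mul64_from_halves(int64_t a, int64_t b) {
        const uint32_t a_lo = (uint32_t)a,                   b_lo = (uint32_t)b;
        const uint32_t a_hi = (uint32_t)((uint64_t)a >> 32), b_hi = (uint32_t)((uint64_t)b >> 32);

        const uint64_t lo_prod = (uint64_t)a_lo * b_lo;      // UNSIGNED 32x32 -> 64 product ("lo")
        const uint32_t hi = (uint32_t)(lo_prod >> 32)        // high half of the low product ("hi_m1")
                          + a_hi * b_lo                      // cross term ("hi_m2"), mod 2^32
                          + a_lo * b_hi;                     // cross term ("hi_m3"), mod 2^32
        return (int64_t)(((uint64_t)hi << 32) | (uint32_t)lo_prod);
    }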
@@ -795,60 +796,68 @@ static FORCEINLINE __vec16_i64 __or (__vec16_i64 a, __vec16_i64 b) { return __ve
static FORCEINLINE __vec16_i64 __and(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_and_epi64(a.v1, b.v1), _mm512_and_epi64(a.v2, b.v2)); }
static FORCEINLINE __vec16_i64 __xor(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_xor_epi64(a.v1, b.v1), _mm512_xor_epi64(a.v2, b.v2)); }
static FORCEINLINE __vec16_i64 __shl(__vec16_i64 _a, __vec16_i64 _b)
{
const __vec16_i64 a = _a.cvt2hilo();
const __vec16_i64 b = _b.cvt2hilo();
__vec16_i32 xfer = _mm512_srlv_epi32(a.v_lo, _mm512_sub_epi32(__ispc_thirty_two, b.v_lo));
__vec16_i32 hi = _mm512_or_epi32(_mm512_sllv_epi32(a.v_hi, b.v_lo), xfer);
__vec16_i32 lo = _mm512_sllv_epi32(a.v_lo, b.v_lo);
return __vec16_i64(hi,lo).cvt2zmm();
}
static FORCEINLINE __vec16_i64 __udiv(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_div_epu64(a.v1,b.v1), _mm512_div_epu64(a.v2,b.v2)); }
static FORCEINLINE __vec16_i64 __sdiv(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_div_epi64(a.v1,b.v1), _mm512_div_epi64(a.v2,b.v2)); }
static FORCEINLINE __vec16_i64 __urem(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_rem_epu64(a.v1,b.v1), _mm512_rem_epu64(a.v2,b.v2)); }
static FORCEINLINE __vec16_i64 __srem(__vec16_i64 a, __vec16_i64 b) { return __vec16_i64(_mm512_rem_epi64(a.v1,b.v1), _mm512_rem_epi64(a.v2,b.v2)); }
#if 0 /* knc::fails ./tests/idiv.ispc */
static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 _a, __vec16_i64 _b) {
static FORCEINLINE __vec16_i64 __shl(__vec16_i64 _a, __vec16_i64 _b)
{
const __vec16_i64 a = _a.cvt2hilo();
const __vec16_i64 b = _b.cvt2hilo();
__vec16_i32 shift = _mm512_sub_epi32(__ispc_thirty_two, b.v_lo);
#if 0
__vec16_i32 xfer = _mm512_and_epi32(_mm512_sllv_epi32(__ispc_ffffffff, shift), _mm512_sllv_epi32(a.v_hi, shift));
#else
__vec16_i32 xfer = _mm512_sllv_epi32(_mm512_and_epi32(a.v_hi,
_mm512_sub_epi32(_mm512_sllv_epi32(__ispc_one, b.v_lo), __ispc_one)),
_mm512_sub_epi32(__ispc_thirty_two, b.v_lo));
#endif
__vec16_i32 hi = _mm512_srlv_epi32(a.v_hi, b.v_lo);
__vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo));
/* this is a safety gate in case b-shift >= 32 */
const __vec16_i32 xfer = __select(
__signed_less_than_i32(b.v_lo, __ispc_thirty_two),
__lshr(a.v_lo, __sub(__ispc_thirty_two, b.v_lo)),
__shl (a.v_lo, __sub(b.v_lo, __ispc_thirty_two))
);
const __vec16_i32 hi = __or(__shl(a.v_hi, b.v_lo), xfer);
const __vec16_i32 lo = __shl(a.v_lo, b.v_lo);
return __vec16_i64(hi,lo).cvt2zmm();
}
#else
BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>)
#endif
#if 0 /* knc::fails ./tests/idiv.ispc */
static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 _a, __vec16_i64 _b) {
static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 _a, __vec16_i64 _b)
{
const __vec16_i64 a = _a.cvt2hilo();
const __vec16_i64 b = _b.cvt2hilo();
__vec16_i32 xfer = _mm512_sllv_epi32(_mm512_and_epi32(a.v_hi,
_mm512_sub_epi32(_mm512_sllv_epi32(__ispc_one, b.v_lo), __ispc_one)),
_mm512_sub_epi32(__ispc_thirty_two, b.v_lo));
__vec16_i32 hi = _mm512_srav_epi32(a.v_hi, b.v_lo);
__vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo));
/* this is a safety gate in case b-shift >= 32 */
const __vec16_i32 xfer = __select(
__signed_less_than_i32(b.v_lo, __ispc_thirty_two),
__shl (a.v_hi, __sub(__ispc_thirty_two, b.v_lo)),
__lshr(a.v_hi, __sub(b.v_lo, __ispc_thirty_two))
);
const __vec16_i32 lo = __or(__lshr(a.v_lo, b.v_lo), xfer);
const __vec16_i32 hi = __lshr(a.v_hi, b.v_lo);
return __vec16_i64(hi,lo).cvt2zmm();
}
static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 _a, __vec16_i64 _b)
{
const __vec16_i64 a = _a.cvt2hilo();
const __vec16_i64 b = _b.cvt2hilo();
/* this is a safety gate in case b-shift >= 32 */
const __vec16_i32 xfer = __select(
__signed_less_than_i32(b.v_lo, __ispc_thirty_two),
__shl (a.v_hi, __sub(__ispc_thirty_two, b.v_lo)),
__ashr(a.v_hi, __sub(b.v_lo, __ispc_thirty_two))
);
const __vec16_i32 lo = __or(__lshr(a.v_lo, b.v_lo), xfer);
const __vec16_i32 hi = __ashr(a.v_hi, b.v_lo);
return __vec16_i64(hi,lo).cvt2zmm();
}
#else
BINARY_OP_CAST(__vec16_i64, int64_t, __ashr, >>)
#endif
SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<)
template <class RetVecType> RetVecType __smear_i64(const int64_t &l);
template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) { return __vec16_i64(_mm512_set1_epi64(l), _mm512_set1_epi64(l)); }
template <class RetVecType> RetVecType __setzero_i64();
template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { return __vec16_i64(_mm512_setzero_epi32(), _mm512_setzero_epi32()); }
template <class RetVecType> RetVecType __undef_i64();
template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { return __vec16_i64(_mm512_undefined_epi32(), _mm512_undefined_epi32()); }
static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 a, uint64_t shift) { return __lshr(a, __smear_i64<__vec16_i64>(shift)); }
static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 a, int64_t shift) { return __ashr(a, __smear_i64<__vec16_i64>(shift)); }
static FORCEINLINE __vec16_i64 __shl (__vec16_i64 a, int64_t shift) { return __shl (a, __smear_i64<__vec16_i64>(shift)); }
static FORCEINLINE __vec16_i1 __equal_i64(__vec16_i64 _a, __vec16_i64 _b)
{
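
The select-based "safety gate" in the rewritten shifts handles per-lane shift counts of 32 or more, which a single 32-bit shift cannot express. A scalar sketch of the same hi/lo composition for the logical right shift (illustrative, plain C++ rather than the knc.h vector helpers; counts assumed to be in [0, 64)):

    #include <cstdint>

    // Illustrative only: 64-bit logical right shift of a value held as two
    // 32-bit words, mirroring the per-lane logic of the fixed __lshr above.
    static inline void lshr64_hilo(uint32_t &hi, uint32_t &lo, uint32_t s) {
        if (s == 0) return;                        // nothing to do; also avoids a shift by 32 below
        if (s < 32) {
            const uint32_t xfer = hi << (32 - s);  // bits crossing from hi into lo
            lo = (lo >> s) | xfer;
            hi >>= s;
        } else {                                   // the "shift >= 32" case the select guards against
            lo = hi >> (s - 32);
            hi = 0;
        }
    }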
@@ -892,14 +901,6 @@ static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask, __vec16_i64 a, __vec16_
INSERT_EXTRACT(__vec16_i64, int64_t)
template <class RetVecType> RetVecType __smear_i64(const int64_t &l);
template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) { return __vec16_i64(_mm512_set1_epi64(l), _mm512_set1_epi64(l)); }
template <class RetVecType> RetVecType __setzero_i64();
template <> FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { return __vec16_i64(_mm512_setzero_epi32(), _mm512_setzero_epi32()); }
template <class RetVecType> RetVecType __undef_i64();
template <> FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() { return __vec16_i64(_mm512_undefined_epi32(), _mm512_undefined_epi32()); }
#define CASTL2I(_v_, _v_hi_, _v_lo_) \
__vec16_i32 _v_hi_, _v_lo_; \