diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index a7b37bbd..aa9bda32 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -509,7 +509,7 @@ static FORCEINLINE __vec16_i32 __add(__vec16_i32 a, __vec16_i32 b) { return _mm512_add_epi32(a, b); } -static FORCEINLINE __vec16_i32 __sub(__vec16_i32 a, __vec16_i32 b) { +static FORCEINLINE __vec16_i32 __sub(__vec16_i32 a, __vec16_i32 b) { return _mm512_sub_epi32(a, b); } @@ -964,7 +964,11 @@ static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 a, __vec16_i64 b) { } static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 a, unsigned long long b) { - __vec16_i32 xfer = _mm512_and_epi32(_mm512_slli_epi32(__ispc_ffffffff, 32-b), _mm512_slli_epi32(a.v_hi, 32-b)); + /* Safety gate for shift counts above 32: b is unsigned, so 32-b cannot go negative and would wrap around once b exceeds 32. When 32 < b, xfer (the bits OR-ed into the low half below) is taken from a.v_hi shifted right by b-32; otherwise (b <= 32, including b == 32 where 32-b is 0) the 32-b left-shift path is used. */ + __vec16_i32 xfer; + if (32 < b) xfer = __lshr(a.v_hi, b-32); + else xfer = _mm512_and_epi32(_mm512_slli_epi32(__ispc_ffffffff, 32-b), _mm512_slli_epi32(a.v_hi, 32-b)); + __vec16_i32 hi = _mm512_srli_epi32(a.v_hi, b); __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srli_epi32(a.v_lo, b)); return __vec16_i64(lo, hi);