Patch for examples/intrinsics/knl.h.
 * Rewords the #warning emitted when __INTEL_COMPILER < 1500: compilers older than ICC 15.0 are the unsupported ones, so the text says "15.0 and newer".
 * Moves the __vec16_i32 overloads of __cast_sitofp / __cast_uitofp (returning __vec16_f) from after the __vec16_i16 overloads to before the i8 overloads; their bodies are unchanged.
 * Drops one blank line before __cast_uitofp(__vec16_f, __vec16_i1).
NOTE(review): presumably the move makes the i32 overloads visible before the i8/i16 overloads that forward to them via __cast_sext / __cast_zext - confirm against the full header.

diff --git a/examples/intrinsics/knl.h b/examples/intrinsics/knl.h index e1692d84..2814674a 100644 --- a/examples/intrinsics/knl.h +++ b/examples/intrinsics/knl.h @@ -50,7 +50,7 @@ #include // for operator<<(m512[i]) #if __INTEL_COMPILER < 1500 -#warning "Your compiler version is outdated which can reduce performance in some cases. Please, update your compiler!" +#warning "Only ICC 15.0 and newer are supported. Please, update your compiler!" #endif @@ -1780,6 +1780,10 @@ static FORCEINLINE __vec16_i64 __cast_zext(const __vec16_i64 &, const __vec16_i3 return __vec16_i64(val.v, _mm512_setzero_epi32()); } +static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i32 val) { + return _mm512_cvtepi32_ps(val); +} + static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i8 val) { return __cast_sitofp(__vec16_f(), __cast_sext(__vec16_i32(), val)); } @@ -1788,10 +1792,6 @@ static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i16 val) { return __cast_sitofp(__vec16_f(), __cast_sext(__vec16_i32(), val)); } -static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i32 val) { - return _mm512_cvtepi32_ps(val); -} - static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i64 val) { __vec16_f ret; @@ -1863,7 +1863,6 @@ static FORCEINLINE __vec16_d __cast_sitofp(__vec16_d, __vec16_i64 val) { return ret; } - static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i1 v) { const __m512 ret = _mm512_setzero_ps(); @@ -1871,6 +1870,10 @@ static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i1 v) return _mm512_mask_mov_ps(ret, v, one); } +static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i32 v) { + return _mm512_cvtepu32_ps(v); +} + static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, const __vec16_i8 &v) { return __cast_uitofp(__vec16_f(), __cast_zext(__vec16_i32(), v)); } @@ -1879,10 +1882,6 @@ static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i16 val) { return __cast_uitofp(__vec16_f(), __cast_zext(__vec16_i32(),
val)); } -static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i32 v) { - return _mm512_cvtepu32_ps(v); -} - static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i64 val) { __vec16_f ret; // Cycles don't work. It seems that it is icc bug.