added several math functions, new runfails: gather-int8-2/4; ldexp-double (previously compfailed)

2014-09-11 17:36:33 +04:00
parent efa0ea01f3
commit 78a7ef9fc5
1 changed files with 23 additions and 0 deletions
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -800,6 +800,13 @@ static FORCEINLINE __vec16_i64 __shl(__vec16_i64 a, __vec16_i64 b) {
    return __vec16_i64(lo, hi);
 }

+static FORCEINLINE __vec16_i64 __shl(__vec16_i64 a, unsigned long long b) {  // 64-bit left shift of a by the scalar count b, built from 32-bit shifts on the v_lo/v_hi halves
+  __vec16_i32 hi = _mm512_or_epi32(_mm512_slli_epi32(a.v_hi, b), // high half: its own bits shifted left by b ...
+                                   _mm512_srli_epi32(a.v_lo, 32-b));  // ... OR'ed with the low half shifted right by 32-b (the bits that cross into the high half)
+  __vec16_i32 lo = _mm512_slli_epi32(a.v_lo, b);  // low half: plain left shift by b
+  return __vec16_i64(lo, hi);  // NOTE(review): only obviously right for 0 < b < 32; at b == 0 the carry term shifts by 32, and for b > 32 `32-b` wraps (b is unsigned)
+}  // TODO confirm how the shift intrinsics treat counts >= 32 before relying on b outside 1..31
+
 static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 a, __vec16_i64 b) {
    __vec16_i32 shift = _mm512_sub_epi32(__ispc_thirty_two, b.v_lo);
    __vec16_i32 xfer = _mm512_and_epi32(_mm512_sllv_epi32(__ispc_ffffffff, shift), _mm512_sllv_epi32(a.v_hi, shift));
@@ -820,6 +827,16 @@ static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 a, __vec16_i64 b) {
    return __vec16_i64(lo, hi);
 }

+static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 a, unsigned long long b) {  // 64-bit arithmetic right shift of a by the scalar count b; returns the shifted (lo, hi) pair
+  __vec16_i32 xfer  // low b bits of the high half, moved up by 32-b so they land at the top of the low half
+    = _mm512_slli_epi32(_mm512_and_epi32(a.v_hi, 
+                                         _mm512_set1_epi32((int)((1u<<b)-1))),  // mask built in unsigned arithmetic: a signed 1<<b overflows at b == 31; mask is 0 when b == 0
+                        32-b);
+  __vec16_i32 hi = _mm512_srai_epi32(a.v_hi, b);  // high half: arithmetic (sign-propagating) shift
+  __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srli_epi32(a.v_lo, b));  // low half: logical shift, then merge the bits carried down from the high half
+  return __vec16_i64(lo, hi);  // NOTE(review): b must stay below 32 -- 1u<<b is undefined for larger counts (assuming 32-bit unsigned) and `32-b` wraps
+}
+
 static FORCEINLINE __vec16_i1 __equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) {
    const __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo);
    return _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi);
@@ -849,6 +866,10 @@ static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask,
    return ret;
 }

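+// static FORCEINLINE int64_t __extract_element(__vec16_i64 v, uint32_t index) {
+//     return (uint64_t(((int32_t *)&v.v_hi)[index])<<32) | (uint64_t(((int32_t *)&v.v_lo)[index]));
+// } -- NOTE(review): disabled draft; widening the int32_t low word to uint64_t sign-extends it, so a low word with bit 31 set overwrites the high half. Read v_lo as uint32_t before re-enabling.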
+
 template <class RetVecType> static RetVecType __smear_i64(const int64_t &l);
 template <> FORCEINLINE  __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) {
    const int *i = (const int*)&l;
@@ -1609,12 +1630,14 @@ static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) {
    return _mm512_invsqrt_ps(v);
 #endif
 }
+
 static FORCEINLINE __vec16_d __rsqrt_varying_double(__vec16_d x) {
  __vec16_d y;
  for (int i = 0; i < 16; i++)
    __insert_element(&y, i, 1.0/sqrt(__extract_element(x,i)));
  return y;
 }
+
 static FORCEINLINE double __rsqrt_uniform_double(double v) 
 {
  return 1.0/v;