__vec16_i64 improved with the addition of __extract_element(), __insert_element(), __sub(), __mul(),
		   __sdiv(), __udiv(), __and(), __or(), __xor(), __shl(), __lshr(), __ashr(), __select()
	Fixed a bug in the __mul(__vec16_i32, __vec16_i64) implementation
	All constructors are explicitly inlined; copy constructors and operator=() are explicitly provided
	Loads and stores for __vec16_i64 and __vec16_d use aligned instructions when possible
	__rotate_i32() now has a vector implementation
	Added several reductions: __reduce_add_i32(), __reduce_min_i32(), __reduce_max_i32(),
	       __reduce_add_float(), __reduce_min_float(), __reduce_max_float()
Jean-Luc Duprat
2012-08-10 12:20:10 -07:00
parent 43364b2d69
commit 165a13b13e


@@ -57,9 +57,9 @@ extern "C" {
int puts(unsigned char *); int puts(unsigned char *);
unsigned int putchar(unsigned int); unsigned int putchar(unsigned int);
int fflush(void *); int fflush(void *);
uint8_t *memcpy(uint8_t *, uint8_t *, uint64_t ); uint8_t *memcpy(uint8_t *, uint8_t *, uint64_t);
uint8_t *memset(uint8_t *, uint8_t, uint64_t ); uint8_t *memset(uint8_t *, uint8_t, uint64_t);
void memset_pattern16(void *, const void *, uint64_t ); void memset_pattern16(void *, const void *, uint64_t);
} }
@@ -74,14 +74,16 @@ struct __vec16_i32;
typedef struct PRE_ALIGN(2) __vec16_i1 { typedef struct PRE_ALIGN(2) __vec16_i1 {
operator __mmask16() const { return m; } FORCEINLINE operator __mmask16() const { return m; }
__vec16_i1() { } FORCEINLINE __vec16_i1() { /* FIXME? __mm512_undef_mask(); */ }
__vec16_i1(const __mmask16& in) { m = in; } FORCEINLINE __vec16_i1(const __mmask16 &in) : m(in) {}
__vec16_i1(const __vec16_i32& in); FORCEINLINE __vec16_i1(const __vec16_i32 &in);
__vec16_i1(uint32_t v00, uint32_t v01, uint32_t v02, uint32_t v03, FORCEINLINE __vec16_i1(const __vec16_i1 &o) : m(o.m) {}
uint32_t v04, uint32_t v05, uint32_t v06, uint32_t v07, FORCEINLINE __vec16_i1& operator=(const __vec16_i1 &o) { m = o.m; return *this; }
uint32_t v08, uint32_t v09, uint32_t v10, uint32_t v11, FORCEINLINE __vec16_i1(uint32_t v00, uint32_t v01, uint32_t v02, uint32_t v03,
uint32_t v12, uint32_t v13, uint32_t v14, uint32_t v15) { uint32_t v04, uint32_t v05, uint32_t v06, uint32_t v07,
uint32_t v08, uint32_t v09, uint32_t v10, uint32_t v11,
uint32_t v12, uint32_t v13, uint32_t v14, uint32_t v15) {
m = (v00) | m = (v00) |
((v01) << 1) | ((v01) << 1) |
((v02) << 2) | ((v02) << 2) |
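For reference, a minimal scalar sketch (not part of this diff) of the bit-packing the __vec16_i1 constructor above performs; like the constructor, it assumes every input lane is already 0 or 1:

static inline unsigned short pack_mask16(const unsigned int v[16]) {
    unsigned short m = 0;
    for (int i = 0; i < 16; ++i)
        m |= (unsigned short)(v[i] << i);   // one bit per lane, lane 0 in bit 0
    return m;
}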
@@ -110,24 +112,28 @@ typedef struct PRE_ALIGN(2) __vec16_i1 {
} POST_ALIGN(2) __vec16_i1; } POST_ALIGN(2) __vec16_i1;
typedef struct PRE_ALIGN(64) __vec16_f { typedef struct PRE_ALIGN(64) __vec16_f {
operator __m512() const { return v; } FORCEINLINE operator __m512() const { return v; }
__vec16_f() { } FORCEINLINE __vec16_f() : v(_mm512_undefined_ps()) { }
__vec16_f(const __m512& in) { v = in; } FORCEINLINE __vec16_f(const __m512 &in) : v(in) {}
__vec16_f(float v00, float v01, float v02, float v03, FORCEINLINE __vec16_f(const __vec16_f &o) : v(o.v) {}
float v04, float v05, float v06, float v07, FORCEINLINE __vec16_f& operator =(const __vec16_f &o) { v=o.v; return *this; }
float v08, float v09, float v10, float v11, FORCEINLINE __vec16_f(float v00, float v01, float v02, float v03,
float v12, float v13, float v14, float v15) { float v04, float v05, float v06, float v07,
float v08, float v09, float v10, float v11,
float v12, float v13, float v14, float v15) {
v = _mm512_set_16to16_ps(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00); v = _mm512_set_16to16_ps(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00);
} }
__m512 v; __m512 v;
} POST_ALIGN(64) __vec16_f; } POST_ALIGN(64) __vec16_f;
typedef struct PRE_ALIGN(64) __vec16_d { typedef struct PRE_ALIGN(64) __vec16_d {
__vec16_d() { } FORCEINLINE __vec16_d() : v1(_mm512_undefined_pd()), v2(_mm512_undefined_pd()) {}
__vec16_d(double v00, double v01, double v02, double v03, FORCEINLINE __vec16_d(const __vec16_d &o) : v1(o.v1), v2(o.v2) {}
double v04, double v05, double v06, double v07, FORCEINLINE __vec16_d& operator =(const __vec16_d &o) { v1=o.v1; v2=o.v2; return *this; }
double v08, double v09, double v10, double v11, FORCEINLINE __vec16_d(double v00, double v01, double v02, double v03,
double v12, double v13, double v14, double v15) { double v04, double v05, double v06, double v07,
double v08, double v09, double v10, double v11,
double v12, double v13, double v14, double v15) {
v1 = _mm512_set_8to8_pd(v15, v14, v13, v12, v11, v10, v09, v08); v1 = _mm512_set_8to8_pd(v15, v14, v13, v12, v11, v10, v09, v08);
v2 = _mm512_set_8to8_pd(v07, v06, v05, v04, v03, v02, v01, v00); v2 = _mm512_set_8to8_pd(v07, v06, v05, v04, v03, v02, v01, v00);
} }
@@ -136,32 +142,34 @@ typedef struct PRE_ALIGN(64) __vec16_d {
} POST_ALIGN(64) __vec16_d; } POST_ALIGN(64) __vec16_d;
typedef struct PRE_ALIGN(64) __vec16_i32 { typedef struct PRE_ALIGN(64) __vec16_i32 {
operator __m512i() const { return v; } FORCEINLINE operator __m512i() const { return v; }
__vec16_i32() {} FORCEINLINE __vec16_i32() : v(_mm512_undefined_epi32()) {}
__vec16_i32(const int32_t &in) : v(_mm512_set_1to16_epi32(in)) {}; FORCEINLINE __vec16_i32(const int32_t &in) : v(_mm512_set_1to16_epi32(in)) {}
__vec16_i32(const __m512i &in) { v = in; } FORCEINLINE __vec16_i32(const __m512i &in) : v(in) {}
__vec16_i32(const __vec16_i32 &in) { v = in.v; } FORCEINLINE __vec16_i32(const __vec16_i32 &o) : v(o.v) {}
__vec16_i32(int32_t v00, int32_t v01, int32_t v02, int32_t v03, FORCEINLINE __vec16_i32& operator =(const __vec16_i32 &o) { v=o.v; return *this; }
int32_t v04, int32_t v05, int32_t v06, int32_t v07, FORCEINLINE __vec16_i32(int32_t v00, int32_t v01, int32_t v02, int32_t v03,
int32_t v08, int32_t v09, int32_t v10, int32_t v11, int32_t v04, int32_t v05, int32_t v06, int32_t v07,
int32_t v12, int32_t v13, int32_t v14, int32_t v15) { int32_t v08, int32_t v09, int32_t v10, int32_t v11,
int32_t v12, int32_t v13, int32_t v14, int32_t v15) {
v = _mm512_set_16to16_pi(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00); v = _mm512_set_16to16_pi(v15, v14, v13, v12, v11, v10, v09, v08, v07, v06, v05, v04, v03, v02, v01, v00);
} }
__m512i v; __m512i v;
} POST_ALIGN(64) __vec16_i32; } POST_ALIGN(64) __vec16_i32;
FORCEINLINE __vec16_i1::__vec16_i1(const __vec16_i32& in) { FORCEINLINE __vec16_i1::__vec16_i1(const __vec16_i32 &in) {
m = _mm512_test_epi32_mask(in, in); m = _mm512_test_epi32_mask(in, in);
} }
typedef struct PRE_ALIGN(64) __vec16_i64 { typedef struct PRE_ALIGN(64) __vec16_i64 {
__forceinline __vec16_i64(); FORCEINLINE __vec16_i64() : v_lo(_mm512_undefined_epi32()), v_hi(_mm512_undefined_epi32()) {}
__forceinline __vec16_i64(const __vec16_i64 &o) : v_lo(o.v_lo), v_hi(o.v_hi) {}; FORCEINLINE __vec16_i64(const __vec16_i64 &o) : v_lo(o.v_lo), v_hi(o.v_hi) {}
__forceinline __vec16_i64(__m512i l, __m512i h): v_lo(l), v_hi(h) {}; FORCEINLINE __vec16_i64(__m512i l, __m512i h) : v_lo(l), v_hi(h) {}
__vec16_i64(int64_t v00, int64_t v01, int64_t v02, int64_t v03, FORCEINLINE __vec16_i64& operator =(const __vec16_i64 &o) { v_lo=o.v_lo; v_hi=o.v_hi; return *this; }
int64_t v04, int64_t v05, int64_t v06, int64_t v07, FORCEINLINE __vec16_i64(int64_t v00, int64_t v01, int64_t v02, int64_t v03,
int64_t v08, int64_t v09, int64_t v10, int64_t v11, int64_t v04, int64_t v05, int64_t v06, int64_t v07,
int64_t v12, int64_t v13, int64_t v14, int64_t v15) { int64_t v08, int64_t v09, int64_t v10, int64_t v11,
int64_t v12, int64_t v13, int64_t v14, int64_t v15) {
__m512i v1 = _mm512_set_8to8_epi64(v15, v14, v13, v12, v11, v10, v09, v08); __m512i v1 = _mm512_set_8to8_epi64(v15, v14, v13, v12, v11, v10, v09, v08);
__m512i v2 = _mm512_set_8to8_epi64(v07, v06, v05, v04, v03, v02, v01, v00); __m512i v2 = _mm512_set_8to8_epi64(v07, v06, v05, v04, v03, v02, v01, v00);
v_hi = _mm512_mask_permutevar_epi32(v_hi, 0xFF00, v_hi = _mm512_mask_permutevar_epi32(v_hi, 0xFF00,
@@ -181,16 +189,11 @@ typedef struct PRE_ALIGN(64) __vec16_i64 {
__m512i v_lo; __m512i v_lo;
} POST_ALIGN(64) __vec16_i64; } POST_ALIGN(64) __vec16_i64;
FORCEINLINE __vec16_i64::__vec16_i64()
: v_lo(_mm512_undefined_epi32()),
v_hi(_mm512_undefined_epi32())
{}
template <typename T> template <typename T>
struct vec16 { struct vec16 {
vec16() { } FORCEINLINE vec16() { }
vec16(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, FORCEINLINE vec16(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) { T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) {
v[0] = v0; v[1] = v1; v[2] = v2; v[3] = v3; v[0] = v0; v[1] = v1; v[2] = v2; v[3] = v3;
v[4] = v4; v[5] = v5; v[6] = v6; v[7] = v7; v[4] = v4; v[5] = v5; v[6] = v6; v[7] = v7;
v[8] = v8; v[9] = v9; v[10] = v10; v[11] = v11; v[8] = v8; v[9] = v9; v[10] = v10; v[11] = v11;
@@ -200,21 +203,25 @@ struct vec16 {
}; };
PRE_ALIGN(16) struct __vec16_i8 : public vec16<int8_t> { PRE_ALIGN(16) struct __vec16_i8 : public vec16<int8_t> {
__vec16_i8() { } FORCEINLINE __vec16_i8() { }
__vec16_i8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, FORCEINLINE __vec16_i8(const __vec16_i8 &o);
int8_t v4, int8_t v5, int8_t v6, int8_t v7, FORCEINLINE __vec16_i8& operator =(const __vec16_i8 &o);
int8_t v8, int8_t v9, int8_t v10, int8_t v11, FORCEINLINE __vec16_i8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
int8_t v12, int8_t v13, int8_t v14, int8_t v15) int8_t v4, int8_t v5, int8_t v6, int8_t v7,
int8_t v8, int8_t v9, int8_t v10, int8_t v11,
int8_t v12, int8_t v13, int8_t v14, int8_t v15)
: vec16<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7, : vec16<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7,
v8, v9, v10, v11, v12, v13, v14, v15) { } v8, v9, v10, v11, v12, v13, v14, v15) { }
} POST_ALIGN(16); } POST_ALIGN(16);
PRE_ALIGN(32) struct __vec16_i16 : public vec16<int16_t> { PRE_ALIGN(32) struct __vec16_i16 : public vec16<int16_t> {
__vec16_i16() { } FORCEINLINE __vec16_i16() { }
__vec16_i16(int16_t v0, int16_t v1, int16_t v2, int16_t v3, FORCEINLINE __vec16_i16(const __vec16_i16 &o);
int16_t v4, int16_t v5, int16_t v6, int16_t v7, FORCEINLINE __vec16_i16& operator =(const __vec16_i16 &o);
int16_t v8, int16_t v9, int16_t v10, int16_t v11, FORCEINLINE __vec16_i16(int16_t v0, int16_t v1, int16_t v2, int16_t v3,
int16_t v12, int16_t v13, int16_t v14, int16_t v15) int16_t v4, int16_t v5, int16_t v6, int16_t v7,
int16_t v8, int16_t v9, int16_t v10, int16_t v11,
int16_t v12, int16_t v13, int16_t v14, int16_t v15)
: vec16<int16_t>(v0, v1, v2, v3, v4, v5, v6, v7, : vec16<int16_t>(v0, v1, v2, v3, v4, v5, v6, v7,
v8, v9, v10, v11, v12, v13, v14, v15) { } v8, v9, v10, v11, v12, v13, v14, v15) { }
} POST_ALIGN(32); } POST_ALIGN(32);
@@ -395,12 +402,23 @@ static FORCEINLINE VTYPE __shuffle2_##NAME(VTYPE v0, VTYPE v1, __vec16_i32 index
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
INSERT_EXTRACT(__vec1_i8, int8_t) static FORCEINLINE int8_t __extract_element(__vec1_i8 v, int index) { return ((int8_t *)&v)[index]; }
INSERT_EXTRACT(__vec1_i16, int16_t) static FORCEINLINE void __insert_element(__vec1_i8 *v, int index, int8_t val) { ((int8_t *)v)[index] = val; }
INSERT_EXTRACT(__vec1_i32, int32_t)
INSERT_EXTRACT(__vec1_i64, int64_t) static FORCEINLINE int16_t __extract_element(__vec1_i16 v, int index) { return ((int16_t *)&v)[index]; }
INSERT_EXTRACT(__vec1_f, float) static FORCEINLINE void __insert_element(__vec1_i16 *v, int index, int16_t val) { ((int16_t *)v)[index] = val; }
INSERT_EXTRACT(__vec1_d, double)
static FORCEINLINE int32_t __extract_element(__vec1_i32 v, int index) { return ((int32_t *)&v)[index]; }
static FORCEINLINE void __insert_element(__vec1_i32 *v, int index, int32_t val) { ((int32_t *)v)[index] = val; }
static FORCEINLINE int64_t __extract_element(__vec1_i64 v, int index) { return ((int64_t *)&v)[index]; }
static FORCEINLINE void __insert_element(__vec1_i64 *v, int index, int64_t val) { ((int64_t *)v)[index] = val; }
static FORCEINLINE float __extract_element(__vec1_f v, int index) { return ((float *)&v)[index]; }
static FORCEINLINE void __insert_element(__vec1_f *v, int index, float val) { ((float *)v)[index] = val; }
static FORCEINLINE double __extract_element(__vec1_d v, int index) { return ((double *)&v)[index]; }
static FORCEINLINE void __insert_element(__vec1_d *v, int index, double val) { ((double *)v)[index] = val; }
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// mask ops // mask ops
@@ -582,157 +600,157 @@ LOAD_STORE(__vec16_i16, int16_t)
// int32 // int32
static FORCEINLINE __vec16_i32 __add(__vec16_i32 a, __vec16_i32 b) { static FORCEINLINE __vec16_i32 __add(__vec16_i32 a, __vec16_i32 b) {
return _mm512_add_epi32((__m512i)a, (__m512i)b); return _mm512_add_epi32(a, b);
} }
static FORCEINLINE __vec16_i32 __sub(__vec16_i32 a, __vec16_i32 b) { static FORCEINLINE __vec16_i32 __sub(__vec16_i32 a, __vec16_i32 b) {
return _mm512_sub_epi32((__m512i)a, (__m512i)b); return _mm512_sub_epi32(a, b);
} }
static FORCEINLINE __vec16_i32 __mul(__vec16_i32 a, __vec16_i32 b) { static FORCEINLINE __vec16_i32 __mul(__vec16_i32 a, __vec16_i32 b) {
return _mm512_mullo_epi32((__m512i)a, (__m512i)b); return _mm512_mullo_epi32(a, b);
} }
static FORCEINLINE __vec16_i32 __udiv(__vec16_i32 a, __vec16_i32 b) { static FORCEINLINE __vec16_i32 __udiv(__vec16_i32 a, __vec16_i32 b) {
return _mm512_div_epu32((__m512i)a, (__m512i)b); return _mm512_div_epu32(a, b);
} }
static FORCEINLINE __vec16_i32 __sdiv(__vec16_i32 a, __vec16_i32 b) { static FORCEINLINE __vec16_i32 __sdiv(__vec16_i32 a, __vec16_i32 b) {
return _mm512_div_epi32((__m512i)a, (__m512i)b); return _mm512_div_epi32(a, b);
} }
static FORCEINLINE __vec16_i32 __urem(__vec16_i32 a, __vec16_i32 b) { static FORCEINLINE __vec16_i32 __urem(__vec16_i32 a, __vec16_i32 b) {
return _mm512_rem_epu32((__m512i)a, (__m512i)b); return _mm512_rem_epu32(a, b);
} }
static FORCEINLINE __vec16_i32 __srem(__vec16_i32 a, __vec16_i32 b) { static FORCEINLINE __vec16_i32 __srem(__vec16_i32 a, __vec16_i32 b) {
return _mm512_rem_epi32((__m512i)a, (__m512i)b); return _mm512_rem_epi32(a, b);
} }
static FORCEINLINE __vec16_i32 __or(__vec16_i32 a, __vec16_i32 b) { static FORCEINLINE __vec16_i32 __or(__vec16_i32 a, __vec16_i32 b) {
return _mm512_or_epi32((__m512i)a, (__m512i)b); return _mm512_or_epi32(a, b);
} }
static FORCEINLINE __vec16_i32 __and(__vec16_i32 a, __vec16_i32 b) { static FORCEINLINE __vec16_i32 __and(__vec16_i32 a, __vec16_i32 b) {
return _mm512_and_epi32((__m512i)a, (__m512i)b); return _mm512_and_epi32(a, b);
} }
static FORCEINLINE __vec16_i32 __xor(__vec16_i32 a, __vec16_i32 b) { static FORCEINLINE __vec16_i32 __xor(__vec16_i32 a, __vec16_i32 b) {
return _mm512_xor_epi32((__m512i)a, (__m512i)b); return _mm512_xor_epi32(a, b);
} }
static FORCEINLINE __vec16_i32 __shl(__vec16_i32 a, __vec16_i32 b) { static FORCEINLINE __vec16_i32 __shl(__vec16_i32 a, __vec16_i32 b) {
return _mm512_sllv_epi32((__m512i)a, (__m512i)b); return _mm512_sllv_epi32(a, b);
} }
static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, __vec16_i32 b) { static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, __vec16_i32 b) {
return _mm512_srlv_epi32((__m512i)a, (__m512i)b); return _mm512_srlv_epi32(a, b);
} }
static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, __vec16_i32 b) { static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, __vec16_i32 b) {
return _mm512_srav_epi32((__m512i)a, (__m512i)b); return _mm512_srav_epi32(a, b);
} }
static FORCEINLINE __vec16_i32 __shl(__vec16_i32 a, int32_t n) { static FORCEINLINE __vec16_i32 __shl(__vec16_i32 a, int32_t n) {
return _mm512_slli_epi32((__m512i)a, n); return _mm512_slli_epi32(a, n);
} }
static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, int32_t n) { static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, int32_t n) {
return _mm512_srli_epi32((__m512i)a, n); return _mm512_srli_epi32(a, n);
} }
static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, int32_t n) { static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, int32_t n) {
return _mm512_srai_epi32((__m512i)a, n); return _mm512_srai_epi32(a, n);
} }
static FORCEINLINE __vec16_i1 __equal_i32(const __vec16_i32 &a, const __vec16_i32 &b) { static FORCEINLINE __vec16_i1 __equal_i32(const __vec16_i32 &a, const __vec16_i32 &b) {
return _mm512_cmpeq_epi32_mask((__m512i)a, (__m512i)b); return _mm512_cmpeq_epi32_mask(a, b);
} }
static FORCEINLINE __vec16_i1 __equal_i32_and_mask(const __vec16_i32 &a, const __vec16_i32 &b, static FORCEINLINE __vec16_i1 __equal_i32_and_mask(const __vec16_i32 &a, const __vec16_i32 &b,
__vec16_i1 m) { __vec16_i1 m) {
return _mm512_mask_cmpeq_epi32_mask((__mmask16)m, (__m512i)a, (__m512i)b); return _mm512_mask_cmpeq_epi32_mask(m, a, b);
} }
static FORCEINLINE __vec16_i1 __not_equal_i32(__vec16_i32 a, __vec16_i32 b) { static FORCEINLINE __vec16_i1 __not_equal_i32(__vec16_i32 a, __vec16_i32 b) {
return _mm512_cmpneq_epi32_mask((__m512i)a, (__m512i)b); return _mm512_cmpneq_epi32_mask(a, b);
} }
static FORCEINLINE __vec16_i1 __not_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, static FORCEINLINE __vec16_i1 __not_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
__vec16_i1 m) { __vec16_i1 m) {
return _mm512_mask_cmpneq_epi32_mask((__mmask16)m, (__m512i)a, (__m512i)b); return _mm512_mask_cmpneq_epi32_mask(m, a, b);
} }
static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32(__vec16_i32 a, __vec16_i32 b) { static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32(__vec16_i32 a, __vec16_i32 b) {
return _mm512_cmple_epu32_mask((__m512i)a, (__m512i)b); return _mm512_cmple_epu32_mask(a, b);
} }
static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
__vec16_i1 m) { __vec16_i1 m) {
return _mm512_mask_cmple_epu32_mask((__mmask16)m, (__m512i)a, (__m512i)b); return _mm512_mask_cmple_epu32_mask(m, a, b);
} }
static FORCEINLINE __vec16_i1 __signed_less_equal_i32(__vec16_i32 a, __vec16_i32 b) { static FORCEINLINE __vec16_i1 __signed_less_equal_i32(__vec16_i32 a, __vec16_i32 b) {
return _mm512_cmple_epi32_mask((__m512i)a, (__m512i)b); return _mm512_cmple_epi32_mask(a, b);
} }
static FORCEINLINE __vec16_i1 __signed_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, static FORCEINLINE __vec16_i1 __signed_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
__vec16_i1 m) { __vec16_i1 m) {
return _mm512_mask_cmple_epi32_mask((__mmask16)m, (__m512i)a, (__m512i)b); return _mm512_mask_cmple_epi32_mask(m, a, b);
} }
static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) {
return _mm512_cmpge_epu32_mask((__m512i)a, (__m512i)b); return _mm512_cmpge_epu32_mask(a, b);
} }
static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
__vec16_i1 m) { __vec16_i1 m) {
return _mm512_mask_cmpge_epu32_mask((__mmask16)m, (__m512i)a, (__m512i)b); return _mm512_mask_cmpge_epu32_mask(m, a, b);
} }
static FORCEINLINE __vec16_i1 __signed_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) { static FORCEINLINE __vec16_i1 __signed_greater_equal_i32(__vec16_i32 a, __vec16_i32 b) {
return _mm512_cmpge_epi32_mask((__m512i)a, (__m512i)b); return _mm512_cmpge_epi32_mask(a, b);
} }
static FORCEINLINE __vec16_i1 __signed_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, static FORCEINLINE __vec16_i1 __signed_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
__vec16_i1 m) { __vec16_i1 m) {
return _mm512_mask_cmpge_epi32_mask((__mmask16)m, (__m512i)a, (__m512i)b); return _mm512_mask_cmpge_epi32_mask(m, a, b);
} }
static FORCEINLINE __vec16_i1 __unsigned_less_than_i32(__vec16_i32 a, __vec16_i32 b) { static FORCEINLINE __vec16_i1 __unsigned_less_than_i32(__vec16_i32 a, __vec16_i32 b) {
return _mm512_cmplt_epu32_mask((__m512i)a, (__m512i)b); return _mm512_cmplt_epu32_mask(a, b);
} }
static FORCEINLINE __vec16_i1 __unsigned_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, static FORCEINLINE __vec16_i1 __unsigned_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
__vec16_i1 m) { __vec16_i1 m) {
return _mm512_mask_cmplt_epu32_mask((__mmask16)m, (__m512i)a, (__m512i)b); return _mm512_mask_cmplt_epu32_mask(m, a, b);
} }
static FORCEINLINE __vec16_i1 __signed_less_than_i32(__vec16_i32 a, __vec16_i32 b) { static FORCEINLINE __vec16_i1 __signed_less_than_i32(__vec16_i32 a, __vec16_i32 b) {
return _mm512_cmplt_epi32_mask((__m512i)a, (__m512i)b); return _mm512_cmplt_epi32_mask(a, b);
} }
static FORCEINLINE __vec16_i1 __signed_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, static FORCEINLINE __vec16_i1 __signed_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
__vec16_i1 m) { __vec16_i1 m) {
return _mm512_mask_cmplt_epi32_mask((__mmask16)m, (__m512i)a, (__m512i)b); return _mm512_mask_cmplt_epi32_mask(m, a, b);
} }
static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32(__vec16_i32 a, __vec16_i32 b) { static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32(__vec16_i32 a, __vec16_i32 b) {
return _mm512_cmpgt_epu32_mask((__m512i)a, (__m512i)b); return _mm512_cmpgt_epu32_mask(a, b);
} }
static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
__vec16_i1 m) { __vec16_i1 m) {
return _mm512_mask_cmpgt_epu32_mask((__mmask16)m, (__m512i)a, (__m512i)b); return _mm512_mask_cmpgt_epu32_mask(m, a, b);
} }
static FORCEINLINE __vec16_i1 __signed_greater_than_i32(__vec16_i32 a, __vec16_i32 b) { static FORCEINLINE __vec16_i1 __signed_greater_than_i32(__vec16_i32 a, __vec16_i32 b) {
return _mm512_cmpgt_epi32_mask((__m512i)a, (__m512i)b); return _mm512_cmpgt_epi32_mask(a, b);
} }
static FORCEINLINE __vec16_i1 __signed_greater_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, static FORCEINLINE __vec16_i1 __signed_greater_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b,
__vec16_i1 m) { __vec16_i1 m) {
return _mm512_mask_cmpgt_epi32_mask((__mmask16)m, (__m512i)a, (__m512i)b); return _mm512_mask_cmpgt_epi32_mask(m, a, b);
} }
static FORCEINLINE __vec16_i32 __select(__vec16_i1 mask, static FORCEINLINE __vec16_i32 __select(__vec16_i1 mask,
@@ -744,14 +762,24 @@ static FORCEINLINE __vec16_i32 __select(bool cond, __vec16_i32 a, __vec16_i32 b)
return cond ? a : b; return cond ? a : b;
} }
static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int index) { return ((int32_t *)&v)[index]; } static FORCEINLINE int32_t __extract_element(__vec16_i32 v, unsigned int index) {
static FORCEINLINE void __insert_element(__vec16_i32 *v, int index, int32_t val) { ((int32_t *)v)[index] = val; } return ((int32_t *)&v)[index];
}
static FORCEINLINE void __insert_element(__vec16_i32 *v, unsigned int index, int32_t val) {
((int32_t *)v)[index] = val;
}
template <class RetVecType> __vec16_i32 __smear_i32(int32_t i); template <class RetVecType> __vec16_i32 __smear_i32(int32_t i);
template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) {
return _mm512_set_1to16_epi32(i); return _mm512_set_1to16_epi32(i);
} }
static const __vec16_i32 __ispc_one = __smear_i32<__vec16_i32>(1);
static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32);
static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1);
static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
template <class RetVecType> __vec16_i32 __setzero_i32(); template <class RetVecType> __vec16_i32 __setzero_i32();
template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() {
return _mm512_setzero_epi32(); return _mm512_setzero_epi32();
@@ -768,10 +796,8 @@ static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) {
} }
static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index) { static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index) {
__vec16_i32 shuffle((0+index)%8, (1+index)%8, (2+index)%8, (3+index)%8, __vec16_i32 idx = __smear_i32<__vec16_i32>(index);
(4+index)%8, (5+index)%8, (6+index)%8, (7+index)%8, __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0x7));
(8+index)%8, (9+index)%8, (10+index)%8, (11+index)%8,
(12+index)%8, (13+index)%8, (14+index), (15+index)%8);
return _mm512_mask_permutevar_epi32(v, 0xffff, shuffle, v); return _mm512_mask_permutevar_epi32(v, 0xffff, shuffle, v);
} }
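The vectorized __rotate_i32() above builds its permutation as (lane + index) & 7, just like the scalar version it replaces; a sketch of that index computation (illustration only, not part of this diff):

static inline void rotate_indices(int index, int shuffle[16]) {
    for (int lane = 0; lane < 16; ++lane)
        shuffle[lane] = (lane + index) & 7;   // same masking as the _mm512_and_epi32 with 0x7
}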
@@ -809,6 +835,16 @@ template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) {
// int64 // int64
static FORCEINLINE int64_t __extract_element(__vec16_i64 v, unsigned int index) {
return (uint64_t(((int32_t *)&v.v_hi)[index])<<32) | (uint64_t(((int32_t *)&v.v_lo)[index]));
}
static FORCEINLINE void __insert_element(__vec16_i64 *v, unsigned int index, int64_t val) {
((int32_t *)&v->v_hi)[index] = val>>32;
((int32_t *)&v->v_lo)[index] = val;
}
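Since __vec16_i64 stores each lane as two 32-bit words, the extract/insert pair above splits and reassembles the value around bit 32; a scalar sketch of that round trip (illustration only):

static inline uint64_t split_and_rejoin(uint64_t val, uint32_t *lo_word, uint32_t *hi_word) {
    *hi_word = (uint32_t)(val >> 32);               // what __insert_element stores in v_hi
    *lo_word = (uint32_t)val;                       // what __insert_element stores in v_lo
    return ((uint64_t)*hi_word << 32) | *lo_word;   // what __extract_element reads back
}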
template <class RetVecType> __vec16_i64 __setzero_i64(); template <class RetVecType> __vec16_i64 __setzero_i64();
template <> static FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() { template <> static FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() {
__vec16_i64 ret; __vec16_i64 ret;
@@ -833,29 +869,103 @@ static FORCEINLINE __vec16_i64 __add(const __vec16_i64 &a, const __vec16_i64 &b)
return __vec16_i64(lo, hi); return __vec16_i64(lo, hi);
} }
BINARY_OP(__vec16_i64, __sub, -) static FORCEINLINE __vec16_i64 __sub(const __vec16_i64 &a, const __vec16_i64 &b)
BINARY_OP(__vec16_i64, __mul, *) {
__mmask16 borrow = 0;
__m512i lo = _mm512_subsetb_epi32(a.v_lo, b.v_lo, &borrow);
__m512i hi = _mm512_sbb_epi32(a.v_hi, borrow, b.v_hi, &borrow);
return __vec16_i64(lo, hi);
}
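The new __sub() mirrors __add(): subtract the low words, then propagate the borrow into the high words. A per-lane scalar sketch (illustration only, assuming each lane is (hi << 32) | lo):

static inline void sub64_via_halves(uint32_t a_lo, uint32_t a_hi,
                                    uint32_t b_lo, uint32_t b_hi,
                                    uint32_t *r_lo, uint32_t *r_hi) {
    *r_lo = a_lo - b_lo;                   // low 32 bits
    uint32_t borrow = (a_lo < b_lo);       // did the low half underflow?
    *r_hi = a_hi - b_hi - borrow;          // propagate the borrow into the high half
}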
/*! 64x32 bit mul -- address computations often use a scale that we /*! 64x32 bit mul -- address computations often use a scale that we
know is 32 bits; and 32x64 is faster than 64x64 */ know is 32 bits; and 32x64 is faster than 64x64 */
static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &b) static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &b)
{ {
return __vec16_i64(_mm512_mullo_epi32(a.v,b.v_lo), return __vec16_i64(_mm512_mullo_epi32(a.v,b.v_lo),
_mm512_add_epi32(b.v_hi, _mm512_mulhi_epi32(a.v, b.v_lo))); _mm512_add_epi32(_mm512_mullo_epi32(a.v, b.v_hi),
_mm512_mulhi_epi32(a.v, b.v_lo)));
} }
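The fix here is in the high word: a * (((uint64_t)b_hi << 32) | b_lo) truncated to 64 bits needs mullo(a, b_hi) + mulhi(a, b_lo) in the upper half, not b_hi + mulhi(a, b_lo). An unsigned scalar sketch of the intended identity (illustration only, not part of this diff):

static inline uint64_t mul_32x64_lo(uint32_t a, uint32_t b_lo, uint32_t b_hi) {
    uint64_t p = (uint64_t)a * b_lo;                             // full 32x32 product of the low half
    uint32_t r_lo = (uint32_t)p;                                 // mullo(a, b_lo)
    uint32_t r_hi = (uint32_t)(a * b_hi) + (uint32_t)(p >> 32);  // mullo(a, b_hi) + mulhi(a, b_lo)
    return ((uint64_t)r_hi << 32) | r_lo;
}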
BINARY_OP(__vec16_i64, __or, |) static FORCEINLINE __vec16_i64 __mul(const __vec16_i64 &a, const __vec16_i64 &b)
BINARY_OP(__vec16_i64, __and, &) {
BINARY_OP(__vec16_i64, __xor, ^) __vec16_i32 lo = _mm512_mullo_epi32(a.v_lo,b.v_lo);
BINARY_OP(__vec16_i64, __shl, <<) __vec16_i32 hi_m1 = _mm512_mulhi_epi32(a.v_lo, b.v_lo);
__vec16_i32 hi_m2 = _mm512_mullo_epi32(a.v_hi, b.v_lo);
__vec16_i32 hi_m3 = _mm512_mullo_epi32(a.v_lo, b.v_hi);
__mmask16 carry = 0;
__vec16_i32 hi_p23 = _mm512_addsetc_epi32(hi_m2, hi_m1, &carry);
__vec16_i32 hi = _mm512_adc_epi32(hi_m3, carry, hi_p23, &carry);
return __vec16_i64(lo, hi);
}
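Only three partial products contribute to the low 64 bits (a_hi * b_hi lands entirely above bit 63), which is what the addsetc/adc pair above accumulates. An unsigned scalar sketch of the same arithmetic (illustration only):

static inline uint64_t mul_64x64_lo(uint32_t a_lo, uint32_t a_hi,
                                    uint32_t b_lo, uint32_t b_hi) {
    uint64_t p = (uint64_t)a_lo * b_lo;      // low-half product; p >> 32 corresponds to hi_m1
    uint32_t r_hi = (uint32_t)(p >> 32)
                  + a_hi * b_lo              // hi_m2, low 32 bits only
                  + a_lo * b_hi;             // hi_m3, low 32 bits only
    return ((uint64_t)r_hi << 32) | (uint32_t)p;
}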
static FORCEINLINE __vec16_i64 __sdiv(const __vec16_i64 &a, const __vec16_i64 &b)
{
__vec16_i64 ret;
for(int i=0; i<16; i++) {
int64_t dividend = __extract_element(a, i);
int64_t divisor = __extract_element(b, i);
int64_t quotient = dividend / divisor; // SVML
__insert_element(&ret, i, quotient);
}
return ret;
}
static FORCEINLINE __vec16_i64 __udiv(const __vec16_i64 &a, const __vec16_i64 &b)
{
__vec16_i64 ret;
for(int i=0; i<16; i++) {
uint64_t dividend = __extract_element(a, i);
uint64_t divisor = __extract_element(b, i);
uint64_t quotient = dividend / divisor; // SVML
__insert_element(&ret, i, quotient);
}
return ret;
}
static FORCEINLINE __vec16_i64 __or(__vec16_i64 a, __vec16_i64 b) {
return __vec16_i64(_mm512_or_epi32(a.v_lo, b.v_lo), _mm512_or_epi32(a.v_hi, b.v_hi));
}
static FORCEINLINE __vec16_i64 __and(__vec16_i64 a, __vec16_i64 b) {
return __vec16_i64(_mm512_and_epi32(a.v_lo, b.v_lo), _mm512_and_epi32(a.v_hi, b.v_hi));
}
static FORCEINLINE __vec16_i64 __xor(__vec16_i64 a, __vec16_i64 b) {
return __vec16_i64(_mm512_xor_epi32(a.v_lo, b.v_lo), _mm512_xor_epi32(a.v_hi, b.v_hi));
}
static FORCEINLINE __vec16_i64 __shl(__vec16_i64 a, __vec16_i64 b) {
__vec16_i32 xfer = _mm512_srlv_epi32(a.v_lo, _mm512_sub_epi32(__ispc_thirty_two, b.v_lo));
__vec16_i32 hi = _mm512_or_epi32(_mm512_sllv_epi32(a.v_hi, b.v_lo), xfer);
__vec16_i32 lo = _mm512_sllv_epi32(a.v_lo, b.v_lo);
return __vec16_i64(lo, hi);
}
static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 a, __vec16_i64 b) {
__vec16_i32 shift = _mm512_sub_epi32(__ispc_thirty_two, b.v_lo);
__vec16_i32 xfer = _mm512_and_epi32(_mm512_sllv_epi32(__ispc_ffffffff, shift), _mm512_sllv_epi32(a.v_hi, shift));
//__vec16_i32 xfer = _mm512_sllv_epi32(_mm512_and_epi32(a.v_hi,
// _mm512_sub_epi32(_mm512_sllv_epi32(__ispc_one, b.v_lo), __ispc_one)),
// _mm512_sub_epi32(__ispc_thirty_two, b.v_lo));
__vec16_i32 hi = _mm512_srlv_epi32(a.v_hi, b.v_lo);
__vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo));
return __vec16_i64(lo, hi);
}
static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 a, __vec16_i64 b) {
__vec16_i32 xfer = _mm512_sllv_epi32(_mm512_and_epi32(a.v_hi,
_mm512_sub_epi32(_mm512_sllv_epi32(__ispc_one, b.v_lo), __ispc_one)),
_mm512_sub_epi32(__ispc_thirty_two, b.v_lo));
__vec16_i32 hi = _mm512_srav_epi32(a.v_hi, b.v_lo);
__vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srlv_epi32(a.v_lo, b.v_lo));
return __vec16_i64(lo, hi);
}
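All three 64-bit shifts follow the same split pattern: shift each 32-bit half by n and OR in the bits that cross the 32-bit boundary. A scalar sketch for the logical right shift (illustration only, for shift counts strictly between 0 and 32):

static inline void lshr64_via_halves(uint32_t a_lo, uint32_t a_hi, unsigned int n,
                                     uint32_t *r_lo, uint32_t *r_hi) {
    *r_hi = a_hi >> n;                          // high half simply shifts down
    *r_lo = (a_lo >> n) | (a_hi << (32 - n));   // plus the bits that cross the boundary
}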
BINARY_OP_CAST(__vec16_i64, uint64_t, __udiv, /) BINARY_OP_CAST(__vec16_i64, uint64_t, __udiv, /)
BINARY_OP_CAST(__vec16_i64, int64_t, __sdiv, /) BINARY_OP_CAST(__vec16_i64, int64_t, __sdiv, /)
BINARY_OP_CAST(__vec16_i64, uint64_t, __urem, %) BINARY_OP_CAST(__vec16_i64, uint64_t, __urem, %)
BINARY_OP_CAST(__vec16_i64, int64_t, __srem, %) BINARY_OP_CAST(__vec16_i64, int64_t, __srem, %)
BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>)
BINARY_OP_CAST(__vec16_i64, int64_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>) SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>) SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>)
@@ -891,7 +1001,14 @@ CMP_OP(__vec16_i64, i64, int64_t, __signed_less_than, <)
CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_than, >) CMP_OP(__vec16_i64, i64, uint64_t, __unsigned_greater_than, >)
CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >) CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >)
SELECT(__vec16_i64) static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask,
__vec16_i64 a, __vec16_i64 b) {
__vec16_i64 ret;
ret.v_hi = _mm512_mask_mov_epi32(b.v_hi, mask.m, a.v_hi);
ret.v_lo = _mm512_mask_mov_epi32(b.v_lo, mask.m, a.v_lo);
return ret;
}
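__select() applies the same mask bit to both 32-bit halves of each 64-bit lane; per lane this is just a conditional move, sketched here for reference (illustration only):

static inline void select_i64_lanes(unsigned short mask, const int64_t a[16],
                                    const int64_t b[16], int64_t r[16]) {
    for (int i = 0; i < 16; ++i)
        r[i] = (mask & (1 << i)) ? a[i] : b[i];   // a where the mask bit is set, b elsewhere
}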
INSERT_EXTRACT(__vec16_i64, int64_t) INSERT_EXTRACT(__vec16_i64, int64_t)
static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, int index) static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, int index)
@@ -955,6 +1072,10 @@ template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) {
return ret; return ret;
} }
template <> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) {
return __load<64>(p);
}
template <int ALIGN> static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64 v) { template <int ALIGN> static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64 v) {
__m512i v1; __m512i v1;
__m512i v2; __m512i v2;
@@ -995,7 +1116,9 @@ template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) {
_mm512_store_epi64(((uint8_t*)p)+64, v1); _mm512_store_epi64(((uint8_t*)p)+64, v1);
} }
template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) {
__store<64>(p, v);
}
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// float // float
@@ -1369,6 +1492,10 @@ template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) {
return ret; return ret;
} }
template <> static FORCEINLINE __vec16_d __load<128>(const __vec16_d *p) {
return __load<64>(p);
}
template <int ALIGN> static FORCEINLINE void __store(__vec16_d *p, __vec16_d v) { template <int ALIGN> static FORCEINLINE void __store(__vec16_d *p, __vec16_d v) {
_mm512_extpackstorehi_pd(p, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); _mm512_extpackstorehi_pd(p, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
_mm512_extpackstorelo_pd(p, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE); _mm512_extpackstorelo_pd(p, v.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
@@ -1381,6 +1508,10 @@ template <> static FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v) {
_mm512_store_pd(((uint8_t*)p)+64, v.v2); _mm512_store_pd(((uint8_t*)p)+64, v.v2);
} }
template <> static FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) {
__store<64>(p, v);
}
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// casts // casts
@@ -1811,30 +1942,91 @@ static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) {
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// reductions // reductions
REDUCE_ADD(float, __vec16_f, __reduce_add_float) static const __vec16_i32 __ispc_s1(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);
REDUCE_MINMAX(float, __vec16_f, __reduce_min_float, <) static const __vec16_i32 __ispc_s2(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11);
REDUCE_MINMAX(float, __vec16_f, __reduce_max_float, >) static const __vec16_i32 __ispc_s3(2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13);
static const __vec16_i32 __ispc_s4(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
static FORCEINLINE uint32_t __reduce_add_i32(__vec16_i32 v) {
__vec16_i32 v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, __ispc_s1, v);
__vec16_i32 v2 = _mm512_mask_add_epi32(_mm512_undefined_epi32(), 0xFF00, v, v1);
__vec16_i32 v3 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xF000, __ispc_s2, v2);
__vec16_i32 v4 = _mm512_mask_add_epi32(_mm512_undefined_epi32(), 0xF000, v2, v3);
__vec16_i32 v5 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xC000, __ispc_s3, v4);
__vec16_i32 v6 = _mm512_mask_add_epi32(_mm512_undefined_epi32(), 0xC000, v4, v5);
__vec16_i32 v7 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0x8000, __ispc_s4, v6);
__vec16_i32 v8 = _mm512_mask_add_epi32(_mm512_undefined_epi32(), 0x8000, v6, v7);
return __extract_element(v8, 15);
}
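The reduction runs in log2(16) = 4 steps: each masked permute copies the lower half of the active lanes into the upper half (8, 4, 2, then 1 lanes) and accumulates there, so lane 15 ends up holding the total. A scalar sketch of the same pattern (illustration only; the min/max and float variants below differ only in the combining operation):

static inline int32_t reduce_add16(int32_t v[16]) {
    for (int stride = 8; stride >= 1; stride >>= 1)     // 8, 4, 2, 1 -- mirrors __ispc_s1..__ispc_s4
        for (int i = 15; i >= 16 - stride; --i)         // only the top 'stride' lanes are updated
            v[i] += v[i - stride];
    return v[15];                                       // matches __extract_element(v8, 15)
}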
static FORCEINLINE uint32_t __reduce_min_i32(__vec16_i32 v) {
__vec16_i32 v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, __ispc_s1, v);
__vec16_i32 v2 = _mm512_mask_min_epi32(_mm512_undefined_epi32(), 0xFF00, v, v1);
__vec16_i32 v3 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xF000, __ispc_s2, v2);
__vec16_i32 v4 = _mm512_mask_min_epi32(_mm512_undefined_epi32(), 0xF000, v2, v3);
__vec16_i32 v5 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xC000, __ispc_s3, v4);
__vec16_i32 v6 = _mm512_mask_min_epi32(_mm512_undefined_epi32(), 0xC000, v4, v5);
__vec16_i32 v7 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0x8000, __ispc_s4, v6);
__vec16_i32 v8 = _mm512_mask_min_epi32(_mm512_undefined_epi32(), 0x8000, v6, v7);
return __extract_element(v8, 15);
}
static FORCEINLINE uint32_t __reduce_max_i32(__vec16_i32 v) {
__vec16_i32 v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, __ispc_s1, v);
__vec16_i32 v2 = _mm512_mask_max_epi32(_mm512_undefined_epi32(), 0xFF00, v, v1);
__vec16_i32 v3 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xF000, __ispc_s2, v2);
__vec16_i32 v4 = _mm512_mask_max_epi32(_mm512_undefined_epi32(), 0xF000, v2, v3);
__vec16_i32 v5 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xC000, __ispc_s3, v4);
__vec16_i32 v6 = _mm512_mask_max_epi32(_mm512_undefined_epi32(), 0xC000, v4, v5);
__vec16_i32 v7 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0x8000, __ispc_s4, v6);
__vec16_i32 v8 = _mm512_mask_max_epi32(_mm512_undefined_epi32(), 0x8000, v6, v7);
return __extract_element(v8, 15);
}
static FORCEINLINE float __reduce_add_float(__vec16_f v) {
__vec16_f v1 = _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, __ispc_s1, _mm512_castps_si512(v)));
__vec16_f v2 = _mm512_mask_add_ps(_mm512_undefined_ps(), 0xFF00, v, v1);
__vec16_f v3 = _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xF000, __ispc_s2, _mm512_castps_si512(v2)));
__vec16_f v4 = _mm512_mask_add_ps(_mm512_undefined_ps(), 0xF000, v2, v3);
__vec16_f v5 = _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xC000, __ispc_s3, _mm512_castps_si512(v4)));
__vec16_f v6 = _mm512_mask_add_ps(_mm512_undefined_ps(), 0xC000, v4, v5);
__vec16_f v7 = _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0x8000, __ispc_s4, _mm512_castps_si512(v6)));
__vec16_f v8 = _mm512_mask_add_ps(_mm512_undefined_ps(), 0x8000, v6, v7);
return __extract_element(v8, 15);
}
static FORCEINLINE float __reduce_min_float(__vec16_f v) {
__vec16_f v1 = _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, __ispc_s1, _mm512_castps_si512(v)));
__vec16_f v2 = _mm512_mask_min_ps(_mm512_undefined_ps(), 0xFF00, v, v1);
__vec16_f v3 = _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xF000, __ispc_s2, _mm512_castps_si512(v2)));
__vec16_f v4 = _mm512_mask_min_ps(_mm512_undefined_ps(), 0xF000, v2, v3);
__vec16_f v5 = _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xC000, __ispc_s3, _mm512_castps_si512(v4)));
__vec16_f v6 = _mm512_mask_min_ps(_mm512_undefined_ps(), 0xC000, v4, v5);
__vec16_f v7 = _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0x8000, __ispc_s4, _mm512_castps_si512(v6)));
__vec16_f v8 = _mm512_mask_min_ps(_mm512_undefined_ps(), 0x8000, v6, v7);
return __extract_element(v8, 15);
}
static FORCEINLINE float __reduce_max_float(__vec16_f v) {
__vec16_f v1 = _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, __ispc_s1, _mm512_castps_si512(v)));
__vec16_f v2 = _mm512_mask_max_ps(_mm512_undefined_ps(), 0xFF00, v, v1);
__vec16_f v3 = _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xF000, __ispc_s2, _mm512_castps_si512(v2)));
__vec16_f v4 = _mm512_mask_max_ps(_mm512_undefined_ps(), 0xF000, v2, v3);
__vec16_f v5 = _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xC000, __ispc_s3, _mm512_castps_si512(v4)));
__vec16_f v6 = _mm512_mask_max_ps(_mm512_undefined_ps(), 0xC000, v4, v5);
__vec16_f v7 = _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0x8000, __ispc_s4, _mm512_castps_si512(v6)));
__vec16_f v8 = _mm512_mask_max_ps(_mm512_undefined_ps(), 0x8000, v6, v7);
return __extract_element(v8, 15);
}
REDUCE_ADD(double, __vec16_d, __reduce_add_double) REDUCE_ADD(double, __vec16_d, __reduce_add_double)
REDUCE_MINMAX(double, __vec16_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec16_d, __reduce_min_double, <)
REDUCE_MINMAX(double, __vec16_d, __reduce_max_double, >) REDUCE_MINMAX(double, __vec16_d, __reduce_max_double, >)
REDUCE_ADD(uint32_t, __vec16_i32, __reduce_add_int32)
REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_min_int32, <)
REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_max_int32, >)
REDUCE_ADD(uint32_t, __vec16_i32, __reduce_add_uint32)
REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_min_uint32, <)
REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_max_uint32, >)
REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_int64) REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_int64)
REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_min_int64, <)
REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_max_int64, >) REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_max_int64, >)
REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_uint64)
REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_min_uint64, <)
REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >)
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// masked load/store // masked load/store
/* /*
@@ -2084,13 +2276,13 @@ __gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets
return ret; return ret;
} }
//GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) // There is no gather instruction with 64-bit offsets in KNC.
//GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) // So we cannot implement __gather_base_offsets64_*()
//GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
//GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) //GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16)
//GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16)
//GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32)
//GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) //GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64)
//GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64)
#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC)
/* /*
@@ -2112,28 +2304,8 @@ GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __gather64_i16)
GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i32, __gather32_i32) GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i32, __gather32_i32)
GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __gather64_i32) GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __gather64_i32)
GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i32, __gather32_i64) GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i32, __gather32_i64)
// GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64) GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64)
/*
static __forceinline __vec16_i32 __gather64_i32(__vec16_i64 ptrs, __vec16_i1 mask) {
__vec16_i32 ret;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0) {
int32_t *ptr = (int32_t *)ptrs.v[i];
ret.v[i] = *ptr;
}
return ret;
}
*/
/*
static FORCEINLINE __vec16_i32 __gather64_i32(__vec16_i64 ptrs, __vec16_i1 mask) {
// Loop is generated by intrinsic
__vec16_i32 ret = _mm512_mask_i32extgather_epi32(tmp, mask, offsets, base,
_MM_UPCONV_EPI32_NONE, 1,
_MM_HINT_NONE);
return ret;
}
*/
// scatter // scatter
//SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8) //SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8)