Merge pull request #862 from ncos/knc-backend-merge

Modification of 'knc.h'
Dmitry Babokin committed 2014-10-02 13:57:45 +04:00
2 changed files with 1227 additions and 935 deletions

View File

@@ -526,11 +526,11 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v
     *p = v;
 }
-template <class RetVecType> RetVecType __smear_i1(int i);
-template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { return i?0xFFFF:0x0; }
-template <class RetVecType> RetVecType __setzero_i1();
-template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { return 0; }
+template <class RetVecType> static RetVecType __smear_i1(int i);
+template <> FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) { return i?0xFFFF:0x0; }
+template <class RetVecType> static RetVecType __setzero_i1();
+template <> FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() { return 0; }
 template <class RetVecType> __vec16_i1 __undef_i1();
 template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() { return __vec16_i1(); }
@@ -678,8 +678,8 @@ static FORCEINLINE __vec16_i32 __select( bool cond, __vec16_i32 a, __vec16_
 static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int32_t index) { return v[index]; }
 static FORCEINLINE void __insert_element (__vec16_i32 *v, uint32_t index, int32_t val) { (*v)[index] = val; }
-template <class RetVecType> RetVecType __smear_i32(int32_t i);
-template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); }
+template <class RetVecType> RetVecType static __smear_i32(int32_t i);
+template <> FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) { return _mm512_set1_epi32(i); }
 static const __vec16_i32 __ispc_one = __smear_i32<__vec16_i32>(1);
 static const __vec16_i32 __ispc_zero = __smear_i32<__vec16_i32>(0);
@@ -687,11 +687,11 @@ static const __vec16_i32 __ispc_thirty_two = __smear_i32<__vec16_i32>(32);
 static const __vec16_i32 __ispc_ffffffff = __smear_i32<__vec16_i32>(-1);
 static const __vec16_i32 __ispc_stride1(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-template <class RetVecType> RetVecType __setzero_i32();
-template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { return _mm512_setzero_epi32(); }
-template <class RetVecType> RetVecType __undef_i32();
-template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { return __vec16_i32(); }
+template <class RetVecType> static RetVecType __setzero_i32();
+template <> FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() { return _mm512_setzero_epi32(); }
+template <class RetVecType> static RetVecType __undef_i32();
+template <> FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() { return __vec16_i32(); }
 static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) { return _mm512_mask_permutevar_epi32(v, 0xFFFF, _mm512_set1_epi32(index), v); }
@@ -743,11 +743,11 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32
 }
 #if 0 /* knc::fails ./tests/foreach-25.ispc ./tests/forach-26.ispc ./tests/foreach-27.ispc */
-template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p)
+template <> FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p)
 {
     return _mm512_load_epi32(p);
 }
-template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v)
+template <> FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v)
 {
     _mm512_store_epi32(p, v);
 }
@@ -1018,21 +1018,21 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64
 }
 #if 0 /* knc::fails as with _i32 this may generate fails ... so commetining it out */
-template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p)
+template <> FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p)
 {
     __m512i v2 = _mm512_load_epi32(p);
     __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64);
     return __vec16_i64(v2,v1);
 }
-template <> static FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { return __load<64>(p); }
-template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v)
+template <> FORCEINLINE __vec16_i64 __load<128>(const __vec16_i64 *p) { return __load<64>(p); }
+template <> FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v)
 {
     __m512i v1 = v.v2;
     __m512i v2 = v.v1;
     _mm512_store_epi64(p, v2);
     _mm512_store_epi64(((uint8_t*)p)+64, v1);
 }
-template <> static FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); }
+template <> FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { __store<64>(p, v); }
 #endif
@@ -1068,14 +1068,14 @@ static FORCEINLINE __vec16_f __select( bool cond, __vec16_f a, __vec16_f b)
 static FORCEINLINE float __extract_element(__vec16_f v, uint32_t index) { return v[index]; }
 static FORCEINLINE void __insert_element(__vec16_f *v, uint32_t index, float val) { (*v)[index] = val; }
-template <class RetVecType> RetVecType __smear_float(float f);
-template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { return _mm512_set_1to16_ps(f); }
-template <class RetVecType> RetVecType __setzero_float();
-template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { return _mm512_setzero_ps(); }
-template <class RetVecType> RetVecType __undef_float();
-template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() { return __vec16_f(); }
+template <class RetVecType> static RetVecType __smear_float(float f);
+template <> FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) { return _mm512_set_1to16_ps(f); }
+template <class RetVecType> static RetVecType __setzero_float();
+template <> FORCEINLINE __vec16_f __setzero_float<__vec16_f>() { return _mm512_setzero_ps(); }
+template <class RetVecType> static RetVecType __undef_float();
+template <> FORCEINLINE __vec16_f __undef_float<__vec16_f>() { return __vec16_f(); }
 static FORCEINLINE __vec16_f __broadcast_float(__vec16_f _v, int index)
 {
@@ -1132,12 +1132,12 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_f *p, __vec16_f v)
 }
 #if 0 /* knc::fails ./tests/gs-improve-progindex.ispc with segfault */
-template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p)
+template <> FORCEINLINE __vec16_f __load<64>(const __vec16_f *p)
 {
     return _mm512_load_ps(p);
 }
 /* this one doesn't fail but it is commented out for completeness, no aligned load/stores */
-template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v)
+template <> FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v)
 {
     _mm512_store_ps(p, v);
 }
@@ -1310,14 +1310,14 @@ static FORCEINLINE __vec16_d __select(bool cond, __vec16_d a, __vec16_d b)
 static FORCEINLINE double __extract_element(__vec16_d v, uint32_t index) { return v[index]; }
 static FORCEINLINE void __insert_element(__vec16_d *v, uint32_t index, double val) { (*v)[index] = val; }
-template <class RetVecType> RetVecType __smear_double(double d);
-template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { return __vec16_d(_mm512_set1_pd(d), _mm512_set1_pd(d)); }
-template <class RetVecType> RetVecType __setzero_double();
-template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { return __vec16_d(_mm512_setzero_pd(), _mm512_setzero_pd()); }
-template <class RetVecType> RetVecType __undef_double();
-template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() { return __vec16_d(); }
+template <class RetVecType> static RetVecType __smear_double(double d);
+template <> FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) { return __vec16_d(_mm512_set1_pd(d), _mm512_set1_pd(d)); }
+template <class RetVecType> static RetVecType __setzero_double();
+template <> FORCEINLINE __vec16_d __setzero_double<__vec16_d>() { return __vec16_d(_mm512_setzero_pd(), _mm512_setzero_pd()); }
+template <class RetVecType> static RetVecType __undef_double();
+template <> FORCEINLINE __vec16_d __undef_double<__vec16_d>() { return __vec16_d(); }
 #define CASTD2F(_v_, _v_hi_, _v_lo_) \
     __vec16_f _v_hi_, _v_lo_; \
@@ -1391,17 +1391,17 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_d *p, __vec16_d v)
 #if 0 /* knc::fails as with _f this may generate fails ... so commetining it out */
-template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p)
+template <> FORCEINLINE __vec16_d __load<64>(const __vec16_d *p)
 {
     return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64));
 }
-template <> static FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v)
+template <> FORCEINLINE void __store<64>(__vec16_d *p, __vec16_d v)
 {
     _mm512_store_pd(p, v.v1);
     _mm512_store_pd(((uint8_t*)p)+64, v.v2);
 }
-template <> static FORCEINLINE __vec16_d __load <128>(const __vec16_d *p) { return __load<64>(p); }
-template <> static FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { __store<64>(p, v); }
+template <> FORCEINLINE __vec16_d __load <128>(const __vec16_d *p) { return __load<64>(p); }
+template <> FORCEINLINE void __store<128>(__vec16_d *p, __vec16_d v) { __store<64>(p, v); }
 #endif
 ///////////////////////////////////////////////////////////////////////////

View File

@@ -1,5 +1,5 @@
-/*
-  Copyright (c) 2012-2014, Intel Corporation
+/**
+  Copyright (c) 2010-2014, Intel Corporation
   All rights reserved.
   Redistribution and use in source and binary forms, with or without
@@ -44,6 +44,16 @@
 #include <iostream> // for operator<<(m512[i])
 #include <iomanip>  // for operator<<(m512[i])
+#if 0
+#define STRING(x) #x
+#define TOSTRING(x) STRING(x)
+#define PING std::cout << __FILE__ << " (" << __LINE__ << "): " << __FUNCTION__ << std::endl
+#define PRINT(x) std::cout << STRING(x) << " = " << (x) << std::endl
+#define PRINT2(x,y) std::cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << std::endl
+#define PRINT3(x,y,z) std::cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << std::endl
+#define PRINT4(x,y,z,w) std::cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << std::endl
+#endif
 #define FORCEINLINE __forceinline
 #ifdef _MSC_VER
 #define PRE_ALIGN(x) /*__declspec(align(x))*/
@@ -75,7 +85,44 @@ typedef int64_t __vec1_i64;
 struct __vec16_i32;
+#if 0
+/* (iw) actually, this *SHOULD* be the right implementation for a
+   vec16_i1: this one is a class that can have a constructor (which
+   ISPC sometimes emits for these vectors...) This version might
+   not be working with embree's ISPC bindings, probably because
+   embree still uses the 'wrong' implementation */
+typedef struct PRE_ALIGN(2) __vec16_i1
+{
+    FORCEINLINE operator __mmask16() const { return v; }
+    FORCEINLINE __vec16_i1() { }
+    FORCEINLINE __vec16_i1(const __mmask16 &vv) : v(vv) { }
+    FORCEINLINE __vec16_i1(bool v0, bool v1, bool v2, bool v3,
+                           bool v4, bool v5, bool v6, bool v7,
+                           bool v8, bool v9, bool v10, bool v11,
+                           bool v12, bool v13, bool v14, bool v15) {
+        v = ((v0 & 1) |
+             ((v1 & 1) << 1) |
+             ((v2 & 1) << 2) |
+             ((v3 & 1) << 3) |
+             ((v4 & 1) << 4) |
+             ((v5 & 1) << 5) |
+             ((v6 & 1) << 6) |
+             ((v7 & 1) << 7) |
+             ((v8 & 1) << 8) |
+             ((v9 & 1) << 9) |
+             ((v10 & 1) << 10) |
+             ((v11 & 1) << 11) |
+             ((v12 & 1) << 12) |
+             ((v13 & 1) << 13) |
+             ((v14 & 1) << 14) |
+             ((v15 & 1) << 15));
+    }
+    __mmask16 v;
+} POST_ALIGN(2) __vec16_i1;
+#else
 typedef __mmask16 POST_ALIGN(2) __vec16_i1;
+#endif
 typedef struct PRE_ALIGN(64) __vec16_f {
     FORCEINLINE operator __m512() const { return v; }
@@ -167,14 +214,14 @@ struct vec16 {
 PRE_ALIGN(16) struct __vec16_i8 : public vec16<int8_t> {
     FORCEINLINE __vec16_i8() { }
-    FORCEINLINE __vec16_i8(const __vec16_i8 &o);
-    FORCEINLINE __vec16_i8& operator =(const __vec16_i8 &o);
-    FORCEINLINE __vec16_i8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
-                           int8_t v4, int8_t v5, int8_t v6, int8_t v7,
-                           int8_t v8, int8_t v9, int8_t v10, int8_t v11,
-                           int8_t v12, int8_t v13, int8_t v14, int8_t v15)
+    FORCEINLINE __vec16_i8(const int8_t v0, const int8_t v1, const int8_t v2, const int8_t v3,
+                           const int8_t v4, const int8_t v5, const int8_t v6, const int8_t v7,
+                           const int8_t v8, const int8_t v9, const int8_t v10, const int8_t v11,
+                           const int8_t v12, const int8_t v13, const int8_t v14, const int8_t v15)
         : vec16<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7,
                         v8, v9, v10, v11, v12, v13, v14, v15) { }
+    FORCEINLINE __vec16_i8(const __vec16_i8 &o);
+    FORCEINLINE __vec16_i8& operator =(const __vec16_i8 &o);
 } POST_ALIGN(16);
 PRE_ALIGN(32) struct __vec16_i16 : public vec16<int16_t> {
@@ -215,6 +262,28 @@ inline std::ostream &operator<<(std::ostream &out, const __m512 &v)
     return out;
 }
+inline std::ostream &operator<<(std::ostream &out, const __vec16_i8 &v)
+{
+    out << "[";
+    for (int i=0;i<16;i++)
+        out << (i?",":"") << std::dec << std::setw(8) << (int)((unsigned char*)&v)[i] << std::dec;
+    //  out << (i?",":"") << std::hex << std::setw(8) << ((int*)&v)[i] << std::dec;
+    out << "]" << std::flush;
+    return out;
+}
+inline std::ostream &operator<<(std::ostream &out, const __vec16_i64 &v)
+{
+    out << "[";
+    uint32_t *ptr = (uint32_t*)&v;
+    for (int i=0;i<16;i++) {
+        uint64_t val = (uint64_t(ptr[i])<<32)+ptr[i+16];
+        out << (i?",":"") << ((int*)val);
+    }
+    out << "]" << std::flush;
+    return out;
+}
 ///////////////////////////////////////////////////////////////////////////
 // macros...
@@ -299,6 +368,20 @@ static FORCEINLINE bool __extract_element(__vec16_i1 mask, uint32_t index) {
     return (mask & (1 << index)) ? true : false;
 }
+static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, uint32_t index)
+{
+    //uint *src = (uint *)&v;
+    const uint *src = (const uint *)&v;
+    return src[index+16] | (uint64_t(src[index]) << 32);
+}
 /*
 static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index,
                                          bool val) {
@@ -557,9 +640,13 @@ static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) {
     return _mm512_set1_epi32(val);
 }
+static FORCEINLINE __vec16_i32 __cast_trunc(__vec16_i32, const __vec16_i64 i64) {
+    return __vec16_i32(i64.v_lo);
+}
 static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index) {
     __vec16_i32 idx = __smear_i32<__vec16_i32>(index);
-    __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0x7));
+    __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0xf));
     return _mm512_mask_permutevar_epi32(v, 0xffff, shuffle, v);
 }
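A note on the 0x7 -> 0xf fix above: the shuffle index must wrap modulo the full 16-lane width, not 8. A minimal scalar sketch of the intended __rotate_i32 semantics (reference only, not part of the patch; the helper name is hypothetical):

    // Lane i of the result reads lane (i + index) mod 16 of the input;
    // the & 0xf mirrors the mask applied to the shuffle indices above.
    static void rotate_i32_ref(const int32_t in[16], int32_t out[16], int index) {
        for (int i = 0; i < 16; ++i)
            out[i] = in[unsigned(i + index) & 0xf];
    }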
@@ -598,11 +685,25 @@ template <> FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) {
 ///////////////////////////////////////////////////////////////////////////
 // int64
 ///////////////////////////////////////////////////////////////////////////
-static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, uint32_t index)
+static FORCEINLINE
+void __masked_store_i64(void *p, const __vec16_i64 &v, __vec16_i1 mask)
 {
-    uint *src = (uint *)&v;
-    return src[index+16] | (int64_t(src[index]) << 32);
+    __m512i v1;
+    __m512i v2;
+    v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA,
+                                      _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8),
+                                      v.v_hi);
+    v1 = _mm512_mask_permutevar_epi32(v1, 0x5555,
+                                      _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8),
+                                      v.v_lo);
+    v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA,
+                                      _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
+                                      v.v_hi);
+    v2 = _mm512_mask_permutevar_epi32(v2, 0x5555,
+                                      _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
+                                      v.v_lo);
+    _mm512_mask_store_epi64(p, mask, v2);
+    _mm512_mask_store_epi64(((uint8_t*)p)+64, mask>>8, v1);
 }
 static FORCEINLINE void __insert_element(__vec16_i64 *v, uint32_t index, int64_t val) {
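For reference, the permute pattern in the new __masked_store_i64 interleaves the split 32-bit halves back into sixteen contiguous 64-bit lanes before the two masked stores. A scalar sketch of the intended result, assuming the v_lo/v_hi split layout used throughout this header (hypothetical helper, not part of the patch):

    #include <cstdint>

    // Lane i is written only when bit i of 'mask' is set; its value is the
    // 64-bit integer reassembled from the low and high 32-bit halves.
    static void masked_store_i64_ref(uint64_t *dst, const uint32_t lo[16],
                                     const uint32_t hi[16], uint16_t mask) {
        for (int lane = 0; lane < 16; ++lane)
            if (mask & (1u << lane))
                dst[lane] = (uint64_t(hi[lane]) << 32) | lo[lane];
    }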
@@ -704,6 +805,13 @@ static FORCEINLINE __vec16_i64 __shl(__vec16_i64 a, __vec16_i64 b) {
     return __vec16_i64(lo, hi);
 }
+static FORCEINLINE __vec16_i64 __shl(__vec16_i64 a, unsigned long long b) {
+    __vec16_i32 hi = _mm512_or_epi32(_mm512_slli_epi32(a.v_hi, b),
+                                     _mm512_srli_epi32(a.v_lo, 32-b));
+    __vec16_i32 lo = _mm512_slli_epi32(a.v_lo, b);
+    return __vec16_i64(lo, hi);
+}
 static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 a, __vec16_i64 b) {
     __vec16_i32 shift = _mm512_sub_epi32(__ispc_thirty_two, b.v_lo);
     __vec16_i32 xfer = _mm512_and_epi32(_mm512_sllv_epi32(__ispc_ffffffff, shift), _mm512_sllv_epi32(a.v_hi, shift));
@@ -724,6 +832,16 @@ static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 a, __vec16_i64 b) {
     return __vec16_i64(lo, hi);
 }
+static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 a, unsigned long long b) {
+    __vec16_i32 xfer
+        = _mm512_slli_epi32(_mm512_and_epi32(a.v_hi,
+                                             _mm512_set1_epi32((1<<b)-1)),
+                            32-b);
+    __vec16_i32 hi = _mm512_srai_epi32(a.v_hi, b);
+    __vec16_i32 lo = _mm512_or_epi32(xfer, _mm512_srli_epi32(a.v_lo, b));
+    return __vec16_i64(lo, hi);
+}
 static FORCEINLINE __vec16_i1 __equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) {
     const __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo);
     return _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi);
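The new immediate-count __shl/__ashr overloads above move the bits that cross the 32-bit boundary between the two halves explicitly. A one-lane scalar sketch of the arithmetic, assuming a shift count 0 < b < 32 as the slli/srli/srai intrinsics require (reference only, names hypothetical):

    #include <cstdint>

    // 64-bit left shift built from two 32-bit halves.
    static uint64_t shl64_ref(uint32_t lo, uint32_t hi, unsigned b) {
        uint32_t new_hi = (hi << b) | (lo >> (32 - b));   // bits carried up from lo
        uint32_t new_lo = lo << b;
        return (uint64_t(new_hi) << 32) | new_lo;
    }

    // 64-bit arithmetic right shift; the low b bits of hi are transferred down.
    static int64_t ashr64_ref(uint32_t lo, uint32_t hi, unsigned b) {
        uint32_t xfer   = (hi & ((1u << b) - 1u)) << (32 - b);
        int32_t  new_hi = int32_t(hi) >> b;               // keeps the sign, like srai
        uint32_t new_lo = xfer | (lo >> b);
        return int64_t((uint64_t(uint32_t(new_hi)) << 32) | new_lo);
    }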
@@ -731,9 +849,9 @@ static FORCEINLINE __vec16_i1 __equal_i64(const __vec16_i64 &a, const __vec16_i6
 static FORCEINLINE __vec16_i1 __equal_i64_and_mask(const __vec16_i64 &a, const __vec16_i64 &b,
                                                    __vec16_i1 mask) {
-    __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo,b.v_lo);
+    __mmask16 lo_match = _mm512_mask_cmpeq_epi32_mask((__mmask16)mask, a.v_lo,b.v_lo);
     __mmask16 full_match = _mm512_mask_cmpeq_epi32_mask(lo_match,a.v_hi,b.v_hi);
-    return _mm512_kand(full_match, (__mmask16)mask);
+    return full_match;
 }
 static FORCEINLINE __vec16_i1 __not_equal_i64(const __vec16_i64 &a, const __vec16_i64 &b) {
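The rewritten __equal_i64_and_mask folds the caller's mask into the first masked compare instead of AND-ing it in afterwards; the result is the same, one k-register op cheaper. Scalar model of the contract (reference only, helper name hypothetical):

    #include <cstdint>

    // Bit i of the result: mask bit i set AND both 32-bit halves equal.
    static uint16_t equal_i64_and_mask_ref(const uint32_t alo[16], const uint32_t ahi[16],
                                           const uint32_t blo[16], const uint32_t bhi[16],
                                           uint16_t mask) {
        uint16_t r = 0;
        for (int i = 0; i < 16; ++i)
            if (((mask >> i) & 1) && alo[i] == blo[i] && ahi[i] == bhi[i])
                r |= uint16_t(1u << i);
        return r;
    }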
@@ -762,10 +880,11 @@ template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l)
 template <int ALIGN> static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) {
     __vec16_i32 v1;
     __vec16_i32 v2;
-    v2 = _mm512_extloadunpacklo_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    v2 = _mm512_extloadunpackhi_epi32(v2, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    v1 = _mm512_extloadunpacklo_epi32(v1, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    v1 = _mm512_extloadunpackhi_epi32(v1, (uint8_t*)p+128, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    const uint8_t*ptr = (const uint8_t*)p;
+    v2 = _mm512_extloadunpacklo_epi32(v2, ptr, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    v2 = _mm512_extloadunpackhi_epi32(v2, ptr+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    v1 = _mm512_extloadunpacklo_epi32(v1, ptr+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    v1 = _mm512_extloadunpackhi_epi32(v1, ptr+128, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
     __vec16_i64 ret;
     ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0xFF00,
@@ -850,6 +969,68 @@ template <> FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) {
     __store<64>(p, v);
 }
+/*! gather vector of 64-bit ints from addresses pointing to uniform ints
+  (iw) WARNING: THIS CODE ONLY WORKS FOR GATHERS FROM ARRAYS OF
+  ***UNIFORM*** INT64's/POINTERS. (problem is that ispc doesn't
+  expose whether it's from array of uniform or array of varying
+  poitners, so in here there's no way to tell - only thing we can do
+  is pick one...
+ */
+static FORCEINLINE __vec16_i64
+__gather_base_offsets32_i64(uint8_t *base, uint32_t scale, __vec16_i32 offsets,
+                            __vec16_i1 mask) {
+    __vec16_i64 ret;
+    ret.v_lo = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets,
+                                              base, _MM_UPCONV_EPI32_NONE, scale,
+                                              _MM_HINT_NONE);
+    ret.v_hi = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets,
+                                              base+4, _MM_UPCONV_EPI32_NONE, scale,
+                                              _MM_HINT_NONE);
+    return ret;
+}
+/*! gather vector of 64-bit ints from addresses pointing to uniform ints
+  (iw) WARNING: THIS CODE ONLY WORKS FOR GATHERS FROM ARRAYS OF
+  ***UNIFORM*** INT64's/POINTERS. (problem is that ispc doesn't
+  expose whether it's from array of uniform or array of varying
+  poitners, so in here there's no way to tell - only thing we can do
+  is pick one...
+ */
+static FORCEINLINE __vec16_i64
+__gather64_i64(__vec16_i64 addr, __vec16_i1 mask)
+{
+    __vec16_i64 ret;
+    // There is no gather instruction with 64-bit offsets in KNC.
+    // We have to manually iterate over the upper 32 bits ;-)
+    __vec16_i1 still_to_do = mask;
+    const __vec16_i32 signed_offsets = _mm512_add_epi32(addr.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
+    while (still_to_do) {
+        int first_active_lane = _mm_tzcnt_32((int)still_to_do);
+        const uint32_t &hi32 = ((uint*)&addr.v_hi)[first_active_lane];
+        __vec16_i1 match = _mm512_mask_cmp_epi32_mask(still_to_do,addr.v_hi,
+                                                      __smear_i32<__vec16_i32>((int32_t)hi32),
+                                                      _MM_CMPINT_EQ);
+        void * base = (void*)((((unsigned long)hi32) << 32) + (unsigned long)(-(long)INT_MIN));
+        ret.v_lo = _mm512_mask_i32extgather_epi32(ret.v_lo, match, signed_offsets,
+                                                  base, _MM_UPCONV_EPI32_NONE, 1,
+                                                  _MM_HINT_NONE);
+        ret.v_hi = _mm512_mask_i32extgather_epi32(ret.v_hi, match, signed_offsets,
+                                                  base+4, _MM_UPCONV_EPI32_NONE, 1,
+                                                  _MM_HINT_NONE);
+        still_to_do = _mm512_kxor(match, still_to_do);
+    }
+    return ret;
+}
 ///////////////////////////////////////////////////////////////////////////
 // float
 ///////////////////////////////////////////////////////////////////////////
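The two new gathers above work around the lack of a 64-bit-offset gather on KNC: __gather64_i64 repeatedly picks the first still-active lane, gathers every lane that shares its upper 32 address bits with one 32-bit-offset gather, and clears those lanes from the to-do mask. A scalar model of that grouping loop, assuming the v_lo/v_hi address split (hypothetical names, reference only):

    #include <cstdint>

    static void gather64_i64_ref(const uint32_t addr_lo[16], const uint32_t addr_hi[16],
                                 uint16_t mask, uint64_t out[16]) {
        uint16_t todo = mask;
        while (todo) {
            int first = __builtin_ctz(todo);          // first still-active lane (GCC/Clang builtin)
            uint32_t hi = addr_hi[first];
            for (int lane = 0; lane < 16; ++lane) {
                if ((todo & (1u << lane)) && addr_hi[lane] == hi) {
                    uint64_t addr = (uint64_t(hi) << 32) | addr_lo[lane];
                    out[lane] = *reinterpret_cast<const uint64_t *>(addr);
                    todo &= uint16_t(~(1u << lane));  // this lane is done
                }
            }
        }
    }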
@@ -1329,16 +1510,15 @@ static FORCEINLINE __vec16_i32 __cast_fptoui(__vec16_i32, __vec16_f val) {
 static FORCEINLINE __vec16_d __cast_fpext(__vec16_d, __vec16_f val) {
     __vec16_d ret;
-    ret.v2 = _mm512_cvtpslo_pd(val.v);
+    ret.v1 = _mm512_cvtpslo_pd(val.v);
     __vec16_f other8 = _mm512_permute4f128_epi32(_mm512_castps_si512(val.v), _MM_PERM_DCDC);
-    ret.v1 = _mm512_cvtpslo_pd(other8);
+    ret.v2 = _mm512_cvtpslo_pd(other8);
     return ret;
 }
 static FORCEINLINE __vec16_f __cast_fptrunc(__vec16_f, __vec16_d val) {
-    __m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1));
-    __m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2));
+    __m512i r0i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v2));
+    __m512i r1i = _mm512_castps_si512(_mm512_cvtpd_pslo(val.v1));
     return _mm512_mask_permute4f128_epi32(r1i, 0xFF00, r0i, _MM_PERM_BABA);
 }
@@ -1352,11 +1532,37 @@ static FORCEINLINE __vec16_i32 __cast_bits(__vec16_i32, __vec16_f val) {
 static FORCEINLINE __vec16_i64 __cast_bits(__vec16_i64, __vec16_d val) {
-    return *(__vec16_i64*)&val;
+    __vec16_i64 ret;
+    ret.v_hi = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00,
+                                            _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0),
+                                            val.v2);
+    ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0x00FF,
+                                            _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1),
+                                            val.v1);
+    ret.v_lo = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00,
+                                            _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1),
+                                            val.v2);
+    ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0x00FF,
+                                            _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0),
+                                            val.v1);
+    return ret;
 }
 static FORCEINLINE __vec16_d __cast_bits(__vec16_d, __vec16_i64 val) {
-    return *(__vec16_d*)&val;
+    __vec16_d ret;
+    ret.v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA,
+                                          _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8),
+                                          val.v_hi);
+    ret.v2 = _mm512_mask_permutevar_epi32(ret.v2, 0x5555,
+                                          _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8),
+                                          val.v_lo);
+    ret.v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA,
+                                          _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
+                                          val.v_hi);
+    ret.v1 = _mm512_mask_permutevar_epi32(ret.v1, 0x5555,
+                                          _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
+                                          val.v_lo);
+    return ret;
 }
 ///////////////////////////////////////////////////////////////////////////
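The rewritten __cast_bits pair no longer type-puns the whole struct; per lane it is still a plain reinterpretation of the same 64 bits, only routed through the interleaved register layout. A one-lane sketch of the semantics (reference only, helper names hypothetical):

    #include <cstdint>
    #include <cstring>

    // Reinterpret the 64 bits of a double as an integer and back, unchanged.
    static uint64_t bits_of_double(double d) { uint64_t u; std::memcpy(&u, &d, sizeof u); return u; }
    static double   double_of_bits(uint64_t u) { double d; std::memcpy(&d, &u, sizeof d); return d; }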
@@ -1488,12 +1694,14 @@ static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) {
     return _mm512_invsqrt_ps(v);
 #endif
 }
 static FORCEINLINE __vec16_d __rsqrt_varying_double(__vec16_d x) {
     __vec16_d y;
     for (int i = 0; i < 16; i++)
         __insert_element(&y, i, 1.0/sqrt(__extract_element(x,i)));
     return y;
 }
 static FORCEINLINE double __rsqrt_uniform_double(double v)
 {
     return 1.0/v;
@@ -1629,6 +1837,38 @@ static FORCEINLINE __vec16_d __masked_load_double(void *p, __vec16_i1 mask) {
 #endif
 }
+static FORCEINLINE void __masked_store_i8(void *p, const __vec16_i8 &val, __vec16_i1 mask) {
+    __vec16_i32 tmp = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE);
+    _mm512_mask_extstore_epi32(p, mask, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE);
+}
+static FORCEINLINE __vec16_i8 __masked_load_i8(void *p, __vec16_i1 mask) {
+    __vec16_i8 ret;
+    __vec16_i32 tmp = _mm512_mask_extload_epi32(_mm512_undefined_epi32(),mask,p,
+                                                _MM_UPCONV_EPI32_SINT8,
+                                                _MM_BROADCAST32_NONE, _MM_HINT_NONE);
+    _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE);
+    return ret;
+}
+template <int ALIGN> static FORCEINLINE __vec16_i8 __load(const __vec16_i8 *p) {
+    return *p;
+}
+template <int ALIGN> static FORCEINLINE void __store(__vec16_i8 *p, __vec16_i8 v) {
+    *p = v;
+}
+static FORCEINLINE void
+__scatter_base_offsets32_i8(uint8_t *b, uint32_t scale, __vec16_i32 offsets,
+                            __vec16_i8 val, __vec16_i1 mask)
+{
+    __vec16_i32 tmp = _mm512_extload_epi32(&val,_MM_UPCONV_EPI32_SINT8,
+                                           _MM_BROADCAST32_NONE, _MM_HINT_NONE);
+    printf("__scatter_base_offsets32_i8\n");
+    _mm512_mask_i32extscatter_epi32(b, mask, offsets, tmp,
+                                    _MM_DOWNCONV_EPI32_SINT8, scale,
+                                    _MM_HINT_NONE);
+}
 static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1 mask) {
 #ifdef ISPC_FORCE_ALIGNED_MEMORY
     _mm512_mask_store_epi32(p, mask, val.v);
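The added i8 helpers widen the sixteen bytes to 32-bit lanes (SINT8 up-conversion), operate with the regular 16-lane mask, and narrow back on the way out; the printf left in __scatter_base_offsets32_i8 appears to be debug output. Scalar model of the masked byte store (reference only, helper name hypothetical):

    #include <cstdint>

    // Only lanes whose mask bit is set are written; one byte per lane.
    static void masked_store_i8_ref(uint8_t *p, const int8_t val[16], uint16_t mask) {
        for (int lane = 0; lane < 16; ++lane)
            if (mask & (1u << lane))
                p[lane] = uint8_t(val[lane]);
    }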
@@ -1729,16 +1969,44 @@ static FORCEINLINE __vec16_d
 __gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec16_i32 offsets,
                                __vec16_i1 mask) {
     __vec16_d ret;
-    ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets,
+    ret.v1 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets,
                                            base, _MM_UPCONV_PD_NONE, scale,
                                            _MM_HINT_NONE);
     __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC);
-    ret.v1 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, shuffled_offsets,
+    ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, shuffled_offsets,
                                            base, _MM_UPCONV_PD_NONE, scale,
                                            _MM_HINT_NONE);
     return ret;
 }
+static FORCEINLINE __vec16_f
+__gather64_float(__vec16_i64 addr, __vec16_i1 mask)
+{
+    __vec16_f ret;
+    // There is no gather instruction with 64-bit offsets in KNC.
+    // We have to manually iterate over the upper 32 bits ;-)
+    __vec16_i1 still_to_do = mask;
+    const __vec16_i32 signed_offsets = _mm512_add_epi32(addr.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
+    while (still_to_do) {
+        int first_active_lane = _mm_tzcnt_32((int)still_to_do);
+        const uint &hi32 = ((uint*)&addr.v_hi)[first_active_lane];
+        __vec16_i1 match = _mm512_mask_cmp_epi32_mask(still_to_do,addr.v_hi,
+                                                      __smear_i32<__vec16_i32>((int32_t)hi32),
+                                                      _MM_CMPINT_EQ);
+        void * base = (void*)((((unsigned long)hi32) << 32) + (unsigned long)(-(long)INT_MIN));
+        ret.v = _mm512_mask_i32extgather_ps(ret.v, match, signed_offsets,
+                                            base, _MM_UPCONV_PS_NONE, 1,
+                                            _MM_HINT_NONE);
+        still_to_do = _mm512_kxor(match, still_to_do);
+    }
+    return ret;
+}
 /*! gather with 64-bit offsets.
   \todo add optimization that falls back to 32-bit offset gather if
@@ -1850,6 +2118,32 @@ __scatter_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 offsets
     }
 }
+static FORCEINLINE void // TODO
+__scatter_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 offsets,
+                            __vec16_i8 value,
+                            __vec16_i1 mask) {
+    __vec16_i1 still_to_do = mask;
+    __vec16_i32 tmp = _mm512_extload_epi32(&value, _MM_UPCONV_EPI32_SINT8,
+                                           _MM_BROADCAST32_NONE, _MM_HINT_NONE);
+    // _mm512_mask_extstore_epi32(p, mask, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE);
+    while (still_to_do) {
+        int first_active_lane = _mm_tzcnt_32((int)still_to_do);
+        const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane];
+        __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi,
+                                                      __smear_i32<__vec16_i32>((int32_t)hi32),
+                                                      _MM_CMPINT_EQ);
+        void * base = (void*)((unsigned long)_base +
+                              ((scale*(unsigned long)hi32) << 32));
+        _mm512_mask_i32extscatter_epi32(base, match, offsets.v_lo,
+                                        tmp,
+                                        _MM_DOWNCONV_EPI32_SINT8, scale,
+                                        _MM_HINT_NONE);
+        still_to_do = _mm512_kxor(match,still_to_do);
+    }
+}
 static FORCEINLINE __vec16_i32
@@ -1884,17 +2178,15 @@ __scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets,
 // packed load/store
 ///////////////////////////////////////////////////////////////////////////
-static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec16_i32 *val,
-                                                __vec16_i1 mask) {
-    __vec16_i32 v;
+static FORCEINLINE int32_t __packed_load_active(uint32_t *p, __vec16_i32 *val, __vec16_i1 mask) {
+    __vec16_i32 v = __load<64>(val);
     v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    v = _mm512_mask_extloadunpackhi_epi32(_mm512_undefined_epi32(), mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
     __store<64>(val, v);
     return _mm_countbits_32(uint32_t(mask));
 }
-static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val,
-                                                 __vec16_i1 mask) {
+static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, __vec16_i1 mask) {
     _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
     _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
     return _mm_countbits_32(uint32_t(mask));
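__packed_load_active / __packed_store_active are the expand/compress pair: the load consumes popcount(mask) consecutive values from p into the active lanes of *val (the new __load<64> keeps inactive lanes intact), and the store writes the active lanes out contiguously. A scalar sketch of that contract (reference only, helper names hypothetical):

    #include <cstdint>

    // Expand: one consecutive value from p per active lane; returns how many were read.
    static int32_t packed_load_active_ref(const uint32_t *p, uint32_t val[16], uint16_t mask) {
        int32_t n = 0;
        for (int lane = 0; lane < 16; ++lane)
            if (mask & (1u << lane))
                val[lane] = p[n++];
        return n;
    }

    // Compress: active lanes written back to back; returns how many were written.
    static int32_t packed_store_active_ref(uint32_t *p, const uint32_t val[16], uint16_t mask) {
        int32_t n = 0;
        for (int lane = 0; lane < 16; ++lane)
            if (mask & (1u << lane))
                p[n++] = val[lane];
        return n;
    }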