rewrittem __masked_store_i8/16 functions so they won't segfault

2014-11-09 20:01:58 +04:00
parent e7717e58b5
commit d8f4635366
2 changed files with 71 additions and 43 deletions
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -117,6 +117,8 @@ typedef struct PRE_ALIGN(2) __vec16_i1
        ((v14 & 1) << 14) |
        ((v15 & 1) << 15));
  }
+  FORCEINLINE const uint8_t operator[](const int i) const {  return ((v >> i) & 1); }
+  FORCEINLINE       uint8_t operator[](const int i)       {  return ((v >> i) & 1); }
  __mmask16 v;
 } POST_ALIGN(2) __vec16_i1;

@@ -201,13 +203,10 @@ typedef struct PRE_ALIGN(64) __vec16_i64 {
                  _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0),
                  v2);
  } 
-  // TODO: The previous implementation was faulty as it assumed different data layout.
-  // Here integers in v_hi and v_lo are located not sequentually (like in vec16_d)
-  // but separately - the highest part in v_hi and the lowest in v_lo
-  //FORCEINLINE const int64_t& operator[](const int i) const {  
-  //    return ((uint64_t(((int32_t*)this)[i])<<32)+((int32_t*)this)[i+16]); }
-  //FORCEINLINE       int64_t& operator[](const int i)       {
-  //    return ((uint64_t(((int32_t*)this)[i])<<32)+((int32_t*)this)[i+16]); }
+  FORCEINLINE const int64_t operator[](const int i) const {  
+      return ((uint64_t(((int32_t*)this)[i])<<32)+((int32_t*)this)[i+16]); }
+  FORCEINLINE       int64_t operator[](const int i)       {
+      return ((uint64_t(((int32_t*)this)[i])<<32)+((int32_t*)this)[i+16]); }
  __m512i v_hi;
  __m512i v_lo;
 } POST_ALIGN(64) __vec16_i64;
@@ -261,8 +260,6 @@ inline std::ostream &operator<<(std::ostream &out, const __m512i &v)
  out << "[";
  for (int i=0;i<16;i++)  
    out << (i!=0?",":"") << std::dec << std::setw(8) << ((int*)&v)[i] << std::dec;
-  // out << (i?",":"") << std::hex << std::setw(8) << ((int*)&v)[i] << std::dec;
-
  out << "]" << std::flush;
  return out;
 }
@@ -272,7 +269,15 @@ inline std::ostream &operator<<(std::ostream &out, const __m512 &v)
  out << "[";
  for (int i=0;i<16;i++)  
    out << (i!=0?",":"") << ((float*)&v)[i];
+  out << "]" << std::flush;
+  return out;
+}

+inline std::ostream &operator<<(std::ostream &out, const __vec16_i1 &v)
+{
+  out << "[";
+  for (int i=0;i<16;i++)  
+    out << (i!=0?",":"") << std::dec << std::setw(8) << (int)v[i] << std::dec;
  out << "]" << std::flush;
  return out;
 }
@@ -282,8 +287,15 @@ inline std::ostream &operator<<(std::ostream &out, const __vec16_i8 &v)
  out << "[";
  for (int i=0;i<16;i++)  
    out << (i!=0?",":"") << std::dec << std::setw(8) << (int)((unsigned char*)&v)[i] << std::dec;
-  // out << (i?",":"") << std::hex << std::setw(8) << ((int*)&v)[i] << std::dec;
+  out << "]" << std::flush;
+  return out;
+}

+inline std::ostream &operator<<(std::ostream &out, const __vec16_i16 &v)
+{
+  out << "[";
+  for (int i=0;i<16;i++)  
+    out << (i!=0?",":"") << std::dec << std::setw(8) << (int)((uint16_t*)&v)[i] << std::dec;
  out << "]" << std::flush;
  return out;
 }
@@ -1632,10 +1644,6 @@ static FORCEINLINE __vec16_d __cast_uitofp(__vec16_d, __vec16_i32 val)
  return ret;
 }

-
-
-
-
 // float/double to signed int
 static FORCEINLINE __vec16_i32 __cast_fptosi(__vec16_i32, __vec16_f val) {
  return _mm512_cvtfxpnt_round_adjustps_epi32(val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE);
@@ -2257,7 +2265,20 @@ static FORCEINLINE __vec16_d __masked_load_double(void *p, __vec16_i1 mask) {

 static FORCEINLINE void __masked_store_i8(void *p, const __vec16_i8 &val, __vec16_i1 mask) { 
  __vec16_i32 tmp = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE);
+#ifdef ISPC_FORCE_ALIGNED_MEMORY
  _mm512_mask_extstore_epi32(p, mask, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE);
+#else
+  #if 0 // TODO: both implementations seem to work, need to test which one is faster
+  _mm512_mask_i32extscatter_epi32 (p, mask, __vec16_i32(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15), tmp, _MM_DOWNCONV_EPI32_SINT8, sizeof(uint8_t), _MM_HINT_NONE);
+  #else
+  __vec16_i32 tmp_;
+  tmp_.v = _mm512_extloadunpacklo_epi32(tmp_.v, p, _MM_UPCONV_EPI32_SINT8, _MM_HINT_NONE);
+  tmp_.v = _mm512_extloadunpackhi_epi32(tmp_.v, (uint8_t*)p+64, _MM_UPCONV_EPI32_SINT8, _MM_HINT_NONE);
+  tmp_.v = _mm512_mask_mov_epi32(tmp_.v, mask, tmp.v);
+  _mm512_extpackstorelo_epi32(p, tmp_.v, _MM_DOWNCONV_EPI32_SINT8, _MM_HINT_NONE);
+  _mm512_extpackstorehi_epi32((uint8_t*)p+64, tmp_.v, _MM_DOWNCONV_EPI32_SINT8, _MM_HINT_NONE);
+  #endif // if 0
+#endif
 }

 static FORCEINLINE __vec16_i8 __masked_load_i8(void *p, __vec16_i1 mask) {
@@ -2289,7 +2310,20 @@ __scatter_base_offsets32_i8(uint8_t *b, uint32_t scale, __vec16_i32 offsets,

 static FORCEINLINE void __masked_store_i16(void *p, const __vec16_i16 &val, __vec16_i1 mask) {
  __vec16_i32 tmp = _mm512_extload_epi32(&val, _MM_UPCONV_EPI32_SINT16, _MM_BROADCAST32_NONE, _MM_HINT_NONE);
+#ifdef ISPC_FORCE_ALIGNED_MEMORY
  _mm512_mask_extstore_epi32(p, mask, tmp, _MM_DOWNCONV_EPI32_SINT16, _MM_HINT_NONE);
+#else
+  #if 0 // TODO: both implementations seem to work, need to test which one is faster
+  _mm512_mask_i32extscatter_epi32 (p, mask, __vec16_i32(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15), tmp, _MM_DOWNCONV_EPI32_SINT16, sizeof(uint16_t), _MM_HINT_NONE);
+  #else
+  __vec16_i32 tmp_;
+  tmp_.v = _mm512_extloadunpacklo_epi32(tmp_.v, p, _MM_UPCONV_EPI32_SINT16, _MM_HINT_NONE);
+  tmp_.v = _mm512_extloadunpackhi_epi32(tmp_.v, (uint8_t*)p+64, _MM_UPCONV_EPI32_SINT16, _MM_HINT_NONE);
+  tmp_.v = _mm512_mask_mov_epi32(tmp_.v, mask, tmp.v);
+  _mm512_extpackstorelo_epi32(p, tmp_.v, _MM_DOWNCONV_EPI32_SINT16, _MM_HINT_NONE);
+  _mm512_extpackstorehi_epi32((uint8_t*)p+64, tmp_.v, _MM_DOWNCONV_EPI32_SINT16, _MM_HINT_NONE);
+  #endif // if 0
+#endif
 }

 static FORCEINLINE __vec16_i16 __masked_load_i16(void *p, __vec16_i1 mask) {
--- a/fail_db.txt
+++ b/fail_db.txt
@@ -1469,27 +1469,21 @@
 ./tests/uint64-max.ispc compfail  x86-64            knc   Linux LLVM 3.6   icpc15.0 -O0 *
 ./tests/uint64-min-1.ispc compfail  x86-64            knc   Linux LLVM 3.6   icpc15.0 -O0 *
 ./tests/uint64-min.ispc compfail  x86-64            knc   Linux LLVM 3.6   icpc15.0 -O0 *
-./tests/store-int8-2.ispc runfail  x86-64            knc   Linux LLVM 3.4   icpc15.0 -O2 *
 ./tests/store-int8-1.ispc runfail  x86-64            knc   Linux LLVM 3.4   icpc15.0 -O0 *
 ./tests/store-int8-2.ispc runfail  x86-64            knc   Linux LLVM 3.4   icpc15.0 -O0 *
 ./tests/store-int8.ispc runfail  x86-64            knc   Linux LLVM 3.4   icpc15.0 -O0 *
-./tests/store-int8-2.ispc runfail  x86-64            knc   Linux LLVM 3.5   icpc15.0 -O2 *
 ./tests/store-int8-1.ispc runfail  x86-64            knc   Linux LLVM 3.5   icpc15.0 -O0 *
 ./tests/store-int8-2.ispc runfail  x86-64            knc   Linux LLVM 3.5   icpc15.0 -O0 *
 ./tests/store-int8.ispc runfail  x86-64            knc   Linux LLVM 3.5   icpc15.0 -O0 *
-./tests/store-int8-2.ispc runfail  x86-64            knc   Linux LLVM 3.6   icpc15.0 -O2 *
 ./tests/store-int8-1.ispc runfail  x86-64            knc   Linux LLVM 3.6   icpc15.0 -O0 *
 ./tests/store-int8-2.ispc runfail  x86-64            knc   Linux LLVM 3.6   icpc15.0 -O0 *
 ./tests/store-int8.ispc runfail  x86-64            knc   Linux LLVM 3.6   icpc15.0 -O0 *
-./tests/store-int16-2.ispc runfail  x86-64            knc   Linux LLVM 3.4   icpc15.0 -O2 *
 ./tests/store-int16-1.ispc runfail  x86-64            knc   Linux LLVM 3.4   icpc15.0 -O0 *
 ./tests/store-int16-2.ispc runfail  x86-64            knc   Linux LLVM 3.4   icpc15.0 -O0 *
 ./tests/store-int16.ispc runfail  x86-64            knc   Linux LLVM 3.4   icpc15.0 -O0 *
-./tests/store-int16-2.ispc runfail  x86-64            knc   Linux LLVM 3.5   icpc15.0 -O2 *
 ./tests/store-int16-1.ispc runfail  x86-64            knc   Linux LLVM 3.5   icpc15.0 -O0 *
 ./tests/store-int16-2.ispc runfail  x86-64            knc   Linux LLVM 3.5   icpc15.0 -O0 *
 ./tests/store-int16.ispc runfail  x86-64            knc   Linux LLVM 3.5   icpc15.0 -O0 *
-./tests/store-int16-2.ispc runfail  x86-64            knc   Linux LLVM 3.6   icpc15.0 -O2 *
 ./tests/store-int16-1.ispc runfail  x86-64            knc   Linux LLVM 3.6   icpc15.0 -O0 *
 ./tests/store-int16-2.ispc runfail  x86-64            knc   Linux LLVM 3.6   icpc15.0 -O0 *
 ./tests/store-int16.ispc runfail  x86-64            knc   Linux LLVM 3.6   icpc15.0 -O0 *