diff --git a/Makefile b/Makefile
index 8142983d..fc132e59 100644
--- a/Makefile
+++ b/Makefile
@@ -58,7 +58,7 @@ LDFLAGS=
 ifeq ($(ARCH_OS),Linux)
   # try to link everything statically under Linux (including libstdc++) so
   # that the binaries we generate will be portable across distributions...
-  LDFLAGS=-static
+#  LDFLAGS=-static
 endif
 
 LEX=flex
diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h
index 0ad01fb2..c806e2a0 100644
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -40,6 +40,12 @@
 #include <stdint.h>
 #include <math.h>
 
+#include <iostream> // for operator<<(m512[i])
+#include <iomanip>  // for operator<<(m512[i])
+
+// iw: TODO - need to set that to whatever the cmdline flag says
+#define ASSUME_ALIGNED 0
+
 #ifdef _MSC_VER
 #define FORCEINLINE __forceinline
 #define PRE_ALIGN(x) /*__declspec(align(x))*/
@@ -64,9 +70,9 @@ extern "C" {
 }
 
-typedef float __vec1_f;
-typedef double __vec1_d;
-typedef int8_t __vec1_i8;
+typedef float   __vec1_f;
+typedef double  __vec1_d;
+typedef int8_t  __vec1_i8;
 typedef int16_t __vec1_i16;
 typedef int32_t __vec1_i32;
 typedef int64_t __vec1_i64;
@@ -228,6 +234,48 @@ PRE_ALIGN(32) struct __vec16_i16 : public vec16 {
 } POST_ALIGN(32);
 
+
+///////////////////////////////////////////////////////////////////////////
+// debugging helpers
+//
+inline std::ostream &operator<<(std::ostream &out, const __m512i &v)
+{
+  out << "[";
+  for (int i=0;i<16;i++)
+    out << (i?",":"") << std::dec << std::setw(8) << ((int*)&v)[i] << std::dec;
+  // out << (i?",":"") << std::hex << std::setw(8) << ((int*)&v)[i] << std::dec;
+
+  out << "]" << std::flush;
+  return out;
+}
+
+inline std::ostream &operator<<(std::ostream &out, const __m512 &v)
+{
+  out << "[";
+  for (int i=0;i<16;i++)
+    out << (i?",":"") << ((float*)&v)[i];
+
+  out << "]" << std::flush;
+  return out;
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+// macros...
+
+FORCEINLINE __vec16_i8::__vec16_i8(const __vec16_i8 &o)
+{
+  for (int i=0;i<16;i++)
+    v[i] = o.v[i];
+}
+
+FORCEINLINE __vec16_i8& __vec16_i8::operator=(const __vec16_i8 &o)
+{
+  for (int i=0;i<16;i++)
+    v[i] = o.v[i];
+  return *this;
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // macros...
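The two operator<< overloads added above are debugging aids. A minimal usage sketch, assuming an icc -mmic build that has this header available as "knc.h" on the include path (hypothetical snippet, not part of the patch):

    #include <iostream>
    #include "knc.h"

    static void dump_register_example() {
        // broadcast 1.0f into all 16 lanes, then pretty-print via the overload above
        __m512 v = _mm512_set_1to16_ps(1.0f);
        std::cout << "v = " << v << std::endl;   // prints "[1,1,1,...,1]"
    }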
@@ -429,7 +477,8 @@ static FORCEINLINE __vec16_i1 __movmsk(__vec16_i1 mask) {
 }
 
 static FORCEINLINE __vec16_i1 __any(__vec16_i1 mask) {
-    return (mask!=0);
+    // return (mask!=0);
+    return !_mm512_kortestz(mask,mask);
 }
 
 static FORCEINLINE __vec16_i1 __all(__vec16_i1 mask) {
@@ -437,7 +486,8 @@ static FORCEINLINE __vec16_i1 __all(__vec16_i1 mask) {
 }
 
 static FORCEINLINE __vec16_i1 __none(__vec16_i1 mask) {
-    return (mask==0);
+    // return (mask==0);
+    return !__any(mask);
 }
 
 static FORCEINLINE __vec16_i1 __equal_i1(__vec16_i1 a, __vec16_i1 b) {
@@ -825,10 +875,14 @@ static FORCEINLINE __vec16_i32 __shuffle2_i32(__vec16_i32 v0, __vec16_i32 v1, __
 */
 
 template <int ALIGN> static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) {
+#if ASSUME_ALIGNED
+    return _mm512_load_epi32(p);
+#else
     __vec16_i32 v = _mm512_undefined_epi32();
     v = _mm512_extloadunpackhi_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
     v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
     return v;
+#endif
 }
 
 template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) {
@@ -836,8 +890,12 @@ template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) {
 }
 
 template <int ALIGN> static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32 v) {
+#if ASSUME_ALIGNED
+    _mm512_store_epi32(p, v);
+#else
     _mm512_extpackstorehi_epi32(p, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
     _mm512_extpackstorelo_epi32(p, v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
+#endif
 }
 
 template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) {
@@ -1232,7 +1290,7 @@ static FORCEINLINE void __insert_element(__vec16_f *v, int index, float val) {
 
 template <class RetVecType> RetVecType __smear_float(float f);
 template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) {
-    return _mm512_set1_ps(f);
+    return _mm512_set_1to16_ps(f);
 }
 
 template <class RetVecType> RetVecType __setzero_float();
@@ -1267,10 +1325,14 @@ static FORCEINLINE __vec16_f __shuffle2_float(__vec16_f v0, __vec16_f v1, __vec1
 */
 
 template <int ALIGN> static FORCEINLINE __vec16_f __load(const __vec16_f *p) {
+#if ASSUME_ALIGNED
+    return _mm512_load_ps(p);
+#else
     __vec16_f v = _mm512_undefined_ps();
     v = _mm512_extloadunpackhi_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
     v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
     return v;
+#endif
 }
 
 template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) {
@@ -1278,8 +1340,12 @@ template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) {
 }
 
 template <int ALIGN> static FORCEINLINE void __store(__vec16_f *p, __vec16_f v) {
+#if ASSUME_ALIGNED
+    _mm512_store_ps(p, v);
+#else
     _mm512_extpackstorehi_ps(p, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE);
     _mm512_extpackstorelo_ps(p, v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE);
+#endif
 }
 
 template <> static FORCEINLINE void __store<64>(__vec16_f *p, __vec16_f v) {
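In the __any/__none hunks above, the scalar comparisons on the mask were replaced by _mm512_kortestz, which returns 1 exactly when the OR of its two mask arguments is all zero. A scalar sketch of the intended semantics, using hypothetical helper names (not part of the patch):

    #include <cstdint>

    // 16-lane mask reductions: __any is true iff at least one lane is active,
    // __none is simply the negation (matching the rewritten intrinsic versions).
    static inline bool any_sketch(uint16_t mask)  { return mask != 0; }
    static inline bool none_sketch(uint16_t mask) { return !any_sketch(mask); }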
@@ -1684,11 +1750,25 @@ static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i1 v) {
     return ret;
 }
 */
 
+
+static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, const __vec16_i8 &v) {
+    return _mm512_extload_ps(v.v,_MM_UPCONV_PS_UINT8,_MM_BROADCAST32_NONE,_MM_HINT_NONE);
+}
+
+static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i32 v) {
+    return _mm512_cvtfxpnt_round_adjustepu32_ps(v, _MM_FROUND_NO_EXC,_MM_EXPADJ_NONE);
+}
+
+
+
 // float/double to signed int
 static FORCEINLINE __vec16_i32 __cast_fptosi(__vec16_i32, __vec16_f val) {
     return _mm512_cvtfxpnt_round_adjustps_epi32(val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE);
 }
 
+static FORCEINLINE __vec16_i32 __cast_fptoui(__vec16_i32, __vec16_f val) {
+    return _mm512_cvtfxpnt_round_adjustps_epu32(val, _MM_ROUND_MODE_TOWARD_ZERO, _MM_EXPADJ_NONE);
+}
+
 CAST(__vec16_i8, int8_t, __vec16_f, float, __cast_fptosi)
 CAST(__vec16_i16, int16_t, __vec16_f, float, __cast_fptosi)
@@ -2043,10 +2123,15 @@ static FORCEINLINE __vec16_i32 __masked_load_i32<64>(void *p, __vec16_i1 mask) {
 
 // Currently, when a pseudo_gather is converted into a masked load, it has to be unaligned
 static FORCEINLINE __vec16_i32 __masked_load_i32(void *p, __vec16_i1 mask) {
+#if 1
+# pragma message("masked_load_i32 currently works only for _aligned_ loads")
+    return _mm512_mask_load_epi32(__vec16_i32(),mask.m,p);
+#else
     __vec16_i32 ret;
     ret = _mm512_mask_extloadunpackhi_epi32(ret, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
     ret = _mm512_mask_extloadunpacklo_epi32(ret, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
     return ret;
+#endif
 }
 
 /*
@@ -2080,10 +2165,15 @@ template <> static FORCEINLINE __vec16_f __masked_load_float<64>(void *p, __vec1
 #endif
 
 static FORCEINLINE __vec16_f __masked_load_float(void *p, __vec16_i1 mask) {
+#if 1
+# pragma message("masked_load_float currently works only for _aligned_ loads")
+    return _mm512_mask_load_ps(__vec16_f(),mask.m,p);
+#else
     __vec16_f ret;
     ret = _mm512_mask_extloadunpackhi_ps(ret, mask, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
     ret = _mm512_mask_extloadunpacklo_ps(ret, mask, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
     return ret;
+#endif
 }
 
 #if 0
@@ -2151,12 +2241,17 @@ template <> static FORCEINLINE void __masked_store_i32<64>(void *p, __vec16_i32
 #endif
 
 static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1 mask) {
+#if 1
+# pragma message("masked_store_i32 currently works only for _aligned_ stores")
+    _mm512_mask_store_epi32(p,mask.m,val);
+#else
     __vec16_i32 tmp;
     tmp = _mm512_extloadunpackhi_epi32(tmp, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
     tmp = _mm512_extloadunpacklo_epi32(tmp, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
     _mm512_mask_mov_epi32(tmp, mask, val);
     _mm512_extpackstorehi_epi32(p, tmp, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
     _mm512_extpackstorelo_epi32(p, tmp, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
+#endif
 }
 
 /*
@@ -2184,12 +2279,21 @@ template <> static FORCEINLINE void __masked_store_float<64>(void *p, __vec16_f
 #endif
 
 static FORCEINLINE void __masked_store_float(void *p, __vec16_f val, __vec16_i1 mask) {
+#if 1
+# pragma message("masked_store_float currently works only for _aligned_ stores")
+
+    // std::cout << "masked store to " << p << " mask " << ((int*)mask.m) << std::endl;
+    // std::cout << "  " << val << std::endl;
+
+    _mm512_mask_store_ps(p,mask.m,val);
+#else
     __vec16_f tmp;
     tmp = _mm512_extloadunpackhi_ps(tmp, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
     tmp = _mm512_extloadunpacklo_ps(tmp, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
     _mm512_mask_mov_ps(tmp, mask, val);
     _mm512_extpackstorehi_ps(p, tmp, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE);
     _mm512_extpackstorelo_ps(p, tmp, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE);
+#endif
 }
 
 #if 0
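The masked load/store wrappers above share one contract; a scalar sketch of the store side, which both the aligned fast path and the #else fallback are meant to implement (hypothetical helper name, not part of the patch):

    #include <cstdint>

    // Write val[i] to p[i] only where mask bit i is set; memory in inactive
    // lanes is left untouched, which is what makes the store "masked".
    static inline void masked_store_float_sketch(float *p, const float val[16],
                                                 uint16_t mask) {
        for (int i = 0; i < 16; ++i)
            if (mask & (1u << i))
                p[i] = val[i];
    }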
@@ -2214,6 +2318,11 @@ template <> static FORCEINLINE void __masked_store_double<64>(void *p, __vec16_d
 #endif
 
 static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, __vec16_i1 mask) {
+#if 1
+# pragma message("masked_store_double currently works only for _aligned_ stores")
+    _mm512_mask_store_pd(((float*)p)+00,mask.m8.m1,val.v1);
+    _mm512_mask_store_pd(((float*)p)+16,mask.m8.m2,val.v2);
+#else
     __vec16_d tmp;
     tmp.v1 = _mm512_extloadunpackhi_pd(tmp.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
     tmp.v1 = _mm512_extloadunpacklo_pd(tmp.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
@@ -2225,6 +2334,7 @@ static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, __vec16_i1
     _mm512_extpackstorelo_pd(p, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
     _mm512_extpackstorehi_pd(((uint8_t*)p)+64, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
     _mm512_extpackstorelo_pd(((uint8_t*)p)+64, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
+#endif
 }
 
 /*
@@ -2261,6 +2371,19 @@ static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val,
 
 // offsets * offsetScale is in bytes (for all of these)
 
+static FORCEINLINE __vec16_i8
+__gather_base_offsets32_i8(uint8_t *base, uint32_t scale, __vec16_i32 offsets,
+                           __vec16_i1 mask) {
+    // (iw): need to temporarily store as int because gathers can only return ints.
+    __vec16_i32 tmp = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, base,
+                                                     _MM_UPCONV_EPI32_SINT8, scale,
+                                                     _MM_HINT_NONE);
+    // now, downconverting to chars into temporary char vector
+    __vec16_i8 ret;
+    _mm512_extstore_epi32(ret.v,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE);
+    return ret;
+}
+
 static FORCEINLINE __vec16_i32
 __gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec16_i32 offsets,
                             __vec16_i1 mask) {
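A scalar reference for the masked byte gather just added: each active lane reads a single int8 at base + offsets[i]*scale, with the vector path widening through int32 before packing back down to bytes (hypothetical helper name, not part of the patch):

    #include <cstdint>

    static inline void gather32_i8_sketch(const uint8_t *base, uint32_t scale,
                                          const int32_t offsets[16], uint16_t mask,
                                          int8_t out[16]) {
        for (int i = 0; i < 16; ++i)
            if (mask & (1u << i))
                out[i] = *(const int8_t *)(base + (int64_t)offsets[i] * scale);
    }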
@@ -2284,6 +2407,120 @@ __gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets
 // There is no gather instruction with 64-bit offsets in KNC.
 // So we cannot implement __gather_base_offsets64_*()
 
+/*! gather with 64-bit offsets.
+
+  \todo add optimization that falls back to 32-bit offset gather if
+  upper 32 bits are all zeroes (in practice, offsets are usually array
+  indices, and _usually_ <4G even if the compiler cannot statically
+  figure out that this is the case */
+static FORCEINLINE __vec16_f
+__gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 offsets,
+                              __vec16_i1 mask) {
+// There is no gather instruction with 64-bit offsets in KNC.
+// So we cannot implement __gather_base_offsets64_*()  -- iw: wrong; just have to manually iterate over the upper 32 bits ;-)
+    __vec16_i1 still_to_do = mask;
+    __vec16_f ret;
+    // PING;
+    while (still_to_do) {
+        int first_active_lane = _mm_tzcnt_32((int)still_to_do);
+        const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane];
+        __vec16_i1 match = _mm512_mask_cmp_epi32_mask
+            (mask,offsets.v_hi,
+             __smear_i32<__vec16_i32>((int32_t)hi32),_MM_CMPINT_EQ);
+
+        void * base = (void*)((unsigned long)_base
+                              + ((scale*(unsigned long)hi32) << 32));
+        ret = _mm512_mask_i32extgather_ps(ret, match, offsets.v_lo, base,
+                                          _MM_UPCONV_PS_NONE, scale,
+                                          _MM_HINT_NONE);
+        still_to_do = _mm512_kxor(match,still_to_do);
+    }
+    return ret;
+}
+
+
+static FORCEINLINE __vec16_i8
+__gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 offsets,
+                           __vec16_i1 mask)
+{
+    __vec16_i1 still_to_do = mask;
+    __vec16_i32 tmp;
+    while (still_to_do) {
+        int first_active_lane = _mm_tzcnt_32((int)still_to_do);
+        const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane];
+        __vec16_i1 match = _mm512_mask_cmp_epi32_mask
+            (mask,offsets.v_hi,
+             __smear_i32<__vec16_i32>((int32_t)hi32),_MM_CMPINT_EQ);
+
+        void * base = (void*)((unsigned long)_base
+                              + ((scale*(unsigned long)hi32) << 32));
+        tmp = _mm512_mask_i32extgather_epi32(tmp, match, offsets.v_lo, base,
+                                             _MM_UPCONV_EPI32_SINT8, scale,
+                                             _MM_HINT_NONE);
+        still_to_do = _mm512_kxor(match,still_to_do);
+    }
+    __vec16_i8 ret;
+    _mm512_extstore_epi32(ret.v,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE);
+    return ret;
+}
+
+
+
+static FORCEINLINE void
+__scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 offsets,
+                               __vec16_f value,
+                               __vec16_i1 mask) {
+    __vec16_i1 still_to_do = mask;
+    while (still_to_do) {
+        int first_active_lane = _mm_tzcnt_32((int)still_to_do);
+        const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane];
+        __vec16_i1 match = _mm512_mask_cmp_epi32_mask
+            (mask,offsets.v_hi,
+             __smear_i32<__vec16_i32>((int32_t)hi32),_MM_CMPINT_EQ);
+
+        void * base = (void*)((unsigned long)_base
+                              + ((scale*(unsigned long)hi32) << 32));
+        _mm512_mask_i32extscatter_ps(base, match, offsets.v_lo,
+                                     value,
+                                     _MM_DOWNCONV_PS_NONE, scale,
+                                     _MM_HINT_NONE);
+        still_to_do = _mm512_kxor(match,still_to_do);
+    }
+}
+
+static FORCEINLINE void
+__scatter_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 offsets,
+                             __vec16_i32 value,
+                             __vec16_i1 mask) {
+    __vec16_i1 still_to_do = mask;
+    while (still_to_do) {
+        int first_active_lane = _mm_tzcnt_32((int)still_to_do);
+        const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane];
+        __vec16_i1 match = _mm512_mask_cmp_epi32_mask
+            (mask,offsets.v_hi,
+             __smear_i32<__vec16_i32>((int32_t)hi32),_MM_CMPINT_EQ);
+
+        void * base = (void*)((unsigned long)_base
+                              + ((scale*(unsigned long)hi32) << 32));
+        _mm512_mask_i32extscatter_epi32(base, match, offsets.v_lo,
+                                        value,
+                                        _MM_DOWNCONV_EPI32_NONE, scale,
+                                        _MM_HINT_NONE);
+        still_to_do = _mm512_kxor(match,still_to_do);
+    }
+}
+
+
+
+static FORCEINLINE __vec16_i32
+__gather_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 offsets,
+                            __vec16_i1 mask)
+{
+    __vec16_f r = __gather_base_offsets64_float(_base,scale,offsets,mask);
+    return (__vec16_i32&)r;
+}
+
+
 //GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
 //GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16)
 //GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64)
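The 64-bit-offset gathers and scatters above work around the missing 64-bit KNC gather by looping over groups of lanes that share the same upper 32 offset bits, folding those bits into the base pointer and reusing the 32-bit-offset instruction per group. A scalar sketch of the net effect (hypothetical helper name, with split hi/lo arrays standing in for __vec16_i64; not part of the patch):

    #include <cstdint>

    // Each active lane loads a float from base + scale * offset64, where offset64
    // is reassembled from the hi/lo 32-bit halves of the 64-bit offset vector.
    static inline void gather64_float_sketch(const uint8_t *base, uint32_t scale,
                                             const uint32_t off_hi[16],
                                             const uint32_t off_lo[16],
                                             uint16_t mask, float out[16]) {
        for (int i = 0; i < 16; ++i)
            if (mask & (1u << i)) {
                uint64_t off = ((uint64_t)off_hi[i] << 32) | off_lo[i];
                out[i] = *(const float *)(base + (uint64_t)scale * off);
            }
    }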
diff --git a/module.cpp b/module.cpp
index 9ba7cea9..0e7ca875 100644
--- a/module.cpp
+++ b/module.cpp
@@ -1538,6 +1538,13 @@ Module::writeHostStub(const char *fn)
         const FunctionType *fct = CastType<FunctionType>(sym->type);
         Assert(fct);
 
+        if (!fct->GetReturnType()->IsVoidType()) {
+            Warning(sym->pos,"When emitting offload-stubs, ignoring \"export\"ed function with non-void return type.\n");
+            continue;
+        }
+
+
+
         // -------------------------------------------------------
         // first, emit a struct that holds the parameters
         // -------------------------------------------------------
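The module.cpp change means host offload stubs are only generated for void export functions; anything else now produces the warning and is skipped. A condensed sketch of the guard's effect, simplified from the real Module::writeHostStub loop (hypothetical helper, not the patch code):

    // e.g.  export void scale(uniform float d[], uniform int n)     -> stub emitted
    //       export uniform int sum(uniform int d[], uniform int n)  -> warning, then skipped
    static bool shouldEmitHostStub(const FunctionType *fct) {
        return fct->GetReturnType()->IsVoidType();
    }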