diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h
index 365a87c5..37588d3b 100644
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -3183,33 +3183,6 @@ __gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec16_i32 offset
     return ret;
 }
 
-static FORCEINLINE __vec16_f
-__gather64_float(__vec16_i64 addr, __vec16_i1 mask)
-{
-    __vec16_f ret;
-
-    // There is no gather instruction with 64-bit offsets in KNC.
-    // We have to manually iterate over the upper 32 bits ;-)
-    __vec16_i1 still_to_do = mask;
-    const __vec16_i32 signed_offsets = _mm512_add_epi32(addr.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
-    while (still_to_do) {
-        int first_active_lane = _mm_tzcnt_32((int)still_to_do);
-        const uint &hi32 = ((uint*)&addr.v_hi)[first_active_lane];
-        __vec16_i1 match = _mm512_mask_cmp_epi32_mask(still_to_do,addr.v_hi,
-                                                      __smear_i32<__vec16_i32>((int32_t)hi32),
-                                                      _MM_CMPINT_EQ);
-
-        void * base = (void*)((((unsigned long)hi32) << 32) + (unsigned long)(-(long)INT_MIN));
-
-        ret.v = _mm512_mask_i32extgather_ps(ret.v, match, signed_offsets,
-                                            base, _MM_UPCONV_PS_NONE, 1,
-                                            _MM_HINT_NONE);
-        still_to_do = _mm512_kxor(match, still_to_do);
-    }
-
-    return ret;
-}
-
 static FORCEINLINE __vec16_i32
 __gather64_i32(__vec16_i64 addr, __vec16_i1 mask)
 {
@@ -3237,6 +3210,33 @@ __gather64_i32(__vec16_i64 addr, __vec16_i1 mask)
     return ret;
 }
 
+static FORCEINLINE __vec16_f
+__gather64_float(__vec16_i64 addr, __vec16_i1 mask)
+{
+    __vec16_f ret;
+
+    // There is no gather instruction with 64-bit offsets in KNC.
+    // We have to manually iterate over the upper 32 bits ;-)
+    __vec16_i1 still_to_do = mask;
+    const __vec16_i32 signed_offsets = _mm512_add_epi32(addr.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
+    while (still_to_do) {
+        int first_active_lane = _mm_tzcnt_32((int)still_to_do);
+        const uint &hi32 = ((uint*)&addr.v_hi)[first_active_lane];
+        __vec16_i1 match = _mm512_mask_cmp_epi32_mask(still_to_do,addr.v_hi,
+                                                      __smear_i32<__vec16_i32>((int32_t)hi32),
+                                                      _MM_CMPINT_EQ);
+
+        void * base = (void*)((((unsigned long)hi32) << 32) + (unsigned long)(-(long)INT_MIN));
+
+        ret.v = _mm512_mask_i32extgather_ps(ret.v, match, signed_offsets,
+                                            base, _MM_UPCONV_PS_NONE, 1,
+                                            _MM_HINT_NONE);
+        still_to_do = _mm512_kxor(match, still_to_do);
+    }
+
+    return ret;
+}
+
 static FORCEINLINE __vec16_d
 __gather64_double(__vec16_i64 addr, __vec16_i1 mask)
 {
@@ -3290,7 +3290,6 @@ __gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 offset
 static FORCEINLINE __vec16_i8
 __gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 offsets, __vec16_i1 mask)
 {
-    const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN));
     __vec16_i1 still_to_do = mask;
     __vec16_i32 tmp;
@@ -3313,6 +3312,12 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_
     return ret;
 }
 
+static FORCEINLINE __vec16_i8
+__gather64_i8(__vec16_i64 addr, __vec16_i1 mask)
+{
+    return __gather_base_offsets64_i8(0, 1, addr, mask);
+}
+
 static FORCEINLINE void
 __scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 offsets, __vec16_f value,
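
The relocated __gather64_float (and the new __gather64_i8 wrapper, via __gather_base_offsets64_i8) both rely on the same trick: KNC has no gather with 64-bit offsets, so the loop picks the first still-active lane, gathers every lane that shares its upper 32 address bits with one masked 32-bit-offset gather, and retires those lanes from the mask. Because _mm512_mask_i32extgather_ps treats the 32-bit offsets as signed, the header biases the low address words by INT_MIN and cancels that bias by adding 2^31 to the base pointer. Below is a minimal scalar sketch of the same loop, assuming nothing KNC-specific; gather64_float_ref, LANES, and the uint16_t mask are illustrative stand-ins rather than names from knc.h, and the signed-offset bias is omitted since plain loads take full 64-bit addresses.

```cpp
// Scalar model of the "iterate over the upper 32 bits" loop used by
// __gather64_float above. Hypothetical names; not part of knc.h.
#include <cstdint>
#include <cstdio>

enum { LANES = 16 };

// Gather one float per active lane from full 64-bit addresses, handling all
// lanes whose upper 32 address bits match in a single pass -- the KNC code
// handles each such group with one masked 32-bit-offset gather instruction.
static void gather64_float_ref(const uint32_t addr_lo[LANES],
                               const uint32_t addr_hi[LANES],
                               uint16_t mask, float out[LANES]) {
    uint16_t still_to_do = mask;
    while (still_to_do) {
        // The lowest still-active lane picks the 4 GiB "page" to process
        // (the real code uses _mm_tzcnt_32 on the mask register).
        int first = __builtin_ctz(still_to_do);  // GCC/Clang builtin
        const uint32_t hi32 = addr_hi[first];

        for (int lane = 0; lane < LANES; ++lane) {
            bool active = (still_to_do >> lane) & 1u;
            if (active && addr_hi[lane] == hi32) {
                uint64_t addr = ((uint64_t)hi32 << 32) | addr_lo[lane];
                out[lane] = *(const float *)(uintptr_t)addr;
                still_to_do &= (uint16_t)~(1u << lane);  // retire lane (kxor in knc.h)
            }
        }
    }
}

int main(void) {
    float data[LANES];
    uint32_t lo[LANES], hi[LANES];
    float out[LANES] = {0};
    for (int i = 0; i < LANES; ++i) {
        data[i] = (float)(i * 10);
        uint64_t a = (uint64_t)(uintptr_t)&data[i];
        lo[i] = (uint32_t)a;        // low 32 address bits per lane
        hi[i] = (uint32_t)(a >> 32);  // high 32 address bits per lane
    }
    gather64_float_ref(lo, hi, 0xFFFF, out);  // all 16 lanes active
    printf("%g %g\n", out[0], out[15]);       // expect: 0 150
    return 0;
}
```

The worst case is one iteration per lane, when every address lands on a different 4 GiB page; in the common case, gathered addresses share their upper bits and the loop body runs once.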