From 81cb374084284a34b98fc331c026c541f37f5376 Mon Sep 17 00:00:00 2001
From: Anton Mitrokhin
Date: Thu, 26 Mar 2015 14:53:19 +0300
Subject: [PATCH] loop execution for i8/16 32-addr-bit gathers/scatters

---
Review notes (this section is not part of the commit message):
 - The added loops address memory through the functions' real pointer
   parameter (`b` in the scatters, `base` in the gathers); `_base` is not
   declared in any of the four functions.
 - The `__vec16_i1 mask) {` line of __gather_base_offsets32_i8 is kept as
   context rather than deleted, so the second hunk is 28 -> 28 lines.
 - Whitespace of context/removed lines was reconstructed from a
   whitespace-collapsed copy; if context does not match, apply with
   `git apply --ignore-whitespace`. The post-image blob id on the `index`
   line was not recomputed.

 examples/intrinsics/knl.h | 60 ++++++++++++++++++++--------------------
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/examples/intrinsics/knl.h b/examples/intrinsics/knl.h
index 18704ea5..e1666c0c 100644
--- a/examples/intrinsics/knl.h
+++ b/examples/intrinsics/knl.h
@@ -2789,27 +2789,27 @@ template static FORCEINLINE void __store(__vec16_i8 *p, __vec16_i8 v
     *p = v;
 }
 
-static FORCEINLINE void
-__scatter_base_offsets32_i8(uint8_t *b, uint32_t scale, __vec16_i32 offsets,
+static FORCEINLINE void __scatter_base_offsets32_i8(uint8_t *b, uint32_t scale, __vec16_i32 offsets,
                              __vec16_i8 val, __vec16_i1 mask)
 {
-    __vec16_i32 tmp = _mm512_extload_epi32(&val,_MM_UPCONV_EPI32_SINT8,
-                                           _MM_BROADCAST32_NONE, _MM_HINT_NONE);
-    _mm512_mask_i32extscatter_epi32(b, mask, offsets, tmp,
-                                    _MM_DOWNCONV_EPI32_SINT8, scale,
-                                    _MM_HINT_NONE);
+    // TODO: scalar per-lane loop; stores val[i] to b + scale * offsets[i] only for lanes whose mask bit is set.
+    for (int i = 0; i < 16; ++i)
+        if ((mask & (1 << i)) != 0) {
+            int8_t *ptr = (int8_t *)(b + scale * offsets[i]);
+            *ptr = val[i];
+        }
 }
 
 
-static FORCEINLINE void
-__scatter_base_offsets32_i16(uint8_t *b, uint32_t scale, __vec16_i32 offsets,
+static FORCEINLINE void __scatter_base_offsets32_i16(uint8_t *b, uint32_t scale, __vec16_i32 offsets,
                               __vec16_i16 val, __vec16_i1 mask)
 {
-    __vec16_i32 tmp = _mm512_extload_epi32(&val,_MM_UPCONV_EPI32_SINT16,
-                                           _MM_BROADCAST32_NONE, _MM_HINT_NONE);
-    _mm512_mask_i32extscatter_epi32(b, mask, offsets, tmp,
-                                    _MM_DOWNCONV_EPI32_SINT16, scale,
-                                    _MM_HINT_NONE);
+    // TODO: scalar per-lane loop; stores val[i] to b + scale * offsets[i] only for lanes whose mask bit is set.
+    for (int i = 0; i < 16; ++i)
+        if ((mask & (1 << i)) != 0) {
+            int16_t *ptr = (int16_t *)(b + scale * offsets[i]);
+            *ptr = val[i];
+        }
 }
 
 
@@ -2900,28 +2900,28 @@ static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val,
 
 static FORCEINLINE __vec16_i8
 __gather_base_offsets32_i8(uint8_t *base, uint32_t scale, __vec16_i32 offsets,
                            __vec16_i1 mask) {
-    // (iw): need to temporarily store as int because gathers can only return ints.
-    __vec16_i32 tmp = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, base,
-                                                     _MM_UPCONV_EPI32_SINT8, scale,
-                                                     _MM_HINT_NONE);
-    // now, downconverting to chars into temporary char vector
-    __vec16_i8 ret;
-    _mm512_extstore_epi32(ret.v,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE);
-    return ret;
+    // TODO: scalar per-lane loop; lanes whose mask bit is clear are never assigned in ret.
+    __vec16_i8 ret;
+    for (int i = 0; i < 16; ++i)
+        if ((mask & (1 << i)) != 0) {
+            int8_t *ptr = (int8_t *)(base + scale * offsets[i]);
+            ret[i] = *ptr;
+        }
+    return ret;
 }
 
 static FORCEINLINE __vec16_i16
 __gather_base_offsets32_i16(uint8_t *base, uint32_t scale, __vec16_i32 offsets,
                             __vec16_i1 mask) {
-    // (iw): need to temporarily store as int because gathers can only return ints.
-    __vec16_i32 tmp = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, base,
-                                                     _MM_UPCONV_EPI32_SINT16, scale,
-                                                     _MM_HINT_NONE);
-    // now, downconverting to chars into temporary char vector
-    __vec16_i16 ret;
-    _mm512_extstore_epi32(ret.v,tmp,_MM_DOWNCONV_EPI32_SINT16,_MM_HINT_NONE);
-    return ret;
+    // TODO: scalar per-lane loop; lanes whose mask bit is clear are never assigned in ret.
+    __vec16_i16 ret;
+    for (int i = 0; i < 16; ++i)
+        if ((mask & (1 << i)) != 0) {
+            int16_t *ptr = (int16_t *)(base + scale * offsets[i]);
+            ret[i] = *ptr;
+        }
+    return ret;
 }
 
 static FORCEINLINE __vec16_i32