loop execution for i8/16 32-addr-bit gathers/scatters
This commit is contained in:
@@ -2789,27 +2789,27 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i8 *p, __vec16_i8 v
|
|||||||
*p = v;
|
*p = v;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Masked scatter of 16 8-bit lanes using 32-bit per-lane offsets.
 *
 * For every lane i whose bit is set in `mask`, stores val[i] to the byte
 * address `b + scale * offsets[i]`. Lanes whose mask bit is clear perform
 * no memory access.
 *
 * @param b        base address the scaled offsets are added to
 * @param scale    multiplier applied to each lane's offset
 * @param offsets  per-lane offsets
 * @param val      per-lane values to store
 * @param mask     per-lane execution mask (bit i controls lane i)
 */
static FORCEINLINE void
__scatter_base_offsets32_i8(uint8_t *b, uint32_t scale, __vec16_i32 offsets,
                            __vec16_i8 val, __vec16_i1 mask)
{
    // TODO: replace this scalar per-lane loop with a vectorized scatter.
    // NOTE(review): if offsets[i] is a signed 32-bit value, `scale * offsets[i]`
    // is evaluated in unsigned 32-bit arithmetic and a negative offset would
    // wrap instead of being sign-extended -- confirm whether callers can pass
    // negative offsets.
    for (int i = 0; i < 16; ++i) {
        if ((mask & (1 << i)) != 0) {
            // The base pointer parameter is `b`; no `_base` is declared in
            // this function.
            int8_t *ptr = (int8_t *)(b + scale * offsets[i]);
            *ptr = val[i];
        }
    }
}
|
||||||
|
|
||||||
|
|
||||||
/**
 * Masked scatter of 16 16-bit lanes using 32-bit per-lane offsets.
 *
 * For every lane i whose bit is set in `mask`, stores val[i] as an int16_t
 * at the byte address `b + scale * offsets[i]`. Lanes whose mask bit is
 * clear perform no memory access.
 *
 * @param b        base address the scaled offsets are added to
 * @param scale    multiplier applied to each lane's offset
 * @param offsets  per-lane offsets
 * @param val      per-lane values to store
 * @param mask     per-lane execution mask (bit i controls lane i)
 */
static FORCEINLINE void
__scatter_base_offsets32_i16(uint8_t *b, uint32_t scale, __vec16_i32 offsets,
                             __vec16_i16 val, __vec16_i1 mask)
{
    // TODO: replace this scalar per-lane loop with a vectorized scatter.
    // NOTE(review): if offsets[i] is a signed 32-bit value, `scale * offsets[i]`
    // is evaluated in unsigned 32-bit arithmetic and a negative offset would
    // wrap instead of being sign-extended -- confirm whether callers can pass
    // negative offsets.
    for (int i = 0; i < 16; ++i) {
        if ((mask & (1 << i)) != 0) {
            // The base pointer parameter is `b`; no `_base` is declared in
            // this function.
            int16_t *ptr = (int16_t *)(b + scale * offsets[i]);
            *ptr = val[i];
        }
    }
}
|
||||||
|
|
||||||
|
|
||||||
@@ -2900,27 +2900,26 @@ static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val,
|
|||||||
|
|
||||||
/**
 * Masked gather of 16 8-bit lanes using 32-bit per-lane offsets.
 *
 * For every lane i whose bit is set in `mask`, loads an int8_t from the byte
 * address `base + scale * offsets[i]` into lane i of the result. Lanes whose
 * mask bit is clear perform no memory access and are not written, so their
 * contents in the returned vector are whatever `ret` held after its
 * declaration.
 *
 * @param base     base address the scaled offsets are added to
 * @param scale    multiplier applied to each lane's offset
 * @param offsets  per-lane offsets
 * @param mask     per-lane execution mask (bit i controls lane i)
 * @return vector holding the gathered values in the active lanes
 */
static FORCEINLINE __vec16_i8
__gather_base_offsets32_i8(uint8_t *base, uint32_t scale, __vec16_i32 offsets,
                           __vec16_i1 mask) {
    // TODO: replace this scalar per-lane loop with a vectorized gather.
    // NOTE(review): if offsets[i] is a signed 32-bit value, `scale * offsets[i]`
    // is evaluated in unsigned 32-bit arithmetic and a negative offset would
    // wrap instead of being sign-extended -- confirm whether callers can pass
    // negative offsets.
    __vec16_i8 ret;
    for (int i = 0; i < 16; ++i) {
        if ((mask & (1 << i)) != 0) {
            // The base pointer parameter is `base`; no `_base` is declared in
            // this function.
            int8_t *ptr = (int8_t *)(base + scale * offsets[i]);
            ret[i] = *ptr;
        }
    }
    return ret;
}
|
||||||
|
|
||||||
/**
 * Masked gather of 16 16-bit lanes using 32-bit per-lane offsets.
 *
 * For every lane i whose bit is set in `mask`, loads an int16_t from the byte
 * address `base + scale * offsets[i]` into lane i of the result. Lanes whose
 * mask bit is clear perform no memory access and are not written, so their
 * contents in the returned vector are whatever `ret` held after its
 * declaration.
 *
 * @param base     base address the scaled offsets are added to
 * @param scale    multiplier applied to each lane's offset
 * @param offsets  per-lane offsets
 * @param mask     per-lane execution mask (bit i controls lane i)
 * @return vector holding the gathered values in the active lanes
 */
static FORCEINLINE __vec16_i16
__gather_base_offsets32_i16(uint8_t *base, uint32_t scale, __vec16_i32 offsets,
                            __vec16_i1 mask) {
    // TODO: replace this scalar per-lane loop with a vectorized gather.
    // NOTE(review): if offsets[i] is a signed 32-bit value, `scale * offsets[i]`
    // is evaluated in unsigned 32-bit arithmetic and a negative offset would
    // wrap instead of being sign-extended -- confirm whether callers can pass
    // negative offsets.
    __vec16_i16 ret;
    for (int i = 0; i < 16; ++i) {
        if ((mask & (1 << i)) != 0) {
            // The base pointer parameter is `base`; no `_base` is declared in
            // this function.
            int16_t *ptr = (int16_t *)(base + scale * offsets[i]);
            ret[i] = *ptr;
        }
    }
    return ret;
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user