Stop factoring out constant offsets for gather/scatter if a HW instruction is available.
For KNC, which has gather/scatter instructions, it's not helpful to factor
base+offsets gathers and scatters into
base_ptr + {1/2/4/8} * varying_offsets + const_offsets.
Now, if a HW instruction is available for gather/scatter, we just factor
into base + {1/2/4/8} * offsets (if possible). Not only is this simpler,
but it is also exactly the form we need in order to pass the scale value
along to the 1/2/4/8 scale operand available directly in those instructions.
Finishes issue #325.
This commit is contained in:
@@ -1940,60 +1940,33 @@ static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val,
|
||||
|
||||
// offsets * offsetScale is in bytes (for all of these)
|
||||
|
||||
#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)
|
||||
/*
|
||||
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
uint32_t scale, OTYPE constOffset, \
|
||||
__vec16_i1 mask) { \
|
||||
VTYPE ret; \
|
||||
int8_t *base = (int8_t *)b; \
|
||||
for (int i = 0; i < 16; ++i) \
|
||||
if ((mask.v & (1 << i)) != 0) { \
|
||||
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
|
||||
constOffset.v[i]); \
|
||||
ret.v[i] = *ptr; \
|
||||
} \
|
||||
return ret; \
|
||||
}
|
||||
*/
|
||||
|
||||
static FORCEINLINE __vec16_i32
|
||||
__gather_factored_base_offsets32_i32(uint8_t *base, __vec16_i32 varyingOffset,
|
||||
uint32_t scale, __vec16_i32 constOffset,
|
||||
__vec16_i1 mask) {
|
||||
__vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
|
||||
__vec16_i32 offsets = __add(__mul(vscale, varyingOffset), constOffset);
|
||||
__vec16_i32 tmp;
|
||||
|
||||
__gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec16_i32 offsets,
|
||||
__vec16_i1 mask) {
|
||||
// Loop is generated by intrinsic
|
||||
__vec16_i32 ret = _mm512_mask_i32extgather_epi32(tmp, mask, offsets, base,
|
||||
_MM_UPCONV_EPI32_NONE, 1,
|
||||
_MM_UPCONV_EPI32_NONE, scale,
|
||||
_MM_HINT_NONE);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_f
|
||||
__gather_factored_base_offsets32_float(uint8_t *base, __vec16_i32 varyingOffset,
|
||||
uint32_t scale, __vec16_i32 constOffset,
|
||||
__gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets,
|
||||
__vec16_i1 mask) {
|
||||
__vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
|
||||
__vec16_i32 offsets = __add(__mul(vscale, varyingOffset), constOffset);
|
||||
__vec16_f tmp;
|
||||
|
||||
// Loop is generated by intrinsic
|
||||
__vec16_f ret = _mm512_mask_i32extgather_ps(tmp, mask, offsets, base,
|
||||
_MM_UPCONV_PS_NONE, 1,
|
||||
__vec16_f ret = _mm512_mask_i32extgather_ps(tmp, mask, offsets, base,
|
||||
_MM_UPCONV_PS_NONE, scale,
|
||||
_MM_HINT_NONE);
|
||||
return ret;
|
||||
}
|
||||
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_factored_base_offsets32_i8)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_factored_base_offsets64_i8)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_factored_base_offsets32_i16)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_factored_base_offsets64_i16)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_factored_base_offsets64_i32)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_factored_base_offsets32_i64)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_factored_base_offsets64_i64)
|
||||
//GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
|
||||
//GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8)
|
||||
//GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16)
|
||||
//GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16)
|
||||
//GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32)
|
||||
//GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64)
|
||||
//GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64)
|
||||
|
||||
#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC)
|
||||
/*
|
||||
@@ -2039,45 +2012,30 @@ static FORCEINLINE __vec16_i32 __gather64_i32(__vec16_i64 ptrs, __vec16_i1 mask)
|
||||
*/
|
||||
// scatter
|
||||
|
||||
#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)
|
||||
/*
|
||||
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
uint32_t scale, OTYPE constOffset, \
|
||||
VTYPE val, __vec16_i1 mask) { \
|
||||
int8_t *base = (int8_t *)b; \
|
||||
for (int i = 0; i < 16; ++i) \
|
||||
if ((mask.v & (1 << i)) != 0) { \
|
||||
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
|
||||
constOffset.v[i]); \
|
||||
*ptr = val.v[i]; \
|
||||
} \
|
||||
}
|
||||
*/
|
||||
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_factored_base_offsets32_i8)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_factored_base_offsets64_i8)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_factored_base_offsets32_i16)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_factored_base_offsets64_i16)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_factored_base_offsets64_i32)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_factored_base_offsets32_i64)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_factored_base_offsets64_i64)
|
||||
//SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8)
|
||||
//SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8)
|
||||
//SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16)
|
||||
//SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16)
|
||||
//SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32)
|
||||
//SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64)
|
||||
//SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64)
|
||||
|
||||
static FORCEINLINE void
|
||||
__scatter_factored_base_offsets32_i32(uint8_t *b, __vec16_i32 varyingOffset,
|
||||
uint32_t scale, __vec16_i32 constOffset,
|
||||
__scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec16_i32 offsets,
|
||||
__vec16_i32 val, __vec16_i1 mask)
|
||||
{
|
||||
__vec16_i32 offsets = __add(__mul(__vec16_i32(scale), varyingOffset), constOffset);
|
||||
_mm512_mask_i32extscatter_epi32(b, mask, offsets, val, _MM_DOWNCONV_EPI32_NONE, 1, _MM_HINT_NONE);
|
||||
_mm512_mask_i32extscatter_epi32(b, mask, offsets, val,
|
||||
_MM_DOWNCONV_EPI32_NONE, scale,
|
||||
_MM_HINT_NONE);
|
||||
}
|
||||
|
||||
static FORCEINLINE void
|
||||
__scatter_factored_base_offsets32_float(void *base, const __vec16_i32 &varyingOffset,
|
||||
uint32_t scale, const __vec16_i32 &constOffset,
|
||||
const __vec16_f &val, const __vec16_i1 mask)
|
||||
__scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets,
|
||||
__vec16_f val, __vec16_i1 mask)
|
||||
{
|
||||
__vec16_i32 offsets = __add(__mul(varyingOffset,__vec16_i32(scale)), constOffset);
|
||||
_mm512_mask_i32extscatter_ps(base, mask, offsets, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_1, _MM_HINT_NONE);
|
||||
_mm512_mask_i32extscatter_ps(base, mask, offsets, val,
|
||||
_MM_DOWNCONV_PS_NONE, scale,
|
||||
_MM_HINT_NONE);
|
||||
}
|
||||
|
||||
#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC)
|
||||
|
||||
Reference in New Issue
Block a user