Extract constant offsets from gather/scatter base+offsets offset vectors.
When we're able to turn a general gather/scatter into the "base + offsets" form, we now try to extract any constant components of the offsets and pass them as a separate parameter to the gather/scatter function implementation. We then carefully emit the code for the addressing calculation so that it matches the patterns LLVM uses to detect this case; as a result, in many cases the constant offsets are encoded directly in the instruction's addressing calculation, saving the arithmetic instructions that would otherwise be needed to compute them. Improves performance of stencil by ~15%. Other workloads unchanged.
This commit is contained in:
@@ -1060,13 +1060,15 @@ static FORCEINLINE void __masked_store_blend_64(void *p, __vec16_i64 val,
|
||||
// varyingOffset * scale + constOffset is a byte offset from the base pointer (for all of these)
|
||||
|
||||
#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
|
||||
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE offsets, uint32_t scale,\
|
||||
__vec16_i1 mask) { \
|
||||
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
uint32_t scale, OTYPE constOffset, \
|
||||
__vec16_i1 mask) { \
|
||||
VTYPE ret; \
|
||||
int8_t *base = (int8_t *)b; \
|
||||
for (int i = 0; i < 16; ++i) \
|
||||
if ((mask.v & (1 << i)) != 0) { \
|
||||
STYPE *ptr = (STYPE *)(base + scale * offsets.v[i]); \
|
||||
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
|
||||
constOffset.v[i]); \
|
||||
ret.v[i] = *ptr; \
|
||||
} \
|
||||
return ret; \
|
||||
@@ -1104,13 +1106,15 @@ GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64)
|
||||
|
||||
// scatter
|
||||
|
||||
#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
|
||||
static FORCEINLINE void FUNC(unsigned char *b, OTYPE offsets, uint32_t scale,\
|
||||
#define SCATTER_BASE_VARYINGOFFSET(VTYPE, STYPE, OTYPE, FUNC) \
|
||||
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
uint32_t scale, OTYPE constOffset, \
|
||||
VTYPE val, __vec16_i1 mask) { \
|
||||
int8_t *base = (int8_t *)b; \
|
||||
for (int i = 0; i < 16; ++i) \
|
||||
if ((mask.v & (1 << i)) != 0) { \
|
||||
STYPE *ptr = (STYPE *)(base + scale * offsets.v[i]); \
|
||||
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
|
||||
constOffset.v[i]); \
|
||||
*ptr = val.v[i]; \
|
||||
} \
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user