Rename gather/scatter_base_offsets functions to *factored_base_offsets*.
No functional change; just preparation for having a path that doesn't factor the offsets into constant and varying parts, which will be better for AVX2 and KNC.
This commit is contained in:
@@ -1306,7 +1306,7 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val,
|
||||
|
||||
// offsets * offsetScale is in bytes (for all of these)
|
||||
|
||||
#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
|
||||
#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
|
||||
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
uint32_t scale, OTYPE constOffset, \
|
||||
__vec16_i1 mask) { \
|
||||
@@ -1322,18 +1322,18 @@ static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
}
|
||||
|
||||
|
||||
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
|
||||
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8)
|
||||
GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16)
|
||||
GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16)
|
||||
GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32)
|
||||
GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32)
|
||||
GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float)
|
||||
GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float)
|
||||
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64)
|
||||
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64)
|
||||
GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_base_offsets32_double)
|
||||
GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_base_offsets64_double)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_factored_base_offsets32_i8)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_factored_base_offsets64_i8)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_factored_base_offsets32_i16)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_factored_base_offsets64_i16)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_factored_base_offsets32_i32)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_factored_base_offsets64_i32)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_factored_base_offsets32_float)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_factored_base_offsets64_float)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_factored_base_offsets32_i64)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_factored_base_offsets64_i64)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_factored_base_offsets32_double)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_factored_base_offsets64_double)
|
||||
|
||||
#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
|
||||
static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \
|
||||
@@ -1361,7 +1361,7 @@ GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double)
|
||||
|
||||
// scatter
|
||||
|
||||
#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
|
||||
#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
|
||||
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
uint32_t scale, OTYPE constOffset, \
|
||||
VTYPE val, __vec16_i1 mask) { \
|
||||
@@ -1375,18 +1375,18 @@ static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
}
|
||||
|
||||
|
||||
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8)
|
||||
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8)
|
||||
SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16)
|
||||
SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16)
|
||||
SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32)
|
||||
SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32)
|
||||
SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float)
|
||||
SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float)
|
||||
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64)
|
||||
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64)
|
||||
SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_base_offsets32_double)
|
||||
SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_base_offsets64_double)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_factored_base_offsets32_i8)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_factored_base_offsets64_i8)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_factored_base_offsets32_i16)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_factored_base_offsets64_i16)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_factored_base_offsets32_i32)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_factored_base_offsets64_i32)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_factored_base_offsets32_float)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_factored_base_offsets64_float)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_factored_base_offsets32_i64)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_factored_base_offsets64_i64)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_factored_base_offsets32_double)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_factored_base_offsets64_double)
|
||||
|
||||
#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
|
||||
static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \
|
||||
|
||||
@@ -1374,7 +1374,7 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec32_d val,
|
||||
|
||||
// offsets * offsetScale is in bytes (for all of these)
|
||||
|
||||
#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
|
||||
#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
|
||||
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
uint32_t scale, OTYPE constOffset, \
|
||||
__vec32_i1 mask) { \
|
||||
@@ -1390,18 +1390,18 @@ static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
}
|
||||
|
||||
|
||||
GATHER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __gather_base_offsets32_i8)
|
||||
GATHER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __gather_base_offsets64_i8)
|
||||
GATHER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __gather_base_offsets32_i16)
|
||||
GATHER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __gather_base_offsets64_i16)
|
||||
GATHER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __gather_base_offsets32_i32)
|
||||
GATHER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __gather_base_offsets64_i32)
|
||||
GATHER_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __gather_base_offsets32_float)
|
||||
GATHER_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __gather_base_offsets64_float)
|
||||
GATHER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __gather_base_offsets32_i64)
|
||||
GATHER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __gather_base_offsets64_i64)
|
||||
GATHER_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __gather_base_offsets32_double)
|
||||
GATHER_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __gather_base_offsets64_double)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __gather_factored_base_offsets32_i8)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __gather_factored_base_offsets64_i8)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __gather_factored_base_offsets32_i16)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __gather_factored_base_offsets64_i16)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __gather_factored_base_offsets32_i32)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __gather_factored_base_offsets64_i32)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __gather_factored_base_offsets32_float)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __gather_factored_base_offsets64_float)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __gather_factored_base_offsets32_i64)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __gather_factored_base_offsets64_i64)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __gather_factored_base_offsets32_double)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __gather_factored_base_offsets64_double)
|
||||
|
||||
#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
|
||||
static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec32_i1 mask) { \
|
||||
@@ -1429,7 +1429,7 @@ GATHER_GENERAL(__vec32_d, double, __vec32_i64, __gather64_double)
|
||||
|
||||
// scatter
|
||||
|
||||
#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
|
||||
#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
|
||||
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
uint32_t scale, OTYPE constOffset, \
|
||||
VTYPE val, __vec32_i1 mask) { \
|
||||
@@ -1443,18 +1443,18 @@ static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
}
|
||||
|
||||
|
||||
SCATTER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __scatter_base_offsets32_i8)
|
||||
SCATTER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __scatter_base_offsets64_i8)
|
||||
SCATTER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __scatter_base_offsets32_i16)
|
||||
SCATTER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __scatter_base_offsets64_i16)
|
||||
SCATTER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __scatter_base_offsets32_i32)
|
||||
SCATTER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __scatter_base_offsets64_i32)
|
||||
SCATTER_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __scatter_base_offsets32_float)
|
||||
SCATTER_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __scatter_base_offsets64_float)
|
||||
SCATTER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __scatter_base_offsets32_i64)
|
||||
SCATTER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __scatter_base_offsets64_i64)
|
||||
SCATTER_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __scatter_base_offsets32_double)
|
||||
SCATTER_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __scatter_base_offsets64_double)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __scatter_factored_base_offsets32_i8)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __scatter_factored_base_offsets64_i8)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __scatter_factored_base_offsets32_i16)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __scatter_factored_base_offsets64_i16)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __scatter_factored_base_offsets32_i32)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __scatter_factored_base_offsets64_i32)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __scatter_factored_base_offsets32_float)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __scatter_factored_base_offsets64_float)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __scatter_factored_base_offsets32_i64)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __scatter_factored_base_offsets64_i64)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __scatter_factored_base_offsets32_double)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __scatter_factored_base_offsets64_double)
|
||||
|
||||
#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
|
||||
static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec32_i1 mask) { \
|
||||
|
||||
@@ -1507,7 +1507,7 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec64_d val,
|
||||
|
||||
// offsets * offsetScale is in bytes (for all of these)
|
||||
|
||||
#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
|
||||
#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
|
||||
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
uint32_t scale, OTYPE constOffset, \
|
||||
__vec64_i1 mask) { \
|
||||
@@ -1523,18 +1523,18 @@ static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
}
|
||||
|
||||
|
||||
GATHER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __gather_base_offsets32_i8)
|
||||
GATHER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __gather_base_offsets64_i8)
|
||||
GATHER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __gather_base_offsets32_i16)
|
||||
GATHER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __gather_base_offsets64_i16)
|
||||
GATHER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __gather_base_offsets32_i32)
|
||||
GATHER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __gather_base_offsets64_i32)
|
||||
GATHER_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __gather_base_offsets32_float)
|
||||
GATHER_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __gather_base_offsets64_float)
|
||||
GATHER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __gather_base_offsets32_i64)
|
||||
GATHER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __gather_base_offsets64_i64)
|
||||
GATHER_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __gather_base_offsets32_double)
|
||||
GATHER_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __gather_base_offsets64_double)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __gather_factored_base_offsets32_i8)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __gather_factored_base_offsets64_i8)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __gather_factored_base_offsets32_i16)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __gather_factored_base_offsets64_i16)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __gather_factored_base_offsets32_i32)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __gather_factored_base_offsets64_i32)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __gather_factored_base_offsets32_float)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __gather_factored_base_offsets64_float)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __gather_factored_base_offsets32_i64)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __gather_factored_base_offsets64_i64)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __gather_factored_base_offsets32_double)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __gather_factored_base_offsets64_double)
|
||||
|
||||
#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
|
||||
static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec64_i1 mask) { \
|
||||
@@ -1562,7 +1562,7 @@ GATHER_GENERAL(__vec64_d, double, __vec64_i64, __gather64_double)
|
||||
|
||||
// scatter
|
||||
|
||||
#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
|
||||
#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
|
||||
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
uint32_t scale, OTYPE constOffset, \
|
||||
VTYPE val, __vec64_i1 mask) { \
|
||||
@@ -1576,18 +1576,18 @@ static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
}
|
||||
|
||||
|
||||
SCATTER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __scatter_base_offsets32_i8)
|
||||
SCATTER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __scatter_base_offsets64_i8)
|
||||
SCATTER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __scatter_base_offsets32_i16)
|
||||
SCATTER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __scatter_base_offsets64_i16)
|
||||
SCATTER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __scatter_base_offsets32_i32)
|
||||
SCATTER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __scatter_base_offsets64_i32)
|
||||
SCATTER_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __scatter_base_offsets32_float)
|
||||
SCATTER_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __scatter_base_offsets64_float)
|
||||
SCATTER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __scatter_base_offsets32_i64)
|
||||
SCATTER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __scatter_base_offsets64_i64)
|
||||
SCATTER_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __scatter_base_offsets32_double)
|
||||
SCATTER_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __scatter_base_offsets64_double)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __scatter_factored_base_offsets32_i8)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __scatter_factored_base_offsets64_i8)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __scatter_factored_base_offsets32_i16)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __scatter_factored_base_offsets64_i16)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __scatter_factored_base_offsets32_i32)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __scatter_factored_base_offsets64_i32)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __scatter_factored_base_offsets32_float)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __scatter_factored_base_offsets64_float)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __scatter_factored_base_offsets32_i64)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __scatter_factored_base_offsets64_i64)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __scatter_factored_base_offsets32_double)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __scatter_factored_base_offsets64_double)
|
||||
|
||||
#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
|
||||
static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec64_i1 mask) { \
|
||||
|
||||
@@ -1940,7 +1940,7 @@ static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val,
|
||||
|
||||
// offsets * offsetScale is in bytes (for all of these)
|
||||
|
||||
#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)
|
||||
#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)
|
||||
/*
|
||||
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
uint32_t scale, OTYPE constOffset, \
|
||||
@@ -1958,7 +1958,7 @@ static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
*/
|
||||
|
||||
static FORCEINLINE __vec16_i32
|
||||
__gather_base_offsets32_i32(uint8_t *base, __vec16_i32 varyingOffset,
|
||||
__gather_factored_base_offsets32_i32(uint8_t *base, __vec16_i32 varyingOffset,
|
||||
uint32_t scale, __vec16_i32 constOffset,
|
||||
__vec16_i1 mask) {
|
||||
__vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
|
||||
@@ -1973,7 +1973,7 @@ __gather_base_offsets32_i32(uint8_t *base, __vec16_i32 varyingOffset,
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_f
|
||||
__gather_base_offsets32_float(uint8_t *base, __vec16_i32 varyingOffset,
|
||||
__gather_factored_base_offsets32_float(uint8_t *base, __vec16_i32 varyingOffset,
|
||||
uint32_t scale, __vec16_i32 constOffset,
|
||||
__vec16_i1 mask) {
|
||||
__vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
|
||||
@@ -1987,13 +1987,13 @@ __gather_base_offsets32_float(uint8_t *base, __vec16_i32 varyingOffset,
|
||||
return ret;
|
||||
}
|
||||
|
||||
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
|
||||
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8)
|
||||
GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16)
|
||||
GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16)
|
||||
GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32)
|
||||
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64)
|
||||
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_factored_base_offsets32_i8)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_factored_base_offsets64_i8)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_factored_base_offsets32_i16)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_factored_base_offsets64_i16)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_factored_base_offsets64_i32)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_factored_base_offsets32_i64)
|
||||
GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_factored_base_offsets64_i64)
|
||||
|
||||
#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC)
|
||||
/*
|
||||
@@ -2039,7 +2039,7 @@ static FORCEINLINE __vec16_i32 __gather64_i32(__vec16_i64 ptrs, __vec16_i1 mask)
|
||||
*/
|
||||
// scatter
|
||||
|
||||
#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)
|
||||
#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)
|
||||
/*
|
||||
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
uint32_t scale, OTYPE constOffset, \
|
||||
@@ -2054,16 +2054,16 @@ static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
}
|
||||
*/
|
||||
|
||||
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8)
|
||||
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8)
|
||||
SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16)
|
||||
SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16)
|
||||
SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32)
|
||||
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64)
|
||||
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_factored_base_offsets32_i8)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_factored_base_offsets64_i8)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_factored_base_offsets32_i16)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_factored_base_offsets64_i16)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_factored_base_offsets64_i32)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_factored_base_offsets32_i64)
|
||||
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_factored_base_offsets64_i64)
|
||||
|
||||
static FORCEINLINE void
|
||||
__scatter_base_offsets32_i32(uint8_t *b, __vec16_i32 varyingOffset,
|
||||
__scatter_factored_base_offsets32_i32(uint8_t *b, __vec16_i32 varyingOffset,
|
||||
uint32_t scale, __vec16_i32 constOffset,
|
||||
__vec16_i32 val, __vec16_i1 mask)
|
||||
{
|
||||
@@ -2072,7 +2072,7 @@ __scatter_base_offsets32_i32(uint8_t *b, __vec16_i32 varyingOffset,
|
||||
}
|
||||
|
||||
static FORCEINLINE void
|
||||
__scatter_base_offsets32_float(void *base, const __vec16_i32 &varyingOffset,
|
||||
__scatter_factored_base_offsets32_float(void *base, const __vec16_i32 &varyingOffset,
|
||||
uint32_t scale, const __vec16_i32 &constOffset,
|
||||
const __vec16_f &val, const __vec16_i1 mask)
|
||||
{
|
||||
|
||||
@@ -3007,84 +3007,84 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_i8
|
||||
__gather_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets,
|
||||
__gather_factored_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets,
|
||||
uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
|
||||
return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, offsets, scale,
|
||||
constOffset, mask);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_i8
|
||||
__gather_base_offsets64_i8(unsigned char *b, __vec4_i64 offsets,
|
||||
__gather_factored_base_offsets64_i8(unsigned char *b, __vec4_i64 offsets,
|
||||
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
|
||||
return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, offsets, scale,
|
||||
constOffset, mask);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_i16
|
||||
__gather_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets,
|
||||
__gather_factored_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets,
|
||||
uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
|
||||
return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, offsets, scale,
|
||||
constOffset, mask);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_i16
|
||||
__gather_base_offsets64_i16(unsigned char *b, __vec4_i64 offsets,
|
||||
__gather_factored_base_offsets64_i16(unsigned char *b, __vec4_i64 offsets,
|
||||
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
|
||||
return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, offsets, scale,
|
||||
constOffset, mask);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_i32
|
||||
__gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale,
|
||||
__gather_factored_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale,
|
||||
__vec4_i32 constOffset, __vec4_i1 mask) {
|
||||
return lGatherBaseOffsets32(__vec4_i32(), uint32_t(), p, offsets, scale,
|
||||
constOffset, mask);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_i32
|
||||
__gather_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets,
|
||||
__gather_factored_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets,
|
||||
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
|
||||
return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, offsets, scale,
|
||||
constOffset, mask);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_f
|
||||
__gather_base_offsets32_float(uint8_t *p, __vec4_i32 offsets, uint32_t scale,
|
||||
__gather_factored_base_offsets32_float(uint8_t *p, __vec4_i32 offsets, uint32_t scale,
|
||||
__vec4_i32 constOffset, __vec4_i1 mask) {
|
||||
return lGatherBaseOffsets32(__vec4_f(), float(), p, offsets, scale,
|
||||
constOffset, mask);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_f
|
||||
__gather_base_offsets64_float(unsigned char *p, __vec4_i64 offsets,
|
||||
__gather_factored_base_offsets64_float(unsigned char *p, __vec4_i64 offsets,
|
||||
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
|
||||
return lGatherBaseOffsets64(__vec4_f(), float(), p, offsets, scale,
|
||||
constOffset, mask);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_i64
|
||||
__gather_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
|
||||
__gather_factored_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
|
||||
uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
|
||||
return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, offsets, scale,
|
||||
constOffset, mask);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_i64
|
||||
__gather_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
|
||||
__gather_factored_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
|
||||
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
|
||||
return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, offsets, scale,
|
||||
constOffset, mask);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_d
|
||||
__gather_base_offsets32_double(unsigned char *p, __vec4_i32 offsets,
|
||||
__gather_factored_base_offsets32_double(unsigned char *p, __vec4_i32 offsets,
|
||||
uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
|
||||
return lGatherBaseOffsets32(__vec4_d(), double(), p, offsets, scale,
|
||||
constOffset, mask);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_d
|
||||
__gather_base_offsets64_double(unsigned char *p, __vec4_i64 offsets,
|
||||
__gather_factored_base_offsets64_double(unsigned char *p, __vec4_i64 offsets,
|
||||
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
|
||||
return lGatherBaseOffsets64(__vec4_d(), double(), p, offsets, scale,
|
||||
constOffset, mask);
|
||||
@@ -3252,7 +3252,7 @@ static FORCEINLINE __vec4_d __gather64_double(__vec4_i64 ptrs, __vec4_i1 mask) {
|
||||
|
||||
#define SCATTER32_64(SUFFIX, VEC_SUFFIX, TYPE, EXTRACT) \
|
||||
static FORCEINLINE void \
|
||||
__scatter_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \
|
||||
__scatter_factored_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \
|
||||
uint32_t scale, __vec4_i32 constOffset, \
|
||||
__vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \
|
||||
uint32_t m = _mm_extract_ps(mask.v, 0); \
|
||||
@@ -3281,7 +3281,7 @@ __scatter_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \
|
||||
} \
|
||||
} \
|
||||
static FORCEINLINE void \
|
||||
__scatter_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets, \
|
||||
__scatter_factored_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets, \
|
||||
uint32_t scale, __vec4_i64 constOffset, \
|
||||
__vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \
|
||||
uint32_t m = _mm_extract_ps(mask.v, 0); \
|
||||
@@ -3322,7 +3322,7 @@ SCATTER32_64(float, f, float, _mm_extract_ps_as_float)
|
||||
|
||||
|
||||
static FORCEINLINE void
|
||||
__scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
|
||||
__scatter_factored_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
|
||||
uint32_t scale, __vec4_i32 constOffset, __vec4_i64 val,
|
||||
__vec4_i1 mask) {
|
||||
uint32_t m = _mm_extract_ps(mask.v, 0);
|
||||
@@ -3359,7 +3359,7 @@ __scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
|
||||
}
|
||||
|
||||
static FORCEINLINE void
|
||||
__scatter_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
|
||||
__scatter_factored_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
|
||||
uint32_t scale, __vec4_i64 constOffset,
|
||||
__vec4_i64 val, __vec4_i1 mask) {
|
||||
uint32_t m = _mm_extract_ps(mask.v, 0);
|
||||
@@ -3396,17 +3396,17 @@ __scatter_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
|
||||
}
|
||||
|
||||
static FORCEINLINE void
|
||||
__scatter_base_offsets32_double(unsigned char *p, __vec4_i32 offsets,
|
||||
__scatter_factored_base_offsets32_double(unsigned char *p, __vec4_i32 offsets,
|
||||
uint32_t scale, __vec4_i32 constOffset, __vec4_d val,
|
||||
__vec4_i1 mask) {
|
||||
__scatter_base_offsets32_i64(p, offsets, scale, constOffset, val, mask);
|
||||
__scatter_factored_base_offsets32_i64(p, offsets, scale, constOffset, val, mask);
|
||||
}
|
||||
|
||||
static FORCEINLINE void
|
||||
__scatter_base_offsets64_double(unsigned char *p, __vec4_i64 offsets,
|
||||
__scatter_factored_base_offsets64_double(unsigned char *p, __vec4_i64 offsets,
|
||||
uint32_t scale, __vec4_i64 constOffset, __vec4_d val,
|
||||
__vec4_i1 mask) {
|
||||
__scatter_base_offsets64_i64(p, offsets, scale, constOffset, val, mask);
|
||||
__scatter_factored_base_offsets64_i64(p, offsets, scale, constOffset, val, mask);
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user