Stop factoring out constant offsets for gather/scatter if instr is available.

For KNC, it's not helpful to factor base+offsets gathers and scatters
into base_ptr + {1/2/4/8} * varying_offsets + const_offsets.
Now, if a HW gather/scatter instruction is available, we just factor
into base + {1/2/4/8} * offsets (when possible).  Not only is this
simpler, it also lets us pass the {1/2/4/8} scale straight through to
the scale operand those instructions provide.
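
As a rough scalar sketch of the addressing math (illustrative only; the
helper names below are made up for exposition and are not the actual ispc
builtins):

    #include <cstdint>

    // Old factored form: per-lane address = base + scale * varyingOffset + constOffset.
    static inline float gather_lane_factored(const unsigned char *base, uint32_t scale,
                                             int32_t varyingOffset, int32_t constOffset) {
        return *(const float *)(base + scale * varyingOffset + constOffset);
    }

    // New form used when the target has a native gather/scatter: a single offset
    // plus a {1,2,4,8} scale, which maps directly onto the instruction's scale
    // operand rather than being folded away with an extra add.
    static inline float gather_lane(const unsigned char *base, uint32_t scale,
                                    int32_t offset) {
        return *(const float *)(base + scale * offset);
    }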

Finishes issue #325.
Matt Pharr
2012-07-11 14:52:14 -07:00
parent c09c87873e
commit 216ac4b1a4
7 changed files with 257 additions and 336 deletions

View File

@@ -32,6 +32,9 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128-v16:16:16-v32:32:32";
define(`MASK',`i1')
define(`HAVE_GATHER',`1')
define(`HAVE_SCATTER',`1')
include(`util.m4')
stdlib_core()
@@ -334,19 +337,19 @@ define void @__masked_store_blend_double(<WIDTH x double>* nocapture,
;; gather/scatter
define(`gather_scatter', `
declare <WIDTH x $1> @__gather_factored_base_offsets32_$1(i8 * nocapture, <WIDTH x i32>,
i32, <WIDTH x i32>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_factored_base_offsets64_$1(i8 * nocapture, <WIDTH x i64>,
i32, <WIDTH x i64>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, i32, <WIDTH x i32>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64>,
<WIDTH x i1>) nounwind readonly
declare void @__scatter_factored_base_offsets32_$1(i8* nocapture, <WIDTH x i32>,
i32, <WIDTH x i32>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_factored_base_offsets64_$1(i8* nocapture, <WIDTH x i64>,
i32, <WIDTH x i64>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets32_$1(i8* nocapture, i32, <WIDTH x i32>,
<WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets64_$1(i8* nocapture, i32, <WIDTH x i64>,
<WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter32_$1(<WIDTH x i32>, <WIDTH x $1>,
<WIDTH x i1>) nounwind
declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,

View File

@@ -1306,34 +1306,32 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val,
// offsets * offsetScale is in bytes (for all of these)
#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
__vec16_i1 mask) { \
#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \
OTYPE offset, __vec16_i1 mask) { \
VTYPE ret; \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 16; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
ret.v[i] = *ptr; \
} \
return ret; \
}
GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_factored_base_offsets32_i8)
GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_factored_base_offsets64_i8)
GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_factored_base_offsets32_i16)
GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_factored_base_offsets64_i16)
GATHER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_factored_base_offsets32_i32)
GATHER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_factored_base_offsets64_i32)
GATHER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_factored_base_offsets32_float)
GATHER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_factored_base_offsets64_float)
GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_factored_base_offsets32_i64)
GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_factored_base_offsets64_i64)
GATHER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_factored_base_offsets32_double)
GATHER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_factored_base_offsets64_double)
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8)
GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16)
GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16)
GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32)
GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32)
GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float)
GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float)
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64)
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64)
GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_base_offsets32_double)
GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_base_offsets64_double)
#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \
@@ -1361,32 +1359,31 @@ GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double)
// scatter
#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
VTYPE val, __vec16_i1 mask) { \
#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \
OTYPE offset, VTYPE val, \
__vec16_i1 mask) { \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 16; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
*ptr = val.v[i]; \
} \
}
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_factored_base_offsets32_i8)
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_factored_base_offsets64_i8)
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_factored_base_offsets32_i16)
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_factored_base_offsets64_i16)
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_factored_base_offsets32_i32)
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_factored_base_offsets64_i32)
SCATTER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_factored_base_offsets32_float)
SCATTER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_factored_base_offsets64_float)
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_factored_base_offsets32_i64)
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_factored_base_offsets64_i64)
SCATTER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_factored_base_offsets32_double)
SCATTER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_factored_base_offsets64_double)
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8)
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8)
SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16)
SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16)
SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32)
SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32)
SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float)
SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float)
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64)
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64)
SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_base_offsets32_double)
SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_base_offsets64_double)
#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \

View File

@@ -1374,34 +1374,32 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec32_d val,
// offsets * offsetScale is in bytes (for all of these)
#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
__vec32_i1 mask) { \
#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \
OTYPE offset, __vec32_i1 mask) { \
VTYPE ret; \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 32; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
ret.v[i] = *ptr; \
} \
return ret; \
}
GATHER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __gather_factored_base_offsets32_i8)
GATHER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __gather_factored_base_offsets64_i8)
GATHER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __gather_factored_base_offsets32_i16)
GATHER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __gather_factored_base_offsets64_i16)
GATHER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __gather_factored_base_offsets32_i32)
GATHER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __gather_factored_base_offsets64_i32)
GATHER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __gather_factored_base_offsets32_float)
GATHER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __gather_factored_base_offsets64_float)
GATHER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __gather_factored_base_offsets32_i64)
GATHER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __gather_factored_base_offsets64_i64)
GATHER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __gather_factored_base_offsets32_double)
GATHER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __gather_factored_base_offsets64_double)
GATHER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __gather_base_offsets32_i8)
GATHER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __gather_base_offsets64_i8)
GATHER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __gather_base_offsets32_i16)
GATHER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __gather_base_offsets64_i16)
GATHER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __gather_base_offsets32_i32)
GATHER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __gather_base_offsets64_i32)
GATHER_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __gather_base_offsets32_float)
GATHER_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __gather_base_offsets64_float)
GATHER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __gather_base_offsets32_i64)
GATHER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __gather_base_offsets64_i64)
GATHER_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __gather_base_offsets32_double)
GATHER_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __gather_base_offsets64_double)
#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec32_i1 mask) { \
@@ -1429,32 +1427,30 @@ GATHER_GENERAL(__vec32_d, double, __vec32_i64, __gather64_double)
// scatter
#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
VTYPE val, __vec32_i1 mask) { \
#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \
OTYPE offset, VTYPE val, __vec32_i1 mask) { \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 32; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
*ptr = val.v[i]; \
} \
}
SCATTER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __scatter_factored_base_offsets32_i8)
SCATTER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __scatter_factored_base_offsets64_i8)
SCATTER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __scatter_factored_base_offsets32_i16)
SCATTER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __scatter_factored_base_offsets64_i16)
SCATTER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __scatter_factored_base_offsets32_i32)
SCATTER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __scatter_factored_base_offsets64_i32)
SCATTER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __scatter_factored_base_offsets32_float)
SCATTER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __scatter_factored_base_offsets64_float)
SCATTER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __scatter_factored_base_offsets32_i64)
SCATTER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __scatter_factored_base_offsets64_i64)
SCATTER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __scatter_factored_base_offsets32_double)
SCATTER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __scatter_factored_base_offsets64_double)
SCATTER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __scatter_base_offsets32_i8)
SCATTER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __scatter_base_offsets64_i8)
SCATTER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __scatter_base_offsets32_i16)
SCATTER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __scatter_base_offsets64_i16)
SCATTER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __scatter_base_offsets32_i32)
SCATTER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __scatter_base_offsets64_i32)
SCATTER_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __scatter_base_offsets32_float)
SCATTER_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __scatter_base_offsets64_float)
SCATTER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __scatter_base_offsets32_i64)
SCATTER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __scatter_base_offsets64_i64)
SCATTER_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __scatter_base_offsets32_double)
SCATTER_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __scatter_base_offsets64_double)
#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec32_i1 mask) { \

View File

@@ -1507,40 +1507,38 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec64_d val,
// offsets * offsetScale is in bytes (for all of these)
#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
__vec64_i1 mask) { \
#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \
OTYPE offset, __vec64_i1 mask) { \
VTYPE ret; \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 64; ++i) \
if ((mask.v & (1ull << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
if ((mask.v & (1ull << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
ret.v[i] = *ptr; \
} \
return ret; \
}
GATHER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __gather_factored_base_offsets32_i8)
GATHER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __gather_factored_base_offsets64_i8)
GATHER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __gather_factored_base_offsets32_i16)
GATHER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __gather_factored_base_offsets64_i16)
GATHER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __gather_factored_base_offsets32_i32)
GATHER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __gather_factored_base_offsets64_i32)
GATHER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __gather_factored_base_offsets32_float)
GATHER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __gather_factored_base_offsets64_float)
GATHER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __gather_factored_base_offsets32_i64)
GATHER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __gather_factored_base_offsets64_i64)
GATHER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __gather_factored_base_offsets32_double)
GATHER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __gather_factored_base_offsets64_double)
GATHER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __gather_base_offsets32_i8)
GATHER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __gather_base_offsets64_i8)
GATHER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __gather_base_offsets32_i16)
GATHER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __gather_base_offsets64_i16)
GATHER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __gather_base_offsets32_i32)
GATHER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __gather_base_offsets64_i32)
GATHER_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __gather_base_offsets32_float)
GATHER_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __gather_base_offsets64_float)
GATHER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __gather_base_offsets32_i64)
GATHER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __gather_base_offsets64_i64)
GATHER_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __gather_base_offsets32_double)
GATHER_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __gather_base_offsets64_double)
#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec64_i1 mask) { \
VTYPE ret; \
for (int i = 0; i < 64; ++i) \
if ((mask.v & (1ull << i)) != 0) { \
if ((mask.v & (1ull << i)) != 0) { \
STYPE *ptr = (STYPE *)ptrs.v[i]; \
ret.v[i] = *ptr; \
} \
@@ -1562,32 +1560,30 @@ GATHER_GENERAL(__vec64_d, double, __vec64_i64, __gather64_double)
// scatter
#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
VTYPE val, __vec64_i1 mask) { \
#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \
OTYPE offset, VTYPE val, __vec64_i1 mask) { \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 64; ++i) \
if ((mask.v & (1ull << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
if ((mask.v & (1ull << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
*ptr = val.v[i]; \
} \
}
SCATTER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __scatter_factored_base_offsets32_i8)
SCATTER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __scatter_factored_base_offsets64_i8)
SCATTER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __scatter_factored_base_offsets32_i16)
SCATTER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __scatter_factored_base_offsets64_i16)
SCATTER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __scatter_factored_base_offsets32_i32)
SCATTER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __scatter_factored_base_offsets64_i32)
SCATTER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __scatter_factored_base_offsets32_float)
SCATTER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __scatter_factored_base_offsets64_float)
SCATTER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __scatter_factored_base_offsets32_i64)
SCATTER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __scatter_factored_base_offsets64_i64)
SCATTER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __scatter_factored_base_offsets32_double)
SCATTER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __scatter_factored_base_offsets64_double)
SCATTER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __scatter_base_offsets32_i8)
SCATTER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __scatter_base_offsets64_i8)
SCATTER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __scatter_base_offsets32_i16)
SCATTER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __scatter_base_offsets64_i16)
SCATTER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __scatter_base_offsets32_i32)
SCATTER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __scatter_base_offsets64_i32)
SCATTER_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __scatter_base_offsets32_float)
SCATTER_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __scatter_base_offsets64_float)
SCATTER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __scatter_base_offsets32_i64)
SCATTER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __scatter_base_offsets64_i64)
SCATTER_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __scatter_base_offsets32_double)
SCATTER_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __scatter_base_offsets64_double)
#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec64_i1 mask) { \

View File

@@ -1940,60 +1940,33 @@ static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val,
// offsets * offsetScale is in bytes (for all of these)
#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)
/*
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
__vec16_i1 mask) { \
VTYPE ret; \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 16; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
ret.v[i] = *ptr; \
} \
return ret; \
}
*/
static FORCEINLINE __vec16_i32
__gather_factored_base_offsets32_i32(uint8_t *base, __vec16_i32 varyingOffset,
uint32_t scale, __vec16_i32 constOffset,
__gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec16_i32 offsets,
__vec16_i1 mask) {
__vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
__vec16_i32 offsets = __add(__mul(vscale, varyingOffset), constOffset);
__vec16_i32 tmp;
// Loop is generated by intrinsic
__vec16_i32 ret = _mm512_mask_i32extgather_epi32(tmp, mask, offsets, base,
_MM_UPCONV_EPI32_NONE, 1,
_MM_UPCONV_EPI32_NONE, scale,
_MM_HINT_NONE);
return ret;
}
static FORCEINLINE __vec16_f
__gather_factored_base_offsets32_float(uint8_t *base, __vec16_i32 varyingOffset,
uint32_t scale, __vec16_i32 constOffset,
__gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets,
__vec16_i1 mask) {
__vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
__vec16_i32 offsets = __add(__mul(vscale, varyingOffset), constOffset);
__vec16_f tmp;
// Loop is generated by intrinsic
__vec16_f ret = _mm512_mask_i32extgather_ps(tmp, mask, offsets, base,
_MM_UPCONV_PS_NONE, 1,
_MM_UPCONV_PS_NONE, scale,
_MM_HINT_NONE);
return ret;
}
GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_factored_base_offsets32_i8)
GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_factored_base_offsets64_i8)
GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_factored_base_offsets32_i16)
GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_factored_base_offsets64_i16)
GATHER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_factored_base_offsets64_i32)
GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_factored_base_offsets32_i64)
GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_factored_base_offsets64_i64)
//GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
//GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8)
//GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16)
//GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16)
//GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32)
//GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64)
//GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64)
#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC)
/*
@@ -2039,45 +2012,30 @@ static FORCEINLINE __vec16_i32 __gather64_i32(__vec16_i64 ptrs, __vec16_i1 mask)
*/
// scatter
#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)
/*
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
VTYPE val, __vec16_i1 mask) { \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 16; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
*ptr = val.v[i]; \
} \
}
*/
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_factored_base_offsets32_i8)
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_factored_base_offsets64_i8)
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_factored_base_offsets32_i16)
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_factored_base_offsets64_i16)
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_factored_base_offsets64_i32)
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_factored_base_offsets32_i64)
SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_factored_base_offsets64_i64)
//SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8)
//SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8)
//SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16)
//SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16)
//SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32)
//SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64)
//SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64)
static FORCEINLINE void
__scatter_factored_base_offsets32_i32(uint8_t *b, __vec16_i32 varyingOffset,
uint32_t scale, __vec16_i32 constOffset,
__scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec16_i32 offsets,
__vec16_i32 val, __vec16_i1 mask)
{
__vec16_i32 offsets = __add(__mul(__vec16_i32(scale), varyingOffset), constOffset);
_mm512_mask_i32extscatter_epi32(b, mask, offsets, val, _MM_DOWNCONV_EPI32_NONE, 1, _MM_HINT_NONE);
_mm512_mask_i32extscatter_epi32(b, mask, offsets, val,
_MM_DOWNCONV_EPI32_NONE, scale,
_MM_HINT_NONE);
}
static FORCEINLINE void
__scatter_factored_base_offsets32_float(void *base, const __vec16_i32 &varyingOffset,
uint32_t scale, const __vec16_i32 &constOffset,
const __vec16_f &val, const __vec16_i1 mask)
__scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets,
__vec16_f val, __vec16_i1 mask)
{
__vec16_i32 offsets = __add(__mul(varyingOffset,__vec16_i32(scale)), constOffset);
_mm512_mask_i32extscatter_ps(base, mask, offsets, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_1, _MM_HINT_NONE);
_mm512_mask_i32extscatter_ps(base, mask, offsets, val,
_MM_DOWNCONV_PS_NONE, scale,
_MM_HINT_NONE);
}
#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC)

View File

@@ -2892,54 +2892,53 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec4_d val,
template<typename RetVec, typename RetScalar>
static FORCEINLINE RetVec
lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets,
uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, uint32_t scale,
__vec4_i32 offsets, __vec4_i1 mask) {
RetScalar r[4];
#if 1
// "Fast gather" trick...
offsets = __select(mask, offsets, __setzero_i32());
constOffset = __select(mask, constOffset, __setzero_i32());
int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0);
int offset = scale * _mm_extract_epi32(offsets.v, 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[0] = *ptr;
offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1);
offset = scale * _mm_extract_epi32(offsets.v, 1);
ptr = (RetScalar *)(p + offset);
r[1] = *ptr;
offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2);
offset = scale * _mm_extract_epi32(offsets.v, 2);
ptr = (RetScalar *)(p + offset);
r[2] = *ptr;
offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3);
offset = scale * _mm_extract_epi32(offsets.v, 3);
ptr = (RetScalar *)(p + offset);
r[3] = *ptr;
#else
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0);
int offset = scale * _mm_extract_epi32(offsets.v, 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[0] = *ptr;
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1);
int offset = scale * _mm_extract_epi32(offsets.v, 1);
RetScalar *ptr = (RetScalar *)(p + offset);
r[1] = *ptr;
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2);
int offset = scale * _mm_extract_epi32(offsets.v, 2);
RetScalar *ptr = (RetScalar *)(p + offset);
r[2] = *ptr;
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3);
int offset = scale * _mm_extract_epi32(offsets.v, 3);
RetScalar *ptr = (RetScalar *)(p + offset);
r[3] = *ptr;
}
@@ -2950,54 +2949,53 @@ lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets,
template<typename RetVec, typename RetScalar>
static FORCEINLINE RetVec
lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, uint32_t scale,
__vec4_i64 offsets, __vec4_i1 mask) {
RetScalar r[4];
#if 1
// "Fast gather" trick...
offsets = __select(mask, offsets, __setzero_i64());
constOffset = __select(mask, constOffset, __setzero_i64());
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0);
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[0] = *ptr;
offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1);
offset = scale * _mm_extract_epi64(offsets.v[0], 1);
ptr = (RetScalar *)(p + offset);
r[1] = *ptr;
offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0);
offset = scale * _mm_extract_epi64(offsets.v[1], 0);
ptr = (RetScalar *)(p + offset);
r[2] = *ptr;
offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1);
offset = scale * _mm_extract_epi64(offsets.v[1], 1);
ptr = (RetScalar *)(p + offset);
r[3] = *ptr;
#else
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0);
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[0] = *ptr;
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1);
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
RetScalar *ptr = (RetScalar *)(p + offset);
r[1] = *ptr;
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0);
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[2] = *ptr;
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1);
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
RetScalar *ptr = (RetScalar *)(p + offset);
r[3] = *ptr;
}
@@ -3007,87 +3005,75 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,
}
static FORCEINLINE __vec4_i8
__gather_factored_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets,
uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, offsets, scale,
constOffset, mask);
__gather_base_offsets32_i8(unsigned char *b, uint32_t scale, __vec4_i32 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, scale, offsets, mask);
}
static FORCEINLINE __vec4_i8
__gather_factored_base_offsets64_i8(unsigned char *b, __vec4_i64 offsets,
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, offsets, scale,
constOffset, mask);
__gather_base_offsets64_i8(unsigned char *b, uint32_t scale, __vec4_i64 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, scale, offsets, mask);
}
static FORCEINLINE __vec4_i16
__gather_factored_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets,
uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, offsets, scale,
constOffset, mask);
__gather_base_offsets32_i16(unsigned char *b, uint32_t scale, __vec4_i32 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, scale, offsets, mask);
}
static FORCEINLINE __vec4_i16
__gather_factored_base_offsets64_i16(unsigned char *b, __vec4_i64 offsets,
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, offsets, scale,
constOffset, mask);
__gather_base_offsets64_i16(unsigned char *b, uint32_t scale, __vec4_i64 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, scale, offsets, mask);
}
static FORCEINLINE __vec4_i32
__gather_factored_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale,
__vec4_i32 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_i32(), uint32_t(), p, offsets, scale,
constOffset, mask);
__gather_base_offsets32_i32(uint8_t *p, uint32_t scale, __vec4_i32 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_i32(), uint32_t(), p, scale, offsets, mask);
}
static FORCEINLINE __vec4_i32
__gather_factored_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, offsets, scale,
constOffset, mask);
__gather_base_offsets64_i32(unsigned char *p, uint32_t scale, __vec4_i64 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, scale, offsets, mask);
}
static FORCEINLINE __vec4_f
__gather_factored_base_offsets32_float(uint8_t *p, __vec4_i32 offsets, uint32_t scale,
__vec4_i32 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_f(), float(), p, offsets, scale,
constOffset, mask);
__gather_base_offsets32_float(uint8_t *p, uint32_t scale, __vec4_i32 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_f(), float(), p, scale, offsets, mask);
}
static FORCEINLINE __vec4_f
__gather_factored_base_offsets64_float(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_f(), float(), p, offsets, scale,
constOffset, mask);
__gather_base_offsets64_float(unsigned char *p, uint32_t scale, __vec4_i64 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_f(), float(), p, scale, offsets, mask);
}
static FORCEINLINE __vec4_i64
__gather_factored_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, offsets, scale,
constOffset, mask);
__gather_base_offsets32_i64(unsigned char *p, uint32_t scale, __vec4_i32 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, scale, offsets, mask);
}
static FORCEINLINE __vec4_i64
__gather_factored_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, offsets, scale,
constOffset, mask);
__gather_base_offsets64_i64(unsigned char *p, uint32_t scale, __vec4_i64 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, scale, offsets, mask);
}
static FORCEINLINE __vec4_d
__gather_factored_base_offsets32_double(unsigned char *p, __vec4_i32 offsets,
uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_d(), double(), p, offsets, scale,
constOffset, mask);
__gather_base_offsets32_double(unsigned char *p, uint32_t scale, __vec4_i32 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_d(), double(), p, scale, offsets, mask);
}
static FORCEINLINE __vec4_d
__gather_factored_base_offsets64_double(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_d(), double(), p, offsets, scale,
constOffset, mask);
__gather_base_offsets64_double(unsigned char *p, uint32_t scale, __vec4_i64 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_d(), double(), p, scale, offsets, mask);
}
template<typename RetVec, typename RetScalar>
@@ -3252,63 +3238,55 @@ static FORCEINLINE __vec4_d __gather64_double(__vec4_i64 ptrs, __vec4_i1 mask) {
#define SCATTER32_64(SUFFIX, VEC_SUFFIX, TYPE, EXTRACT) \
static FORCEINLINE void \
__scatter_factored_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \
uint32_t scale, __vec4_i32 constOffset, \
__scatter_base_offsets32_##SUFFIX (unsigned char *b, uint32_t scale, \
__vec4_i32 offsets, \
__vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \
uint32_t m = _mm_extract_ps(mask.v, 0); \
if (m != 0) { \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 0) + \
_mm_extract_epi32(constOffset.v, 0)); \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 0)); \
*ptr = EXTRACT(val.v, 0); \
} \
m = _mm_extract_ps(mask.v, 1); \
if (m != 0) { \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 1) + \
_mm_extract_epi32(constOffset.v, 1)); \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 1)); \
*ptr = EXTRACT(val.v, 1); \
} \
m = _mm_extract_ps(mask.v, 2); \
if (m != 0) { \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 2) + \
_mm_extract_epi32(constOffset.v, 2)); \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 2)); \
*ptr = EXTRACT(val.v, 2); \
} \
m = _mm_extract_ps(mask.v, 3); \
if (m != 0) { \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 3) + \
_mm_extract_epi32(constOffset.v, 3)); \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 3)); \
*ptr = EXTRACT(val.v, 3); \
} \
} \
static FORCEINLINE void \
__scatter_factored_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets, \
uint32_t scale, __vec4_i64 constOffset, \
static FORCEINLINE void \
__scatter_base_offsets64_##SUFFIX(unsigned char *p, uint32_t scale, \
__vec4_i64 offsets, \
__vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \
uint32_t m = _mm_extract_ps(mask.v, 0); \
if (m != 0) { \
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + \
_mm_extract_epi64(constOffset.v[0], 0); \
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); \
TYPE *ptr = (TYPE *)(p + offset); \
*ptr = EXTRACT(val.v, 0); \
} \
m = _mm_extract_ps(mask.v, 1); \
if (m != 0) { \
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + \
_mm_extract_epi64(constOffset.v[0], 1); \
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); \
TYPE *ptr = (TYPE *)(p + offset); \
*ptr = EXTRACT(val.v, 1); \
} \
m = _mm_extract_ps(mask.v, 2); \
if (m != 0) { \
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + \
_mm_extract_epi64(constOffset.v[1], 0); \
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); \
TYPE *ptr = (TYPE *)(p + offset); \
*ptr = EXTRACT(val.v, 2); \
} \
m = _mm_extract_ps(mask.v, 3); \
if (m != 0) { \
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + \
_mm_extract_epi64(constOffset.v[1], 1); \
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); \
TYPE *ptr = (TYPE *)(p + offset); \
*ptr = EXTRACT(val.v, 3); \
} \
@@ -3322,91 +3300,79 @@ SCATTER32_64(float, f, float, _mm_extract_ps_as_float)
static FORCEINLINE void
__scatter_factored_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
uint32_t scale, __vec4_i32 constOffset, __vec4_i64 val,
__vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int32_t offset = scale * _mm_extract_epi32(offsets.v, 0) +
_mm_extract_epi32(constOffset.v, 0);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[0], 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int32_t offset = scale * _mm_extract_epi32(offsets.v, 1) +
_mm_extract_epi32(constOffset.v, 1);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[0], 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int32_t offset = scale * _mm_extract_epi32(offsets.v, 2) +
_mm_extract_epi32(constOffset.v, 2);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[1], 0);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int32_t offset = scale * _mm_extract_epi32(offsets.v, 3) +
_mm_extract_epi32(constOffset.v, 3);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[1], 1);
}
}
static FORCEINLINE void
__scatter_factored_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i64 constOffset,
__scatter_base_offsets32_i64(unsigned char *p, uint32_t scale, __vec4_i32 offsets,
__vec4_i64 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) +
_mm_extract_epi64(constOffset.v[0], 0);
int32_t offset = scale * _mm_extract_epi32(offsets.v, 0);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[0], 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) +
_mm_extract_epi64(constOffset.v[0], 1);
int32_t offset = scale * _mm_extract_epi32(offsets.v, 1);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[0], 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) +
_mm_extract_epi64(constOffset.v[1], 0);
int32_t offset = scale * _mm_extract_epi32(offsets.v, 2);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[1], 0);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) +
_mm_extract_epi64(constOffset.v[1], 1);
int32_t offset = scale * _mm_extract_epi32(offsets.v, 3);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[1], 1);
}
}
static FORCEINLINE void
__scatter_factored_base_offsets32_double(unsigned char *p, __vec4_i32 offsets,
uint32_t scale, __vec4_i32 constOffset, __vec4_d val,
__vec4_i1 mask) {
__scatter_factored_base_offsets32_i64(p, offsets, scale, constOffset, val, mask);
__scatter_base_offsets64_i64(unsigned char *p, uint32_t scale, __vec4_i64 offsets,
__vec4_i64 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[0], 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[0], 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[1], 0);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[1], 1);
}
}
static FORCEINLINE void
__scatter_factored_base_offsets64_double(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i64 constOffset, __vec4_d val,
__vec4_i1 mask) {
__scatter_factored_base_offsets64_i64(p, offsets, scale, constOffset, val, mask);
__scatter_base_offsets32_double(unsigned char *p, uint32_t scale, __vec4_i32 offsets,
__vec4_d val, __vec4_i1 mask) {
__scatter_base_offsets32_i64(p, scale, offsets, val, mask);
}
static FORCEINLINE void
__scatter_base_offsets64_double(unsigned char *p, uint32_t scale, __vec4_i64 offsets,
__vec4_d val, __vec4_i1 mask) {
__scatter_base_offsets64_i64(p, scale, offsets, val, mask);
}

View File

@@ -254,6 +254,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->maskBitCount = 1;
t->hasHalf = true;
t->hasTranscendentals = true;
t->hasGather = t->hasScatter = true;
}
else if (!strcasecmp(isa, "generic-8")) {
t->isa = Target::GENERIC;
@@ -263,6 +264,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->maskBitCount = 1;
t->hasHalf = true;
t->hasTranscendentals = true;
t->hasGather = t->hasScatter = true;
}
else if (!strcasecmp(isa, "generic-16")) {
t->isa = Target::GENERIC;
@@ -272,6 +274,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->maskBitCount = 1;
t->hasHalf = true;
t->hasTranscendentals = true;
t->hasGather = t->hasScatter = true;
}
else if (!strcasecmp(isa, "generic-32")) {
t->isa = Target::GENERIC;
@@ -281,6 +284,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->maskBitCount = 1;
t->hasHalf = true;
t->hasTranscendentals = true;
t->hasGather = t->hasScatter = true;
}
else if (!strcasecmp(isa, "generic-64")) {
t->isa = Target::GENERIC;
@@ -290,6 +294,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->maskBitCount = 1;
t->hasHalf = true;
t->hasTranscendentals = true;
t->hasGather = t->hasScatter = true;
}
else if (!strcasecmp(isa, "generic-1")) {
t->isa = Target::GENERIC;
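
The practical effect of setting hasGather/hasScatter on these generic targets
is that they are handed the simpler unfactored builtins declared above. A
minimal sketch of that selection, assuming a hypothetical helper (not the
actual ispc optimization-pass code):

    #include <string>

    // Pick the builtin the compiler would call for a masked float gather with
    // 32-bit offsets, based on whether the target reports a native gather
    // instruction (the t->hasGather flag set in the hunks above).
    static std::string gatherBuiltinName(bool targetHasGather) {
        return targetHasGather ? "__gather_base_offsets32_float"
                               : "__gather_factored_base_offsets32_float";
    }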