diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index c54dd948..77c7aabe 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -32,6 +32,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128-v16:16:16-v32:32:32"; define(`MASK',`i1') +define(`HAVE_GATHER',`1') +define(`HAVE_SCATTER',`1') + include(`util.m4') stdlib_core() @@ -334,19 +337,19 @@ define void @__masked_store_blend_double(* nocapture, ;; gather/scatter define(`gather_scatter', ` -declare @__gather_factored_base_offsets32_$1(i8 * nocapture, , - i32, , ) nounwind readonly -declare @__gather_factored_base_offsets64_$1(i8 * nocapture, , - i32, , ) nounwind readonly +declare @__gather_base_offsets32_$1(i8 * nocapture, i32, , + ) nounwind readonly +declare @__gather_base_offsets64_$1(i8 * nocapture, i32, , + ) nounwind readonly declare @__gather32_$1(, ) nounwind readonly declare @__gather64_$1(, ) nounwind readonly -declare void @__scatter_factored_base_offsets32_$1(i8* nocapture, , - i32, , , ) nounwind -declare void @__scatter_factored_base_offsets64_$1(i8* nocapture, , - i32, , , ) nounwind +declare void @__scatter_base_offsets32_$1(i8* nocapture, i32, , + , ) nounwind +declare void @__scatter_base_offsets64_$1(i8* nocapture, i32, , + , ) nounwind declare void @__scatter32_$1(, , ) nounwind declare void @__scatter64_$1(, , diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index c18e9fbe..42978701 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1306,34 +1306,32 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val, // offsets * offsetScale is in bytes (for all of these) -#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - __vec16_i1 mask) { \ +#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, __vec16_i1 mask) { \ VTYPE ret; \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 16; ++i) \ if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ + STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \ ret.v[i] = *ptr; \ } \ return ret; \ } -GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_factored_base_offsets32_i8) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_factored_base_offsets64_i8) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_factored_base_offsets32_i16) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_factored_base_offsets64_i16) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_factored_base_offsets32_i32) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_factored_base_offsets64_i32) -GATHER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_factored_base_offsets32_float) -GATHER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_factored_base_offsets64_float) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_factored_base_offsets32_i64) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_factored_base_offsets64_i64) -GATHER_FACTORED_BASE_OFFSETS(__vec16_d, 
double, __vec16_i32, __gather_factored_base_offsets32_double) -GATHER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_factored_base_offsets64_double) +GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) +GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) +GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) +GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) +GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32) +GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) +GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float) +GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float) +GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) +GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) +GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_base_offsets32_double) +GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_base_offsets64_double) #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ @@ -1361,32 +1359,31 @@ GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double) // scatter -#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - VTYPE val, __vec16_i1 mask) { \ +#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, VTYPE val, \ + __vec16_i1 mask) { \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 16; ++i) \ if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ + STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \ *ptr = val.v[i]; \ } \ } -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_factored_base_offsets32_i8) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_factored_base_offsets64_i8) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_factored_base_offsets32_i16) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_factored_base_offsets64_i16) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_factored_base_offsets32_i32) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_factored_base_offsets64_i32) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_factored_base_offsets32_float) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_factored_base_offsets64_float) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_factored_base_offsets32_i64) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_factored_base_offsets64_i64) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_factored_base_offsets32_double) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_factored_base_offsets64_double) +SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8) +SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8) +SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16) 
+SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16) +SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32) +SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) +SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float) +SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float) +SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) +SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) +SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_base_offsets32_double) +SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_base_offsets64_double) #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index c1f89cd8..94946f4a 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -1374,34 +1374,32 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec32_d val, // offsets * offsetScale is in bytes (for all of these) -#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - __vec32_i1 mask) { \ +#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, __vec32_i1 mask) { \ VTYPE ret; \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 32; ++i) \ if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ + STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \ ret.v[i] = *ptr; \ } \ return ret; \ } -GATHER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __gather_factored_base_offsets32_i8) -GATHER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __gather_factored_base_offsets64_i8) -GATHER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __gather_factored_base_offsets32_i16) -GATHER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __gather_factored_base_offsets64_i16) -GATHER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __gather_factored_base_offsets32_i32) -GATHER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __gather_factored_base_offsets64_i32) -GATHER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __gather_factored_base_offsets32_float) -GATHER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __gather_factored_base_offsets64_float) -GATHER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __gather_factored_base_offsets32_i64) -GATHER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __gather_factored_base_offsets64_i64) -GATHER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __gather_factored_base_offsets32_double) -GATHER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __gather_factored_base_offsets64_double) +GATHER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __gather_base_offsets32_i8) +GATHER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __gather_base_offsets64_i8) +GATHER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __gather_base_offsets32_i16) +GATHER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __gather_base_offsets64_i16) +GATHER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __gather_base_offsets32_i32) 
+GATHER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __gather_base_offsets64_i32) +GATHER_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __gather_base_offsets32_float) +GATHER_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __gather_base_offsets64_float) +GATHER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __gather_base_offsets32_i64) +GATHER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __gather_base_offsets64_i64) +GATHER_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __gather_base_offsets32_double) +GATHER_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __gather_base_offsets64_double) #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec32_i1 mask) { \ @@ -1429,32 +1427,30 @@ GATHER_GENERAL(__vec32_d, double, __vec32_i64, __gather64_double) // scatter -#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - VTYPE val, __vec32_i1 mask) { \ +#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, VTYPE val, __vec32_i1 mask) { \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 32; ++i) \ if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ + STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \ *ptr = val.v[i]; \ } \ } -SCATTER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __scatter_factored_base_offsets32_i8) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __scatter_factored_base_offsets64_i8) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __scatter_factored_base_offsets32_i16) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __scatter_factored_base_offsets64_i16) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __scatter_factored_base_offsets32_i32) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __scatter_factored_base_offsets64_i32) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __scatter_factored_base_offsets32_float) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __scatter_factored_base_offsets64_float) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __scatter_factored_base_offsets32_i64) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __scatter_factored_base_offsets64_i64) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __scatter_factored_base_offsets32_double) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __scatter_factored_base_offsets64_double) +SCATTER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __scatter_base_offsets32_i8) +SCATTER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __scatter_base_offsets64_i8) +SCATTER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __scatter_base_offsets32_i16) +SCATTER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __scatter_base_offsets64_i16) +SCATTER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __scatter_base_offsets32_i32) +SCATTER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __scatter_base_offsets64_i32) +SCATTER_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __scatter_base_offsets32_float) +SCATTER_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __scatter_base_offsets64_float) +SCATTER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __scatter_base_offsets32_i64) +SCATTER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, 
__scatter_base_offsets64_i64) +SCATTER_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __scatter_base_offsets32_double) +SCATTER_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __scatter_base_offsets64_double) #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec32_i1 mask) { \ diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index 2a54446e..ff84fee3 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -1507,40 +1507,38 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec64_d val, // offsets * offsetScale is in bytes (for all of these) -#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - __vec64_i1 mask) { \ +#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, __vec64_i1 mask) { \ VTYPE ret; \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 64; ++i) \ - if ((mask.v & (1ull << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ + if ((mask.v & (1ull << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \ ret.v[i] = *ptr; \ } \ return ret; \ } -GATHER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __gather_factored_base_offsets32_i8) -GATHER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __gather_factored_base_offsets64_i8) -GATHER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __gather_factored_base_offsets32_i16) -GATHER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __gather_factored_base_offsets64_i16) -GATHER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __gather_factored_base_offsets32_i32) -GATHER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __gather_factored_base_offsets64_i32) -GATHER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __gather_factored_base_offsets32_float) -GATHER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __gather_factored_base_offsets64_float) -GATHER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __gather_factored_base_offsets32_i64) -GATHER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __gather_factored_base_offsets64_i64) -GATHER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __gather_factored_base_offsets32_double) -GATHER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __gather_factored_base_offsets64_double) +GATHER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __gather_base_offsets32_i8) +GATHER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __gather_base_offsets64_i8) +GATHER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __gather_base_offsets32_i16) +GATHER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __gather_base_offsets64_i16) +GATHER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __gather_base_offsets32_i32) +GATHER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __gather_base_offsets64_i32) +GATHER_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __gather_base_offsets32_float) +GATHER_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __gather_base_offsets64_float) +GATHER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __gather_base_offsets32_i64) +GATHER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __gather_base_offsets64_i64) +GATHER_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __gather_base_offsets32_double) 
+GATHER_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __gather_base_offsets64_double) #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec64_i1 mask) { \ VTYPE ret; \ for (int i = 0; i < 64; ++i) \ - if ((mask.v & (1ull << i)) != 0) { \ + if ((mask.v & (1ull << i)) != 0) { \ STYPE *ptr = (STYPE *)ptrs.v[i]; \ ret.v[i] = *ptr; \ } \ @@ -1562,32 +1560,30 @@ GATHER_GENERAL(__vec64_d, double, __vec64_i64, __gather64_double) // scatter -#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - VTYPE val, __vec64_i1 mask) { \ +#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, VTYPE val, __vec64_i1 mask) { \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 64; ++i) \ - if ((mask.v & (1ull << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ + if ((mask.v & (1ull << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \ *ptr = val.v[i]; \ } \ } -SCATTER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __scatter_factored_base_offsets32_i8) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __scatter_factored_base_offsets64_i8) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __scatter_factored_base_offsets32_i16) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __scatter_factored_base_offsets64_i16) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __scatter_factored_base_offsets32_i32) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __scatter_factored_base_offsets64_i32) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __scatter_factored_base_offsets32_float) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __scatter_factored_base_offsets64_float) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __scatter_factored_base_offsets32_i64) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __scatter_factored_base_offsets64_i64) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __scatter_factored_base_offsets32_double) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __scatter_factored_base_offsets64_double) +SCATTER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __scatter_base_offsets32_i8) +SCATTER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __scatter_base_offsets64_i8) +SCATTER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __scatter_base_offsets32_i16) +SCATTER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __scatter_base_offsets64_i16) +SCATTER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __scatter_base_offsets32_i32) +SCATTER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __scatter_base_offsets64_i32) +SCATTER_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __scatter_base_offsets32_float) +SCATTER_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __scatter_base_offsets64_float) +SCATTER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __scatter_base_offsets32_i64) +SCATTER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __scatter_base_offsets64_i64) +SCATTER_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __scatter_base_offsets32_double) +SCATTER_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __scatter_base_offsets64_double) #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE void FUNC(PTRTYPE 
ptrs, VTYPE val, __vec64_i1 mask) { \ diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index fb11db11..a0331afb 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1940,60 +1940,33 @@ static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val, // offsets * offsetScale is in bytes (for all of these) -#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) -/* -static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - __vec16_i1 mask) { \ - VTYPE ret; \ - int8_t *base = (int8_t *)b; \ - for (int i = 0; i < 16; ++i) \ - if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ - ret.v[i] = *ptr; \ - } \ - return ret; \ -} -*/ - static FORCEINLINE __vec16_i32 -__gather_factored_base_offsets32_i32(uint8_t *base, __vec16_i32 varyingOffset, - uint32_t scale, __vec16_i32 constOffset, - __vec16_i1 mask) { - __vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE); - __vec16_i32 offsets = __add(__mul(vscale, varyingOffset), constOffset); - __vec16_i32 tmp; - +__gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec16_i32 offsets, + __vec16_i1 mask) { // Loop is generated by intrinsic __vec16_i32 ret = _mm512_mask_i32extgather_epi32(tmp, mask, offsets, base, - _MM_UPCONV_EPI32_NONE, 1, + _MM_UPCONV_EPI32_NONE, scale, _MM_HINT_NONE); return ret; } static FORCEINLINE __vec16_f -__gather_factored_base_offsets32_float(uint8_t *base, __vec16_i32 varyingOffset, - uint32_t scale, __vec16_i32 constOffset, +__gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) { - __vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE); - __vec16_i32 offsets = __add(__mul(vscale, varyingOffset), constOffset); - __vec16_f tmp; - // Loop is generated by intrinsic - __vec16_f ret = _mm512_mask_i32extgather_ps(tmp, mask, offsets, base, - _MM_UPCONV_PS_NONE, 1, + __vec16_f ret = _mm512_mask_i32extgather_ps(tmp, mask, offsets, base, + _MM_UPCONV_PS_NONE, scale, _MM_HINT_NONE); return ret; } -GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_factored_base_offsets32_i8) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_factored_base_offsets64_i8) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_factored_base_offsets32_i16) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_factored_base_offsets64_i16) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_factored_base_offsets64_i32) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_factored_base_offsets32_i64) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_factored_base_offsets64_i64) +//GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) +//GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) +//GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) +//GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) +//GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) +//GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) +//GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) #define 
GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) /* @@ -2039,45 +2012,30 @@ static FORCEINLINE __vec16_i32 __gather64_i32(__vec16_i64 ptrs, __vec16_i1 mask) */ // scatter -#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) -/* -static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - VTYPE val, __vec16_i1 mask) { \ - int8_t *base = (int8_t *)b; \ - for (int i = 0; i < 16; ++i) \ - if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ - *ptr = val.v[i]; \ - } \ -} -*/ - -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_factored_base_offsets32_i8) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_factored_base_offsets64_i8) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_factored_base_offsets32_i16) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_factored_base_offsets64_i16) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_factored_base_offsets64_i32) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_factored_base_offsets32_i64) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_factored_base_offsets64_i64) +//SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8) +//SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8) +//SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16) +//SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16) +//SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) +//SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) +//SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) static FORCEINLINE void -__scatter_factored_base_offsets32_i32(uint8_t *b, __vec16_i32 varyingOffset, - uint32_t scale, __vec16_i32 constOffset, +__scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec16_i32 offsets, __vec16_i32 val, __vec16_i1 mask) { - __vec16_i32 offsets = __add(__mul(__vec16_i32(scale), varyingOffset), constOffset); - _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, _MM_DOWNCONV_EPI32_NONE, 1, _MM_HINT_NONE); + _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); } static FORCEINLINE void -__scatter_factored_base_offsets32_float(void *base, const __vec16_i32 &varyingOffset, - uint32_t scale, const __vec16_i32 &constOffset, - const __vec16_f &val, const __vec16_i1 mask) +__scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets, + __vec16_f val, __vec16_i1 mask) { - __vec16_i32 offsets = __add(__mul(varyingOffset,__vec16_i32(scale)), constOffset); - _mm512_mask_i32extscatter_ps(base, mask, offsets, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_1, _MM_HINT_NONE); + _mm512_mask_i32extscatter_ps(base, mask, offsets, val, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); } #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index 088b694d..17ab8f18 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -2892,54 +2892,53 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec4_d val, template static FORCEINLINE RetVec -lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, 
__vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { +lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, uint32_t scale, + __vec4_i32 offsets, __vec4_i1 mask) { RetScalar r[4]; #if 1 // "Fast gather" trick... offsets = __select(mask, offsets, __setzero_i32()); - constOffset = __select(mask, constOffset, __setzero_i32()); - int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0); + int offset = scale * _mm_extract_epi32(offsets.v, 0); RetScalar *ptr = (RetScalar *)(p + offset); r[0] = *ptr; - offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1); + offset = scale * _mm_extract_epi32(offsets.v, 1); ptr = (RetScalar *)(p + offset); r[1] = *ptr; - offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2); + offset = scale * _mm_extract_epi32(offsets.v, 2); ptr = (RetScalar *)(p + offset); r[2] = *ptr; - offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3); + offset = scale * _mm_extract_epi32(offsets.v, 3); ptr = (RetScalar *)(p + offset); r[3] = *ptr; #else uint32_t m = _mm_extract_ps(mask.v, 0); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0); + int offset = scale * _mm_extract_epi32(offsets.v, 0); RetScalar *ptr = (RetScalar *)(p + offset); r[0] = *ptr; } m = _mm_extract_ps(mask.v, 1); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1); + int offset = scale * _mm_extract_epi32(offsets.v, 1); RetScalar *ptr = (RetScalar *)(p + offset); r[1] = *ptr; } m = _mm_extract_ps(mask.v, 2); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2); + int offset = scale * _mm_extract_epi32(offsets.v, 2); RetScalar *ptr = (RetScalar *)(p + offset); r[2] = *ptr; } m = _mm_extract_ps(mask.v, 3); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3); + int offset = scale * _mm_extract_epi32(offsets.v, 3); RetScalar *ptr = (RetScalar *)(p + offset); r[3] = *ptr; } @@ -2950,54 +2949,53 @@ lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets, template static FORCEINLINE RetVec -lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { +lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, uint32_t scale, + __vec4_i64 offsets, __vec4_i1 mask) { RetScalar r[4]; #if 1 // "Fast gather" trick... 
offsets = __select(mask, offsets, __setzero_i64()); - constOffset = __select(mask, constOffset, __setzero_i64()); - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0); + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); RetScalar *ptr = (RetScalar *)(p + offset); r[0] = *ptr; - offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1); + offset = scale * _mm_extract_epi64(offsets.v[0], 1); ptr = (RetScalar *)(p + offset); r[1] = *ptr; - offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0); + offset = scale * _mm_extract_epi64(offsets.v[1], 0); ptr = (RetScalar *)(p + offset); r[2] = *ptr; - offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1); + offset = scale * _mm_extract_epi64(offsets.v[1], 1); ptr = (RetScalar *)(p + offset); r[3] = *ptr; #else uint32_t m = _mm_extract_ps(mask.v, 0); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0); + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); RetScalar *ptr = (RetScalar *)(p + offset); r[0] = *ptr; } m = _mm_extract_ps(mask.v, 1); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1); + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); RetScalar *ptr = (RetScalar *)(p + offset); r[1] = *ptr; } m = _mm_extract_ps(mask.v, 2); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0); + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); RetScalar *ptr = (RetScalar *)(p + offset); r[2] = *ptr; } m = _mm_extract_ps(mask.v, 3); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1); + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); RetScalar *ptr = (RetScalar *)(p + offset); r[3] = *ptr; } @@ -3007,87 +3005,75 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets, } static FORCEINLINE __vec4_i8 -__gather_factored_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, offsets, scale, - constOffset, mask); +__gather_base_offsets32_i8(unsigned char *b, uint32_t scale, __vec4_i32 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, scale, offsets, mask); } static FORCEINLINE __vec4_i8 -__gather_factored_base_offsets64_i8(unsigned char *b, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, offsets, scale, - constOffset, mask); +__gather_base_offsets64_i8(unsigned char *b, uint32_t scale, __vec4_i64 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, scale, offsets, mask); } static FORCEINLINE __vec4_i16 -__gather_factored_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, offsets, scale, - constOffset, mask); +__gather_base_offsets32_i16(unsigned char *b, uint32_t scale, __vec4_i32 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, scale, offsets, mask); } static FORCEINLINE __vec4_i16 - __gather_factored_base_offsets64_i16(unsigned char *b, 
__vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, offsets, scale, - constOffset, mask); +__gather_base_offsets64_i16(unsigned char *b, uint32_t scale, __vec4_i64 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, scale, offsets, mask); } static FORCEINLINE __vec4_i32 -__gather_factored_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale, - __vec4_i32 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets32(__vec4_i32(), uint32_t(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets32_i32(uint8_t *p, uint32_t scale, __vec4_i32 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_i32(), uint32_t(), p, scale, offsets, mask); } static FORCEINLINE __vec4_i32 -__gather_factored_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets64_i32(unsigned char *p, uint32_t scale, __vec4_i64 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, scale, offsets, mask); } static FORCEINLINE __vec4_f -__gather_factored_base_offsets32_float(uint8_t *p, __vec4_i32 offsets, uint32_t scale, - __vec4_i32 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets32(__vec4_f(), float(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets32_float(uint8_t *p, uint32_t scale, __vec4_i32 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_f(), float(), p, scale, offsets, mask); } static FORCEINLINE __vec4_f -__gather_factored_base_offsets64_float(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets64(__vec4_f(), float(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets64_float(unsigned char *p, uint32_t scale, __vec4_i64 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets64(__vec4_f(), float(), p, scale, offsets, mask); } static FORCEINLINE __vec4_i64 -__gather_factored_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets32_i64(unsigned char *p, uint32_t scale, __vec4_i32 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, scale, offsets, mask); } static FORCEINLINE __vec4_i64 -__gather_factored_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets64_i64(unsigned char *p, uint32_t scale, __vec4_i64 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, scale, offsets, mask); } static FORCEINLINE __vec4_d -__gather_factored_base_offsets32_double(unsigned char *p, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets32(__vec4_d(), double(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets32_double(unsigned char *p, uint32_t scale, __vec4_i32 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_d(), double(), p, scale, offsets, mask); } static FORCEINLINE __vec4_d -__gather_factored_base_offsets64_double(unsigned char 
*p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets64(__vec4_d(), double(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets64_double(unsigned char *p, uint32_t scale, __vec4_i64 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets64(__vec4_d(), double(), p, scale, offsets, mask); } template @@ -3252,63 +3238,55 @@ static FORCEINLINE __vec4_d __gather64_double(__vec4_i64 ptrs, __vec4_i1 mask) { #define SCATTER32_64(SUFFIX, VEC_SUFFIX, TYPE, EXTRACT) \ static FORCEINLINE void \ -__scatter_factored_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \ - uint32_t scale, __vec4_i32 constOffset, \ +__scatter_base_offsets32_##SUFFIX (unsigned char *b, uint32_t scale, \ + __vec4_i32 offsets, \ __vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \ uint32_t m = _mm_extract_ps(mask.v, 0); \ if (m != 0) { \ - TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 0) + \ - _mm_extract_epi32(constOffset.v, 0)); \ + TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 0)); \ *ptr = EXTRACT(val.v, 0); \ } \ m = _mm_extract_ps(mask.v, 1); \ if (m != 0) { \ - TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 1) + \ - _mm_extract_epi32(constOffset.v, 1)); \ + TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 1)); \ *ptr = EXTRACT(val.v, 1); \ } \ m = _mm_extract_ps(mask.v, 2); \ if (m != 0) { \ - TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 2) + \ - _mm_extract_epi32(constOffset.v, 2)); \ + TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 2)); \ *ptr = EXTRACT(val.v, 2); \ } \ m = _mm_extract_ps(mask.v, 3); \ if (m != 0) { \ - TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 3) + \ - _mm_extract_epi32(constOffset.v, 3)); \ + TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 3)); \ *ptr = EXTRACT(val.v, 3); \ } \ } \ -static FORCEINLINE void \ -__scatter_factored_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets, \ - uint32_t scale, __vec4_i64 constOffset, \ +static FORCEINLINE void \ +__scatter_base_offsets64_##SUFFIX(unsigned char *p, uint32_t scale, \ + __vec4_i64 offsets, \ __vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \ uint32_t m = _mm_extract_ps(mask.v, 0); \ if (m != 0) { \ - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + \ - _mm_extract_epi64(constOffset.v[0], 0); \ + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); \ TYPE *ptr = (TYPE *)(p + offset); \ *ptr = EXTRACT(val.v, 0); \ } \ m = _mm_extract_ps(mask.v, 1); \ if (m != 0) { \ - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + \ - _mm_extract_epi64(constOffset.v[0], 1); \ + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); \ TYPE *ptr = (TYPE *)(p + offset); \ *ptr = EXTRACT(val.v, 1); \ } \ m = _mm_extract_ps(mask.v, 2); \ if (m != 0) { \ - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + \ - _mm_extract_epi64(constOffset.v[1], 0); \ + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); \ TYPE *ptr = (TYPE *)(p + offset); \ *ptr = EXTRACT(val.v, 2); \ } \ m = _mm_extract_ps(mask.v, 3); \ if (m != 0) { \ - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + \ - _mm_extract_epi64(constOffset.v[1], 1); \ + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); \ TYPE *ptr = (TYPE *)(p + offset); \ *ptr = EXTRACT(val.v, 3); \ } \ @@ -3322,91 +3300,79 @@ SCATTER32_64(float, f, float, _mm_extract_ps_as_float) static FORCEINLINE void -__scatter_factored_base_offsets32_i64(unsigned 
char *p, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_i64 val, - __vec4_i1 mask) { - uint32_t m = _mm_extract_ps(mask.v, 0); - if (m != 0) { - int32_t offset = scale * _mm_extract_epi32(offsets.v, 0) + - _mm_extract_epi32(constOffset.v, 0); - uint64_t *ptr = (uint64_t *)(p + offset); - *ptr = _mm_extract_epi64(val.v[0], 0); - } - - m = _mm_extract_ps(mask.v, 1); - if (m != 0) { - int32_t offset = scale * _mm_extract_epi32(offsets.v, 1) + - _mm_extract_epi32(constOffset.v, 1); - uint64_t *ptr = (uint64_t *)(p + offset); - *ptr = _mm_extract_epi64(val.v[0], 1); - } - - m = _mm_extract_ps(mask.v, 2); - if (m != 0) { - int32_t offset = scale * _mm_extract_epi32(offsets.v, 2) + - _mm_extract_epi32(constOffset.v, 2); - uint64_t *ptr = (uint64_t *)(p + offset); - *ptr = _mm_extract_epi64(val.v[1], 0); - } - - m = _mm_extract_ps(mask.v, 3); - if (m != 0) { - int32_t offset = scale * _mm_extract_epi32(offsets.v, 3) + - _mm_extract_epi32(constOffset.v, 3); - uint64_t *ptr = (uint64_t *)(p + offset); - *ptr = _mm_extract_epi64(val.v[1], 1); - } -} - -static FORCEINLINE void -__scatter_factored_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, +__scatter_base_offsets32_i64(unsigned char *p, uint32_t scale, __vec4_i32 offsets, __vec4_i64 val, __vec4_i1 mask) { uint32_t m = _mm_extract_ps(mask.v, 0); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + - _mm_extract_epi64(constOffset.v[0], 0); + int32_t offset = scale * _mm_extract_epi32(offsets.v, 0); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[0], 0); } m = _mm_extract_ps(mask.v, 1); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + - _mm_extract_epi64(constOffset.v[0], 1); + int32_t offset = scale * _mm_extract_epi32(offsets.v, 1); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[0], 1); } m = _mm_extract_ps(mask.v, 2); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + - _mm_extract_epi64(constOffset.v[1], 0); + int32_t offset = scale * _mm_extract_epi32(offsets.v, 2); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[1], 0); } m = _mm_extract_ps(mask.v, 3); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + - _mm_extract_epi64(constOffset.v[1], 1); + int32_t offset = scale * _mm_extract_epi32(offsets.v, 3); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[1], 1); } } static FORCEINLINE void -__scatter_factored_base_offsets32_double(unsigned char *p, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_d val, - __vec4_i1 mask) { - __scatter_factored_base_offsets32_i64(p, offsets, scale, constOffset, val, mask); +__scatter_base_offsets64_i64(unsigned char *p, uint32_t scale, __vec4_i64 offsets, + __vec4_i64 val, __vec4_i1 mask) { + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); + uint64_t *ptr = (uint64_t *)(p + offset); + *ptr = _mm_extract_epi64(val.v[0], 0); + } + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); + uint64_t *ptr = (uint64_t *)(p + offset); + *ptr = _mm_extract_epi64(val.v[0], 1); + } + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); + uint64_t *ptr = (uint64_t *)(p + offset); + *ptr = _mm_extract_epi64(val.v[1], 0); + } + + m = 
_mm_extract_ps(mask.v, 3); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); + uint64_t *ptr = (uint64_t *)(p + offset); + *ptr = _mm_extract_epi64(val.v[1], 1); + } } static FORCEINLINE void -__scatter_factored_base_offsets64_double(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_d val, - __vec4_i1 mask) { - __scatter_factored_base_offsets64_i64(p, offsets, scale, constOffset, val, mask); +__scatter_base_offsets32_double(unsigned char *p, uint32_t scale, __vec4_i32 offsets, + __vec4_d val, __vec4_i1 mask) { + __scatter_base_offsets32_i64(p, scale, offsets, val, mask); +} + +static FORCEINLINE void +__scatter_base_offsets64_double(unsigned char *p, uint32_t scale, __vec4_i64 offsets, + __vec4_d val, __vec4_i1 mask) { + __scatter_base_offsets64_i64(p, scale, offsets, val, mask); } diff --git a/ispc.cpp b/ispc.cpp index 0980c3d2..8fb8f0f5 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -254,6 +254,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskBitCount = 1; t->hasHalf = true; t->hasTranscendentals = true; + t->hasGather = t->hasScatter = true; } else if (!strcasecmp(isa, "generic-8")) { t->isa = Target::GENERIC; @@ -263,6 +264,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskBitCount = 1; t->hasHalf = true; t->hasTranscendentals = true; + t->hasGather = t->hasScatter = true; } else if (!strcasecmp(isa, "generic-16")) { t->isa = Target::GENERIC; @@ -272,6 +274,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskBitCount = 1; t->hasHalf = true; t->hasTranscendentals = true; + t->hasGather = t->hasScatter = true; } else if (!strcasecmp(isa, "generic-32")) { t->isa = Target::GENERIC; @@ -281,6 +284,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskBitCount = 1; t->hasHalf = true; t->hasTranscendentals = true; + t->hasGather = t->hasScatter = true; } else if (!strcasecmp(isa, "generic-64")) { t->isa = Target::GENERIC; @@ -290,6 +294,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskBitCount = 1; t->hasHalf = true; t->hasTranscendentals = true; + t->hasGather = t->hasScatter = true; } else if (!strcasecmp(isa, "generic-1")) { t->isa = Target::GENERIC;
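
For reference, the interface change above is purely an addressing simplification: the old factored builtins computed each lane's address as base + scale * varyingOffset[i] + constOffset[i], while the renamed __gather_base_offsets* / __scatter_base_offsets* builtins take a single per-lane offset and compute base + scale * offset[i] (in bytes), with the generic targets now advertising native support via HAVE_GATHER / HAVE_SCATTER and t->hasGather = t->hasScatter = true. The standalone C++ sketch below models only that new convention; it is an illustration under stated assumptions, not ispc source. lane_count, Vec, Mask, gather_base_offsets, and scatter_base_offsets are placeholder names invented here — the real targets use the __vec16_* / __vec32_* / __vec64_* types and FORCEINLINE definitions shown in the hunks above.

// Minimal reference model of the new (non-factored) gather/scatter addressing.
// Assumption: lane_count/Vec/Mask are illustrative stand-ins for the target's
// vector and mask types; addresses are base + scale * offset[i], in bytes.
#include <array>
#include <cstdint>
#include <cstdio>
#include <cstring>

constexpr int lane_count = 16;                      // stand-in for the target width

template <typename T> using Vec = std::array<T, lane_count>;
using Mask = uint64_t;                              // bit i set == lane i active

// Gather: for each active lane, read a T from base + scale * offset[i].
template <typename T, typename Off>
Vec<T> gather_base_offsets(const unsigned char *base, uint32_t scale,
                           const Vec<Off> &offset, Mask mask) {
    Vec<T> ret{};                                   // inactive lanes stay zeroed here
    for (int i = 0; i < lane_count; ++i)
        if (mask & (Mask(1) << i)) {
            T v;
            std::memcpy(&v, base + scale * offset[i], sizeof(T));
            ret[i] = v;
        }
    return ret;
}

// Scatter: for each active lane, write val[i] to base + scale * offset[i].
template <typename T, typename Off>
void scatter_base_offsets(unsigned char *base, uint32_t scale,
                          const Vec<Off> &offset, const Vec<T> &val, Mask mask) {
    for (int i = 0; i < lane_count; ++i)
        if (mask & (Mask(1) << i))
            std::memcpy(base + scale * offset[i], &val[i], sizeof(T));
}

int main() {
    float data[64];
    for (int i = 0; i < 64; ++i) data[i] = float(i);

    Vec<int32_t> idx{};
    for (int i = 0; i < lane_count; ++i) idx[i] = 2 * i;   // every other element

    Mask mask = 0xFFFF;                                    // all 16 lanes active
    auto g = gather_base_offsets<float>(reinterpret_cast<unsigned char *>(data),
                                        sizeof(float), idx, mask);
    std::printf("lane 3 -> %g\n", g[3]);                   // reads data[6] == 6
    return 0;
}

Because the per-lane constOffset term is gone from these entry points, any constant byte offset presumably has to be folded into the single offset vector before the builtin is called; the scalar loops in the generic-*.h macros above implement exactly the addressing shown in this sketch.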