diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll
index 48e7b836..c54dd948 100644
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -334,18 +334,18 @@ define void @__masked_store_blend_double(* nocapture,
 ;; gather/scatter
 define(`gather_scatter', `
-declare @__gather_base_offsets32_$1(i8 * nocapture, ,
+declare @__gather_factored_base_offsets32_$1(i8 * nocapture, ,
                                     i32, , ) nounwind readonly
-declare @__gather_base_offsets64_$1(i8 * nocapture, ,
+declare @__gather_factored_base_offsets64_$1(i8 * nocapture, ,
                                     i32, , ) nounwind readonly
 declare @__gather32_$1(, ) nounwind readonly
 declare @__gather64_$1(, ) nounwind readonly
-declare void @__scatter_base_offsets32_$1(i8* nocapture, ,
+declare void @__scatter_factored_base_offsets32_$1(i8* nocapture, ,
                                           i32, , , ) nounwind
-declare void @__scatter_base_offsets64_$1(i8* nocapture, ,
+declare void @__scatter_factored_base_offsets64_$1(i8* nocapture, ,
                                           i32, , , ) nounwind
 declare void @__scatter32_$1(, , ) nounwind
diff --git a/builtins/util.m4 b/builtins/util.m4
index ce25a761..4a8822bb 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -1599,7 +1599,7 @@ declare void @__pseudo_masked_store_double( * nocapture,
 @__pseudo_gather64_float(, )
 declare @__pseudo_gather64_i64(, ) nounwind readonly
 declare @__pseudo_gather64_double(, ) nounwind readonly
-declare @__pseudo_gather_base_offsets32_i8(i8 *, , i32, ,
+declare @__pseudo_gather_factored_base_offsets32_i8(i8 *, , i32, ,
                                            ) nounwind readonly
-declare @__pseudo_gather_base_offsets32_i16(i8 *, , i32, ,
+declare @__pseudo_gather_factored_base_offsets32_i16(i8 *, , i32, ,
                                             ) nounwind readonly
-declare @__pseudo_gather_base_offsets32_i32(i8 *, , i32, ,
+declare @__pseudo_gather_factored_base_offsets32_i32(i8 *, , i32, ,
                                             ) nounwind readonly
-declare @__pseudo_gather_base_offsets32_float(i8 *, , i32, ,
+declare @__pseudo_gather_factored_base_offsets32_float(i8 *, , i32, ,
                                               ) nounwind readonly
-declare @__pseudo_gather_base_offsets32_i64(i8 *, , i32, ,
+declare @__pseudo_gather_factored_base_offsets32_i64(i8 *, , i32, ,
                                             ) nounwind readonly
-declare @__pseudo_gather_base_offsets32_double(i8 *, , i32, ,
+declare @__pseudo_gather_factored_base_offsets32_double(i8 *, , i32, ,
                                                ) nounwind readonly
-declare @__pseudo_gather_base_offsets64_i8(i8 *, , i32, ,
+declare @__pseudo_gather_factored_base_offsets64_i8(i8 *, , i32, ,
                                            ) nounwind readonly
-declare @__pseudo_gather_base_offsets64_i16(i8 *, , i32, ,
+declare @__pseudo_gather_factored_base_offsets64_i16(i8 *, , i32, ,
                                             ) nounwind readonly
-declare @__pseudo_gather_base_offsets64_i32(i8 *, , i32, ,
+declare @__pseudo_gather_factored_base_offsets64_i32(i8 *, , i32, ,
                                             ) nounwind readonly
-declare @__pseudo_gather_base_offsets64_float(i8 *, , i32, ,
+declare @__pseudo_gather_factored_base_offsets64_float(i8 *, , i32, ,
                                               ) nounwind readonly
-declare @__pseudo_gather_base_offsets64_i64(i8 *, , i32, ,
+declare @__pseudo_gather_factored_base_offsets64_i64(i8 *, , i32, ,
                                             ) nounwind readonly
-declare @__pseudo_gather_base_offsets64_double(i8 *, , i32, ,
+declare @__pseudo_gather_factored_base_offsets64_double(i8 *, , i32, ,
                                                ) nounwind readonly
 ; Similarly to the pseudo-gathers defined above, we also declare undefined
@@ -1660,7 +1660,7 @@ declare @__pseudo_gather_base_offsets64_double(i8 *, , , , , ) nounwind
 declare void @__pseudo_scatter64_double(, , ) nounwind
-declare void @__pseudo_scatter_base_offsets32_i8(i8 * nocapture, , i32, ,
+declare void @__pseudo_scatter_factored_base_offsets32_i8(i8 * nocapture, , i32, ,
                                                  , ) nounwind
-declare void @__pseudo_scatter_base_offsets32_i16(i8 * nocapture, , i32, ,
+declare void @__pseudo_scatter_factored_base_offsets32_i16(i8 * nocapture, , i32, ,
                                                   , ) nounwind
-declare void @__pseudo_scatter_base_offsets32_i32(i8 * nocapture, , i32, ,
+declare void @__pseudo_scatter_factored_base_offsets32_i32(i8 * nocapture, , i32, ,
                                                   , ) nounwind
-declare void @__pseudo_scatter_base_offsets32_float(i8 * nocapture, , i32, ,
+declare void @__pseudo_scatter_factored_base_offsets32_float(i8 * nocapture, , i32, ,
                                                     , ) nounwind
-declare void @__pseudo_scatter_base_offsets32_i64(i8 * nocapture, , i32, ,
+declare void @__pseudo_scatter_factored_base_offsets32_i64(i8 * nocapture, , i32, ,
                                                   , ) nounwind
-declare void @__pseudo_scatter_base_offsets32_double(i8 * nocapture, , i32, ,
+declare void @__pseudo_scatter_factored_base_offsets32_double(i8 * nocapture, , i32, ,
                                                      , ) nounwind
-declare void @__pseudo_scatter_base_offsets64_i8(i8 * nocapture, , i32, ,
+declare void @__pseudo_scatter_factored_base_offsets64_i8(i8 * nocapture, , i32, ,
                                                  , ) nounwind
-declare void @__pseudo_scatter_base_offsets64_i16(i8 * nocapture, , i32, ,
+declare void @__pseudo_scatter_factored_base_offsets64_i16(i8 * nocapture, , i32, ,
                                                   , ) nounwind
-declare void @__pseudo_scatter_base_offsets64_i32(i8 * nocapture, , i32, ,
+declare void @__pseudo_scatter_factored_base_offsets64_i32(i8 * nocapture, , i32, ,
                                                   , ) nounwind
-declare void @__pseudo_scatter_base_offsets64_float(i8 * nocapture, , i32, ,
+declare void @__pseudo_scatter_factored_base_offsets64_float(i8 * nocapture, , i32, ,
                                                     , ) nounwind
-declare void @__pseudo_scatter_base_offsets64_i64(i8 * nocapture, , i32, ,
+declare void @__pseudo_scatter_factored_base_offsets64_i64(i8 * nocapture, , i32, ,
                                                   , ) nounwind
-declare void @__pseudo_scatter_base_offsets64_double(i8 * nocapture, , i32, ,
+declare void @__pseudo_scatter_factored_base_offsets64_double(i8 * nocapture, , i32, ,
                                                      , ) nounwind
 declare float @__log_uniform_float(float) nounwind readnone
@@ -1872,103 +1872,103 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16,
   call void @__usedouble( %g64_d)
   %pgbo32_8 = call
-      @__pseudo_gather_base_offsets32_i8(i8 * %ptr, %v32, i32 0,
+      @__pseudo_gather_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0,
                                          %v32, %mask)
   call void @__use8( %pgbo32_8)
   %pgbo32_16 = call
-      @__pseudo_gather_base_offsets32_i16(i8 * %ptr, %v32, i32 0,
+      @__pseudo_gather_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0,
                                           %v32, %mask)
   call void @__use16( %pgbo32_16)
   %pgbo32_32 = call
-      @__pseudo_gather_base_offsets32_i32(i8 * %ptr, %v32, i32 0,
+      @__pseudo_gather_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0,
                                           %v32, %mask)
   call void @__use32( %pgbo32_32)
   %pgbo32_f = call
-      @__pseudo_gather_base_offsets32_float(i8 * %ptr, %v32, i32 0,
+      @__pseudo_gather_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0,
                                             %v32, %mask)
   call void @__usefloat( %pgbo32_f)
   %pgbo32_64 = call
-      @__pseudo_gather_base_offsets32_i64(i8 * %ptr, %v32, i32 0,
+      @__pseudo_gather_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0,
                                           %v32, %mask)
   call void @__use64( %pgbo32_64)
   %pgbo32_d = call
-      @__pseudo_gather_base_offsets32_double(i8 * %ptr, %v32, i32 0,
+      @__pseudo_gather_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0,
                                              %v32, %mask)
   call void @__usedouble( %pgbo32_d)
   %gbo32_8 = call
-      @__gather_base_offsets32_i8(i8 * %ptr, %v32, i32 0,
+      @__gather_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0,
                                   %v32, %mask)
   call void @__use8( %gbo32_8)
   %gbo32_16 = call
-      @__gather_base_offsets32_i16(i8 * %ptr, %v32, i32 0,
+      @__gather_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0,
                                    %v32, %mask)
   call void @__use16( %gbo32_16)
   %gbo32_32 = call
-      @__gather_base_offsets32_i32(i8 * %ptr, %v32, i32 0,
+      @__gather_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0,
                                    %v32, %mask)
   call void @__use32( %gbo32_32)
   %gbo32_f = call
-      @__gather_base_offsets32_float(i8 * %ptr, %v32, i32 0,
+      @__gather_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0,
                                      %v32, %mask)
   call void @__usefloat( %gbo32_f)
   %gbo32_64 = call
-      @__gather_base_offsets32_i64(i8 * %ptr, %v32, i32 0,
+      @__gather_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0,
                                    %v32, %mask)
   call void @__use64( %gbo32_64)
   %gbo32_d = call
-      @__gather_base_offsets32_double(i8 * %ptr, %v32, i32 0,
+      @__gather_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0,
                                       %v32, %mask)
   call void @__usedouble( %gbo32_d)
   %pgbo64_8 = call
-      @__pseudo_gather_base_offsets64_i8(i8 * %ptr, %v64, i32 0,
+      @__pseudo_gather_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0,
                                          %v64, %mask)
   call void @__use8( %pgbo64_8)
   %pgbo64_16 = call
-      @__pseudo_gather_base_offsets64_i16(i8 * %ptr, %v64, i32 0,
+      @__pseudo_gather_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0,
                                           %v64, %mask)
   call void @__use16( %pgbo64_16)
   %pgbo64_32 = call
-      @__pseudo_gather_base_offsets64_i32(i8 * %ptr, %v64, i32 0,
+      @__pseudo_gather_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0,
                                           %v64, %mask)
   call void @__use32( %pgbo64_32)
   %pgbo64_f = call
-      @__pseudo_gather_base_offsets64_float(i8 * %ptr, %v64, i32 0,
+      @__pseudo_gather_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0,
                                             %v64, %mask)
   call void @__usefloat( %pgbo64_f)
   %pgbo64_64 = call
-      @__pseudo_gather_base_offsets64_i64(i8 * %ptr, %v64, i32 0,
+      @__pseudo_gather_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0,
                                           %v64, %mask)
   call void @__use64( %pgbo64_64)
   %pgbo64_d = call
-      @__pseudo_gather_base_offsets64_double(i8 * %ptr, %v64, i32 0,
+      @__pseudo_gather_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0,
                                              %v64, %mask)
   call void @__usedouble( %pgbo64_d)
   %gbo64_8 = call
-      @__gather_base_offsets64_i8(i8 * %ptr, %v64, i32 0,
+      @__gather_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0,
                                   %v64, %mask)
   call void @__use8( %gbo64_8)
   %gbo64_16 = call
-      @__gather_base_offsets64_i16(i8 * %ptr, %v64, i32 0,
+      @__gather_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0,
                                    %v64, %mask)
   call void @__use16( %gbo64_16)
   %gbo64_32 = call
-      @__gather_base_offsets64_i32(i8 * %ptr, %v64, i32 0,
+      @__gather_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0,
                                    %v64, %mask)
   call void @__use32( %gbo64_32)
   %gbo64_f = call
-      @__gather_base_offsets64_float(i8 * %ptr, %v64, i32 0,
+      @__gather_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0,
                                      %v64, %mask)
   call void @__usefloat( %gbo64_f)
   %gbo64_64 = call
-      @__gather_base_offsets64_i64(i8 * %ptr, %v64, i32 0,
+      @__gather_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0,
                                    %v64, %mask)
   call void @__use64( %gbo64_64)
   %gbo64_d = call
-      @__gather_base_offsets64_double(i8 * %ptr, %v64, i32 0,
+      @__gather_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0,
                                       %v64, %mask)
   call void @__usedouble( %gbo64_d)
@@ -2003,56 +2003,56 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16,
   call void @__scatter64_i64( %v64, %v64, %mask)
   call void @__scatter64_double( %v64, %vd, %mask)
-  call void @__pseudo_scatter_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32,
+  call void @__pseudo_scatter_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32,
                                                 %v8, %mask)
-  call void @__pseudo_scatter_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32,
+  call void @__pseudo_scatter_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32,
                                                  %v16, %mask)
-  call void @__pseudo_scatter_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32,
+  call void @__pseudo_scatter_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32,
                                                  %v32, %mask)
-  call void @__pseudo_scatter_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32,
+  call void @__pseudo_scatter_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32,
                                                    %vf, %mask)
-  call void @__pseudo_scatter_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32,
+  call void @__pseudo_scatter_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32,
                                                  %v64, %mask)
-  call void @__pseudo_scatter_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32,
+  call void @__pseudo_scatter_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32,
                                                     %vd, %mask)
-  call void @__pseudo_scatter_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64,
+  call void @__pseudo_scatter_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64,
                                                 %v8, %mask)
-  call void @__pseudo_scatter_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64,
+  call void @__pseudo_scatter_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64,
                                                  %v16, %mask)
-  call void @__pseudo_scatter_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64,
+  call void @__pseudo_scatter_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64,
                                                  %v32, %mask)
-  call void @__pseudo_scatter_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64,
+  call void @__pseudo_scatter_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64,
                                                    %vf, %mask)
-  call void @__pseudo_scatter_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64,
+  call void @__pseudo_scatter_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64,
                                                  %v64, %mask)
-  call void @__pseudo_scatter_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64,
+  call void @__pseudo_scatter_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64,
                                                     %vd, %mask)
-  call void @__scatter_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32,
+  call void @__scatter_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32,
                                          %v8, %mask)
-  call void @__scatter_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32,
+  call void @__scatter_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32,
                                           %v16, %mask)
-  call void @__scatter_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32,
+  call void @__scatter_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32,
                                           %v32, %mask)
-  call void @__scatter_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32,
+  call void @__scatter_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32,
                                             %vf, %mask)
-  call void @__scatter_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32,
+  call void @__scatter_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32,
                                           %v64, %mask)
-  call void @__scatter_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32,
+  call void @__scatter_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32,
                                              %vd, %mask)
-  call void @__scatter_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64,
+  call void @__scatter_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64,
                                          %v8, %mask)
-  call void @__scatter_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64,
+  call void @__scatter_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64,
                                           %v16, %mask)
-  call void @__scatter_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64,
+  call void @__scatter_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64,
                                           %v32, %mask)
-  call void @__scatter_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64,
+  call void @__scatter_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64,
                                             %vf, %mask)
-  call void @__scatter_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64,
+  call void @__scatter_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64,
                                           %v64, %mask)
-  call void @__scatter_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64,
+  call void @__scatter_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64,
                                              %vd, %mask)
   ret void
@@ -3245,7 +3245,7 @@ define @__gather_elt64_$1(i8 * %ptr, %offsets, i32 %o
 }
-define @__gather_base_offsets32_$1(i8 * %ptr, %offsets, i32 %offset_scale,
+define @__gather_factored_base_offsets32_$1(i8 * %ptr, %offsets, i32 %offset_scale,
                                    %offset_delta, %vecmask) nounwind readonly alwaysinline {
 ; We can be clever and avoid the per-lane stuff for gathers if we are willing
@@ -3276,7 +3276,7 @@ define @__gather_base_offsets32_$1(i8 * %ptr, %offset
 ret %ret`'eval(WIDTH-1)
 }
-define @__gather_base_offsets64_$1(i8 * %ptr, %offsets, i32 %offset_scale,
+define @__gather_factored_base_offsets64_$1(i8 * %ptr, %offsets, i32 %offset_scale,
                                    %offset_delta, %vecmask) nounwind readonly alwaysinline {
 ; We can be clever and avoid the per-lane stuff for gathers if we are willing
@@ -3391,7 +3391,7 @@ define void @__scatter_elt64_$1(i8 * %ptr, %offsets, i32 %offset_s
 ret void
 }
-define void @__scatter_base_offsets32_$1(i8* %base, %offsets, i32 %offset_scale,
+define void @__scatter_factored_base_offsets32_$1(i8* %base, %offsets, i32 %offset_scale,
                                          %offset_delta, %values, %mask) nounwind alwaysinline {
 ;; And use the `per_lane' macro to do all of the per-lane work for scatter...
@@ -3401,7 +3401,7 @@ define void @__scatter_base_offsets32_$1(i8* %base, %offsets, i32
 ret void
 }
-define void @__scatter_base_offsets64_$1(i8* %base, %offsets, i32 %offset_scale,
+define void @__scatter_factored_base_offsets64_$1(i8* %base, %offsets, i32 %offset_scale,
                                          %offset_delta, %values, %mask) nounwind alwaysinline {
 ;; And use the `per_lane' macro to do all of the per-lane work for scatter...
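For reference while reading the renamed declarations: in the "factored" form, each lane's effective address is ptr + offsets[i] * offset_scale + offset_delta[i], with everything in bytes (the example headers below state this as "offsets * offsetScale is in bytes"). The following is a minimal scalar sketch of the masked gather these builtins describe, not code from the patch; Vec16, Mask16, and gather_factored are hypothetical names invented for illustration, and the 16-lane width is just one of the widths the patch touches.

    #include <cstddef>
    #include <cstdint>

    template <typename T> struct Vec16 { T v[16]; }; // hypothetical 16-wide vector
    struct Mask16 { uint16_t bits; };                // hypothetical mask, one bit per lane

    // Per-lane semantics of a factored-base-offsets gather: each active lane
    // loads from base + scale * varyingOffset[i] + constOffset[i] (in bytes).
    template <typename T>
    Vec16<T> gather_factored(const unsigned char *base, Vec16<int32_t> varyingOffset,
                             uint32_t scale, Vec16<int32_t> constOffset, Mask16 mask) {
        Vec16<T> ret = {}; // lanes that are off in the mask are unspecified; zeroed here
        for (int i = 0; i < 16; ++i)
            if (mask.bits & (1u << i))
                ret.v[i] = *(const T *)(base + (ptrdiff_t)scale * varyingOffset.v[i]
                                             + constOffset.v[i]);
        return ret;
    }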
diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h
index 1851ff7e..c18e9fbe 100644
--- a/examples/intrinsics/generic-16.h
+++ b/examples/intrinsics/generic-16.h
@@ -1306,7 +1306,7 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val,
 // offsets * offsetScale is in bytes (for all of these)
-#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
+#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
 static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
                               uint32_t scale, OTYPE constOffset, \
                               __vec16_i1 mask) { \
@@ -1322,18 +1322,18 @@ static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
 }
-GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
-GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8)
-GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16)
-GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16)
-GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32)
-GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32)
-GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float)
-GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float)
-GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64)
-GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64)
-GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_base_offsets32_double)
-GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_base_offsets64_double)
+GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_factored_base_offsets32_i8)
+GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_factored_base_offsets64_i8)
+GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_factored_base_offsets32_i16)
+GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_factored_base_offsets64_i16)
+GATHER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_factored_base_offsets32_i32)
+GATHER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_factored_base_offsets64_i32)
+GATHER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_factored_base_offsets32_float)
+GATHER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_factored_base_offsets64_float)
+GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_factored_base_offsets32_i64)
+GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_factored_base_offsets64_i64)
+GATHER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_factored_base_offsets32_double)
+GATHER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_factored_base_offsets64_double)
 #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
 static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \
@@ -1361,7 +1361,7 @@ GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double)
 // scatter
-#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
+#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
 static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
                              uint32_t scale, OTYPE constOffset, \
                              VTYPE val, __vec16_i1 mask) { \
@@ -1375,18 +1375,18 @@ static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
 }
-SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8)
-SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8)
-SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16)
-SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16)
-SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32)
-SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32)
-SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float)
-SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float)
-SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64)
-SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64)
-SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_base_offsets32_double)
-SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_base_offsets64_double)
+SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_factored_base_offsets32_i8)
+SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_factored_base_offsets64_i8)
+SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_factored_base_offsets32_i16)
+SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_factored_base_offsets64_i16)
+SCATTER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_factored_base_offsets32_i32)
+SCATTER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_factored_base_offsets64_i32)
+SCATTER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_factored_base_offsets32_float)
+SCATTER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_factored_base_offsets64_float)
+SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_factored_base_offsets32_i64)
+SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_factored_base_offsets64_i64)
+SCATTER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_factored_base_offsets32_double)
+SCATTER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_factored_base_offsets64_double)
 #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
 static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \
diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h
index 628aab84..c1f89cd8 100644
--- a/examples/intrinsics/generic-32.h
+++ b/examples/intrinsics/generic-32.h
@@ -1374,7 +1374,7 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec32_d val,
 // offsets * offsetScale is in bytes (for all of these)
-#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
+#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
 static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
                               uint32_t scale, OTYPE constOffset, \
                               __vec32_i1 mask) { \
@@ -1390,18 +1390,18 @@ static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
 }
-GATHER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __gather_base_offsets32_i8)
-GATHER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __gather_base_offsets64_i8)
-GATHER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __gather_base_offsets32_i16)
-GATHER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __gather_base_offsets64_i16)
-GATHER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __gather_base_offsets32_i32)
-GATHER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __gather_base_offsets64_i32)
-GATHER_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __gather_base_offsets32_float)
-GATHER_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __gather_base_offsets64_float)
-GATHER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __gather_base_offsets32_i64)
-GATHER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __gather_base_offsets64_i64)
-GATHER_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __gather_base_offsets32_double)
-GATHER_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __gather_base_offsets64_double)
+GATHER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __gather_factored_base_offsets32_i8)
+GATHER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __gather_factored_base_offsets64_i8)
+GATHER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __gather_factored_base_offsets32_i16)
+GATHER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __gather_factored_base_offsets64_i16)
+GATHER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __gather_factored_base_offsets32_i32)
+GATHER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __gather_factored_base_offsets64_i32)
+GATHER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __gather_factored_base_offsets32_float)
+GATHER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __gather_factored_base_offsets64_float)
+GATHER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __gather_factored_base_offsets32_i64)
+GATHER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __gather_factored_base_offsets64_i64)
+GATHER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __gather_factored_base_offsets32_double)
+GATHER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __gather_factored_base_offsets64_double)
 #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
 static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec32_i1 mask) { \
@@ -1429,7 +1429,7 @@ GATHER_GENERAL(__vec32_d, double, __vec32_i64, __gather64_double)
 // scatter
-#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
+#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
 static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
                              uint32_t scale, OTYPE constOffset, \
                              VTYPE val, __vec32_i1 mask) { \
@@ -1443,18 +1443,18 @@ static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
 }
-SCATTER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __scatter_base_offsets32_i8)
-SCATTER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __scatter_base_offsets64_i8)
-SCATTER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __scatter_base_offsets32_i16)
-SCATTER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __scatter_base_offsets64_i16)
-SCATTER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __scatter_base_offsets32_i32)
-SCATTER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __scatter_base_offsets64_i32)
-SCATTER_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __scatter_base_offsets32_float)
-SCATTER_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __scatter_base_offsets64_float)
-SCATTER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __scatter_base_offsets32_i64)
-SCATTER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __scatter_base_offsets64_i64)
-SCATTER_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __scatter_base_offsets32_double)
-SCATTER_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __scatter_base_offsets64_double)
+SCATTER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __scatter_factored_base_offsets32_i8)
+SCATTER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __scatter_factored_base_offsets64_i8)
+SCATTER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __scatter_factored_base_offsets32_i16)
+SCATTER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __scatter_factored_base_offsets64_i16)
+SCATTER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __scatter_factored_base_offsets32_i32)
+SCATTER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __scatter_factored_base_offsets64_i32)
+SCATTER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __scatter_factored_base_offsets32_float)
+SCATTER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __scatter_factored_base_offsets64_float)
+SCATTER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __scatter_factored_base_offsets32_i64)
+SCATTER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __scatter_factored_base_offsets64_i64)
+SCATTER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __scatter_factored_base_offsets32_double)
+SCATTER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __scatter_factored_base_offsets64_double)
 #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
 static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec32_i1 mask) { \
diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h
index 2630e306..2a54446e 100644
--- a/examples/intrinsics/generic-64.h
+++ b/examples/intrinsics/generic-64.h
@@ -1507,7 +1507,7 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec64_d val,
 // offsets * offsetScale is in bytes (for all of these)
-#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
+#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
 static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
                               uint32_t scale, OTYPE constOffset, \
                               __vec64_i1 mask) { \
@@ -1523,18 +1523,18 @@ static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
 }
-GATHER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __gather_base_offsets32_i8)
-GATHER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __gather_base_offsets64_i8)
-GATHER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __gather_base_offsets32_i16)
-GATHER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __gather_base_offsets64_i16)
-GATHER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __gather_base_offsets32_i32)
-GATHER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __gather_base_offsets64_i32)
-GATHER_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __gather_base_offsets32_float)
-GATHER_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __gather_base_offsets64_float)
-GATHER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __gather_base_offsets32_i64)
-GATHER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __gather_base_offsets64_i64)
-GATHER_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __gather_base_offsets32_double)
-GATHER_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __gather_base_offsets64_double)
+GATHER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __gather_factored_base_offsets32_i8)
+GATHER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __gather_factored_base_offsets64_i8)
+GATHER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __gather_factored_base_offsets32_i16)
+GATHER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __gather_factored_base_offsets64_i16)
+GATHER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __gather_factored_base_offsets32_i32)
+GATHER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __gather_factored_base_offsets64_i32)
+GATHER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __gather_factored_base_offsets32_float)
+GATHER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __gather_factored_base_offsets64_float)
+GATHER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __gather_factored_base_offsets32_i64)
+GATHER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __gather_factored_base_offsets64_i64)
+GATHER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __gather_factored_base_offsets32_double)
+GATHER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __gather_factored_base_offsets64_double)
 #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
 static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec64_i1 mask) { \
@@ -1562,7 +1562,7 @@ GATHER_GENERAL(__vec64_d, double, __vec64_i64, __gather64_double)
 // scatter
-#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
+#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
 static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
                              uint32_t scale, OTYPE constOffset, \
                              VTYPE val, __vec64_i1 mask) { \
@@ -1576,18 +1576,18 @@ static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
 }
-SCATTER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __scatter_base_offsets32_i8)
-SCATTER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __scatter_base_offsets64_i8)
-SCATTER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __scatter_base_offsets32_i16)
-SCATTER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __scatter_base_offsets64_i16)
-SCATTER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __scatter_base_offsets32_i32)
-SCATTER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __scatter_base_offsets64_i32)
-SCATTER_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __scatter_base_offsets32_float)
-SCATTER_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __scatter_base_offsets64_float)
-SCATTER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __scatter_base_offsets32_i64)
-SCATTER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __scatter_base_offsets64_i64)
-SCATTER_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __scatter_base_offsets32_double)
-SCATTER_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __scatter_base_offsets64_double)
+SCATTER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __scatter_factored_base_offsets32_i8)
+SCATTER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __scatter_factored_base_offsets64_i8)
+SCATTER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __scatter_factored_base_offsets32_i16)
+SCATTER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __scatter_factored_base_offsets64_i16)
+SCATTER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __scatter_factored_base_offsets32_i32)
+SCATTER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __scatter_factored_base_offsets64_i32)
+SCATTER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __scatter_factored_base_offsets32_float)
+SCATTER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __scatter_factored_base_offsets64_float)
+SCATTER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __scatter_factored_base_offsets32_i64)
+SCATTER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __scatter_factored_base_offsets64_i64)
+SCATTER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __scatter_factored_base_offsets32_double)
+SCATTER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __scatter_factored_base_offsets64_double)
 #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
 static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec64_i1 mask) { \
diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h
index eceeb885..fb11db11 100644
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -1940,7 +1940,7 @@ static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val,
 // offsets * offsetScale is in bytes (for all of these)
-#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)
+#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)
 /*
 static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
                               uint32_t scale, OTYPE constOffset, \
@@ -1958,7 +1958,7 @@ static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
 */
 static FORCEINLINE __vec16_i32
-__gather_base_offsets32_i32(uint8_t *base, __vec16_i32 varyingOffset,
+__gather_factored_base_offsets32_i32(uint8_t *base, __vec16_i32 varyingOffset,
                             uint32_t scale, __vec16_i32 constOffset, __vec16_i1 mask) {
     __vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
@@ -1973,7 +1973,7 @@ __gather_base_offsets32_i32(uint8_t *base, __vec16_i32 varyingOffset,
 }
 static FORCEINLINE __vec16_f
-__gather_base_offsets32_float(uint8_t *base, __vec16_i32 varyingOffset,
+__gather_factored_base_offsets32_float(uint8_t *base, __vec16_i32 varyingOffset,
                               uint32_t scale, __vec16_i32 constOffset, __vec16_i1 mask) {
     __vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
@@ -1987,13 +1987,13 @@ __gather_base_offsets32_float(uint8_t *base, __vec16_i32 varyingOffset,
     return ret;
 }
-GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
-GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8)
-GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16)
-GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16)
-GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32)
-GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64)
-GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64)
+GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_factored_base_offsets32_i8)
+GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_factored_base_offsets64_i8)
+GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_factored_base_offsets32_i16)
+GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_factored_base_offsets64_i16)
+GATHER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_factored_base_offsets64_i32)
+GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_factored_base_offsets32_i64)
+GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_factored_base_offsets64_i64)
 #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC)
 /*
 static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \
@@ -2039,7 +2039,7 @@ static FORCEINLINE __vec16_i32 __gather64_i32(__vec16_i64 ptrs, __vec16_i1 mask)
 */
 // scatter
-#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)
+#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)
 /*
 static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
                              uint32_t scale, OTYPE constOffset, \
@@ -2054,16 +2054,16 @@ static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
 }
 */
-SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8)
-SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8)
-SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16)
-SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16)
-SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32)
-SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64)
-SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64)
+SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_factored_base_offsets32_i8)
+SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_factored_base_offsets64_i8)
+SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_factored_base_offsets32_i16)
+SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_factored_base_offsets64_i16)
+SCATTER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_factored_base_offsets64_i32)
+SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_factored_base_offsets32_i64)
+SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_factored_base_offsets64_i64)
 static FORCEINLINE void
-__scatter_base_offsets32_i32(uint8_t *b, __vec16_i32 varyingOffset,
+__scatter_factored_base_offsets32_i32(uint8_t *b, __vec16_i32 varyingOffset,
                              uint32_t scale, __vec16_i32 constOffset,
                              __vec16_i32 val, __vec16_i1 mask)
 {
@@ -2072,7 +2072,7 @@ __scatter_base_offsets32_i32(uint8_t *b, __vec16_i32 varyingOffset,
 }
 static FORCEINLINE void
-__scatter_base_offsets32_float(void *base, const __vec16_i32 &varyingOffset,
+__scatter_factored_base_offsets32_float(void *base, const __vec16_i32 &varyingOffset,
                                uint32_t scale, const __vec16_i32 &constOffset,
                                const __vec16_f &val, const __vec16_i1 mask) {
diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h
index fcc14618..088b694d 100644
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -3007,84 +3007,84 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,
 }
 static FORCEINLINE __vec4_i8
-__gather_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets,
+__gather_factored_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets,
                            uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, offsets, scale,
                                 constOffset, mask);
 }
 static FORCEINLINE __vec4_i8
-__gather_base_offsets64_i8(unsigned char *b, __vec4_i64 offsets,
+__gather_factored_base_offsets64_i8(unsigned char *b, __vec4_i64 offsets,
                            uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, offsets, scale,
                                 constOffset, mask);
 }
 static FORCEINLINE __vec4_i16
-__gather_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets,
+__gather_factored_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets,
                             uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, offsets, scale,
                                 constOffset, mask);
 }
 static FORCEINLINE __vec4_i16
- __gather_base_offsets64_i16(unsigned char *b, __vec4_i64 offsets,
+ __gather_factored_base_offsets64_i16(unsigned char *b, __vec4_i64 offsets,
                              uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, offsets, scale,
                                 constOffset, mask);
 }
 static FORCEINLINE __vec4_i32
-__gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale,
+__gather_factored_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale,
                             __vec4_i32 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets32(__vec4_i32(), uint32_t(), p, offsets, scale,
                                 constOffset, mask);
 }
 static FORCEINLINE __vec4_i32
-__gather_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets,
+__gather_factored_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets,
                             uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, offsets, scale,
                                 constOffset, mask);
 }
 static FORCEINLINE __vec4_f
-__gather_base_offsets32_float(uint8_t *p, __vec4_i32 offsets, uint32_t scale,
+__gather_factored_base_offsets32_float(uint8_t *p, __vec4_i32 offsets, uint32_t scale,
                               __vec4_i32 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets32(__vec4_f(), float(), p, offsets, scale,
                                 constOffset, mask);
 }
 static FORCEINLINE __vec4_f
-__gather_base_offsets64_float(unsigned char *p, __vec4_i64 offsets,
+__gather_factored_base_offsets64_float(unsigned char *p, __vec4_i64 offsets,
                               uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets64(__vec4_f(), float(), p, offsets, scale,
                                 constOffset, mask);
 }
 static FORCEINLINE __vec4_i64
-__gather_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
+__gather_factored_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
                             uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, offsets, scale,
                                 constOffset, mask);
 }
 static FORCEINLINE __vec4_i64
-__gather_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
+__gather_factored_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
                             uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, offsets, scale,
                                 constOffset, mask);
 }
 static FORCEINLINE __vec4_d
-__gather_base_offsets32_double(unsigned char *p, __vec4_i32 offsets,
+__gather_factored_base_offsets32_double(unsigned char *p, __vec4_i32 offsets,
                                uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets32(__vec4_d(), double(), p, offsets, scale,
                                 constOffset, mask);
 }
 static FORCEINLINE __vec4_d
-__gather_base_offsets64_double(unsigned char *p, __vec4_i64 offsets,
+__gather_factored_base_offsets64_double(unsigned char *p, __vec4_i64 offsets,
                                uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets64(__vec4_d(), double(), p, offsets, scale,
                                 constOffset, mask);
@@ -3252,7 +3252,7 @@ static FORCEINLINE __vec4_d __gather64_double(__vec4_i64 ptrs, __vec4_i1 mask) {
 #define SCATTER32_64(SUFFIX, VEC_SUFFIX, TYPE, EXTRACT) \
 static FORCEINLINE void \
-__scatter_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \
+__scatter_factored_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \
                                    uint32_t scale, __vec4_i32 constOffset, \
                                    __vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \
     uint32_t m = _mm_extract_ps(mask.v, 0); \
@@ -3281,7 +3281,7 @@ __scatter_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \
 } \
 static FORCEINLINE void \
-__scatter_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets, \
+__scatter_factored_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets, \
                                   uint32_t scale, __vec4_i64 constOffset, \
                                   __vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \
     uint32_t m = _mm_extract_ps(mask.v, 0); \
@@ -3322,7 +3322,7 @@ SCATTER32_64(float, f, float, _mm_extract_ps_as_float)
 static FORCEINLINE void
-__scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
+__scatter_factored_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
                              uint32_t scale, __vec4_i32 constOffset,
                              __vec4_i64 val, __vec4_i1 mask) {
     uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -3359,7 +3359,7 @@ __scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
 }
 static FORCEINLINE void
-__scatter_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
+__scatter_factored_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
                              uint32_t scale, __vec4_i64 constOffset,
                              __vec4_i64 val, __vec4_i1 mask) {
     uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -3396,17 +3396,17 @@ __scatter_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
 }
 static FORCEINLINE void
-__scatter_base_offsets32_double(unsigned char *p, __vec4_i32 offsets,
+__scatter_factored_base_offsets32_double(unsigned char *p, __vec4_i32 offsets,
                                 uint32_t scale, __vec4_i32 constOffset,
                                 __vec4_d val, __vec4_i1 mask) {
-    __scatter_base_offsets32_i64(p, offsets, scale, constOffset, val, mask);
+    __scatter_factored_base_offsets32_i64(p, offsets, scale, constOffset, val, mask);
 }
 static FORCEINLINE void
-__scatter_base_offsets64_double(unsigned char *p, __vec4_i64 offsets,
+__scatter_factored_base_offsets64_double(unsigned char *p, __vec4_i64 offsets,
                                 uint32_t scale, __vec4_i64 constOffset,
                                 __vec4_d val, __vec4_i1 mask) {
-    __scatter_base_offsets64_i64(p, offsets, scale, constOffset, val, mask);
+    __scatter_factored_base_offsets64_i64(p, offsets, scale, constOffset, val, mask);
 }
diff --git a/opt.cpp b/opt.cpp
index 1456dfd7..1140c9ce 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -1689,57 +1689,57 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) {
     };
     GSInfo gsFuncs[] = {
-        GSInfo("__pseudo_gather32_i8", "__pseudo_gather_base_offsets32_i8",
-               "__pseudo_gather_base_offsets32_i8", true),
-        GSInfo("__pseudo_gather32_i16", "__pseudo_gather_base_offsets32_i16",
-               "__pseudo_gather_base_offsets32_i16", true),
-        GSInfo("__pseudo_gather32_i32", "__pseudo_gather_base_offsets32_i32",
-               "__pseudo_gather_base_offsets32_i32", true),
-        GSInfo("__pseudo_gather32_float", "__pseudo_gather_base_offsets32_float",
-               "__pseudo_gather_base_offsets32_float", true),
-        GSInfo("__pseudo_gather32_i64", "__pseudo_gather_base_offsets32_i64",
-               "__pseudo_gather_base_offsets32_i64", true),
-        GSInfo("__pseudo_gather32_double", "__pseudo_gather_base_offsets32_double",
-               "__pseudo_gather_base_offsets32_double", true),
+        GSInfo("__pseudo_gather32_i8", "__pseudo_gather_factored_base_offsets32_i8",
+               "__pseudo_gather_factored_base_offsets32_i8", true),
+        GSInfo("__pseudo_gather32_i16", "__pseudo_gather_factored_base_offsets32_i16",
+               "__pseudo_gather_factored_base_offsets32_i16", true),
+        GSInfo("__pseudo_gather32_i32", "__pseudo_gather_factored_base_offsets32_i32",
+               "__pseudo_gather_factored_base_offsets32_i32", true),
+        GSInfo("__pseudo_gather32_float", "__pseudo_gather_factored_base_offsets32_float",
+               "__pseudo_gather_factored_base_offsets32_float", true),
+        GSInfo("__pseudo_gather32_i64", "__pseudo_gather_factored_base_offsets32_i64",
+               "__pseudo_gather_factored_base_offsets32_i64", true),
+        GSInfo("__pseudo_gather32_double", "__pseudo_gather_factored_base_offsets32_double",
+               "__pseudo_gather_factored_base_offsets32_double", true),
-        GSInfo("__pseudo_scatter32_i8", "__pseudo_scatter_base_offsets32_i8",
-               "__pseudo_scatter_base_offsets32_i8", false),
-        GSInfo("__pseudo_scatter32_i16", "__pseudo_scatter_base_offsets32_i16",
-               "__pseudo_scatter_base_offsets32_i16", false),
-        GSInfo("__pseudo_scatter32_i32", "__pseudo_scatter_base_offsets32_i32",
-               "__pseudo_scatter_base_offsets32_i32", false),
-        GSInfo("__pseudo_scatter32_float", "__pseudo_scatter_base_offsets32_float",
-               "__pseudo_scatter_base_offsets32_float", false),
-        GSInfo("__pseudo_scatter32_i64", "__pseudo_scatter_base_offsets32_i64",
-               "__pseudo_scatter_base_offsets32_i64", false),
-        GSInfo("__pseudo_scatter32_double", "__pseudo_scatter_base_offsets32_double",
-               "__pseudo_scatter_base_offsets32_double", false),
+        GSInfo("__pseudo_scatter32_i8", "__pseudo_scatter_factored_base_offsets32_i8",
+               "__pseudo_scatter_factored_base_offsets32_i8", false),
+        GSInfo("__pseudo_scatter32_i16", "__pseudo_scatter_factored_base_offsets32_i16",
+               "__pseudo_scatter_factored_base_offsets32_i16", false),
+        GSInfo("__pseudo_scatter32_i32", "__pseudo_scatter_factored_base_offsets32_i32",
+               "__pseudo_scatter_factored_base_offsets32_i32", false),
+        GSInfo("__pseudo_scatter32_float", "__pseudo_scatter_factored_base_offsets32_float",
+               "__pseudo_scatter_factored_base_offsets32_float", false),
+        GSInfo("__pseudo_scatter32_i64", "__pseudo_scatter_factored_base_offsets32_i64",
+               "__pseudo_scatter_factored_base_offsets32_i64", false),
+        GSInfo("__pseudo_scatter32_double", "__pseudo_scatter_factored_base_offsets32_double",
+               "__pseudo_scatter_factored_base_offsets32_double", false),
-        GSInfo("__pseudo_gather64_i8", "__pseudo_gather_base_offsets64_i8",
-               "__pseudo_gather_base_offsets32_i8", true),
-        GSInfo("__pseudo_gather64_i16", "__pseudo_gather_base_offsets64_i16",
-               "__pseudo_gather_base_offsets32_i16", true),
-        GSInfo("__pseudo_gather64_i32", "__pseudo_gather_base_offsets64_i32",
-               "__pseudo_gather_base_offsets32_i32", true),
-        GSInfo("__pseudo_gather64_float", "__pseudo_gather_base_offsets64_float",
-               "__pseudo_gather_base_offsets32_float", true),
-        GSInfo("__pseudo_gather64_i64", "__pseudo_gather_base_offsets64_i64",
-               "__pseudo_gather_base_offsets32_i64", true),
-        GSInfo("__pseudo_gather64_double", "__pseudo_gather_base_offsets64_double",
-               "__pseudo_gather_base_offsets32_double", true),
+        GSInfo("__pseudo_gather64_i8", "__pseudo_gather_factored_base_offsets64_i8",
+               "__pseudo_gather_factored_base_offsets32_i8", true),
+        GSInfo("__pseudo_gather64_i16", "__pseudo_gather_factored_base_offsets64_i16",
+               "__pseudo_gather_factored_base_offsets32_i16", true),
+        GSInfo("__pseudo_gather64_i32", "__pseudo_gather_factored_base_offsets64_i32",
+               "__pseudo_gather_factored_base_offsets32_i32", true),
+        GSInfo("__pseudo_gather64_float", "__pseudo_gather_factored_base_offsets64_float",
+               "__pseudo_gather_factored_base_offsets32_float", true),
+        GSInfo("__pseudo_gather64_i64", "__pseudo_gather_factored_base_offsets64_i64",
+               "__pseudo_gather_factored_base_offsets32_i64", true),
+        GSInfo("__pseudo_gather64_double", "__pseudo_gather_factored_base_offsets64_double",
+               "__pseudo_gather_factored_base_offsets32_double", true),
-        GSInfo("__pseudo_scatter64_i8", "__pseudo_scatter_base_offsets64_i8",
-               "__pseudo_scatter_base_offsets32_i8", false),
-        GSInfo("__pseudo_scatter64_i16", "__pseudo_scatter_base_offsets64_i16",
-               "__pseudo_scatter_base_offsets32_i16", false),
-        GSInfo("__pseudo_scatter64_i32", "__pseudo_scatter_base_offsets64_i32",
-               "__pseudo_scatter_base_offsets32_i32", false),
-        GSInfo("__pseudo_scatter64_float", "__pseudo_scatter_base_offsets64_float",
-               "__pseudo_scatter_base_offsets32_float", false),
-        GSInfo("__pseudo_scatter64_i64", "__pseudo_scatter_base_offsets64_i64",
-               "__pseudo_scatter_base_offsets32_i64", false),
-        GSInfo("__pseudo_scatter64_double", "__pseudo_scatter_base_offsets64_double",
-               "__pseudo_scatter_base_offsets32_double", false),
+        GSInfo("__pseudo_scatter64_i8", "__pseudo_scatter_factored_base_offsets64_i8",
+               "__pseudo_scatter_factored_base_offsets32_i8", false),
+        GSInfo("__pseudo_scatter64_i16", "__pseudo_scatter_factored_base_offsets64_i16",
+               "__pseudo_scatter_factored_base_offsets32_i16", false),
+        GSInfo("__pseudo_scatter64_i32", "__pseudo_scatter_factored_base_offsets64_i32",
+               "__pseudo_scatter_factored_base_offsets32_i32", false),
+        GSInfo("__pseudo_scatter64_float", "__pseudo_scatter_factored_base_offsets64_float",
+               "__pseudo_scatter_factored_base_offsets32_float", false),
+        GSInfo("__pseudo_scatter64_i64", "__pseudo_scatter_factored_base_offsets64_i64",
+               "__pseudo_scatter_factored_base_offsets32_i64", false),
+        GSInfo("__pseudo_scatter64_double", "__pseudo_scatter_factored_base_offsets64_double",
+               "__pseudo_scatter_factored_base_offsets32_double", false),
     };
     int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
@@ -1858,57 +1858,57 @@ lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) {
     };
     GSBOInfo gsFuncs[] = {
-        GSBOInfo("__pseudo_gather_base_offsets32_i8",
-                 "__pseudo_gather_base_offsets32_i8", true),
-        GSBOInfo("__pseudo_gather_base_offsets32_i16",
-                 "__pseudo_gather_base_offsets32_i16", true),
-        GSBOInfo("__pseudo_gather_base_offsets32_i32",
-                 "__pseudo_gather_base_offsets32_i32", true),
-        GSBOInfo("__pseudo_gather_base_offsets32_float",
-                 "__pseudo_gather_base_offsets32_float", true),
-        GSBOInfo("__pseudo_gather_base_offsets32_i64",
-                 "__pseudo_gather_base_offsets32_i64", true),
-        GSBOInfo("__pseudo_gather_base_offsets32_double",
-                 "__pseudo_gather_base_offsets32_double", true),
+        GSBOInfo("__pseudo_gather_factored_base_offsets32_i8",
+                 "__pseudo_gather_factored_base_offsets32_i8", true),
+        GSBOInfo("__pseudo_gather_factored_base_offsets32_i16",
+                 "__pseudo_gather_factored_base_offsets32_i16", true),
+        GSBOInfo("__pseudo_gather_factored_base_offsets32_i32",
+                 "__pseudo_gather_factored_base_offsets32_i32", true),
+        GSBOInfo("__pseudo_gather_factored_base_offsets32_float",
+                 "__pseudo_gather_factored_base_offsets32_float", true),
+        GSBOInfo("__pseudo_gather_factored_base_offsets32_i64",
+                 "__pseudo_gather_factored_base_offsets32_i64", true),
+        GSBOInfo("__pseudo_gather_factored_base_offsets32_double",
+                 "__pseudo_gather_factored_base_offsets32_double", true),
-        GSBOInfo( "__pseudo_scatter_base_offsets32_i8",
-                 "__pseudo_scatter_base_offsets32_i8", false),
-        GSBOInfo("__pseudo_scatter_base_offsets32_i16",
-                 "__pseudo_scatter_base_offsets32_i16", false),
-        GSBOInfo("__pseudo_scatter_base_offsets32_i32",
-                 "__pseudo_scatter_base_offsets32_i32", false),
-        GSBOInfo("__pseudo_scatter_base_offsets32_float",
-                 "__pseudo_scatter_base_offsets32_float", false),
-        GSBOInfo("__pseudo_scatter_base_offsets32_i64",
-                 "__pseudo_scatter_base_offsets32_i64", false),
-        GSBOInfo("__pseudo_scatter_base_offsets32_double",
-                 "__pseudo_scatter_base_offsets32_double", false),
+        GSBOInfo( "__pseudo_scatter_factored_base_offsets32_i8",
+                 "__pseudo_scatter_factored_base_offsets32_i8", false),
+        GSBOInfo("__pseudo_scatter_factored_base_offsets32_i16",
+                 "__pseudo_scatter_factored_base_offsets32_i16", false),
+        GSBOInfo("__pseudo_scatter_factored_base_offsets32_i32",
+                 "__pseudo_scatter_factored_base_offsets32_i32", false),
+        GSBOInfo("__pseudo_scatter_factored_base_offsets32_float",
+                 "__pseudo_scatter_factored_base_offsets32_float", false),
+        GSBOInfo("__pseudo_scatter_factored_base_offsets32_i64",
+                 "__pseudo_scatter_factored_base_offsets32_i64", false),
+        GSBOInfo("__pseudo_scatter_factored_base_offsets32_double",
"__pseudo_scatter_factored_base_offsets32_double", false), - GSBOInfo( "__pseudo_gather_base_offsets64_i8", - "__pseudo_gather_base_offsets32_i8", true), - GSBOInfo("__pseudo_gather_base_offsets64_i16", - "__pseudo_gather_base_offsets32_i16", true), - GSBOInfo("__pseudo_gather_base_offsets64_i32", - "__pseudo_gather_base_offsets32_i32", true), - GSBOInfo("__pseudo_gather_base_offsets64_float", - "__pseudo_gather_base_offsets32_float", true), - GSBOInfo("__pseudo_gather_base_offsets64_i64", - "__pseudo_gather_base_offsets32_i64", true), - GSBOInfo("__pseudo_gather_base_offsets64_double", - "__pseudo_gather_base_offsets32_double", true), + GSBOInfo( "__pseudo_gather_factored_base_offsets64_i8", + "__pseudo_gather_factored_base_offsets32_i8", true), + GSBOInfo("__pseudo_gather_factored_base_offsets64_i16", + "__pseudo_gather_factored_base_offsets32_i16", true), + GSBOInfo("__pseudo_gather_factored_base_offsets64_i32", + "__pseudo_gather_factored_base_offsets32_i32", true), + GSBOInfo("__pseudo_gather_factored_base_offsets64_float", + "__pseudo_gather_factored_base_offsets32_float", true), + GSBOInfo("__pseudo_gather_factored_base_offsets64_i64", + "__pseudo_gather_factored_base_offsets32_i64", true), + GSBOInfo("__pseudo_gather_factored_base_offsets64_double", + "__pseudo_gather_factored_base_offsets32_double", true), - GSBOInfo( "__pseudo_scatter_base_offsets64_i8", - "__pseudo_scatter_base_offsets32_i8", false), - GSBOInfo("__pseudo_scatter_base_offsets64_i16", - "__pseudo_scatter_base_offsets32_i16", false), - GSBOInfo("__pseudo_scatter_base_offsets64_i32", - "__pseudo_scatter_base_offsets32_i32", false), - GSBOInfo("__pseudo_scatter_base_offsets64_float", - "__pseudo_scatter_base_offsets32_float", false), - GSBOInfo("__pseudo_scatter_base_offsets64_i64", - "__pseudo_scatter_base_offsets32_i64", false), - GSBOInfo("__pseudo_scatter_base_offsets64_double", - "__pseudo_scatter_base_offsets32_double", false), + GSBOInfo( "__pseudo_scatter_factored_base_offsets64_i8", + "__pseudo_scatter_factored_base_offsets32_i8", false), + GSBOInfo("__pseudo_scatter_factored_base_offsets64_i16", + "__pseudo_scatter_factored_base_offsets32_i16", false), + GSBOInfo("__pseudo_scatter_factored_base_offsets64_i32", + "__pseudo_scatter_factored_base_offsets32_i32", false), + GSBOInfo("__pseudo_scatter_factored_base_offsets64_float", + "__pseudo_scatter_factored_base_offsets32_float", false), + GSBOInfo("__pseudo_scatter_factored_base_offsets64_i64", + "__pseudo_scatter_factored_base_offsets32_i64", false), + GSBOInfo("__pseudo_scatter_factored_base_offsets64_double", + "__pseudo_scatter_factored_base_offsets32_double", false), }; int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]); @@ -2025,29 +2025,29 @@ lGSToLoadStore(llvm::CallInst *callInst) { }; GatherImpInfo gInfo[] = { - GatherImpInfo("__pseudo_gather_base_offsets32_i8", "__masked_load_i8", + GatherImpInfo("__pseudo_gather_factored_base_offsets32_i8", "__masked_load_i8", LLVMTypes::Int8Type, 1), - GatherImpInfo("__pseudo_gather_base_offsets32_i16", "__masked_load_i16", + GatherImpInfo("__pseudo_gather_factored_base_offsets32_i16", "__masked_load_i16", LLVMTypes::Int16Type, 2), - GatherImpInfo("__pseudo_gather_base_offsets32_i32", "__masked_load_i32", + GatherImpInfo("__pseudo_gather_factored_base_offsets32_i32", "__masked_load_i32", LLVMTypes::Int32Type, 4), - GatherImpInfo("__pseudo_gather_base_offsets32_float", "__masked_load_float", + GatherImpInfo("__pseudo_gather_factored_base_offsets32_float", "__masked_load_float", LLVMTypes::FloatType, 4), - 
GatherImpInfo("__pseudo_gather_base_offsets32_i64", "__masked_load_i64", + GatherImpInfo("__pseudo_gather_factored_base_offsets32_i64", "__masked_load_i64", LLVMTypes::Int64Type, 8), - GatherImpInfo("__pseudo_gather_base_offsets32_double", "__masked_load_double", + GatherImpInfo("__pseudo_gather_factored_base_offsets32_double", "__masked_load_double", LLVMTypes::DoubleType, 8), - GatherImpInfo("__pseudo_gather_base_offsets64_i8", "__masked_load_i8", + GatherImpInfo("__pseudo_gather_factored_base_offsets64_i8", "__masked_load_i8", LLVMTypes::Int8Type, 1), - GatherImpInfo("__pseudo_gather_base_offsets64_i16", "__masked_load_i16", + GatherImpInfo("__pseudo_gather_factored_base_offsets64_i16", "__masked_load_i16", LLVMTypes::Int16Type, 2), - GatherImpInfo("__pseudo_gather_base_offsets64_i32", "__masked_load_i32", + GatherImpInfo("__pseudo_gather_factored_base_offsets64_i32", "__masked_load_i32", LLVMTypes::Int32Type, 4), - GatherImpInfo("__pseudo_gather_base_offsets64_float", "__masked_load_float", + GatherImpInfo("__pseudo_gather_factored_base_offsets64_float", "__masked_load_float", LLVMTypes::FloatType, 4), - GatherImpInfo("__pseudo_gather_base_offsets64_i64", "__masked_load_i64", + GatherImpInfo("__pseudo_gather_factored_base_offsets64_i64", "__masked_load_i64", LLVMTypes::Int64Type, 8), - GatherImpInfo("__pseudo_gather_base_offsets64_double", "__masked_load_double", + GatherImpInfo("__pseudo_gather_factored_base_offsets64_double", "__masked_load_double", LLVMTypes::DoubleType, 8) }; @@ -2067,29 +2067,29 @@ lGSToLoadStore(llvm::CallInst *callInst) { }; ScatterImpInfo sInfo[] = { - ScatterImpInfo("__pseudo_scatter_base_offsets32_i8", "__pseudo_masked_store_i8", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_i8", "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1), - ScatterImpInfo("__pseudo_scatter_base_offsets32_i16", "__pseudo_masked_store_i16", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_i16", "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2), - ScatterImpInfo("__pseudo_scatter_base_offsets32_i32", "__pseudo_masked_store_i32", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_i32", "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_base_offsets32_float", "__pseudo_masked_store_float", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_float", "__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_base_offsets32_i64", "__pseudo_masked_store_i64", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_i64", "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8), - ScatterImpInfo("__pseudo_scatter_base_offsets32_double", "__pseudo_masked_store_double", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_double", "__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8), - ScatterImpInfo("__pseudo_scatter_base_offsets64_i8", "__pseudo_masked_store_i8", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_i8", "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1), - ScatterImpInfo("__pseudo_scatter_base_offsets64_i16", "__pseudo_masked_store_i16", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_i16", "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2), - ScatterImpInfo("__pseudo_scatter_base_offsets64_i32", "__pseudo_masked_store_i32", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_i32", "__pseudo_masked_store_i32", 
@@ -3354,10 +3354,10 @@ GatherCoalescePass::runOnBasicBlock(llvm::BasicBlock &bb) {
     DEBUG_START_PASS("GatherCoalescePass");

     llvm::Function *gatherFuncs[] = {
-        m->module->getFunction("__pseudo_gather_base_offsets32_i32"),
-        m->module->getFunction("__pseudo_gather_base_offsets32_float"),
-        m->module->getFunction("__pseudo_gather_base_offsets64_i32"),
-        m->module->getFunction("__pseudo_gather_base_offsets64_float"),
+        m->module->getFunction("__pseudo_gather_factored_base_offsets32_i32"),
+        m->module->getFunction("__pseudo_gather_factored_base_offsets32_float"),
+        m->module->getFunction("__pseudo_gather_factored_base_offsets64_i32"),
+        m->module->getFunction("__pseudo_gather_factored_base_offsets64_float"),
     };
     int nGatherFuncs = sizeof(gatherFuncs) / sizeof(gatherFuncs[0]);
@@ -3367,7 +3367,7 @@ GatherCoalescePass::runOnBasicBlock(llvm::BasicBlock &bb) {
     for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end();
          iter != e; ++iter) {
         // Iterate over all of the instructions and look for calls to
-        // __pseudo_gather_base_offsets{32,64}_{i32,float} calls.
+        // __pseudo_gather_factored_base_offsets{32,64}_{i32,float} functions.
         llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
         if (callInst == NULL)
             continue;
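GatherCoalescePass only registers the four 32-bit-element gather variants. The payoff it appears to be after can be modeled in scalar terms: when the requested elements cluster inside one small contiguous window, a single wide load plus element selection replaces per-lane memory accesses. Illustrative only; the pass itself reasons about LLVM IR values, and the covering-window policy here is invented.

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Load the covering window [lo, hi + 4) once, then pick each requested
    // 4-byte element out of it, instead of touching memory once per lane.
    // Assumes numElts > 0 and that the window is small enough to be worth it.
    void coalescedGatherI32(const uint8_t *base, const int32_t *offsets,
                            int numElts, int32_t *out) {
        int32_t lo = offsets[0], hi = offsets[0];
        for (int i = 1; i < numElts; ++i) {
            lo = offsets[i] < lo ? offsets[i] : lo;
            hi = offsets[i] > hi ? offsets[i] : hi;
        }
        std::vector<uint8_t> window(hi - lo + 4);
        std::memcpy(window.data(), base + lo, window.size());  // one wide load
        for (int i = 0; i < numElts; ++i)
            std::memcpy(&out[i], window.data() + (offsets[i] - lo), 4);
    }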
@@ -3639,19 +3639,19 @@ lReplacePseudoGS(llvm::CallInst *callInst) {
     };

     LowerGSInfo lgsInfo[] = {
-        LowerGSInfo("__pseudo_gather_base_offsets32_i8", "__gather_base_offsets32_i8", true),
-        LowerGSInfo("__pseudo_gather_base_offsets32_i16", "__gather_base_offsets32_i16", true),
-        LowerGSInfo("__pseudo_gather_base_offsets32_i32", "__gather_base_offsets32_i32", true),
-        LowerGSInfo("__pseudo_gather_base_offsets32_float", "__gather_base_offsets32_float", true),
-        LowerGSInfo("__pseudo_gather_base_offsets32_i64", "__gather_base_offsets32_i64", true),
-        LowerGSInfo("__pseudo_gather_base_offsets32_double", "__gather_base_offsets32_double", true),
+        LowerGSInfo("__pseudo_gather_factored_base_offsets32_i8", "__gather_factored_base_offsets32_i8", true),
+        LowerGSInfo("__pseudo_gather_factored_base_offsets32_i16", "__gather_factored_base_offsets32_i16", true),
+        LowerGSInfo("__pseudo_gather_factored_base_offsets32_i32", "__gather_factored_base_offsets32_i32", true),
+        LowerGSInfo("__pseudo_gather_factored_base_offsets32_float", "__gather_factored_base_offsets32_float", true),
+        LowerGSInfo("__pseudo_gather_factored_base_offsets32_i64", "__gather_factored_base_offsets32_i64", true),
+        LowerGSInfo("__pseudo_gather_factored_base_offsets32_double", "__gather_factored_base_offsets32_double", true),

-        LowerGSInfo("__pseudo_gather_base_offsets64_i8", "__gather_base_offsets64_i8", true),
-        LowerGSInfo("__pseudo_gather_base_offsets64_i16", "__gather_base_offsets64_i16", true),
-        LowerGSInfo("__pseudo_gather_base_offsets64_i32", "__gather_base_offsets64_i32", true),
-        LowerGSInfo("__pseudo_gather_base_offsets64_float", "__gather_base_offsets64_float", true),
-        LowerGSInfo("__pseudo_gather_base_offsets64_i64", "__gather_base_offsets64_i64", true),
-        LowerGSInfo("__pseudo_gather_base_offsets64_double", "__gather_base_offsets64_double", true),
+        LowerGSInfo("__pseudo_gather_factored_base_offsets64_i8", "__gather_factored_base_offsets64_i8", true),
+        LowerGSInfo("__pseudo_gather_factored_base_offsets64_i16", "__gather_factored_base_offsets64_i16", true),
+        LowerGSInfo("__pseudo_gather_factored_base_offsets64_i32", "__gather_factored_base_offsets64_i32", true),
+        LowerGSInfo("__pseudo_gather_factored_base_offsets64_float", "__gather_factored_base_offsets64_float", true),
+        LowerGSInfo("__pseudo_gather_factored_base_offsets64_i64", "__gather_factored_base_offsets64_i64", true),
+        LowerGSInfo("__pseudo_gather_factored_base_offsets64_double", "__gather_factored_base_offsets64_double", true),

         LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true),
         LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true),
@@ -3667,19 +3667,19 @@ lReplacePseudoGS(llvm::CallInst *callInst) {
         LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true),
         LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true),

-        LowerGSInfo("__pseudo_scatter_base_offsets32_i8", "__scatter_base_offsets32_i8", false),
-        LowerGSInfo("__pseudo_scatter_base_offsets32_i16", "__scatter_base_offsets32_i16", false),
-        LowerGSInfo("__pseudo_scatter_base_offsets32_i32", "__scatter_base_offsets32_i32", false),
-        LowerGSInfo("__pseudo_scatter_base_offsets32_float", "__scatter_base_offsets32_float", false),
-        LowerGSInfo("__pseudo_scatter_base_offsets32_i64", "__scatter_base_offsets32_i64", false),
-        LowerGSInfo("__pseudo_scatter_base_offsets32_double", "__scatter_base_offsets32_double", false),
+        LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i8", "__scatter_factored_base_offsets32_i8", false),
+        LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16", "__scatter_factored_base_offsets32_i16", false),
+        LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32", "__scatter_factored_base_offsets32_i32", false),
+        LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float", "__scatter_factored_base_offsets32_float", false),
+        LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64", "__scatter_factored_base_offsets32_i64", false),
+        LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double", "__scatter_factored_base_offsets32_double", false),

-        LowerGSInfo("__pseudo_scatter_base_offsets64_i8", "__scatter_base_offsets64_i8", false),
-        LowerGSInfo("__pseudo_scatter_base_offsets64_i16", "__scatter_base_offsets64_i16", false),
-        LowerGSInfo("__pseudo_scatter_base_offsets64_i32", "__scatter_base_offsets64_i32", false),
-        LowerGSInfo("__pseudo_scatter_base_offsets64_float", "__scatter_base_offsets64_float", false),
-        LowerGSInfo("__pseudo_scatter_base_offsets64_i64", "__scatter_base_offsets64_i64", false),
-        LowerGSInfo("__pseudo_scatter_base_offsets64_double", "__scatter_base_offsets64_double", false),
+        LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8", "__scatter_factored_base_offsets64_i8", false),
+        LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16", "__scatter_factored_base_offsets64_i16", false),
+        LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32", "__scatter_factored_base_offsets64_i32", false),
+        LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float", "__scatter_factored_base_offsets64_float", false),
+        LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64", "__scatter_factored_base_offsets64_i64", false),
+        LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double", "__scatter_factored_base_offsets64_double", false),

         LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false),
         LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false),
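Each LowerGSInfo pair drives a one-for-one lowering: the pseudo declaration that earlier passes pattern-matched on is replaced by the target's concrete implementation with the same signature. The core of lReplacePseudoGS presumably reduces to something like this condensed sketch (table lookup and error handling elided; lowerPseudoCall is an invented name):

    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Module.h"

    // Point an existing call at the concrete gather/scatter implementation
    // named by the matching LowerGSInfo entry, leaving the arguments as-is.
    static void lowerPseudoCall(llvm::CallInst *callInst, llvm::Module *module,
                                const char *actualFuncName) {
        llvm::Function *actual = module->getFunction(actualFuncName);
        if (actual != nullptr)
            callInst->setCalledFunction(actual);
    }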
"__scatter_factored_base_offsets32_i8", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16", "__scatter_factored_base_offsets32_i16", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32", "__scatter_factored_base_offsets32_i32", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float", "__scatter_factored_base_offsets32_float", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64", "__scatter_factored_base_offsets32_i64", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double", "__scatter_factored_base_offsets32_double", false), - LowerGSInfo("__pseudo_scatter_base_offsets64_i8", "__scatter_base_offsets64_i8", false), - LowerGSInfo("__pseudo_scatter_base_offsets64_i16", "__scatter_base_offsets64_i16", false), - LowerGSInfo("__pseudo_scatter_base_offsets64_i32", "__scatter_base_offsets64_i32", false), - LowerGSInfo("__pseudo_scatter_base_offsets64_float", "__scatter_base_offsets64_float", false), - LowerGSInfo("__pseudo_scatter_base_offsets64_i64", "__scatter_base_offsets64_i64", false), - LowerGSInfo("__pseudo_scatter_base_offsets64_double", "__scatter_base_offsets64_double", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8", "__scatter_factored_base_offsets64_i8", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16", "__scatter_factored_base_offsets64_i16", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32", "__scatter_factored_base_offsets64_i32", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float", "__scatter_factored_base_offsets64_float", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64", "__scatter_factored_base_offsets64_i64", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double", "__scatter_factored_base_offsets64_double", false), LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false), LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false), @@ -3899,12 +3899,12 @@ bool MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { const char *names[] = { "__fast_masked_vload", - "__gather_base_offsets32_i8", "__gather_base_offsets32_i16", - "__gather_base_offsets32_i32", "__gather_base_offsets32_i64", - "__gather_base_offsets32_float", "__gather_base_offsets32_double", - "__gather_base_offsets64_i8", "__gather_base_offsets64_i16", - "__gather_base_offsets64_i32", "__gather_base_offsets64_i64", - "__gather_base_offsets64_float", "__gather_base_offsets64_double", + "__gather_factored_base_offsets32_i8", "__gather_factored_base_offsets32_i16", + "__gather_factored_base_offsets32_i32", "__gather_factored_base_offsets32_i64", + "__gather_factored_base_offsets32_float", "__gather_factored_base_offsets32_double", + "__gather_factored_base_offsets64_i8", "__gather_factored_base_offsets64_i16", + "__gather_factored_base_offsets64_i32", "__gather_factored_base_offsets64_i64", + "__gather_factored_base_offsets64_float", "__gather_factored_base_offsets64_double", "__gather32_i8", "__gather32_i16", "__gather32_i32", "__gather32_i64", "__gather32_float", "__gather32_double", @@ -3926,12 +3926,12 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { "__masked_store_blend_i8", "__masked_store_blend_i16", "__masked_store_blend_i32", "__masked_store_blend_i64", "__masked_store_blend_float", "__masked_store_blend_double", - "__scatter_base_offsets32_i8", "__scatter_base_offsets32_i16", - "__scatter_base_offsets32_i32", "__scatter_base_offsets32_i64", - "__scatter_base_offsets32_float", 
"__scatter_base_offsets32_double", - "__scatter_base_offsets64_i8", "__scatter_base_offsets64_i16", - "__scatter_base_offsets64_i32", "__scatter_base_offsets64_i64", - "__scatter_base_offsets64_float", "__scatter_base_offsets64_double", + "__scatter_factored_base_offsets32_i8", "__scatter_factored_base_offsets32_i16", + "__scatter_factored_base_offsets32_i32", "__scatter_factored_base_offsets32_i64", + "__scatter_factored_base_offsets32_float", "__scatter_factored_base_offsets32_double", + "__scatter_factored_base_offsets64_i8", "__scatter_factored_base_offsets64_i16", + "__scatter_factored_base_offsets64_i32", "__scatter_factored_base_offsets64_i64", + "__scatter_factored_base_offsets64_float", "__scatter_factored_base_offsets64_double", "__scatter_elt32_i8", "__scatter_elt32_i16", "__scatter_elt32_i32", "__scatter_elt32_i64", "__scatter_elt32_float", "__scatter_elt32_double",