diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll
index 9bb73d88..66d209af 100644
--- a/builtins/target-avx-x2.ll
+++ b/builtins/target-avx-x2.ll
@@ -598,10 +598,10 @@ define void @__masked_store_blend_i64(<16 x i64>* nocapture %ptr, <16 x i64> %ne
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; scatter

-gen_scatter(16, i8)
-gen_scatter(16, i16)
-gen_scatter(16, i32)
-gen_scatter(16, i64)
+gen_scatter(i8)
+gen_scatter(i16)
+gen_scatter(i32)
+gen_scatter(i64)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; double precision sqrt
diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll
index e5ded22a..2963664a 100644
--- a/builtins/target-avx.ll
+++ b/builtins/target-avx.ll
@@ -492,10 +492,10 @@ define void @__masked_store_blend_i64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; scatter

-gen_scatter(8, i8)
-gen_scatter(8, i16)
-gen_scatter(8, i32)
-gen_scatter(8, i64)
+gen_scatter(i8)
+gen_scatter(i16)
+gen_scatter(i32)
+gen_scatter(i64)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; double precision sqrt
diff --git a/builtins/target-avx1-x2.ll b/builtins/target-avx1-x2.ll
index 36f47cec..1c181a7b 100644
--- a/builtins/target-avx1-x2.ll
+++ b/builtins/target-avx1-x2.ll
@@ -69,9 +69,7 @@ declare @__float_to_half_varying( %v) nounwind read
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather

-gen_gather(16, i8)
-gen_gather(16, i16)
-gen_gather(16, i32)
-gen_gather(16, i64)
-
-
+gen_gather(i8)
+gen_gather(i16)
+gen_gather(i32)
+gen_gather(i64)
diff --git a/builtins/target-avx1.ll b/builtins/target-avx1.ll
index e46fc3b4..7638713d 100644
--- a/builtins/target-avx1.ll
+++ b/builtins/target-avx1.ll
@@ -69,7 +69,7 @@ declare @__float_to_half_varying( %v) nounwind read
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather

-gen_gather(8, i8)
-gen_gather(8, i16)
-gen_gather(8, i32)
-gen_gather(8, i64)
+gen_gather(i8)
+gen_gather(i16)
+gen_gather(i32)
+gen_gather(i64)
diff --git a/builtins/target-avx2-x2.ll b/builtins/target-avx2-x2.ll
index e4d3f686..31ecdb5f 100644
--- a/builtins/target-avx2-x2.ll
+++ b/builtins/target-avx2-x2.ll
@@ -121,9 +121,7 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather

-gen_gather(16, i8)
-gen_gather(16, i16)
-gen_gather(16, i32)
-gen_gather(16, i64)
-
-
+gen_gather(i8)
+gen_gather(i16)
+gen_gather(i32)
+gen_gather(i64)
diff --git a/builtins/target-avx2.ll b/builtins/target-avx2.ll
index 66b2a23e..30f74813 100644
--- a/builtins/target-avx2.ll
+++ b/builtins/target-avx2.ll
@@ -104,7 +104,7 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather

-gen_gather(8, i8)
-gen_gather(8, i16)
-gen_gather(8, i32)
-gen_gather(8, i64)
+gen_gather(i8)
+gen_gather(i16)
+gen_gather(i32)
+gen_gather(i64)
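[Editor's illustration, not part of the patch: the hunks above drop the explicit vector-width argument, so each target's gen_gather()/gen_scatter() call now names only the element type and the width is supplied by the m4 WIDTH symbol used later in util.m4. The suffix scheme the renamed builtins converge on mirrors the selection logic in ctx.cpp further down in this patch; the helper below is a hypothetical C++ sketch of that naming, not code from ispc.]

#include <iostream>
#include <string>

// Hypothetical helper (assumed name, for illustration only): compose a
// pseudo-gather symbol the way FunctionEmitContext::gather() in ctx.cpp
// selects one -- an address-width suffix (32/64, from g->target.is32Bit)
// followed by an LLVM-style element-type suffix (i8/i16/i32/i64).
static std::string pseudoGatherName(bool is32BitTarget, int elemBits) {
    return std::string("__pseudo_gather") + (is32BitTarget ? "32" : "64") +
           "_i" + std::to_string(elemBits);
}

int main() {
    std::cout << pseudoGatherName(true, 16) << "\n";   // __pseudo_gather32_i16
    std::cout << pseudoGatherName(false, 8) << "\n";   // __pseudo_gather64_i8
    return 0;
}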
diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll
index e87c3c0a..2ea20d69 100755
--- a/builtins/target-generic-1.ll
+++ b/builtins/target-generic-1.ll
@@ -36,15 +36,15 @@ masked_load(i64, 8)

 ; define these with the macros from stdlib.m4

-gen_gather(1, i8)
-gen_gather(1, i16)
-gen_gather(1, i32)
-gen_gather(1, i64)
+gen_gather(i8)
+gen_gather(i16)
+gen_gather(i32)
+gen_gather(i64)

-gen_scatter(1, i8)
-gen_scatter(1, i16)
-gen_scatter(1, i32)
-gen_scatter(1, i64)
+gen_scatter(i8)
+gen_scatter(i16)
+gen_scatter(i32)
+gen_scatter(i64)


 define <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8> ,
diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll
index 2574b81f..2690d027 100644
--- a/builtins/target-sse2-x2.ll
+++ b/builtins/target-sse2-x2.ll
@@ -446,15 +446,15 @@ masked_load(i64, 8)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter

-gen_gather(8, i8)
-gen_gather(8, i16)
-gen_gather(8, i32)
-gen_gather(8, i64)
+gen_gather(i8)
+gen_gather(i16)
+gen_gather(i32)
+gen_gather(i64)

-gen_scatter(8, i8)
-gen_scatter(8, i16)
-gen_scatter(8, i32)
-gen_scatter(8, i64)
+gen_scatter(i8)
+gen_scatter(i16)
+gen_scatter(i32)
+gen_scatter(i64)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float rounding
diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll
index 2275cf1b..cc0338cd 100644
--- a/builtins/target-sse2.ll
+++ b/builtins/target-sse2.ll
@@ -575,12 +575,12 @@ masked_load(i64, 8)

 ; define these with the macros from stdlib.m4

-gen_gather(4, i8)
-gen_gather(4, i16)
-gen_gather(4, i32)
-gen_gather(4, i64)
+gen_gather(i8)
+gen_gather(i16)
+gen_gather(i32)
+gen_gather(i64)

-gen_scatter(4, i8)
-gen_scatter(4, i16)
-gen_scatter(4, i32)
-gen_scatter(4, i64)
+gen_scatter(i8)
+gen_scatter(i16)
+gen_scatter(i32)
+gen_scatter(i64)
diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll
index a4416409..250770ed 100644
--- a/builtins/target-sse4-x2.ll
+++ b/builtins/target-sse4-x2.ll
@@ -372,15 +372,15 @@ masked_load(i64, 8)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter

-gen_gather(8, i8)
-gen_gather(8, i16)
-gen_gather(8, i32)
-gen_gather(8, i64)
+gen_gather(i8)
+gen_gather(i16)
+gen_gather(i32)
+gen_gather(i64)

-gen_scatter(8, i8)
-gen_scatter(8, i16)
-gen_scatter(8, i32)
-gen_scatter(8, i64)
+gen_scatter(i8)
+gen_scatter(i16)
+gen_scatter(i32)
+gen_scatter(i64)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float rounding
diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll
index c31a23b2..a17e6c6e 100644
--- a/builtins/target-sse4.ll
+++ b/builtins/target-sse4.ll
@@ -474,12 +474,12 @@ masked_load(i64, 8)

 ; define these with the macros from stdlib.m4

-gen_gather(4, i8)
-gen_gather(4, i16)
-gen_gather(4, i32)
-gen_gather(4, i64)
+gen_gather(i8)
+gen_gather(i16)
+gen_gather(i32)
+gen_gather(i64)

-gen_scatter(4, i8)
-gen_scatter(4, i16)
-gen_scatter(4, i32)
-gen_scatter(4, i64)
+gen_scatter(i8)
+gen_scatter(i16)
+gen_scatter(i32)
+gen_scatter(i64)
diff --git a/builtins/util.m4 b/builtins/util.m4
index 212f6076..d3f58f8a 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -1567,10 +1567,10 @@ declare void @__pseudo_masked_store_i64( * nocapture,
 ; to perform a gather, it generates a call to one of these functions,
 ; which have signatures:
 ;
-; varying int8 __pseudo_gather(varying int8 *, mask)
-; varying int16 __pseudo_gather(varying int16 *, mask)
-; varying int32 __pseudo_gather(varying int32 *, mask)
-; varying int64 __pseudo_gather(varying int64 *, mask)
+; varying int8 __pseudo_gather_i8(varying int8 *, mask)
+; varying int16 __pseudo_gather_i16(varying int16 *, mask)
+; varying int32 __pseudo_gather_i32(varying int32 *, mask)
+; varying int64 __pseudo_gather_i64(varying int64 *, mask)
 ;
 ; The GatherScatterFlattenOpt optimization pass finds these calls and then
 ; converts them to make calls to the following functions (when
appropriate); @@ -1591,46 +1591,46 @@ declare void @__pseudo_masked_store_i64( * nocapture, ; converts them to native gather functions or converts them to vector ; loads, if equivalent. -declare @__pseudo_gather32_8(, ) nounwind readonly -declare @__pseudo_gather32_16(, ) nounwind readonly -declare @__pseudo_gather32_32(, ) nounwind readonly -declare @__pseudo_gather32_64(, ) nounwind readonly +declare @__pseudo_gather32_i8(, ) nounwind readonly +declare @__pseudo_gather32_i16(, ) nounwind readonly +declare @__pseudo_gather32_i32(, ) nounwind readonly +declare @__pseudo_gather32_i64(, ) nounwind readonly -declare @__pseudo_gather64_8(, ) nounwind readonly -declare @__pseudo_gather64_16(, ) nounwind readonly -declare @__pseudo_gather64_32(, ) nounwind readonly -declare @__pseudo_gather64_64(, ) nounwind readonly +declare @__pseudo_gather64_i8(, ) nounwind readonly +declare @__pseudo_gather64_i16(, ) nounwind readonly +declare @__pseudo_gather64_i32(, ) nounwind readonly +declare @__pseudo_gather64_i64(, ) nounwind readonly -declare @__pseudo_gather_base_offsets32_8(i8 *, , i32, , +declare @__pseudo_gather_base_offsets32_i8(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets32_16(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_base_offsets32_32(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_base_offsets32_64(i8 *, , i32, , - ) nounwind readonly +declare @__pseudo_gather_base_offsets32_i16(i8 *, , i32, , + ) nounwind readonly +declare @__pseudo_gather_base_offsets32_i32(i8 *, , i32, , + ) nounwind readonly +declare @__pseudo_gather_base_offsets32_i64(i8 *, , i32, , + ) nounwind readonly -declare @__pseudo_gather_base_offsets64_8(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_base_offsets64_16(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_base_offsets64_32(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_base_offsets64_64(i8 *, , i32, , - ) nounwind readonly +declare @__pseudo_gather_base_offsets64_i8(i8 *, , i32, , + ) nounwind readonly +declare @__pseudo_gather_base_offsets64_i16(i8 *, , i32, , + ) nounwind readonly +declare @__pseudo_gather_base_offsets64_i32(i8 *, , i32, , + ) nounwind readonly +declare @__pseudo_gather_base_offsets64_i64(i8 *, , i32, , + ) nounwind readonly ; Similarly to the pseudo-gathers defined above, we also declare undefined ; pseudo-scatter instructions with signatures: ; -; void __pseudo_scatter_8 (varying int8 *, varying int8 values, mask) -; void __pseudo_scatter_16(varying int16 *, varying int16 values, mask) -; void __pseudo_scatter_32(varying int32 *, varying int32 values, mask) -; void __pseudo_scatter_64(varying int64 *, varying int64 values, mask) +; void __pseudo_scatter_i8 (varying int8 *, varying int8 values, mask) +; void __pseudo_scatter_i16(varying int16 *, varying int16 values, mask) +; void __pseudo_scatter_i32(varying int32 *, varying int32 values, mask) +; void __pseudo_scatter_i64(varying int64 *, varying int64 values, mask) ; ; The GatherScatterFlattenOpt optimization pass also finds these and ; transforms them to scatters like: ; -; void __pseudo_scatter_base_offsets{32,64}_8(uniform int8 *base, +; void __pseudo_scatter_base_offsets{32,64}_i8(uniform int8 *base, ; varying int32 offsets, uniform int32 offset_scale, ; varying int{32,64} offset_delta, varying int8 values, mask) ; (and similarly for 16/32/64 bit values) @@ -1638,33 +1638,33 @@ declare @__pseudo_gather_base_offsets64_64(i8 *, , i3 ; And the GSImprovementsPass in turn converts these 
to actual native ; scatters or masked stores. -declare void @__pseudo_scatter32_8(, , ) nounwind -declare void @__pseudo_scatter32_16(, , ) nounwind -declare void @__pseudo_scatter32_32(, , ) nounwind -declare void @__pseudo_scatter32_64(, , ) nounwind +declare void @__pseudo_scatter32_i8(, , ) nounwind +declare void @__pseudo_scatter32_i16(, , ) nounwind +declare void @__pseudo_scatter32_i32(, , ) nounwind +declare void @__pseudo_scatter32_i64(, , ) nounwind -declare void @__pseudo_scatter64_8(, , ) nounwind -declare void @__pseudo_scatter64_16(, , ) nounwind -declare void @__pseudo_scatter64_32(, , ) nounwind -declare void @__pseudo_scatter64_64(, , ) nounwind +declare void @__pseudo_scatter64_i8(, , ) nounwind +declare void @__pseudo_scatter64_i16(, , ) nounwind +declare void @__pseudo_scatter64_i32(, , ) nounwind +declare void @__pseudo_scatter64_i64(, , ) nounwind -declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, , i32, , - , ) nounwind +declare void @__pseudo_scatter_base_offsets32_i8(i8 * nocapture, , i32, , + , ) nounwind +declare void @__pseudo_scatter_base_offsets32_i16(i8 * nocapture, , i32, , + , ) nounwind +declare void @__pseudo_scatter_base_offsets32_i32(i8 * nocapture, , i32, , + , ) nounwind +declare void @__pseudo_scatter_base_offsets32_i64(i8 * nocapture, , i32, , + , ) nounwind -declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, , i32, , - , ) nounwind +declare void @__pseudo_scatter_base_offsets64_i8(i8 * nocapture, , i32, , + , ) nounwind +declare void @__pseudo_scatter_base_offsets64_i16(i8 * nocapture, , i32, , + , ) nounwind +declare void @__pseudo_scatter_base_offsets64_i32(i8 * nocapture, , i32, , + , ) nounwind +declare void @__pseudo_scatter_base_offsets64_i64(i8 * nocapture, , i32, , + , ) nounwind declare float @__log_uniform_float(float) nounwind readnone declare @__log_varying_float() nounwind readnone @@ -1740,73 +1740,73 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gathers - %pg32_8 = call @__pseudo_gather32_8( %v32, - %mask) + %pg32_8 = call @__pseudo_gather32_i8( %v32, + %mask) call void @__use8( %pg32_8) - %pg32_16 = call @__pseudo_gather32_16( %v32, - %mask) + %pg32_16 = call @__pseudo_gather32_i16( %v32, + %mask) call void @__use16( %pg32_16) - %pg32_32 = call @__pseudo_gather32_32( %v32, - %mask) + %pg32_32 = call @__pseudo_gather32_i32( %v32, + %mask) call void @__use32( %pg32_32) - %pg32_64 = call @__pseudo_gather32_64( %v32, - %mask) + %pg32_64 = call @__pseudo_gather32_i64( %v32, + %mask) call void @__use64( %pg32_64) - %pg64_8 = call @__pseudo_gather64_8( %v64, - %mask) + %pg64_8 = call @__pseudo_gather64_i8( %v64, + %mask) call void @__use8( %pg64_8) - %pg64_16 = call @__pseudo_gather64_16( %v64, - %mask) + %pg64_16 = call @__pseudo_gather64_i16( %v64, + %mask) call void @__use16( %pg64_16) - %pg64_32 = call @__pseudo_gather64_32( %v64, - %mask) + %pg64_32 = call @__pseudo_gather64_i32( %v64, + 
%mask) call void @__use32( %pg64_32) - %pg64_64 = call @__pseudo_gather64_64( %v64, - %mask) + %pg64_64 = call @__pseudo_gather64_i64( %v64, + %mask) call void @__use64( %pg64_64) %g32_8 = call @__gather32_i8( %v32, - %mask) + %mask) call void @__use8( %g32_8) %g32_16 = call @__gather32_i16( %v32, - %mask) + %mask) call void @__use16( %g32_16) %g32_32 = call @__gather32_i32( %v32, - %mask) + %mask) call void @__use32( %g32_32) %g32_64 = call @__gather32_i64( %v32, - %mask) + %mask) call void @__use64( %g32_64) %g64_8 = call @__gather64_i8( %v64, - %mask) + %mask) call void @__use8( %g64_8) %g64_16 = call @__gather64_i16( %v64, - %mask) + %mask) call void @__use16( %g64_16) %g64_32 = call @__gather64_i32( %v64, - %mask) + %mask) call void @__use32( %g64_32) %g64_64 = call @__gather64_i64( %v64, %mask) call void @__use64( %g64_64) %pgbo32_8 = call - @__pseudo_gather_base_offsets32_8(i8 * %ptr, %v32, i32 0, - %v32, %mask) + @__pseudo_gather_base_offsets32_i8(i8 * %ptr, %v32, i32 0, + %v32, %mask) call void @__use8( %pgbo32_8) %pgbo32_16 = call - @__pseudo_gather_base_offsets32_16(i8 * %ptr, %v32, i32 0, - %v32, %mask) + @__pseudo_gather_base_offsets32_i16(i8 * %ptr, %v32, i32 0, + %v32, %mask) call void @__use16( %pgbo32_16) %pgbo32_32 = call - @__pseudo_gather_base_offsets32_32(i8 * %ptr, %v32, i32 0, - %v32, %mask) + @__pseudo_gather_base_offsets32_i32(i8 * %ptr, %v32, i32 0, + %v32, %mask) call void @__use32( %pgbo32_32) %pgbo32_64 = call - @__pseudo_gather_base_offsets32_64(i8 * %ptr, %v32, i32 0, - %v32, %mask) + @__pseudo_gather_base_offsets32_i64(i8 * %ptr, %v32, i32 0, + %v32, %mask) call void @__use64( %pgbo32_64) %gbo32_8 = call @@ -1828,20 +1828,20 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, %pgbo64_8 = call - @__pseudo_gather_base_offsets64_8(i8 * %ptr, %v64, i32 0, - %v64, %mask) + @__pseudo_gather_base_offsets64_i8(i8 * %ptr, %v64, i32 0, + %v64, %mask) call void @__use8( %pgbo64_8) %pgbo64_16 = call - @__pseudo_gather_base_offsets64_16(i8 * %ptr, %v64, i32 0, - %v64, %mask) + @__pseudo_gather_base_offsets64_i16(i8 * %ptr, %v64, i32 0, + %v64, %mask) call void @__use16( %pgbo64_16) %pgbo64_32 = call - @__pseudo_gather_base_offsets64_32(i8 * %ptr, %v64, i32 0, - %v64, %mask) + @__pseudo_gather_base_offsets64_i32(i8 * %ptr, %v64, i32 0, + %v64, %mask) call void @__use32( %pgbo64_32) %pgbo64_64 = call - @__pseudo_gather_base_offsets64_64(i8 * %ptr, %v64, i32 0, - %v64, %mask) + @__pseudo_gather_base_offsets64_i64(i8 * %ptr, %v64, i32 0, + %v64, %mask) call void @__use64( %pgbo64_64) %gbo64_8 = call @@ -1864,15 +1864,15 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; scatters - call void @__pseudo_scatter32_8( %v32, %v8, %mask) - call void @__pseudo_scatter32_16( %v32, %v16, %mask) - call void @__pseudo_scatter32_32( %v32, %v32, %mask) - call void @__pseudo_scatter32_64( %v32, %v64, %mask) + call void @__pseudo_scatter32_i8( %v32, %v8, %mask) + call void @__pseudo_scatter32_i16( %v32, %v16, %mask) + call void @__pseudo_scatter32_i32( %v32, %v32, %mask) + call void @__pseudo_scatter32_i64( %v32, %v64, %mask) - call void @__pseudo_scatter64_8( %v64, %v8, %mask) - call void @__pseudo_scatter64_16( %v64, %v16, %mask) - call void @__pseudo_scatter64_32( %v64, %v32, %mask) - call void @__pseudo_scatter64_64( %v64, %v64, %mask) + call void @__pseudo_scatter64_i8( %v64, %v8, %mask) + call void @__pseudo_scatter64_i16( %v64, %v16, %mask) + call void @__pseudo_scatter64_i32( %v64, %v32, %mask) + 
call void @__pseudo_scatter64_i64( %v64, %v64, %mask) call void @__scatter32_i8( %v32, %v8, %mask) call void @__scatter32_i16( %v32, %v16, %mask) @@ -1884,41 +1884,41 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, call void @__scatter64_i32( %v64, %v32, %mask) call void @__scatter64_i64( %v64, %v64, %mask) - call void @__pseudo_scatter_base_offsets32_8(i8 * %ptr, %v32, i32 0, %v32, + call void @__pseudo_scatter_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, %v8, %mask) - call void @__pseudo_scatter_base_offsets32_16(i8 * %ptr, %v32, i32 0, %v32, + call void @__pseudo_scatter_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, %v16, %mask) - call void @__pseudo_scatter_base_offsets32_32(i8 * %ptr, %v32, i32 0, %v32, + call void @__pseudo_scatter_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, %v32, %mask) - call void @__pseudo_scatter_base_offsets32_64(i8 * %ptr, %v32, i32 0, %v32, + call void @__pseudo_scatter_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, %v64, %mask) - call void @__pseudo_scatter_base_offsets64_8(i8 * %ptr, %v64, i32 0, %v64, + call void @__pseudo_scatter_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, %v8, %mask) - call void @__pseudo_scatter_base_offsets64_16(i8 * %ptr, %v64, i32 0, %v64, + call void @__pseudo_scatter_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, %v16, %mask) - call void @__pseudo_scatter_base_offsets64_32(i8 * %ptr, %v64, i32 0, %v64, + call void @__pseudo_scatter_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, %v32, %mask) - call void @__pseudo_scatter_base_offsets64_64(i8 * %ptr, %v64, i32 0, %v64, + call void @__pseudo_scatter_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, %v64, %mask) call void @__scatter_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, - %v8, %mask) + %v8, %mask) call void @__scatter_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, - %v16, %mask) + %v16, %mask) call void @__scatter_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, - %v32, %mask) + %v32, %mask) call void @__scatter_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, - %v64, %mask) + %v64, %mask) call void @__scatter_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, - %v8, %mask) + %v8, %mask) call void @__scatter_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, - %v16, %mask) + %v16, %mask) call void @__scatter_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, - %v32, %mask) + %v32, %mask) call void @__scatter_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, - %v64, %mask) + %v64, %mask) ret void } @@ -3076,18 +3076,17 @@ pl_done: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather ;; -;; $1: vector width of the target -;; $2: scalar type for which to generate functions to do gathers +;; $1: scalar type for which to generate functions to do gathers ; vec width, type define(`gen_gather', ` ;; Define the utility function to do the gather operation for a single element ;; of the type -define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale, - <$1 x i32> %offset_delta, <$1 x $2> %ret, +define @__gather_elt32_$1(i8 * %ptr, %offsets, i32 %offset_scale, + %offset_delta, %ret, i32 %lane) nounwind readonly alwaysinline { ; compute address for this one from the base - %offset32 = extractelement <$1 x i32> %offsets, i32 %lane + %offset32 = extractelement %offsets, i32 %lane ; the order and details of the next 4 lines are important--they match LLVMs ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations %offset64 = sext i32 %offset32 to i64 @@ -3095,131 +3094,131 @@ define <$1 x 
$2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_ %offset = mul i64 %offset64, %scale64 %ptroffset = getelementptr i8 * %ptr, i64 %offset - %delta = extractelement <$1 x i32> %offset_delta, i32 %lane + %delta = extractelement %offset_delta, i32 %lane %delta64 = sext i32 %delta to i64 %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 ; load value and insert into returned value - %ptrcast = bitcast i8 * %finalptr to $2 * - %val = load $2 *%ptrcast - %updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane - ret <$1 x $2> %updatedret + %ptrcast = bitcast i8 * %finalptr to $1 * + %val = load $1 *%ptrcast + %updatedret = insertelement %ret, $1 %val, i32 %lane + ret %updatedret } -define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale, - <$1 x i64> %offset_delta, <$1 x $2> %ret, +define @__gather_elt64_$1(i8 * %ptr, %offsets, i32 %offset_scale, + %offset_delta, %ret, i32 %lane) nounwind readonly alwaysinline { ; compute address for this one from the base - %offset64 = extractelement <$1 x i64> %offsets, i32 %lane + %offset64 = extractelement %offsets, i32 %lane ; the order and details of the next 4 lines are important--they match LLVMs ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations %offset_scale64 = sext i32 %offset_scale to i64 %offset = mul i64 %offset64, %offset_scale64 %ptroffset = getelementptr i8 * %ptr, i64 %offset - %delta64 = extractelement <$1 x i64> %offset_delta, i32 %lane + %delta64 = extractelement %offset_delta, i32 %lane %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 ; load value and insert into returned value - %ptrcast = bitcast i8 * %finalptr to $2 * - %val = load $2 *%ptrcast - %updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane - ret <$1 x $2> %updatedret + %ptrcast = bitcast i8 * %finalptr to $1 * + %val = load $1 *%ptrcast + %updatedret = insertelement %ret, $1 %val, i32 %lane + ret %updatedret } -define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale, - <$1 x i32> %offset_delta, - <$1 x i32> %vecmask) nounwind readonly alwaysinline { +define @__gather_base_offsets32_$1(i8 * %ptr, %offsets, i32 %offset_scale, + %offset_delta, + %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing ; to require that the 0th element of the array being gathered from is always ; legal to read from (and we do indeed require that, given the benefits!) 
; ; Set the offset to zero for lanes that are off - %offsetsPtr = alloca <$1 x i32> - store <$1 x i32> zeroinitializer, <$1 x i32> * %offsetsPtr - call void @__masked_store_blend_i32(<$1 x i32> * %offsetsPtr, <$1 x i32> %offsets, - <$1 x i32> %vecmask) - %newOffsets = load <$1 x i32> * %offsetsPtr + %offsetsPtr = alloca + store zeroinitializer, * %offsetsPtr + call void @__masked_store_blend_i32( * %offsetsPtr, %offsets, + %vecmask) + %newOffsets = load * %offsetsPtr - %deltaPtr = alloca <$1 x i32> - store <$1 x i32> zeroinitializer, <$1 x i32> * %deltaPtr - call void @__masked_store_blend_i32(<$1 x i32> * %deltaPtr, <$1 x i32> %offset_delta, - <$1 x i32> %vecmask) - %newDelta = load <$1 x i32> * %deltaPtr + %deltaPtr = alloca + store zeroinitializer, * %deltaPtr + call void @__masked_store_blend_i32( * %deltaPtr, %offset_delta, + %vecmask) + %newDelta = load * %deltaPtr - %ret0 = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %newOffsets, - i32 %offset_scale, <$1 x i32> %newDelta, - <$1 x $2> undef, i32 0) - forloop(lane, 1, eval($1-1), - `patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, - <$1 x i32> %newOffsets, i32 %offset_scale, <$1 x i32> %newDelta, - <$1 x $2> %retPREV, i32 LANE) + %ret0 = call @__gather_elt32_$1(i8 * %ptr, %newOffsets, + i32 %offset_scale, %newDelta, + undef, i32 0) + forloop(lane, 1, eval(WIDTH-1), + `patsubst(patsubst(`%retLANE = call @__gather_elt32_$1(i8 * %ptr, + %newOffsets, i32 %offset_scale, %newDelta, + %retPREV, i32 LANE) ', `LANE', lane), `PREV', eval(lane-1))') - ret <$1 x $2> %ret`'eval($1-1) + ret %ret`'eval(WIDTH-1) } -define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale, - <$1 x i64> %offset_delta, - <$1 x i32> %vecmask) nounwind readonly alwaysinline { +define @__gather_base_offsets64_$1(i8 * %ptr, %offsets, i32 %offset_scale, + %offset_delta, + %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing ; to require that the 0th element of the array being gathered from is always ; legal to read from (and we do indeed require that, given the benefits!) 
; ; Set the offset to zero for lanes that are off - %offsetsPtr = alloca <$1 x i64> - store <$1 x i64> zeroinitializer, <$1 x i64> * %offsetsPtr - call void @__masked_store_blend_i64(<$1 x i64> * %offsetsPtr, <$1 x i64> %offsets, - <$1 x i32> %vecmask) - %newOffsets = load <$1 x i64> * %offsetsPtr + %offsetsPtr = alloca + store zeroinitializer, * %offsetsPtr + call void @__masked_store_blend_i64( * %offsetsPtr, %offsets, + %vecmask) + %newOffsets = load * %offsetsPtr - %deltaPtr = alloca <$1 x i64> - store <$1 x i64> zeroinitializer, <$1 x i64> * %deltaPtr - call void @__masked_store_blend_i64(<$1 x i64> * %deltaPtr, <$1 x i64> %offset_delta, - <$1 x i32> %vecmask) - %newDelta = load <$1 x i64> * %deltaPtr + %deltaPtr = alloca + store zeroinitializer, * %deltaPtr + call void @__masked_store_blend_i64( * %deltaPtr, %offset_delta, + %vecmask) + %newDelta = load * %deltaPtr - %ret0 = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %newOffsets, - i32 %offset_scale, <$1 x i64> %newDelta, - <$1 x $2> undef, i32 0) - forloop(lane, 1, eval($1-1), - `patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr, - <$1 x i64> %newOffsets, i32 %offset_scale, <$1 x i64> %newDelta, - <$1 x $2> %retPREV, i32 LANE) + %ret0 = call @__gather_elt64_$1(i8 * %ptr, %newOffsets, + i32 %offset_scale, %newDelta, + undef, i32 0) + forloop(lane, 1, eval(WIDTH-1), + `patsubst(patsubst(`%retLANE = call @__gather_elt64_$1(i8 * %ptr, + %newOffsets, i32 %offset_scale, %newDelta, + %retPREV, i32 LANE) ', `LANE', lane), `PREV', eval(lane-1))') - ret <$1 x $2> %ret`'eval($1-1) + ret %ret`'eval(WIDTH-1) } ; fully general 32-bit gather, takes array of pointers encoded as vector of i32s -define <$1 x $2> @__gather32_$2(<$1 x i32> %ptrs, - <$1 x i32> %vecmask) nounwind readonly alwaysinline { - %ret_ptr = alloca <$1 x $2> - per_lane($1, <$1 x i32> %vecmask, ` - %iptr_LANE_ID = extractelement <$1 x i32> %ptrs, i32 LANE - %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $2 * - %val_LANE_ID = load $2 * %ptr_LANE_ID - %store_ptr_LANE_ID = getelementptr <$1 x $2> * %ret_ptr, i32 0, i32 LANE - store $2 %val_LANE_ID, $2 * %store_ptr_LANE_ID +define @__gather32_$1( %ptrs, + %vecmask) nounwind readonly alwaysinline { + %ret_ptr = alloca + per_lane(WIDTH, %vecmask, ` + %iptr_LANE_ID = extractelement %ptrs, i32 LANE + %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * + %val_LANE_ID = load $1 * %ptr_LANE_ID + %store_ptr_LANE_ID = getelementptr * %ret_ptr, i32 0, i32 LANE + store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID ') - %ret = load <$1 x $2> * %ret_ptr - ret <$1 x $2> %ret + %ret = load * %ret_ptr + ret %ret } ; fully general 64-bit gather, takes array of pointers encoded as vector of i32s -define <$1 x $2> @__gather64_$2(<$1 x i64> %ptrs, - <$1 x i32> %vecmask) nounwind readonly alwaysinline { - %ret_ptr = alloca <$1 x $2> - per_lane($1, <$1 x i32> %vecmask, ` - %iptr_LANE_ID = extractelement <$1 x i64> %ptrs, i32 LANE - %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $2 * - %val_LANE_ID = load $2 * %ptr_LANE_ID - %store_ptr_LANE_ID = getelementptr <$1 x $2> * %ret_ptr, i32 0, i32 LANE - store $2 %val_LANE_ID, $2 * %store_ptr_LANE_ID +define @__gather64_$1( %ptrs, + %vecmask) nounwind readonly alwaysinline { + %ret_ptr = alloca + per_lane(WIDTH, %vecmask, ` + %iptr_LANE_ID = extractelement %ptrs, i32 LANE + %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * + %val_LANE_ID = load $1 * %ptr_LANE_ID + %store_ptr_LANE_ID = getelementptr * %ret_ptr, i32 0, i32 LANE + store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID ') - %ret = 
load <$1 x $2> * %ret_ptr - ret <$1 x $2> %ret + %ret = load * %ret_ptr + ret %ret } ' ) @@ -3229,16 +3228,15 @@ define <$1 x $2> @__gather64_$2(<$1 x i64> %ptrs, ;; gen_scatter ;; Emit a function declaration for a scalarized scatter. ;; -;; $1: target vector width -;; $2: scalar type for which we want to generate code to scatter +;; $1: scalar type for which we want to generate code to scatter define(`gen_scatter', ` ;; Define the function that descripes the work to do to scatter a single ;; value -define void @__scatter_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scale, - <$1 x i32> %offset_delta, <$1 x $2> %values, +define void @__scatter_elt32_$1(i8 * %ptr, %offsets, i32 %offset_scale, + %offset_delta, %values, i32 %lane) nounwind alwaysinline { - %offset32 = extractelement <$1 x i32> %offsets, i32 %lane + %offset32 = extractelement %offsets, i32 %lane ; the order and details of the next 4 lines are important--they match LLVMs ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations %offset64 = sext i32 %offset32 to i64 @@ -3246,75 +3244,75 @@ define void @__scatter_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, i32 %offset_scal %offset = mul i64 %offset64, %scale64 %ptroffset = getelementptr i8 * %ptr, i64 %offset - %delta = extractelement <$1 x i32> %offset_delta, i32 %lane + %delta = extractelement %offset_delta, i32 %lane %delta64 = sext i32 %delta to i64 %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 - %ptrcast = bitcast i8 * %finalptr to $2 * - %storeval = extractelement <$1 x $2> %values, i32 %lane - store $2 %storeval, $2 * %ptrcast + %ptrcast = bitcast i8 * %finalptr to $1 * + %storeval = extractelement %values, i32 %lane + store $1 %storeval, $1 * %ptrcast ret void } -define void @__scatter_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, i32 %offset_scale, - <$1 x i64> %offset_delta, <$1 x $2> %values, +define void @__scatter_elt64_$1(i8 * %ptr, %offsets, i32 %offset_scale, + %offset_delta, %values, i32 %lane) nounwind alwaysinline { - %offset64 = extractelement <$1 x i64> %offsets, i32 %lane + %offset64 = extractelement %offsets, i32 %lane ; the order and details of the next 4 lines are important--they match LLVMs ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations %scale64 = sext i32 %offset_scale to i64 %offset = mul i64 %offset64, %scale64 %ptroffset = getelementptr i8 * %ptr, i64 %offset - %delta64 = extractelement <$1 x i64> %offset_delta, i32 %lane + %delta64 = extractelement %offset_delta, i32 %lane %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 - %ptrcast = bitcast i8 * %finalptr to $2 * - %storeval = extractelement <$1 x $2> %values, i32 %lane - store $2 %storeval, $2 * %ptrcast + %ptrcast = bitcast i8 * %finalptr to $1 * + %storeval = extractelement %values, i32 %lane + store $1 %storeval, $1 * %ptrcast ret void } -define void @__scatter_base_offsets32_$2(i8* %base, <$1 x i32> %offsets, i32 %offset_scale, - <$1 x i32> %offset_delta, <$1 x $2> %values, - <$1 x i32> %mask) nounwind alwaysinline { +define void @__scatter_base_offsets32_$1(i8* %base, %offsets, i32 %offset_scale, + %offset_delta, %values, + %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... 
- per_lane($1, <$1 x i32> %mask, ` - call void @__scatter_elt32_$2(i8 * %base, <$1 x i32> %offsets, i32 %offset_scale, - <$1 x i32> %offset_delta, <$1 x $2> %values, i32 LANE)') + per_lane(WIDTH, %mask, ` + call void @__scatter_elt32_$1(i8 * %base, %offsets, i32 %offset_scale, + %offset_delta, %values, i32 LANE)') ret void } -define void @__scatter_base_offsets64_$2(i8* %base, <$1 x i64> %offsets, i32 %offset_scale, - <$1 x i64> %offset_delta, <$1 x $2> %values, - <$1 x i32> %mask) nounwind alwaysinline { +define void @__scatter_base_offsets64_$1(i8* %base, %offsets, i32 %offset_scale, + %offset_delta, %values, + %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... - per_lane($1, <$1 x i32> %mask, ` - call void @__scatter_elt64_$2(i8 * %base, <$1 x i64> %offsets, i32 %offset_scale, - <$1 x i64> %offset_delta, <$1 x $2> %values, i32 LANE)') + per_lane(WIDTH, %mask, ` + call void @__scatter_elt64_$1(i8 * %base, %offsets, i32 %offset_scale, + %offset_delta, %values, i32 LANE)') ret void } ; fully general 32-bit scatter, takes array of pointers encoded as vector of i32s -define void @__scatter32_$2(<$1 x i32> %ptrs, <$1 x $2> %values, - <$1 x i32> %mask) nounwind alwaysinline { - per_lane($1, <$1 x i32> %mask, ` - %iptr_LANE_ID = extractelement <$1 x i32> %ptrs, i32 LANE - %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $2 * - %val_LANE_ID = extractelement <$1 x $2> %values, i32 LANE - store $2 %val_LANE_ID, $2 * %ptr_LANE_ID +define void @__scatter32_$1( %ptrs, %values, + %mask) nounwind alwaysinline { + per_lane(WIDTH, %mask, ` + %iptr_LANE_ID = extractelement %ptrs, i32 LANE + %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * + %val_LANE_ID = extractelement %values, i32 LANE + store $1 %val_LANE_ID, $1 * %ptr_LANE_ID ') ret void } ; fully general 64-bit scatter, takes array of pointers encoded as vector of i64s -define void @__scatter64_$2(<$1 x i64> %ptrs, <$1 x $2> %values, - <$1 x i32> %mask) nounwind alwaysinline { - per_lane($1, <$1 x i32> %mask, ` - %iptr_LANE_ID = extractelement <$1 x i64> %ptrs, i32 LANE - %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $2 * - %val_LANE_ID = extractelement <$1 x $2> %values, i32 LANE - store $2 %val_LANE_ID, $2 * %ptr_LANE_ID +define void @__scatter64_$1( %ptrs, %values, + %mask) nounwind alwaysinline { + per_lane(WIDTH, %mask, ` + %iptr_LANE_ID = extractelement %ptrs, i32 LANE + %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * + %val_LANE_ID = extractelement %values, i32 LANE + store $1 %val_LANE_ID, $1 * %ptr_LANE_ID ') ret void } diff --git a/ctx.cpp b/ctx.cpp index 9468a10d..c8429953 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -2516,23 +2516,23 @@ FunctionEmitContext::gather(llvm::Value *ptr, const PointerType *ptrType, const PointerType *pt = CastType(returnType); const char *funcName = NULL; if (pt != NULL) - funcName = g->target.is32Bit ? "__pseudo_gather32_32" : - "__pseudo_gather64_64"; + funcName = g->target.is32Bit ? "__pseudo_gather32_i32" : + "__pseudo_gather64_i64"; else if (llvmReturnType == LLVMTypes::DoubleVectorType || llvmReturnType == LLVMTypes::Int64VectorType) - funcName = g->target.is32Bit ? "__pseudo_gather32_64" : - "__pseudo_gather64_64"; + funcName = g->target.is32Bit ? "__pseudo_gather32_i64" : + "__pseudo_gather64_i64"; else if (llvmReturnType == LLVMTypes::FloatVectorType || llvmReturnType == LLVMTypes::Int32VectorType) - funcName = g->target.is32Bit ? "__pseudo_gather32_32" : - "__pseudo_gather64_32"; + funcName = g->target.is32Bit ? 
"__pseudo_gather32_i32" : + "__pseudo_gather64_i32"; else if (llvmReturnType == LLVMTypes::Int16VectorType) - funcName = g->target.is32Bit ? "__pseudo_gather32_16" : - "__pseudo_gather64_16"; + funcName = g->target.is32Bit ? "__pseudo_gather32_i16" : + "__pseudo_gather64_i16"; else { AssertPos(currentPos, llvmReturnType == LLVMTypes::Int8VectorType); - funcName = g->target.is32Bit ? "__pseudo_gather32_8" : - "__pseudo_gather64_8"; + funcName = g->target.is32Bit ? "__pseudo_gather32_i8" : + "__pseudo_gather64_i8"; } llvm::Function *gatherFunc = m->module->getFunction(funcName); @@ -2828,26 +2828,26 @@ FunctionEmitContext::scatter(llvm::Value *value, llvm::Value *ptr, llvm::Type *type = value->getType(); const char *funcName = NULL; if (pt != NULL) - funcName = g->target.is32Bit ? "__pseudo_scatter32_32" : - "__pseudo_scatter64_64"; + funcName = g->target.is32Bit ? "__pseudo_scatter32_i32" : + "__pseudo_scatter64_i64"; else if (type == LLVMTypes::DoubleVectorType || type == LLVMTypes::Int64VectorType) { - funcName = g->target.is32Bit ? "__pseudo_scatter32_64" : - "__pseudo_scatter64_64"; + funcName = g->target.is32Bit ? "__pseudo_scatter32_i64" : + "__pseudo_scatter64_i64"; value = BitCastInst(value, LLVMTypes::Int64VectorType, "value2int"); } else if (type == LLVMTypes::FloatVectorType || type == LLVMTypes::Int32VectorType) { - funcName = g->target.is32Bit ? "__pseudo_scatter32_32" : - "__pseudo_scatter64_32"; + funcName = g->target.is32Bit ? "__pseudo_scatter32_i32" : + "__pseudo_scatter64_i32"; value = BitCastInst(value, LLVMTypes::Int32VectorType, "value2int"); } else if (type == LLVMTypes::Int16VectorType) - funcName = g->target.is32Bit ? "__pseudo_scatter32_16" : - "__pseudo_scatter64_16"; + funcName = g->target.is32Bit ? "__pseudo_scatter32_i16" : + "__pseudo_scatter64_i16"; else if (type == LLVMTypes::Int8VectorType) - funcName = g->target.is32Bit ? "__pseudo_scatter32_8" : - "__pseudo_scatter64_8"; + funcName = g->target.is32Bit ? 
"__pseudo_scatter32_i8" : + "__pseudo_scatter64_i8"; llvm::Function *scatterFunc = m->module->getFunction(funcName); AssertPos(currentPos, scatterFunc != NULL); diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index a043fb33..d8e2f078 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1218,8 +1218,8 @@ static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ } -GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) -GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) +GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) +GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32) @@ -1238,8 +1238,8 @@ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ return ret; \ } -GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __gather32_i8) -GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __gather64_i8) +GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __gather32_i8) +GATHER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __gather64_i8) GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i32, __gather32_i16) GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __gather64_i16) GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i32, __gather32_i32) @@ -1263,8 +1263,8 @@ static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ } -SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8) -SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8) +SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8) +SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8) SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16) SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16) SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32) @@ -1282,8 +1282,8 @@ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ } \ } -SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8) -SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __scatter64_i8) +SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8) +SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __scatter64_i8) SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i32, __scatter32_i16) SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __scatter64_i16) SCATTER_GENERAL(__vec16_i32, int32_t, __vec16_i32, __scatter32_i32) diff --git a/opt.cpp b/opt.cpp index df7e082e..1b83a63d 100644 --- a/opt.cpp +++ b/opt.cpp @@ -1689,38 +1689,41 @@ DetectGSBaseOffsetsPass::runOnBasicBlock(llvm::BasicBlock &bb) { DEBUG_START_PASS("DetectGSBaseOffsets"); GSInfo gsFuncs[] = { - GSInfo("__pseudo_gather32_8", "__pseudo_gather_base_offsets32_8", - "__pseudo_gather_base_offsets32_8", true), - GSInfo("__pseudo_gather32_16", "__pseudo_gather_base_offsets32_16", - "__pseudo_gather_base_offsets32_16", true), - GSInfo("__pseudo_gather32_32", "__pseudo_gather_base_offsets32_32", - "__pseudo_gather_base_offsets32_32", true), - GSInfo("__pseudo_gather32_64", "__pseudo_gather_base_offsets32_64", - "__pseudo_gather_base_offsets32_64", true), - 
GSInfo("__pseudo_scatter32_8", "__pseudo_scatter_base_offsets32_8", - "__pseudo_scatter_base_offsets32_8", false), - GSInfo("__pseudo_scatter32_16", "__pseudo_scatter_base_offsets32_16", - "__pseudo_scatter_base_offsets32_16", false), - GSInfo("__pseudo_scatter32_32", "__pseudo_scatter_base_offsets32_32", - "__pseudo_scatter_base_offsets32_32", false), - GSInfo("__pseudo_scatter32_64", "__pseudo_scatter_base_offsets32_64", - "__pseudo_scatter_base_offsets32_64", false), - GSInfo("__pseudo_gather64_8", "__pseudo_gather_base_offsets64_8", - "__pseudo_gather_base_offsets32_8", true), - GSInfo("__pseudo_gather64_16", "__pseudo_gather_base_offsets64_16", - "__pseudo_gather_base_offsets32_16", true), - GSInfo("__pseudo_gather64_32", "__pseudo_gather_base_offsets64_32", - "__pseudo_gather_base_offsets32_32", true), - GSInfo("__pseudo_gather64_64", "__pseudo_gather_base_offsets64_64", - "__pseudo_gather_base_offsets32_64", true), - GSInfo("__pseudo_scatter64_8", "__pseudo_scatter_base_offsets64_8", - "__pseudo_scatter_base_offsets32_8", false), - GSInfo("__pseudo_scatter64_16", "__pseudo_scatter_base_offsets64_16", - "__pseudo_scatter_base_offsets32_16", false), - GSInfo("__pseudo_scatter64_32", "__pseudo_scatter_base_offsets64_32", - "__pseudo_scatter_base_offsets32_32", false), - GSInfo("__pseudo_scatter64_64", "__pseudo_scatter_base_offsets64_64", - "__pseudo_scatter_base_offsets32_64", false), + GSInfo("__pseudo_gather32_i8", "__pseudo_gather_base_offsets32_i8", + "__pseudo_gather_base_offsets32_i8", true), + GSInfo("__pseudo_gather32_i16", "__pseudo_gather_base_offsets32_i16", + "__pseudo_gather_base_offsets32_i16", true), + GSInfo("__pseudo_gather32_i32", "__pseudo_gather_base_offsets32_i32", + "__pseudo_gather_base_offsets32_i32", true), + GSInfo("__pseudo_gather32_i64", "__pseudo_gather_base_offsets32_i64", + "__pseudo_gather_base_offsets32_i64", true), + + GSInfo("__pseudo_scatter32_i8", "__pseudo_scatter_base_offsets32_i8", + "__pseudo_scatter_base_offsets32_i8", false), + GSInfo("__pseudo_scatter32_i16", "__pseudo_scatter_base_offsets32_i16", + "__pseudo_scatter_base_offsets32_i16", false), + GSInfo("__pseudo_scatter32_i32", "__pseudo_scatter_base_offsets32_i32", + "__pseudo_scatter_base_offsets32_i32", false), + GSInfo("__pseudo_scatter32_i64", "__pseudo_scatter_base_offsets32_i64", + "__pseudo_scatter_base_offsets32_i64", false), + + GSInfo("__pseudo_gather64_i8", "__pseudo_gather_base_offsets64_i8", + "__pseudo_gather_base_offsets32_i8", true), + GSInfo("__pseudo_gather64_i16", "__pseudo_gather_base_offsets64_i16", + "__pseudo_gather_base_offsets32_i16", true), + GSInfo("__pseudo_gather64_i32", "__pseudo_gather_base_offsets64_i32", + "__pseudo_gather_base_offsets32_i32", true), + GSInfo("__pseudo_gather64_i64", "__pseudo_gather_base_offsets64_i64", + "__pseudo_gather_base_offsets32_i64", true), + + GSInfo("__pseudo_scatter64_i8", "__pseudo_scatter_base_offsets64_i8", + "__pseudo_scatter_base_offsets32_i8", false), + GSInfo("__pseudo_scatter64_i16", "__pseudo_scatter_base_offsets64_i16", + "__pseudo_scatter_base_offsets32_i16", false), + GSInfo("__pseudo_scatter64_i32", "__pseudo_scatter_base_offsets64_i32", + "__pseudo_scatter_base_offsets32_i32", false), + GSInfo("__pseudo_scatter64_i64", "__pseudo_scatter_base_offsets64_i64", + "__pseudo_scatter_base_offsets32_i64", false), }; int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]); for (int i = 0; i < numGSFuncs; ++i) @@ -2281,40 +2284,40 @@ GSToLoadStorePass::runOnBasicBlock(llvm::BasicBlock &bb) { 
DEBUG_START_PASS("GSToLoadStorePass"); GatherImpInfo gInfo[] = { - GatherImpInfo("__pseudo_gather_base_offsets32_8", "__load_and_broadcast_i8", + GatherImpInfo("__pseudo_gather_base_offsets32_i8", "__load_and_broadcast_i8", "__masked_load_i8", 1), - GatherImpInfo("__pseudo_gather_base_offsets32_16", "__load_and_broadcast_i16", + GatherImpInfo("__pseudo_gather_base_offsets32_i16", "__load_and_broadcast_i16", "__masked_load_i16", 2), - GatherImpInfo("__pseudo_gather_base_offsets32_32", "__load_and_broadcast_i32", + GatherImpInfo("__pseudo_gather_base_offsets32_i32", "__load_and_broadcast_i32", "__masked_load_i32", 4), - GatherImpInfo("__pseudo_gather_base_offsets32_64", "__load_and_broadcast_i64", + GatherImpInfo("__pseudo_gather_base_offsets32_i64", "__load_and_broadcast_i64", "__masked_load_i64", 8), - GatherImpInfo("__pseudo_gather_base_offsets64_8", "__load_and_broadcast_i8", + GatherImpInfo("__pseudo_gather_base_offsets64_i8", "__load_and_broadcast_i8", "__masked_load_i8", 1), - GatherImpInfo("__pseudo_gather_base_offsets64_16", "__load_and_broadcast_i16", + GatherImpInfo("__pseudo_gather_base_offsets64_i16", "__load_and_broadcast_i16", "__masked_load_i16", 2), - GatherImpInfo("__pseudo_gather_base_offsets64_32", "__load_and_broadcast_i32", + GatherImpInfo("__pseudo_gather_base_offsets64_i32", "__load_and_broadcast_i32", "__masked_load_i32", 4), - GatherImpInfo("__pseudo_gather_base_offsets64_64", "__load_and_broadcast_i64", - "__masked_load_i64", 8) + GatherImpInfo("__pseudo_gather_base_offsets64_i64", "__load_and_broadcast_i64", + "__masked_load_i64", 8), }; ScatterImpInfo sInfo[] = { - ScatterImpInfo("__pseudo_scatter_base_offsets32_8", "__pseudo_masked_store_i8", + ScatterImpInfo("__pseudo_scatter_base_offsets32_i8", "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1), - ScatterImpInfo("__pseudo_scatter_base_offsets32_16", "__pseudo_masked_store_i16", + ScatterImpInfo("__pseudo_scatter_base_offsets32_i16", "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2), - ScatterImpInfo("__pseudo_scatter_base_offsets32_32", "__pseudo_masked_store_i32", + ScatterImpInfo("__pseudo_scatter_base_offsets32_i32", "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_base_offsets32_64", "__pseudo_masked_store_i64", + ScatterImpInfo("__pseudo_scatter_base_offsets32_i64", "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8), - ScatterImpInfo("__pseudo_scatter_base_offsets64_8", "__pseudo_masked_store_i8", + ScatterImpInfo("__pseudo_scatter_base_offsets64_i8", "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1), - ScatterImpInfo("__pseudo_scatter_base_offsets64_16", "__pseudo_masked_store_i16", + ScatterImpInfo("__pseudo_scatter_base_offsets64_i16", "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2), - ScatterImpInfo("__pseudo_scatter_base_offsets64_32", "__pseudo_masked_store_i32", + ScatterImpInfo("__pseudo_scatter_base_offsets64_i32", "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_base_offsets64_64", "__pseudo_masked_store_i64", - LLVMTypes::Int64VectorPointerType, 8) + ScatterImpInfo("__pseudo_scatter_base_offsets64_i64", "__pseudo_masked_store_i64", + LLVMTypes::Int64VectorPointerType, 8), }; bool modifiedAny = false; @@ -3387,8 +3390,8 @@ GatherCoalescePass::runOnBasicBlock(llvm::BasicBlock &bb) { DEBUG_START_PASS("GatherCoalescePass"); llvm::Function *gatherFuncs[] = { - 
m->module->getFunction("__pseudo_gather_base_offsets32_32"), - m->module->getFunction("__pseudo_gather_base_offsets64_32"), + m->module->getFunction("__pseudo_gather_base_offsets32_i32"), + m->module->getFunction("__pseudo_gather_base_offsets64_i32"), }; int nGatherFuncs = sizeof(gatherFuncs) / sizeof(gatherFuncs[0]); @@ -3570,45 +3573,45 @@ PseudoGSToGSPass::runOnBasicBlock(llvm::BasicBlock &bb) { DEBUG_START_PASS("PseudoGSToGSPass"); LowerGSInfo lgsInfo[] = { - LowerGSInfo("__pseudo_gather_base_offsets32_8", "__gather_base_offsets32_i8", true), - LowerGSInfo("__pseudo_gather_base_offsets32_16", "__gather_base_offsets32_i16", true), - LowerGSInfo("__pseudo_gather_base_offsets32_32", "__gather_base_offsets32_i32", true), - LowerGSInfo("__pseudo_gather_base_offsets32_64", "__gather_base_offsets32_i64", true), + LowerGSInfo("__pseudo_gather_base_offsets32_i8", "__gather_base_offsets32_i8", true), + LowerGSInfo("__pseudo_gather_base_offsets32_i16", "__gather_base_offsets32_i16", true), + LowerGSInfo("__pseudo_gather_base_offsets32_i32", "__gather_base_offsets32_i32", true), + LowerGSInfo("__pseudo_gather_base_offsets32_i64", "__gather_base_offsets32_i64", true), - LowerGSInfo("__pseudo_gather_base_offsets64_8", "__gather_base_offsets64_i8", true), - LowerGSInfo("__pseudo_gather_base_offsets64_16", "__gather_base_offsets64_i16", true), - LowerGSInfo("__pseudo_gather_base_offsets64_32", "__gather_base_offsets64_i32", true), - LowerGSInfo("__pseudo_gather_base_offsets64_64", "__gather_base_offsets64_i64", true), + LowerGSInfo("__pseudo_gather_base_offsets64_i8", "__gather_base_offsets64_i8", true), + LowerGSInfo("__pseudo_gather_base_offsets64_i16", "__gather_base_offsets64_i16", true), + LowerGSInfo("__pseudo_gather_base_offsets64_i32", "__gather_base_offsets64_i32", true), + LowerGSInfo("__pseudo_gather_base_offsets64_i64", "__gather_base_offsets64_i64", true), - LowerGSInfo("__pseudo_gather32_8", "__gather32_i8", true), - LowerGSInfo("__pseudo_gather32_16", "__gather32_i16", true), - LowerGSInfo("__pseudo_gather32_32", "__gather32_i32", true), - LowerGSInfo("__pseudo_gather32_64", "__gather32_i64", true), + LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true), + LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true), + LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true), + LowerGSInfo("__pseudo_gather32_i64", "__gather32_i64", true), - LowerGSInfo("__pseudo_gather64_8", "__gather64_i8", true), - LowerGSInfo("__pseudo_gather64_16", "__gather64_i16", true), - LowerGSInfo("__pseudo_gather64_32", "__gather64_i32", true), - LowerGSInfo("__pseudo_gather64_64", "__gather64_i64", true), + LowerGSInfo("__pseudo_gather64_i8", "__gather64_i8", true), + LowerGSInfo("__pseudo_gather64_i16", "__gather64_i16", true), + LowerGSInfo("__pseudo_gather64_i32", "__gather64_i32", true), + LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true), - LowerGSInfo("__pseudo_scatter_base_offsets32_8", "__scatter_base_offsets32_i8", false), - LowerGSInfo("__pseudo_scatter_base_offsets32_16", "__scatter_base_offsets32_i16", false), - LowerGSInfo("__pseudo_scatter_base_offsets32_32", "__scatter_base_offsets32_i32", false), - LowerGSInfo("__pseudo_scatter_base_offsets32_64", "__scatter_base_offsets32_i64", false), + LowerGSInfo("__pseudo_scatter_base_offsets32_i8", "__scatter_base_offsets32_i8", false), + LowerGSInfo("__pseudo_scatter_base_offsets32_i16", "__scatter_base_offsets32_i16", false), + LowerGSInfo("__pseudo_scatter_base_offsets32_i32", "__scatter_base_offsets32_i32", false), + 
LowerGSInfo("__pseudo_scatter_base_offsets32_i64", "__scatter_base_offsets32_i64", false), - LowerGSInfo("__pseudo_scatter_base_offsets64_8", "__scatter_base_offsets64_i8", false), - LowerGSInfo("__pseudo_scatter_base_offsets64_16", "__scatter_base_offsets64_i16", false), - LowerGSInfo("__pseudo_scatter_base_offsets64_32", "__scatter_base_offsets64_i32", false), - LowerGSInfo("__pseudo_scatter_base_offsets64_64", "__scatter_base_offsets64_i64", false), + LowerGSInfo("__pseudo_scatter_base_offsets64_i8", "__scatter_base_offsets64_i8", false), + LowerGSInfo("__pseudo_scatter_base_offsets64_i16", "__scatter_base_offsets64_i16", false), + LowerGSInfo("__pseudo_scatter_base_offsets64_i32", "__scatter_base_offsets64_i32", false), + LowerGSInfo("__pseudo_scatter_base_offsets64_i64", "__scatter_base_offsets64_i64", false), - LowerGSInfo("__pseudo_scatter32_8", "__scatter32_i8", false), - LowerGSInfo("__pseudo_scatter32_16", "__scatter32_i16", false), - LowerGSInfo("__pseudo_scatter32_32", "__scatter32_i32", false), - LowerGSInfo("__pseudo_scatter32_64", "__scatter32_i64", false), + LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false), + LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false), + LowerGSInfo("__pseudo_scatter32_i32", "__scatter32_i32", false), + LowerGSInfo("__pseudo_scatter32_i64", "__scatter32_i64", false), - LowerGSInfo("__pseudo_scatter64_8", "__scatter64_i8", false), - LowerGSInfo("__pseudo_scatter64_16", "__scatter64_i16", false), - LowerGSInfo("__pseudo_scatter64_32", "__scatter64_i32", false), - LowerGSInfo("__pseudo_scatter64_64", "__scatter64_i64", false), + LowerGSInfo("__pseudo_scatter64_i8", "__scatter64_i8", false), + LowerGSInfo("__pseudo_scatter64_i16", "__scatter64_i16", false), + LowerGSInfo("__pseudo_scatter64_i32", "__scatter64_i32", false), + LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false), }; bool modifiedAny = false;
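[Editor's illustration, not code from opt.cpp: the GSInfo/LowerGSInfo tables above pair every renamed __pseudo_gather*/__pseudo_scatter* symbol with its *_base_offsets* or lowered counterpart, and the regular i8/i16/i32/i64 suffixes introduced by this patch make that pairing purely textual. A minimal C++ sketch of such a mapping, under the assumption that only plain pseudo gather/scatter names are passed in:]

#include <cstring>
#include <iostream>
#include <string>

// Sketch only (hypothetical helper, not the GSInfo/LowerGSInfo machinery):
// derive the "_base_offsets" variant of a renamed pseudo gather or scatter
// by splicing the marker in before the width/type suffix.
static std::string baseOffsetsName(const std::string &pseudoName) {
    const char *kinds[] = { "__pseudo_gather", "__pseudo_scatter" };
    for (const char *kind : kinds) {
        size_t len = std::strlen(kind);
        if (pseudoName.compare(0, len, kind) == 0)
            return std::string(kind) + "_base_offsets" + pseudoName.substr(len);
    }
    return pseudoName;  // not a pseudo gather/scatter; leave unchanged
}

int main() {
    // Matches the pairs registered in DetectGSBaseOffsetsPass above.
    std::cout << baseOffsetsName("__pseudo_gather32_i16") << "\n";   // __pseudo_gather_base_offsets32_i16
    std::cout << baseOffsetsName("__pseudo_scatter64_i8") << "\n";   // __pseudo_scatter_base_offsets64_i8
    return 0;
}

[The consistent "i"-prefixed element suffix also aligns the pseudo names with the __gather32_i8-style builtins defined in util.m4, so the PseudoGSToGSPass lowering table becomes a one-to-one rename rather than a mixed "_8"/"_i8" translation.]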