diff --git a/builtins/util.m4 b/builtins/util.m4 index 4a8822bb..26466b1f 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -1579,7 +1579,7 @@ declare void @__pseudo_masked_store_double( * nocapture, * nocapture, @__pseudo_gather32_i8(, ) nounwind readonly declare @__pseudo_gather32_i16(, ) nounwind readonly @@ -1621,31 +1606,106 @@ declare @__pseudo_gather64_float(, ) declare @__pseudo_gather64_i64(, ) nounwind readonly declare @__pseudo_gather64_double(, ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets32_i8(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets32_i16(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets32_i32(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets32_float(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets32_i64(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets32_double(i8 *, , i32, , - ) nounwind readonly +; The ImproveMemoryOps optimization pass finds these calls and then +; tries to convert them to be calls to gather functions that take a uniform +; base pointer and then a varying integer offset, when possible. +; +; For targets without a native gather instruction, it is best to factor the +; integer offsets like "{1/2/4/8} * varying_offset + constant_offset", +; where varying_offset includes non-compile-time constant values, and +; constant_offset includes compile-time constant values. (The scalar loads +; generated in turn can then take advantage of the free offsetting and scale by +; 1/2/4/8 that is offered by the x86 addressing modes.) +; +; varying int{8,16,32,float,64,double} +; __pseudo_gather_factored_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base, +; int{32,64} offsets, uniform int32 offset_scale, +; int{32,64} offset_delta, mask) +; +; For targets with a gather instruction, it is better to just factor them into +; a gather from a uniform base pointer and then "{1/2/4/8} * offsets", where the +; offsets are int32/64 vectors.
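+;
+; As a purely illustrative example of the two factorings (the names here are
+; hypothetical, not from the surrounding code): for a gather of "a[index + 3]"
+; from a uniform int32 array, the factored form would use offset_scale = 4,
+; varying_offset = index, and offset_delta = 12 (that is, 3 * sizeof(int32)),
+; while the native-gather form below instead folds both pieces into
+; offset_scale = 4 with offsets = index + 3.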
+; +; varying int{8,16,32,float,64,double} +; __pseudo_gather_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base, +; uniform int32 offset_scale, int{32,64} offsets, mask) -declare @__pseudo_gather_factored_base_offsets64_i8(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets64_i16(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets64_i32(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets64_float(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets64_i64(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets64_double(i8 *, , i32, , - ) nounwind readonly + +declare +@__pseudo_gather_factored_base_offsets32_i8(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_i16(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_i32(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_float(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_i64(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_double(i8 *, , i32, , + ) nounwind readonly + +declare +@__pseudo_gather_factored_base_offsets64_i8(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_i16(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_i32(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_float(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_i64(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_double(i8 *, , i32, , + ) nounwind readonly + +declare +@__pseudo_gather_base_offsets32_i8(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_i16(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_i32(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_float(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_i64(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_double(i8 *, i32, , + ) nounwind readonly + +declare +@__pseudo_gather_base_offsets64_i8(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_i16(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_i32(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_float(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_i64(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_double(i8 *, i32, , + ) nounwind readonly ; Similarly to the pseudo-gathers defined above, we also declare undefined ; pseudo-scatter instructions with signatures: @@ -1657,16 +1717,6 @@ declare @__pseudo_gather_factored_base_offsets64_double(i8 *, < ; void __pseudo_scatter_i64(varying int64 *, varying int64 values, mask) ; void __pseudo_scatter_double(varying double *, varying double values, mask) ; -; The GatherScatterFlattenOpt optimization pass also finds these and -; transforms them to scatters like: -; -; void __pseudo_scatter_factored_base_offsets{32,64}_i8(uniform int8 *base, -; varying int32 offsets, uniform int32 offset_scale, -; varying int{32,64} offset_delta, varying int8 values, mask) -; (and similarly for 16/32/64 bit values) -; -; 
And the GSImprovementsPass in turn converts these to actual native -; scatters or masked stores. declare void @__pseudo_scatter32_i8(, , ) nounwind declare void @__pseudo_scatter32_i16(, , ) nounwind @@ -1682,31 +1732,96 @@ declare void @__pseudo_scatter64_float(, , , , ) nounwind declare void @__pseudo_scatter64_double(, , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets32_i8(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets32_i16(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets32_i32(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets32_float(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets32_i64(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets32_double(i8 * nocapture, , i32, , - , ) nounwind +; And the ImproveMemoryOps optimization pass also finds these and +; either transforms them to scatters like: +; +; void __pseudo_scatter_factored_base_offsets{32,64}_i8(uniform int8 *base, +; varying int32 offsets, uniform int32 offset_scale, +; varying int{32,64} offset_delta, varying int8 values, mask) +; (and similarly for 16/32/64 bit values) +; +; Or, if the target has a native scatter instruction: +; +; void __pseudo_scatter_base_offsets{32,64}_i8(uniform int8 *base, +; uniform int32 offset_scale, varying int{32,64} offsets, +; varying int8 values, mask) +; (and similarly for 16/32/64 bit values) -declare void @__pseudo_scatter_factored_base_offsets64_i8(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets64_i16(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets64_i32(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets64_float(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets64_i64(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets64_double(i8 * nocapture, , i32, , - , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_i8(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_i16(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_i32(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_float(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_i64(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_double(i8 * nocapture, , i32, , + , ) nounwind + +declare void +@__pseudo_scatter_factored_base_offsets64_i8(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_i16(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_i32(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_float(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_i64(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_double(i8 * nocapture, , i32, , + , ) nounwind + +declare void +@__pseudo_scatter_base_offsets32_i8(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_i16(i8 * nocapture, i32, , + , ) 
nounwind +declare void +@__pseudo_scatter_base_offsets32_i32(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_float(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_i64(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_double(i8 * nocapture, i32, , + , ) nounwind + +declare void +@__pseudo_scatter_base_offsets64_i8(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_i16(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_i32(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_float(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_i64(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_double(i8 * nocapture, i32, , + , ) nounwind declare float @__log_uniform_float(float) nounwind readnone declare @__log_varying_float() nounwind readnone @@ -1871,6 +1986,109 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, %mask) call void @__usedouble( %g64_d) +ifelse(HAVE_GATHER, `1', +` + %nfpgbo32_8 = call + @__pseudo_gather_base_offsets32_i8(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use8( %nfpgbo32_8) + %nfpgbo32_16 = call + @__pseudo_gather_base_offsets32_i16(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use16( %nfpgbo32_16) + %nfpgbo32_32 = call + @__pseudo_gather_base_offsets32_i32(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use32( %nfpgbo32_32) + %nfpgbo32_f = call + @__pseudo_gather_base_offsets32_float(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usefloat( %nfpgbo32_f) + %nfpgbo32_64 = call + @__pseudo_gather_base_offsets32_i64(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use64( %nfpgbo32_64) + %nfpgbo32_d = call + @__pseudo_gather_base_offsets32_double(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usedouble( %nfpgbo32_d) + + %nfpgbo64_8 = call + @__pseudo_gather_base_offsets64_i8(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use8( %nfpgbo64_8) + %nfpgbo64_16 = call + @__pseudo_gather_base_offsets64_i16(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use16( %nfpgbo64_16) + %nfpgbo64_32 = call + @__pseudo_gather_base_offsets64_i32(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use32( %nfpgbo64_32) + %nfpgbo64_f = call + @__pseudo_gather_base_offsets64_float(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usefloat( %nfpgbo64_f) + %nfpgbo64_64 = call + @__pseudo_gather_base_offsets64_i64(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use64( %nfpgbo64_64) + %nfpgbo64_d = call + @__pseudo_gather_base_offsets64_double(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usedouble( %nfpgbo64_d) + + %nfgbo32_8 = call + @__gather_base_offsets32_i8(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use8( %nfgbo32_8) + %nfgbo32_16 = call + @__gather_base_offsets32_i16(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use16( %nfgbo32_16) + %nfgbo32_32 = call + @__gather_base_offsets32_i32(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use32( %nfgbo32_32) + %nfgbo32_f = call + @__gather_base_offsets32_float(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usefloat( %nfgbo32_f) + %nfgbo32_64 = call + @__gather_base_offsets32_i64(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use64( %nfgbo32_64) + %nfgbo32_d = call + @__gather_base_offsets32_double(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usedouble( %nfgbo32_d) + + %nfgbo64_8 = call + @__gather_base_offsets64_i8(i8 * %ptr, i32 0, + %v64, %mask) + call void 
@__use8( %nfgbo64_8) + %nfgbo64_16 = call + @__gather_base_offsets64_i16(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use16( %nfgbo64_16) + %nfgbo64_32 = call + @__gather_base_offsets64_i32(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use32( %nfgbo64_32) + %nfgbo64_f = call + @__gather_base_offsets64_float(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usefloat( %nfgbo64_f) + %nfgbo64_64 = call + @__gather_base_offsets64_i64(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use64( %nfgbo64_64) + %nfgbo64_d = call + @__gather_base_offsets64_double(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usedouble( %nfgbo64_d) +', +` %pgbo32_8 = call @__pseudo_gather_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, %mask) @@ -1896,32 +2114,6 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, %v32, %mask) call void @__usedouble( %pgbo32_d) - %gbo32_8 = call - @__gather_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, - %v32, %mask) - call void @__use8( %gbo32_8) - %gbo32_16 = call - @__gather_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, - %v32, %mask) - call void @__use16( %gbo32_16) - %gbo32_32 = call - @__gather_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, - %v32, %mask) - call void @__use32( %gbo32_32) - %gbo32_f = call - @__gather_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, - %v32, %mask) - call void @__usefloat( %gbo32_f) - %gbo32_64 = call - @__gather_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, - %v32, %mask) - call void @__use64( %gbo32_64) - %gbo32_d = call - @__gather_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, - %v32, %mask) - call void @__usedouble( %gbo32_d) - - %pgbo64_8 = call @__pseudo_gather_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, %mask) @@ -1947,6 +2139,31 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, %v64, %mask) call void @__usedouble( %pgbo64_d) + %gbo32_8 = call + @__gather_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use8( %gbo32_8) + %gbo32_16 = call + @__gather_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use16( %gbo32_16) + %gbo32_32 = call + @__gather_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use32( %gbo32_32) + %gbo32_f = call + @__gather_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__usefloat( %gbo32_f) + %gbo32_64 = call + @__gather_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use64( %gbo32_64) + %gbo32_d = call + @__gather_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__usedouble( %gbo32_d) + %gbo64_8 = call @__gather_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, %mask) @@ -1970,7 +2187,8 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, %gbo64_d = call @__gather_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, %mask) - call void @__usedouble( %gbo64_d) + call void @__usedouble( %gbo64_d) +') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; scatters @@ -2003,6 +2221,61 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, call void @__scatter64_i64( %v64, %v64, %mask) call void @__scatter64_double( %v64, %vd, %mask) +ifelse(HAVE_SCATTER, `1', +` + call void @__pseudo_scatter_base_offsets32_i8(i8 * %ptr, i32 0, %v32, + %v8, %mask) + call void @__pseudo_scatter_base_offsets32_i16(i8 * %ptr, i32 0, %v32, + %v16, %mask) + call void @__pseudo_scatter_base_offsets32_i32(i8 * %ptr, i32 0, %v32, + %v32, %mask)
+ call void @__pseudo_scatter_base_offsets32_float(i8 * %ptr, i32 0, %v32, + %vf, %mask) + call void @__pseudo_scatter_base_offsets32_i64(i8 * %ptr, i32 0, %v32, + %v64, %mask) + call void @__pseudo_scatter_base_offsets32_double(i8 * %ptr, i32 0, %v32, + %vd, %mask) + + call void @__pseudo_scatter_base_offsets64_i8(i8 * %ptr, i32 0, %v64, + %v8, %mask) + call void @__pseudo_scatter_base_offsets64_i16(i8 * %ptr, i32 0, %v64, + %v16, %mask) + call void @__pseudo_scatter_base_offsets64_i32(i8 * %ptr, i32 0, %v64, + %v32, %mask) + call void @__pseudo_scatter_base_offsets64_float(i8 * %ptr, i32 0, %v64, + %vf, %mask) + call void @__pseudo_scatter_base_offsets64_i64(i8 * %ptr, i32 0, %v64, + %v64, %mask) + call void @__pseudo_scatter_base_offsets64_double(i8 * %ptr, i32 0, %v64, + %vd, %mask) + + call void @__scatter_base_offsets32_i8(i8 * %ptr, i32 0, %v32, + %v8, %mask) + call void @__scatter_base_offsets32_i16(i8 * %ptr, i32 0, %v32, + %v16, %mask) + call void @__scatter_base_offsets32_i32(i8 * %ptr, i32 0, %v32, + %v32, %mask) + call void @__scatter_base_offsets32_float(i8 * %ptr, i32 0, %v32, + %vf, %mask) + call void @__scatter_base_offsets32_i64(i8 * %ptr, i32 0, %v32, + %v64, %mask) + call void @__scatter_base_offsets32_double(i8 * %ptr, i32 0, %v32, + %vd, %mask) + + call void @__scatter_base_offsets64_i8(i8 * %ptr, i32 0, %v64, + %v8, %mask) + call void @__scatter_base_offsets64_i16(i8 * %ptr, i32 0, %v64, + %v16, %mask) + call void @__scatter_base_offsets64_i32(i8 * %ptr, i32 0, %v64, + %v32, %mask) + call void @__scatter_base_offsets64_float(i8 * %ptr, i32 0, %v64, + %vf, %mask) + call void @__scatter_base_offsets64_i64(i8 * %ptr, i32 0, %v64, + %v64, %mask) + call void @__scatter_base_offsets64_double(i8 * %ptr, i32 0, %v64, + %vd, %mask) +', +` call void @__pseudo_scatter_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, %v8, %mask) call void @__pseudo_scatter_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, @@ -2054,6 +2327,7 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, %v64, %mask) call void @__scatter_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, %vd, %mask) +') ret void } diff --git a/ispc.cpp b/ispc.cpp index 2b98de86..0980c3d2 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -212,6 +212,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, // This is the case for most of them t->hasHalf = t->hasRand = t->hasTranscendentals = false; + t->hasGather = t->hasScatter = false; if (!strcasecmp(isa, "sse2")) { t->isa = Target::SSE2; diff --git a/ispc.h b/ispc.h index 9632f514..66191844 100644 --- a/ispc.h +++ b/ispc.h @@ -252,9 +252,15 @@ struct Target { conversions. */ bool hasHalf; - /** Indicates whether there is an ISA random number instruciton. */ + /** Indicates whether there is an ISA random number instruction. */ bool hasRand; + /** Indicates whether the target has a native gather instruction */ + bool hasGather; + + /** Indicates whether the target has a native scatter instruction */ + bool hasScatter; + /** Indicates whether the target has support for transcendentals (beyond sqrt, which we assume that all of them handle). 
*/ bool hasTranscendentals; diff --git a/opt.cpp b/opt.cpp index 1140c9ce..824c5bc7 100644 --- a/opt.cpp +++ b/opt.cpp @@ -225,7 +225,6 @@ lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, } -#if 0 static llvm::Instruction * lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, llvm::Value *arg2, llvm::Value *arg3, const char *name, @@ -234,7 +233,6 @@ lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, llvm::ArrayRef newArgArray(&args[0], &args[4]); return llvm::CallInst::Create(func, newArgArray, name, insertBefore); } -#endif static llvm::Instruction * lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, @@ -1673,6 +1671,39 @@ lOffsets32BitSafe(llvm::Value **variableOffsetPtr, } +/** Check to see if the single offset vector can safely be represented with + 32-bit values. If so, return true and update the pointed-to + llvm::Value * to be the 32-bit equivalent. */ +static bool +lOffsets32BitSafe(llvm::Value **offsetPtr, + llvm::Instruction *insertBefore) { + llvm::Value *offset = *offsetPtr; + + if (offset->getType() == LLVMTypes::Int32VectorType) + return true; + + llvm::SExtInst *sext = llvm::dyn_cast(offset); + if (sext != NULL && + sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType) { + // sext of a 32-bit vector -> the 32-bit vector is good + *offsetPtr = sext->getOperand(0); + return true; + } + else if (lVectorIs32BitInts(offset)) { + // The only constant vector we should have here is a vector of + // all zeros (i.e. a ConstantAggregateZero), but just in case, + // do the more general check with lVectorIs32BitInts(). + *offsetPtr = + new llvm::TruncInst(offset, LLVMTypes::Int32VectorType, + LLVMGetName(offset, "_trunc"), + insertBefore); + return true; + } + else + return false; +} + + static bool lGSToGSBaseOffsets(llvm::CallInst *callInst) { struct GSInfo { @@ -1689,57 +1720,153 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { }; GSInfo gsFuncs[] = { - GSInfo("__pseudo_gather32_i8", "__pseudo_gather_factored_base_offsets32_i8", - "__pseudo_gather_factored_base_offsets32_i8", true), - GSInfo("__pseudo_gather32_i16", "__pseudo_gather_factored_base_offsets32_i16", - "__pseudo_gather_factored_base_offsets32_i16", true), - GSInfo("__pseudo_gather32_i32", "__pseudo_gather_factored_base_offsets32_i32", - "__pseudo_gather_factored_base_offsets32_i32", true), - GSInfo("__pseudo_gather32_float", "__pseudo_gather_factored_base_offsets32_float", - "__pseudo_gather_factored_base_offsets32_float", true), - GSInfo("__pseudo_gather32_i64", "__pseudo_gather_factored_base_offsets32_i64", - "__pseudo_gather_factored_base_offsets32_i64", true), - GSInfo("__pseudo_gather32_double", "__pseudo_gather_factored_base_offsets32_double", - "__pseudo_gather_factored_base_offsets32_double", true), + GSInfo("__pseudo_gather32_i8", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i8" : + "__pseudo_gather_factored_base_offsets32_i8", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i8" : + "__pseudo_gather_factored_base_offsets32_i8", + true), + GSInfo("__pseudo_gather32_i16", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i16" : + "__pseudo_gather_factored_base_offsets32_i16", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i16" : + "__pseudo_gather_factored_base_offsets32_i16", + true), + GSInfo("__pseudo_gather32_i32", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i32" : + "__pseudo_gather_factored_base_offsets32_i32", + g->target.hasGather ?
"__pseudo_gather_base_offsets32_i32" : + "__pseudo_gather_factored_base_offsets32_i32", + true), + GSInfo("__pseudo_gather32_float", + g->target.hasGather ? "__pseudo_gather_base_offsets32_float" : + "__pseudo_gather_factored_base_offsets32_float", + g->target.hasGather ? "__pseudo_gather_base_offsets32_float" : + "__pseudo_gather_factored_base_offsets32_float", + true), + GSInfo("__pseudo_gather32_i64", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i64" : + "__pseudo_gather_factored_base_offsets32_i64", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i64" : + "__pseudo_gather_factored_base_offsets32_i64", + true), + GSInfo("__pseudo_gather32_double", + g->target.hasGather ? "__pseudo_gather_base_offsets32_double" : + "__pseudo_gather_factored_base_offsets32_double", + g->target.hasGather ? "__pseudo_gather_base_offsets32_double" : + "__pseudo_gather_factored_base_offsets32_double", + true), - GSInfo("__pseudo_scatter32_i8", "__pseudo_scatter_factored_base_offsets32_i8", - "__pseudo_scatter_factored_base_offsets32_i8", false), - GSInfo("__pseudo_scatter32_i16", "__pseudo_scatter_factored_base_offsets32_i16", - "__pseudo_scatter_factored_base_offsets32_i16", false), - GSInfo("__pseudo_scatter32_i32", "__pseudo_scatter_factored_base_offsets32_i32", - "__pseudo_scatter_factored_base_offsets32_i32", false), - GSInfo("__pseudo_scatter32_float", "__pseudo_scatter_factored_base_offsets32_float", - "__pseudo_scatter_factored_base_offsets32_float", false), - GSInfo("__pseudo_scatter32_i64", "__pseudo_scatter_factored_base_offsets32_i64", - "__pseudo_scatter_factored_base_offsets32_i64", false), - GSInfo("__pseudo_scatter32_double", "__pseudo_scatter_factored_base_offsets32_double", - "__pseudo_scatter_factored_base_offsets32_double", false), + GSInfo("__pseudo_scatter32_i8", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i8" : + "__pseudo_scatter_factored_base_offsets32_i8", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i8" : + "__pseudo_scatter_factored_base_offsets32_i8", + false), + GSInfo("__pseudo_scatter32_i16", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i16" : + "__pseudo_scatter_factored_base_offsets32_i16", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i16" : + "__pseudo_scatter_factored_base_offsets32_i16", + false), + GSInfo("__pseudo_scatter32_i32", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i32" : + "__pseudo_scatter_factored_base_offsets32_i32", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i32" : + "__pseudo_scatter_factored_base_offsets32_i32", + false), + GSInfo("__pseudo_scatter32_float", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_float" : + "__pseudo_scatter_factored_base_offsets32_float", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_float" : + "__pseudo_scatter_factored_base_offsets32_float", + false), + GSInfo("__pseudo_scatter32_i64", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i64" : + "__pseudo_scatter_factored_base_offsets32_i64", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i64" : + "__pseudo_scatter_factored_base_offsets32_i64", + false), + GSInfo("__pseudo_scatter32_double", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_double" : + "__pseudo_scatter_factored_base_offsets32_double", + g->target.hasScatter ? 
"__pseudo_scatter_base_offsets32_double" : + "__pseudo_scatter_factored_base_offsets32_double", + false), - GSInfo("__pseudo_gather64_i8", "__pseudo_gather_factored_base_offsets64_i8", - "__pseudo_gather_factored_base_offsets32_i8", true), - GSInfo("__pseudo_gather64_i16", "__pseudo_gather_factored_base_offsets64_i16", - "__pseudo_gather_factored_base_offsets32_i16", true), - GSInfo("__pseudo_gather64_i32", "__pseudo_gather_factored_base_offsets64_i32", - "__pseudo_gather_factored_base_offsets32_i32", true), - GSInfo("__pseudo_gather64_float", "__pseudo_gather_factored_base_offsets64_float", - "__pseudo_gather_factored_base_offsets32_float", true), - GSInfo("__pseudo_gather64_i64", "__pseudo_gather_factored_base_offsets64_i64", - "__pseudo_gather_factored_base_offsets32_i64", true), - GSInfo("__pseudo_gather64_double", "__pseudo_gather_factored_base_offsets64_double", - "__pseudo_gather_factored_base_offsets32_double", true), + GSInfo("__pseudo_gather64_i8", + g->target.hasGather ? "__pseudo_gather_base_offsets64_i8" : + "__pseudo_gather_factored_base_offsets64_i8", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i8" : + "__pseudo_gather_factored_base_offsets32_i8", + true), + GSInfo("__pseudo_gather64_i16", + g->target.hasGather ? "__pseudo_gather_base_offsets64_i16" : + "__pseudo_gather_factored_base_offsets64_i16", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i16" : + "__pseudo_gather_factored_base_offsets32_i16", + true), + GSInfo("__pseudo_gather64_i32", + g->target.hasGather ? "__pseudo_gather_base_offsets64_i32" : + "__pseudo_gather_factored_base_offsets64_i32", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i32" : + "__pseudo_gather_factored_base_offsets32_i32", + true), + GSInfo("__pseudo_gather64_float", + g->target.hasGather ? "__pseudo_gather_base_offsets64_float" : + "__pseudo_gather_factored_base_offsets64_float", + g->target.hasGather ? "__pseudo_gather_base_offsets32_float" : + "__pseudo_gather_factored_base_offsets32_float", + true), + GSInfo("__pseudo_gather64_i64", + g->target.hasGather ? "__pseudo_gather_base_offsets64_i64" : + "__pseudo_gather_factored_base_offsets64_i64", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i64" : + "__pseudo_gather_factored_base_offsets32_i64", + true), + GSInfo("__pseudo_gather64_double", + g->target.hasGather ? "__pseudo_gather_base_offsets64_double" : + "__pseudo_gather_factored_base_offsets64_double", + g->target.hasGather ? "__pseudo_gather_base_offsets32_double" : + "__pseudo_gather_factored_base_offsets32_double", + true), - GSInfo("__pseudo_scatter64_i8", "__pseudo_scatter_factored_base_offsets64_i8", - "__pseudo_scatter_factored_base_offsets32_i8", false), - GSInfo("__pseudo_scatter64_i16", "__pseudo_scatter_factored_base_offsets64_i16", - "__pseudo_scatter_factored_base_offsets32_i16", false), - GSInfo("__pseudo_scatter64_i32", "__pseudo_scatter_factored_base_offsets64_i32", - "__pseudo_scatter_factored_base_offsets32_i32", false), - GSInfo("__pseudo_scatter64_float", "__pseudo_scatter_factored_base_offsets64_float", - "__pseudo_scatter_factored_base_offsets32_float", false), - GSInfo("__pseudo_scatter64_i64", "__pseudo_scatter_factored_base_offsets64_i64", - "__pseudo_scatter_factored_base_offsets32_i64", false), - GSInfo("__pseudo_scatter64_double", "__pseudo_scatter_factored_base_offsets64_double", - "__pseudo_scatter_factored_base_offsets32_double", false), + GSInfo("__pseudo_scatter64_i8", + g->target.hasScatter ? 
"__pseudo_scatter_base_offsets64_i8" : + "__pseudo_scatter_factored_base_offsets64_i8", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i8" : + "__pseudo_scatter_factored_base_offsets32_i8", + false), + GSInfo("__pseudo_scatter64_i16", + g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i16" : + "__pseudo_scatter_factored_base_offsets64_i16", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i16" : + "__pseudo_scatter_factored_base_offsets32_i16", + false), + GSInfo("__pseudo_scatter64_i32", + g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i32" : + "__pseudo_scatter_factored_base_offsets64_i32", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i32" : + "__pseudo_scatter_factored_base_offsets32_i32", + false), + GSInfo("__pseudo_scatter64_float", + g->target.hasScatter ? "__pseudo_scatter_base_offsets64_float" : + "__pseudo_scatter_factored_base_offsets64_float", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_float" : + "__pseudo_scatter_factored_base_offsets32_float", + false), + GSInfo("__pseudo_scatter64_i64", + g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i64" : + "__pseudo_scatter_factored_base_offsets64_i64", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i64" : + "__pseudo_scatter_factored_base_offsets32_i64", + false), + GSInfo("__pseudo_scatter64_double", + g->target.hasScatter ? "__pseudo_scatter_base_offsets64_double" : + "__pseudo_scatter_factored_base_offsets64_double", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_double" : + "__pseudo_scatter_factored_base_offsets32_double", + false), }; int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]); @@ -1771,25 +1898,6 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { // to the next instruction... return false; - // Try to decompose the offset vector into a compile time constant - // component and a varying component. The constant component is - // passed as a separate parameter to the gather/scatter functions, - // which in turn allows their implementations to end up emitting - // x86 instructions with constant offsets encoded in them. - llvm::Value *constOffset, *variableOffset; - lExtractConstantOffset(offsetVector, &constOffset, &variableOffset, - callInst); - if (constOffset == NULL) - constOffset = LLVMIntAsType(0, offsetVector->getType()); - if (variableOffset == NULL) - variableOffset = LLVMIntAsType(0, offsetVector->getType()); - - // See if the varying component is scaled by 2, 4, or 8. If so, - // extract that scale factor and rewrite variableOffset to remove - // it. (This also is pulled out so that we can match the scales by - // 2/4/8 offered by x86 addressing operators.) - llvm::Value *offsetScale = lExtractOffsetVector248Scale(&variableOffset); - // Cast the base pointer to a void *, since that's what the // __pseudo_*_base_offsets_* functions want. basePtr = new llvm::IntToPtrInst(basePtr, LLVMTypes::VoidPointerType, @@ -1798,43 +1906,107 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { llvm::Function *gatherScatterFunc = info->baseOffsetsFunc; - // If we're doing 32-bit addressing on a 64-bit target, here we - // will see if we can call one of the 32-bit variants of the pseudo - // gather/scatter functions. - if (g->opt.force32BitAddressing && - lOffsets32BitSafe(&variableOffset, &constOffset, callInst)) { - gatherScatterFunc = info->baseOffsets32Func; - } + if ((info->isGather == true && g->target.hasGather) || + (info->isGather == false && g->target.hasScatter)) { + // See if the offsets are scaled by 2, 4, or 8. 
If so, + // extract that scale factor and rewrite the offsets to remove + // it. + llvm::Value *offsetScale = lExtractOffsetVector248Scale(&offsetVector); - if (info->isGather) { - llvm::Value *mask = callInst->getArgOperand(1); + // If we're doing 32-bit addressing on a 64-bit target, here we + // will see if we can call one of the 32-bit variants of the pseudo + // gather/scatter functions. + if (g->opt.force32BitAddressing && + lOffsets32BitSafe(&offsetVector, callInst)) { + gatherScatterFunc = info->baseOffsets32Func; + } - // Generate a new function call to the next pseudo gather - // base+offsets instruction. Note that we're passing a NULL - // llvm::Instruction to llvm::CallInst::Create; this means that - // the instruction isn't inserted into a basic block and that - // way we can then call ReplaceInstWithInst(). - llvm::Instruction *newCall = - lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, - constOffset, mask, callInst->getName().str().c_str(), - NULL); - lCopyMetadata(newCall, callInst); - llvm::ReplaceInstWithInst(callInst, newCall); + if (info->isGather) { + llvm::Value *mask = callInst->getArgOperand(1); + + // Generate a new function call to the next pseudo gather + // base+offsets instruction. Note that we're passing a NULL + // llvm::Instruction to llvm::CallInst::Create; this means that + // the instruction isn't inserted into a basic block and that + // way we can then call ReplaceInstWithInst(). + llvm::Instruction *newCall = + lCallInst(gatherScatterFunc, basePtr, offsetScale, offsetVector, + mask, callInst->getName().str().c_str(), + NULL); + lCopyMetadata(newCall, callInst); + llvm::ReplaceInstWithInst(callInst, newCall); + } + else { + llvm::Value *storeValue = callInst->getArgOperand(1); + llvm::Value *mask = callInst->getArgOperand(2); + + // Generate a new function call to the next pseudo scatter + // base+offsets instruction. See above for why passing NULL + // for the Instruction * is intended. + llvm::Instruction *newCall = + lCallInst(gatherScatterFunc, basePtr, offsetScale, + offsetVector, storeValue, mask, "", NULL); + lCopyMetadata(newCall, callInst); + llvm::ReplaceInstWithInst(callInst, newCall); + } } else { - llvm::Value *storeValue = callInst->getArgOperand(1); - llvm::Value *mask = callInst->getArgOperand(2); + // Try to decompose the offset vector into a compile time constant + // component and a varying component. The constant component is + // passed as a separate parameter to the gather/scatter functions, + // which in turn allows their implementations to end up emitting + // x86 instructions with constant offsets encoded in them. + llvm::Value *constOffset, *variableOffset; + lExtractConstantOffset(offsetVector, &constOffset, &variableOffset, + callInst); + if (constOffset == NULL) + constOffset = LLVMIntAsType(0, offsetVector->getType()); + if (variableOffset == NULL) + variableOffset = LLVMIntAsType(0, offsetVector->getType()); - // Generate a new function call to the next pseudo scatter - // base+offsets instruction. See above for why passing NULL - // for the Instruction * is intended. - llvm::Instruction *newCall = - lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, - constOffset, storeValue, mask, "", NULL); - lCopyMetadata(newCall, callInst); - llvm::ReplaceInstWithInst(callInst, newCall); + // See if the varying component is scaled by 2, 4, or 8. If so, + // extract that scale factor and rewrite variableOffset to remove + // it. 
(This also is pulled out so that we can match the scales by + // 2/4/8 offered by x86 addressing operators.) + llvm::Value *offsetScale = lExtractOffsetVector248Scale(&variableOffset); + + // If we're doing 32-bit addressing on a 64-bit target, here we + // will see if we can call one of the 32-bit variants of the pseudo + // gather/scatter functions. + if (g->opt.force32BitAddressing && + lOffsets32BitSafe(&variableOffset, &constOffset, callInst)) { + gatherScatterFunc = info->baseOffsets32Func; + } + + if (info->isGather) { + llvm::Value *mask = callInst->getArgOperand(1); + + // Generate a new function call to the next pseudo gather + // base+offsets instruction. Note that we're passing a NULL + // llvm::Instruction to llvm::CallInst::Create; this means that + // the instruction isn't inserted into a basic block and that + // way we can then call ReplaceInstWithInst(). + llvm::Instruction *newCall = + lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, + constOffset, mask, callInst->getName().str().c_str(), + NULL); + lCopyMetadata(newCall, callInst); + llvm::ReplaceInstWithInst(callInst, newCall); + } + else { + llvm::Value *storeValue = callInst->getArgOperand(1); + llvm::Value *mask = callInst->getArgOperand(2); + + // Generate a new function call to the next pseudo scatter + // base+offsets instruction. See above for why passing NULL + // for the Instruction * is intended. + llvm::Instruction *newCall = + lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, + constOffset, storeValue, mask, "", NULL); + lCopyMetadata(newCall, callInst); + llvm::ReplaceInstWithInst(callInst, newCall); + } } - return true; } @@ -1858,57 +2030,67 @@ lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) { }; GSBOInfo gsFuncs[] = { - GSBOInfo("__pseudo_gather_factored_base_offsets32_i8", - "__pseudo_gather_factored_base_offsets32_i8", true), - GSBOInfo("__pseudo_gather_factored_base_offsets32_i16", - "__pseudo_gather_factored_base_offsets32_i16", true), - GSBOInfo("__pseudo_gather_factored_base_offsets32_i32", - "__pseudo_gather_factored_base_offsets32_i32", true), - GSBOInfo("__pseudo_gather_factored_base_offsets32_float", - "__pseudo_gather_factored_base_offsets32_float", true), - GSBOInfo("__pseudo_gather_factored_base_offsets32_i64", - "__pseudo_gather_factored_base_offsets32_i64", true), - GSBOInfo("__pseudo_gather_factored_base_offsets32_double", - "__pseudo_gather_factored_base_offsets32_double", true), + GSBOInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i8" : + "__pseudo_gather_factored_base_offsets32_i8", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i8" : + "__pseudo_gather_factored_base_offsets32_i8", + true), + GSBOInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i16" : + "__pseudo_gather_factored_base_offsets32_i16", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i16" : + "__pseudo_gather_factored_base_offsets32_i16", + true), + GSBOInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i32" : + "__pseudo_gather_factored_base_offsets32_i32", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i32" : + "__pseudo_gather_factored_base_offsets32_i32", + true), + GSBOInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_float" : + "__pseudo_gather_factored_base_offsets32_float", + g->target.hasGather ? "__pseudo_gather_base_offsets32_float" : + "__pseudo_gather_factored_base_offsets32_float", + true), + GSBOInfo(g->target.hasGather ? 
"__pseudo_gather_base_offsets32_i64" : + "__pseudo_gather_factored_base_offsets32_i64", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i64" : + "__pseudo_gather_factored_base_offsets32_i64", + true), + GSBOInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_double" : + "__pseudo_gather_factored_base_offsets32_double", + g->target.hasGather ? "__pseudo_gather_base_offsets32_double" : + "__pseudo_gather_factored_base_offsets32_double", + true), - GSBOInfo( "__pseudo_scatter_factored_base_offsets32_i8", - "__pseudo_scatter_factored_base_offsets32_i8", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets32_i16", - "__pseudo_scatter_factored_base_offsets32_i16", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets32_i32", - "__pseudo_scatter_factored_base_offsets32_i32", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets32_float", - "__pseudo_scatter_factored_base_offsets32_float", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets32_i64", - "__pseudo_scatter_factored_base_offsets32_i64", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets32_double", - "__pseudo_scatter_factored_base_offsets32_double", false), - - GSBOInfo( "__pseudo_gather_factored_base_offsets64_i8", - "__pseudo_gather_factored_base_offsets32_i8", true), - GSBOInfo("__pseudo_gather_factored_base_offsets64_i16", - "__pseudo_gather_factored_base_offsets32_i16", true), - GSBOInfo("__pseudo_gather_factored_base_offsets64_i32", - "__pseudo_gather_factored_base_offsets32_i32", true), - GSBOInfo("__pseudo_gather_factored_base_offsets64_float", - "__pseudo_gather_factored_base_offsets32_float", true), - GSBOInfo("__pseudo_gather_factored_base_offsets64_i64", - "__pseudo_gather_factored_base_offsets32_i64", true), - GSBOInfo("__pseudo_gather_factored_base_offsets64_double", - "__pseudo_gather_factored_base_offsets32_double", true), - - GSBOInfo( "__pseudo_scatter_factored_base_offsets64_i8", - "__pseudo_scatter_factored_base_offsets32_i8", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets64_i16", - "__pseudo_scatter_factored_base_offsets32_i16", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets64_i32", - "__pseudo_scatter_factored_base_offsets32_i32", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets64_float", - "__pseudo_scatter_factored_base_offsets32_float", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets64_i64", - "__pseudo_scatter_factored_base_offsets32_i64", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets64_double", - "__pseudo_scatter_factored_base_offsets32_double", false), + GSBOInfo( g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i8" : + "__pseudo_scatter_factored_base_offsets32_i8", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i8" : + "__pseudo_scatter_factored_base_offsets32_i8", + false), + GSBOInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i16" : + "__pseudo_scatter_factored_base_offsets32_i16", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i16" : + "__pseudo_scatter_factored_base_offsets32_i16", + false), + GSBOInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i32" : + "__pseudo_scatter_factored_base_offsets32_i32", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i32" : + "__pseudo_scatter_factored_base_offsets32_i32", + false), + GSBOInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_float" : + "__pseudo_scatter_factored_base_offsets32_float", + g->target.hasScatter ? 
"__pseudo_scatter_base_offsets32_float" : + "__pseudo_scatter_factored_base_offsets32_float", + false), + GSBOInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i64" : + "__pseudo_scatter_factored_base_offsets32_i64", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i64" : + "__pseudo_scatter_factored_base_offsets32_i64", + false), + GSBOInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_double" : + "__pseudo_scatter_factored_base_offsets32_double", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_double" : + "__pseudo_scatter_factored_base_offsets32_double", + false), }; int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]); @@ -1991,6 +2173,26 @@ lComputeCommonPointer(llvm::Value *base, llvm::Value *offsets, } +static llvm::Constant * +lGetOffsetScaleVec(llvm::Value *offsetScale, llvm::Type *vecType) { + llvm::ConstantInt *offsetScaleInt = + llvm::dyn_cast(offsetScale); + Assert(offsetScaleInt != NULL); + uint64_t scaleValue = offsetScaleInt->getZExtValue(); + + std::vector scales; + for (int i = 0; i < g->target.vectorWidth; ++i) { + if (vecType == LLVMTypes::Int64VectorType) + scales.push_back(LLVMInt64(scaleValue)); + else { + Assert(vecType == LLVMTypes::Int32VectorType); + scales.push_back(LLVMInt32((int32_t)scaleValue)); + } + } + return llvm::ConstantVector::get(scales); +} + + /** After earlier optimization passes have run, we are sometimes able to determine that gathers/scatters are actually accessing memory in a more regular fashion and then change the operation to something simpler and @@ -2011,7 +2213,7 @@ lGSToLoadStore(llvm::CallInst *callInst) { struct GatherImpInfo { GatherImpInfo(const char *pName, const char *lmName, llvm::Type *st, int a) - : align(a) { + : align(a), isFactored(!g->target.hasGather) { pseudoFunc = m->module->getFunction(pName); loadMaskedFunc = m->module->getFunction(lmName); Assert(pseudoFunc != NULL && loadMaskedFunc != NULL); @@ -2022,39 +2224,52 @@ lGSToLoadStore(llvm::CallInst *callInst) { llvm::Function *loadMaskedFunc; llvm::Type *scalarType; const int align; + const bool isFactored; }; GatherImpInfo gInfo[] = { - GatherImpInfo("__pseudo_gather_factored_base_offsets32_i8", "__masked_load_i8", - LLVMTypes::Int8Type, 1), - GatherImpInfo("__pseudo_gather_factored_base_offsets32_i16", "__masked_load_i16", - LLVMTypes::Int16Type, 2), - GatherImpInfo("__pseudo_gather_factored_base_offsets32_i32", "__masked_load_i32", - LLVMTypes::Int32Type, 4), - GatherImpInfo("__pseudo_gather_factored_base_offsets32_float", "__masked_load_float", - LLVMTypes::FloatType, 4), - GatherImpInfo("__pseudo_gather_factored_base_offsets32_i64", "__masked_load_i64", - LLVMTypes::Int64Type, 8), - GatherImpInfo("__pseudo_gather_factored_base_offsets32_double", "__masked_load_double", - LLVMTypes::DoubleType, 8), - GatherImpInfo("__pseudo_gather_factored_base_offsets64_i8", "__masked_load_i8", - LLVMTypes::Int8Type, 1), - GatherImpInfo("__pseudo_gather_factored_base_offsets64_i16", "__masked_load_i16", - LLVMTypes::Int16Type, 2), - GatherImpInfo("__pseudo_gather_factored_base_offsets64_i32", "__masked_load_i32", - LLVMTypes::Int32Type, 4), - GatherImpInfo("__pseudo_gather_factored_base_offsets64_float", "__masked_load_float", - LLVMTypes::FloatType, 4), - GatherImpInfo("__pseudo_gather_factored_base_offsets64_i64", "__masked_load_i64", - LLVMTypes::Int64Type, 8), - GatherImpInfo("__pseudo_gather_factored_base_offsets64_double", "__masked_load_double", - LLVMTypes::DoubleType, 8) + GatherImpInfo(g->target.hasGather ? 
"__pseudo_gather_base_offsets32_i8" : + "__pseudo_gather_factored_base_offsets32_i8", + "__masked_load_i8", LLVMTypes::Int8Type, 1), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i16" : + "__pseudo_gather_factored_base_offsets32_i16", + "__masked_load_i16", LLVMTypes::Int16Type, 2), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i32" : + "__pseudo_gather_factored_base_offsets32_i32", + "__masked_load_i32", LLVMTypes::Int32Type, 4), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_float" : + "__pseudo_gather_factored_base_offsets32_float", + "__masked_load_float", LLVMTypes::FloatType, 4), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i64" : + "__pseudo_gather_factored_base_offsets32_i64", + "__masked_load_i64", LLVMTypes::Int64Type, 8), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_double" : + "__pseudo_gather_factored_base_offsets32_double", + "__masked_load_double", LLVMTypes::DoubleType, 8), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets64_i8" : + "__pseudo_gather_factored_base_offsets64_i8", + "__masked_load_i8", LLVMTypes::Int8Type, 1), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets64_i16" : + "__pseudo_gather_factored_base_offsets64_i16", + "__masked_load_i16", LLVMTypes::Int16Type, 2), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets64_i32" : + "__pseudo_gather_factored_base_offsets64_i32", + "__masked_load_i32", LLVMTypes::Int32Type, 4), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets64_float" : + "__pseudo_gather_factored_base_offsets64_float", + "__masked_load_float", LLVMTypes::FloatType, 4), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets64_i64" : + "__pseudo_gather_factored_base_offsets64_i64", + "__masked_load_i64", LLVMTypes::Int64Type, 8), + GatherImpInfo(g->target.hasGather ? 
"__pseudo_gather_base_offsets64_double" : + "__pseudo_gather_factored_base_offsets64_double", + "__masked_load_double", LLVMTypes::DoubleType, 8), }; struct ScatterImpInfo { ScatterImpInfo(const char *pName, const char *msName, llvm::Type *vpt, int a) - : align(a) { + : align(a), isFactored(!g->target.hasScatter) { pseudoFunc = m->module->getFunction(pName); maskedStoreFunc = m->module->getFunction(msName); vecPtrType = vpt; @@ -2064,33 +2279,46 @@ lGSToLoadStore(llvm::CallInst *callInst) { llvm::Function *maskedStoreFunc; llvm::Type *vecPtrType; const int align; + const bool isFactored; }; ScatterImpInfo sInfo[] = { - ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_i8", "__pseudo_masked_store_i8", - LLVMTypes::Int8VectorPointerType, 1), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_i16", "__pseudo_masked_store_i16", - LLVMTypes::Int16VectorPointerType, 2), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_i32", "__pseudo_masked_store_i32", - LLVMTypes::Int32VectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_float", "__pseudo_masked_store_float", - LLVMTypes::FloatVectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_i64", "__pseudo_masked_store_i64", - LLVMTypes::Int64VectorPointerType, 8), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_double", "__pseudo_masked_store_double", - LLVMTypes::DoubleVectorPointerType, 8), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_i8", "__pseudo_masked_store_i8", - LLVMTypes::Int8VectorPointerType, 1), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_i16", "__pseudo_masked_store_i16", - LLVMTypes::Int16VectorPointerType, 2), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_i32", "__pseudo_masked_store_i32", - LLVMTypes::Int32VectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_float", "__pseudo_masked_store_float", - LLVMTypes::FloatVectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_i64", "__pseudo_masked_store_i64", - LLVMTypes::Int64VectorPointerType, 8), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_double", "__pseudo_masked_store_double", - LLVMTypes::DoubleVectorPointerType, 8) + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i8" : + "__pseudo_scatter_factored_base_offsets32_i8", + "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i16" : + "__pseudo_scatter_factored_base_offsets32_i16", + "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i32" : + "__pseudo_scatter_factored_base_offsets32_i32", + "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_float" : + "__pseudo_scatter_factored_base_offsets32_float", + "__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i64" : + "__pseudo_scatter_factored_base_offsets32_i64", + "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_double" : + "__pseudo_scatter_factored_base_offsets32_double", + "__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8), + ScatterImpInfo(g->target.hasScatter ? 
"__pseudo_scatter_base_offsets64_i8" : + "__pseudo_scatter_factored_base_offsets64_i8", + "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i16" : + "__pseudo_scatter_factored_base_offsets64_i16", + "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i32" : + "__pseudo_scatter_factored_base_offsets64_i32", + "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets64_float" : + "__pseudo_scatter_factored_base_offsets64_float", + "__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i64" : + "__pseudo_scatter_factored_base_offsets64_i64", + "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets64_double" : + "__pseudo_scatter_factored_base_offsets64_double", + "__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8), }; llvm::Function *calledFunc = callInst->getCalledFunction(); @@ -2118,34 +2346,45 @@ lGSToLoadStore(llvm::CallInst *callInst) { lGetSourcePosFromMetadata(callInst, &pos); llvm::Value *base = callInst->getArgOperand(0); - llvm::Value *varyingOffsets = callInst->getArgOperand(1); - llvm::Value *offsetScale = callInst->getArgOperand(2); - llvm::Value *constOffsets = callInst->getArgOperand(3); - llvm::Value *storeValue = (scatterInfo != NULL) ? callInst->getArgOperand(4) : NULL; - llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 4 : 5); + llvm::Value *fullOffsets = NULL; + llvm::Value *storeValue = NULL; + llvm::Value *mask = NULL; - // Compute the full offset vector: offsetScale * varyingOffsets + constOffsets - llvm::ConstantInt *offsetScaleInt = - llvm::dyn_cast(offsetScale); - Assert(offsetScaleInt != NULL); - uint64_t scaleValue = offsetScaleInt->getZExtValue(); + if ((gatherInfo != NULL && gatherInfo->isFactored) || + (scatterInfo != NULL && scatterInfo->isFactored)) { + llvm::Value *varyingOffsets = callInst->getArgOperand(1); + llvm::Value *offsetScale = callInst->getArgOperand(2); + llvm::Value *constOffsets = callInst->getArgOperand(3); + if (scatterInfo) + storeValue = callInst->getArgOperand(4); + mask = callInst->getArgOperand((gatherInfo != NULL) ? 4 : 5); - std::vector scales; - for (int i = 0; i < g->target.vectorWidth; ++i) { - if (varyingOffsets->getType() == LLVMTypes::Int64VectorType) - scales.push_back(LLVMInt64(scaleValue)); - else - scales.push_back(LLVMInt32((int32_t)scaleValue)); + // Compute the full offset vector: offsetScale * varyingOffsets + constOffsets + llvm::Constant *offsetScaleVec = + lGetOffsetScaleVec(offsetScale, varyingOffsets->getType()); + + llvm::Value *scaledVarying = + llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec, + varyingOffsets, "scaled_varying", callInst); + fullOffsets = + llvm::BinaryOperator::Create(llvm::Instruction::Add, scaledVarying, + constOffsets, "varying+const_offsets", + callInst); } - llvm::Constant *offsetScaleVec = llvm::ConstantVector::get(scales); + else { + if (scatterInfo) + storeValue = callInst->getArgOperand(3); + mask = callInst->getArgOperand((gatherInfo != NULL) ? 
3 : 4); - llvm::Value *scaledVarying = - llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec, - varyingOffsets, "scaled_varying", callInst); - llvm::Value *fullOffsets = - llvm::BinaryOperator::Create(llvm::Instruction::Add, scaledVarying, - constOffsets, "varying+const_offsets", - callInst); + llvm::Value *offsetScale = callInst->getArgOperand(1); + llvm::Value *offsets = callInst->getArgOperand(2); + llvm::Value *offsetScaleVec = + lGetOffsetScaleVec(offsetScale, offsets->getType()); + + fullOffsets = + llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec, + offsets, "scaled_offsets", callInst); + } Debug(SourcePos(), "GSToLoadStore: %s.", fullOffsets->getName().str().c_str()); @@ -3631,7 +3870,6 @@ lReplacePseudoGS(llvm::CallInst *callInst) { : isGather(ig) { pseudoFunc = m->module->getFunction(pName); actualFunc = m->module->getFunction(aName); - Assert(pseudoFunc != NULL && actualFunc != NULL); } llvm::Function *pseudoFunc; llvm::Function *actualFunc; @@ -3639,20 +3877,6 @@ lReplacePseudoGS(llvm::CallInst *callInst) { }; LowerGSInfo lgsInfo[] = { - LowerGSInfo("__pseudo_gather_factored_base_offsets32_i8", "__gather_factored_base_offsets32_i8", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets32_i16", "__gather_factored_base_offsets32_i16", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets32_i32", "__gather_factored_base_offsets32_i32", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets32_float", "__gather_factored_base_offsets32_float", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets32_i64", "__gather_factored_base_offsets32_i64", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets32_double", "__gather_factored_base_offsets32_double", true), - - LowerGSInfo("__pseudo_gather_factored_base_offsets64_i8", "__gather_factored_base_offsets64_i8", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets64_i16", "__gather_factored_base_offsets64_i16", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets64_i32", "__gather_factored_base_offsets64_i32", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets64_float", "__gather_factored_base_offsets64_float", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets64_i64", "__gather_factored_base_offsets64_i64", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets64_double", "__gather_factored_base_offsets64_double", true), - LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true), LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true), LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true), @@ -3667,19 +3891,57 @@ lReplacePseudoGS(llvm::CallInst *callInst) { LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true), LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true), - LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i8", "__scatter_factored_base_offsets32_i8", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16", "__scatter_factored_base_offsets32_i16", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32", "__scatter_factored_base_offsets32_i32", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float", "__scatter_factored_base_offsets32_float", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64", "__scatter_factored_base_offsets32_i64", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double", "__scatter_factored_base_offsets32_double", false), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_i8", + 
"__gather_factored_base_offsets32_i8", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_i16", + "__gather_factored_base_offsets32_i16", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_i32", + "__gather_factored_base_offsets32_i32", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_float", + "__gather_factored_base_offsets32_float", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_i64", + "__gather_factored_base_offsets32_i64", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_double", + "__gather_factored_base_offsets32_double", true), - LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8", "__scatter_factored_base_offsets64_i8", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16", "__scatter_factored_base_offsets64_i16", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32", "__scatter_factored_base_offsets64_i32", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float", "__scatter_factored_base_offsets64_float", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64", "__scatter_factored_base_offsets64_i64", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double", "__scatter_factored_base_offsets64_double", false), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_i8", + "__gather_factored_base_offsets64_i8", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_i16", + "__gather_factored_base_offsets64_i16", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_i32", + "__gather_factored_base_offsets64_i32", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_float", + "__gather_factored_base_offsets64_float", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_i64", + "__gather_factored_base_offsets64_i64", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_double", + "__gather_factored_base_offsets64_double", true), + + LowerGSInfo("__pseudo_gather_base_offsets32_i8", + "__gather_base_offsets32_i8", true), + LowerGSInfo("__pseudo_gather_base_offsets32_i16", + "__gather_base_offsets32_i16", true), + LowerGSInfo("__pseudo_gather_base_offsets32_i32", + "__gather_base_offsets32_i32", true), + LowerGSInfo("__pseudo_gather_base_offsets32_float", + "__gather_base_offsets32_float", true), + LowerGSInfo("__pseudo_gather_base_offsets32_i64", + "__gather_base_offsets32_i64", true), + LowerGSInfo("__pseudo_gather_base_offsets32_double", + "__gather_base_offsets32_double", true), + + LowerGSInfo("__pseudo_gather_base_offsets64_i8", + "__gather_base_offsets64_i8", true), + LowerGSInfo("__pseudo_gather_base_offsets64_i16", + "__gather_base_offsets64_i16", true), + LowerGSInfo("__pseudo_gather_base_offsets64_i32", + "__gather_base_offsets64_i32", true), + LowerGSInfo("__pseudo_gather_base_offsets64_float", + "__gather_base_offsets64_float", true), + LowerGSInfo("__pseudo_gather_base_offsets64_i64", + "__gather_base_offsets64_i64", true), + LowerGSInfo("__pseudo_gather_base_offsets64_double", + "__gather_base_offsets64_double", true), LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false), LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false), @@ -3694,6 +3956,59 @@ lReplacePseudoGS(llvm::CallInst *callInst) { LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false), LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false), LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false), + + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i8", + 
"__scatter_factored_base_offsets32_i8", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16", + "__scatter_factored_base_offsets32_i16", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32", + "__scatter_factored_base_offsets32_i32", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float", + "__scatter_factored_base_offsets32_float", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64", + "__scatter_factored_base_offsets32_i64", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double", + "__scatter_factored_base_offsets32_double", false), + + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8", + "__scatter_factored_base_offsets64_i8", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16", + "__scatter_factored_base_offsets64_i16", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32", + "__scatter_factored_base_offsets64_i32", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float", + "__scatter_factored_base_offsets64_float", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64", + "__scatter_factored_base_offsets64_i64", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double", + "__scatter_factored_base_offsets64_double", false), + + + LowerGSInfo("__pseudo_scatter_base_offsets32_i8", + "__scatter_base_offsets32_i8", false), + LowerGSInfo("__pseudo_scatter_base_offsets32_i16", + "__scatter_base_offsets32_i16", false), + LowerGSInfo("__pseudo_scatter_base_offsets32_i32", + "__scatter_base_offsets32_i32", false), + LowerGSInfo("__pseudo_scatter_base_offsets32_float", + "__scatter_base_offsets32_float", false), + LowerGSInfo("__pseudo_scatter_base_offsets32_i64", + "__scatter_base_offsets32_i64", false), + LowerGSInfo("__pseudo_scatter_base_offsets32_double", + "__scatter_base_offsets32_double", false), + + LowerGSInfo("__pseudo_scatter_base_offsets64_i8", + "__scatter_base_offsets64_i8", false), + LowerGSInfo("__pseudo_scatter_base_offsets64_i16", + "__scatter_base_offsets64_i16", false), + LowerGSInfo("__pseudo_scatter_base_offsets64_i32", + "__scatter_base_offsets64_i32", false), + LowerGSInfo("__pseudo_scatter_base_offsets64_float", + "__scatter_base_offsets64_float", false), + LowerGSInfo("__pseudo_scatter_base_offsets64_i64", + "__scatter_base_offsets64_i64", false), + LowerGSInfo("__pseudo_scatter_base_offsets64_double", + "__scatter_base_offsets64_double", false), }; llvm::Function *calledFunc = callInst->getCalledFunction(); @@ -3709,6 +4024,7 @@ lReplacePseudoGS(llvm::CallInst *callInst) { if (info == NULL) return false; + Assert(info->actualFunc != NULL); // Get the source position from the metadata attached to the call // instruction so that we can issue PerformanceWarning()s below. 
@@ -3905,6 +4221,12 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
         "__gather_factored_base_offsets64_i8", "__gather_factored_base_offsets64_i16",
         "__gather_factored_base_offsets64_i32", "__gather_factored_base_offsets64_i64",
         "__gather_factored_base_offsets64_float", "__gather_factored_base_offsets64_double",
+        "__gather_base_offsets32_i8", "__gather_base_offsets32_i16",
+        "__gather_base_offsets32_i32", "__gather_base_offsets32_i64",
+        "__gather_base_offsets32_float", "__gather_base_offsets32_double",
+        "__gather_base_offsets64_i8", "__gather_base_offsets64_i16",
+        "__gather_base_offsets64_i32", "__gather_base_offsets64_i64",
+        "__gather_base_offsets64_float", "__gather_base_offsets64_double",
         "__gather32_i8", "__gather32_i16",
         "__gather32_i32", "__gather32_i64",
         "__gather32_float", "__gather32_double",
@@ -3932,6 +4254,12 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
         "__scatter_factored_base_offsets64_i8", "__scatter_factored_base_offsets64_i16",
         "__scatter_factored_base_offsets64_i32", "__scatter_factored_base_offsets64_i64",
         "__scatter_factored_base_offsets64_float", "__scatter_factored_base_offsets64_double",
+        "__scatter_base_offsets32_i8", "__scatter_base_offsets32_i16",
+        "__scatter_base_offsets32_i32", "__scatter_base_offsets32_i64",
+        "__scatter_base_offsets32_float", "__scatter_base_offsets32_double",
+        "__scatter_base_offsets64_i8", "__scatter_base_offsets64_i16",
+        "__scatter_base_offsets64_i32", "__scatter_base_offsets64_i64",
+        "__scatter_base_offsets64_float", "__scatter_base_offsets64_double",
         "__scatter_elt32_i8", "__scatter_elt32_i16",
         "__scatter_elt32_i32", "__scatter_elt32_i64",
         "__scatter_elt32_float", "__scatter_elt32_double",
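The new __gather_base_offsets* / __scatter_base_offsets* names are added to this pass's table so that, like their factored counterparts, the corresponding builtins can be given internal linkage once inlined. A sketch of how the pass presumably consumes the table (the surrounding loop is outside the visible hunks; the array name `names` is hypothetical):

// Hypothetical sketch of how MakeInternalFuncsStaticPass uses the name
// table: mark each listed builtin as internal so LLVM's optimizer can
// inline it and then dead-strip the standalone definition.
bool modifiedAny = false;
int count = sizeof(names) / sizeof(names[0]);
for (int i = 0; i < count; ++i) {
    llvm::Function *f = module.getFunction(names[i]);
    if (f != NULL && !f->isDeclaration()) {
        f->setLinkage(llvm::GlobalValue::InternalLinkage);
        modifiedAny = true;
    }
}
return modifiedAny;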